<a href="https://colab.research.google.com/github/yashajayrathod/Data-EngineeringC/blob/main/ConstructiveIQ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import euclidean_distances

In [None]:
# Load the Dataset
from google.colab import files

# files.upload() opens the browser upload dialog and returns a dict that maps
# each saved filename to the uploaded file's raw bytes.
uploaded = files.upload()

# Colab does not overwrite an existing file on re-upload: the new copy is saved
# under a suffixed name (the saved output shows "materials (4).csv"). Reading the
# fixed path 'materials.csv' would then silently load the stale first upload, so
# the freshly uploaded bytes are written to the canonical path before reading.
if uploaded:
    latest_upload_name = list(uploaded)[-1]
    with open('materials.csv', 'wb') as materials_file:
        materials_file.write(uploaded[latest_upload_name])

# If the dialog was cancelled (empty dict), fall back to a materials.csv that is
# already present in the working directory.
materials = pd.read_csv('materials.csv')



Saving materials.csv to materials (4).csv


In [None]:
# Show the dimensions of the materials table as a (rows, columns) tuple
print(materials.shape)

(1000, 2)


In [None]:
# Upload the test pairs; files.upload() returns {saved filename: raw bytes}.
uploaded = files.upload()

# A re-upload is stored by Colab under a suffixed name (the saved output shows
# "test_pairs (1).csv"), so reading 'test_pairs.csv' directly would pick up the
# stale earlier copy. Write the fresh bytes to the canonical path first.
if uploaded:
    latest_upload_name = list(uploaded)[-1]
    with open('test_pairs.csv', 'wb') as test_pairs_file:
        test_pairs_file.write(uploaded[latest_upload_name])

# Falls back to an existing test_pairs.csv when nothing was uploaded.
test_pairs = pd.read_csv('test_pairs.csv')

Saving test_pairs.csv to test_pairs (1).csv


In [None]:

# Show the dimensions of the test-pairs table as a (rows, columns) tuple
print(test_pairs.shape)

(500, 2)


In [None]:
# Peek at the top of the materials table (header line first, then the rows)
print("First few rows of materials dataset:")
first_rows = materials.head()
print(first_rows)

First few rows of materials dataset:
   ID                               Material_Description
0   1  INSULATION GASKET KIT - 2" - 300# - DOUBLE COM...
1   2  ASSEMBLY COMPRESSOR - 10" - 150# - HOT DIP GAL...
2   3  SPUR GEAR PINION SHAFT - 10" - 150# - SCH.XS A...
3   4  SUCTION HEADER - 6" - 600# - HOT DIP GALVANIZE...
4   5  MOVABLE STOOL - 6" - 150# - DUAL CERTIFIED, DR...


In [None]:
# Data Exploration: count the null entries in every column
print("\nChecking for missing values in the dataset:")
missing_per_column = materials.isnull().sum()
print(missing_per_column)


Checking for missing values in the dataset:
ID                      0
Material_Description    0
dtype: int64


In [None]:
# Text Preprocessing

# Convert material descriptions to lowercase
materials['clean_description'] = materials['Material_Description'].str.lower()

# Remove punctuation and special characters.
# Two deliberate differences from the previous pattern r'[^a-z\s]' -> '':
#   * digits are kept: the previous pattern also deleted every number, so sizes
#     and ratings such as 2" / 300# vanished from the cleaned text (the saved
#     preview shows "insulation gasket kit double compression ...");
#   * each removed character becomes a space instead of nothing, so tokens that
#     were only separated by punctuation are not glued together
#     ("SCH.XS" previously became the single token "schxs").
# NOTE(review): this changes the downstream vocabulary and similarity scores
# relative to the saved outputs — re-run the later cells after applying it.
materials['clean_description'] = materials['clean_description'].str.replace(
    r'[^a-z0-9\s]', ' ', regex=True
)


In [None]:
# Download NLTK packages for tokenization and lemmatization
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer data from the 'punkt_tab'
# resource; without it word_tokenize raises LookupError on a fresh runtime.
# Downloading both keeps the cell working on older and newer NLTK versions.
nltk.download('punkt_tab')
nltk.download('wordnet')

# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


def _lemmatize_text(text):
    """Tokenize `text`, lemmatize every token and re-join the tokens with single spaces."""
    return ' '.join(lemmatizer.lemmatize(token) for token in word_tokenize(text))


# Tokenize and lemmatize the words (overwrites clean_description in place)
materials['clean_description'] = materials['clean_description'].apply(_lemmatize_text)

# Show cleaned data
print("\nCleaned material descriptions:")
print(materials[['Material_Description', 'clean_description']].head())



Cleaned material descriptions:
                                Material_Description  \
0  INSULATION GASKET KIT - 2" - 300# - DOUBLE COM...   
1  ASSEMBLY COMPRESSOR - 10" - 150# - HOT DIP GAL...   
2  SPUR GEAR PINION SHAFT - 10" - 150# - SCH.XS A...   
3  SUCTION HEADER - 6" - 600# - HOT DIP GALVANIZE...   
4  MOVABLE STOOL - 6" - 150# - DUAL CERTIFIED, DR...   

                                   clean_description  
0  insulation gasket kit double compression with ...  
1  assembly compressor hot dip galvanized drawing...  
2  spur gear pinion shaft schxs astm a grb seamle...  
3  suction header hot dip galvanized drawing no p...  
4   movable stool dual certified drawing no prjr iso  


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Vectorization using TF-IDF
# A single vectorizer is configured here. The earlier
# TfidfVectorizer(max_features=5000) instance was overwritten on the very next
# line before being used, so it never had any effect and has been removed.
tfidf = TfidfVectorizer(lowercase=True, stop_words='english')
tfidf_matrix = tfidf.fit_transform(materials['clean_description'])

# Show the shape of the TF-IDF matrix: (number of descriptions, vocabulary size)
print("\nShape of the TF-IDF matrix:")
print(tfidf_matrix.shape)

# Optional: You can print feature names (the vocabulary learned by TF-IDF)
print("\nTF-IDF Feature Names (first 10):")
print(tfidf.get_feature_names_out()[:10])

# Optionally, show a sample of the TF-IDF matrix (only 5 rows are densified)
print("\nSample of the TF-IDF matrix (first 5 rows):")
print(tfidf_matrix[:5].toarray())


Shape of the TF-IDF matrix:
(1000, 58)

TF-IDF Feature Names (first 10):
['air' 'asme' 'assembly' 'astm' 'ball' 'barrel' 'bellow' 'cable'
 'certified' 'compression']

Sample of the TF-IDF matrix (first 5 rows):
[[0.         0.19046231 0.         0.         0.         0.
  0.         0.         0.         0.35428813 0.         0.
  0.         0.         0.27562829 0.11136349 0.         0.
  0.         0.         0.39556824 0.         0.         0.
  0.         0.         0.35428813 0.         0.         0.39556824
  0.         0.         0.39556824 0.         0.         0.
  0.         0.         0.         0.         0.17137213 0.35428813
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         0.2266565  0.40406494 0.         0.         0.
  0.         0.         0.         0.         0.50550954 0.
  0.         0.39609103 0.         0.13252626 0.         0.
  0. 

In [None]:
# --- Word2Vec Embeddings ---
# Split every cleaned description into a list of word tokens
materials['tokenized_description'] = materials['clean_description'].apply(word_tokenize)

# Fit a Word2Vec model on the tokenized descriptions
# (100-dimensional vectors, context window of 5, every word kept)
w2v_model = Word2Vec(
    sentences=materials['tokenized_description'],
    vector_size=100,
    window=5,
    min_count=1,
)

# Display the learned embedding of one example word
print("\nWord vector for 'steel':")
steel_vector = w2v_model.wv['steel']
print(steel_vector)



Word vector for 'steel':
[-1.52885616e-02  6.64749295e-02 -4.21601087e-02  6.72345757e-02
 -1.00404605e-01 -2.93692470e-01  1.70459256e-01  3.74528557e-01
 -1.92792013e-01 -3.27101052e-01  2.73630116e-02 -2.45969102e-01
  8.56357347e-03  1.43906370e-01  9.05815586e-02 -1.20122954e-01
  1.67572901e-01 -9.61685851e-02 -4.75215167e-02 -3.08807820e-01
  2.10382059e-01 -1.41951302e-02  2.60401815e-01 -1.11829080e-01
  2.17747569e-04 -2.35064290e-02 -2.12124616e-01  2.94731949e-02
 -6.73388094e-02  3.85196544e-02  1.51961699e-01 -3.73749733e-02
  2.13348731e-01 -3.72431755e-01  4.86928038e-03  1.33517191e-01
  5.96144088e-02  3.74537669e-02 -1.43242285e-01 -9.43176076e-02
  1.53755695e-02 -1.19382642e-01  6.05836092e-03  3.01628504e-02
  9.51964259e-02 -1.23749427e-01 -1.59930199e-01 -9.79817286e-02
  1.33164540e-01  1.12259433e-01  1.00482002e-01 -1.30778760e-01
 -9.06372592e-02 -1.17707863e-01  4.57572155e-02 -7.23674595e-02
  1.33563653e-01 -9.27082077e-02 -6.28456846e-02  4.01346385e-02

In [None]:
# Function to get the average word2vec vector for a description
def get_avg_word2vec(tokens, model, vector_size):
    """Average the embeddings of the tokens that `model.wv` knows about.

    Tokens missing from `model.wv` are skipped. When no token is found the
    result is a zero vector of length `vector_size`.
    """
    # Gather the vectors of the known tokens, in input order
    known_vectors = [model.wv[token] for token in tokens if token in model.wv]

    # Accumulate into a float64 zero vector, one embedding at a time
    total = np.zeros(vector_size)
    for embedding in known_vectors:
        total += embedding

    # Turn the sum into a mean only when something was actually added
    if known_vectors:
        total /= len(known_vectors)
    return total

# Build one averaged Word2Vec vector per material description and stack them
vector_size = w2v_model.vector_size
per_description_vectors = [
    get_avg_word2vec(tokens, w2v_model, vector_size)
    for tokens in materials['tokenized_description']
]
avg_w2v_vectors = np.array(per_description_vectors)

# Report the dimensions of the stacked matrix
print("Word2Vec matrix shape:", avg_w2v_vectors.shape)

# Show the first five averaged vectors as a sample
print("\nSample of averaged Word2Vec vectors (first 5):")
print(avg_w2v_vectors[:5])

Word2Vec matrix shape: (1000, 100)

Sample of averaged Word2Vec vectors (first 5):
[[-0.01841371  0.06735764 -0.0510124   0.05793531 -0.09912897 -0.29787587
   0.18722055  0.3905565  -0.20040448 -0.32707486  0.0320042  -0.24411822
   0.00301199  0.1449091   0.07415974 -0.13921215  0.18685052 -0.10044854
  -0.06661959 -0.32749226  0.20919403 -0.01993958  0.28275068 -0.12535955
  -0.00641657 -0.04614534 -0.21308315  0.02368523 -0.0886104   0.04238924
   0.1712152  -0.04505572  0.22843935 -0.37953155 -0.00273505  0.13798612
   0.05039443  0.04325749 -0.13049625 -0.1153054   0.00259891 -0.12707146
   0.01576783  0.03730688  0.10414253 -0.12476447 -0.16717905 -0.10554431
   0.1477737   0.12236704  0.0919294  -0.13399137 -0.08968    -0.13652571
   0.04665227 -0.08687657  0.14661095 -0.09055839 -0.07131308  0.04441106
   0.04303614 -0.15246347  0.19146341  0.07934599 -0.17050284  0.30182057
  -0.05127768  0.16770494 -0.33238961  0.13880662 -0.03231328  0.13670196
   0.25365197  0.05078916  0.

In [None]:

# Cosine similarity between all TF-IDF vectors.
# Called with a single argument, so every row of tfidf_matrix is compared with
# every other row; entry [i, j] is the similarity of description i and j.
cosine_sim_matrix = cosine_similarity(tfidf_matrix)


In [None]:
# Step 5: Define function to retrieve Cosine Similarity for pairs
def get_cosine_similarity(id1, id2, similarity_matrix):
    """Look up the similarity of two materials identified by 1-based IDs.

    Parameters:
        id1, id2: material IDs; ID n is assumed to live in row/column n - 1.
        similarity_matrix: square matrix indexable as matrix[row, col].

    Returns:
        The entry similarity_matrix[id1 - 1, id2 - 1].

    Raises:
        IndexError: if an ID is smaller than 1 (or, from the matrix itself,
            larger than the matrix size).
    """
    # An ID of 0 or below would turn into a negative index, which silently
    # wraps around to the end of the matrix and returns the wrong pair's score.
    if id1 < 1 or id2 < 1:
        raise IndexError(f"material IDs are 1-based, got ({id1}, {id2})")
    return similarity_matrix[id1 - 1, id2 - 1]  # IDs are 1-based, arrays are 0-based



In [None]:
# Step 6: Map every material ID to the zero-based position of its row in materials
id_to_index = dict(zip(materials['ID'], range(len(materials))))

# Step 7: Update the get_cosine_similarity function to handle index lookup
def get_cosine_similarity(id1, id2, similarity_matrix, id_to_index):
    """Return the similarity-matrix entry for two material IDs.

    Each ID is translated to a matrix position through `id_to_index`.
    A KeyError raised during the lookup (e.g. an ID missing from the mapping)
    yields the default similarity 0.0.
    """
    try:
        # Translate both IDs (first id1, then id2) and read the matrix cell
        row_pos, col_pos = [id_to_index[material_id] for material_id in (id1, id2)]
        score = similarity_matrix[row_pos, col_pos]
    except KeyError:
        # Unknown ID: fall back to "no similarity"
        score = 0.0
    return score

# Step 8: Score every test pair with the TF-IDF cosine similarity
test_pairs['Similarity_Score'] = test_pairs.apply(
    lambda pair: get_cosine_similarity(
        pair['ID_1'], pair['ID_2'], cosine_sim_matrix, id_to_index
    ),
    axis=1,
)

# Step 9: Show the first few scored pairs
print(test_pairs.head())

# Step 10: Write the ID columns plus the score to the submission CSV
submission_columns = ['ID_1', 'ID_2', 'Similarity_Score']
test_pairs[submission_columns].to_csv('submission.csv', index=False)

# Trigger a browser download of the submission file
files.download('submission.csv')


   ID_1  ID_2  Similarity_Score
0   375   932          0.076284
1   588    22          0.055578
2   876   724          0.014835
3   270   154          0.197916
4   512   544          0.048491


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Import required libraries for Jaccard Similarity
from sklearn.metrics import jaccard_score

# Function to calculate Jaccard similarity between two material descriptions (bag of words approach)
def get_jaccard_similarity(id1, id2, materials):
    """Jaccard similarity of the whitespace-separated words of two descriptions.

    Parameters:
        id1, id2: values to look up in the 'ID' column of `materials`.
        materials: DataFrame with 'ID' and 'Material_Description' columns.

    Returns:
        |words1 & words2| / |words1 | words2| as a float. Returns 0.0 when
        either ID is not present (same fallback the cosine lookup uses for
        unknown IDs) or when both descriptions contain no words, instead of
        raising IndexError / ZeroDivisionError.
    """
    def _word_set(material_id):
        # First description registered for this ID, or None when the ID is unknown
        matches = materials.loc[materials['ID'] == material_id, 'Material_Description']
        if matches.empty:
            return None
        text = matches.iloc[0]
        # A missing (non-string) description contributes no words
        return set(text.split()) if isinstance(text, str) else set()

    set1 = _word_set(id1)
    set2 = _word_set(id2)
    if set1 is None or set2 is None:
        return 0.0

    union = set1 | set2
    if not union:
        # Both descriptions are empty: the ratio is undefined, report no similarity
        return 0.0
    return len(set1 & set2) / len(union)

# Score every test pair with the word-level Jaccard similarity
test_pairs['Jaccard_Similarity'] = test_pairs.apply(
    lambda pair: get_jaccard_similarity(pair['ID_1'], pair['ID_2'], materials),
    axis=1,
)

# Show the first few scored pairs
jaccard_columns = ['ID_1', 'ID_2', 'Jaccard_Similarity']
print(test_pairs[jaccard_columns].head())

# Write the Jaccard scores to their own CSV and download it (optional)
test_pairs[jaccard_columns].to_csv('jaccard_submission.csv', index=False)
files.download('jaccard_submission.csv')


   ID_1  ID_2  Jaccard_Similarity
0   375   932            0.136364
1   588    22            0.192308
2   876   724            0.153846
3   270   154            0.217391
4   512   544            0.115385


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Step 1: Map every material ID to the zero-based position of its row in materials
id_to_index = dict(zip(materials['ID'], range(len(materials))))

# Step 2: Define the function to retrieve Euclidean distance for pairs
def get_euclidean_distance(id1, id2, distance_matrix, id_to_index):
    """Return the distance-matrix entry for two material IDs.

    Each ID is translated to a matrix position through `id_to_index`.
    A KeyError raised during the lookup (e.g. an ID missing from the mapping)
    yields the default distance `np.inf`.
    """
    try:
        # Translate both IDs (first id1, then id2) and read the matrix cell
        row_pos, col_pos = [id_to_index[material_id] for material_id in (id1, id2)]
        distance = distance_matrix[row_pos, col_pos]
    except KeyError:
        # Unknown ID: treat the pair as infinitely far apart
        distance = np.inf
    return distance

# Pairwise Euclidean distances between the TF-IDF rows.
# No visible cell builds this matrix, so on a fresh kernel the lookup below
# failed with a NameError; it is computed here from the same TF-IDF matrix that
# feeds the cosine similarity.
euclidean_dist_matrix = euclidean_distances(tfidf_matrix)

# Step 3: Apply Euclidean Distance on the test pairs
test_pairs['Euclidean_Distance'] = test_pairs.apply(
    lambda row: get_euclidean_distance(row['ID_1'], row['ID_2'], euclidean_dist_matrix, id_to_index),
    axis=1
)

# Step 4: Preview the result
print(test_pairs[['ID_1', 'ID_2', 'Euclidean_Distance']].head())

# Optionally save the Euclidean Distance results to a CSV
test_pairs[['ID_1', 'ID_2', 'Euclidean_Distance']].to_csv('euclidean_submission.csv', index=False)
files.download('euclidean_submission.csv')


   ID_1  ID_2  Euclidean_Distance
0   375   932            1.359203
1   588    22            1.374352
2   876   724            1.403684
3   270   154            1.266558
4   512   544            1.379499


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Check available columns in materials DataFrame
# NOTE(review): the saved output of this cell lists Description_Length, Count_*
# and Similarity_Score columns that no earlier visible cell creates — it looks
# like the cell was last run out of order; confirm with Restart & Run All.
print(materials.columns)


Index(['ID', 'Material_Description', 'Description_Length', 'Count_steel',
       'Count_plastic', 'Count_copper', 'Similarity_Score'],
      dtype='object')


In [None]:
# Step 1: Feature Engineering
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Re-read the materials table from disk (replaces the in-memory frame)
materials = pd.read_csv('materials.csv')
raw_descriptions = materials['Material_Description']

# Extra feature: number of characters in each description
materials['Description_Length'] = raw_descriptions.map(len)

# Optional: per-description occurrence counts of a few material-related terms
specific_terms = ['steel', 'plastic', 'aluminum']
lowercase_descriptions = raw_descriptions.str.lower()
for term in specific_terms:
    materials[f'Count_{term}'] = lowercase_descriptions.str.count(term)

# Re-read the test pairs from disk (replaces the in-memory frame)
test_pairs = pd.read_csv('test_pairs.csv')

# Create a function to get features for each pair
def create_features(pair):
    """Build the feature vector for one (ID_1, ID_2) row of the test pairs.

    Reads the notebook-level `materials`, `specific_terms`, `cosine_sim_matrix`
    and `id_to_index`. Returns a Series holding both description lengths, the
    cosine similarity of the pair and, per term, its count in each description.
    """
    first_id = pair['ID_1']
    second_id = pair['ID_2']

    def _description_of(material_id):
        # First description stored for the given material ID
        matches = materials.loc[materials['ID'] == material_id, 'Material_Description']
        return matches.values[0]

    text_first = _description_of(first_id)
    text_second = _description_of(second_id)

    features = {
        'Description_Length_1': len(text_first),
        'Description_Length_2': len(text_second),
        'Similarity_Score': get_cosine_similarity(first_id, second_id, cosine_sim_matrix, id_to_index)
    }

    # Lowercase once, then count every term in both descriptions
    lowered_first = text_first.lower()
    lowered_second = text_second.lower()
    for term in specific_terms:
        features.update({
            f'Count_{term}_1': lowered_first.count(term),
            f'Count_{term}_2': lowered_second.count(term),
        })

    return pd.Series(features)

# Step 2: Prepare dataset for modeling
features_df = test_pairs.apply(create_features, axis=1)

# Combine features with similarity score for the training phase
# NOTE(review): no ground-truth similarity labels are loaded in any visible
# cell, so the target below is a random placeholder. A model fitted on it
# learns noise, and the predictions saved at the end of this cell are not
# meaningful until `y` is replaced with real labels.
# The placeholder comes from a seeded generator so that the reported MSE and the
# saved predictions are at least reproducible across Restart & Run All.
placeholder_rng = np.random.default_rng(42)
y = placeholder_rng.random(len(features_df))  # Replace with actual similarity scores if available

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_df, y, test_size=0.2, random_state=42)

# Step 3: Train a RandomForest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 4: Predict similarity scores on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Step 5: Predict similarity scores for the actual test pairs
# Use the same features created earlier (this includes the rows the model was fitted on)
predictions = model.predict(features_df)

# Add predictions to the test_pairs DataFrame
test_pairs['Predicted_Similarity_Score'] = predictions

# Step 6: Save the results to a submission file
test_pairs[['ID_1', 'ID_2', 'Predicted_Similarity_Score']].to_csv('enhanced_submission.csv', index=False)

# Download the enhanced submission file
from google.colab import files
files.download('enhanced_submission.csv')


Mean Squared Error: 0.11372899068798402


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np

# Step 2: Load the Test Pairs and Predicted Scores
# Load your test pairs
test_pairs = pd.read_csv('test_pairs.csv')  # Ensure the path is correct

# Seeded generator: the predictions and labels below are random stand-ins, and
# seeding keeps the printed MAP@K value reproducible between runs.
demo_rng = np.random.default_rng(42)

# Example: Predicted similarity scores (you should replace this with your actual predictions)
predicted_similarity_scores = demo_rng.random(len(test_pairs))  # Replace with your actual predicted scores
test_pairs['Predicted_Similarity_Score'] = predicted_similarity_scores

# Step 3: Load True Labels for Test Pairs
# Replace this with the actual ground truth labels.
# Dummy binary labels (0 or 1) for demonstration only; true_labels must have
# exactly one entry per row of test_pairs.
true_labels = demo_rng.integers(0, 2, size=len(test_pairs))  # Replace with actual labels

# Step 4: Define Average Precision Function
def average_precision(y_true, y_pred, k):
    """Average precision at rank `k` for one ranked list of candidates.

    Parameters:
        y_true: binary relevance labels (1 = similar pair), one per candidate.
        y_pred: predicted scores aligned with `y_true`; higher ranks first.
        k: number of top-ranked candidates to evaluate.

    Returns:
        Sum of precision@rank over the relevant candidates found in the top
        `k`, divided by min(k, number of relevant candidates). Returns 0.0
        when there are no relevant candidates or when `k` is not positive
        (previously `k <= 0` divided by zero).
    """
    labels = np.atleast_1d(np.asarray(y_true))
    scores = np.atleast_1d(np.asarray(y_pred))

    relevant_total = int(np.sum(labels))
    if k <= 0 or relevant_total <= 0:
        return 0.0

    # Positions of the k highest scores, best first
    top_k_indices = np.argsort(scores)[::-1][:k]

    hits = 0
    precision_sum = 0.0
    for rank, idx in enumerate(top_k_indices, start=1):
        if labels[idx] == 1:  # 1 marks a similar pair
            hits += 1
            precision_sum += hits / rank  # precision at this rank

    return precision_sum / min(k, relevant_total)

# Step 5: Define MAP@K Function
def mean_average_precision_at_k(test_pairs, true_labels, k):
    """Mean of average_precision(..., k) over the queries in `test_pairs`.

    The previous implementation passed ONE scalar prediction together with the
    WHOLE label vector to average_precision on every iteration, so each term
    only inspected true_labels[0] and the result did not depend on the
    predictions at all.

    Parameters:
        test_pairs: DataFrame with a 'Predicted_Similarity_Score' column and,
            optionally, an 'ID_1' column.
        true_labels: binary labels, positionally aligned with the rows of
            `test_pairs`.
        k: cutoff rank.

    Returns:
        float. Rows are grouped by 'ID_1' and each group is ranked on its own;
        without an 'ID_1' column all rows form a single ranked list. Queries
        without any relevant candidate contribute 0.0. An empty frame gives 0.0.
        NOTE(review): treating each distinct ID_1 as one query is an
        assumption — confirm it matches the intended evaluation protocol.

    Raises:
        ValueError: if `true_labels` and `test_pairs` differ in length.
    """
    labels = np.asarray(true_labels)
    scores = test_pairs['Predicted_Similarity_Score'].to_numpy()
    if len(labels) != len(scores):
        raise ValueError(
            f"true_labels has {len(labels)} entries but test_pairs has {len(scores)} rows"
        )
    if len(scores) == 0:
        return 0.0

    if 'ID_1' in test_pairs.columns:
        # .indices maps each query ID to the integer positions of its rows
        query_positions = list(test_pairs.groupby('ID_1', sort=False).indices.values())
    else:
        query_positions = [np.arange(len(scores))]

    per_query_ap = [
        average_precision(labels[positions], scores[positions], k)
        for positions in query_positions
    ]
    return float(np.mean(per_query_ap))

# Step 6: Choose the cutoff rank K used for the evaluation (adjust as needed)
k = 5

# Step 7: Evaluate MAP@K for the current predictions and labels
map_k = mean_average_precision_at_k(test_pairs, true_labels, k)

# Step 8: Report the score
map_report = f'Mean Average Precision at {k}: {map_k}'
print(map_report)

Mean Average Precision at 5: 0.0


In [None]:
import pandas as pd
import numpy as np

# Load the test pairs
test_pairs = pd.read_csv('test_pairs.csv')

# Generate random similarity scores between 0 and 1.
# These are only a sanity-check baseline; the generator is seeded so the file
# content is reproducible.
baseline_rng = np.random.default_rng(42)
test_pairs['Similarity_Score'] = baseline_rng.random(len(test_pairs))

# Save the baseline under its own name. Writing to 'submission.csv' here
# overwrote the TF-IDF cosine-similarity submission that an earlier cell saves
# under that name, replacing real scores with random noise.
baseline_path = 'random_baseline_submission.csv'
test_pairs[['ID_1', 'ID_2', 'Similarity_Score']].to_csv(baseline_path, index=False)

# Print confirmation
print(f"Submission file '{baseline_path}' created successfully.")

Submission file 'submission.csv' created successfully.
