In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

Load Data

In [5]:
materials = pd.read_csv('materials.csv')
test_pairs = pd.read_csv('test_pairs.csv')

Data Preprocessing

In [6]:
def preprocess_text(text):
    text = text.lower() #lowercase conversion
    # Remove special characters, numbers
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return text

In [7]:
materials['Processed_Description'] = materials['Material_Description'].apply(preprocess_text)

Display cleaned data

In [8]:
print("\nCleaned Material Descriptions:")
print(materials[['Material_Description', 'Processed_Description']].head())


Cleaned Material Descriptions:
                                Material_Description  \
0  INSULATION GASKET KIT - 2" - 300# - DOUBLE COM...   
1  ASSEMBLY COMPRESSOR - 10" - 150# - HOT DIP GAL...   
2  SPUR GEAR PINION SHAFT - 10" - 150# - SCH.XS A...   
3  SUCTION HEADER - 6" - 600# - HOT DIP GALVANIZE...   
4  MOVABLE STOOL - 6" - 150# - DUAL CERTIFIED, DR...   

                               Processed_Description  
0  insulation gasket kit  2  300  double compress...  
1  assembly compressor  10  150  hot dip galvaniz...  
2  spur gear pinion shaft  10  150  schxs astm a1...  
3  suction header  6  600  hot dip galvanized dra...  
4  movable stool  6  150  dual certified drawing ...  


Applying preprocessing and creating TF-IDF features

In [9]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(materials['Processed_Description'])

Feature Extraction of pairs

In [10]:
def get_similarity_scores(test_pairs, tfidf_matrix, materials):
    similarities = []
    for _, row in test_pairs.iterrows():
        idx1 = materials[materials['ID'] == row['ID_1']].index[0]
        idx2 = materials[materials['ID'] == row['ID_2']].index[0]
        score = cosine_similarity(tfidf_matrix[idx1], tfidf_matrix[idx2])
        rounded_score = round(score[0][0], 2)
        similarities.append(rounded_score)
        
    return similarities

Similarity Calculation

In [11]:
test_pairs['Similarity_Score'] = get_similarity_scores(test_pairs, tfidf_matrix, materials)

Displaying test pairs with similarity_score

In [12]:
print("\nTest Pairs with Similarity Scores:")
print(test_pairs.head())


Test Pairs with Similarity Scores:
   ID_1  ID_2  Similarity_Score
0   375   932              0.03
1   588    22              0.11
2   876   724              0.12
3   270   154              0.17
4   512   544              0.02


Saving smilarity scores to submission.csv

In [14]:
test_pairs[['ID_1', 'ID_2', 'Similarity_Score']].to_csv('submission.csv', index=False)
print("\nSimilarity scores saved to submission.csv!")


Similarity scores saved to submission.csv!
