In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Datasets loading,
vehicles_dataset = pd.read_excel('Dataset_vehiculos.xlsx')
fasecolda_guide_dataset = pd.read_csv('Guia_CSV_331.csv')

In [3]:
# Data cleaning, 
#double square brakets ensures result is still data frame and not series. Also bcs multiple columns selection
vehicles_reference = vehicles_dataset[['REFERENCIA']]
vehicles_reference.loc[:,'REFERENCIA'] = vehicles_reference['REFERENCIA'].str.lower().fillna('')
print("*Vehicles Reference*")
print(vehicles_reference.head())

*Vehicles Reference*
                             REFERENCIA
0  [3] vibrant mt 1250cc 2ab abs aa r13
1              morning 1.1 at 1100cc aa
2                                      
3            jhr + mt 2700cc td 4x2 abs
4     spice 1.4 mt 1400cc 4p abs 2ab ct


In [4]:
fasecolda_references = fasecolda_guide_dataset[['Referencia1', 'Referencia2', 'Referencia3', 'Codigo']]
fasecolda_references_combined = fasecolda_references.melt(id_vars=['Codigo'], value_name='ReferenciaCombinada').drop(columns=['variable'])
fasecolda_references_combined.loc[:,'ReferenciaCombinada'] = fasecolda_references_combined['ReferenciaCombinada'].str.lower().fillna('')
print("*Fasecolda References Combined*")
print(fasecolda_references_combined.head())

*Fasecolda References Combined*
   Codigo ReferenciaCombinada
0  101001                2141
1  208003            wrangler
2  208004            wrangler
3  206001               eagle
4  301001                qute


In [5]:
# Semantic similarity processing
# Create TF-IDF vectorizer
vectorizer = TfidfVectorizer().fit(pd.concat([vehicles_reference['REFERENCIA'], fasecolda_references_combined['ReferenciaCombinada']]))
# Transform the text data into TF-IDF vectors
vehicles_tfidf = vectorizer.transform(vehicles_reference['REFERENCIA'])
fasecolda_tfidf = vectorizer.transform(fasecolda_references_combined['ReferenciaCombinada'])
# Compute cosine similarity
similarity_matrix = cosine_similarity(vehicles_tfidf, fasecolda_tfidf)
# Find the best matches
matches = similarity_matrix.argmax(axis=1)
# Get the matched codes
matched_codes = fasecolda_references_combined.iloc[matches].reset_index().set_index('index')['Codigo']
vehicles_dataset['Codigo_Fasecolda'] = matched_codes.values
vehicles_dataset.head()

Unnamed: 0,PLACA,MARCA,LINEA,REFERENCIA,Codigo_Fasecolda,Referencia_asignada
0,1,KIA,PICANTO,[3] VIBRANT MT 1250CC 2AB ABS AA R13,4601251,
1,2,KIA,PICANTO,MORNING 1.1 AT 1100CC AA,3201294,
2,3,CHEVROLET,N300,,101001,
3,4,JAC,HFC1035KN,JHR + MT 2700CC TD 4X2 ABS,11311041,
4,5,KIA,RIO UB EX,SPICE 1.4 MT 1400CC 4P ABS 2AB CT,4601149,
