In [1]:
# Mount Google Drive into the Colab filesystem so the project files under
# /content/drive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
cur_path = "./drive/Othercomputers/Vivian's MacBook Pro 2021/DS5720/FinalProject"
os.chdir(cur_path)
!pwd

/content/drive/Othercomputers/Vivian's MacBook Pro 2021/DS5720/FinalProject


# 1. Import data

In [3]:
import pandas as pd

# Load the task-2 training examples and the product catalogue.
task = pd.read_csv("./data/task2/train-v0.2.csv")
mega_table = pd.read_csv("./data/product_catalogue-v0.2.csv")

# Left-join the catalogue onto the examples by product id and locale, then
# replace every missing value with an empty string.
merged_df = task.merge(
    mega_table,
    how='left',
    left_on=['product_id', 'query_locale'],
    right_on=['product_id', 'product_locale'],
).fillna('')

## 1.1 select language

In [4]:
# Locale code of the rows to keep.
lang = 'es'
# Keep only rows whose product_locale equals `lang`. The explicit .copy()
# makes the subset an independent frame, so the feature columns assigned to
# merged_df in later cells do not raise pandas' SettingWithCopyWarning.
merged_df = merged_df[merged_df['product_locale'] == lang].copy()

# 2. Add Features

## 2.1 Whether the query appears in the product fields (title, description, bullet points, brand, color name)

In [5]:
def is_included_in_title(row):
  """Return 1 if the row's query occurs, case-insensitively, in its product title, else 0.

  `row` must provide string values under 'query' and 'product_title'.
  """
  needle = row['query'].lower()
  haystack = row['product_title'].lower()
  return 1 if needle in haystack else 0
    
def is_included_in_description(row):
  """Return 1 if the row's query occurs, case-insensitively, in its product description, else 0.

  `row` must provide string values under 'query' and 'product_description'.
  """
  needle = row['query'].lower()
  haystack = row['product_description'].lower()
  return 1 if needle in haystack else 0

def is_included_in_bullet_point(row):
  """Return 1 if the row's query occurs, case-insensitively, in its product bullet points, else 0.

  `row` must provide string values under 'query' and 'product_bullet_point'.
  """
  needle = row['query'].lower()
  haystack = row['product_bullet_point'].lower()
  return 1 if needle in haystack else 0

def is_included_in_brand(row):
  """Return 1 if the row's query occurs, case-insensitively, in its product brand, else 0.

  `row` must provide string values under 'query' and 'product_brand'.
  """
  needle = row['query'].lower()
  haystack = row['product_brand'].lower()
  return 1 if needle in haystack else 0
    
def is_included_in_color_name(row):
  """Return 1 if the row's query occurs, case-insensitively, in its product color name, else 0.

  `row` must provide string values under 'query' and 'product_color_name'.
  """
  needle = row['query'].lower()
  haystack = row['product_color_name'].lower()
  return 1 if needle in haystack else 0

def is_included_in_any(row):
  """Combine the five per-field inclusion flags of a row with logical OR.

  The flags are read in the order title, description, bullet point, brand,
  color name. The first truthy flag value is returned as-is; when none is
  truthy, the color-name flag's value is returned.
  """
  flag_names = (
    'is_included_in_title',
    'is_included_in_description',
    'is_included_in_bullet_point',
    'is_included_in_brand',
  )
  for flag_name in flag_names:
    flag = row[flag_name]
    if flag:
      return flag
  return row['is_included_in_color_name']

In [6]:
# Compute each per-field inclusion flag row by row; every result column is
# named after the function that produces it.
for feature_fn in (
  is_included_in_title,
  is_included_in_description,
  is_included_in_bullet_point,
  is_included_in_brand,
  is_included_in_color_name,
):
  merged_df[feature_fn.__name__] = merged_df.apply(feature_fn, axis=1)

In [7]:
# OR together the five per-field flags computed above into a single
# "query appears in at least one field" column.
merged_df['is_included_in_any'] = merged_df.apply(is_included_in_any, axis=1)


## 2.2 Embedding similarity between query and product title

In [10]:
# def get_query_encoding(row):
#   return model.encode(row['query'])
from scipy.spatial.distance import cosine

def get_sim2(row):
  """Cosine similarity between the embeddings of a row's query and product title.

  Looks up the row's 'query' in the global `query_embeddings` frame (column
  'Query') and its 'product_title' in the global `product_embeddings` frame
  (column 'Product'), then compares the two 'combined' vectors.

  Args:
    row: mapping-like object (e.g. a DataFrame row) providing 'query' and
      'product_title'.

  Returns:
    1 minus the cosine distance of the two vectors, or 0 when an embedding
    cannot be found or the similarity cannot be computed.
  """
  query_text = row['query']
  title_text = row['product_title']
  try:
    # Each lookup filters a whole embedding frame and takes the first match.
    query_vec = query_embeddings[query_embeddings['Query'] == query_text]['combined'].values[0]
    title_vec = product_embeddings[product_embeddings['Product'] == title_text]['combined'].values[0]
    return 1 - cosine(query_vec, title_vec)
  except Exception:
    # Best-effort fallback, e.g. IndexError when no embedding matches.
    # `except Exception` (not a bare `except:`) lets KeyboardInterrupt and
    # SystemExit propagate, so a long-running apply can still be interrupted.
    return 0
  

In [11]:
# Load the query embedding table.
query_embeddings = pd.read_csv("./data/query_embedding.csv")
# Keep only the remaining columns (presumably the embedding dimensions --
# TODO confirm) by dropping the unnamed leading column and the query text.
query_embedding_only = query_embeddings.drop(columns=['Unnamed: 0', 'Query'])
# Pack each row's remaining values into a single list, and expose that list
# on the full table as 'combined' for the similarity lookup.
query_embedding_only['combined'] = query_embedding_only.values.tolist()
query_embeddings['combined'] = query_embedding_only['combined']

In [12]:
# Load the product embedding table.
product_embeddings = pd.read_csv("./data/product_embedding.csv")
# Keep only the remaining columns (presumably the embedding dimensions --
# TODO confirm) by dropping the unnamed leading column and the product text.
product_embedding_only = product_embeddings.drop(columns=['Unnamed: 0', 'Product'])
# Pack each row's remaining values into a single list, and expose that list
# on the full table as 'combined' for the similarity lookup.
product_embedding_only['combined'] = product_embedding_only.values.tolist()
product_embeddings['combined'] = product_embedding_only['combined']

In [16]:
from tqdm import tqdm

# Create new `pandas` methods which use `tqdm` progress
# (can use tqdm_gui, optional kwargs, etc.)
# Must run before the progress_apply call below, which it provides.
tqdm.pandas()

# Score every row with get_sim2 and store the result as the 'sim' feature.
# The recorded run took about 1h14m for 249,721 rows (~57 rows/s).
merged_df['sim'] = merged_df.progress_apply(get_sim2, axis=1)

100%|██████████| 249721/249721 [1:13:36<00:00, 56.54it/s]


# 3. KNN

In [18]:
# Feature matrix: the six inclusion flags plus the embedding similarity.
feature_columns = [
  'is_included_in_title',
  'is_included_in_description',
  'is_included_in_bullet_point',
  'is_included_in_brand',
  'is_included_in_color_name',
  'is_included_in_any',
  'sim',
]
X = merged_df[feature_columns]

# Target: ESCI label names encoded as integers (exact=4 ... irrelevant=1).
label_codes = {'exact': 4, 'substitute': 3, 'complement': 2, 'irrelevant': 1}
y = merged_df['esci_label'].replace(label_codes)
y

0         4
1         4
2         4
3         4
4         4
         ..
249716    3
249717    2
249718    2
249719    2
249720    2
Name: esci_label, Length: 249721, dtype: int64

In [19]:
from sklearn.model_selection import train_test_split
# NOTE(review): wildcard import; the KNN sweep below uses accuracy_score and
# f1_score from it. Other cells may rely on further names -- verify before
# narrowing it to explicit imports.
from sklearn.metrics import *
from sklearn.neighbors import KNeighborsClassifier

# Hold out 33% of the rows as a test set; random_state pins the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [37]:
# Sweep n_neighbors over 75, 79, ..., 199 and report test-set scores.
# In the recorded output accuracy and micro-averaged F1 are identical for
# every n.
for n in range(75, 200, 4):
  # fit() returns the estimator itself, so construction and fitting chain.
  neigh = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)
  y_pred = neigh.predict(X_test)
  acc, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred, average='micro')
  print(f"{n}: Acc={acc}, f1={f1}")

75: Acc=0.5632462867682749, f1=0.5632462867682749
79: Acc=0.5640593146296476, f1=0.5640593146296476
83: Acc=0.5641806620716435, f1=0.5641806620716435
87: Acc=0.5641806620716435, f1=0.5641806620716435
91: Acc=0.5642049315600427, f1=0.5642049315600427
95: Acc=0.5646417823512281, f1=0.5646417823512281
99: Acc=0.5644840306766333, f1=0.5644840306766333
103: Acc=0.5649330162120183, f1=0.5649330162120183
107: Acc=0.5654790797009999, f1=0.5654790797009999
111: Acc=0.5654669449568003, f1=0.5654669449568003
115: Acc=0.5659159304921852, f1=0.5659159304921852
119: Acc=0.5659644694689836, f1=0.5659644694689836
123: Acc=0.5658188525385884, f1=0.5658188525385884
127: Acc=0.5658916610037861, f1=0.5658916610037861
131: Acc=0.5661950296087759, f1=0.5661950296087759
135: Acc=0.5662678380739734, f1=0.5662678380739734
139: Acc=0.5662557033297738, f1=0.5662557033297738
143: Acc=0.566219299097175, f1=0.566219299097175
147: Acc=0.5659766042131832, f1=0.5659766042131832
151: Acc=0.5661343558877779, f1=0.566134

In [35]:
# Lists the odd values 25, 27, ..., 73.
# NOTE(review): looks like a scratch check of a candidate n_neighbors grid;
# no other visible cell uses the result -- confirm and remove if unused.
list(range(25, 75, 2))

[25,
 27,
 29,
 31,
 33,
 35,
 37,
 39,
 41,
 43,
 45,
 47,
 49,
 51,
 53,
 55,
 57,
 59,
 61,
 63,
 65,
 67,
 69,
 71,
 73]

0.47537860401902726

In [None]:
# Save the feature-augmented frame to CSV, without writing the index as a
# column.
merged_df.to_csv("./data/merged_temp.csv", index=False)