In [276]:
import pandas as pd
import numpy as np
import nltk
import re
from collections import defaultdict
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy import spatial
from sklearn.manifold import TSNE

In [277]:
sample_df = pd.read_csv('data/sample_df.csv')
metadata = pd.read_csv('data/metadata_df.csv')


In [278]:
meta_df = sample_df.merge(metadata,left_on='item',right_on='asin',how='inner')[['item','user','rating','timestamp','title','description']]

In [279]:
def clean_text(line):
    line = line.strip() # Removing leading/trailing whitespace
    line = re.sub('\[.*\]', '', line) # Remove character heading
    line = re.sub('[^\w\s]', '', line) # Remove punctuation
    line = re.sub(r'\w*\d\w*', '', line)
    line = line.lower() # convert to lower case
    return line
def product_description_gen(df):
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    df.title = df.title.map(lambda x:tokenizer.tokenize(x))
    df.title = df.title.map(lambda x: " ".join(x))
    df.description = df.description.map(lambda x: x.strip('[]'))
    df.description = df.description.map(lambda x:tokenizer.tokenize(x))                                      
    df.description = df.description.map(lambda x:" ".join(x))                                    
#     df['t_desc'] = df['title'] + " " + df['description']
    df['t_desc'] = df['title']
    df['t_desc'] = df['t_desc'].map(lambda x: clean_text(x))
    df.drop(columns = ['title','description'],inplace=True)
    return df

In [280]:
cleaned_df = product_description_gen(meta_df.copy())
cleaned_df.head()

Unnamed: 0,item,user,rating,timestamp,t_desc
0,B000E7QRQK,A1BWG0TDRBYBEG,5.0,1486425600,petsafe scatmat power adapter volts
1,B000E7WI0O,A1PGC1GEBY9CZU,5.0,1454630400,auto antenna for xm satellite radio receivers ...
2,B000E7WI0O,A2PSMNDRT6PRB4,5.0,1448841600,auto antenna for xm satellite radio receivers ...
3,B000E7WI0O,AV1S632VKK72V,5.0,1445212800,auto antenna for xm satellite radio receivers ...
4,B000E7WI0O,A2IRVI53P73NAO,5.0,1416182400,auto antenna for xm satellite radio receivers ...


In [281]:
item_df = cleaned_df.groupby(['t_desc'])['item'].unique().reset_index()
item_df['count'] = item_df['item'].map(lambda x:len(x))
item_df = item_df.sort_values(by='count',ascending=False).reset_index(drop=True)
item_df.head()

Unnamed: 0,t_desc,item,count
0,taotronics anc foldable bluetooth headphones d...,"[B00B9A0HNA, B00JQ1V78I, B00KBQCSPI, B00RSUFT5...",6
1,lg electronics inch led tv model,"[B00BB0ZTM2, B00BB0ZTMM, B00BB9ORUS, B00IL7C8O...",6
2,headphone adapter amp splitter headphone jack ...,"[B007GC4L7S, B00A8S5KIG, B00FEQ429U, B00JDBDPH...",6
3,samsung inch ultra hd smart led tv model,"[B00T48CVWE, B00Z0IBFYM, B01C5TFNSM, B01DR4T8H...",5
4,amazonbasics usb cable a male to micro b fe...,"[B00B7WGH48, B00BCY1AA2, B00NH12R1O, B00NH13BI...",5


In [282]:
temp = item_df.loc[item_df['count']>1]
repeated=[]
for items in temp.item.values:
    for item in items:
        repeated.append(item)
final_df = cleaned_df[~cleaned_df['item'].isin(repeated)]

In [283]:
final_df.t_desc = final_df.t_desc.map(lambda x:tokenizer.tokenize(x)) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [284]:
#preparing pre-trained GLOVE embedding dictionary
embeddings_dict = {}
glove_file = "data/glove.6B.50d.txt"
with open(glove_file, 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [286]:
final_df

Unnamed: 0,item,user,rating,timestamp,t_desc
0,B000E7QRQK,A1BWG0TDRBYBEG,5.0,1486425600,"[petsafe, scatmat, power, adapter, volts]"
1,B000E7WI0O,A1PGC1GEBY9CZU,5.0,1454630400,"[auto, antenna, for, xm, satellite, radio, rec..."
2,B000E7WI0O,A2PSMNDRT6PRB4,5.0,1448841600,"[auto, antenna, for, xm, satellite, radio, rec..."
3,B000E7WI0O,AV1S632VKK72V,5.0,1445212800,"[auto, antenna, for, xm, satellite, radio, rec..."
4,B000E7WI0O,A2IRVI53P73NAO,5.0,1416182400,"[auto, antenna, for, xm, satellite, radio, rec..."
...,...,...,...,...,...
103636,B01CQO1JYS,A3PRNX5PAHT6BH,5.0,1537488000,"[golfbuddy, voice, golf, gps, rangefinder]"
103637,B01DP5SS1O,AP2AY8VC30NF,5.0,1536105600,"[bull, tech, battery, replacement, new, laptop..."
103638,B01DUF3UVC,A3R792U3UKMCYE,5.0,1536969600,"[sony, stereo, cd, cassette, boombox, home, au..."
103639,B01E9EZ0B6,AP3YA6FGML19K,5.0,1536624000,"[ultimate, ears, ue, boom, brainfreeze, wirele..."


In [287]:
#creating embedding for unknown token
with open(glove_file, 'r') as f:
    for i, line in enumerate(f):
        pass
n_vec = i + 1
hidden_dim = len(line.split(' ')) - 1
vecs = np.zeros((n_vec, hidden_dim), dtype=np.float32)

with open(glove_file, 'r') as f:
    for i, line in enumerate(f):
        vecs[i] = np.array([float(n) for n in line.split(' ')[1:]], dtype=np.float32)

average_vec = np.mean(vecs, axis=0)
embeddings_dict['<unk>']=np.array(average_vec)
print(average_vec)

[-0.12920076 -0.28866628 -0.01224866 -0.05676644 -0.20210965 -0.08389011
  0.33359843  0.16045167  0.03867431  0.17833012  0.04696583 -0.00285802
  0.29099807  0.04613704 -0.20923874 -0.06613114 -0.06822549  0.07665912
  0.3134014   0.17848536 -0.1225775  -0.09916984 -0.07495987  0.06413227
  0.14441176  0.60894334  0.17463093  0.05335403 -0.01273871  0.03474107
 -0.8123879  -0.04688699  0.20193407  0.2031118  -0.03935686  0.06967544
 -0.01553638 -0.03405238 -0.06528071  0.12250231  0.13991883 -0.17446303
 -0.08011883  0.0849521  -0.01041659 -0.13705009  0.20127155  0.10069408
  0.00653003  0.01685157]


In [301]:
#generate embeddings for each item id
unique_items = final_df.item.unique()
item_dict = defaultdict(list)
for item in tqdm(unique_items):
    words = final_df[final_df['item']==item]['t_desc'].iat[0]
    item_arr = np.zeros((len(words),50),dtype=np.float32)
    for i,word in enumerate(words):
        if word not in embeddings_dict.keys():
            item_arr[i] = embeddings_dict['<unk>']
        else:
            item_arr[i] = embeddings_dict[word]
    avg_arr = np.mean(item_arr,axis = 0)
    avg_arr = '|'.join(str(x) for x in avg_arr)
    item_dict['item'].append(item)
    item_dict['embedding'].append(avg_arr)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11755/11755 [01:01<00:00, 192.08it/s]


In [303]:
embeddings_df = pd.DataFrame(item_dict)
embeddings_df.to_csv('data/embeddings.csv',index=False)