# Synonym detection

**NOTE**: This notebook depends upon the the Retrotech dataset. If you have any issues, please rerun the [Setting up the Retrotech Dataset](../ch4/1.ch4-setting-up-the-retrotech-dataset.ipynb) notebook.

In [1]:
!pip install gensim
import pandas as pd
import numpy as np
from tqdm import tqdm
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet 
import re
from nltk.tokenize import RegexpTokenizer
from gensim.models import Word2Vec
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors



[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Step 1: data cleaning
perform minimum stemming on queries and drop rare queries with count of 1

In [2]:
product_description = pd.read_json("../data/temp/product_description.json")
signals = pd.read_json("../data/temp/signal_sample.json")
signals["query"] = signals["query_s"].apply(lambda x: re.sub("s$","", x.lower())) #conduct minimum stemming

In [3]:
aggr_signals = pd.DataFrame(signals["query"].value_counts()).reset_index()
aggr_signals.columns = ["query", "count"]
aggr_signals.head()

Unnamed: 0,query,count
0,lcd tv,85
1,ipad,83
2,laptop,80
3,iphone 4,71
4,hp touchpad,67


In [4]:
aggr_signals = aggr_signals[aggr_signals["count"] > 1]
aggr_signals.shape[0]

1348

### Step 2: find synonym from wordnet
get candidate from wordnet if both query and candidate are included in wordnet. 

In [5]:
synonym = []
queries = []

for query in tqdm(aggr_signals["query"]):
    for synset in wordnet.synsets(query):
        for lemma in synset.lemmas():
            candidate = lemma.name().replace("_", " ")
            if candidate in list(aggr_signals["query"]) and candidate != query:
                queries.append(query)
                synonym.append(candidate)
                
wordnet_result = pd.DataFrame({"queries":queries,"synonym":synonym}).drop_duplicates()
wordnet_result.head()

100%|██████████| 1348/1348 [00:02<00:00, 625.73it/s]


Unnamed: 0,queries,synonym
0,laptop,laptop computer
1,headphone,earphone
2,headphone,phone
3,tv,television
5,vacuum,hoover


### Step 3: find synonym from word2vec

In [6]:
# combine queries and product descriptions to be fed into word2vec model
tokenizer = RegexpTokenizer(r'\w+')
tokenized_description = [tokenizer.tokenize(text.lower()) for text in product_description.longDescription]
tokenized_query = [tokenizer.tokenize(text) for text in aggr_signals["query"]]
tokenized_all = tokenized_query + tokenized_description

In [7]:
# compute word2vec for each word
model = Word2Vec(tokenized_all, vector_size=150, window=8, min_count=1, workers=-1)
word2vec = dict(zip(model.wv.index_to_key, model.wv.vectors))

In [8]:
# build word weights dictionary to weigh word vectors, so that rare words get more weights

tfidf = TfidfVectorizer(analyzer=lambda x: x)
tfidf.fit(tokenized_all)

max_idf = max(tfidf.idf_) #the default idf is the max of idf's for unseen words
weights = defaultdict(lambda: max_idf, [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

In [9]:
query_vectors = []
tokenized_query_clean = [x for x in tokenized_query if x != []]
queries_cleaned = [" ".join(x) for x in tokenized_query_clean]

for tokens in tokenized_query_clean:
    tmp = []
    for token in tokens:
        if token in word2vec.keys():
            tmp.append(word2vec[token] * weights[token])
        else:
            tmp.append(np.zeros(150))
    query_vectors.append(np.mean(tmp, axis=0).tolist())

In [10]:
knn = NearestNeighbors(n_neighbors=5, metric='cosine', algorithm='brute', n_jobs=-1)
knn.fit(np.stack(query_vectors))

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=5, p=2, radius=1.0)

In [11]:
queries = []
synonym = []
cosine = []

for i in tqdm(range(len(queries_cleaned))):
    synonym_candidates = knn.kneighbors(np.expand_dims(query_vectors[i],axis=0), n_neighbors=6)
    query_index = synonym_candidates[1][0].tolist()[1:] #drop first candidate which is the same of the original query
    cosine_similarity = [1-x for x in synonym_candidates[0][0].tolist()[1:]]
    query = queries_cleaned[i]
    for j in range(0,5):
        queries.append(query)
        synonym.append(queries_cleaned[query_index[j]])
        cosine.append(cosine_similarity[j])

100%|██████████| 1347/1347 [02:27<00:00,  9.11it/s]


In [14]:
synonym_candidates = pd.DataFrame({"queries":queries, "synonym":synonym, "cosine":cosine})
synonym_candidates.head(20)

Unnamed: 0,queries,synonym,cosine
0,lcd tv,lcd,0.739623
1,lcd tv,tv,0.68721
2,lcd tv,sony lcd,0.524663
3,lcd tv,hd tv,0.512388
4,lcd tv,lcd monitor,0.491351
5,ipad,ipad 2,0.868382
6,ipad,ipad case,0.818139
7,ipad,ipad 1,0.813488
8,ipad,ipad 2 case,0.731535
9,ipad,ipad keyboard,0.716148


In [15]:
# apply cosine similarity threshold to further filter candidate synonyms
cosine_threshold = 0.7
word2vec_result = synonym_candidates[synonym_candidates.cosine >= cosine_threshold][["queries","synonym"]]
word2vec_result

Unnamed: 0,queries,synonym
0,lcd tv,lcd
5,ipad,ipad 2
6,ipad,ipad case
7,ipad,ipad 1
8,ipad,ipad 2 case
...,...,...
6685,led,led tv
6686,led,55 led
6700,hd dvd,hd dvd player
6701,hd dvd,hd


### Step 4: combine wordnet and word2vec lists
the pair query A and synonym B is the same as the pair query B and synonym A, sort query and synonym to keep only 1 pair.

In [16]:
combined_result = wordnet_result.append(word2vec_result)

def sort_pair (row): 
    return "_".join(sorted([row['queries'], row['synonym']])) 

combined_result["sorted_pair"] = combined_result.apply(lambda row: sort_pair(row), axis=1)
final_result = combined_result.groupby("sorted_pair").first().reset_index().drop(["sorted_pair"],axis=1)
len(final_result)

570

In [17]:
final_result

Unnamed: 0,queries,synonym
0,360 elite,elite
1,360 elite,xbox 360 elite
2,360,xbox 360
3,3d glasse,samsung 3d glasse
4,3d,3d tv
...,...,...
565,xbox,xbox console
566,xbox,xbox game
567,xbox,xbox live
568,xm radio,xm
