# Synonym detection

**NOTE**: This notebook depends upon the the Retrotech dataset. If you have any issues, please rerun the [Setting up the Retrotech Dataset](../ch4/1.ch4-setting-up-the-retrotech-dataset.ipynb) notebook.

In [1]:
# !pip install gensim
import pandas as pd
# import numpy as np
# from tqdm import tqdm
# import nltk
# nltk.download('wordnet')
# from nltk.corpus import wordnet 
# import re
# from nltk.tokenize import RegexpTokenizer
# from gensim.models import Word2Vec
# from collections import defaultdict
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.neighbors import NearestNeighbors
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("aips-ch6").getOrCreate()

In [2]:
# !cp 'synonym detection-v2.ipynb' '/Users/kabdelfatah/Downloads/ai-powered-search/'
!pwd

/home/jovyan/notebooks/ch6


In [3]:
signals = pd.read_json("../data/temp/signal_sample.json")
signals.to_csv("../data/temp/signal_sample.csv")

In [4]:
signals.head()

Unnamed: 0,query_s,user_id_s,doc_id_s
0,jillian micheals,b9acb9cf6d812521c48260f53f5febede8090e28,2620821
1,pda,7c2b958123c82df86321be5caf4dd7934f1880c1,1232905
2,sony radio,abda6be4b049ffb94ae07a6aa67c4ff5391b1125,7022839
3,car,1e61403ce24dabe228850f4d7aca4e1f83d22252,7310411
4,pioneer speakers,872826507ab4c66ef099dc8a05606c131ad5c4fb,2994224


In [5]:
csvFile = "../data/temp/signal_sample.csv"
csvDF = spark.read.format("com.databricks.spark.csv").option("header", "true").option("inferSchema", "true").load(csvFile)
csvDF.registerTempTable('signals')


In [6]:
# spark.sql('''select count(1) from signals where type='query' ''').show(100)

In [7]:
# keyword_click_product = spark.sql('''
# select q.user, keyword,upc 
# from
# (select user,query_id,lower(target) as keyword from signals
# where type='query') as q
# join
# (select user, query_id,target as upc from signals
# where type='click') as c on q.query_id = c.query_id and q.user=c.user
# group by q.user, keyword,upc 
# ''')
# keyword_click_product.registerTempTable('keyword_click_product')
# spark.sql('''select count(1) as keyword_click_product from keyword_click_product''').show() #647684

In [8]:
# spark.sql('''
# select query_s, count(1) as n_users from signals group by query_s
# ''').registerTempTable('keyword_user_oc')
# spark.sql('''select count(1) as keyword_user_oc from keyword_user_oc''').show()


In [9]:
# spark.sql('''
# select query1, query2,count(distinct user_id_s) as cooc from(
#     select k1.query_s as query1, k2.query_s as query2, k1.user_id_s
#     from
#     signals as k1 
#     join
#     signals as k2
#     on k1.user_id_s = k2.user_id_s
#     where k1.query_s > k2.query_s 
# ) query_pair
# group by query1, query2
# ''').registerTempTable('query_pair_cooc')
# spark.sql('''select count(1) as query_pair_cooc from query_pair_cooc''').show()
# spark.sql('''select * from query_pair_cooc''').show(100)


In [10]:
spark.sql('''
select lower(query_s) as keyword, user_id_s, doc_id_s as upc from signals
''').registerTempTable('keyword_click_product')

In [11]:
spark.sql('''
select k1.keyword as k1, k2.keyword as k2, sum(p1) n_users1,sum(p2) n_users1, sum(p1+p2) as users_cooc, count(1) n_products
from
(select keyword, upc, count(1) as p1 from keyword_click_product group by keyword, upc) as k1 
join
(select keyword, upc, count(1) as p2 from keyword_click_product group by keyword, upc) as k2
on k1.upc = k2.upc
where k1.keyword > k2.keyword 
group by k1.keyword, k2.keyword
''').registerTempTable('keyword_click_product_cooc')
spark.sql('''select count(1) as keyword_click_product_cooc from keyword_click_product_cooc''').show()


+--------------------------+
|keyword_click_product_cooc|
+--------------------------+
|                      9318|
+--------------------------+



In [12]:
spark.sql('''
select keyword, count(1) as n_users from keyword_click_product group by keyword
''').registerTempTable('keyword_click_product_oc')
spark.sql('''select count(1) as keyword_click_product_oc from keyword_click_product_oc''').show()


+------------------------+
|keyword_click_product_oc|
+------------------------+
|                    4886|
+------------------------+



In [13]:
spark.sql('''
select k1.keyword as k1, k2.keyword as k2, k1_k2.users_cooc, k1.n_users as n_users1,k2.n_users as n_users2,
k1_k2.users_cooc/(k1.n_users*k2.n_users) as pmi2
from
keyword_click_product_cooc as k1_k2 
join
keyword_click_product_oc as k1 on k1_k2.k1 = k1.keyword
join
keyword_click_product_oc as k2 on k1_k2.k2 = k2.keyword
''').registerTempTable('related_keywords_pmi')
spark.sql('''select count(1) as related_keywords_pmi from related_keywords_pmi''').show()


+--------------------+
|related_keywords_pmi|
+--------------------+
|                9318|
+--------------------+



In [14]:
data = spark.sql('''
select * from(
select k1, k2, pmi2, 
row_number() over (PARTITION BY k1 order by pmi2 desc ) rnum
from related_keywords_pmi 
where users_cooc > 10 and pmi2 > 0.01 
and k1 in ('lcd tv', 'ipad', 'laptop', 'iphone 4') ) x where rnum <= 20 ''').show(500)


+------+--------------------+--------------------+----+
|    k1|                  k2|                pmi2|rnum|
+------+--------------------+--------------------+----+
|laptop|        apple laptop|              0.0325|   1|
|laptop|               apple|0.019444444444444445|   2|
|  ipad|   hp with dre beats| 0.36585365853658536|   1|
|  ipad|        apple ipad 2| 0.36585365853658536|   2|
|  ipad|     apple computers| 0.18292682926829268|   3|
|  ipad|             i pad 2| 0.17073170731707318|   4|
|  ipad|               i pad| 0.09024390243902439|   5|
|  ipad|          apple ipad| 0.08536585365853659|   6|
|  ipad|             hp ipad| 0.06707317073170732|   7|
|  ipad|            hp touch| 0.03902439024390244|   8|
|  ipad|        hp touch pad|0.028455284552845527|   9|
|  ipad|               apple| 0.02100271002710027|  10|
|  ipad|         hp touchpad|0.018847006651884702|  11|
|  ipad|           hp tablet| 0.01662971175166297|  12|
|  ipad|                  hp| 0.0112570356472795

### Step 1: data cleaning
perform minimum stemming on queries and drop rare queries with count of 1

In [15]:
# product_description = pd.read_json("../data/temp/product_description.json")
# signals = pd.read_json("../data/temp/signal_sample.json")
# signals["query"] = signals["query_s"].apply(lambda x: re.sub("s$","", x.lower())) #conduct minimum stemming

In [16]:
csvFile = "../data/retrotech/products.csv"
import csv
reader = csv.reader(open(csvFile,'r'))
products = []
prod2 = []
for r in reader:
    if len(r)> 5:
        prod2.append(r)
        continue
    products.append(r)
    

In [17]:
# len(products), len(prod2)
products[0:2]

[['upc', 'name', 'manufacturer', 'shortDescription', 'longDescription'],
 ['096009010836', 'Fists of Bruce Lee - Dolby - DVD', '\\N', '\\N', '\\N']]

In [18]:
aggr_signals = aggr_signals[aggr_signals["count"] > 1]
aggr_signals.shape[0]

NameError: name 'aggr_signals' is not defined

### Step 2: find synonym from wordnet
get candidate from wordnet if both query and candidate are included in wordnet. 

In [19]:
synonym = []
queries = []

for query in tqdm(aggr_signals["query"]):
    for synset in wordnet.synsets(query):
        for lemma in synset.lemmas():
            candidate = lemma.name().replace("_", " ")
            if candidate in list(aggr_signals["query"]) and candidate != query:
                queries.append(query)
                synonym.append(candidate)
                
wordnet_result = pd.DataFrame({"queries":queries,"synonym":synonym}).drop_duplicates()
wordnet_result.head()

NameError: name 'tqdm' is not defined

### Step 3: find synonym from word2vec

In [20]:
# combine queries and product descriptions to be fed into word2vec model
tokenizer = RegexpTokenizer(r'\w+')
tokenized_description = [tokenizer.tokenize(text.lower()) for text in product_description.longDescription]
tokenized_query = [tokenizer.tokenize(text) for text in aggr_signals["query"]]
tokenized_all = tokenized_query + tokenized_description

NameError: name 'RegexpTokenizer' is not defined

In [21]:
# compute word2vec for each word
model = Word2Vec(tokenized_all, vector_size=150, window=8, min_count=1, workers=-1)
word2vec = dict(zip(model.wv.index_to_key, model.wv.vectors))

NameError: name 'Word2Vec' is not defined

In [22]:
# build word weights dictionary to weigh word vectors, so that rare words get more weights

tfidf = TfidfVectorizer(analyzer=lambda x: x)
tfidf.fit(tokenized_all)

max_idf = max(tfidf.idf_) #the default idf is the max of idf's for unseen words
weights = defaultdict(lambda: max_idf, [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

NameError: name 'TfidfVectorizer' is not defined

In [23]:
query_vectors = []
tokenized_query_clean = [x for x in tokenized_query if x != []]
queries_cleaned = [" ".join(x) for x in tokenized_query_clean]

for tokens in tokenized_query_clean:
    tmp = []
    for token in tokens:
        if token in word2vec.keys():
            tmp.append(word2vec[token] * weights[token])
        else:
            tmp.append(np.zeros(150))
    query_vectors.append(np.mean(tmp, axis=0).tolist())

NameError: name 'tokenized_query' is not defined

In [24]:
knn = NearestNeighbors(n_neighbors=5, metric='cosine', algorithm='brute', n_jobs=-1)
knn.fit(np.stack(query_vectors))

NameError: name 'NearestNeighbors' is not defined

In [25]:
queries = []
synonym = []
cosine = []

for i in tqdm(range(len(queries_cleaned))):
    synonym_candidates = knn.kneighbors(np.expand_dims(query_vectors[i],axis=0), n_neighbors=6)
    query_index = synonym_candidates[1][0].tolist()[1:] #drop first candidate which is the same of the original query
    cosine_similarity = [1-x for x in synonym_candidates[0][0].tolist()[1:]]
    query = queries_cleaned[i]
    for j in range(0,5):
        queries.append(query)
        synonym.append(queries_cleaned[query_index[j]])
        cosine.append(cosine_similarity[j])

NameError: name 'tqdm' is not defined

In [26]:
synonym_candidates = pd.DataFrame({"queries":queries, "synonym":synonym, "cosine":cosine})
synonym_candidates.head(20)

Unnamed: 0,queries,synonym,cosine


In [27]:
# apply cosine similarity threshold to further filter candidate synonyms
cosine_threshold = 0.7
word2vec_result = synonym_candidates[synonym_candidates.cosine >= cosine_threshold][["queries","synonym"]]
word2vec_result

Unnamed: 0,queries,synonym


### Step 4: combine wordnet and word2vec lists
the pair query A and synonym B is the same as the pair query B and synonym A, sort query and synonym to keep only 1 pair.

In [28]:
combined_result = wordnet_result.append(word2vec_result)

def sort_pair (row): 
    return "_".join(sorted([row['queries'], row['synonym']])) 

combined_result["sorted_pair"] = combined_result.apply(lambda row: sort_pair(row), axis=1)
final_result = combined_result.groupby("sorted_pair").first().reset_index().drop(["sorted_pair"],axis=1)
len(final_result)

NameError: name 'wordnet_result' is not defined

In [29]:
final_result

NameError: name 'final_result' is not defined