## Import Libraries

In [36]:
import nltk
import pandas as pd
import string


## Load csv files to data frames


In [37]:
train_df = pd.read_csv('data/train.csv',encoding = "latin1")
test_df = pd.read_csv('data/test.csv',encoding = "latin1")
attributes_df = pd.read_csv('data/attributes.csv',encoding = "latin1")
product_descriptions_df = pd.read_csv('data/product_descriptions.csv',encoding = "latin1")
sample_submission_df = pd.read_csv('data/sample_submission.csv', encoding = "latin1")


In [38]:
print(train_df.dtypes)

id                 int64
product_uid        int64
product_title     object
search_term       object
relevance        float64
dtype: object


In [39]:
print(product_descriptions_df.dtypes)

product_uid             int64
product_description    object
dtype: object


In [40]:
print(product_descriptions_df.dtypes)

product_uid             int64
product_description    object
dtype: object


In [41]:
print(product_descriptions_df.head())

   product_uid                                product_description
0       100001  Not only do angles make joints stronger, they ...
1       100002  BEHR Premium Textured DECKOVER is an innovativ...
2       100003  Classic architecture meets contemporary design...
3       100004  The Grape Solar 265-Watt Polycrystalline PV So...
4       100005  Update your bathroom with the Delta Vero Singl...


# Data Cleaning

Download NLTK's stopwords list and WordNetLemmatizer

In [42]:
lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()
stopwords=nltk.corpus.stopwords.words('english')
#nltk.download('stopwords')
#nltk.download('wordnet')


Process and tokenize the raw text by:
    1. Convert to lower case
    2. Remove apostrophe
    3. Remove Punctuation
    4. Lemmatize

In [43]:
def process(text, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
   
    # convert text to lower case
    lowercase = str(text).lower()
    
    #remove 's from string
    apoRemoved = lowercase.replace("'s","")
    
    #convert don't to dont
    apoRemoved = apoRemoved.replace("'","")
    
    #handle other punctuations
    transtable = str.maketrans(string.punctuation,"                                ")
    brokenWords = apoRemoved.translate(transtable)
    
    #convert string to list of words
    listOfWords =  nltk.word_tokenize(brokenWords)
    
    #lemmatize text
    lemmatizedList=[lemmatizer.lemmatize(word) for word in listOfWords]
   
    return lemmatizedList


In [44]:
# test case
text = "Dogs Here's don't cars."
print(process(text))

['dog', 'here', 'dont', 'car']


In [45]:
# process train_df
def process_train_df(df, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    
    newdf=df
    for i,row in newdf.iterrows():
#         if i>5:
#             break
#         print("raw: ",text)
        newdf.at[i,'product_title'] = process(row['product_title'])
#         print("processed: ",df.iloc[i]['product_title'])
    return newdf


In [46]:
processed_train_df = process_train_df(train_df)
lentr = len(processed_train_df['product_uid'])
print(processed_train_df.head())

   id  product_uid                                      product_title  \
0   2       100001           [simpson, strong, tie, 12, gauge, angle]   
1   3       100001           [simpson, strong, tie, 12, gauge, angle]   
2   9       100002  [behr, premium, textured, deckover, 1, gal, sc...   
3  16       100005  [delta, vero, 1, handle, shower, only, faucet,...   
4  17       100005  [delta, vero, 1, handle, shower, only, faucet,...   

          search_term  relevance  
0       angle bracket       3.00  
1           l bracket       2.50  
2           deck over       3.00  
3    rain shower head       2.33  
4  shower only faucet       2.67  


In [47]:
print(lentr)

74067


In [48]:
# process 
def process_product_descriptions_df(df, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    
    newdf=df
    for i,row in newdf.iterrows():
#         if i>5:
#             break
#         print("raw: ",text)
        newdf.at[i,'product_description'] = process(row['product_description'])
#         print("processed: ",df.iloc[i]['product_title'])
    return newdf

In [50]:
processed_product_descriptions_df = process_product_descriptions_df(product_descriptions_df)
print(processed_product_descriptions_df.head())

   product_uid                                product_description
0       100001  [not, only, do, angle, make, joint, stronger, ...
1       100002  [behr, premium, textured, deckover, is, an, in...
2       100003  [classic, architecture, meet, contemporary, de...
3       100004  [the, grape, solar, 265, watt, polycrystalline...
4       100005  [update, your, bathroom, with, the, delta, ver...


In [51]:
print(processed_product_descriptions_df["product_description"].head(5) )

0    [not, only, do, angle, make, joint, stronger, ...
1    [behr, premium, textured, deckover, is, an, in...
2    [classic, architecture, meet, contemporary, de...
3    [the, grape, solar, 265, watt, polycrystalline...
4    [update, your, bathroom, with, the, delta, ver...
Name: product_description, dtype: object


In [52]:
# print(processed_product_descriptions_df["product_description"].head(5) )
# from nltk.corpus import wordnet
# import numpy as np
# query = 'shower head'
# qa= query.split(" ")
# c=0
# w1 = wordnet.synsets("dog")[0]
# w2 = wordnet.synsets("animal")[0]
# print(w1.wup_similarity(w2))
# sLength = len(processed_product_descriptions_df["product_description"])
# print(sLength)
# e = pd.Series(np.random.randn(sLength))
# processed_product_descriptions_df = processed_product_descriptions_df.assign(e=e.values)
# print(processed_product_descriptions_df.head())
# print('-----------------------------------------')
# total_sim=0
# word_count=0
# for index,rr in processed_product_descriptions_df.iterrows():
#     #print(lin)
#     lin = rr["product_description"]
#     query=rr[""]
#     query=rr[]
    
#     for word1 in lin:
#         w1 = wordnet.synsets(word1)
#         for word2 in qa:
#             w2 = wordnet.synsets(word2)
#             if w1 and w2:
#                 sim = w1[0].wup_similarity(w2[0])
#                 #print(sim)
#                 if sim!=None and sim > .1:
#                     total_sim=total_sim+sim
#                     word_count=word_count+1
#     if word_count==0:
#          rr['e']=None
#     else:
#         rr['e'] = total_sim/word_count
#     print(rr['e'])
        
        
                
#     c=c+1
#     if c> 4:
#         break
# print(c)
    #if c> 4:
        #break
# query = "shower head"
# from lsa.search.machine import SearchMachine
# sm = SearchMachine(latent_dimensions=150, index_backend='lsa.keeper.backends.JsonIndexBackend',
#                    keep_index_info={'path_to_index_folder': 'index'},
#                    db_backend='lsa.db.mysql.MySQLBackend',
#                    db_credentials={'db_name': 'news', 'user': 'root', 'password': 'one2012gtr'},
#                    tables_info={
#                        'news_news': {'fields': ('title', 'text'), 'pk_field_name': 'id', 'prefix': '', 'where': 'id < 300'}
#                    },
#                    decimals=3,
#                    use_tf_idf=False
#                    )

# sm.build_index()
# res = sm.search('natural language query', with_distances=True, limit=10)
# print(res)

SyntaxError: invalid syntax (<ipython-input-52-7d30f4c50fe8>, line 22)

In [54]:
processed_product_descriptions_df.head()

Unnamed: 0,product_uid,product_description
0,100001,"[not, only, do, angle, make, joint, stronger, ..."
1,100002,"[behr, premium, textured, deckover, is, an, in..."
2,100003,"[classic, architecture, meet, contemporary, de..."
3,100004,"[the, grape, solar, 265, watt, polycrystalline..."
4,100005,"[update, your, bathroom, with, the, delta, ver..."


In [55]:
df_merge = pd.merge(processed_train_df,processed_product_descriptions_df, on='product_uid', how='left')

In [56]:
df_merge.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description
0,2,100001,"[simpson, strong, tie, 12, gauge, angle]",angle bracket,3.0,"[not, only, do, angle, make, joint, stronger, ..."
1,3,100001,"[simpson, strong, tie, 12, gauge, angle]",l bracket,2.5,"[not, only, do, angle, make, joint, stronger, ..."
2,9,100002,"[behr, premium, textured, deckover, 1, gal, sc...",deck over,3.0,"[behr, premium, textured, deckover, is, an, in..."
3,16,100005,"[delta, vero, 1, handle, shower, only, faucet,...",rain shower head,2.33,"[update, your, bathroom, with, the, delta, ver..."
4,17,100005,"[delta, vero, 1, handle, shower, only, faucet,...",shower only faucet,2.67,"[update, your, bathroom, with, the, delta, ver..."


In [86]:
print(df_merge["product_description"].head(5) )
from nltk.corpus import wordnet
import numpy as np
query = 'shower head'
qa= query.split(" ")
c=0
w1 = wordnet.synsets("dog")[0]
w2 = wordnet.synsets("animal")[0]
print(w1.wup_similarity(w2))
sLength = len(df_merge["product_description"])
print(sLength)
e = pd.Series(np.random.randn(sLength))
df_merge = df_merge.assign(e=e.values)
e2 = pd.Series(np.random.randn(sLength))
df_merge = df_merge.assign(e2=e2.values)
e3 = pd.Series(np.random.randn(sLength))
df_merge = df_merge.assign(e3=e3.values)
e4 = pd.Series(np.random.randn(sLength))
df_merge = df_merge.assign(e4=e4.values)
print(df_merge.head())
print('-----------------------------------------')
total_sim=0
word_count=0
from fuzzywuzzy import fuzz
for index,rr in df_merge.iterrows():
    print('-----------------------------------------')
    rr['e']=None
    rr['e2']=None
    rr['e3']=None
    rr['e4']=None
    
    
    
    #print(lin)
    total_sim=0
    total_sim2=0
    word_count=0
    word_count2=0
    lin = rr["product_title"]
    query=rr["search_term"]
    qa= query.split(" ")
    lin2= rr["product_description"]
    for word1 in lin:
        w1 = wordnet.synsets(word1)
        for word2 in qa:
            sim2=fuzz.ratio(word1, word2)
            if sim2!=None and sim2 > 35:
                    total_sim2=total_sim2+sim2
                    word_count2=word_count2+1
            
            w2 = wordnet.synsets(word2)
            
            if w1 and w2:
                sim = w1[0].wup_similarity(w2[0])
                #print(sim)
                if sim!=None and sim > .5:
                    total_sim=total_sim+sim
                    word_count=word_count+1
    if word_count==0:
         rr['e']=None
    else:
        rr['e'] = total_sim/word_count
    #print(rr['e'])
    if word_count2==0:
         rr['e3']=None
    else:
        rr['e3'] = total_sim2/word_count2
    #print(rr['e3'])
    print( str(rr['e'])+'\t'+str(rr['e2'])+'\t'+str(rr['e3'])+'\t'+str(rr['e4']) )
    total_sim=0
    total_sim2=0
    word_count=0
    word_count=0
    word_count2=0
    for word1 in lin2:
        w1 = wordnet.synsets(word1)
        for word2 in qa:
            sim2=fuzz.ratio(word1, word2)
            if sim2!=None and sim2 > 35:
                    total_sim2=total_sim2+sim2
                    word_count2=word_count2+1
            
            w2 = wordnet.synsets(word2)
            if w1 and w2:
                sim = w1[0].wup_similarity(w2[0])
                #print(sim)
                if sim!=None and sim > .5:
                    total_sim=total_sim+sim
                    word_count=word_count+1
    if word_count==0:
         rr['e2']=None
    else:
        rr['e2'] = total_sim/word_count
        
    if word_count2==0:
         rr['e4']=None
    else:
#         print('----')
        
#         print(total_sim2)
#         print(word_count2)
#         print('----')
        
        rr['e4'] = total_sim2/word_count2
   # print(rr['e2'])
    
#     delimiter = ' '
#     s1 = delimiter.join(lin)
#     s2 = delimiter.join(lin2)
#     r1=fuzz.ratio(query, s1)
#     rr['e3']=r1
#     r1=fuzz.ratio(query, s2)
    #rr['e4']=None
    print( str(rr['e'])+'\t'+str(rr['e2'])+'\t'+str(rr['e3'])+'\t'+str(rr['e4']) ) 
    
    
    
    
    
    
    
    
    
    

0    [not, only, do, angle, make, joint, stronger, ...
1    [not, only, do, angle, make, joint, stronger, ...
2    [behr, premium, textured, deckover, is, an, in...
3    [update, your, bathroom, with, the, delta, ver...
4    [update, your, bathroom, with, the, delta, ver...
Name: product_description, dtype: object
0.875
74067
   id  product_uid                                      product_title  \
0   2       100001           [simpson, strong, tie, 12, gauge, angle]   
1   3       100001           [simpson, strong, tie, 12, gauge, angle]   
2   9       100002  [behr, premium, textured, deckover, 1, gal, sc...   
3  16       100005  [delta, vero, 1, handle, shower, only, faucet,...   
4  17       100005  [delta, vero, 1, handle, shower, only, faucet,...   

          search_term  relevance  \
0       angle bracket       3.00   
1           l bracket       2.50   
2           deck over       3.00   
3    rain shower head       2.33   
4  shower only faucet       2.67   

                

0.7666666666666666	None	53.2	None
0.7666666666666666	0.8909090909090908	53.2	48.80821917808219
-----------------------------------------
1.0	None	48.0	None
1.0	0.7515881206283681	48.0	48.42307692307692
-----------------------------------------
0.5964912280701754	None	46.0	None
0.5964912280701754	0.6060538250387875	46.0	43.73684210526316
-----------------------------------------
0.8444444444444444	None	89.0	None
0.8444444444444444	0.8666666666666666	89.0	53.225806451612904
-----------------------------------------
0.8305555555555555	None	74.66666666666667	None
0.8305555555555555	0.7754033071183708	74.66666666666667	52.97849462365591
-----------------------------------------
0.8444444444444444	None	63.166666666666664	None
0.8444444444444444	0.6997739050345662	63.166666666666664	49.95652173913044
-----------------------------------------
1.0	None	67.66666666666667	None
1.0	0.7444158036021427	67.66666666666667	57.285714285714285
-----------------------------------------
0.8030228758169935	

KeyboardInterrupt: 

In [63]:
from fuzzywuzzy import fuzz
r1=fuzz.ratio("this is a test", "this is a test!")
print(r1)

97
