## Import Libraries

In [1]:
import nltk
import pandas as pd
import string


## Load csv files to data frames


In [2]:
train_df = pd.read_csv('data/train.csv',encoding = "latin1")
test_df = pd.read_csv('data/test.csv',encoding = "latin1")
attributes_df = pd.read_csv('data/attributes.csv',encoding = "latin1")
product_descriptions_df = pd.read_csv('data/product_descriptions.csv',encoding = "latin1")
sample_submission_df = pd.read_csv('data/sample_submission.csv', encoding = "latin1")


In [3]:
print(train_df.dtypes)

id                 int64
product_uid        int64
product_title     object
search_term       object
relevance        float64
dtype: object


In [4]:
print(product_descriptions_df.dtypes)

product_uid             int64
product_description    object
dtype: object


In [5]:
print(product_descriptions_df.dtypes)

product_uid             int64
product_description    object
dtype: object


In [6]:
print(product_descriptions_df.head())

   product_uid                                product_description
0       100001  Not only do angles make joints stronger, they ...
1       100002  BEHR Premium Textured DECKOVER is an innovativ...
2       100003  Classic architecture meets contemporary design...
3       100004  The Grape Solar 265-Watt Polycrystalline PV So...
4       100005  Update your bathroom with the Delta Vero Singl...


# Data Cleaning

Download NLTK's stopwords list and WordNetLemmatizer

In [7]:
lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()
stopwords=nltk.corpus.stopwords.words('english')
#nltk.download('stopwords')
#nltk.download('wordnet')


Process and tokenize the raw text by:
    1. Convert to lower case
    2. Remove apostrophe
    3. Remove Punctuation
    4. Lemmatize

In [8]:
def process(text, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
   
    # convert text to lower case
    lowercase = str(text).lower()
    
    #remove 's from string
    apoRemoved = lowercase.replace("'s","")
    
    #convert don't to dont
    apoRemoved = apoRemoved.replace("'","")
    
    #handle other punctuations
    transtable = str.maketrans(string.punctuation,"                                ")
    brokenWords = apoRemoved.translate(transtable)
    
    #convert string to list of words
    listOfWords =  nltk.word_tokenize(brokenWords)
    
    #lemmatize text
    lemmatizedList=[lemmatizer.lemmatize(word) for word in listOfWords]
   
    return lemmatizedList


In [9]:
# test case
text = "Dogs Here's don't cars."
print(process(text))

['dog', 'here', 'dont', 'car']


In [10]:
# process train_df
def process_train_df(df, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    
    newdf=df
    for i,row in newdf.iterrows():
#         if i>5:
#             break
#         print("raw: ",text)
        newdf.at[i,'product_title'] = process(row['product_title'])
#         print("processed: ",df.iloc[i]['product_title'])
    return newdf


In [11]:
processed_train_df = process_train_df(train_df)
print(processed_train_df.head())

   id  product_uid                                      product_title  \
0   2       100001           [simpson, strong, tie, 12, gauge, angle]   
1   3       100001           [simpson, strong, tie, 12, gauge, angle]   
2   9       100002  [behr, premium, textured, deckover, 1, gal, sc...   
3  16       100005  [delta, vero, 1, handle, shower, only, faucet,...   
4  17       100005  [delta, vero, 1, handle, shower, only, faucet,...   

          search_term  relevance  
0       angle bracket       3.00  
1           l bracket       2.50  
2           deck over       3.00  
3    rain shower head       2.33  
4  shower only faucet       2.67  


In [12]:
# process 
def process_product_descriptions_df(df, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    
    newdf=df
    for i,row in newdf.iterrows():
#         if i>5:
#             break
#         print("raw: ",text)
        newdf.at[i,'product_description'] = process(row['product_description'])
#         print("processed: ",df.iloc[i]['product_title'])
    return newdf

In [13]:
processed_product_descriptions_df = process_product_descriptions_df(product_descriptions_df)
print(processed_product_descriptions_df.head())

   product_uid                                product_description
0       100001  [not, only, do, angle, make, joint, stronger, ...
1       100002  [behr, premium, textured, deckover, is, an, in...
2       100003  [classic, architecture, meet, contemporary, de...
3       100004  [the, grape, solar, 265, watt, polycrystalline...
4       100005  [update, your, bathroom, with, the, delta, ver...


In [14]:
print(processed_product_descriptions_df["product_description"].head(5) )

0    [not, only, do, angle, make, joint, stronger, ...
1    [behr, premium, textured, deckover, is, an, in...
2    [classic, architecture, meet, contemporary, de...
3    [the, grape, solar, 265, watt, polycrystalline...
4    [update, your, bathroom, with, the, delta, ver...
Name: product_description, dtype: object


In [47]:
#print(processed_product_descriptions_df["product_description"].head(5) )
from nltk.corpus import wordnet
import numpy as np
query = 'shower head'
qa= query.split(" ")
c=0
w1 = wordnet.synsets("dog")[0]
w2 = wordnet.synsets("animal")[0]
print(w1.wup_similarity(w2))
sLength = len(processed_product_descriptions_df["product_description"])
print(sLength)
e = pd.Series(np.random.randn(sLength))
processed_product_descriptions_df = processed_product_descriptions_df.assign(e=e.values)
print(processed_product_descriptions_df.head())
print('-----------------------------------------')
total_sim=0
word_count=0
for index,rr in processed_product_descriptions_df.iterrows():
    #print(lin)
    lin = rr["product_description"]
    for word1 in lin:
        w1 = wordnet.synsets(word1)
        for word2 in qa:
            w2 = wordnet.synsets(word2)
            if w1 and w2:
                sim = w1[0].wup_similarity(w2[0])
                #print(sim)
                if sim!=None and sim > .8:
                    total_sim=total_sim+sim
                    word_count=word_count+1
    if word_count==0:
         rr['e']=None
    else:
        rr['e'] = total_sim/word_count
    print(rr['e'])
        
        
                
#     c=c+1
#     if c> 4:
#         break
# print(c)
    #if c> 4:
        #break
# query = "shower head"
# from lsa.search.machine import SearchMachine
# sm = SearchMachine(latent_dimensions=150, index_backend='lsa.keeper.backends.JsonIndexBackend',
#                    keep_index_info={'path_to_index_folder': 'index'},
#                    db_backend='lsa.db.mysql.MySQLBackend',
#                    db_credentials={'db_name': 'news', 'user': 'root', 'password': 'one2012gtr'},
#                    tables_info={
#                        'news_news': {'fields': ('title', 'text'), 'pk_field_name': 'id', 'prefix': '', 'where': 'id < 300'}
#                    },
#                    decimals=3,
#                    use_tf_idf=False
#                    )

# sm.build_index()
# res = sm.search('natural language query', with_distances=True, limit=10)
# print(res)

0.875
124428
   product_uid                                product_description         e
0       100001  [not, only, do, angle, make, joint, stronger, ... -1.862074
1       100002  [behr, premium, textured, deckover, is, an, in...  1.861570
2       100003  [classic, architecture, meet, contemporary, de...  1.045406
3       100004  [the, grape, solar, 265, watt, polycrystalline...  0.630615
4       100005  [update, your, bathroom, with, the, delta, ver...  0.727016
-----------------------------------------
None
None
None
None
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.9583333333333334
0.9583333333333334
0.9583333333333334
0.9583333333333334
0.9583333333333334
0.9722222222222222
0.9722222222222222
0.9722222222222222
0.9821428571428571
0.9821428571428571
0.9821428571428571
0.9821428571428571
0.9821428571428571
0.9821428571428571
0.9821428571428571
0.9821428571428571
0.9821428571428571
0.9821428571428571
0.9821428571428571
0.9821428571428571
0.9821428571428571
0.9821428571428571
0.9821428571428571


KeyboardInterrupt: 