## Import Libraries

In [24]:
import nltk
import pandas as pd
import string


## Load CSV files into DataFrames


In [25]:
# All five input files live in the same folder and are read with the same
# encoding, so both values are named once here instead of being repeated on
# every call.
DATA_DIR = 'data'
CSV_ENCODING = 'latin1'


def _load_csv(name):
    """Read ``<DATA_DIR>/<name>.csv`` into a DataFrame using CSV_ENCODING."""
    return pd.read_csv(DATA_DIR + '/' + name + '.csv', encoding=CSV_ENCODING)


train_df = _load_csv('train')
test_df = _load_csv('test')
attributes_df = _load_csv('attributes')
product_descriptions_df = _load_csv('product_descriptions')
sample_submission_df = _load_csv('sample_submission')


In [26]:
# Inspect the column names and dtypes of the training data.
print(train_df.dtypes)

id                 int64
product_uid        int64
product_title     object
search_term       object
relevance        float64
dtype: object


In [28]:
# Inspect the column names and dtypes of the product descriptions.
print(product_descriptions_df.dtypes)

product_uid             int64
product_description    object
dtype: object


In [27]:
# NOTE(review): this cell is identical to the preceding one, and the execution
# counts (In [28] before In [27]) are out of order. Presumably a different
# frame (e.g. test_df or attributes_df) was meant to be inspected here --
# confirm the intent or delete this cell.
print(product_descriptions_df.dtypes)

product_uid             int64
product_description    object
dtype: object


In [29]:
# Preview the first rows of the product descriptions.
print(product_descriptions_df.head())

   product_uid                                product_description
0       100001  Not only do angles make joints stronger, they ...
1       100002  BEHR Premium Textured DECKOVER is an innovativ...
2       100003  Classic architecture meets contemporary design...
3       100004  The Grape Solar 265-Watt Polycrystalline PV So...
4       100005  Update your bathroom with the Delta Vero Singl...


# Data Cleaning

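Load NLTK's English stopword list and create a WordNetLemmatizer. Both rely on NLTK data packages (`stopwords` and `wordnet`) that must be downloaded once before use.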

In [30]:
def _ensure_nltk_data(probe, package):
    """Call ``probe``; if NLTK reports missing data, download ``package``.

    NLTK raises LookupError when a corpus it needs is not installed. Probing
    first means the network is only touched when the data is actually absent.
    """
    try:
        probe()
    except LookupError:
        nltk.download(package, quiet=True)


lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

# The lemmatizer loads WordNet lazily, so a real lemmatize() call is needed to
# find out whether the corpus is installed.
_ensure_nltk_data(lambda: lemmatizer.lemmatize('dogs'), 'wordnet')
_ensure_nltk_data(lambda: nltk.corpus.stopwords.words('english'), 'stopwords')

stopwords = nltk.corpus.stopwords.words('english')

# NOTE(review): nltk.word_tokenize (used by process below) needs its own
# tokenizer data as well -- presumably 'punkt' or 'punkt_tab' depending on the
# installed NLTK version; confirm and add it here if a fresh environment fails.


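Process and tokenize the raw text with the following steps:
    1. Convert to lower case
    2. Remove possessive 's and any remaining apostrophes (e.g. "don't" becomes "dont")
    3. Replace all other punctuation with spaces
    4. Split the text into word tokens
    5. Lemmatize each token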

In [31]:
# Maps every punctuation character to a single space. Built once at definition
# time (it never changes) and sized from string.punctuation itself so the two
# arguments can never drift out of sync.
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def process(text, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    """Normalize raw text into a list of lemmatized, lower-case tokens.

    Steps: lower-case, drop possessive ``'s`` and any other apostrophes,
    replace remaining punctuation with spaces, tokenize with
    ``nltk.word_tokenize`` and lemmatize every token.

    Parameters
    ----------
    text : object
        Value to clean. Non-string values are converted with ``str()``.
        ``None`` and float NaN are treated as missing text.
    lemmatizer : object, optional
        Any object exposing ``lemmatize(word)``; defaults to a
        ``WordNetLemmatizer`` created once when the function is defined.

    Returns
    -------
    list of str
        The lemmatized tokens, or an empty list for missing input.
    """
    # Without this guard a missing value would be stringified and come back as
    # the bogus token 'none' / 'nan'. ``text != text`` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return []

    normalized = str(text).lower()

    # Strip the possessive "'s" first, then every remaining apostrophe, so that
    # "here's" -> "here" and "don't" -> "dont" (rather than "don t").
    normalized = normalized.replace("'s", "").replace("'", "")

    # All other punctuation becomes whitespace so it acts as a word separator.
    normalized = normalized.translate(_PUNCT_TO_SPACE)

    return [lemmatizer.lemmatize(token) for token in nltk.word_tokenize(normalized)]


In [32]:
# Smoke test: one sentence that exercises lower-casing, possessive/apostrophe
# removal, punctuation stripping and lemmatization.
text = "Dogs Here's don't cars."
tokens = process(text)
print(tokens)

# Fail loudly if the cleaning pipeline regresses instead of relying on someone
# eyeballing the printed list.
assert tokens == ['dog', 'here', 'dont', 'car'], tokens

['dog', 'here', 'dont', 'car']


In [33]:
# process train_df
def process_train_df(df, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    """Return a copy of ``df`` whose ``product_title`` column is tokenized.

    Each title is passed through ``process`` and replaced by the resulting
    list of lemmatized tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``product_title`` column.
    lemmatizer : object, optional
        Lemmatizer forwarded to ``process``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is left untouched, so re-running the
        calling cell always starts from the raw titles.
    """
    # Work on a copy: assigning ``newdf = df`` would only alias the caller's
    # frame and overwrite its raw titles in place.
    newdf = df.copy()
    newdf['product_title'] = newdf['product_title'].map(
        lambda title: process(title, lemmatizer)
    )
    return newdf


In [34]:
# Tokenize and lemmatize the product titles of the training data, then preview
# the first rows of the result.
processed_train_df = process_train_df(train_df)
print(processed_train_df.head())

   id  product_uid                                      product_title  \
0   2       100001           [simpson, strong, tie, 12, gauge, angle]   
1   3       100001           [simpson, strong, tie, 12, gauge, angle]   
2   9       100002  [behr, premium, textured, deckover, 1, gal, sc...   
3  16       100005  [delta, vero, 1, handle, shower, only, faucet,...   
4  17       100005  [delta, vero, 1, handle, shower, only, faucet,...   

          search_term  relevance  
0       angle bracket       3.00  
1           l bracket       2.50  
2           deck over       3.00  
3    rain shower head       2.33  
4  shower only faucet       2.67  


In [35]:
# process product_descriptions_df
def process_product_descriptions_df(df, lemmatizer=nltk.stem.wordnet.WordNetLemmatizer()):
    """Return a copy of ``df`` whose ``product_description`` column is tokenized.

    Each description is passed through ``process`` and replaced by the
    resulting list of lemmatized tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``product_description`` column.
    lemmatizer : object, optional
        Lemmatizer forwarded to ``process``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is left untouched, so re-running the
        calling cell always starts from the raw descriptions.
    """
    # Work on a copy: assigning ``newdf = df`` would only alias the caller's
    # frame and overwrite its raw descriptions in place.
    newdf = df.copy()
    newdf['product_description'] = newdf['product_description'].map(
        lambda description: process(description, lemmatizer)
    )
    return newdf

In [36]:
# Tokenize and lemmatize the product descriptions, then preview the first rows
# of the result.
processed_product_descriptions_df = process_product_descriptions_df(product_descriptions_df)
print(processed_product_descriptions_df.head())

   product_uid                                product_description
0       100001  [not, only, do, angle, make, joint, stronger, ...
1       100002  [behr, premium, textured, deckover, is, an, in...
2       100003  [classic, architecture, meet, contemporary, de...
3       100004  [the, grape, solar, 265, watt, polycrystalline...
4       100005  [update, your, bathroom, with, the, delta, ver...
