# Amazon Fashion Apparel Recommendation With NLP and Deep Learning

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

## Loading the dataset

In [2]:
data = pd.read_json("shirts_data.json")

In [3]:
print("No of Data Points : ",data.shape[0])
print("No of Features : ",data.shape[1])

No of Data Points :  183138
No of Features :  19


This particular dataset has around 200,000 data points and 19 features

## Overview of the Dataset

In [4]:
data.head()

Unnamed: 0,sku,asin,product_type_name,formatted_price,author,color,brand,publisher,availability,reviews,large_image_url,availability_type,small_image_url,editorial_review,title,model,medium_image_url,manufacturer,editorial_reivew
0,,B016I2TS4W,SHIRT,,,,FNC7C,,,"[False, https://www.amazon.com/reviews/iframe?...",https://images-na.ssl-images-amazon.com/images...,,https://images-na.ssl-images-amazon.com/images...,Minions Como Superheroes Ironman Women's O Nec...,Minions Como Superheroes Ironman Long Sleeve R...,,https://images-na.ssl-images-amazon.com/images...,,
1,,B01N49AI08,SHIRT,,,,FIG Clothing,,,"[False, https://www.amazon.com/reviews/iframe?...",https://images-na.ssl-images-amazon.com/images...,,https://images-na.ssl-images-amazon.com/images...,Sizing runs on the small side. FIG® recommends...,FIG Clothing Womens Izo Tunic,,https://images-na.ssl-images-amazon.com/images...,,
2,,B01JDPCOHO,SHIRT,,,,FIG Clothing,,,"[False, https://www.amazon.com/reviews/iframe?...",https://images-na.ssl-images-amazon.com/images...,,https://images-na.ssl-images-amazon.com/images...,Sizing runs on the small side. FIG® recommends...,FIG Clothing Womens Won Top,,https://images-na.ssl-images-amazon.com/images...,,
3,,B01N19U5H5,SHIRT,,,,Focal18,,,"[True, https://www.amazon.com/reviews/iframe?a...",https://images-na.ssl-images-amazon.com/images...,,https://images-na.ssl-images-amazon.com/images...,100% Brand New & Fashion<br> Quantity: 1 Piece...,Focal18 Sailor Collar Bubble Sleeve Blouse Shi...,,https://images-na.ssl-images-amazon.com/images...,,
4,,B004GSI2OS,SHIRT,$26.26,,Onyx Black/ Stone,FeatherLite,,Usually ships in 6-10 business days,"[False, https://www.amazon.com/reviews/iframe?...",https://images-na.ssl-images-amazon.com/images...,now,https://images-na.ssl-images-amazon.com/images...,,Featherlite Ladies' Long Sleeve Stain Resistan...,,https://images-na.ssl-images-amazon.com/images...,,


We can observe that there are 19 features present in the dataset

In [5]:
data.columns

Index(['sku', 'asin', 'product_type_name', 'formatted_price', 'author',
       'color', 'brand', 'publisher', 'availability', 'reviews',
       'large_image_url', 'availability_type', 'small_image_url',
       'editorial_review', 'title', 'model', 'medium_image_url',
       'manufacturer', 'editorial_reivew'],
      dtype='object')

### Which features are useful for our problem statement?

ASIN - Amazon Standard Identification Number

In [6]:
data = data[['asin', 'brand', 'color', 'medium_image_url', 'product_type_name', 'title', 'formatted_price']]

In [7]:
data.head()

Unnamed: 0,asin,brand,color,medium_image_url,product_type_name,title,formatted_price
0,B016I2TS4W,FNC7C,,https://images-na.ssl-images-amazon.com/images...,SHIRT,Minions Como Superheroes Ironman Long Sleeve R...,
1,B01N49AI08,FIG Clothing,,https://images-na.ssl-images-amazon.com/images...,SHIRT,FIG Clothing Womens Izo Tunic,
2,B01JDPCOHO,FIG Clothing,,https://images-na.ssl-images-amazon.com/images...,SHIRT,FIG Clothing Womens Won Top,
3,B01N19U5H5,Focal18,,https://images-na.ssl-images-amazon.com/images...,SHIRT,Focal18 Sailor Collar Bubble Sleeve Blouse Shi...,
4,B004GSI2OS,FeatherLite,Onyx Black/ Stone,https://images-na.ssl-images-amazon.com/images...,SHIRT,Featherlite Ladies' Long Sleeve Stain Resistan...,$26.26


### How many product types are there in total?

In [8]:
data['product_type_name'].describe()

count     183138
unique        72
top        SHIRT
freq      167794
Name: product_type_name, dtype: object

There are 72 unique categories of products in the dataset with SHIRT having the top frequency

### The unique product types in the dataset are:

In [9]:
data['product_type_name'].unique()

array(['SHIRT', 'SWEATER', 'APPAREL', 'OUTDOOR_RECREATION_PRODUCT',
       'BOOKS_1973_AND_LATER', 'PANTS', 'HAT', 'SPORTING_GOODS', 'DRESS',
       'UNDERWEAR', 'SKIRT', 'OUTERWEAR', 'BRA', 'ACCESSORY',
       'ART_SUPPLIES', 'SLEEPWEAR', 'ORCA_SHIRT', 'HANDBAG',
       'PET_SUPPLIES', 'SHOES', 'KITCHEN', 'ADULT_COSTUME',
       'HOME_BED_AND_BATH', 'MISC_OTHER', 'BLAZER',
       'HEALTH_PERSONAL_CARE', 'TOYS_AND_GAMES', 'SWIMWEAR',
       'CONSUMER_ELECTRONICS', 'SHORTS', 'HOME', 'AUTO_PART',
       'OFFICE_PRODUCTS', 'ETHNIC_WEAR', 'BEAUTY',
       'INSTRUMENT_PARTS_AND_ACCESSORIES', 'POWERSPORTS_PROTECTIVE_GEAR',
       'SHIRTS', 'ABIS_APPAREL', 'AUTO_ACCESSORY', 'NONAPPARELMISC',
       'TOOLS', 'BABY_PRODUCT', 'SOCKSHOSIERY',
       'POWERSPORTS_RIDING_SHIRT', 'EYEWEAR', 'SUIT', 'OUTDOOR_LIVING',
       'POWERSPORTS_RIDING_JACKET', 'HARDWARE', 'SAFETY_SUPPLY',
       'ABIS_DVD', 'VIDEO_DVD', 'GOLF_CLUB', 'MUSIC_POPULAR_VINYL',
       'HOME_FURNITURE_AND_DECOR', 'TABLET_COMPUTER',

### What are the top 10 frequent product_types?

In [10]:
from collections import Counter

In [11]:
n = 10
dfFrequent = data['product_type_name'].value_counts()[:n].index.tolist()

In [12]:
dfFrequent

['SHIRT',
 'APPAREL',
 'BOOKS_1973_AND_LATER',
 'DRESS',
 'SPORTING_GOODS',
 'SWEATER',
 'OUTERWEAR',
 'OUTDOOR_RECREATION_PRODUCT',
 'ACCESSORY',
 'UNDERWEAR']

Using Counter function

In [13]:
product_count = Counter(list(data['product_type_name']))
product_count.most_common(10)

[('SHIRT', 167794),
 ('APPAREL', 3549),
 ('BOOKS_1973_AND_LATER', 3336),
 ('DRESS', 1584),
 ('SPORTING_GOODS', 1281),
 ('SWEATER', 837),
 ('OUTERWEAR', 796),
 ('OUTDOOR_RECREATION_PRODUCT', 729),
 ('ACCESSORY', 636),
 ('UNDERWEAR', 425)]

### What are the unique colors in the dataset?

In [14]:
data['color'].describe()

count     64956
unique     7380
top       Black
freq      13207
Name: color, dtype: object

- The most occuring color is Black.
- There are 13,207 records with the color black

### What are the top 5 colors from the dataset?

In [15]:
color_count = Counter(list(data['color']))
type(color_count)

collections.Counter

In [16]:
color_counter = color_count.most_common(5)

In [17]:
color_counter[1:]

[('Black', 13207), ('White', 8616), ('Blue', 3570), ('Red', 2289)]

### What percentage of data is missing from the colors feature?

There are 118,182 missing records in the dataset with no color

In [18]:
color_counter[0]

(None, 118182)

### What is the highest price of a record in the dataset?

In [19]:
data.describe()

Unnamed: 0,asin,brand,color,medium_image_url,product_type_name,title,formatted_price
count,183138,182987,64956,183138,183138,183138,28395
unique,183138,10577,7380,170782,72,175985,3135
top,B074P72S4B,Zago,Black,https://images-na.ssl-images-amazon.com/images...,SHIRT,Nakoda Cotton Self Print Straight Kurti For Women,$19.99
freq,1,223,13207,23,167794,77,945


The highest price for an item in the dataset is $19.99

### PDF of price feature in dataset
The mean price from the chart is : 

### How many unique brands are there in the dataset?

In [20]:
print (data['brand'].describe())

count     182987
unique     10577
top         Zago
freq         223
Name: brand, dtype: object


There are about 10577 unique brands

### What are the 10 most common brands in the dataset?

In [21]:
brand_count = Counter(list(data['brand']))
brand_count.most_common(10)

[('Zago', 223),
 ('XQS', 222),
 ('Yayun', 215),
 ('YUNY', 198),
 ('XiaoTianXin-women clothes', 193),
 ('Generic', 192),
 ('Boohoo', 190),
 ('Alion', 188),
 ('Abetteric', 187),
 ('TheMogan', 187)]

The most common brand is Zago with 223 products closely followed by XQS

### General Description of Title Feature

In [22]:
print(data['title'].describe())

count                                                183138
unique                                               175985
top       Nakoda Cotton Self Print Straight Kurti For Women
freq                                                     77
Name: title, dtype: object


There are 183138 total products with same no.of total titles , and thereby a title exists for each of product

The most common title is of the 'Nakoda Cotton Self Print Straight Kurti For Women'and it is repeated for about 77 times

### Data Cleaning 

#### Converting the title to lowercase

In [23]:
data['title']= data['title'].str.lower()

In [24]:
data.head()

Unnamed: 0,asin,brand,color,medium_image_url,product_type_name,title,formatted_price
0,B016I2TS4W,FNC7C,,https://images-na.ssl-images-amazon.com/images...,SHIRT,minions como superheroes ironman long sleeve r...,
1,B01N49AI08,FIG Clothing,,https://images-na.ssl-images-amazon.com/images...,SHIRT,fig clothing womens izo tunic,
2,B01JDPCOHO,FIG Clothing,,https://images-na.ssl-images-amazon.com/images...,SHIRT,fig clothing womens won top,
3,B01N19U5H5,Focal18,,https://images-na.ssl-images-amazon.com/images...,SHIRT,focal18 sailor collar bubble sleeve blouse shi...,
4,B004GSI2OS,FeatherLite,Onyx Black/ Stone,https://images-na.ssl-images-amazon.com/images...,SHIRT,featherlite ladies' long sleeve stain resistan...,$26.26


#### Drop records with no price

In [25]:
data

Unnamed: 0,asin,brand,color,medium_image_url,product_type_name,title,formatted_price
0,B016I2TS4W,FNC7C,,https://images-na.ssl-images-amazon.com/images...,SHIRT,minions como superheroes ironman long sleeve r...,
1,B01N49AI08,FIG Clothing,,https://images-na.ssl-images-amazon.com/images...,SHIRT,fig clothing womens izo tunic,
2,B01JDPCOHO,FIG Clothing,,https://images-na.ssl-images-amazon.com/images...,SHIRT,fig clothing womens won top,
3,B01N19U5H5,Focal18,,https://images-na.ssl-images-amazon.com/images...,SHIRT,focal18 sailor collar bubble sleeve blouse shi...,
4,B004GSI2OS,FeatherLite,Onyx Black/ Stone,https://images-na.ssl-images-amazon.com/images...,SHIRT,featherlite ladies' long sleeve stain resistan...,$26.26
...,...,...,...,...,...,...,...
183133,B01MSALTSO,TOOGOO(R),Black,https://images-na.ssl-images-amazon.com/images...,OUTERWEAR,toogoo(r) women's tops spring autumn casual pu...,$14.58
183134,B015W98YQK,VOGUE CODE,Monochrome Plaid,https://images-na.ssl-images-amazon.com/images...,SHIRT,vogue code vintage v neck plaid shirt sleevele...,
183135,B075756PGC,Wrangler,Pink,https://images-na.ssl-images-amazon.com/images...,SHIRT,wrangler george strait for her long sleeve pin...,
183136,B074L8FVTT,susana monaco,Rose,https://images-na.ssl-images-amazon.com/images...,SHIRT,susana monaco womens susana monoco sleeveless ...,$44.99


### Pickling the pre processed data

Its always necessary to pickle the data files. Pickling is the process where we store our pre processed datasets in a binary file that is lightweight and can be loaded onto any computer regardless of the OS. Here we have pickled the 183k data points that are in our dataset

In [26]:
data.to_pickle("pickled_data_v1")

### Download the images using the `medium_image_url` feature of our dataset

We have used the following libraries to download and store the images that we recieve for all the available images. This may take some time to process so be patient
- PIL
- ImageIO
- BytesIO
- requests

### Remove all records without images

In [27]:
data_images = data.loc[~data['medium_image_url'].isnull()]

In [28]:
data_images.describe()

Unnamed: 0,asin,brand,color,medium_image_url,product_type_name,title,formatted_price
count,183138,182987,64956,183138,183138,183138,28395
unique,183138,10577,7380,170782,72,175913,3135
top,B074P72S4B,Zago,Black,https://images-na.ssl-images-amazon.com/images...,SHIRT,nakoda cotton self print straight kurti for women,$19.99
freq,1,223,13207,23,167794,77,945


In [29]:
from PIL import Image
import requests
from io import BytesIO
try:
    for index,row in data_images.iterrows():
        link = row['medium_image_url']
        response = requests.get(link)
        img = Image.open(BytesIO(response.content)) # once the request is received we open the file which is in binary format and get its content
        img.save('images/183k_images/'+row['asin']+'.jpeg') # then we use img.save to save the image received from the request
except:
    pass

### Removing Duplicate Records

In [30]:
data = pd.read_pickle("pickled_data_v1")

In [31]:
print(sum(data.duplicated('title')))

7225


There are many images that are same but differ only in the size(S,M,L,XL). All these image records have different ASIN numbers so that Amazon can track the ASIN numbers for the inventory of the apparels

In our case, for removing duplicate records we make use of the title because there are less number of duplicates(7225) compared to the total number of records(183,000)

**Images with the same color, different ASIN and sizes**

<table>
<tr> 
<td><img src="remdupli/B00AQ4GMCK.jpeg",width=100,height=100> :B00AQ4GMCK</td>
<td><img src="remdupli/B00AQ4GMTS.jpeg",width=100,height=100> :B00AQ4GMTS</td>
</tr>
<tr> 
<td><img src="remdupli/B00AQ4GMLQ.jpeg",width=100,height=100> :B00AQ4GMLQ</td>
<td><img src="remdupli/B00AQ4GN3I.jpeg",width=100,height=100> :B00AQ4GN3I</td>
</tr>
</table>

All the below products have the exact same title but different ASIN numbers and different colored T Shirts

<table>
<tr> 
<td><img src="remdupli/B00G278GZ6.jpeg",width=100,height=100> :B00G278GZ6</td>
<td><img src="remdupli/B00G278W6O.jpeg",width=100,height=100> :B00G278W6O</td>
</tr>
<tr> 
<td><img src="remdupli/B00G278Z2A.jpeg",width=100,height=100> :B00G278Z2A</td>
<td><img src="remdupli/B00G2786X8.jpeg",width=100,height=100> :B00G2786X8</td>
</tr>
</table>

### Why should we remove these duplicates?

Since we are building a recommendation system to recommend apparels to the customers, if we dont remove the duplicates, the system may just recommend the same images from the dataset to the use which is useless!<br>
This would really ruin the customer experience as if the customer is searching for a small sized shirt he would not care about a medium sized same shirt.<br>
Also giving exactly same product with a different color as a recommendation is not the best possible recommendation to the user

### Are short product_titles useful? 

Short product titles are certainly not of much use to our model as we will see later on so we are going to remove the records with title lengths < 5 words

Examples of such titles are : `shirts`, `men's shirt XL`,`small shirt` etc...

In [32]:
data_sorted = data[data['title'].apply(lambda x: len(x.split())>4)]
print(data_sorted.shape[0])

178026


### Sorting the titles in alphabetical order?

In [33]:
data_sorted.sort_values('title',inplace=True, ascending=False)
data_sorted.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_sorted.sort_values('title',inplace=True, ascending=False)


Unnamed: 0,asin,brand,color,medium_image_url,product_type_name,title,formatted_price
27547,B073W7P8KK,Nation LTD,Blue,https://images-na.ssl-images-amazon.com/images...,DRESS,❀nation women stripe blouse long sleeve shirt ...,
31277,B01M0PWMZ8,Anglin,White,https://images-na.ssl-images-amazon.com/images...,SHIRT,✽anglin✽ women striped floral long sleeve roun...,
30453,B01M02GWRG,Anglin,White,https://images-na.ssl-images-amazon.com/images...,SHIRT,✽anglin✽ women striped floral long sleeve roun...,
32485,B01N0ADXM0,Anglin,Red,https://images-na.ssl-images-amazon.com/images...,SHIRT,✽anglin✽ women fashion stripe dress round coll...,
26767,B01MTQAU86,Anglin,Black,https://images-na.ssl-images-amazon.com/images...,SHIRT,✽anglin✽ women autumn winter christmas printin...,


#### Some of the titles are very similar! Do they affect our model performance?

In the title column we can see that some of the titles are almost very similar except for some words which **differ at the end of the two or more titles**

Here are some more examples of titles that are very similar except for the fact that the color of the apparel (or) the size differes at the end. We can see that these types of titles also occur in sequences

<pre>
Titles 1:
16. woman's place is in the house and the senate shirts for Womens XXL White
17. woman's place is in the house and the senate shirts for Womens M Grey

Title 2:
25. tokidoki The Queen of Diamonds Women's Shirt X-Large
26. tokidoki The Queen of Diamonds Women's Shirt Small
27. tokidoki The Queen of Diamonds Women's Shirt Large
</pre>

Append an index to each of the records so that its easier to traverse via i and j pointers in next step

In [34]:
indices = []
for i,row in data_sorted.iterrows():
    indices.append(i)

In [35]:
indices[:5]

[27547, 31277, 30453, 32485, 26767]

### Two Pointer Approach Solution

In [36]:
import itertools

In [37]:
level1_dedpuli = [] # to store the deduplicated records
i = 0
j = 0

In [38]:
total_data_points = data_sorted.shape

In [39]:
total_data_points

(178026, 7)

In [40]:
total_data_points = total_data_points[0]

In [41]:
import itertools
level1_dedupli= []
i = 0
j = 0
num_data_points = data_sorted.shape[0]
while i < num_data_points and j < num_data_points:
    
    previous_i = i

    # store the list of words of ith string in a
    a = data['title'].loc[indices[i]].split()

    # search for the similar products sequentially 
    j = i+1
    while j < num_data_points:

        # store the list of words of jth string in b, ex: b = ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', 'Women's', 'Shirt', 'Small']
        b = data['title'].loc[indices[j]].split()

        # store the maximum length of two strings
        length = max(len(a), len(b))

        # count is used to store the number of words that are matched in both strings
        count  = 0

        # itertools.zip_longest(a,b): will map the corresponding words in both strings, it will appened None in case of unequal strings
        # example: a =['a', 'b', 'c', 'd']
        # b = ['a', 'b', 'd']
        # itertools.zip_longest(a,b): will give [('a','a'), ('b','b'), ('c','d'), ('d', None)]
        for k in itertools.zip_longest(a,b): 
            if (k[0] == k[1]):
                count += 1

        if (length - count) > 2: # number of words in which both sentences
            # if both strings are differ by more than 2 words we include the 1st string index
            level1_dedupli.append(data_sorted['asin'].loc[indices[i]])


            # start searching for similar apperals corresponds 2nd string
            i = j
            break
        else:
            j += 1
    if previous_i == i:
        break

**List of indices after removing duplicates**

In [42]:
level1_dedupli[:5]

['B073W7P8KK', 'B01M0PWMZ8', 'B01N0ADXM0', 'B01MTQAU86', 'B073W7HXFM']

**Update the dataset**

In [43]:
data = data.loc[data['asin'].isin(level1_dedupli)]

In [44]:
data.head()

Unnamed: 0,asin,brand,color,medium_image_url,product_type_name,title,formatted_price
0,B016I2TS4W,FNC7C,,https://images-na.ssl-images-amazon.com/images...,SHIRT,minions como superheroes ironman long sleeve r...,
3,B01N19U5H5,Focal18,,https://images-na.ssl-images-amazon.com/images...,SHIRT,focal18 sailor collar bubble sleeve blouse shi...,
4,B004GSI2OS,FeatherLite,Onyx Black/ Stone,https://images-na.ssl-images-amazon.com/images...,SHIRT,featherlite ladies' long sleeve stain resistan...,$26.26
5,B00TAEHGGS,Fitscloth,Grape,https://images-na.ssl-images-amazon.com/images...,SHIRT,[fits cloth] grape solid modern long sleeve pl...,
6,B012YX2ZPI,HX-Kingdom Fashion T-shirts,White,https://images-na.ssl-images-amazon.com/images...,SHIRT,women's unique 100% cotton t - special olympic...,$9.99


In [45]:
data.describe()

Unnamed: 0,asin,brand,color,medium_image_url,product_type_name,title,formatted_price
count,151127,151020,48556,151127,151127,151127,17498
unique,151127,10344,6381,147076,70,151127,2852
top,B074P72S4B,XQS,Black,https://images-na.ssl-images-amazon.com/images...,SHIRT,rafaella women's petite size gold embellished ...,$19.99
freq,1,220,10041,14,140041,1,514


All duplicates removed, dataset stripped to 151127 records

In [46]:
data.to_pickle('150k_apperal_data')

### Text Pre Processing with NLP

In [47]:
data.head()

Unnamed: 0,asin,brand,color,medium_image_url,product_type_name,title,formatted_price
0,B016I2TS4W,FNC7C,,https://images-na.ssl-images-amazon.com/images...,SHIRT,minions como superheroes ironman long sleeve r...,
3,B01N19U5H5,Focal18,,https://images-na.ssl-images-amazon.com/images...,SHIRT,focal18 sailor collar bubble sleeve blouse shi...,
4,B004GSI2OS,FeatherLite,Onyx Black/ Stone,https://images-na.ssl-images-amazon.com/images...,SHIRT,featherlite ladies' long sleeve stain resistan...,$26.26
5,B00TAEHGGS,Fitscloth,Grape,https://images-na.ssl-images-amazon.com/images...,SHIRT,[fits cloth] grape solid modern long sleeve pl...,
6,B012YX2ZPI,HX-Kingdom Fashion T-shirts,White,https://images-na.ssl-images-amazon.com/images...,SHIRT,women's unique 100% cotton t - special olympic...,$9.99


In [48]:
pd.set_option('display.max_colwidth',None)
data [['title']]

Unnamed: 0,title
0,minions como superheroes ironman long sleeve round neck t-shirt for women
3,focal18 sailor collar bubble sleeve blouse shirt women mori girl casual top harajuku
4,"featherlite ladies' long sleeve stain resistant tapered twill shirt, 2xl, onyx black/ stone"
5,[fits cloth] grape solid modern long sleeve plain t shirt
6,women's unique 100% cotton t - special olympics world games 2015 white size l
...,...
183131,"women's cold shoulder button down shirt, long sleeve casual blouse tops"
183132,ulla popken women's plus size crochet lace accent open front jacket 712277
183134,vogue code vintage v neck plaid shirt sleeveless blouse mandarin collar women shirt
183135,wrangler george strait for her long sleeve pink print western shirt


## Punctuation removal

In [49]:
import string

In [50]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [51]:
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree
data['title_punc'] = data['title'].apply(lambda x:remove_punctuation(x))
data.head()

Unnamed: 0,asin,brand,color,medium_image_url,product_type_name,title,formatted_price,title_punc
0,B016I2TS4W,FNC7C,,https://images-na.ssl-images-amazon.com/images/I/41cfoWwna2L._SL160_.jpg,SHIRT,minions como superheroes ironman long sleeve round neck t-shirt for women,,minions como superheroes ironman long sleeve round neck tshirt for women
3,B01N19U5H5,Focal18,,https://images-na.ssl-images-amazon.com/images/I/41f2cJAoMlL._SL160_.jpg,SHIRT,focal18 sailor collar bubble sleeve blouse shirt women mori girl casual top harajuku,,focal18 sailor collar bubble sleeve blouse shirt women mori girl casual top harajuku
4,B004GSI2OS,FeatherLite,Onyx Black/ Stone,https://images-na.ssl-images-amazon.com/images/I/31VspXbakvL._SL160_.jpg,SHIRT,"featherlite ladies' long sleeve stain resistant tapered twill shirt, 2xl, onyx black/ stone",$26.26,featherlite ladies long sleeve stain resistant tapered twill shirt 2xl onyx black stone
5,B00TAEHGGS,Fitscloth,Grape,https://images-na.ssl-images-amazon.com/images/I/41WxPn2WduL._SL160_.jpg,SHIRT,[fits cloth] grape solid modern long sleeve plain t shirt,,fits cloth grape solid modern long sleeve plain t shirt
6,B012YX2ZPI,HX-Kingdom Fashion T-shirts,White,https://images-na.ssl-images-amazon.com/images/I/41EpudaOiWL._SL160_.jpg,SHIRT,women's unique 100% cotton t - special olympics world games 2015 white size l,$9.99,womens unique 100 cotton t special olympics world games 2015 white size l


### we need the title as a string so converting it from a string to an object

In [52]:
data['title']=data['title'].astype(str)

In [53]:
data.dtypes

asin                 object
brand                object
color                object
medium_image_url     object
product_type_name    object
title                object
formatted_price      object
title_punc           object
dtype: object

## Tokenization

In [54]:
#defining function for tokenization
import re
def tokenization(text):
    tokens = re.split('W+',text)
    return tokens
#applying function to the column
data['title_tokenized']= data['title_punc'].apply(lambda x: tokenization(x))

In [55]:
data.head()

Unnamed: 0,asin,brand,color,medium_image_url,product_type_name,title,formatted_price,title_punc,title_tokenized
0,B016I2TS4W,FNC7C,,https://images-na.ssl-images-amazon.com/images/I/41cfoWwna2L._SL160_.jpg,SHIRT,minions como superheroes ironman long sleeve round neck t-shirt for women,,minions como superheroes ironman long sleeve round neck tshirt for women,[minions como superheroes ironman long sleeve round neck tshirt for women]
3,B01N19U5H5,Focal18,,https://images-na.ssl-images-amazon.com/images/I/41f2cJAoMlL._SL160_.jpg,SHIRT,focal18 sailor collar bubble sleeve blouse shirt women mori girl casual top harajuku,,focal18 sailor collar bubble sleeve blouse shirt women mori girl casual top harajuku,[focal18 sailor collar bubble sleeve blouse shirt women mori girl casual top harajuku]
4,B004GSI2OS,FeatherLite,Onyx Black/ Stone,https://images-na.ssl-images-amazon.com/images/I/31VspXbakvL._SL160_.jpg,SHIRT,"featherlite ladies' long sleeve stain resistant tapered twill shirt, 2xl, onyx black/ stone",$26.26,featherlite ladies long sleeve stain resistant tapered twill shirt 2xl onyx black stone,[featherlite ladies long sleeve stain resistant tapered twill shirt 2xl onyx black stone]
5,B00TAEHGGS,Fitscloth,Grape,https://images-na.ssl-images-amazon.com/images/I/41WxPn2WduL._SL160_.jpg,SHIRT,[fits cloth] grape solid modern long sleeve plain t shirt,,fits cloth grape solid modern long sleeve plain t shirt,[fits cloth grape solid modern long sleeve plain t shirt]
6,B012YX2ZPI,HX-Kingdom Fashion T-shirts,White,https://images-na.ssl-images-amazon.com/images/I/41EpudaOiWL._SL160_.jpg,SHIRT,women's unique 100% cotton t - special olympics world games 2015 white size l,$9.99,womens unique 100 cotton t special olympics world games 2015 white size l,[womens unique 100 cotton t special olympics world games 2015 white size l]


## Removal of stopwords

In [56]:
import nltk
nltk.download()
#Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')


showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [57]:
#defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output
data['title_stopwords']= data['title_tokenized'].apply(lambda x:remove_stopwords(x))

In [58]:
#importing the Stemming function from nltk library
#from nltk.stem.porter import PorterStemmer
#defining the object for stemming
#porter_stemmer = PorterStemmer()

In [59]:
#defining a function for stemming
#def stemming(text):
 #   stem_text = [porter_stemmer.stem(word) for word in text]
  #  return stem_text


In [60]:
#data['title.stemmed']=data['title_stopwords'].apply(lambda x: stemming(x))
#lemmatization is better than stemming

## Lemmatization

In [61]:
from nltk.stem import WordNetLemmatizer
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()

In [62]:
#defining the function for lemmatization
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text
data['title_lemm']=data['title_stopwords'].apply(lambda x:lemmatizer(x))

In [63]:
data.head()

Unnamed: 0,asin,brand,color,medium_image_url,product_type_name,title,formatted_price,title_punc,title_tokenized,title_stopwords,title_lemm
0,B016I2TS4W,FNC7C,,https://images-na.ssl-images-amazon.com/images/I/41cfoWwna2L._SL160_.jpg,SHIRT,minions como superheroes ironman long sleeve round neck t-shirt for women,,minions como superheroes ironman long sleeve round neck tshirt for women,[minions como superheroes ironman long sleeve round neck tshirt for women],[minions como superheroes ironman long sleeve round neck tshirt for women],[minions como superheroes ironman long sleeve round neck tshirt for women]
3,B01N19U5H5,Focal18,,https://images-na.ssl-images-amazon.com/images/I/41f2cJAoMlL._SL160_.jpg,SHIRT,focal18 sailor collar bubble sleeve blouse shirt women mori girl casual top harajuku,,focal18 sailor collar bubble sleeve blouse shirt women mori girl casual top harajuku,[focal18 sailor collar bubble sleeve blouse shirt women mori girl casual top harajuku],[focal18 sailor collar bubble sleeve blouse shirt women mori girl casual top harajuku],[focal18 sailor collar bubble sleeve blouse shirt women mori girl casual top harajuku]
4,B004GSI2OS,FeatherLite,Onyx Black/ Stone,https://images-na.ssl-images-amazon.com/images/I/31VspXbakvL._SL160_.jpg,SHIRT,"featherlite ladies' long sleeve stain resistant tapered twill shirt, 2xl, onyx black/ stone",$26.26,featherlite ladies long sleeve stain resistant tapered twill shirt 2xl onyx black stone,[featherlite ladies long sleeve stain resistant tapered twill shirt 2xl onyx black stone],[featherlite ladies long sleeve stain resistant tapered twill shirt 2xl onyx black stone],[featherlite ladies long sleeve stain resistant tapered twill shirt 2xl onyx black stone]
5,B00TAEHGGS,Fitscloth,Grape,https://images-na.ssl-images-amazon.com/images/I/41WxPn2WduL._SL160_.jpg,SHIRT,[fits cloth] grape solid modern long sleeve plain t shirt,,fits cloth grape solid modern long sleeve plain t shirt,[fits cloth grape solid modern long sleeve plain t shirt],[fits cloth grape solid modern long sleeve plain t shirt],[fits cloth grape solid modern long sleeve plain t shirt]
6,B012YX2ZPI,HX-Kingdom Fashion T-shirts,White,https://images-na.ssl-images-amazon.com/images/I/41EpudaOiWL._SL160_.jpg,SHIRT,women's unique 100% cotton t - special olympics world games 2015 white size l,$9.99,womens unique 100 cotton t special olympics world games 2015 white size l,[womens unique 100 cotton t special olympics world games 2015 white size l],[womens unique 100 cotton t special olympics world games 2015 white size l],[womens unique 100 cotton t special olympics world games 2015 white size l]
