# Experiment Log

## A snapshot of what worked and what didn't.

In [1]:
# Importing the required libraries
import nltk
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [2]:
# Load flipkart_inventory.csv into a DataFrame
data = pd.read_csv(filepath_or_buffer="flipkart_inventory.csv")

In [3]:
# Discard the columns that are irrelevant to the experiments in this log
data = data.drop(
    columns=[
        'uniq_id',
        'crawl_timestamp',
        'product_url',
        'pid',
        'retail_price',
        'discounted_price',
        'image',
        'is_FK_Advantage_product',
        'product_rating',
        'overall_rating',
        'brand',
        'product_name',
        'product_specifications',
    ]
)

# Experiment #1


## Using ' >>' , not '>>'

While splitting the product_category_tree, I realized it was important to split by ' >>' and not '>>'. This is because some products have a branched product_category_tree, while others only have the root category present.

In [4]:
# Add an (initially empty) primary_category column
data["primary_category"] = ""
# Trim the first two and the last two characters (the starting and ending tags)
# from every category tree string.
data["product_category_tree"] = data["product_category_tree"].str[2:-2]
print(data["product_category_tree"])

0        Clothing >> Women's Clothing >> Lingerie, Slee...
1        Furniture >> Living Room Furniture >> Sofa Bed...
2        Footwear >> Women's Footwear >> Ballerinas >> ...
3        Clothing >> Women's Clothing >> Lingerie, Slee...
4        Pet Supplies >> Grooming >> Skin & Coat Care >...
                               ...                        
19995    Baby Care >> Baby & Kids Gifts >> Stickers >> ...
19996    Baby Care >> Baby & Kids Gifts >> Stickers >> ...
19997    Baby Care >> Baby & Kids Gifts >> Stickers >> ...
19998    Baby Care >> Baby & Kids Gifts >> Stickers >> ...
19999    Baby Care >> Baby & Kids Gifts >> Stickers >> ...
Name: product_category_tree, Length: 20000, dtype: object


Demonstrating by using '>>' to split product_category_tree.

In [5]:
# Split the tree on '>>' and keep the first piece as the primary category
data['primary_category'] = data['product_category_tree'].str.split('>>').str.get(0)
print(data['primary_category'])

0            Clothing 
1           Furniture 
2            Footwear 
3            Clothing 
4        Pet Supplies 
             ...      
19995       Baby Care 
19996       Baby Care 
19997       Baby Care 
19998       Baby Care 
19999       Baby Care 
Name: primary_category, Length: 20000, dtype: object


In [6]:
# Count products per primary category and show the categories with more than 3 products
value_count = data.primary_category.value_counts()
print(value_count.loc[value_count.gt(3)])

Clothing                                               6198
Jewellery                                              3531
Footwear                                               1227
Mobiles & Accessories                                  1099
Automotive                                             1012
Home Decor & Festive Needs                              929
Beauty and Personal Care                                710
Home Furnishing                                         700
Kitchen & Dining                                        647
Computers                                               578
Watches                                                 530
Baby Care                                               483
Tools & Hardware                                        391
Toys & School Supplies                                  330
Pens & Stationery                                       313
Bags, Wallets & Belts                                   265
Furniture                               

**You will see two separate rows labelled 'Sunglasses', with 35 and 5 items respectively.**
Why is this happening?



In [7]:
# Display every row whose primary category contains "Sunglasses"
data.loc[data["primary_category"].str.contains("Sunglasses")]

Unnamed: 0,product_category_tree,description,primary_category
496,Sunglasses,YNA Aviator Sunglasses - Buy YNA Aviator Sungl...,Sunglasses
497,Sunglasses,Ditu&Kritu Warrior Aviator Sunglasses - Buy Di...,Sunglasses
665,Sunglasses,Yna Wayfarer Sunglasses - Buy Yna Wayfarer Sun...,Sunglasses
3246,Sunglasses >> Dark Image Wayfarer Sunglasses,Key Features of Dark Image Wayfarer Sunglasses...,Sunglasses
3256,Sunglasses >> AAO+ Aviator Sunglasses,Key Features of AAO+ Aviator Sunglasses Face T...,Sunglasses
3257,Sunglasses >> AAO+ Aviator Sunglasses,Key Features of AAO+ Aviator Sunglasses Face T...,Sunglasses
3258,Sunglasses >> EYE GLASS Wayfarer Sunglasses,Key Features of EYE GLASS Wayfarer Sunglasses ...,Sunglasses
3262,Sunglasses >> HERDY Aviator Sunglasses,Key Features of HERDY Aviator Sunglasses Face ...,Sunglasses
3263,Sunglasses >> Zyaden Oval Sunglasses,Key Features of Zyaden Oval Sunglasses Face Ty...,Sunglasses
3265,Sunglasses >> Elligator Aviator Sunglasses,Key Features of Elligator Aviator Sunglasses F...,Sunglasses


There are 3 types of Sunglasses present.

* Sunglasses >> Dark Image Wayfarer Sunglasses
* Olvin Aviator Sunglasses
* Sunglasses

Splitting by '>>' leaves a trailing space on the branched categories ("Sunglasses >> ..." becomes "Sunglasses "), whereas products having only a root category contain no '>>' and stay as "Sunglasses". Therefore two categories are being created for Sunglasses: "Sunglasses" & "Sunglasses ". This issue is resolved by splitting on " >>" instead.

In [8]:
# Split on " >>" (including the leading space) and keep the first piece,
# then display the rows whose primary category contains "Sunglasses".
data['primary_category'] = data['product_category_tree'].str.split(' >>').str.get(0)
data.loc[data["primary_category"].str.contains("Sunglasses")]

Unnamed: 0,product_category_tree,description,primary_category
496,Sunglasses,YNA Aviator Sunglasses - Buy YNA Aviator Sungl...,Sunglasses
497,Sunglasses,Ditu&Kritu Warrior Aviator Sunglasses - Buy Di...,Sunglasses
665,Sunglasses,Yna Wayfarer Sunglasses - Buy Yna Wayfarer Sun...,Sunglasses
3246,Sunglasses >> Dark Image Wayfarer Sunglasses,Key Features of Dark Image Wayfarer Sunglasses...,Sunglasses
3256,Sunglasses >> AAO+ Aviator Sunglasses,Key Features of AAO+ Aviator Sunglasses Face T...,Sunglasses
3257,Sunglasses >> AAO+ Aviator Sunglasses,Key Features of AAO+ Aviator Sunglasses Face T...,Sunglasses
3258,Sunglasses >> EYE GLASS Wayfarer Sunglasses,Key Features of EYE GLASS Wayfarer Sunglasses ...,Sunglasses
3262,Sunglasses >> HERDY Aviator Sunglasses,Key Features of HERDY Aviator Sunglasses Face ...,Sunglasses
3263,Sunglasses >> Zyaden Oval Sunglasses,Key Features of Zyaden Oval Sunglasses Face Ty...,Sunglasses
3265,Sunglasses >> Elligator Aviator Sunglasses,Key Features of Elligator Aviator Sunglasses F...,Sunglasses


In [9]:
# Recount products per primary category to check the Sunglasses total
value_count = data.primary_category.value_counts()
print(value_count.loc[value_count.gt(3)])

Clothing                                               6198
Jewellery                                              3531
Footwear                                               1227
Mobiles & Accessories                                  1099
Automotive                                             1012
Home Decor & Festive Needs                              929
Beauty and Personal Care                                710
Home Furnishing                                         700
Kitchen & Dining                                        647
Computers                                               578
Watches                                                 530
Baby Care                                               483
Tools & Hardware                                        391
Toys & School Supplies                                  330
Pens & Stationery                                       313
Bags, Wallets & Belts                                   265
Furniture                               

Note that now Sunglasses have 35 + 5 = 40 products.

# Experiment #2

## Using additional stopwords

Removing stopwords helped increase the accuracy by 0.04%. Although this is not a large gain, I felt that I could improve it further by also removing words from the dataset that I think don't have any significance.

In [10]:
# Reload the dataset from disk and redo the column/row clean-up in one chain
data = (
    pd.read_csv("flipkart_inventory.csv")
    .drop(
        columns=[
            'uniq_id', 'crawl_timestamp', 'product_url', 'pid',
            'retail_price', 'discounted_price', 'image',
            'is_FK_Advantage_product', 'product_rating', 'overall_rating',
            'brand', 'product_name', 'product_specifications',
        ]
    )
    # NOTE(review): index labels 553 and 17299 are removed without explanation;
    # presumably unusable records -- confirm why.
    .drop(index=[553, 17299])
)
# Trim the starting and ending tags from the category tree
data["product_category_tree"] = data["product_category_tree"].str[2:-2]
# Primary category = text before the first " >>"
data['primary_category'] = data['product_category_tree'].str.split(' >>').str.get(0)


In [11]:
# Data preprocessing of Description column.
#
# regex=True is passed explicitly on every replace: pandas >= 2.0 treats the
# pattern of Series.str.replace as literal text by default, which would silently
# turn each of these cleaning steps into a no-op. The patterns are raw strings
# so that \S, \w, \d, \b and \s reach the regex engine instead of being parsed
# as (invalid) Python string escapes.
data['description'] = (
    data['description']
    .str.lower()                                # Lowercase
    .str.replace(r'http\S+', '', regex=True)    # Remove links
    .str.replace(r'[^\w\s]', ' ', regex=True)   # Replace punctuation with a space
    .str.replace(r'\w*\d\w*', '', regex=True)   # Remove strings containing digits, i.e. codes & numbers
    .str.replace(r'\b\w\b', '', regex=True)     # Remove single-character words
    .str.replace(r'\s+', ' ', regex=True)       # Collapse runs of whitespace into one space
)
# Inspect one cleaned description (index label 45) as a spot check
data.loc[45, 'description']

'specifications of vishudh printed women straight kurta kurta details sleeve sleeveless number of contents in sales package pack of fabric polyester type straight neck round neck general details pattern printed occasion festive ideal for women in the box kurta additional details style code pink olive fabric care gentle machine wash in lukewarm water do not bleach'

In [12]:
# English stopwords from the NLTK corpus, collected into a set
stop_words = {*stopwords.words('english')}
print(stop_words)

{"don't", 'himself', 'i', 'how', 'having', 'of', 'ain', 'any', 'no', 're', 'before', 'yours', 'itself', 'their', 'this', 'isn', 'between', 'all', "you'll", 'a', "won't", 'she', "didn't", 'will', 'hers', 'o', 'off', 'don', 'now', 'shan', 'm', 'after', 'we', "it's", 'until', 've', 'here', "needn't", 'should', "aren't", 'are', 'few', 'on', 'shouldn', 'what', "shouldn't", 'needn', "shan't", 'have', 'once', 'with', 'down', "wouldn't", 'does', 'over', 'doing', 'both', 'him', 'an', "haven't", 'herself', 'such', 'where', "hasn't", 'as', 'these', "that'll", "couldn't", 'during', 'me', 'against', 'her', 'very', 'll', 'its', 'be', 'been', 'from', 'doesn', 'out', 'couldn', 'at', "doesn't", 'themselves', 'while', 'y', 'some', 'your', 'most', "weren't", 'my', 'just', 'weren', "you're", 'so', 'own', 'up', 'or', 'and', 'other', 'into', 'mustn', 'which', 'it', 'there', "she's", 'being', 'wouldn', 'that', 'you', 'too', 'in', 'had', 'myself', 'those', 'more', 'am', 'then', 'is', 'wasn', 'ourselves', 'do'

In [13]:
# Drop every whitespace-separated token that is in the NLTK stopword set,
# then re-join the remaining tokens with single spaces.
data["description"] = data["description"].apply(
    lambda text: ' '.join(
        [token.lower() for token in text.split() if token not in stop_words]
    )
)
print(data["description"])

0        key features alisha solid women cycling shorts...
1        fabhomedecor fabric double sofa bed finish col...
2        key features aw bellies sandals wedges heel ca...
3        key features alisha solid women cycling shorts...
4        specifications sicons purpose arnica dog shamp...
                               ...                        
19995    buy walldesign small vinyl sticker rs online w...
19996    buy wallmantra large vinyl stickers sticker rs...
19997    buy elite collection medium acrylic sticker rs...
19998    buy elite collection medium acrylic sticker rs...
19999    buy elite collection medium acrylic sticker rs...
Name: description, Length: 19998, dtype: object


In [14]:
# Baseline accuracy with only the NLTK stopwords removed
X = data['description']
y = data['primary_category']
# Hold out 30% of the rows for testing; the split itself is seeded (random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# TF-IDF features feeding a linear SVM classifier.
# NOTE(review): LinearSVC is created without a random_state, so repeated runs
# may not give exactly the same score -- confirm before comparing small differences.
clf_tfidf_lsvc2 = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ]
)

# Fit the whole pipeline on the training split
clf_tfidf_lsvc2.fit(X_train, y_train)

# Predict on the held-out descriptions (cast to unicode strings first)
predictions = clf_tfidf_lsvc2.predict(X_test.astype('U').to_numpy())
# Overall accuracy on the test split
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.9645


In [15]:
# Count how often each word occurs across all descriptions; the most frequent
# words are candidates for additional stopwords.
value_count = (
    data['description']
    .str.split(expand=True)
    .stack()
    .value_counts()
)
# Show only the words that occur more than 3000 times
print(value_count.loc[value_count.gt(3000)])

rs                17782
buy               11395
women             10574
com               10393
flipkart          10223
online             9925
free               8621
products           8592
day                8218
genuine            7998
delivery           7917
replacement        7894
cash               7866
shipping           7853
guarantee          7432
features           7199
price              6705
shirt              5794
color              5743
type               5377
details            5300
specifications     5154
men                5032
material           4974
casual             4972
fabric             4958
general            4772
collection         4623
cm                 4537
india              4516
set                4473
box                4428
cotton             4313
pack               4132
solid              3961
number             3767
neck               3652
key                3615
ideal              3614
quality            3608
gold               3605
package         

In [16]:
# creating a set of stopwords
stop_words_data = {'rs', 'buy', 'com', 'flipkart', 'online', 'free', 'products', 'day', 'genuine', 'delivery',
                  'replacement', 'cash', 'shipping', 'guarantee', 'price', 'india', 'product',}

In [17]:
# Drop every whitespace-separated token that is in the additional stopword set,
# then re-join the remaining tokens with single spaces.
data["description"] = data["description"].apply(
    lambda text: ' '.join(
        [token.lower() for token in text.split() if token not in stop_words_data]
    )
)

In [18]:
# Accuracy after the additional, dataset-specific stopwords have been removed
X = data['description']
y = data['primary_category']
# Hold out 30% of the rows for testing; the split itself is seeded (random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# TF-IDF features feeding a linear SVM classifier.
# NOTE(review): LinearSVC is created without a random_state, so repeated runs
# may not give exactly the same score -- confirm before comparing small differences.
clf_tfidf_lsvc2 = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ]
)

# Fit the whole pipeline on the training split
clf_tfidf_lsvc2.fit(X_train, y_train)

# Predict on the held-out descriptions (cast to unicode strings first)
predictions = clf_tfidf_lsvc2.predict(X_test.astype('U').to_numpy())
# Overall accuracy on the test split
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))

0.9631666666666666


## Removing these additional stopwords decreased model accuracy by about 0.14% (from 0.9645 to 0.9632).