# Product tagging using Machine Learning

---

In [1]:
# Data processing
import pandas as pd
import numpy as np
from collections import Counter

# Visualisation
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import seaborn as sns

# Language processing
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import nltk
# nltk.download('stopwords') # if you haven't downloaded this yet, you need to now.


# Machine Learning - model training
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn.multiclass import OneVsRestClassifier

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
filepath = '/Users/wolfsinem/product-tagging/data/flipkart_com-ecommerce_sample.csv'
df = pd.read_csv((filepath))

In [3]:
df.head(1)

Unnamed: 0,uniq_id,crawl_timestamp,product_url,product_name,product_category_tree,pid,retail_price,discounted_price,image,is_FK_Advantage_product,description,product_rating,overall_rating,brand,product_specifications
0,c2d766ca982eca8304150849735ffef9,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2FF9KEDEFGF,999.0,379.0,"[""http://img5a.flixcart.com/image/short/u/4/a/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ..."


In [4]:
model_df = df[['product_name','description']]

In [5]:
model_df.shape

(20000, 2)

In [6]:
pd.options.mode.chained_assignment = None 
model_df['tags'] = ""

In [7]:
model_df

Unnamed: 0,product_name,description,tags
0,Alisha Solid Women's Cycling Shorts,Key Features of Alisha Solid Women's Cycling S...,
1,FabHomeDecor Fabric Double Sofa Bed,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,
2,AW Bellies,Key Features of AW Bellies Sandals Wedges Heel...,
3,Alisha Solid Women's Cycling Shorts,Key Features of Alisha Solid Women's Cycling S...,
4,Sicons All Purpose Arnica Dog Shampoo,Specifications of Sicons All Purpose Arnica Do...,
...,...,...,...
19995,WallDesign Small Vinyl Sticker,Buy WallDesign Small Vinyl Sticker for Rs.730 ...,
19996,Wallmantra Large Vinyl Stickers Sticker,Buy Wallmantra Large Vinyl Stickers Sticker fo...,
19997,Elite Collection Medium Acrylic Sticker,Buy Elite Collection Medium Acrylic Sticker fo...,
19998,Elite Collection Medium Acrylic Sticker,Buy Elite Collection Medium Acrylic Sticker fo...,


#### Set all strings to lowercase

In [8]:
df['product_name'].str.lower()
df['description'].str.lower()

0        key features of alisha solid women's cycling s...
1        fabhomedecor fabric double sofa bed (finish co...
2        key features of aw bellies sandals wedges heel...
3        key features of alisha solid women's cycling s...
4        specifications of sicons all purpose arnica do...
                               ...                        
19995    buy walldesign small vinyl sticker for rs.730 ...
19996    buy wallmantra large vinyl stickers sticker fo...
19997    buy elite collection medium acrylic sticker fo...
19998    buy elite collection medium acrylic sticker fo...
19999    buy elite collection medium acrylic sticker fo...
Name: description, Length: 20000, dtype: object

In [9]:
test_string = model_df['description'][0]
test_string

"Key Features of Alisha Solid Women's Cycling Shorts Cotton Lycra Navy, Red, Navy,Specifications of Alisha Solid Women's Cycling Shorts Shorts Details Number of Contents in Sales Package Pack of 3 Fabric Cotton Lycra Type Cycling Shorts General Details Pattern Solid Ideal For Women's Fabric Care Gentle Machine Wash in Lukewarm Water, Do Not Bleach Additional Details Style Code ALTHT_3P_21 In the Box 3 shorts"

### Function to create tags

In [10]:
def tokenize_string(sentence):  
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    new_words = tokenizer.tokenize(sentence)
    new_words = [token.lower() for token in new_words] # set to a lower case
    
    stop_words = set(stopwords.words('english')) 
    manual_filtered_words = {'details','fabric','key','features','sales','number','contents','type','general', 'specifications'}

    filtered_sentence = [w for w in new_words if not w in stop_words and not w in manual_filtered_words]
    count_terms = Counter(filtered_sentence).most_common(10) # fill e.g. (5) for most common 5 terms
    return [item[0] for item in count_terms] # this function extracts the first element of each sublist, so only the terms and not how many times it occured

#### Generating tags with the above function

In [11]:
term_lists = tokenize_string(test_string)
term_lists

['shorts',
 'solid',
 'women',
 'cycling',
 'alisha',
 'cotton',
 'lycra',
 'navy',
 '3',
 'red']

As you can see there are some numbers being used as tags. We will take these out.

#### Now we need to for-loop this so the function generates tags for each of the comments

In [None]:
# this function takes out every number within a string because we don't need it as tags since it doesnt give any context
# print([x for x in token_lists[] if not any(c.isdigit() for c in x)])

In [12]:
token_lists = []
for i in model_df['description']:
    token_lists.append([x for x in tokenize_string(str((i))) if not any(c.isdigit() for c in x)])

In [13]:
token_lists

[['shorts',
  'solid',
  'women',
  'cycling',
  'alisha',
  'cotton',
  'lycra',
  'navy',
  'red'],
 ['product',
  'sofa',
  'color',
  'bed',
  'warranty',
  'material',
  'avoid',
  'double',
  'black',
  'finish'],
 ['aw',
  'bellies',
  'heel',
  'material',
  'casual',
  'warranty',
  'pair',
  'shoes',
  'use',
  'shoe'],
 ['shorts',
  'solid',
  'women',
  'cycling',
  'alisha',
  'cotton',
  'lycra',
  'black',
  'red'],
 ['sicons',
  'arnica',
  'dog',
  'purpose',
  'shampoo',
  'ml',
  'pet',
  'brand',
  'quantity'],
 ['paper',
  'crystal',
  'weight',
  'gandhi',
  'finish',
  'silver',
  'eternal',
  'super',
  'series',
  'weights'],
 ['shorts',
  'solid',
  'women',
  'cycling',
  'alisha',
  'cotton',
  'lycra',
  'red',
  'navy'],
 ['product',
  'sofa',
  'color',
  'bed',
  'warranty',
  'material',
  'avoid',
  'double',
  'brown',
  'finish'],
 ['casuals',
  'dilli',
  'bazaaar',
  'bellies',
  'corporate',
  'material',
  'occasion',
  'ethnic',
  'casual',
  'p

In [14]:
for i in range(len(model_df.index)):
    model_df.at[i,'tags'] = token_lists[i]

In [15]:
model_df

Unnamed: 0,product_name,description,tags
0,Alisha Solid Women's Cycling Shorts,Key Features of Alisha Solid Women's Cycling S...,"[shorts, solid, women, cycling, alisha, cotton..."
1,FabHomeDecor Fabric Double Sofa Bed,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,"[product, sofa, color, bed, warranty, material..."
2,AW Bellies,Key Features of AW Bellies Sandals Wedges Heel...,"[aw, bellies, heel, material, casual, warranty..."
3,Alisha Solid Women's Cycling Shorts,Key Features of Alisha Solid Women's Cycling S...,"[shorts, solid, women, cycling, alisha, cotton..."
4,Sicons All Purpose Arnica Dog Shampoo,Specifications of Sicons All Purpose Arnica Do...,"[sicons, arnica, dog, purpose, shampoo, ml, pe..."
...,...,...,...
19995,WallDesign Small Vinyl Sticker,Buy WallDesign Small Vinyl Sticker for Rs.730 ...,"[walldesign, small, vinyl, sticker, buy, rs, o..."
19996,Wallmantra Large Vinyl Stickers Sticker,Buy Wallmantra Large Vinyl Stickers Sticker fo...,"[wallmantra, large, vinyl, stickers, sticker, ..."
19997,Elite Collection Medium Acrylic Sticker,Buy Elite Collection Medium Acrylic Sticker fo...,"[elite, collection, medium, acrylic, sticker, ..."
19998,Elite Collection Medium Acrylic Sticker,Buy Elite Collection Medium Acrylic Sticker fo...,"[elite, collection, medium, acrylic, sticker, ..."


#### Delete missing values (should be done in the first couple of cells)

In [16]:
model_df.isnull().sum()

product_name    0
description     2
tags            0
dtype: int64

In [17]:
model_df.dropna(inplace=True)

#### dtype 

In [18]:
type(model_df['tags'].iloc[0])

list

# Some analyzing of the words

In [None]:
# descriptions = model_df.description.str.cat(sep=' ')
# tokens = word_tokenize(descriptions)
# vocab = set(tokens)

# freq_dist = nltk.FreqDist(tokens)
# sorted(freq_dist, key=freq_dist.__getitem__, reverse=True)[0:10]

# stop_words = set(stopwords.words('english'))
# tokens = [w for w in tokens if not w in stop_words]

In [None]:
# [....]

In [None]:
all_tags = []
for i in token_lists:
    for j in i:
        all_tags.append(j)

In [None]:
cv = CountVectorizer(tokenizer=lambda x:x.split())
tag_dtm = cv.fit_transform(all_tags)
print(("There are {} unique tags").format(tag_dtm.shape[1]))

In [None]:
# Create and generate a word cloud image of just one string/description
wordcloud = WordCloud(background_color="white").generate(test_string)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
tags_counted_and_sorted = pd.DataFrame({'tag': all_tags}).groupby('tag').size().reset_index().sort_values(0, ascending=False)
tags_counted_and_sorted

In [None]:
tags_counted_and_sorted[:10].plot.barh(x='tag', y=0, figsize=(12,8))

# Machine Learning - Multilabel Classification

---

## Baseline

The baseline is actually what I did in the function above. Tags are generated based on the occurunce of a word. The next step is to actually let the Machine Learning Multi-label classification Technique create these tags itself.

## Preprocessing

In [19]:
# there are a total of 19998 rows and 3 different columns. 
model_df.shape

(19998, 3)

In [62]:
n = 2000

In [63]:
target_variable = model_df['tags'][:n]

In [64]:
mlb = MultiLabelBinarizer()
target_variable = mlb.fit_transform(target_variable)

In [65]:
mlb.classes_

array(['_brn', 'aa', 'aadivasi', ..., 'zoysia', 'zte', 'zunia'],
      dtype=object)

### TfidfVectorizer

---

This method tokenizes documents/texts, learns the vocabulary and inverses the document frequency weightings and allows you to encode new documents. 

We will import this method from <b> sklearn.feature_extraction.text </b>. 
There are a lot of different parameters:

- <b>lowercase;</b> Convert all characters to lowercase, which in our case will be handy
- <b>stop_words;</b> If a string, it is passed to _check_stop_list and the appropriate stop list is returned
- <b>ngram_range;</b> The lower and upper boundary of the range of n-values for different n-grams to be extracted
- <b>max_features;</b> If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus


In [66]:
# Initialise the vectorizer
vectorizer = TfidfVectorizer(strip_accents='unicode', 
                             analyzer='word', 
                             ngram_range=(1,3), 
#                              max_features=1000,
                             stop_words='english',
                             token_pattern=r'\w{3,}'
                            )


# fit the independent features
independent_variable = vectorizer.fit_transform(model_df['description'][:n])

print('Independent variable shape: {}'.format(independent_variable.shape))
print('Target variable shape: {}'.format(target_variable.shape))

Independent variable shape: (2000, 78536)
Target variable shape: (2000, 3039)


In [None]:
vectorizer.vocabulary_

#### Train/test sets

In [67]:
# split the data into training en testing sets
X_train, X_test, y_train, y_test = train_test_split(
                                        independent_variable, 
                                        target_variable, 
                                        test_size=0.2, 
                                        random_state=42, 
                                        )

In [68]:
print('X train shape: {}'.format(X_train.shape))
print('y train shape: {}'.format(y_train.shape))

print('------------------------------')

print('X test shape: {}'.format(X_test.shape))
print('y test shape: {}'.format(y_test.shape))

X train shape: (1600, 78536)
y train shape: (1600, 3039)
------------------------------
X test shape: (400, 78536)
y test shape: (400, 3039)


# Model building

---

### Pipeline
### LinearSVC

In [69]:
Linear_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LinearSVC(
                                                class_weight='balanced',
                                                random_state=42,
                                                tol=1e-1,
                                                C=8.385), 
                                                n_jobs=-1)),
            ])

In [70]:
Linear_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('clf',
                 OneVsRestClassifier(estimator=LinearSVC(C=8.385,
                                                         class_weight='balanced',
                                                         dual=True,
                                                         fit_intercept=True,
                                                         intercept_scaling=1,
                                                         loss='squared_hinge',
                                                         max_iter=1000,
                                                         multi_class='ovr',
                                                         penalty='l2',
                                                         random_state=42,
                                                         tol=0.1, verbose=0),
                                     n_jobs=-1))],
         verbose=False)

In [71]:
prediction = Linear_pipeline.predict(X_test)

In [72]:
print('Accuracy for LinearSVC is {}'.format(accuracy_score(y_test, prediction)))

Accuracy for LinearSVC is 0.37


### Logistic Regression

In [None]:
Logistic_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(LogisticRegression(
                                                class_weight='balanced',
                                                random_state=0,
                                                tol=1e-1,
                                                C=8.385), 
                                                n_jobs=-1)),
            ])

In [None]:
Logistic_pipeline.fit(X_train, y_train)

In [None]:
prediction_2 = Logistic_pipeline.predict(X_test)

In [None]:
print('Accuracy for LinearSVC is {}'.format(accuracy_score(y_test, prediction_2)))

### MultinomialNB (Naive Bayes) 

In [None]:
NB_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(MultinomialNB(
                                                fit_prior=True, 
                                                class_prior=None))),
            ])

In [None]:
# NB_pipeline.fit(X_train, y_train)

In [None]:
prediction_3 = NB_pipeline.predict(X_test)

In [None]:
print('Accuracy for Naive Bayes is {}'.format(accuracy_score(y_test, prediction_3)))

# Comparison

In [None]:
# model_names = ['LinearSVC','LogisticRegression','Naive Bayes']
model_names = ['LinearSVC']
scores_10 = [0.3575,0.233,0.0535] #scores tested with half of the dataset
scores_20 = [0.379166] #scores with the whole dataset

In [None]:
score_frame = pd.DataFrame({'Model': model_names, 'Accuracy Score 20k': scores_20})
score_frame

In [None]:
sns.factorplot(y='Model',x='Accuracy Score 20k',data=score_frame,kind='bar',aspect=2)
plt.show()

# Model testing

In [73]:
string_1 = ['Key Features of Alisha Solid Womens Cycling Shorts Cotton Lycra Navy, Red, Navy,Specifications of Alisha Solid Womens Cycling Shorts Shorts Details Number of Contents in Sales Package Pack of 3 Fabric Cotton Lycra Type Cycling Shorts General Details Pattern Solid Ideal For Womens Fabric Care Gentle Machine Wash in Lukewarm Water, Do Not Bleach Additional Details Style Code ALTHT_3P_21 In the Box 3 shorts']

In [74]:
model_string_1 = vectorizer.transform(string_1)
Linear_pipeline.predict(model_string_1)

array([[0, 0, 0, ..., 0, 0, 0]])

### Predicted/generated tags for product description 1

In [76]:
predicted_tags = mlb.inverse_transform(Linear_pipeline.predict(model_string_1))
predicted_tags

[('alisha', 'cotton', 'cycling', 'lycra', 'navy', 'red', 'shorts', 'solid')]

### Tags created by the baseline

In [None]:
baseline_tags = model_df['tags'][0]
baseline_tags

---
## Add tags created by the ML to a whole new dataframe

In [79]:
submission_df = model_df.drop(columns=['tags','product_name'])
sample_df = submission_df[:n]
sample_df.head(5)

Unnamed: 0,description
0,Key Features of Alisha Solid Women's Cycling S...
1,FabHomeDecor Fabric Double Sofa Bed (Finish Co...
2,Key Features of AW Bellies Sandals Wedges Heel...
3,Key Features of Alisha Solid Women's Cycling S...
4,Specifications of Sicons All Purpose Arnica Do...


In [82]:
description_variable = vectorizer.fit_transform(sample_df['description'])
prediction_values = mlb.inverse_transform(Linear_pipeline.predict(description_variable))
sample_df['tags'] = np.asarray(prediction_values)
sample_df.head(5)

Unnamed: 0,description,tags
0,Key Features of Alisha Solid Women's Cycling S...,"(alisha, cotton, cycling, lycra, navy, red, sh..."
1,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,"(avoid, bed, black, color, double, finish, mat..."
2,Key Features of AW Bellies Sandals Wedges Heel...,"(aw, bellies, casual, heel, material, pair, sh..."
3,Key Features of Alisha Solid Women's Cycling S...,"(alisha, black, cotton, cycling, lycra, red, s..."
4,Specifications of Sicons All Purpose Arnica Do...,"(arnica, brand, dog, ml, pet, purpose, quantit..."
