# Product tagging using Machine Learning

---

In [None]:
# Data processing
import pandas as pd
import numpy as np
from collections import Counter

# Visualisation
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Language processing
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import nltk
# nltk.download('stopwords') # if you haven't downloaded this yet, you need to now.


# Machine Learning - model training
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.linear_model import SGDClassifier,LogisticRegression
from sklearn.svm import LinearSVC, SVC

from sklearn.multiclass import OneVsRestClassifier

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Absolute location of the Flipkart e-commerce sample CSV on the author's machine.
filepath = '/Users/wolfsinem/product-tagging/data/flipkart_com-ecommerce_sample.csv'

# Read the whole CSV into a dataframe.
df = pd.read_csv(filepath)

In [None]:
# Show the first row of the raw data to see which columns are available.
df.head(1)

In [None]:
# Keep only the two text columns used for tagging. The explicit copy makes
# `model_df` an independent frame, so columns added to it later are not
# written to a slice of `df` (the situation SettingWithCopyWarning flags).
model_df = df[['product_name', 'description']].copy()

In [None]:
# (rows, columns) of the two-column working frame.
model_df.shape

In [None]:
# Turn off pandas' chained-assignment (SettingWithCopy) warning. This is a
# global option, so it stays off for every cell that runs afterwards.
pd.options.mode.chained_assignment = None 
# Add a placeholder column of empty strings; it is overwritten with the
# generated tag lists further down.
model_df['tags'] = ""

In [None]:
# Display the frame to confirm the empty `tags` column was added.
model_df

#### Set all strings to lowercase

In [None]:
# Lowercase both text columns of the working frame and store the result.
# `str.lower()` returns a new Series, so calling it without assigning the
# result (and on `df` rather than `model_df`) left the data unchanged.
for column in ('product_name', 'description'):
    model_df[column] = model_df[column].str.lower()

# Show a few rows so the effect is visible in the cell output.
model_df[['product_name', 'description']].head()

In [None]:
# Use the description of the row labelled 0 as the running example.
test_string = model_df.loc[0, 'description']
test_string

### Function to create tags

In [None]:
# Boilerplate words that occur in most product descriptions and therefore
# make poor tags.
_MANUAL_FILTERED_WORDS = frozenset({
    'details', 'fabric', 'key', 'features', 'sales', 'number', 'contents',
    'type', 'general', 'specifications',
})

# Filled on first use so the NLTK stop-word list is loaded once instead of
# once per description.
_excluded_words_cache = None


def _excluded_words():
    """Return English stop words combined with the manual filter words."""
    global _excluded_words_cache
    if _excluded_words_cache is None:
        _excluded_words_cache = (
            frozenset(stopwords.words('english')) | _MANUAL_FILTERED_WORDS
        )
    return _excluded_words_cache


def tokenize_string(sentence, top_n=10):
    """Return the most frequent meaningful terms of ``sentence``.

    The text is split into runs of word characters (``\\w+``), lowercased,
    and stripped of English stop words and a small set of boilerplate
    words. The remaining terms are ranked by how often they occur.

    Args:
        sentence: The text to extract terms from.
        top_n: Maximum number of terms to return. Defaults to 10, the value
            that used to be hard-coded; ``None`` returns every term.

    Returns:
        A list of at most ``top_n`` lowercase terms, most frequent first.
        Terms containing digits are NOT removed here; callers that do not
        want them must filter the result.
    """
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    words = [token.lower() for token in tokenizer.tokenize(sentence)]

    excluded = _excluded_words()
    kept = [word for word in words if word not in excluded]

    # most_common() yields (term, count) pairs; only the terms are returned.
    return [term for term, _ in Counter(kept).most_common(top_n)]

#### Generating tags with the above function

In [None]:
# Generate tags for the single example description and display them.
term_lists = tokenize_string(test_string)
term_lists

As you can see there are some numbers being used as tags. We will take these out.

#### Now we need to loop over the dataset so the function generates tags for each of the product descriptions

In [None]:
# The filter below drops every token that contains a digit: numbers are not useful as tags because they give no context on their own.
# print([x for x in token_lists[] if not any(c.isdigit() for c in x)])

In [None]:
# Build one tag list per description, in row order.
token_lists = []
for description in model_df['description']:
    # A missing description gets an empty tag list. Passing it through str()
    # would turn NaN into the text "nan" and produce the bogus tag 'nan'.
    if pd.isna(description):
        token_lists.append([])
        continue

    candidate_tags = tokenize_string(str(description))
    # Drop tokens containing any digit; numbers carry no context as tags.
    token_lists.append(
        [tag for tag in candidate_tags if not any(char.isdigit() for char in tag)]
    )

In [None]:
# Display the generated tag lists (one inner list per description).
token_lists

In [None]:
# Attach the generated tags to their rows in a single aligned assignment.
# Building the Series on model_df's own index stores each list as one cell
# value and does not assume the index labels are exactly 0..n-1, which the
# previous label-based `.at[i, 'tags']` loop relied on.
model_df['tags'] = pd.Series(token_lists, index=model_df.index, dtype=object)

In [None]:
# Display the frame with the `tags` column now filled in.
model_df

#### Delete missing values (should be done in the first couple of cells)

In [None]:
# Number of missing values per column.
model_df.isnull().sum()

In [None]:
# Remove, in place, every row that has a missing value in any column.
# The index is not reset, so the labels of removed rows leave gaps.
model_df.dropna(axis=0, how='any', inplace=True)

#### dtype 

In [None]:
# Check the Python type of a single `tags` cell (first row by position).
type(model_df['tags'].iloc[0])

# Some analyzing of the words

In [None]:
# Join all descriptions into one string and tokenize it with NLTK.
descriptions = model_df.description.str.cat(sep=' ')
tokens = word_tokenize(descriptions)
vocab = set(tokens)  # distinct tokens across all descriptions

# Ten most frequent tokens before any filtering. The result is printed
# explicitly: a bare expression in the middle of a cell is evaluated and
# then discarded, so it never showed up in the output.
freq_dist = nltk.FreqDist(tokens)
print(sorted(freq_dist, key=freq_dist.__getitem__, reverse=True)[0:10])

# Remove English stop words. NLTK's stop-word list is lowercase, so the
# comparison uses the lowercased token; otherwise capitalised forms such as
# "The" would pass the filter.
stop_words = set(stopwords.words('english'))
tokens = [w for w in tokens if w.lower() not in stop_words]

In [None]:
# Flatten the per-description tag lists into one flat list of tags.
all_tags = [tag for tag_list in token_lists for tag in tag_list]

In [None]:
# Treat every tag as its own tiny document and split it on whitespace; the
# number of columns in the resulting document-term matrix is then the number
# of distinct tags.
cv = CountVectorizer(tokenizer=str.split)
tag_dtm = cv.fit_transform(all_tags)
print("There are {} unique tags".format(tag_dtm.shape[1]))

In [None]:
# Build a word cloud from the single example description and show it
# without axes.
cloud_factory = WordCloud(background_color="white")
wordcloud = cloud_factory.generate(test_string)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Count how often each tag occurs, then order from most to least frequent.
# After reset_index() the count column carries the label 0.
tag_frame = pd.DataFrame({'tag': all_tags})
tag_counts = tag_frame.groupby('tag').size().reset_index()
tags_counted_and_sorted = tag_counts.sort_values(0, ascending=False)
tags_counted_and_sorted

In [None]:
# Horizontal bar chart of the first ten rows of the descending-sorted counts,
# i.e. the ten most frequent tags; column 0 holds the counts.
tags_counted_and_sorted[:10].plot.barh(x='tag', y=0, figsize=(12,8))

# Machine Learning - Multilabel Classification

---

## Baseline

The baseline is what the function above does: tags are generated from the words that occur most often in a description. The next step is to let a machine learning multi-label classification technique generate these tags itself.

## Preprocessing

In [None]:
# Shape of the working frame at this point; the author recorded a total of
# 19998 rows and 3 columns here.
model_df.shape

In [None]:
# Use the tag lists of the first 1000 rows (by position) as the targets.
target_variable = model_df['tags'].iloc[:1000]

In [None]:
# Turn the per-row tag lists into a binary indicator matrix: one column per
# distinct tag, one row per description. `mlb` is kept so predictions can be
# mapped back to tag names later.
mlb = MultiLabelBinarizer()
target_variable = mlb.fit_transform(target_variable)

In [None]:
# The distinct tags the binarizer learned (the column labels of the matrix).
mlb.classes_

#### There are a lot of non-existing words like 'xf', 'aaa', '_brn'; we could probably filter these out using the NLTK library

In [None]:
# View the indicator matrix as a table with one named column per tag.
pd.DataFrame(target_variable, columns=mlb.classes_)

In [None]:
# Inspect the description stored under index label 97. This is a label
# lookup, so it would raise KeyError if that row was among those removed by
# the earlier dropna (the index is not reset afterwards).
model_df['description'][97]

### TfidfVectorizer

---

This method tokenizes documents/texts, learns the vocabulary and the inverse document frequency (IDF) weightings, and allows you to encode new documents.

We will import this method from <b> sklearn.feature_extraction.text </b>. 
There are a lot of different parameters:

- <b>lowercase;</b> Convert all characters to lowercase, which in our case will be handy
- <b>stop_words;</b> If a string, it is passed to _check_stop_list and the appropriate stop list is returned
- <b>ngram_range;</b> The lower and upper boundary of the range of n-values for different n-grams to be extracted
- <b>max_features;</b> If not None, build a vocabulary that only considers the top max_features terms, ordered by term frequency across the corpus


In [None]:
# Set up the TF-IDF vectorizer:
#   - accents are stripped and the text is analysed word by word,
#   - unigrams, bigrams and trigrams are extracted,
#   - English stop words are removed,
#   - only tokens of at least three word characters are kept.
vectorizer = TfidfVectorizer(
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 3),
    # max_features=1000,
    stop_words='english',
    token_pattern=r'\w{3,}',
)

# Learn the vocabulary from the first 1000 descriptions and encode them as
# the independent features.
training_descriptions = model_df['description'][:1000]
independent_variable = vectorizer.fit_transform(training_descriptions)

# Print both shapes; the row counts of features and targets should match.
for matrix in (independent_variable, target_variable):
    print(matrix.shape)

In [None]:
# Mapping from each learned term (n-gram) to its column index.
vectorizer.vocabulary_

#### Train/test sets

In [None]:
# Display a summary of the encoded feature matrix.
independent_variable

In [None]:
# Split the data into training and testing sets: 20% is held out for
# testing, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    independent_variable,
    target_variable,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Sanity check: features and labels must have the same number of rows in
# each split.
print(X_train.shape, y_train.shape, sep='\n')

print('-------')

print(X_test.shape, y_test.shape, sep='\n')

# Model building

---

### Pipeline

In [None]:
# NOTE: despite the "LogReg" in the name, the estimator is a linear SVM
# (LinearSVC). It is wrapped in OneVsRestClassifier so that one binary
# classifier is trained per tag.
linear_svm = LinearSVC(
    class_weight='balanced',
    random_state=0,
    tol=1e-1,
    C=8.385,
)
one_vs_rest = OneVsRestClassifier(linear_svm, n_jobs=-1)

# Single-step pipeline holding the classifier under the name 'clf'.
LogReg_pipeline = Pipeline([('clf', one_vs_rest)])

In [None]:
# Train the classifier on the training features and their tag indicators.
LogReg_pipeline.fit(X_train, y_train)

In [None]:
# Predict tag indicators for the held-out test features.
prediction = LogReg_pipeline.predict(X_test)

In [None]:
# y_test is a multilabel indicator matrix, so accuracy_score reports subset
# accuracy here: a sample only counts as correct when its complete set of
# tags is predicted exactly.
test_accuracy = accuracy_score(y_test, prediction)
print('Test accuracy is {}'.format(test_accuracy))

# Model testing

In [None]:
# A single product description, wrapped in a list because the vectorizer
# expects an iterable of documents.
string_1 = ['Key Features of Alisha Solid Womens Cycling Shorts Cotton Lycra Navy, Red, Navy,Specifications of Alisha Solid Womens Cycling Shorts Shorts Details Number of Contents in Sales Package Pack of 3 Fabric Cotton Lycra Type Cycling Shorts General Details Pattern Solid Ideal For Womens Fabric Care Gentle Machine Wash in Lukewarm Water, Do Not Bleach Additional Details Style Code ALTHT_3P_21 In the Box 3 shorts']

In [None]:
# Encode the description with the already-fitted vectorizer, then show the
# raw indicator row the classifier predicts for it.
model_string_1 = vectorizer.transform(string_1)
LogReg_pipeline.predict(model_string_1)

### Predicted/generated tags for product description 1

In [None]:
# Predict the indicator row for the description and map it back to tag names.
predicted_indicator = LogReg_pipeline.predict(model_string_1)
predicted_tags = mlb.inverse_transform(predicted_indicator)
predicted_tags

### Tags created by the baseline

In [None]:
# Tags that the frequency-based baseline produced for the row labelled 0.
baseline_tags = model_df.loc[0, 'tags']
baseline_tags

#### The machine learning model added one extra tag 'white', while our baseline didn't

---

# Add to new dataframe

In [None]:
# Start the output frame from the working frame minus the baseline tags and
# the product name.
submission_df = model_df.drop(['tags', 'product_name'], axis='columns')

In [None]:
# Take an independent copy of the first 1000 rows. A column is added to this
# frame later; without the copy that assignment would write to a slice of
# submission_df.
sample_df = submission_df.iloc[:1000].copy()
sample_df.head(5)

In [None]:
# Encode the sample descriptions with the vocabulary and IDF weights the
# vectorizer learned when it was fitted for training. Use transform(), not
# fit_transform(): refitting rebuilds the vocabulary, after which the columns
# are no longer guaranteed to match the features the classifier was trained on.
description_variable = vectorizer.transform(sample_df['description'])

In [None]:
# Predict tag indicators for every sample description and map each row back
# to its tag names.
sample_indicators = LogReg_pipeline.predict(description_variable)
preds = mlb.inverse_transform(sample_indicators)

In [None]:
# Store the predicted tags next to their descriptions (one entry per row).
sample_df['tags'] = preds

In [None]:
# Display the descriptions together with their predicted tags.
sample_df