## Product Tagging 

In [1]:
import os
import re
# https://moonbooks.org/Articles/How-to-sort-a-list-by-the-number-of-occurrences-in-python-/
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Location of the Flipkart sample CSV. The default is a machine-specific
# absolute path; set the PRODUCT_TAGGING_CSV environment variable to load the
# file from somewhere else without editing the notebook.
DATA_PATH = os.environ.get(
    'PRODUCT_TAGGING_CSV',
    '/Users/wolfsinem/product-tagging/data/flipkart_com-ecommerce_sample.csv',
)
df = pd.read_csv(DATA_PATH)

### All the columns in the dataset

In [3]:
# Inspect the column labels of the raw dataframe.
df.columns

Index(['uniq_id', 'crawl_timestamp', 'product_url', 'product_name',
       'product_category_tree', 'pid', 'retail_price', 'discounted_price',
       'image', 'is_FK_Advantage_product', 'description', 'product_rating',
       'overall_rating', 'brand', 'product_specifications'],
      dtype='object')

---

#### We only need the columns product_name, product_category_tree, description, brand and product_specifications, because these look like the most useful ones for tagging, so we select just those for the new dataframe

In [4]:
# Keep only the text-like columns used for tagging. The explicit .copy() makes
# new_df independent of df, so adding columns to it later (for example a
# 'tags' column) cannot trigger pandas' SettingWithCopyWarning.
new_df = df[[
    'product_name',
    'product_category_tree',
    'description',
    'brand',
    'product_specifications',
]].copy()

In [5]:
# Display the reduced dataframe (rendered in truncated form below).
new_df

Unnamed: 0,product_name,product_category_tree,description,brand,product_specifications
0,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",Key Features of Alisha Solid Women's Cycling S...,Alisha,"{""product_specification""=>[{""key""=>""Number of ..."
1,FabHomeDecor Fabric Double Sofa Bed,"[""Furniture >> Living Room Furniture >> Sofa B...",FabHomeDecor Fabric Double Sofa Bed (Finish Co...,FabHomeDecor,"{""product_specification""=>[{""key""=>""Installati..."
2,AW Bellies,"[""Footwear >> Women's Footwear >> Ballerinas >...",Key Features of AW Bellies Sandals Wedges Heel...,AW,"{""product_specification""=>[{""key""=>""Ideal For""..."
3,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",Key Features of Alisha Solid Women's Cycling S...,Alisha,"{""product_specification""=>[{""key""=>""Number of ..."
4,Sicons All Purpose Arnica Dog Shampoo,"[""Pet Supplies >> Grooming >> Skin & Coat Care...",Specifications of Sicons All Purpose Arnica Do...,Sicons,"{""product_specification""=>[{""key""=>""Pet Type"",..."
...,...,...,...,...,...
19995,WallDesign Small Vinyl Sticker,"[""Baby Care >> Baby & Kids Gifts >> Stickers >...",Buy WallDesign Small Vinyl Sticker for Rs.730 ...,WallDesign,"{""product_specification""=>[{""key""=>""Number of ..."
19996,Wallmantra Large Vinyl Stickers Sticker,"[""Baby Care >> Baby & Kids Gifts >> Stickers >...",Buy Wallmantra Large Vinyl Stickers Sticker fo...,Wallmantra,"{""product_specification""=>[{""key""=>""Number of ..."
19997,Elite Collection Medium Acrylic Sticker,"[""Baby Care >> Baby & Kids Gifts >> Stickers >...",Buy Elite Collection Medium Acrylic Sticker fo...,Elite Collection,"{""product_specification""=>[{""key""=>""Number of ..."
19998,Elite Collection Medium Acrylic Sticker,"[""Baby Care >> Baby & Kids Gifts >> Stickers >...",Buy Elite Collection Medium Acrylic Sticker fo...,Elite Collection,"{""product_specification""=>[{""key""=>""Number of ..."


### Select one description to test manually

In [6]:
# Use the description of the product at row label 0 as a hand-checkable example.
test_string = new_df.loc[0, 'description']
test_string

"Key Features of Alisha Solid Women's Cycling Shorts Cotton Lycra Navy, Red, Navy,Specifications of Alisha Solid Women's Cycling Shorts Shorts Details Number of Contents in Sales Package Pack of 3 Fabric Cotton Lycra Type Cycling Shorts General Details Pattern Solid Ideal For Women's Fabric Care Gentle Machine Wash in Lukewarm Water, Do Not Bleach Additional Details Style Code ALTHT_3P_21 In the Box 3 shorts"

In [9]:
# https://stackoverflow.com/questions/2661778/tag-generation-from-a-text-content 
BAD_CHARS = ".!?,\'\""
words = [ word.strip(BAD_CHARS) for word in test_string.strip().split() if len(word) > 4 ]

In [10]:
# Show the extracted word list.
words

['Key',
 'Features',
 'of',
 'Alisha',
 'Solid',
 "Women's",
 'Cycling',
 'Shorts',
 'Cotton',
 'Lycra',
 'Navy',
 'Red',
 'Navy,Specifications',
 'of',
 'Alisha',
 'Solid',
 "Women's",
 'Cycling',
 'Shorts',
 'Shorts',
 'Details',
 'Number',
 'of',
 'Contents',
 'in',
 'Sales',
 'Package',
 'Pack',
 'of',
 'Fabric',
 'Cotton',
 'Lycra',
 'Type',
 'Cycling',
 'Shorts',
 'General',
 'Details',
 'Pattern',
 'Solid',
 'Ideal',
 'For',
 "Women's",
 'Fabric',
 'Care',
 'Gentle',
 'Machine',
 'Wash',
 'in',
 'Lukewarm',
 'Water',
 'Do',
 'Not',
 'Bleach',
 'Additional',
 'Details',
 'Style',
 'Code',
 'ALTHT_3P_21',
 'In',
 'the',
 'Box',
 'shorts']

#### As you can see below, the word 'Shorts' occurs the most, followed by 'Solid' and 'Women's' 
You could add a new column named 'tags' to the dataframe and store these most frequent words in it

In [None]:
# Tally how often each extracted word occurs, ordered from most to least
# frequent. Pass a number to most_common (e.g. 3) to keep only the top terms.
word_tally = Counter(words)
count_terms = word_tally.most_common()
count_terms

### Trying other products

In [None]:
# Description of the product at row label 1.
test_string_2 = new_df.loc[1, 'description']
test_string_2

In [None]:
# Split on whitespace and commas, strip surrounding punctuation, and only then
# apply the "longer than 4 characters" filter so it is checked against the
# cleaned word rather than the raw token.
words_2 = [
    cleaned
    for cleaned in (token.strip(BAD_CHARS) for token in re.split(r'[\s,]+', test_string_2))
    if len(cleaned) > 4
]
# (term, count) pairs, most frequent first.
count_terms_2 = Counter(words_2).most_common()
count_terms_2

In [None]:
# Description of the product at row label 2.
test_string_3 = new_df.loc[2, 'description']
test_string_3

In [None]:
# Split on whitespace and commas, strip surrounding punctuation, and only then
# apply the "longer than 4 characters" filter so it is checked against the
# cleaned word rather than the raw token.
words_3 = [
    cleaned
    for cleaned in (token.strip(BAD_CHARS) for token in re.split(r'[\s,]+', test_string_3))
    if len(cleaned) > 4
]
# (term, count) pairs, most frequent first.
count_terms_3 = Counter(words_3).most_common()
count_terms_3

In [None]:
# Description of the product at row label 4.
test_string_4 = new_df.loc[4, 'description']
test_string_4

In [None]:
# Split on whitespace and commas, strip surrounding punctuation, and only then
# apply the "longer than 4 characters" filter so it is checked against the
# cleaned word rather than the raw token.
words_4 = [
    cleaned
    for cleaned in (token.strip(BAD_CHARS) for token in re.split(r'[\s,]+', test_string_4))
    if len(cleaned) > 4
]
# (term, count) pairs, most frequent first.
count_terms_4 = Counter(words_4).most_common()
count_terms_4

### For each term, count the number of product descriptions that contain that word. This is the document frequency df(t). 

- search for the word 'shorts' in the other descriptions

In [None]:
# The four sample descriptions, used as a miniature corpus.
list_with_strings = [
    test_string,
    test_string_2,
    test_string_3,
    test_string_4,
]

In [None]:
# First element of the first (term, count) pair: the most frequent term itself.
count_terms[0][0]

In [None]:
# Second element of the first (term, count) pair: how often that term occurs.
count_terms[0][1]

In [None]:
def contains_word(s, w):
    """Return True when ``w`` occurs in ``s`` as a whole word.

    A match must not be directly preceded or followed by a word character
    (letter, digit or underscore). Unlike a plain space-padded substring test,
    this also finds words next to punctuation (``'Navy'`` in ``'Navy, Red'``)
    or separated by tabs/newlines, while still rejecting partial matches
    (``'Shorts'`` in ``'Shortsleeve'``). Matching is case-sensitive.

    Parameters
    ----------
    s : object
        Text to search; converted with ``str()`` so non-string values such as
        NaN do not raise.
    w : object
        Word (or phrase) to look for; converted with ``str()``.

    Returns
    -------
    bool
        True if the word is present, False otherwise. An empty ``w`` never
        matches.
    """
    term = str(w)
    if not term:
        return False
    # re.escape keeps characters such as "'" or "(" in the term literal.
    pattern = rf'(?<!\w){re.escape(term)}(?!\w)'
    return re.search(pattern, str(s)) is not None

In [None]:
# Check each sample description for the most frequent term of the first
# product. More than one True means the word also occurs in other descriptions.
most_common_term = count_terms[0][0]
for description in list_with_strings:
    print(contains_word(description, most_common_term))

### The TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Toy corpus of four short sentences used to walk through the TF-IDF steps.
docs = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

In [None]:
cv = CountVectorizer()
# vectorizer = TfidfVectorizer()

# Learn the vocabulary of the toy corpus and encode each document as term counts.
word_count_vector = cv.fit_transform(docs)

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever this installation provides.
print(
    cv.get_feature_names_out().tolist()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
print(word_count_vector.shape)  # (number of documents, number of unique words)

In [None]:
# Fit the IDF weights on the count matrix of the toy corpus.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

In [None]:
# One row per vocabulary term with its fitted IDF weight.
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever this installation provides.
df_idf = pd.DataFrame(
    tfidf_transformer.idf_,
    index=(
        cv.get_feature_names_out()
        if hasattr(cv, 'get_feature_names_out')
        else cv.get_feature_names()
    ),
    columns=["idf_weights"],
)

In [None]:
# Order terms by ascending IDF: the lower the IDF value of a word, the less
# unique it is.
df_idf.sort_values(by='idf_weights')

In [None]:
# Encode the toy documents as term counts using the already-fitted vectorizer.
count_vector = cv.transform(docs)

In [None]:
# Apply the fitted transformer to the counts to obtain the TF-IDF matrix.
tf_idf_vector = tfidf_transformer.transform(count_vector)

In [None]:
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever this installation provides and keep
# feature_names a plain list either way.
feature_names = (
    cv.get_feature_names_out().tolist()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)

# TF-IDF scores of the first document only (a 1 x n_terms sparse row).
first_document_vector = tf_idf_vector[0]

# Transpose to one row per term; toarray() yields a regular ndarray instead of
# the np.matrix returned by todense().
df_ = pd.DataFrame(first_document_vector.T.toarray(), index=feature_names, columns=['tfidf'])
df_.sort_values(by=['tfidf'], ascending=True)

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.pipeline import Pipeline

# corpus = ['this is the first document',
#            'this document is the second document',
#            'and this is the third one',
#            'is this the first document']


# vocabulary = ['this', 'document', 'first', 'is', 'second', 'the',
#                'and', 'one']


# pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),
#                   ('tfid', TfidfTransformer())]).fit(corpus)


# pipe['count'].transform(corpus).toarray()
# # array([[1, 1, 1, 1, 0, 1, 0, 0],
# #        [1, 2, 0, 1, 1, 1, 0, 0],
# #        [1, 0, 0, 1, 0, 1, 1, 1],
# #        [1, 1, 1, 1, 0, 1, 0, 0]])
# pipe['tfid'].idf_
# # array([1.        , 1.22314355, 1.51082562, 1.        , 1.91629073,
# #        1.        , 1.91629073, 1.91629073])
# pipe.transform(corpus).shape
# (4, 8)

---

In [None]:
# Unique product descriptions, keeping the first occurrence of each. The
# original row labels are retained (the index is not reset).
description_set = df['description'].drop_duplicates(keep='first')

In [None]:
# Peek at the description stored under row label 0.
description_set[0]

In [None]:
# Display the reduced dataframe again for reference.
new_df

In [None]:
# Collect the descriptions stored under row labels 0, 1 and 2 and turn them
# into a count matrix. This re-fits the shared `cv`, replacing the vocabulary
# it learned from `docs`.
string_1 = [description_set[label] for label in (0, 1, 2)]
vec_string_1 = cv.fit_transform(string_1)

In [None]:
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever this installation provides.
print(
    cv.get_feature_names_out().tolist()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
print(vec_string_1.shape)  # (number of descriptions, number of unique words)

In [None]:
# Fit a separate transformer on the count matrix of the three product descriptions.
tfidf_transformer_1 = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer_1.fit(vec_string_1)

In [None]:
# One row per vocabulary term with its fitted IDF weight.
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever this installation provides.
df_idf_1 = pd.DataFrame(
    tfidf_transformer_1.idf_,
    index=(
        cv.get_feature_names_out()
        if hasattr(cv, 'get_feature_names_out')
        else cv.get_feature_names()
    ),
    columns=["idf_weights"],
)
# Displays a sorted copy; df_idf_1 itself keeps its original order.
df_idf_1.sort_values(by=['idf_weights'])

In [None]:
# Show the first 30 rows of the IDF table (df_idf_1 itself was not re-assigned
# after sorting, so this is the unsorted order).
df_idf_1.head(30)