## Product Tagging 

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv(('/Users/wolfsinem/product-tagging/data/flipkart_com-ecommerce_sample.csv'))

### All the columns in the dataset

In [3]:
df.columns

Index(['uniq_id', 'crawl_timestamp', 'product_url', 'product_name',
       'product_category_tree', 'pid', 'retail_price', 'discounted_price',
       'image', 'is_FK_Advantage_product', 'description', 'product_rating',
       'overall_rating', 'brand', 'product_specifications'],
      dtype='object')

---

#### Since we only need the columns product_name, product_category_tree, description, brand and product_specifications, because these seem interesting, we will only select those for the new dataframe

In [4]:
new_df = df[['product_name','product_category_tree','description','brand','product_specifications']]

In [5]:
new_df

Unnamed: 0,product_name,product_category_tree,description,brand,product_specifications
0,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",Key Features of Alisha Solid Women's Cycling S...,Alisha,"{""product_specification""=>[{""key""=>""Number of ..."
1,FabHomeDecor Fabric Double Sofa Bed,"[""Furniture >> Living Room Furniture >> Sofa B...",FabHomeDecor Fabric Double Sofa Bed (Finish Co...,FabHomeDecor,"{""product_specification""=>[{""key""=>""Installati..."
2,AW Bellies,"[""Footwear >> Women's Footwear >> Ballerinas >...",Key Features of AW Bellies Sandals Wedges Heel...,AW,"{""product_specification""=>[{""key""=>""Ideal For""..."
3,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",Key Features of Alisha Solid Women's Cycling S...,Alisha,"{""product_specification""=>[{""key""=>""Number of ..."
4,Sicons All Purpose Arnica Dog Shampoo,"[""Pet Supplies >> Grooming >> Skin & Coat Care...",Specifications of Sicons All Purpose Arnica Do...,Sicons,"{""product_specification""=>[{""key""=>""Pet Type"",..."
...,...,...,...,...,...
19995,WallDesign Small Vinyl Sticker,"[""Baby Care >> Baby & Kids Gifts >> Stickers >...",Buy WallDesign Small Vinyl Sticker for Rs.730 ...,WallDesign,"{""product_specification""=>[{""key""=>""Number of ..."
19996,Wallmantra Large Vinyl Stickers Sticker,"[""Baby Care >> Baby & Kids Gifts >> Stickers >...",Buy Wallmantra Large Vinyl Stickers Sticker fo...,Wallmantra,"{""product_specification""=>[{""key""=>""Number of ..."
19997,Elite Collection Medium Acrylic Sticker,"[""Baby Care >> Baby & Kids Gifts >> Stickers >...",Buy Elite Collection Medium Acrylic Sticker fo...,Elite Collection,"{""product_specification""=>[{""key""=>""Number of ..."
19998,Elite Collection Medium Acrylic Sticker,"[""Baby Care >> Baby & Kids Gifts >> Stickers >...",Buy Elite Collection Medium Acrylic Sticker fo...,Elite Collection,"{""product_specification""=>[{""key""=>""Number of ..."


### Select one description to test manually

In [8]:
test_string = new_df['description'][0]
test_string

"Key Features of Alisha Solid Women's Cycling Shorts Cotton Lycra Navy, Red, Navy,Specifications of Alisha Solid Women's Cycling Shorts Shorts Details Number of Contents in Sales Package Pack of 3 Fabric Cotton Lycra Type Cycling Shorts General Details Pattern Solid Ideal For Women's Fabric Care Gentle Machine Wash in Lukewarm Water, Do Not Bleach Additional Details Style Code ALTHT_3P_21 In the Box 3 shorts"

In [9]:
# https://stackoverflow.com/questions/2661778/tag-generation-from-a-text-content 
BAD_CHARS = ".!?,\'\""
words = [ word.strip(BAD_CHARS) for word in test_string.strip().split() if len(word) > 4 ]

In [10]:
words

['Features',
 'Alisha',
 'Solid',
 "Women's",
 'Cycling',
 'Shorts',
 'Cotton',
 'Lycra',
 'Navy',
 'Navy,Specifications',
 'Alisha',
 'Solid',
 "Women's",
 'Cycling',
 'Shorts',
 'Shorts',
 'Details',
 'Number',
 'Contents',
 'Sales',
 'Package',
 'Fabric',
 'Cotton',
 'Lycra',
 'Cycling',
 'Shorts',
 'General',
 'Details',
 'Pattern',
 'Solid',
 'Ideal',
 "Women's",
 'Fabric',
 'Gentle',
 'Machine',
 'Lukewarm',
 'Water',
 'Bleach',
 'Additional',
 'Details',
 'Style',
 'ALTHT_3P_21',
 'shorts']

In [20]:
# https://moonbooks.org/Articles/How-to-sort-a-list-by-the-number-of-occurrences-in-python-/
from collections import Counter

#### As you can see below, the word 'Shorts' occurs the most, following 'Solid' and 'Women's' 
You could make a new column named 'tags' and append these words to the new column

In [25]:
count_terms = Counter(words).most_common()
count_terms

[('Shorts', 4),
 ('Solid', 3),
 ("Women's", 3),
 ('Cycling', 3),
 ('Details', 3),
 ('Alisha', 2),
 ('Cotton', 2),
 ('Lycra', 2),
 ('Fabric', 2),
 ('Features', 1),
 ('Navy', 1),
 ('Navy,Specifications', 1),
 ('Number', 1),
 ('Contents', 1),
 ('Sales', 1),
 ('Package', 1),
 ('General', 1),
 ('Pattern', 1),
 ('Ideal', 1),
 ('Gentle', 1),
 ('Machine', 1),
 ('Lukewarm', 1),
 ('Water', 1),
 ('Bleach', 1),
 ('Additional', 1),
 ('Style', 1),
 ('ALTHT_3P_21', 1),
 ('shorts', 1)]

### Trying other products

In [26]:
test_string_2 = new_df['description'][1]
test_string_2

"FabHomeDecor Fabric Double Sofa Bed (Finish Color - Leatherette Black Mechanism Type - Pull Out) Price: Rs. 22,646 • Fine deep seating experience • Save Space with the all new click clack Sofa Bed • Easy to fold and vice versa with simple click clack mechanism • Chrome legs with mango wood frame for long term durability • Double cushioned Sofa Bed to provide you with extra softness to make a fine seating experience • A double bed that can easily sleep two,Specifications of FabHomeDecor Fabric Double Sofa Bed (Finish Color - Leatherette Black Mechanism Type - Pull Out) Installation & Demo Installation & Demo Details Installation and demo for this product is done free of cost as part of this purchase. Our service partner will visit your location within 72 business hours from the delivery of the product. In The Box 1 Sofa Bed General Brand FabHomeDecor Mattress Included No Delivery Condition Knock Down Storage Included No Mechanism Type Pull Out Type Sofa Bed Style Contemporary & Modern 

In [28]:
words_2 = [ word.strip(BAD_CHARS) for word in test_string_2.strip().split() if len(word) > 4 ]
count_terms_2 = Counter(words_2).most_common()
count_terms_2

[('product', 15),
 ('Warranty', 8),
 ('Color', 6),
 ('Material', 6),
 ('Black', 5),
 ('Double', 4),
 ('Leatherette', 4),
 ('Avoid', 4),
 ('exposure', 4),
 ('fabric', 4),
 ('color', 4),
 ('check', 4),
 ('FabHomeDecor', 3),
 ('Fabric', 3),
 ('Mechanism', 3),
 ('Installation', 3),
 ('location', 3),
 ('Included', 3),
 ('Upholstery', 3),
 ('cause', 3),
 ('colour', 3),
 ('cover', 3),
 ('stain', 3),
 ('cleaner', 3),
 ('cloth', 3),
 ('Please', 3),
 ('Primary', 3),
 ('(Finish', 2),
 ('seating', 2),
 ('experience', 2),
 ('click', 2),
 ('clack', 2),
 ('outdoor', 2),
 ('water', 2),
 ('prolonged', 2),
 ('moisture', 2),
 ('direct', 2),
 ('sunlight', 2),
 ('fade', 2),
 ('sharp', 2),
 ('objects', 2),
 ('sofa', 2),
 ('little', 2),
 ('repair', 2),
 ('Vacuum', 2),
 ('sofas', 2),
 ('periodically', 2),
 ('bristled', 2),
 ('brush', 2),
 ('avoid', 2),
 ('spills', 2),
 ('Finish', 2),
 ('Covered', 2),
 ('slightly', 2),
 ('compared', 2),
 ('picture', 2),
 ('displayed', 2),
 ('screen', 2),
 ('lighting', 2),
 ('p

In [29]:
test_string_3 = new_df['description'][2]
test_string_3

'Key Features of AW Bellies Sandals Wedges Heel Casuals,AW Bellies Price: Rs. 499 Material: Synthetic Lifestyle: Casual Heel Type: Wedge Warranty Type: Manufacturer Product Warranty against manufacturing defects: 30 days Care instructions: Allow your pair of shoes to air and de-odorize at regular basis; use shoe bags to prevent any stains or mildew; dust any dry dirt from the surface using a clean cloth; do not use polish or shiner,Specifications of AW Bellies General Ideal For Women Occasion Casual Shoe Details Color Red Outer Material Patent Leather Heel Height 1 inch Number of Contents in Sales Package Pack of 1 In the Box One Pair Of Shoes'

In [32]:
words_3 = [ word.strip(BAD_CHARS) for word in test_string_3.strip().split() if len(word) > 4 ]
count_terms_3 = Counter(words_3).most_common()
count_terms_3

[('Bellies', 3),
 ('Casual', 2),
 ('Type:', 2),
 ('Warranty', 2),
 ('Features', 1),
 ('Sandals', 1),
 ('Wedges', 1),
 ('Casuals,AW', 1),
 ('Price:', 1),
 ('Material:', 1),
 ('Synthetic', 1),
 ('Lifestyle:', 1),
 ('Wedge', 1),
 ('Manufacturer', 1),
 ('Product', 1),
 ('against', 1),
 ('manufacturing', 1),
 ('defects:', 1),
 ('instructions:', 1),
 ('Allow', 1),
 ('shoes', 1),
 ('de-odorize', 1),
 ('regular', 1),
 ('basis;', 1),
 ('prevent', 1),
 ('stains', 1),
 ('mildew;', 1),
 ('surface', 1),
 ('using', 1),
 ('clean', 1),
 ('cloth;', 1),
 ('polish', 1),
 ('shiner,Specifications', 1),
 ('General', 1),
 ('Ideal', 1),
 ('Women', 1),
 ('Occasion', 1),
 ('Details', 1),
 ('Color', 1),
 ('Outer', 1),
 ('Material', 1),
 ('Patent', 1),
 ('Leather', 1),
 ('Height', 1),
 ('Number', 1),
 ('Contents', 1),
 ('Sales', 1),
 ('Package', 1),
 ('Shoes', 1)]

In [34]:
test_string_4 = new_df['description'][4]
test_string_4

'Specifications of Sicons All Purpose Arnica Dog Shampoo (500 ml) General Pet Type Dog Brand Sicons Quantity 500 ml Model Number SH.DF-14 Type All Purpose Fragrance Arnica Form Factor Liquid In the Box Sales Package Shampoo Sicons Dog Fashion Arnica'

In [35]:
words_4 = [ word.strip(BAD_CHARS) for word in test_string_4.strip().split() if len(word) > 4 ]
count_terms_4 = Counter(words_4).most_common()
count_terms_4

[('Sicons', 3),
 ('Arnica', 3),
 ('Purpose', 2),
 ('Shampoo', 2),
 ('Specifications', 1),
 ('General', 1),
 ('Brand', 1),
 ('Quantity', 1),
 ('Model', 1),
 ('Number', 1),
 ('SH.DF-14', 1),
 ('Fragrance', 1),
 ('Factor', 1),
 ('Liquid', 1),
 ('Sales', 1),
 ('Package', 1),
 ('Fashion', 1)]