In [1]:
import sys
import pandas as pd
import numpy as np

sys.path.append('/Users/wolfsinem/product-tagging/')
from extended_df import model_dataframe

sys.path.append('/Users/wolfsinem/product-tagging/product_tagging')
from tags_generator import tokenize_string

# difflib: Python built-in library used to calculate the similarity between sequences
from difflib import SequenceMatcher
import difflib

import nltk 
# nltk.download('averaged_perceptron_tagger') # download once

In [2]:
df = model_dataframe()

In [3]:
df.head()

Unnamed: 0,product_name,description,tags
0,Alisha Solid Women's Cycling Shorts,Key Features of Alisha Solid Women's Cycling S...,
1,FabHomeDecor Fabric Double Sofa Bed,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,
2,AW Bellies,Key Features of AW Bellies Sandals Wedges Heel...,
3,Alisha Solid Women's Cycling Shorts,Key Features of Alisha Solid Women's Cycling S...,
4,Sicons All Purpose Arnica Dog Shampoo,Specifications of Sicons All Purpose Arnica Do...,


In [4]:
df.shape

(12202, 3)

In [5]:
df.loc[19967]['description']

'Buy Wallmantra Extra Large Vinyl Stickers Sticker for Rs.2194 online. Wallmantra Extra Large Vinyl Stickers Sticker at best prices with FREE shipping & cash on delivery. Only Genuine Products. 30 Day Replacement Guarantee.'

In [6]:
df.loc[19970]['description']

'Buy Wallmantra Extra Large Vinyl Stickers Sticker for Rs.2719 online. Wallmantra Extra Large Vinyl Stickers Sticker at best prices with FREE shipping & cash on delivery. Only Genuine Products. 30 Day Replacement Guarantee.'

In [7]:
def similar(string1, string2):
    """Return the similarity ratio of two (product) descriptions.

    The ratio is computed with ``difflib.SequenceMatcher`` (no junk function
    is supplied); a value of 1.0 means both sequences are identical.

    :param string1: The first textual description.
    :type string1: string

    :param string2: The second textual description.
    :type string2: string

    :return: The similarity score of the two descriptions.
    :rtype: float
    """
    return SequenceMatcher(isjunk=None, a=string1, b=string2).ratio()

#### As you can see, even though all exact duplicates were dropped, near-duplicate product descriptions (with a similarity just below 100%) remain in the data set.

In [8]:
similar(df.loc[19970]['description'],df.loc[19967]['description'])

0.9954954954954955

#### Descriptions `a` and `b` (defined in the cell further below) are exactly the same, so their similarity score is 1.0, i.e. 100%.

In [9]:
df.loc[0]['description']

"Key Features of Alisha Solid Women's Cycling Shorts Cotton Lycra Navy, Red, Navy,Specifications of Alisha Solid Women's Cycling Shorts Shorts Details Number of Contents in Sales Package Pack of 3 Fabric Cotton Lycra Type Cycling Shorts General Details Pattern Solid Ideal For Women's Fabric Care Gentle Machine Wash in Lukewarm Water, Do Not Bleach Additional Details Style Code ALTHT_3P_21 In the Box 3 shorts"

In [10]:
df.loc[3]['description']

"Key Features of Alisha Solid Women's Cycling Shorts Cotton Lycra Black, Red,Specifications of Alisha Solid Women's Cycling Shorts Shorts Details Number of Contents in Sales Package Pack of 2 Fabric Cotton Lycra Type Cycling Shorts General Details Pattern Solid Ideal For Women's Fabric Care Gentle Machine Wash in Lukewarm Water, Do Not Bleach Additional Details Style Code ALTGHT_11 In the Box 2 shorts"

In [11]:
# Two identical descriptions, passed as plain strings (the type `similar`
# documents) so the comparison is done character by character instead of
# comparing the single elements of two one-item lists.
a = 'Buy Wallmantra Extra Large Vinyl Stickers Sticker for Rs.2194 online. Wallmantra Extra Large Vinyl Stickers Sticker at best prices with FREE shipping & cash on delivery. Only Genuine Products. 30 Day Replacement Guarantee.'
b = 'Buy Wallmantra Extra Large Vinyl Stickers Sticker for Rs.2194 online. Wallmantra Extra Large Vinyl Stickers Sticker at best prices with FREE shipping & cash on delivery. Only Genuine Products. 30 Day Replacement Guarantee.'

similar(a, b)

1.0

In [12]:
similar(df.loc[0]['description'],df.loc[3]['description'])

0.922509225092251

---

In [13]:
product_description = df.loc[5]['description']
product_description

'Key Features of Eternal Gandhi Super Series Crystal Paper Weights  with Silver Finish Crystal  paper weight Product Dimensions :   8cm x  8cm x 5cm A beautiful product Material: Crystal,Eternal Gandhi Super Series Crystal Paper Weights  with Silver Finish (Set Of 1, Clear) Price: Rs. 430 Your office desk will sparkle and shine when you accent tables with this elegant crystal paper weight. The multifaceted crystal features Gandhiji’s bust and his timeless message – “My life is my message – M.K. Gandhi”. A beautiful product to gift to your near and dear ones in family and Business.,Specifications of Eternal Gandhi Super Series Crystal Paper Weights  with Silver Finish (Set Of 1, Clear) General Model Name Gandhi Paper Weight Mark V Dimensions Weight 323 g In the Box Paper Weight Paper Weight Features Paper Weight Material Crystal Paper Weight Finish Silver Finish'

In [14]:
# tags created by the product tagging algorithm
tags = tokenize_string(product_description)
tags

['paper',
 'crystal',
 'weight',
 'gandhi',
 'finish',
 'silver',
 'eternal',
 'super',
 'series',
 'weights',
 'dimensions',
 '8cm',
 'x',
 'beautiful',
 'set',
 '1',
 'clear',
 'message',
 '5cm',
 'rs']

In [None]:
# joined_string = ' '.join(map(str, tags))
# joined_string

In [15]:
pos_tagger = nltk.pos_tag(tags)
pos_tagger

[('paper', 'NN'),
 ('crystal', 'NN'),
 ('weight', 'VBD'),
 ('gandhi', 'JJ'),
 ('finish', 'JJ'),
 ('silver', 'NN'),
 ('eternal', 'JJ'),
 ('super', 'JJ'),
 ('series', 'NN'),
 ('weights', 'NNS'),
 ('dimensions', 'NNS'),
 ('8cm', 'CD'),
 ('x', 'JJ'),
 ('beautiful', 'JJ'),
 ('set', 'VBN'),
 ('1', 'CD'),
 ('clear', 'JJ'),
 ('message', 'NN'),
 ('5cm', 'CD'),
 ('rs', 'NN')]

In [16]:
# nltk.download('tagsets')
nltk.help.upenn_tagset('NN')

NN: noun, common, singular or mass
    common-carrier cabbage knuckle-duster Casino afghan shed thermostat
    investment slide humour falloff slick wind hyena override subhumanity
    machinist ...


In [17]:
nltk.help.upenn_tagset('JJ')

JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...


### Combine words like 'weight' and 'weights' so we don't use the 'same' tag twice

In [None]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

In [None]:
porter = PorterStemmer()
lancaster=LancasterStemmer()

In [None]:
# Reduce every generated tag to its Porter stem.
stemmed_tags = [porter.stem(tag) for tag in tags]

In [None]:
stemmed_tags

## Lemmatizer


In [18]:
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 

In [20]:
def lemma_tag(set_tags): 
    """Generate a de-duplicated list of lemmatized tags for a description.

    This function uses the NLTK lemmatizer. Lemmatization, unlike stemming,
    reduces the inflected words properly, ensuring that the root word belongs
    to the language.
    See: https://www.datacamp.com/community/tutorials/stemming-lemmatization-python

    To reduce the amount of duplicates in a set of tags we thus use
    lemmatization. Words like 'weight' and 'weights' will be considered the
    same and be saved as 'weight'.

    :param set_tags: The text (e.g. a product description) that is passed to
        ``tokenize_string`` to obtain the candidate tags.
    :type set_tags: string

    :return: The unique lemmatized tags, in the order in which
        ``tokenize_string`` first produced them, without tags that contain a
        digit or consist of a single character.
    :rtype: list
    """

    lemmatizer = WordNetLemmatizer()

    # dict.fromkeys removes duplicates while keeping first-seen order. A plain
    # set() would return the tags in an arbitrary order that can change from
    # one kernel session to the next.
    unique_lemmas = dict.fromkeys(
        lemmatizer.lemmatize(word) for word in tokenize_string(set_tags)
    )

    return [
        lemma for lemma in unique_lemmas
        if len(lemma) > 1  # remove words with a single character
        and not any(c.isdigit() for c in lemma)  # remove tags containing digits
    ]

In [21]:
lst = lemma_tag(df.loc[19900]['description'])
lst

['genuine',
 'pigmented',
 'cash',
 'extra',
 'price',
 'film',
 'polyvinyl',
 'free',
 'imported',
 'best',
 'large',
 'buy',
 'online',
 'delivery',
 'product',
 'shipping',
 'uberlyfe',
 'sticker']

### Using the lemmatizer function we can track down the duplicates in a set of tags and delete them.

___


In [None]:
similar('weight','weights')

In [None]:
similar('walking','walk')

In [None]:
similar('reading','read')

In [None]:
similar('hoping','hope')

In [None]:
similar('helping','help')

In [None]:
similar('stressed','stress')

In [None]:
# For every tag, collect its close matches among all generated tags.
diff_set = [difflib.get_close_matches(tag, tags) for tag in tags]

In [None]:
diff_set

In [None]:
diff_set[0][0]

In [None]:
diff_set[0][1]

In [None]:
def similarity_rate(description):
    """Score how similar the tags generated for a description are to each other.

    The description is tokenized with ``tokenize_string``. For every resulting
    tag the close matches within that same tag list are looked up with
    ``difflib.get_close_matches`` (default settings). When at least two
    matches are found, the two best ones are scored with ``similar``.

    :param description: A product description.
    :type description: string

    :return: One ``[first_word, second_word, similarity_score]`` entry per tag
        that has at least two close matches.
    :rtype: list
    """
    tokens = tokenize_string(description)

    scored_pairs = []
    for token in tokens:
        matches = difflib.get_close_matches(token, tokens)
        if len(matches) < 2:
            # Fewer than two matches: nothing to compare this tag with.
            continue
        first_word, second_word = matches[0], matches[1]
        scored_pairs.append(
            [first_word, second_word, similar(first_word, second_word)]
        )

    return scored_pairs

In [None]:
scores_diff = similarity_rate(product_description)
scores_diff

In [None]:
# NOTE(review): unfinished cell — iterates over the last element of every entry
# in `scores_diff` (the similarity score appended by `similarity_rate`) but
# does nothing with it yet.
for i in [sublist[-1] for sublist in scores_diff]:
    pass