In [None]:
import sys
from collections import Counter

import numpy as np
import pandas as pd

sys.path.append('/Users/wolfsinem/product-tagging/')
from extended_df import model_dataframe

sys.path.append('/Users/wolfsinem/product-tagging/product_tagging')
from tags_generator import tokenize_string

#python built in library to calculate the similarity
from difflib import SequenceMatcher
import difflib

import nltk 
# nltk.download('averaged_perceptron_tagger') # download once

In [None]:
df = model_dataframe()

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.loc[19967]['description']

In [None]:
def similar(string1, string2):
    """Return the difflib similarity ratio of two (product) descriptions.

    :param string1: The first textual description.
    :type string1: string

    :param string2: The second textual description.
    :type string2: string

    :return: The value of ``SequenceMatcher.ratio()`` for the two inputs;
        1.0 means the inputs are identical.
    :rtype: float
    """
    # isjunk=None: no element is explicitly marked as junk for the matcher.
    return SequenceMatcher(isjunk=None, a=string1, b=string2).ratio()

#### As you can see, even though I dropped all exact duplicates, product descriptions that are nearly identical (a similarity of less than 100%) still remain in the dataset.

In [None]:
similar(df.loc[19970]['description'],df.loc[19967]['description'])

#### Descriptions a and b are exactly the same, so their similarity score is 1, i.e. 100%.

In [None]:
df.loc[0]['description']

In [None]:
df.loc[3]['description']

In [None]:
# Plain strings, not one-element lists: SequenceMatcher compares the elements of
# the sequences it receives, so wrapping each description in a list would compare
# the two descriptions as single items instead of character by character.
a = 'Buy Wallmantra Extra Large Vinyl Stickers Sticker for Rs.2194 online. Wallmantra Extra Large Vinyl Stickers Sticker at best prices with FREE shipping & cash on delivery. Only Genuine Products. 30 Day Replacement Guarantee.'
b = 'Buy Wallmantra Extra Large Vinyl Stickers Sticker for Rs.2194 online. Wallmantra Extra Large Vinyl Stickers Sticker at best prices with FREE shipping & cash on delivery. Only Genuine Products. 30 Day Replacement Guarantee.'

similar(a,b)

In [None]:
similar(df.loc[0]['description'],df.loc[3]['description'])

---

In [None]:
product_description = df.loc[5]['description']
product_description

In [None]:
# tags created by the product tagging algorithm
tags = tokenize_string(product_description)
tags

In [None]:
# joined_string = ' '.join(map(str, tags))
# joined_string

In [None]:
pos_tagger = nltk.pos_tag(tags)
pos_tagger

In [None]:
# nltk.download('tagsets')
nltk.help.upenn_tagset('NN')

In [None]:
nltk.help.upenn_tagset('JJ')

### Combine words like 'weight' and 'weights' so we don't use the 'same' tag twice

In [None]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

In [None]:
porter = PorterStemmer()
lancaster = LancasterStemmer()

In [None]:
# Reduce every generated tag to its Porter stem.
stemmed_tags = [porter.stem(tag) for tag in tags]

In [None]:
stemmed_tags

## Lemmatizer


In [None]:
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 

In [None]:
def lemma_tag(set_tags, tags_size=None):
    """Tokenize a text, lemmatize the tokens and return a cleaned list of tags.

    Lemmatization, unlike stemming, reduces the inflected words properly,
    ensuring that the root word belongs to the language.
    See: https://www.datacamp.com/community/tutorials/stemming-lemmatization-python

    To reduce the amount of duplicates in a set of tags we use lemmatization:
    words like 'weight' and 'weights' are considered the same and saved as
    'weight'. After that the tags are de-duplicated, and tags that contain a
    digit, end in "ing" or consist of a single character are dropped.

    :param set_tags: The text (e.g. a product description) that is passed to
        tokenize_string().
    :type set_tags: string

    :param tags_size: Maximum number of tags to return. ``None`` (the default)
        returns every cleaned tag.
    :type tags_size: int or None

    :return: The cleaned tags, at most ``tags_size`` of them.
    :rtype: list
    """

    lemmatizer = WordNetLemmatizer()

    # dict.fromkeys() removes duplicates but, unlike set(), keeps the order in
    # which tokenize_string() produced the words. With set() the order (and so
    # the tags that survive the [:tags_size] cut) could change between kernel
    # restarts because string hashing is randomized per process.
    lemmas = dict.fromkeys(
        lemmatizer.lemmatize(word) for word in tokenize_string(set_tags)
    )

    cleaned = [
        tag for tag in lemmas
        if len(tag) > 1                             # remove words with single character
        and not tag.endswith("ing")                 # drop every tag ending in "ing"
        and not any(ch.isdigit() for ch in tag)     # drop tags containing a digit
    ]

    return cleaned[:tags_size]

In [None]:
lst = lemma_tag(df.loc[300]['description'],3)
lst

In [None]:
def extend_df(df, tags_size):
    """Extend the dataframe with a 'tags' column built from its descriptions.

    Every value of the 'description' column is passed to lemma_tag() and the
    resulting list of tags is stored in the 'tags' column of the same row.
    The dataframe is modified in place and also returned.

    :param df: The original dataframe imported by the user; it must have a
        'description' column.
    :type df: pandas.DataFrame

    :param tags_size: Maximum number of tags per product, forwarded to
        lemma_tag().
    :type tags_size: int

    :return: The same dataframe, extended with the 'tags' column.
    :rtype: pandas.DataFrame
    """

    # Assign the whole column at once. Writing a list into one cell at a time
    # with df.at[...] is slow, and pandas may reject a list value when the
    # 'tags' column does not already exist as an object column.
    df['tags'] = df['description'].apply(
        lambda description: lemma_tag(description, tags_size)
    )

    return df

In [None]:
extend_df(df,2)

### Imagine we tag every product using the new, updated lemmatizing tagger; let's see how often each tag that we have given to a product occurs

In [None]:
# tag every product description and store the tag lists in a new array
df = model_dataframe()
# lemma_tag() needs a tags_size argument; None keeps every tag because
# slicing with [:None] does not truncate the list.
tags = [lemma_tag(description, None) for description in df['description']]

In [None]:
# count the occurrence of every tag with collections.Counter
word_counts = Counter(word for words in tags for word in words)
# show only the 20 most frequent tags instead of dumping the complete list
word_counts.most_common(20)

In [None]:
# The Counter has one key per distinct tag, so its length is the number of unique tags.
print('There are {} unique tags'.format(len(word_counts)))

In [None]:
word_counts.most_common() # see if we can delete words like buy free flipkart etc.

### Using the lemmatizer function we can track down the duplicates in a set of tags and delete them.

___


In [None]:
similar('weight','weights')

In [None]:
similar('walking','walk')

In [None]:
similar('reading','read')

In [None]:
similar('hoping','hope')

In [None]:
similar('helping','help')

In [None]:
similar('stressed','stress')

In [None]:
# `tags` was reassigned earlier to one tag list per product in the dataframe,
# so the tags of the single example product are regenerated here before
# looking up the close matches of every tag among its sibling tags.
product_tags = tokenize_string(product_description)
diff_set = [difflib.get_close_matches(tag, product_tags) for tag in product_tags]

In [None]:
diff_set

In [None]:
diff_set[0][0]

In [None]:
diff_set[0][1]

In [None]:
def similarity_rate(description):
    """Score how alike the closely matching words of a description are.

    The description is tokenized with tokenize_string(). For every token the
    close matches inside that same token list are looked up with
    difflib.get_close_matches(); whenever two or more matches come back, the
    first two are scored against each other with similar().

    :param description: A product description.
    :type description: string

    :return: One ``[first_word, second_word, similarity_score]`` entry for
        every token that had at least two close matches.
    :rtype: list
    """

    tokens = tokenize_string(description)

    pairs = []
    for token in tokens:
        matches = difflib.get_close_matches(token, tokens)
        # A single match (or none) leaves nothing to compare against.
        if len(matches) < 2:
            continue
        first_word, second_word = matches[0], matches[1]
        pairs.append([first_word, second_word, similar(first_word, second_word)])

    return pairs

In [None]:
scores_diff = similarity_rate(df.loc[5]['description'])
scores_diff