###### Import Packages

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import re
import string
import pickle

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 


import spacy as sp
# import en_core_web_md

# misspelling:
from textblob import TextBlob
# missing whitespace
import wordninja

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from gensim import corpora, models, similarities, matutils
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yichi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


###### Load in data

In [None]:
description = pd.read_csv('home-depot-product-search-relevance/product_descriptions.csv', header = 0)

**sample first 50000 data

In [None]:
sample_df = description[:50000]
sample_df.sample(10)

###### Tokenization

Step1: Remove numbers, captial letters and punctuation: <br>
(misspelling correction might be needed for social media data, using TextBlog, but since this data is from professional business site, spelling is not an issue)
Step2: Self-define stopwords and remove all stopwords<br>
Step3: Lemmitization<BR>
Step4: Remove duplications

In [None]:
# string.punctuation.replace('#','')

In [None]:
customize_stopwords = (['add', 'added', 'additional','allow', 'allows','best','come','cut', \
                        'cutting', 'day', 'easily', 'easy', 'efficiency', 'efficient', 'end',\
                        'ensure', 'help','included', 'includes', 'including','item','look','make', \
                        'nbsp', 'need','required', 'requires','use','used', 'using', 'variety', \
                        'vary','work','year','simple', 'single', 'size', 'small','home','depot',\
                       'pre','ready', 'resident', 'residential'])

In [None]:
def transform_sentence_to_list_of_words(df, column, new_column):
    description_words = []

    alphanumeric = lambda x: re.sub('\w*\d\w*', '', x)
    punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x.lower())

    df[new_column] = df[column].map(alphanumeric).map(punc_lower)

    for n in range(len(df[new_column])):
        text = df[new_column].iloc[n]
        
        
#       separate words missing whitespace
        separation = wordninja.split(text)

#       Load stop words
        stop_words = stopwords.words('english')
#       Remove basic stop words
        new_text = [word for word in separation if word not in stop_words]
#       remove customized stop words
        new_text = [word for word in new_text if word not in customize_stopwords]
#       lemmitization
        lemmatizer = WordNetLemmatizer() 
        lemm=[]
        for i in new_text:
            lemm.append(lemmatizer.lemmatize(i))

#       remove duplicate words
        unique_words = []
        for i in lemm:
            if i not in unique_words:
                unique_words.append(i)
#       add each list of unique words in each row to a new list 
#       that will replace the original description
        unique_words = ' '.join(unique_words)
        description_words.append(unique_words)
    
    df[new_column] = pd.Series(description_words)

In [None]:
transform_sentence_to_list_of_words(df, 'product_description','description_words')

In [None]:
transform_sentence_to_list_of_words(description, 'product_description','description_words')

In [None]:
# pickle processed dataframe
with open('pickle/sample.pkl', 'wb') as picklefile:
    pickle.dump(sample_df, picklefile)

In [None]:
with open('pickle/full_description.pkl', 'wb') as picklefile:
    pickle.dump(description, picklefile)

###### *Quick loading processed description dataframe*

In [2]:
with open('pickle/full_description.pkl', 'rb') as picklefile: 
    df = pickle.load(picklefile)

###### CountVectorizer

In [3]:
cv = CountVectorizer(stop_words='english', min_df=0.05, max_df=0.3)
X_cv = cv.fit_transform(df.description_words)

pd.DataFrame(X_cv.toarray(), columns=cv.get_feature_names()).head(7)

Unnamed: 0,accent,accessory,add,adjustable,air,aluminum,appearance,application,area,assembly,...,water,watt,way,weather,weight,white,wide,width,wood,year
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
6,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


###### TF-IDF

In [4]:
# TF-IDF

vectorizer = TfidfVectorizer(stop_words='english', min_df=0.05, max_df=0.3)
X = vectorizer.fit_transform(df.description_words)

pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names()).head()

Unnamed: 0,accent,accessory,add,adjustable,air,aluminum,appearance,application,area,assembly,...,water,watt,way,weather,weight,white,wide,width,wood,year
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.252485,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.177769,0.0
2,0.0,0.286597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.24847,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.181413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


###### LSA

In [5]:
# LSA
lsa = TruncatedSVD(10)
lsa_topic = lsa.fit_transform(X)
lsa.explained_variance_ratio_

array([0.00690066, 0.0254325 , 0.02013834, 0.01899063, 0.01755225,
       0.01571087, 0.01319896, 0.01192622, 0.01092248, 0.01001492])

In [6]:
# show words in each topic
lsa_topic_word_df = pd.DataFrame(lsa.components_.round(3),
             index = (["component_1",
                       "component_2",
                       'component_3',
                       'component_4',
                        'component_5',
                      'component_6',
                      'component_7',
                      'component_8',
                      'component_9',
                      'component_10'
#                       "component_11",
#                        "component_12",
#                        'component_13',
#                        'component_14',
#                         'component_15',
#                       'component_16',
#                       'component_17',
#                       'component_18',
#                       'component_19',
#                       'component_20',
#                       'component_21',
#                        "component_22",
#                        'component_23',
#                        'component_24',
#                         'component_25',
#                       'component_26',
#                       'component_27',
#                       'component_28',
#                       'component_29',
#                       'component_30'
                      ]),
             columns = vectorizer.get_feature_names())

lsa_topic_word_df

Unnamed: 0,accent,accessory,add,adjustable,air,aluminum,appearance,application,area,assembly,...,water,watt,way,weather,weight,white,wide,width,wood,year
component_1,0.041,0.046,0.042,0.066,0.062,0.061,0.055,0.102,0.072,0.053,...,0.113,0.052,0.063,0.056,0.046,0.077,0.062,0.053,0.095,0.052
component_2,-0.01,-0.029,-0.008,-0.071,-0.034,-0.02,0.057,0.07,0.001,-0.045,...,0.042,-0.084,0.002,0.011,-0.005,-0.066,-0.023,0.008,0.112,-0.04
component_3,-0.004,-0.012,-0.013,0.012,0.054,0.005,0.04,0.009,0.007,-0.033,...,-0.057,0.24,0.003,0.014,-0.086,0.055,-0.031,-0.126,-0.067,0.06
component_4,0.074,-0.025,0.036,-0.068,0.005,-0.044,0.075,-0.009,0.067,-0.046,...,0.052,0.136,0.031,0.001,0.056,0.076,-0.009,0.116,-0.028,0.046
component_5,0.084,0.028,0.096,0.064,-0.079,-0.005,0.004,-0.131,-0.056,0.074,...,-0.121,0.084,0.055,-0.03,-0.068,0.105,-0.01,-0.032,0.077,0.013
component_6,0.019,-0.01,-0.009,-0.044,-0.012,-0.001,-0.038,0.183,-0.047,-0.031,...,-0.113,0.1,-0.013,-0.078,0.116,0.008,0.008,0.068,-0.047,-0.008
component_7,-0.024,0.018,0.016,0.084,0.072,-0.061,-0.067,-0.129,-0.005,0.008,...,0.026,-0.07,0.05,-0.087,0.064,-0.09,0.025,0.062,-0.086,-0.036
component_8,0.026,-0.008,-0.009,0.111,-0.031,0.069,-0.01,-0.061,0.075,0.096,...,-0.175,0.005,0.003,0.034,0.093,0.038,0.062,0.032,0.102,-0.037
component_9,0.022,0.035,-0.006,-0.012,0.027,-0.105,-0.086,-0.087,0.097,-0.004,...,-0.087,-0.036,0.087,-0.133,-0.032,-0.042,0.007,0.09,0.008,-0.038
component_10,-0.014,-0.032,0.015,0.02,0.051,0.141,-0.079,-0.032,-0.01,0.033,...,0.004,-0.025,-0.014,0.171,-0.039,-0.088,-0.033,0.02,-0.046,-0.07


In [7]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [8]:
# topics returned by CountVector
display_topics(lsa, cv.get_feature_names(), 10)


Topic  0
design, feature, installation, construction, designed, provides, high, product, steel, durable

Topic  1
proposition, resident, california, information, paint, online, sq, care, surface, state

Topic  2
light, watt, plan, protection, bulb, led, energy, lighting, hour, power

Topic  3
color, light, bulb, indoor, room, online, watt, pattern, width, order

Topic  4
style, hardware, decor, collection, proposition, resident, door, california, glass, bulb

Topic  5
application, piece, length, box, screw, proposition, resident, california, lb, ground

Topic  6
warranty, limited, control, lifetime, plan, power, information, handle, protection, tool

Topic  7
heavy, duty, storage, lb, frame, ft, door, capacity, plan, space

Topic  8
note, product, store, clean, provide, dimension, great, room, length, cleaning

Topic  9
listed, ul, product, warranty, note, store, limited, protection, weather, resistant


In [9]:
# topics returned by tfidf
display_topics(lsa, vectorizer.get_feature_names(), 12)


Topic  0
design, feature, installation, construction, designed, provides, high, product, steel, durable, color, resistant

Topic  1
proposition, resident, california, information, paint, online, sq, care, surface, state, wood, color

Topic  2
light, watt, plan, protection, bulb, led, energy, lighting, hour, power, fixture, california

Topic  3
color, light, bulb, indoor, room, online, watt, pattern, width, order, recommended, lighting

Topic  4
style, hardware, decor, collection, proposition, resident, door, california, glass, bulb, solid, decorative

Topic  5
application, piece, length, box, screw, proposition, resident, california, lb, ground, tool, listed

Topic  6
warranty, limited, control, lifetime, plan, power, information, handle, protection, tool, california, resident

Topic  7
heavy, duty, storage, lb, frame, ft, door, capacity, plan, space, adjustable, large

Topic  8
note, product, store, clean, provide, dimension, great, room, length, cleaning, pattern, area

Topic  9
lis

###### NMF (Non-Negative Matrix Factorization)

In [10]:
nmf_model = NMF(n_components=10, init='nndsvd', alpha=0.7)
nmf_topic = nmf_model.fit_transform(X)

# show words in each topic
nmf_topic_word_df = pd.DataFrame(nmf_model.components_.round(3),
             index = (["component_1",
                       "component_2",
                       'component_3',
                       'component_4',
                        'component_5',
                      'component_6',
                      'component_7',
                      'component_8',
                      'component_9',
                      'component_10']),
             columns = vectorizer.get_feature_names())

nmf_topic_word_df

Unnamed: 0,accent,accessory,add,adjustable,air,aluminum,appearance,application,area,assembly,...,water,watt,way,weather,weight,white,wide,width,wood,year
component_1,0.042,0.013,0.0,0.0,0.062,0.268,0.936,0.401,0.545,0.0,...,1.39,0.0,0.021,0.787,0.0,0.116,0.05,0.0,0.837,0.124
component_2,0.053,0.015,0.174,0.0,0.0,0.0,0.064,0.383,0.0,0.0,...,0.0,0.0,0.268,0.0,0.0,0.0,0.003,0.0,0.534,0.0
component_3,0.463,0.0,0.242,0.0,0.0,0.112,0.671,0.141,0.175,0.0,...,0.0,2.476,0.171,0.19,0.0,1.214,0.0,0.0,0.0,0.981
component_4,0.255,0.02,0.099,0.0,0.203,0.041,0.137,0.849,0.349,0.0,...,0.613,0.0,0.146,0.0,1.341,0.149,0.405,1.326,0.195,0.0
component_5,0.485,0.505,0.672,1.015,0.0,0.288,0.0,0.0,0.122,0.959,...,0.0,0.0,0.557,0.058,0.0,0.857,0.451,0.089,0.829,0.027
component_6,0.0,0.117,0.0,0.052,0.0,0.614,0.009,0.941,0.0,0.227,...,0.046,0.0,0.0,0.33,0.134,0.022,0.279,0.0,0.292,0.136
component_7,0.0,0.187,0.048,0.0,0.133,0.039,0.0,0.0,0.0,0.016,...,0.792,0.0,0.064,0.225,0.0,0.0,0.023,0.0,0.0,0.508
component_8,0.0,0.261,0.0,0.644,1.104,0.08,0.0,0.0,0.539,0.063,...,0.378,0.007,0.265,0.017,0.119,0.0,0.369,0.0,0.0,0.0
component_9,0.048,0.072,0.007,0.0,0.14,0.0,0.0,0.035,0.447,0.0,...,0.095,0.0,0.352,0.0,0.0,0.026,0.039,0.646,0.0,0.075
component_10,0.0,0.0,0.0,0.0,0.243,0.241,0.0,0.765,0.0,0.0,...,0.085,0.143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
display_topics(nmf_model, vectorizer.get_feature_names(), 12)


Topic  0
paint, surface, sq, color, care, ft, cover, resistant, state, water, stain, exterior

Topic  1
resident, proposition, california, information, wood, application, screw, pack, interior, piece, trim, head

Topic  2
light, bulb, watt, lighting, fixture, energy, led, white, hour, base, year, glass

Topic  3
order, ground, grade, length, multiple, tile, box, weight, width, lb, recommended, commercial

Topic  4
hardware, style, door, design, installation, solid, space, decor, panel, storage, cabinet, mount

Topic  5
steel, construction, heavy, duty, material, designed, stainless, screw, metal, strength, fit, hole

Topic  6
warranty, limited, lifetime, free, quality, water, standard, faucet, maintenance, meet, installation, performance

Topic  7
protection, plan, power, control, feature, time, provides, air, ft, large, handle, operation

Topic  8
product, note, store, dimension, provide, length, clean, pattern, usa, color, width, area

Topic  9
listed, ul, safety, application, type,

###### LDA

In [15]:
# Create the term-document matrix
# Transpose it so the terms are the rows
doc_word = vectorizer.transform(df['description_words']).transpose()

doc_topic = nmf_model.fit_transform(doc_word)

# Convert sparse matrix of counts to a gensim corpus
corpus = matutils.Sparse2Corpus(doc_word)

id2word = dict((v, k) for k, v in vectorizer.vocabulary_.items())

# Create lda model
lda = models.LdaModel(corpus=corpus, num_topics=10, id2word=id2word, passes=5)

In [16]:
for i in lda.show_topics(formatted=False):
    x = len(i[1])
    print ('topic %s' % i[0])
    print ([i[1][j][0] for j in range(10)])

topic 0
['proposition', 'resident', 'california', 'information', 'screw', 'pack', 'steel', 'construction', 'piece', 'head']
topic 1
['designed', 'corrosion', 'steel', 'application', 'high', 'listed', 'life', 'material', 'performance', 'standard']
topic 2
['paint', 'sq', 'care', 'surface', 'california', 'state', 'resident', 'proposition', 'information', 'cover']
topic 3
['color', 'note', 'product', 'clean', 'pattern', 'stain', 'dimension', 'design', 'store', 'room']
topic 4
['door', 'installation', 'frame', 'opening', 'aluminum', 'fit', 'install', 'panel', 'warranty', 'hardware']
topic 5
['bulb', 'light', 'watt', 'fixture', 'lighting', 'led', 'glass', 'listed', 'base', 'white']
topic 6
['plan', 'control', 'protection', 'power', 'feature', 'time', 'air', 'temperature', 'ft', 'technology']
topic 7
['order', 'ground', 'tile', 'grade', 'multiple', 'length', 'floor', 'online', 'commercial', 'width']
topic 8
['faucet', 'design', 'style', 'hardware', 'solid', 'bathroom', 'installation', 'const

In [14]:
with open('pickle/lda_model.pkl', 'wb') as picklefile:
    pickle.dump(lda, picklefile)

###### *Quick loading lda model*

In [None]:
with open('pickle/lda_model.pkl', 'rb') as picklefile: 
    lda = pickle.load(picklefile)

######  **Topics**

1. Flooring
2. Power Tools & Accessories
3. Kitchen applicances --Ranges
4. Hardware/Door & Windows
5. Flooring
6. Paint
7. Plumbing
8. Hardware
9. Lighting -- Light Bulbs
10. Storage & Organization

_Topic Words List_
topic 0 <br>
['heavy', 'duty', 'steel', 'lb', 'storage', 'coated', 'construction', 'durable', 'hold', 'feature']<br>
topic 1<br>
['ground', 'length', 'multiple', 'grade', 'width', 'tile', 'online', 'weight', 'trim', 'commercial']<br>
topic 2<br>
['color', 'resistant', 'dimension', 'clean', 'pattern', 'durable', 'stain', 'product', 'beautiful', 'style']<br>
topic 3<br>
['hardware', 'door', 'style', 'installation', 'design', 'cabinet', 'construction', 'solid', 'faucet', 'mount']<br>
topic 4<br>
['surface', 'plastic', 'clean', 'great', 'water', 'fit', 'safe', 'cover', 'ft', 'area']<br>
topic 5<br>
['proposition', 'california', 'resident', 'information', 'paint', 'sq', 'surface', 'wood', 'cover', 'color']<br>
topic 6<br>
['pressure', 'water', 'flow', 'performance', 'high', 'maximum', 'standard', 'designed', 'life', 'construction']<br>
topic 7<br>
['screw', 'steel', 'listed', 'construction', 'ul', 'pack', 'application', 'california', 'proposition', 'resident']<br>
topic 8<br>
['bulb', 'light', 'watt', 'lighting', 'led', 'energy', 'white', 'glass', 'hour', 'listed']<br>
topic 9<br>
['plan', 'protection', 'control', 'power', 'speed', 'volt', 'feature', 'ft', 'temperature', 'air']



###### Cosine Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
vectorizer1 = TfidfVectorizer(n_components = 2, stop_words='english')
compare_string = ['angle bracket']
compare_string_vector = vectorizer1.fit_transform(compare_string)
compare_vector = lsa.fit_transform(compare_string_vector)

In [None]:
cosine_similarity(doc_topic[0].reshape(1,-1), compare_vector.reshape(1,-1))