# Importing Libraries and Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from IPython.display import display

from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import FreqDist
from nltk.corpus import wordnet as wn

%matplotlib inline



In [2]:
attributes_raw = pd.read_csv('attributes.csv')
product_desc_raw = pd.read_csv('product_descriptions.csv')
test_raw = pd.read_csv('test.csv')
train_raw = pd.read_csv('train.csv')

In [3]:
attributes_raw.head(20)

Unnamed: 0,product_uid,name,value
0,100001.0,Bullet01,Versatile connector for various 90° connection...
1,100001.0,Bullet02,Stronger than angled nailing or screw fastenin...
2,100001.0,Bullet03,Help ensure joints are consistently straight a...
3,100001.0,Bullet04,Dimensions: 3 in. x 3 in. x 1-1/2 in.
4,100001.0,Bullet05,Made from 12-Gauge steel
5,100001.0,Bullet06,Galvanized for extra corrosion resistance
6,100001.0,Bullet07,Install with 10d common nails or #9 x 1-1/2 in...
7,100001.0,Gauge,12
8,100001.0,Material,Galvanized Steel
9,100001.0,MFG Brand Name,Simpson Strong-Tie


In [4]:
attributes_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2044803 entries, 0 to 2044802
Data columns (total 3 columns):
product_uid    float64
name           object
value          object
dtypes: float64(1), object(2)
memory usage: 46.8+ MB


In [5]:
product_desc_raw.head(20)

Unnamed: 0,product_uid,product_description
0,100001,"Not only do angles make joints stronger, they ..."
1,100002,BEHR Premium Textured DECKOVER is an innovativ...
2,100003,Classic architecture meets contemporary design...
3,100004,The Grape Solar 265-Watt Polycrystalline PV So...
4,100005,Update your bathroom with the Delta Vero Singl...
5,100006,Achieving delicious results is almost effortle...
6,100007,The Quantum Adjustable 2-Light LED Black Emerg...
7,100008,The Teks #10 x 1-1/2 in. Zinc-Plated Steel Was...
8,100009,Get the House of Fara 3/4 in. x 3 in. x 8 ft. ...
9,100010,Valley View Industries Metal Stakes (4-Pack) a...


In [6]:
product_desc_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124428 entries, 0 to 124427
Data columns (total 2 columns):
product_uid            124428 non-null int64
product_description    124428 non-null object
dtypes: int64(1), object(1)
memory usage: 1.9+ MB


In [7]:
train_raw.head(20)

Unnamed: 0,id,product_uid,product_title,search_term,relevance
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67
5,18,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,convection otr,3.0
6,20,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,microwave over stove,2.67
7,21,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,microwaves,3.0
8,23,100007,Lithonia Lighting Quantum 2-Light Black LED Em...,emergency light,2.67
9,27,100009,House of Fara 3/4 in. x 3 in. x 8 ft. MDF Flut...,mdf 3/4,3.0


In [8]:
train_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74067 entries, 0 to 74066
Data columns (total 5 columns):
id               74067 non-null int64
product_uid      74067 non-null int64
product_title    74067 non-null object
search_term      74067 non-null object
relevance        74067 non-null float64
dtypes: float64(1), int64(2), object(2)
memory usage: 2.8+ MB


In [9]:
test_raw.head(20)

Unnamed: 0,id,product_uid,product_title,search_term
0,1,100001,Simpson Strong-Tie 12-Gauge Angle,90 degree bracket
1,4,100001,Simpson Strong-Tie 12-Gauge Angle,metal l brackets
2,5,100001,Simpson Strong-Tie 12-Gauge Angle,simpson sku able
3,6,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong ties
4,7,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong tie hcc668
5,8,100001,Simpson Strong-Tie 12-Gauge Angle,wood connectors
6,10,100003,STERLING Ensemble 33-1/4 in. x 60 in. x 75-1/4...,bath and shower kit
7,11,100003,STERLING Ensemble 33-1/4 in. x 60 in. x 75-1/4...,bath drain kit
8,12,100003,STERLING Ensemble 33-1/4 in. x 60 in. x 75-1/4...,one piece tub shower
9,13,100004,Grape Solar 265-Watt Polycrystalline Solar Pan...,solar panel


# Data Wrangling

In [10]:
train_merged = train_raw.merge(product_desc_raw, on='product_uid', how='left')
train_merged.head(20)

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,"Not only do angles make joints stronger, they ..."
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,"Not only do angles make joints stronger, they ..."
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0,BEHR Premium Textured DECKOVER is an innovativ...
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33,Update your bathroom with the Delta Vero Singl...
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67,Update your bathroom with the Delta Vero Singl...
5,18,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,convection otr,3.0,Achieving delicious results is almost effortle...
6,20,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,microwave over stove,2.67,Achieving delicious results is almost effortle...
7,21,100006,Whirlpool 1.9 cu. ft. Over the Range Convectio...,microwaves,3.0,Achieving delicious results is almost effortle...
8,23,100007,Lithonia Lighting Quantum 2-Light Black LED Em...,emergency light,2.67,The Quantum Adjustable 2-Light LED Black Emerg...
9,27,100009,House of Fara 3/4 in. x 3 in. x 8 ft. MDF Flut...,mdf 3/4,3.0,Get the House of Fara 3/4 in. x 3 in. x 8 ft. ...


## try to find the similarity among the search term and the description and product title

### experiment on the single row:

In [108]:
texts = [train_merged.iloc[1,2].lower(), train_merged.iloc[1,3].lower(), train_merged.iloc[1,5].lower()]
texts

['simpson strong-tie 12-gauge angle',
 'l bracket',
 'not only do angles make joints stronger, they also provide more consistent, straight corners. simpson strong-tie offers a wide variety of angles in various sizes and thicknesses to handle light-duty jobs or projects where a structural connection is needed. some can be bent (skewed) to match the project. for outdoor projects or those where moisture is present, use our zmax zinc-coated connectors, which provide extra resistance against corrosion (look for a "z" at the end of the model number).versatile connector for various 90 connections and home repair projectsstronger than angled nailing or screw fastening alonehelp ensure joints are consistently straight and strongdimensions: 3 in. x 3 in. x 1-1/2 in.made from 12-gauge steelgalvanized for extra corrosion resistanceinstall with 10d common nails or #9 x 1-1/2 in. strong-drive sd screws']

#### extract the text, tokenize the sentences and clean up the text

In [109]:
from nltk.tokenize import word_tokenize, RegexpTokenizer

In [110]:
#tokenize and remove punctuation
tokenizer = RegexpTokenizer(r'\w+')
texts_t = [tokenizer.tokenize(t) for t in texts]

In [111]:
#remove stopwords
from nltk.corpus import stopwords

def remove_stopwords(text):
    return [word for word in text if word not in stopwords.words('english')]
    
texts_t = map(remove_stopwords, texts_t)

In [112]:
#remove suffix of the words
from nltk.stem.wordnet import WordNetLemmatizer
def get_words_stem(tokenized_text):
    lemmatizer = WordNetLemmatizer()
    return map(lemmatizer.lemmatize, tokenized_text)

texts_t = map(get_words_stem, texts_t)

#### use freqdist() to check the frequencies of each word and compare it with the search term

In [113]:
from nltk import FreqDist

In [114]:
def get_freq_in_text(text, word):
#     print text
    freq = FreqDist(text)
    return freq[word]

In [115]:
texts_t

[['simpson', 'strong', 'tie', '12', 'gauge', 'angle'],
 ['l', 'bracket'],
 [u'angle',
  'make',
  u'joint',
  'stronger',
  'also',
  'provide',
  'consistent',
  'straight',
  u'corner',
  'simpson',
  'strong',
  'tie',
  u'offer',
  'wide',
  'variety',
  u'angle',
  'various',
  u'size',
  u'thickness',
  'handle',
  'light',
  'duty',
  u'job',
  u'project',
  'structural',
  'connection',
  'needed',
  'bent',
  'skewed',
  'match',
  'project',
  'outdoor',
  u'project',
  'moisture',
  'present',
  'use',
  'zmax',
  'zinc',
  'coated',
  u'connector',
  'provide',
  'extra',
  'resistance',
  'corrosion',
  'look',
  'z',
  'end',
  'model',
  'number',
  'versatile',
  'connector',
  'various',
  '90',
  u'connection',
  'home',
  'repair',
  'projectsstronger',
  'angled',
  'nailing',
  'screw',
  'fastening',
  'alonehelp',
  'ensure',
  u'joint',
  'consistently',
  'straight',
  'strongdimensions',
  '3',
  'x',
  '3',
  'x',
  '1',
  '1',
  '2',
  'made',
  '12',
  'gau

In [116]:
for word in texts_t[1]:
    print word
    print 'freq in title: ', get_freq_in_text(texts_t[0], word)
    print 'freq in desc: ', get_freq_in_text(texts_t[2], word)

l
freq in title:  0
freq in desc:  0
bracket
freq in title:  0
freq in desc:  0


#### use synsets module to check simlilarity

 ##### method:
* text_words
    * word
    * word
    * word
    * word
        * synset1 <-loop through each synset
        * synset2
        * synset3
        
 compared to:
    * ref_word
        * synset1
        * synset2
 
 find the max similarity between eg word:synset1 and ref_word:synset2
 
 append this simliarity into word's syn_sims
 
 find the max similarity between word:synset2 and the ref_word's synsets
 ...
 until each synset in the word has found the max similarity to the ref_word
 
 then return the max value of the word's syn_sims list to represent the similarity of the word to the ref_word
 

In [121]:
kw = texts_t[1][0]
kw

'l'

In [122]:
kw_syn = wn.synsets(kw)
kw_syn

[Synset('liter.n.01'),
 Synset('fifty.n.01'),
 Synset('lambert.n.01'),
 Synset('l.n.04'),
 Synset('fifty.s.01')]

In [154]:
def check_similarity_word_words(word_syn, words):
    result = {}
    # for each word in the text
    for w in words:
        syn_sims = []
        if len(wn.synsets(w)) > 0 :
#             print 'target word & synsets: ', w, wn.synsets(w)
            # for each synonym in the synsets of the word
            for syn in wn.synsets(w):
#                 print syn, [ref_syn.path_similarity(syn) for ref_syn in word_syn]
                comparisons = [ref_syn.path_similarity(syn) for ref_syn in word_syn]
                # get the highest similarity between the synonyms of the reference word and that of the target word
                sim = max(comparisons)
                print 'sim:', sim
                syn_sims.append(sim)
#             print w, syn_sims    
            result[w] = max(syn_sims)
    return result

In [155]:
print kw, kw_syn, texts_t[0]
sim_kw_title = check_similarity_word_list(kw_syn, texts_t[0])
kw, sim_kw_title

l [Synset('liter.n.01'), Synset('fifty.n.01'), Synset('lambert.n.01'), Synset('l.n.04'), Synset('fifty.s.01')] ['simpson', 'strong', 'tie', '12', 'gauge', 'angle']
sim: 0.0666666666667
sim: 0.0588235294118
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: 0.0588235294118
sim: 0.0909090909091
sim: 0.0833333333333
sim: 0.0714285714286
sim: 0.0588235294118
sim: 0.0833333333333
sim: 0.0769230769231
sim: 0.0588235294118
sim: 0.0666666666667
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: 0.333333333333
sim: None
sim: 0.0588235294118
sim: 0.111111111111
sim: 0.0769230769231
sim: 0.0769230769231
sim: 0.1
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: 0.0833333333333
sim: 0.0769230769231
sim: 0.0714285714286
sim: None
sim: None
sim: None
sim: None
sim: None


('l',
 {'12': 0.3333333333333333,
  'angle': 0.08333333333333333,
  'gauge': 0.1111111111111111,
  'simpson': 0.06666666666666667,
  'strong': None,
  'tie': 0.09090909090909091})

* check similarity between search key word and the product description

In [156]:
sim_kw_desc = check_similarity_word_list(kw_syn, texts_t[2])
kw, sim_kw_desc

sim: 0.0833333333333
sim: 0.0769230769231
sim: 0.0714285714286
sim: None
sim: None
sim: None
sim: None
sim: None
sim: 0.0666666666667
sim: 0.0555555555556
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: 0.0769230769231
sim: 0.0588235294118
sim: 0.0909090909091
sim: 0.0666666666667
sim: 0.0625
sim: 0.0588235294118
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: None
sim: Non

('l',
 {'1': 0.2,
  '12': 0.3333333333333333,
  '2': 0.2,
  '3': 0.2,
  '9': 0.2,
  '90': 0.3333333333333333,
  'also': None,
  u'angle': 0.08333333333333333,
  'angled': None,
  'bent': 0.08333333333333333,
  'coated': None,
  'common': 0.06666666666666667,
  'connection': 0.1111111111111111,
  u'connector': 0.07142857142857142,
  'consistent': None,
  'consistently': None,
  u'corner': 0.08333333333333333,
  'corrosion': 0.07692307692307693,
  'drive': 0.1,
  'duty': 0.08333333333333333,
  'end': 0.125,
  'ensure': None,
  'extra': 0.07692307692307693,
  'fastening': 0.06666666666666667,
  'gauge': 0.1111111111111111,
  'handle': 0.07692307692307693,
  'home': 0.08333333333333333,
  u'job': 0.08333333333333333,
  u'joint': 0.09090909090909091,
  'light': 0.125,
  'look': 0.09090909090909091,
  'made': None,
  'make': 0.06666666666666667,
  'match': 0.16666666666666666,
  'model': 0.08333333333333333,
  'moisture': 0.08333333333333333,
  u'nail': 0.16666666666666666,
  'nailing': None

* mean values of the similarities

In [157]:
def cal_similarities_mean(similarities_list):
    sims = [v for k,v in similarities_list.iteritems()]
    #drop na
    sims = np.array([e for e in sims if e != None])
    return sims.mean()

In [158]:
sim_kw_title_mean =cal_similarities_mean(sim_kw_title)
print sim_kw_title_mean

sim_kw_desc_mean = cal_similarities_mean(sim_kw_desc)
print sim_kw_desc_mean

0.137070707071
0.123636759482


### run on all rows

In [167]:
def remove_stopwords(text):
    return [word for word in text if word not in stopwords.words('english')]

def get_words_stem(tokenized_text):
    lemmatizer = WordNetLemmatizer()
    return map(lemmatizer.lemmatize, tokenized_text)

def check_similarity_word_words(word_syn, words):
    result = {}
    # for each word in the text
    for w in words:
        syn_sims = []
        if len(wn.synsets(w)) > 0 :
#             print 'target word & synsets: ', w, wn.synsets(w)
            # for each synonym in the synsets of the word
            for syn in wn.synsets(w):
#                 print syn, [ref_syn.path_similarity(syn) for ref_syn in word_syn]
                comparisons = [ref_syn.path_similarity(syn) for ref_syn in word_syn]
                # get the highest similarity between the synonyms of the reference word and that of the target word
                if len(comparisons) > 0:
                    sim = max(comparisons)
                else:
                    sim = 0
#                 print 'sim:', sim
                syn_sims.append(sim)
#             print w, syn_sims    
            result[w] = max(syn_sims)
    return result

def cal_similarities_mean(similarities_list):
    sims = [v for k,v in similarities_list.iteritems()]
    #drop na
    sims = np.array([e for e in sims if e != None])
    return sims.mean()



In [168]:
def find_search_similarity_title_desc(row,title_col_name,search_col_name, desc_col_name):
    print row.id
    texts = [row[title_col_name],row[search_col_name],row[desc_col_name] ]
#     print texts

    #tokenize and remove punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    texts_t = [tokenizer.tokenize(t) for t in texts]
    
    #remove stopwords
    texts_t = map(remove_stopwords, texts_t)
    
    #remove suffix of the words
    texts_t = map(get_words_stem, texts_t)
    
    sim_kw_title_mean_all = {}
    sim_kw_desc_mean_all = {}
    
    for kw in texts_t[1]:
        print kw
        #get the synsets of the keyword
        kw_syn = wn.synsets(kw)
        #get the similarity matrix of kw:product_title
        sim_kw_title = check_similarity_word_words(kw_syn, texts_t[0])
        #get the similarity matrix of kw:product_description
        sim_kw_desc = check_similarity_word_words(kw_syn, texts_t[2])
        #calculate the mean similarities in the kw:product_title similarity matrix
        sim_kw_title_mean =cal_similarities_mean(sim_kw_title)
        #calculate the mean similarities in the kw:product_description similarity matrix
        sim_kw_desc_mean = cal_similarities_mean(sim_kw_desc)
    
        sim_kw_title_mean_all[kw] = sim_kw_title_mean
        sim_kw_desc_mean_all[kw] = sim_kw_desc_mean
        
    print sim_kw_title_mean_all, sim_kw_desc_mean_all
    return sim_kw_title_mean_all, sim_kw_desc_mean_all

In [None]:
train_merged['sim'] = train_merged.apply(lambda row: find_search_similarity_title_desc(row, 'product_title','search_term', 'product_description'), axis=1)

2
angle
bracket
{'bracket': 0.20555555555555552, 'angle': 0.34583333333333338} {'bracket': 0.22414709473533001, 'angle': 0.24742647058823528}
3
l
bracket
{'bracket': 0.20555555555555552, 'l': 0.13707070707070707} {'bracket': 0.22414709473533001, 'l': 0.12363675948234772}
9
deck
{'deck': 0.18787878787878787} {'deck': 0.22149102423827696}
16
rain
shower
head
{'shower': 0.23551587301587307, 'head': 0.24186507936507937, 'rain': 0.20952380952380953} {'shower': 0.1923524887810602, 'head': 0.24089491857349002, 'rain': 0.21038614163614167}
17
shower
faucet
{'shower': 0.23551587301587307, 'faucet': 0.20593434343434339} {'shower': 0.1923524887810602, 'faucet': 0.10554180777533274}
18
convection
otr
{'convection': 0.16452607221837995, 'otr': 0.0} {'convection': 0.12190105266370656, 'otr': 0.0}
20
microwave
stove
{'stove': 0.22133037902268671, 'microwave': 0.24041791541791541} {'stove': 0.1485471797196028, 'microwave': 0.15810303992643443}
21
microwave
{u'microwave': 0.24041791541791541} {u'microw



{'decorative': nan, 'wood': 0.113248556998557, '1x1': 0.0, 'rail': 0.2009259259259259} {'decorative': nan, 'wood': 0.1210462937735665, '1x1': 0.0, 'rail': 0.21805860805860802}
90
4
8
beadboard
paneling
{'8': 0.19406478937728938, '4': 0.11073145604395604, 'paneling': 0.27584175084175089, 'beadboard': 0.0} {'8': 0.12647067974608078, '4': 0.1062686595440606, 'paneling': 0.22144429644429636, 'beadboard': 0.0}
92
4x8wood
paneling
{'4x8wood': 0.0, 'paneling': 0.27584175084175089} {'4x8wood': 0.0, 'paneling': 0.22144429644429636}
101
MDF
4x8
{'MDF': 0.0, '4x8': 0.0} {'MDF': 0.0, '4x8': 0.0}
105
wainscot
chair
rail
{'chair': 0.12450820784154118, 'rail': 0.2009259259259259, 'wainscot': 0.2521329365079365} {'chair': 0.15918599918599918, 'rail': 0.21805860805860802, 'wainscot': 0.17280514098695915}
106
wainscot
plank
paneling
{'paneling': 0.27584175084175089, 'plank': 0.18544973544973542, 'wainscot': 0.2521329365079365} {'paneling': 0.22144429644429636, 'plank': 0.20403744403744398, 'wainscot': 0

In [56]:
train_merged.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,"Not only do angles make joints stronger, they ..."
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,"Not only do angles make joints stronger, they ..."
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0,BEHR Premium Textured DECKOVER is an innovativ...
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33,Update your bathroom with the Delta Vero Singl...
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67,Update your bathroom with the Delta Vero Singl...
