https://www.kaggle.com/c/home-depot-product-search-relevance/data

In [170]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor
from nltk.stem.snowball import SnowballStemmer

In [171]:
# Read the labelled (train) and unlabelled (test) search/product pairs,
# decoding both files as ISO-8859-1.
df_train, df_test = (
    pd.read_csv(csv_name, encoding="ISO-8859-1")
    for csv_name in ("train.csv", "test.csv")
)

In [172]:
# Product descriptions; joined to the search rows on product_uid further down.
df_desc = pd.read_csv("product_descriptions.csv")

In [173]:
df_train.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67


In [174]:
df_desc.head()

Unnamed: 0,product_uid,product_description
0,100001,"Not only do angles make joints stronger, they ..."
1,100002,BEHR Premium Textured DECKOVER is an innovativ...
2,100003,Classic architecture meets contemporary design...
3,100004,The Grape Solar 265-Watt Polycrystalline PV So...
4,100005,Update your bathroom with the Delta Vero Singl...


In [175]:
df_test.head()

Unnamed: 0,id,product_uid,product_title,search_term
0,1,100001,Simpson Strong-Tie 12-Gauge Angle,90 degree bracket
1,4,100001,Simpson Strong-Tie 12-Gauge Angle,metal l brackets
2,5,100001,Simpson Strong-Tie 12-Gauge Angle,simpson sku able
3,6,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong ties
4,7,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong tie hcc668


In [176]:
# Stack train on top of test so both receive identical feature engineering.
# ignore_index renumbers the rows 0..N-1, with the train rows first.
df_all = pd.concat([df_train, df_test], axis=0, ignore_index=True)

In [177]:
df_all.head()

Unnamed: 0,id,product_title,product_uid,relevance,search_term
0,2,Simpson Strong-Tie 12-Gauge Angle,100001,3.0,angle bracket
1,3,Simpson Strong-Tie 12-Gauge Angle,100001,2.5,l bracket
2,9,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,100002,3.0,deck over
3,16,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.33,rain shower head
4,17,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.67,shower only faucet


In [178]:
df_all.shape

(240760, 5)

In [179]:
# Attach each product's description; the left join keeps every search row.
df_all = df_all.merge(df_desc, on='product_uid', how='left')

In [180]:
df_all.head()

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description
0,2,Simpson Strong-Tie 12-Gauge Angle,100001,3.0,angle bracket,"Not only do angles make joints stronger, they ..."
1,3,Simpson Strong-Tie 12-Gauge Angle,100001,2.5,l bracket,"Not only do angles make joints stronger, they ..."
2,9,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,100002,3.0,deck over,BEHR Premium Textured DECKOVER is an innovativ...
3,16,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.33,rain shower head,Update your bathroom with the Delta Vero Singl...
4,17,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.67,shower only faucet,Update your bathroom with the Delta Vero Singl...


In [181]:
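# We mainly care whether the search keywords are contained in the text, so we stem every term so that each word has a single canonical form.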

In [182]:
stemmer = SnowballStemmer('english')


def str_stemmer(s):
    """Lower-case ``s``, split it on whitespace and stem every token.

    Returns the stemmed tokens joined back together with single spaces.
    """
    stemmed_tokens = map(stemmer.stem, s.lower().split())
    return " ".join(stemmed_tokens)

In [183]:
# Naive approach: count how many of the keywords occur. str.find returns the position of the match, or -1 when there is none.

In [184]:
def str_common_word(str1, str2):
    """Count the whitespace-separated words of ``str1`` that occur in ``str2``.

    Matching is by substring, so a word also counts when it appears inside a
    longer word of ``str2``; a word repeated in ``str1`` is counted each time.
    """
    hits = 0
    for token in str1.split():
        if token in str2:
            hits += 1
    return hits

In [185]:
# Stem the search terms so they are comparable with the stemmed product text.
df_all['search_term'] = df_all['search_term'].map(str_stemmer)

In [186]:
# Stem the product titles with the same stemmer as the search terms.
df_all['product_title'] = df_all['product_title'].map(str_stemmer)

In [187]:
# Stem the product descriptions with the same stemmer as the search terms.
df_all['product_description'] = df_all['product_description'].map(str_stemmer)

In [188]:
# Start building features: compare the search term with the title and with the description. `map` applies to a Series; `apply` applies to the rows or columns of a DataFrame.

In [189]:
import Levenshtein

# Row-wise Levenshtein.ratio between the search term and each text field.
df_all['dist_in_title'] = df_all.apply(
    lambda row: Levenshtein.ratio(row['search_term'], row['product_title']),
    axis=1
)
df_all['dist_in_desc'] = df_all.apply(
    lambda row: Levenshtein.ratio(row['search_term'], row['product_description']),
    axis=1
)

In [190]:
df_all.head()

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description,dist_in_title,dist_in_desc
0,2,simpson strong-ti 12-gaug angl,100001,3.0,angl bracket,"not onli do angl make joint stronger, they als...",0.190476,0.030418
1,3,simpson strong-ti 12-gaug angl,100001,2.5,l bracket,"not onli do angl make joint stronger, they als...",0.153846,0.022901
2,9,behr premium textur deckov 1-gal. #sc-141 tugb...,100002,3.0,deck over,behr premium textur deckov is an innov solid c...,0.175,0.017875
3,16,delta vero 1-handl shower onli faucet trim kit...,100005,2.33,rain shower head,updat your bathroom with the delta vero single...,0.326087,0.048632
4,17,delta vero 1-handl shower onli faucet trim kit...,100005,2.67,shower onli faucet,updat your bathroom with the delta vero single...,0.382979,0.054545


In [191]:
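# For TF-IDF we first build a corpus from all the texts (adding periods to keep the sentences complete), then collect every word into one big dictionary.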

In [192]:
# Combined text per row: "<title>.<description>." (each part terminated by a period).
df_all['all_texts'] = (
    (df_all['product_title'] + '.')
    + (df_all['product_description'] + '.')
)

In [193]:
df_all['all_texts'].head()

0    simpson strong-ti 12-gaug angl.not onli do ang...
1    simpson strong-ti 12-gaug angl.not onli do ang...
2    behr premium textur deckov 1-gal. #sc-141 tugb...
3    delta vero 1-handl shower onli faucet trim kit...
4    delta vero 1-handl shower onli faucet trim kit...
Name: all_texts, dtype: object

In [194]:
from gensim.utils import tokenize
from gensim.corpora.dictionary import Dictionary
# Build the token -> id vocabulary from every combined title/description text.
dictionary = Dictionary(
    tokenize(text, errors='ignore') for text in df_all['all_texts'].values
)

In [195]:
print(dictionary)


Dictionary(221877 unique tokens: [u'wooda', u'endfoam', u'removabledur', u'instructionsul', u'woodi']...)


In [196]:
# Convert each text into bag-of-words word counts, giving a sequence of BoW vectors.

In [197]:
class MyCorpus(object):
    """Streamed corpus over ``df_all['all_texts']``.

    Iterating yields one bag-of-words vector per text. The texts are
    tokenized again on every pass, so the vectors are never all held in
    memory at once.
    """

    def __iter__(self):
        for text in df_all['all_texts'].values:
            tokens = list(tokenize(text, errors='ignore'))
            yield dictionary.doc2bow(tokens)

In [198]:
# Lazy corpus object; the texts are only tokenized when it is iterated.
corpus=MyCorpus() 

In [199]:
from gensim.models.tfidfmodel import TfidfModel
tfidf=TfidfModel(corpus) # initialise the TF-IDF model from the corpus

In [200]:
# Quick sanity check: TF-IDF weights of an arbitrary sentence.
tfidf[
    dictionary.doc2bow(list(tokenize('hello,how are you', errors='ignore')))
]

[(6, 0.10690948111380054),
 (480, 0.11521691626581843),
 (2863, 0.354677299907641),
 (33767, 0.9216829378828701)]

In [201]:
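# In each pair the first number is the token's position (id) in the dictionary and the second is its TF-IDF weight. The vectors have different lengths for different sentences only because the words (features) that do not occur are left out, i.e. the representation is sparse.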

In [202]:
from gensim.similarities import MatrixSimilarity

def to_tfidf(text):
    """Return the sparse TF-IDF vector of ``text`` as ``(token_id, weight)`` pairs."""
    tokens = list(tokenize(text, errors='ignore'))
    bow = dictionary.doc2bow(tokens)
    return tfidf[bow]
    
def cos_sim(text1, text2):
    """Return the cosine similarity between the TF-IDF vectors of two texts.

    The similarity is computed directly on the two sparse vectors.  Building
    a dense ``MatrixSimilarity`` index (one row of ``len(dictionary)`` floats)
    on every call is very costly when this function is applied row by row.

    Returns 0.0 when either text contains no token known to the dictionary.
    """
    # Function-scope import: gensim is already a dependency of this notebook.
    from gensim.matutils import cossim

    return float(cossim(to_tfidf(text1), to_tfidf(text2)))

In [203]:
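# Passing a list of lists (or a sequence of sequences) to array() creates a two-dimensional array.
# For a three-dimensional array, pass a list of lists of lists; for four dimensions, a list of lists of lists of lists; and so on.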

In [None]:
# TF-IDF cosine similarity between the search term and the product title, per row.
df_all['tfidf_cos_sim_in_title'] = df_all.apply(
    lambda row: cos_sim(row['search_term'], row['product_title']),
    axis=1
)

In [None]:
# TF-IDF cosine similarity between the search term and the product description, per row.
df_all['tfidf_cos_sim_in_desc'] = df_all.apply(
    lambda row: cos_sim(row['search_term'], row['product_description']),
    axis=1
)

In [None]:
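# word2vec: unlike TF-IDF, it works at the sentence level and takes the surrounding context into account, so we split the texts into sentences with NLTK.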

In [None]:
import nltk
# Fetch the 'punkt' resource through NLTK's downloader; the sentence tokenizer loaded below comes from it.
nltk.download('punkt')

In [None]:
# Load the English Punkt sentence tokenizer from the NLTK data directory.
tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')


In [None]:
# One list of sentences per combined text.
sentences = list(map(tokenizer.tokenize, df_all['all_texts'].values))

In [None]:
# Flatten the list of per-text sentence lists into one flat list of sentences.
# Run this cell only once: flattening again would split each sentence into characters.
sentences = [
    sentence
    for text_sentences in sentences
    for sentence in text_sentences
]

In [None]:
len(sentences)

In [None]:
from nltk.tokenize import word_tokenize
# Word-tokenize every sentence; word2vec trains on lists of tokens.
w2v_corpus = list(map(word_tokenize, sentences))

In [None]:
from gensim.models.word2vec import Word2Vec
# Train word2vec on the tokenized sentences.
model = Word2Vec(
    w2v_corpus,
    size=128,      # dimensionality of the word vectors
    window=5,      # context window size
    min_count=5,   # ignore words seen fewer than 5 times
    workers=4      # training threads
)

In [None]:
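# Every word can now be looked up, like in a dictionary, to get its 128-dimensional w2v vector.
# TF-IDF worked on whole sentences and we want sentence-level vectors here too, so we average the vectors of all the words in a sentence.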

In [None]:
vocab = model.wv.vocab


def get_vector(text):
    """Return the mean word2vec vector of the words in ``text``.

    ``text`` may be a string (split on whitespace) or an iterable of tokens.
    Words missing from the word2vec vocabulary are skipped.  When none of the
    words is known, a zero vector is returned instead of dividing by zero.
    """
    # Iterating a string directly would walk over single characters, so split
    # strings into words first.  The ``hasattr`` check works for both ``str``
    # and ``unicode`` inputs.
    words = text.split() if hasattr(text, 'split') else text
    known_vectors = [model.wv[word] for word in words if word in vocab]
    if not known_vectors:
        # Use the model's own dimensionality rather than a hard-coded size.
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0, dtype=np.float64)

In [None]:
from scipy import spatial

def w2v_cos_sim(text1, text2):
    """Return the cosine similarity of the averaged word2vec vectors of two texts.

    Returns 0.0 whenever the similarity is undefined: an input cannot be
    iterated (e.g. a missing value instead of text), or one of the averaged
    vectors is all-zero or contains NaN because no word was in the vocabulary.
    """
    try:
        w2v1 = get_vector(text1)
        w2v2 = get_vector(text2)
    except TypeError:
        # Non-text input such as a NaN from a missing description.
        return 0.0

    # A NaN or all-zero vector has no direction, so the cosine is undefined;
    # report "no similarity" instead of letting NaN reach the feature matrix.
    if not (np.isfinite(w2v1).all() and np.isfinite(w2v2).all()):
        return 0.0
    if not (w2v1.any() and w2v2.any()):
        return 0.0

    # scipy returns the cosine *distance*; similarity = 1 - distance.
    return float(1 - spatial.distance.cosine(w2v1, w2v2))

In [None]:
# word2vec cosine similarity between the search term and each text field, per row.
df_all['w2v_cos_sim_in_title'] = df_all.apply(
    lambda row: w2v_cos_sim(row['search_term'], row['product_title']),
    axis=1
)
df_all['w2v_cos_sim_in_desc'] = df_all.apply(
    lambda row: w2v_cos_sim(row['search_term'], row['product_description']),
    axis=1
)

In [None]:
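# Alternatively, all 128 word2vec dimensions could be used as features directly instead of computing a distance, but that would be slower.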
df_all.head()

In [None]:
# Drop the raw text columns now that they have been turned into features.
df_all = df_all.drop(
    ['search_term', 'product_title', 'product_description', 'all_texts'],
    axis=1
)

In [None]:
# df_all was built with ignore_index=True and the train rows first, so the
# labels in df_train.index address the train rows of df_all.
df_train=df_all.loc[df_train.index]

In [None]:
# The concat placed the test rows after the train rows and renumbered the
# index, so the labels in df_test.index (0..n_test-1) would address the wrong
# rows of df_all (starting with the training data).  Take every row after the
# train block instead.
df_test = df_all.iloc[len(df_train):]

In [None]:
# Keep the test ids for the submission file; 'id' is dropped from the features below.
test_ids=df_test['id']

In [None]:
# Regression target: the relevance score of each training row.
y_train=df_train['relevance'].values

In [None]:
# Feature matrices: everything except the row id and the target column.
X_train, X_test = (
    frame.drop(['id', 'relevance'], axis=1)
    for frame in (df_train, df_test)
)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [None]:
params = [1, 3, 5, 6, 7, 8, 9, 10]  # candidate max_depth values
test_scores = []
for param in params:
    # Fixed random_state so the CV curve is reproducible between runs.
    clf = RandomForestRegressor(n_estimators=30, max_depth=param, random_state=0)
    # cross_val_score returns one negated MSE per fold (5 values); convert each to RMSE.
    test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=5,
                                          scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Mean cross-validated RMSE for each candidate max_depth.
plt.plot(params, test_scores)
plt.title("Params vs CV Error")

In [None]:
# Final model; random_state is fixed so the fitted forest and the submission are reproducible.
rf = RandomForestRegressor(n_estimators=30, max_depth=8, random_state=0)

In [None]:
# Fit the final model on the full training set.
rf.fit(X_train,y_train)

In [None]:
# Predicted relevance for every test row.
y_pred=rf.predict(X_test)

In [169]:
# Write the (id, relevance) pairs as the submission file, without the index column.
pd.DataFrame(
    {"id": test_ids, "relevance": y_pred}
).to_csv("submission.csv", index=False)