### step1 导入需要的数据&包

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from nltk.stem.snowball import SnowballStemmer
import os

In [2]:
os.chdir("/Users/a1/Desktop/learning/kaggle_04/kaggle_04")

In [3]:
df_train = pd.read_csv('train.csv', encoding = "ISO-8859-1")
df_test = pd.read_csv('test.csv', encoding = "ISO-8859-1")

In [4]:
df_desc = pd.read_csv('product_descriptions.csv')


In [5]:
df_train.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67


In [6]:
df_train.shape

(74067, 5)

In [7]:
df_test.head()

Unnamed: 0,id,product_uid,product_title,search_term
0,1,100001,Simpson Strong-Tie 12-Gauge Angle,90 degree bracket
1,4,100001,Simpson Strong-Tie 12-Gauge Angle,metal l brackets
2,5,100001,Simpson Strong-Tie 12-Gauge Angle,simpson sku able
3,6,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong ties
4,7,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong tie hcc668


In [8]:
df_test.shape

(166693, 4)

In [9]:
df_desc.head()

Unnamed: 0,product_uid,product_description
0,100001,"Not only do angles make joints stronger, they ..."
1,100002,BEHR Premium Textured DECKOVER is an innovativ...
2,100003,Classic architecture meets contemporary design...
3,100004,The Grape Solar 265-Watt Polycrystalline PV So...
4,100005,Update your bathroom with the Delta Vero Singl...


In [10]:
df_desc.shape

(124428, 2)

In [11]:
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)

In [12]:
df_all.head()

Unnamed: 0,id,product_title,product_uid,relevance,search_term
0,2,Simpson Strong-Tie 12-Gauge Angle,100001,3.0,angle bracket
1,3,Simpson Strong-Tie 12-Gauge Angle,100001,2.5,l bracket
2,9,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,100002,3.0,deck over
3,16,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.33,rain shower head
4,17,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.67,shower only faucet


In [13]:
df_all.shape

(240760, 5)

In [14]:
df_all = pd.merge(df_all, df_desc, how='left', on='product_uid')

In [15]:
df_all.head()

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description
0,2,Simpson Strong-Tie 12-Gauge Angle,100001,3.0,angle bracket,"Not only do angles make joints stronger, they ..."
1,3,Simpson Strong-Tie 12-Gauge Angle,100001,2.5,l bracket,"Not only do angles make joints stronger, they ..."
2,9,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,100002,3.0,deck over,BEHR Premium Textured DECKOVER is an innovativ...
3,16,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.33,rain shower head,Update your bathroom with the Delta Vero Singl...
4,17,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,100005,2.67,shower only faucet,Update your bathroom with the Delta Vero Singl...


### step2 文本数据预处理

In [16]:
stemmer = SnowballStemmer('english')
#1）每一列中的每条句子 -> 分词 -> 小写化 -> 提取词干
def str_stemmer(s):
    return " ".join([stemmer.stem(word) for word in s.lower().split()])

In [17]:
#2）构造特征1， 看搜索词在产品名称中出现了几次
def str_common_word(str1, str2):
    return sum(int(str2.find(word) >= 0) for word in str1.split())

In [18]:
df_all['search_term'] = df_all['search_term'].map(lambda x: str_stemmer(x))


In [19]:
df_all['product_title'] = df_all['product_title'].map(lambda x:str_stemmer(x))

In [20]:
df_all['product_description'] = df_all['product_description'].map(lambda x:str_stemmer(x))

### step3 进阶版的文本特征

In [23]:
#1) 高级特征1
import Levenshtein
Levenshtein.ratio('hello', 'hello world')

ModuleNotFoundError: No module named 'Levenshtein'

In [None]:
df_all['dist_in_title'] = df_all.apply(lambda x: Levenshtein.ratio(x['search_term'], x['product_title']), axis=1)
df_all['dist_in_desc'] = df_all.apply(lambda x: Levenshtein.ratio(x['search_term'], x['product_description']), axis=1)


In [24]:
#2） 高级特征2  TF-IDF(term frequency - inverse document frequency)
df_all['all_texts'] = df_all['product_title'] + ' . ' + df_all['product_description'] + ' . '

In [25]:
df_all['all_texts'][:2]

0    simpson strong-ti 12-gaug angl . not onli do a...
1    simpson strong-ti 12-gaug angl . not onli do a...
Name: all_texts, dtype: object

In [26]:
df_all.head()

Unnamed: 0,id,product_title,product_uid,relevance,search_term,product_description,all_texts
0,2,simpson strong-ti 12-gaug angl,100001,3.0,angl bracket,"not onli do angl make joint stronger, they als...",simpson strong-ti 12-gaug angl . not onli do a...
1,3,simpson strong-ti 12-gaug angl,100001,2.5,l bracket,"not onli do angl make joint stronger, they als...",simpson strong-ti 12-gaug angl . not onli do a...
2,9,behr premium textur deckov 1-gal. #sc-141 tugb...,100002,3.0,deck over,behr premium textur deckov is an innov solid c...,behr premium textur deckov 1-gal. #sc-141 tugb...
3,16,delta vero 1-handl shower onli faucet trim kit...,100005,2.33,rain shower head,updat your bathroom with the delta vero single...,delta vero 1-handl shower onli faucet trim kit...
4,17,delta vero 1-handl shower onli faucet trim kit...,100005,2.67,shower onli faucet,updat your bathroom with the delta vero single...,delta vero 1-handl shower onli faucet trim kit...


In [27]:
#自己制作一个大的字典
from gensim.utils import tokenize
from gensim.corpora.dictionary import Dictionary
dictionary = Dictionary(list(tokenize(x, errors='ignore')) for x in df_all['all_texts'].values)
print(dictionary) 

Dictionary(221877 unique tokens: ['a', 'against', 'alonehelp', 'also', 'and']...)


In [29]:
# df_all.head()

In [32]:
#类，扫所有的语料，转换为单词的个数
class MyCorpus(object):
    def __iter__(self):
        for x in df_all['all_texts'].values:
            yield dictionary.doc2bow(list(tokenize(x, errors='ignore')))
corpus = MyCorpus()

In [33]:
from gensim.models.tfidfmodel import TfidfModel
tfidf = TfidfModel(corpus)

In [36]:
tfidf[dictionary.doc2bow(list(tokenize('hello, good morning', errors = 'ignore')))]

[(3433, 0.3009749701169266),
 (33767, 0.6893340791165151),
 (35250, 0.6589632726728412)]

In [39]:
from gensim.similarities import MatrixSimilarity

# 先把刚刚那句话包装成一个方法
def to_tfidf(text):
    res = tfidf[dictionary.doc2bow(list(tokenize(text, errors='ignore')))]
    return res

# 然后，我们创造一个cosine similarity的比较方法
def cos_sim(text1, text2):
    tfidf1 = to_tfidf(text1)
    tfidf2 = to_tfidf(text2)
    index = MatrixSimilarity([tfidf1],num_features=len(dictionary))
    sim = index[tfidf2]
    # 本来sim输出是一个array，我们不需要一个array来表示，
    # 所以我们直接cast成一个float
    return float(sim[0])

In [45]:
text1 = 'hello world'
text2 = 'hello from the other side'
cos_sim(text1, text2)

1.0842021724855044e-19

In [47]:
df_all['tfidf_cos_sim_in_title'] = df_all.apply(lambda x: cos_sim(x['search_term'], x['product_title']), axis=1)

In [48]:
df_all['tfidf_cos_sim_in_title'][:5]

0   -3.689349e+19
1    0.000000e+00
2    0.000000e+00
3   -3.689349e+19
4    0.000000e+00
Name: tfidf_cos_sim_in_title, dtype: float64

In [49]:
#3) 高级特征3 word2vec
import nltk
# nltk也是自带一个强大的句子分割器。
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [50]:
tokenizer.tokenize(df_all['all_texts'].values[0])

['simpson strong-ti 12-gaug angl .',
 'not onli do angl make joint stronger, they also provid more consistent, straight corners.',
 'simpson strong-ti offer a wide varieti of angl in various size and thick to handl light-duti job or project where a structur connect is needed.',
 'some can be bent (skewed) to match the project.',
 'for outdoor project or those where moistur is present, use our zmax zinc-coat connectors, which provid extra resist against corros (look for a "z" at the end of the model number).versatil connector for various 90 connect and home repair projectsstrong than angl nail or screw fasten alonehelp ensur joint are consist straight and strongdimensions: 3 in.',
 'x 3 in.',
 'x 1-1/2 in.mad from 12-gaug steelgalvan for extra corros resistanceinstal with 10d common nail or #9 x 1-1/2 in.',
 'strong-driv sd screw .']

In [51]:
sentences = [tokenizer.tokenize(x) for x in df_all['all_texts'].values]

In [52]:
sentences = [y for x in sentences for y in x]

In [53]:
len(sentences)

1998321

In [None]:
from nltk.tokenize import word_tokenize
w2v_corpus = [word_tokenize(x) for x in sentences]

In [None]:
from gensim.models.word2vec import Word2Vec

model = Word2Vec(w2v_corpus, size=128, window=5, min_count=5, workers=4)

In [None]:
model

In [None]:
# 先拿到全部的vocabulary
vocab = model.vocab

# 得到任意text的vector
def get_vector(text):
    # 建立一个全是0的array
    res =np.zeros([128])
    count = 0
    for word in word_tokenize(text):
        if word in vocab:
            res += model[word]
            count += 1
    return res/count     

In [None]:
print(get_vector('life is like a box of chocolate'))

In [None]:
from scipy import spatial
# 这里，我们再玩儿个新的方法，用scipy的spatial

def w2v_cos_sim(text1, text2):
    try:
        w2v1 = get_vector(text1)
        w2v2 = get_vector(text2)
        sim = 1 - spatial.distance.cosine(w2v1, w2v2)
        return float(sim)
    except:
        return float(0)
# 这里加个try exception，以防我们得到的vector是个[0,0,0,...]

In [None]:
w2v_cos_sim('hello world', 'hello from the other side')

In [None]:
df_all['w2v_cos_sim_in_title'] = df_all.apply(lambda x: w2v_cos_sim(x['search_term'], x['product_title']), axis=1)
df_all['w2v_cos_sim_in_desc'] = df_all.apply(lambda x: w2v_cos_sim(x['search_term'], x['product_description']), axis=1)

In [None]:
df_all.head(5)