In [29]:
import cfg
import pandas as pd
import numpy as np
import scipy.sparse as sp
import re
import pickle
#import cPickle as pickle

from bs4 import BeautifulSoup
from nltk.stem.porter import *
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import wordnet

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import pairwise_distances

#from tsne import bh_sne     # use sklearn.manifold.TSNE(method='barnes_hut') instead
from sklearn.manifold import TSNE
#from gensim.models import Word2Vec
from gensim.models import KeyedVectors

import logging

In [2]:
# Configure the root logger to emit timestamped INFO-level messages,
# then announce which part of the pipeline this notebook implements.
logging.basicConfig(format='[%(asctime)s] %(message)s', level=logging.INFO)
logging.info("Feature extractor (Mikhail's part)")
logging.info('** see cfg.py for path settings **')


[2021-02-04 20:40:14,106] Feature extractor (Mikhail's part)
[2021-02-04 20:40:14,138] ** see cfg.py for path settings **


# load data

#  Lemmatizing part

In [11]:
# Download the WordNet corpus before lemmatizing and tokenizing;
# the WordNet lemmatizer created in the next cell relies on it.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ming.lee\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [4]:
# Create the shared tokenizer and lemmatizer once; text_preprocessor reads
# both as module-level globals.
logging.info('Lemmatizing')
toker = TreebankWordTokenizer()
lemmer = wordnet.WordNetLemmatizer()

[2021-02-04 20:41:22,816] Lemmatizing


In [9]:
def text_preprocessor(x):
    '''
    Clean and lemmatize a single text value.

    The value is converted with ``str`` and lower-cased, the compounds
    "blu-ray" and "wi-fi" are collapsed to "bluray" and "wifi", slashes and
    hyphens become spaces and double quotes are removed. The cleaned text is
    split with the module-level ``toker`` and every token is lemmatized with
    the module-level ``lemmer``.

    Returns the lemmatized tokens joined by single spaces.
    '''
    lowered = str(x).lower()
    # Collapse these compounds first so the generic hyphen replacement below
    # does not split them into two tokens.
    lowered = lowered.replace('blu-ray', 'bluray').replace('wi-fi', 'wifi')
    x_cleaned = lowered.replace('/', ' ').replace('-', ' ').replace('"', '')
    tokens = toker.tokenize(x_cleaned)
    return " ".join(lemmer.lemmatize(z) for z in tokens)


In [12]:
# Lemmatize description, title and query for both frames.
# The order of this list fixes the order in which the new columns are created
# (desc_stem, title_stem, query_stem), which later positional lookups rely on.
stem_column_map = [
    ('product_description', 'desc_stem'),
    ('product_title', 'title_stem'),
    ('query', 'query_stem'),
]
for source_col, stem_col in stem_column_map:
    for frame in (train_df, test_df):
        frame[stem_col] = frame[source_col].apply(text_preprocessor)

In [14]:
# Spot-check ten random training rows, including the freshly added *_stem columns.
train_df.sample(10)

Unnamed: 0,id,query,product_title,product_description,median_relevance,relevance_variance,desc_stem,title_stem,query_stem
2189,6998,leather mens briefcase,McKlein Ladies Leather Briefcase - Black,Protect and transport your laptop in this styl...,3,0.894,protect and transport your laptop in this styl...,mcklein lady leather briefcase black,leather men briefcase
5023,16176,blender,Vitamix Certified Reconditioned Program Blende...,details\nThe Keurig Cappuccino Maker - R500 Ri...,4,0.0,detail the keurig cappuccino maker r500 rivo s...,vitamix certified reconditioned program blende...,blender
7042,22754,hair accessories,Kate Marie 'Ada' Rhinestone Crown Tiara Hair Pin,This sparkling Rhinestone Crown Tiara Hair Pin...,4,0.0,this sparkling rhinestone crown tiara hair pin...,kate marie 'ada ' rhinestone crown tiara hair pin,hair accessory
9534,30640,pittsburgh pirates,Pittsburgh Pirates Girls 3pk Body Suit,,3,0.471,,pittsburgh pirate girl 3pk body suit,pittsburgh pirate
2504,7971,three wheeled bike,Sta-Tru 27 x 1-1/4 Bolt-on Alloy Flat ted Axle...,"27"" x 1 1/4"" wheels assembled in the USA. Fron...",1,0.0,27 x 1 1 4 wheel assembled in the usa. front s...,sta tru 27 x 1 1 4 bolt on alloy flat ted axle...,three wheeled bike
1313,4245,queen comforter sets,Ara 8 Piece Comforter Set,The Ara 8 Piece Comforter Set will make your b...,4,0.943,the ara 8 piece comforter set will make your b...,ara 8 piece comforter set,queen comforter set
6952,22455,baseball photo frame,Baseball Wood Picture Frame,Personalized Baseball Frame - Personalized Spo...,3,0.471,personalized baseball frame personalized sport...,baseball wood picture frame,baseball photo frame
4038,13052,chocolate molds,Freshware Brown 15-cavity Spiral Cone Chocolat...,The spiral cone shape of this 100-percent FDA ...,4,0.0,the spiral cone shape of this 100 percent fda ...,freshware brown 15 cavity spiral cone chocolat...,chocolate mold
5370,17292,assassinss creed,Assassins Creed 4 (Xbox 360) - Pre-Owned,It is 1715. Pirates rule the Caribbean and hav...,3,0.471,it is 1715. pirate rule the caribbean and have...,assassin creed 4 ( xbox 360 ) pre owned,assassin creed
9135,29385,ps3 wireless controller,"Mad Catz Wireless Controller, Red (PS3)",The Mad Catz Wireless Controller features mult...,4,0.0,the mad catz wireless controller feature multi...,"mad catz wireless controller , red ( ps3 )",ps3 wireless controller


# Similarities

In [15]:
def calc_cosine_dist(text_a, text_b, vect):
    """
    Cosine distance between two texts in the TF-IDF space of ``vect``.

    Each text is transformed separately with the already-fitted vectorizer
    ``vect``; the single entry of the resulting 1x1 distance matrix is returned.
    """
    vec_a = vect.transform([text_a])
    vec_b = vect.transform([text_b])
    distance_matrix = pairwise_distances(vec_a, vec_b, metric='cosine')
    return distance_matrix[0][0]

def calc_set_intersection(text_a, text_b):
    """
    Fraction of the distinct words of ``text_a`` that also occur in ``text_b``.

    Both texts are split on whitespace and compared as sets, so repeated
    words count once.

    Returns a float in [0, 1]. When ``text_a`` contains no words (empty or
    whitespace-only) the ratio is undefined; 0.0 is returned instead of
    raising ZeroDivisionError.
    """
    words_a = set(text_a.split())
    if not words_a:
        # Nothing to match against: avoid dividing by zero.
        return 0.0
    words_b = set(text_b.split())
    return float(len(words_a & words_b)) / len(words_a)


In [16]:
# TF-IDF vectorizers used by the similarity features: one per text space
# (raw query/title, stemmed query/title, stemmed query/description).
logging.info('\t fit vectorizers')
tfv_orig, tfv_stem, tfv_desc = (
    TfidfVectorizer(ngram_range=(1,2), min_df=2) for _ in range(3)
)


def _fit_corpus(query_col, doc_col):
    """Concatenate train+test values of the query column, then of the document column."""
    return [
        text
        for col in (query_col, doc_col)
        for frame in (train_df, test_df)
        for text in frame[col].values
    ]


tfv_orig.fit(_fit_corpus('query', 'product_title'))
tfv_stem.fit(_fit_corpus('query_stem', 'title_stem'))
# Kept as the last expression so the cell still displays the fitted vectorizer.
tfv_desc.fit(_fit_corpus('query_stem', 'desc_stem'))


[2021-02-04 20:51:56,509] 	 fit vectorizers


TfidfVectorizer(min_df=2, ngram_range=(1, 2))

In [55]:
"""
For both training and testing data.

find cosine distance between TF-IDF representation:
- query and product title
- query stemmed and product title stemmed
- query stemmed and product description stemmed

find ratio of matching words
- query stemmed and product title stemmed
"""

'\nfind cosine distance between TF-IDF representation:\n- query and product title\n- query stemmed and product title stemmed\n- query stemmed and product description stemmed\n\nfind ratio of matching words\n- query stemmed and product title stemmed\n'

In [18]:
# Similarity features for the training frame: three TF-IDF cosine distances
# and the word-overlap ratio, one value per row.
logging.info('\t process train')
cosine_orig = [
    calc_cosine_dist(query, title, tfv_orig)
    for query, title in zip(train_df['query'], train_df['product_title'])
]
cosine_stem = [
    calc_cosine_dist(query, title, tfv_stem)
    for query, title in zip(train_df['query_stem'], train_df['title_stem'])
]
cosine_desc = [
    calc_cosine_dist(query, desc, tfv_desc)
    for query, desc in zip(train_df['query_stem'], train_df['desc_stem'])
]
set_stem = [
    calc_set_intersection(query, title)
    for query, title in zip(train_df['query_stem'], train_df['title_stem'])
]
train_df['cosine_qt_orig'] = cosine_orig
train_df['cosine_qt_stem'] = cosine_stem
train_df['cosine_qd_stem'] = cosine_desc
train_df['set_qt_stem'] = set_stem

[2021-02-04 20:53:05,539] 	 process train


In [19]:
# Similarity features for the test frame: three TF-IDF cosine distances
# and the word-overlap ratio, one value per row.
logging.info('\t process test')
cosine_orig = [
    calc_cosine_dist(query, title, tfv_orig)
    for query, title in zip(test_df['query'], test_df['product_title'])
]
cosine_stem = [
    calc_cosine_dist(query, title, tfv_stem)
    for query, title in zip(test_df['query_stem'], test_df['title_stem'])
]
cosine_desc = [
    calc_cosine_dist(query, desc, tfv_desc)
    for query, desc in zip(test_df['query_stem'], test_df['desc_stem'])
]
set_stem = [
    calc_set_intersection(query, title)
    for query, title in zip(test_df['query_stem'], test_df['title_stem'])
]
test_df['cosine_qt_orig'] = cosine_orig
test_df['cosine_qt_stem'] = cosine_stem
test_df['cosine_qd_stem'] = cosine_desc
test_df['set_qt_stem'] = set_stem

[2021-02-04 20:55:46,171] 	 process test


In [69]:
# Spot-check ten random rows of the first 13 columns (positional slice);
# at this point the slice ends with the four similarity features added above.
train_df.iloc[:, 0:13].sample(10)

Unnamed: 0,id,query,product_title,product_description,median_relevance,relevance_variance,desc_stem,title_stem,query_stem,cosine_qt_orig,cosine_qt_stem,cosine_qd_stem,set_qt_stem
9046,29079,white plain dinner set,50 PLAIN SOLID COLORS Dinner Hand towel NAPKIN...,,1,0.0,,50 plain solid color dinner hand towel napkin ...,white plain dinner set,0.819933,0.815318,1.0,0.5
3519,11290,electric blanket,Serta Serta Luxe Plush Micro Fleece Electric B...,"Patented, UL approved, Safe & Warm&trade, low ...",4,0.8,"patented , ul approved , safe & warm & trade ,...",serta serta luxe plush micro fleece electric b...,electric blanket,0.697337,0.697585,0.898917,1.0
3036,9721,an extremely goofy movie,AN EXTREMELY GOOFY MOVIE (VHS VIDEO),,4,0.471,,an extremely goofy movie ( vhs video ),an extremely goofy movie,0.282783,0.282818,1.0,1.0
56,172,duffle bag,Every Day Carry Tactical Large Heavy Duty Carr...,The Every Day Carry duffle bag is constructed ...,4,0.49,the every day carry duffle bag is constructed ...,every day carry tactical large heavy duty carr...,duffle bag,0.759637,0.761344,0.893725,1.0
1241,4030,golf clubs,WILSON Rookie Tour Junior Boys 5 Piece Complet...,The Wilson Rookie Tour Junior Golf Set is grea...,4,0.0,the wilson rookie tour junior golf set is grea...,wilson rookie tour junior boy 5 piece complete...,golf club,0.921274,0.698436,0.972894,1.0
2986,9572,toddler sandals,"Toddler CROCS 'Cayman' Sandal, Size 2 M - Blue",Lightweight clogs are shaped from durable Cros...,4,0.471,lightweight clog are shaped from durable crosl...,"toddler crocs 'cayman ' sandal , size 2 m blue",toddler sandal,0.883858,0.747308,1.0,1.0
1197,3883,rachel ray cookware,Rachael Ray Green Stoneware 4-pc. 10-oz. Bubbl...,Excite your dinner guests by serving your favo...,2,0.745,excite your dinner guest by serving your favor...,rachael ray green stoneware 4 pc. 10 oz. bubbl...,rachel ray cookware,0.949513,0.94513,0.986241,0.333333
1606,5158,hair accessories,Crawford Corner Shop Black and White Houndstoo...,,3,0.471,,crawford corner shop black and white houndstoo...,hair accessory,0.894953,0.89309,1.0,0.5
3826,12340,levis 505,Signature by Levi Strauss & Co. Men's Skinny J...,These Signature by Levi Strauss & Co. men's sk...,2,0.8,these signature by levi strauss & co. men 's s...,signature by levi strauss & co. men 's skinny ...,levi 505,1.0,0.894892,0.939795,0.5
9889,31772,victorias secret lace gown,Queen Satin Long Gown 7801X iCollection White ...,,2,0.943,,queen satin long gown 7801x icollection white ...,victoria secret lace gown,0.928843,0.926444,1.0,0.25


# w2v part

In [None]:
"""
Find distance between the average word2vec vectors.
"""

In [33]:
def calc_w2v_sim(row):
    '''
    Word2vec similarity features between a row's stemmed query and title.

    ``row`` must provide 'query_stem' and 'title_stem' strings. Both are
    lower-cased and split on whitespace; only words known to the module-level
    ``embedder`` are kept.

    Returns a tuple ``(w2v_sim, w2v_vdiff_dist, vector_diff)``:
      - w2v_sim: ``embedder.n_similarity`` of the two in-vocabulary word lists
      - w2v_vdiff_dist: Euclidean norm of ``vector_diff``
      - vector_diff: mean query word vector minus mean title word vector

    If either side has no in-vocabulary word, ``(-1, -1, zero vector)`` is
    returned as a sentinel.
    '''
    # Take the dimensionality from the model instead of hard-coding 300.
    dim = embedder.vector_size

    # `word in embedder` works on gensim 3.x and 4.x alike; the `.vocab`
    # attribute used previously was removed in gensim 4.
    a2 = [x for x in row['query_stem'].lower().split() if x in embedder]
    b2 = [x for x in row['title_stem'].lower().split() if x in embedder]
    if not a2 or not b2:
        return (-1, -1, np.zeros(dim))

    w2v_sim = embedder.n_similarity(a2, b2)

    def _mean_vector(words):
        # Accumulate into a float64 buffer, then average.
        total = np.zeros(dim)
        for w in words:
            total += embedder[w]
        return total / len(words)

    vector_diff = _mean_vector(a2) - _mean_vector(b2)

    w2v_vdiff_dist = np.sqrt(np.sum(vector_diff**2))
    return (w2v_sim, w2v_vdiff_dist, vector_diff)


[2021-02-05 09:59:09,387] 	 load pretrained model from ../tools-w2v/GoogleNews-vectors-negative300.bin
[2021-02-05 09:59:09,388] loading projection weights from ../tools-w2v/GoogleNews-vectors-negative300.bin
[2021-02-05 09:59:47,403] loaded (3000000, 300) matrix from ../tools-w2v/GoogleNews-vectors-negative300.bin


In [45]:
# Load the pretrained word2vec vectors (binary word2vec format) from the path
# configured in cfg; calc_w2v_sim reads the result via the global `embedder`.
logging.info('\t load pretrained model from {}'.format(cfg.path_w2v_pretrained_model))
embedder = KeyedVectors.load_word2vec_format(cfg.path_w2v_pretrained_model, binary=True)

[2021-02-05 19:42:15,069] 	 load pretrained model from ../tools-w2v/GoogleNews-vectors-negative300.bin
[2021-02-05 19:42:15,073] loading projection weights from ../tools-w2v/GoogleNews-vectors-negative300.bin
[2021-02-05 19:43:05,672] loaded (3000000, 300) matrix from ../tools-w2v/GoogleNews-vectors-negative300.bin


In [46]:
# word2vec features for the training frame: one (sim, dist, vector_diff)
# triple per row, split into two scalar columns and a dense matrix.
logging.info('\t process train')
w2v_results = [calc_w2v_sim(row) for _, row in train_df.iterrows()]
sim_list = [result[0] for result in w2v_results]
dist_list = [result[1] for result in w2v_results]
X_w2v = [result[2] for result in w2v_results]
X_w2v_tr = np.array(X_w2v)
train_df['w2v_sim'] = np.array(sim_list)
train_df['w2v_dist'] = np.array(dist_list)


[2021-02-05 19:43:15,972] 	 process train


In [47]:
# word2vec features for the test frame: one (sim, dist, vector_diff)
# triple per row, split into two scalar columns and a dense matrix.
logging.info('\t process test')
w2v_results = [calc_w2v_sim(row) for _, row in test_df.iterrows()]
sim_list = [result[0] for result in w2v_results]
dist_list = [result[1] for result in w2v_results]
X_w2v = [result[2] for result in w2v_results]
X_w2v_te = np.array(X_w2v)
test_df['w2v_sim'] = np.array(sim_list)
test_df['w2v_dist'] = np.array(dist_list)


[2021-02-05 19:43:21,497] 	 process test


In [74]:
# Peek at the first 20 components of the vector difference stored for the
# second training row.
X_w2v_tr[1][0:20]

array([ 0.05483704,  0.03566895,  0.01350098, -0.03406982,  0.00502218,
       -0.04147949,  0.01893565, -0.04858398, -0.07416992,  0.0942805 ,
        0.05486654, -0.10199382,  0.0446106 , -0.02469076,  0.09288534,
       -0.04185384,  0.04448751,  0.02110596, -0.02271678, -0.10723877])

In [80]:
# Spot-check ten random rows; positions 13 and 14 are the w2v_sim / w2v_dist
# columns at this point in the notebook.
# NOTE(review): positional indices break if columns are added or reordered upstream.
train_df.iloc[:, [1,2,3,4,5,6,7,8,9, 13,14]].sample(10)

Unnamed: 0,query,product_title,product_description,median_relevance,relevance_variance,desc_stem,title_stem,query_stem,cosine_qt_orig,w2v_sim,w2v_dist
3454,teenage mutant ninja turtles,Teenage Mutant Ninja Turtles Newborn Boys' Bod...,Teenage Mutant Ninja Turtles Newborn Boys' Bod...,2,0.943,teenage mutant ninja turtle newborn boy ' body...,teenage mutant ninja turtle newborn boy ' body...,teenage mutant ninja turtle,0.58527,0.822669,1.225705
6091,table clock,Cas sem Aluminum Clock Table,"This riveted, vintage aluminum Cas sem clock t...",3,1.166,"this riveted , vintage aluminum ca sem clock t...",ca sem aluminum clock table,table clock,0.672844,0.628914,1.744994
7511,watch women fossil,Fossil Women's Sculptor Watch Quartz Mineral C...,"Fossil Women's Sculptor Stainless steel case, ...",4,0.0,fossil woman 's sculptor stainless steel case ...,fossil woman 's sculptor watch quartz mineral ...,watch woman fossil,0.724925,0.713991,1.386204
92,san francisco 49ers,San Francisco 49ers Official NFL Infant One Si...,Keep your baby warm and dry in this hooded tow...,4,0.471,keep your baby warm and dry in this hooded tow...,san francisco 49ers official nfl infant one si...,san francisco 49ers,0.537895,0.460351,1.841848
7473,cast iron skillet,Griswold #6 Chrome Cast Iron Skillet Pn#699 Sm...,,4,0.0,,griswold # 6 chrome cast iron skillet pn # 699...,cast iron skillet,0.519278,0.698564,1.825989
3751,pencil skirt,Women's Jacquard Pencil Skirt Black Merona,,4,0.8,,woman 's jacquard pencil skirt black merona,pencil skirt,0.486526,0.816115,1.329119
1480,nike womens,Womens Nike Dri-Fit Epic Run Capris - Hot Pink...,Don't let your running clothes slow you down. ...,4,0.471,do n't let your running clothes slow you down....,woman nike dri fit epic run capri hot pink hot...,nike woman,0.8571,0.560681,1.798385
1430,memory foam pillow,Dough Supportive Low Loft Firm Memory Foam Pillow,The Dough pillow is constructed with Memory Fo...,4,0.0,the dough pillow is constructed with memory fo...,dough supportive low loft firm memory foam pillow,memory foam pillow,0.638711,0.780288,1.45771
1968,coffee for nespresso,My-Cap Capsule Holder for Nespresso VirtuoLine...,,2,0.0,,my cap capsule holder for nespresso virtuoline...,coffee for nespresso,0.750804,0.200269,2.155609
6755,oakley radar,Oakley Radar EV Pitch with Grey Ink Frame and ...,Oakley items are available for US sales only. ...,4,0.0,oakley item are available for u sale only. we ...,oakley radar ev pitch with grey ink frame and ...,oakley radar,0.6999,0.417584,2.995795


In [48]:
# Persist the train/test word2vec difference matrices as one pickled tuple.
logging.info('\t dump w2v-features')
# A context manager guarantees the file is flushed and closed; the previous
# bare open(...) left the handle to the garbage collector.
with open(cfg.path_processed + 'X_w2v.pickled', 'wb') as w2v_file:
    pickle.dump((X_w2v_tr, X_w2v_te), w2v_file, protocol=2)

[2021-02-05 19:43:28,787] 	 dump w2v-features


# tSNE features

In [None]:
"""
2-D t-SNE embedding of TF-IDF features (reduced with TruncatedSVD, then standardized)
for title
for title-query
for description
"""

In [49]:
# t-SNE features [1/3]: 2-D embedding of the stemmed titles (train + test together).
logging.info('tSNE part')
# '/' instead of '\': '\3' is an octal escape and logged a control character.
logging.info('\t [1/3] process title')
vect = TfidfVectorizer(ngram_range=(1,2), min_df=3)
X_tf = vect.fit_transform(list(train_df['title_stem'].values) + list(test_df['title_stem'].values))
# Fixed seeds make the SVD / t-SNE output reproducible across kernel restarts.
svd = TruncatedSVD(n_components=200, random_state=0)
X_svd = svd.fit_transform(X_tf)
X_scaled = StandardScaler().fit_transform(X_svd)
# sklearn's Barnes-Hut TSNE replaces the former bh_sne(X_scaled) call.
X_tsne = TSNE(method='barnes_hut', random_state=0).fit_transform(X_scaled)

# Rows were stacked train first, then test: split them back in that order.
n_train = len(train_df)
train_df['tsne_title_1'] = X_tsne[:n_train, 0]
train_df['tsne_title_2'] = X_tsne[:n_train, 1]
test_df[ 'tsne_title_1'] = X_tsne[n_train:, 0]
test_df[ 'tsne_title_2'] = X_tsne[n_train:, 1]


[2021-02-05 19:43:29,468] tSNE part
[2021-02-05 19:43:29,470] 	 [1] process title


In [81]:
# Spot-check ten random rows; positions 15 and 16 are the tsne_title_1 /
# tsne_title_2 columns at this point in the notebook.
# NOTE(review): positional indices break if columns are added or reordered upstream.
train_df.iloc[:, [1,2,3,4,5,6,7,8,9, 15,16]].sample(10)

Unnamed: 0,query,product_title,product_description,median_relevance,relevance_variance,desc_stem,title_stem,query_stem,cosine_qt_orig,tsne_title_1,tsne_title_2
131,aqua shoes,Women's Skechers GOrun Ultra 2 Hot Pink/Aqua,The acclaimed Skechers GOrun Ultra gets even b...,2,0.943,the acclaimed skechers gorun ultra get even be...,woman 's skechers gorun ultra 2 hot pink aqua,aqua shoe,0.855906,39.220203,33.255306
162,wreck it ralph,Girl's Wreck-It Ralph Time Teacher Watch,,4,0.0,,girl 's wreck it ralph time teacher watch,wreck it ralph,0.508,-15.271293,-81.284805
265,fuji bike shirt,Asics 2014 Men's Fuji Long Sleeve 1/2 Zip Runn...,When you decide to brave less than ideal weath...,2,0.943,when you decide to brave le than ideal weather...,asics 2014 men 's fuji long sleeve 1 2 zip run...,fuji bike shirt,0.899232,12.687432,44.26762
7789,rubbermaid turntables,"Rubbermaid FG1A7109 Twin Turntable, No Skid",,4,0.0,,"rubbermaid fg1a7109 twin turntable , no skid",rubbermaid turntable,0.873322,10.550785,-29.277102
7040,glitter vials,***FAST TRACK*** Elmer's 3D Glitter Glue Paint...,Elmer's 3D Glitter Glue Paint Pens add a new d...,2,0.471,elmer 's 3d glitter glue paint pen add a new d...,***fast track*** elmer 's 3d glitter glue pain...,glitter vial,0.871965,1.07213,1.404547
1820,candle chandelier,CHIC DAZZLING SHABBY DISTRESS CHIC BEADED Chan...,,4,0.0,,chic dazzling shabby distress chic beaded chan...,candle chandelier,0.80671,-20.190968,-31.843819
2993,sports bra,Champion Double Dry Seamless Racerback Sports Bra,This updated sports bra is seamless all over f...,4,0.0,this updated sport bra is seamless all over fo...,champion double dry seamless racerback sport bra,sport bra,0.625713,26.961023,30.07613
8963,multiple phone charger,Clear Gear SW-004 Wally - Dual USB Plush DC Wa...,FeaturesThe Schatzii WALLY - Dual USB DC Wall ...,2,0.8,featuresthe schatzii wally dual usb dc wall ch...,clear gear sw 004 wally dual usb plush dc wall...,multiple phone charger,0.949456,8.344467,-36.728703
6972,polaroid camera,Polaroid Id610-blu 14.0 Megapixel All-weather ...,"All-weather digital camcorder Fog proof, sand ...",2,0.943,"all weather digital camcorder fog proof , sand...",polaroid id610 blu 14.0 megapixel all weather ...,polaroid camera,0.882835,3.093921,-7.943844
5733,polaroid camera,"Unused Polaroid SX-70, 680 Or 690 Instant Came...",,2,0.0,,"unused polaroid sx 70 , 680 or 690 instant cam...",polaroid camera,0.82232,2.883445,-31.514324


In [50]:
# t-SNE features [2/3]: 2-D embedding of title and query TF-IDF features side by side.
# '/' instead of '\': '\3' is an octal escape and logged a control character.
logging.info('\t [2/3] process title-query')
vect = TfidfVectorizer(ngram_range=(1,2), min_df=3)
X_title = vect.fit_transform(list(train_df['title_stem'].values) + list(test_df['title_stem'].values))
# fit_transform refits `vect` from scratch, so the query block gets its own
# vocabulary; X_title above was already materialized and is unaffected.
X_query = vect.fit_transform(list(train_df['query_stem'].values) + list(test_df['query_stem'].values))
X_tf = sp.hstack([X_title, X_query]).tocsr()
# Fixed seeds make the SVD / t-SNE output reproducible across kernel restarts.
svd = TruncatedSVD(n_components=200, random_state=0)
X_svd = svd.fit_transform(X_tf)
X_scaled = StandardScaler().fit_transform(X_svd)
# sklearn's Barnes-Hut TSNE replaces the former bh_sne(X_scaled) call.
X_tsne = TSNE(method='barnes_hut', random_state=0).fit_transform(X_scaled)

# Rows were stacked train first, then test: split them back in that order.
n_train = len(train_df)
train_df['tsne_qt_1'] = X_tsne[:n_train, 0]
train_df['tsne_qt_2'] = X_tsne[:n_train, 1]
test_df[ 'tsne_qt_1'] = X_tsne[n_train:, 0]
test_df[ 'tsne_qt_2'] = X_tsne[n_train:, 1]


[2021-02-05 19:55:59,733] 	 [2] process title-query


In [82]:
# Spot-check ten random rows; positions 17 and 18 are the tsne_qt_1 /
# tsne_qt_2 columns at this point in the notebook.
# NOTE(review): positional indices break if columns are added or reordered upstream.
train_df.iloc[:, [1,2,3,4,5,6,7,8,9, 17,18]].sample(10)

Unnamed: 0,query,product_title,product_description,median_relevance,relevance_variance,desc_stem,title_stem,query_stem,cosine_qt_orig,tsne_qt_1,tsne_qt_2
4866,steel necklace,Gold Plated Stainless Steel Necklace With Pendant,ELYA's signature hangs from this cable chain n...,4,0.471,elya 's signature hang from this cable chain n...,gold plated stainless steel necklace with pendant,steel necklace,0.629464,-27.431309,45.992661
1252,levis,511 Slim Fit Line 8 Pants - Graphite Melange,The Line 8 collection is a carefully designed ...,2,0.632,the line 8 collection is a carefully designed ...,511 slim fit line 8 pant graphite melange,levi,1.0,-14.331005,73.055389
6591,silver necklace,Italian Sterling Silver Box Chain Necklace,Italian Sterling Silver Box Chain Necklace. Fi...,4,0.0,italian sterling silver box chain necklace. fi...,italian sterling silver box chain necklace,silver necklace,0.769718,30.543852,-2.663028
2435,patio umbrellas,Fiberbuilt Umbrellas 9 ft. Patio Umbrella in F...,About Fiberbuilt Umbrellas FiberBuilt Umbrella...,4,0.0,about fiberbuilt umbrella fiberbuilt umbrella ...,fiberbuilt umbrella 9 ft. patio umbrella in fo...,patio umbrella,0.803812,17.178013,-14.857862
5331,wii gamepad,PDP - NERF Armor Case for Nintendo Wii U GameP...,Protect your Wii U GamePad from accidental bum...,2,0.471,protect your wii u gamepad from accidental bum...,pdp nerf armor case for nintendo wii u gamepad...,wii gamepad,0.649157,-36.174004,-28.762239
6613,electric blanket,Perfect Fit Microfleece Electric Blanket Beige...,,4,0.0,,perfect fit microfleece electric blanket beige...,electric blanket,0.604955,45.447403,-61.349617
4041,electric griddle,Oster DuraCeramic Griddle,,4,0.471,,oster duraceramic griddle,electric griddle,0.774674,-38.001877,-57.483238
1983,duffle bag,CalPak Silver Lake Solid 22-inch Carry-on Duff...,This large carry-on 22-inch duffel bag feature...,4,0.0,this large carry on 22 inch duffel bag feature...,calpak silver lake solid 22 inch carry on duff...,duffle bag,0.92962,26.534119,-72.66864
9170,mac book air cover,NOOK Color Insutriell Cover - Black,Protect your Nook Color or Nook Tablet with th...,2,0.471,protect your nook color or nook tablet with th...,nook color insutriell cover black,mac book air cover,0.905746,-27.272736,-59.423183
7128,coffee for nespresso,30 Pack Mixpresso Nespresso Compatible Coffee ...,Several rich flavors made with coffee beans se...,3,0.471,several rich flavor made with coffee bean sele...,30 pack mixpresso nespresso compatible coffee ...,coffee for nespresso,0.856481,48.428074,0.174882


In [51]:
# t-SNE features [3/3]: 2-D embedding of the stemmed descriptions (train + test together).
# '/' instead of '\': '\3' is an octal escape and logged a control character.
logging.info('\t [3/3] process description')
vect = TfidfVectorizer(ngram_range=(1,2), min_df=3)
X_desc = vect.fit_transform(list(train_df['desc_stem'].values) + list(test_df['desc_stem'].values))
X_tf = X_desc
# Fixed seeds make the SVD / t-SNE output reproducible across kernel restarts.
svd = TruncatedSVD(n_components=200, random_state=0)
X_svd = svd.fit_transform(X_tf)
X_scaled = StandardScaler().fit_transform(X_svd)
# sklearn's Barnes-Hut TSNE replaces the former bh_sne(X_scaled) call.
X_tsne = TSNE(method='barnes_hut', random_state=0).fit_transform(X_scaled)

# Rows were stacked train first, then test: split them back in that order.
n_train = len(train_df)
train_df['tsne_desc_1'] = X_tsne[:n_train, 0]
train_df['tsne_desc_2'] = X_tsne[:n_train, 1]
test_df[ 'tsne_desc_1'] = X_tsne[n_train:, 0]
test_df[ 'tsne_desc_2'] = X_tsne[n_train:, 1]


[2021-02-05 20:02:40,118] 	 [3] process description


In [83]:
# Spot-check ten random rows; positions 19 and 20 are the tsne_desc_1 /
# tsne_desc_2 columns at this point in the notebook.
# NOTE(review): positional indices break if columns are added or reordered upstream.
train_df.iloc[:, [1,2,3,4,5,6,7,8,9, 19,20]].sample(10)

Unnamed: 0,query,product_title,product_description,median_relevance,relevance_variance,desc_stem,title_stem,query_stem,cosine_qt_orig,tsne_desc_1,tsne_desc_2
4105,screen protector samsung,Screen Protector for Samsung Galaxy Note N7000,This is a screen protector for Samsung© Galaxy...,4,0.0,this is a screen protector for samsung© galaxy...,screen protector for samsung galaxy note n7000,screen protector samsung,0.479583,46.491837,21.672194
7460,road bicycle,Takara Men's 700C 1-Speed Sugiyama Takara Road...,,4,0.0,,takara men 's 700c 1 speed sugiyama takara roa...,road bicycle,0.881682,-5.200672,0.2195
8842,drawer organizer,"Safco Drawer Organizer, Mesh, Black",Low-profile organizer with a variety of bin si...,4,0.0,low profile organizer with a variety of bin si...,"safco drawer organizer , mesh , black",drawer organizer,0.558649,9.877446,-48.047688
9455,speck iphone 5 case,Speck CandyShell Grip 2-Layers Hard Shell Gami...,,4,0.471,,speck candyshell grip 2 layer hard shell gamin...,speck iphone 5 case,0.820747,7.847014,9.84188
5192,barbie,Barbie in the Nutcracker (DVD),ITEM#: 13041370\nAmerica's favorite pint-sized...,1,0.943,item # : 13041370 america 's favorite pint siz...,barbie in the nutcracker ( dvd ),barbie,0.691583,-49.469311,-16.825737
4990,dress shirts,Coogi Luxe Men's Light Purple Button-down Dres...,Crafted with the finest materials and unparall...,4,0.433,crafted with the finest material and unparalle...,coogi luxe men 's light purple button down dre...,dress shirt,0.937467,-42.411716,-11.670886
5801,converse high tops,Converse CT Ox Youth Girls Size 3 Bronze Texti...,"The Converse brand of athletic shoes, which no...",2,0.471,"the converse brand of athletic shoe , which no...",converse ct ox youth girl size 3 bronze textil...,converse high top,0.941901,-24.418301,40.196121
8631,cocoa butter,"Badger Cocoa Butter Lip Balm, Vanilla Bean, .2...",,2,0.471,,"badger cocoa butter lip balm , vanilla bean , ...",cocoa butter,0.66543,2.985541,14.526398
6812,road bicycle,Schwinn Men's Drop Bar Road Volare 1300 Road Bike,,4,0.471,,schwinn men 's drop bar road volare 1300 road ...,road bicycle,0.72591,-6.019669,-6.949962
7936,candle lantern,Threshold Large Lantern with LED Candle,,4,0.471,,threshold large lantern with led candle,candle lantern,0.74012,-6.924657,6.732132


In [52]:
# Persist both enriched frames as pickles in the processed-data directory.
logging.info('\t dump results')
for frame, file_name in ((train_df, 'train_df'), (test_df, 'test_df')):
    frame.to_pickle(cfg.path_processed + file_name)


[2021-02-05 20:16:02,529] 	 dump results


# X_additional

In [53]:
# Assemble the dense "additional features" matrices from the engineered columns.
logging.info("Dump additional features")
# Column order defines the feature order of the saved matrices.
# NOTE(review): tsne_desc_1 / tsne_desc_2 are computed above but not listed here;
# confirm the omission is intentional.
feat_list = [
    'w2v_sim', 'w2v_dist',
    'tsne_title_1', 'tsne_title_2',
    'tsne_qt_1', 'tsne_qt_2',
    'cosine_qt_orig', 'cosine_qt_stem', 'cosine_qd_stem',
    'set_qt_stem',
]
X_additional_tr = train_df.loc[:, feat_list].values
X_additional_te = test_df.loc[:, feat_list].values


[2021-02-05 20:16:02,759] Dump additional features


In [54]:
# Write the additional-feature matrices as plain-text files next to the other
# processed artifacts.
for file_name, matrix in (
    ('X_additional_tr.txt', X_additional_tr),
    ('X_additional_te.txt', X_additional_te),
):
    np.savetxt(cfg.path_processed + file_name, matrix)
