In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn import pipeline, grid_search
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import TruncatedSVD
#from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, make_scorer
#from nltk.metrics import edit_distance
from nltk.stem.porter import *
stemmer = PorterStemmer()
#from nltk.stem.snowball import SnowballStemmer #0.003 improvement but takes twice as long as PorterStemmer
#stemmer = SnowballStemmer('english')
import re
import random
random.seed(2016)

In [6]:
def fmean_squared_error(ground_truth, predictions):
    """Return the root-mean-squared error between targets and predictions.

    Computed as the square root of ``mean_squared_error`` applied to the
    same two arguments, in the same order.
    """
    return mean_squared_error(ground_truth, predictions) ** 0.5


# Scorer built from the RMSE function. greater_is_better=False marks it as a
# loss, so scikit-learn reports it negated (model selection maximises scores).
RMSE = make_scorer(fmean_squared_error, greater_is_better=False)

In [7]:
class cust_txt_col(BaseEstimator, TransformerMixin):
    """Pipeline step that selects one entry by key and casts its values to ``str``."""

    def __init__(self, key):
        # Key (column name) looked up on the object passed to transform().
        self.key = key

    def fit(self, x, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, data_dict):
        """Return ``data_dict[self.key]`` with ``str`` applied to each element."""
        selected = data_dict[self.key]
        return selected.apply(str)

In [16]:
def str_whole_word(str1, str2, i_):
    """Count non-overlapping occurrences of ``str1`` in ``str2`` from index ``i_``.

    Despite the name, this is plain substring matching: no word-boundary
    check is performed.

    Parameters
    ----------
    str1 : str
        Substring to look for.
    str2 : str
        Text to search in.
    i_ : int
        Index in ``str2`` at which the search starts.

    Returns
    -------
    int
        Number of non-overlapping matches; 0 when ``str1`` is empty.
    """
    # An empty needle matches at every position without advancing, which made
    # the previous find()-based loop spin forever. Treat it as "no match".
    if not str1:
        return 0
    # str.count scans left to right and counts non-overlapping matches, which
    # is exactly what the manual find/advance loop computed.
    return str2.count(str1, i_)

In [32]:
def str_common_word(str1, str2):
    """Count the whitespace-separated tokens of ``str1`` that occur in ``str2``.

    Each token is tested as a substring of ``str2`` (not as a whole word), and
    a token that appears several times in ``str1`` is counted each time.
    """
    return sum(1 for token in str1.split() if token in str2)

In [50]:
class cust_regression_vals(BaseEstimator, TransformerMixin):
    """Pipeline step that strips identifier/target/text columns and returns the rest as an array."""

    def fit(self, x, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, hd_searches):
        """Drop the listed columns from ``hd_searches`` and return ``.values`` of what remains.

        All listed columns must be present, since ``drop`` is called without
        any option to ignore missing labels.
        """
        d_col_drops = [
            'id', 'relevance', 'search_term', 'product_title',
            'product_description', 'product_info', 'brand',
        ]
        remaining = hd_searches.drop(d_col_drops, axis=1)
        return remaining.values

# Load data

In [99]:
# Read the product and search tables from CSV. Paths are relative to the
# notebook's working directory.
# NOTE(review): file names suggest the text was already cleaned/stemmed
# upstream — confirm against whatever produced these files.
product = pd.read_csv('Dataset/product_stemmed.csv')
search = pd.read_csv('Dataset/search_cleaned_stemmed.csv')

In [100]:
# Print the column/non-null summary of both frames. The cell's value is the
# tuple of the two call results, which is displayed as (None, None).
product.info(),search.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 124428 entries, 0 to 124427
Data columns (total 8 columns):
prod_id                       124428 non-null int64
product_title_unit            103 non-null object
product_title_number          89328 non-null object
product_title                 124428 non-null object
product_description_unit      392 non-null object
product_description_number    109122 non-null object
product_description           124428 non-null object
brand                         124428 non-null object
dtypes: int64(1), object(7)
memory usage: 8.5+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 240760 entries, 0 to 240759
Data columns (total 6 columns):
id                    240760 non-null int64
product_uid           240760 non-null int64
relevance             74067 non-null float64
search_term_unit      151 non-null object
search_term_number    40553 non-null object
search_term           240630 non-null object
dtypes: float64(1), int64(2), object(3)
memory usa

(None, None)

In [101]:
# Left-join product attributes onto every search row, matching
# search.product_uid to product.prod_id. how='left' keeps all search rows.
df_all = search.merge(product,left_on='product_uid',right_on='prod_id',how='left')

In [102]:
# Sanity-check the merge: entry count and per-column non-null counts.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 240760 entries, 0 to 240759
Data columns (total 14 columns):
id                            240760 non-null int64
product_uid                   240760 non-null int64
relevance                     74067 non-null float64
search_term_unit              151 non-null object
search_term_number            40553 non-null object
search_term                   240630 non-null object
prod_id                       240760 non-null int64
product_title_unit            183 non-null object
product_title_number          176404 non-null object
product_title                 240760 non-null object
product_description_unit      750 non-null object
product_description_number    215548 non-null object
product_description           240760 non-null object
brand                         240760 non-null object
dtypes: float64(1), int64(3), object(10)
memory usage: 27.6+ MB


In [103]:
def len_of_query(s):
    """Return the number of whitespace-separated tokens in ``s``.

    Missing values count as zero tokens: ``None`` and any float (including
    float subclasses such as ``numpy.float64``, which is how NaN commonly
    appears in an object column) return 0.

    Parameters
    ----------
    s : str, float or None
        The query text, or a missing-value marker.

    Returns
    -------
    int
        Token count, or 0 for a missing value.
    """
    # isinstance (rather than an exact type comparison) also catches float
    # subclasses, which would otherwise reach .split() and raise.
    if s is None or isinstance(s, float):
        return 0
    return len(s.split())

In [114]:
# Replace missing search terms with the one-word placeholder 'None'.
# Assigning the result back to the column (instead of inplace=True on an
# attribute-accessed Series) guarantees the frame itself is updated, also
# under pandas Copy-on-Write where chained in-place calls act on a temporary.
# A one-token placeholder also keeps len_of_query >= 1 for these rows, which
# matters because later ratio features divide by that count.
df_all['search_term'] = df_all['search_term'].fillna('None')

In [117]:
# --- Token counts for the query and each product text field ---------------
df_all['len_of_query'] = df_all['search_term'].map(len_of_query).astype(np.int64)
df_all['len_of_title'] = df_all['product_title'].map(lambda x: len(x.split())).astype(np.int64)
df_all['len_of_description'] = df_all['product_description'].map(lambda x: len(x.split())).astype(np.int64)
df_all['len_of_brand'] = df_all['brand'].map(lambda x: len(x.split())).astype(np.int64)

# Tab-joined query/title/description. No longer used to derive the features
# below, but still created because cust_regression_vals drops it by name.
df_all['product_info'] = df_all['search_term']+"\t"+df_all['product_title'] +"\t"+df_all['product_description']

# --- Query/product overlap features ---------------------------------------
# Computed straight from the source columns rather than by re-splitting
# product_info on tabs for every feature: that repeated the str()/split work
# eight times per row and picked the wrong field whenever a text value itself
# contained a tab.
_queries = df_all['search_term']
_titles = df_all['product_title']
_descriptions = df_all['product_description']

# Occurrences of the full query string inside the title / description.
df_all['query_in_title'] = [str_whole_word(q, t, 0) for q, t in zip(_queries, _titles)]
df_all['query_in_description'] = [str_whole_word(q, d, 0) for q, d in zip(_queries, _descriptions)]

# Number of query tokens found (as substrings) in the title / description.
df_all['word_in_title'] = [str_common_word(q, t) for q, t in zip(_queries, _titles)]
df_all['word_in_description'] = [str_common_word(q, d) for q, d in zip(_queries, _descriptions)]

# Share of query tokens that were found. A query with zero tokens gives 0/0
# (NaN), which the tree model cannot consume, so treat it as "no overlap".
df_all['ratio_title'] = (df_all['word_in_title']/df_all['len_of_query']).fillna(0.0)
df_all['ratio_description'] = (df_all['word_in_description']/df_all['len_of_query']).fillna(0.0)

In [118]:
# Columns removed before modelling: the *_unit / *_number columns and the
# two join keys used by the merge.
unused_cols = [
    'search_term_unit', 'search_term_number',
    'product_title_unit', 'product_title_number',
    'product_description_unit', 'product_description_number',
    'product_uid', 'prod_id',
]
df_all_new = df_all.drop(unused_cols, axis=1)
df_all_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 240760 entries, 0 to 240759
Data columns (total 17 columns):
id                      240760 non-null int64
relevance               74067 non-null float64
search_term             240760 non-null object
product_title           240760 non-null object
product_description     240760 non-null object
brand                   240760 non-null object
len_of_query            240760 non-null int64
len_of_title            240760 non-null int64
len_of_description      240760 non-null int64
len_of_brand            240760 non-null int64
product_info            240760 non-null object
query_in_title          240760 non-null int64
query_in_description    240760 non-null int64
word_in_title           240760 non-null int64
word_in_description     240760 non-null int64
ratio_title             240760 non-null float64
ratio_description       240760 non-null float64
dtypes: float64(3), int64(9), object(5)
memory usage: 33.1+ MB


# Model

In [119]:
# Rows with a relevance label form the training set; rows without one are
# the rows to predict. Splitting by mask (rather than by position with
# iloc[:num_train]) stays correct even if labelled rows are not all first.
has_label = df_all_new['relevance'].notnull()
num_train = int(has_label.sum())  # number of labelled rows

df_train = df_all_new[has_label]
df_test = df_all_new[~has_label]

id_test = df_test['id']                  # ids to pair with the predictions
y_train = df_train['relevance'].values   # regression target

# The full frames are passed on as X; the pipeline's own transformers select
# the columns they need.
X_train = df_train[:]
X_test = df_test[:]

In [120]:
# List the distinct relevance values present in the training target.
np.unique(y_train)

array([ 1.  ,  1.25,  1.33,  1.5 ,  1.67,  1.75,  2.  ,  2.25,  2.33,
        2.5 ,  2.67,  2.75,  3.  ])

In [121]:
# Final regressor. Note the variable is called `rfr` but holds an
# ExtraTreesRegressor, not a random forest.
rfr = ExtraTreesRegressor(
    n_estimators=200,
    n_jobs=-1,
    random_state=2016,
    verbose=1,
)

# Unigram TF-IDF with the built-in English stop-word list.
tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words='english')

# Reduces each TF-IDF matrix to 20 components.
tsvd = TruncatedSVD(n_components=20, random_state=2016)

In [122]:
def _text_svd_branch(column, suffix):
    """Build the select-column -> TF-IDF -> SVD sub-pipeline for one text column.

    Each branch receives its own unfitted copies of the ``tfidf`` and ``tsvd``
    templates. Passing the same instances to every branch would make the
    branches share fitted state whenever the pipeline is fitted without being
    cloned first, with each fit overwriting the previous vocabulary/components.

    ``suffix`` is appended to the step names ('s', 'tfidf', 'tsvd') so the
    step names stay the same as before (e.g. 'tfidf1', 'tsvd1').
    """
    return pipeline.Pipeline([
        ('s' + suffix, cust_txt_col(key=column)),
        ('tfidf' + suffix, clone(tfidf)),
        ('tsvd' + suffix, clone(tsvd)),
    ])


# Full model: numeric features plus four SVD-reduced text blocks are
# concatenated by the FeatureUnion and fed to the regressor.
clf = pipeline.Pipeline([
    ('union', FeatureUnion(
        transformer_list=[
            ('cst', cust_regression_vals()),
            ('txt1', _text_svd_branch('search_term', '1')),
            ('txt2', _text_svd_branch('product_title', '2')),
            ('txt3', _text_svd_branch('product_description', '3')),
            ('txt4', _text_svd_branch('brand', '4')),
        ],
        # Multiplicative weight applied to each block's output.
        # NOTE(review): 'txt3' is weighted 0.0, so the description block is
        # still computed but contributes only zeros — confirm this is intended.
        transformer_weights={
            'cst': 1.0,
            'txt1': 0.5,
            'txt2': 0.25,
            'txt3': 0.0,
            'txt4': 0.5,
        },
        n_jobs=-1,
    )),
    ('rfr', rfr),
])

In [123]:
# Single-point grid: one value per parameter, so exactly one candidate is
# evaluated. The 'rfr__' prefix routes each parameter to the pipeline step
# named 'rfr'.
param_grid = {'rfr__max_features': [60], 'rfr__max_depth': [30]}
# 2-fold cross-validation scored with the RMSE scorer; because that scorer
# was created with greater_is_better=False, reported scores are negative.
# NOTE(review): n_jobs=-1 is set here, on the FeatureUnion and on the
# regressor at the same time — check this does not oversubscribe the machine.
model = grid_search.GridSearchCV(estimator = clf, param_grid = param_grid, n_jobs = -1, cv = 2, verbose = 1, scoring=RMSE)
model.fit(X_train, y_train)

Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   3 out of   2 | elapsed:   53.1s remaining:  -17.7s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   53.1s finished
  for name, trans in self.transformer_list)
  for name, trans in self.transformer_list)
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   32.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   35.3s finished
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   35.3s finished
  for name, trans in self.transformer_list)
  for name, trans in self.transformer_list)
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 200 out of 

GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(steps=[('union', FeatureUnion(n_jobs=-1,
       transformer_list=[('cst', cust_regression_vals()), ('txt1', Pipeline(steps=[('s1', cust_txt_col(key='search_term')), ('tfidf1', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8...imators=200, n_jobs=-1, oob_score=False, random_state=2016,
          verbose=1, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'rfr__max_features': [60], 'rfr__max_depth': [30]},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=make_scorer(fmean_squared_error, greater_is_better=False),
       verbose=1)

In [124]:
# Report the selected parameters and the cross-validated score. The score is
# the negated RMSE (the scorer was built with greater_is_better=False), so a
# value closer to zero is better.
print("Best parameters found by grid search:")
print(model.best_params_)
print("Best CV score:")
print(model.best_score_)

Best parameters found by grid search:
{'rfr__max_features': 60, 'rfr__max_depth': 30}
Best CV score:
-0.488372911151


In [125]:
# Predict relevance for the unlabelled rows with the fitted grid-search model.
y_pred = model.predict(X_test)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    2.2s
[Parallel(n_jobs=8)]: Done 200 out of 200 | elapsed:    2.4s finished


In [126]:
# Pair each test id with its predicted relevance and write the result to CSV
# without the DataFrame index.
pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('Dataset/submission.csv',index=False)