In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv('G:/Downloads/crowdflower-search-relevance/train.csv/train.csv')
test = pd.read_csv('G:/Downloads/crowdflower-search-relevance/test.csv/test.csv')
sample = pd.read_csv('G:/Downloads/crowdflower-search-relevance/sampleSubmission.csv/sampleSubmission.csv')

In [3]:
train.shape

(10158, 6)

((6805, 4), (3353, 4), (6805,), (3353,))

In [46]:
train['product_description'].fillna('0', inplace=True)

In [47]:
train['weight'] = 1 / (1 + train['relevance_variance'])

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import scipy
import nltk
from scipy.sparse import coo_matrix, hstack
from nltk.stem.snowball import SnowballStemmer

In [49]:
stop_words = set(stopwords.words('english')) 

In [50]:
train['query'] = train['query'].str.lower()
train['product_title'] = train['product_title'].str.lower()
train['product_description'] = train['product_description'].str.lower()

In [51]:
train['query'] = train['query'].str.split()
train['product_title'] = train['product_title'].str.split()
train['product_description'] = train['product_description'].str.split()

In [52]:
stemmer = SnowballStemmer("english")

In [53]:
train['query'] = train['query'].apply(lambda x: [stemmer.stem(y) for y in x])
train['product_title'] = train['product_title'].apply(lambda x: [stemmer.stem(y) for y in x])
train['product_description'] = train['product_description'].apply(lambda x: [stemmer.stem(y) for y in x])

In [54]:
def listToString(s):  
    str1 = ""  
    for ele in s:  
        str1 = str1 + ' ' + ele      
    return str1[1:]

In [55]:
train['query'] = train['query'].apply(listToString)
train['product_title'] = train['product_title'].apply(listToString)
train['product_description'] = train['product_description'].apply(listToString)

In [56]:
vectorizer = CountVectorizer(stop_words=stop_words, ngram_range=(1,6))
vectorizer1 = CountVectorizer(stop_words=stop_words, ngram_range=(1,6))
vectorizer2 = CountVectorizer(stop_words=stop_words, ngram_range=(1,6))

In [57]:
X = vectorizer.fit_transform(train['query'])
X1 = vectorizer1.fit_transform(train['product_title'])
X2 = vectorizer2.fit_transform(train['product_description'])

In [58]:
X.shape,X1.shape,X2.shape

((10158, 955), (10158, 195556), (10158, 1499068))

In [59]:
tr = hstack([X,X1,X2])

In [33]:
tr.shape

(10158, 1695579)

In [60]:
X_train, X_test, y_train, y_test = train_test_split(tr, train['median_relevance'], 
                                                    test_size=0.3, random_state=123)

In [61]:
weight = train['weight'].iloc[y_train.index]

In [62]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7110, 1695579), (3048, 1695579), (7110,), (3048,))

In [63]:
from xgboost import XGBClassifier
from sklearn.metrics import cohen_kappa_score

In [67]:
%%time
xgb = XGBClassifier(eval_metric='mlogloss', sample_weight=weight, n_estimators=1500, n_jobs=-1)
xgb.fit(X_train, y_train)

Parameters: { sample_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Wall time: 39min 42s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=1500, n_jobs=-1,
              num_parallel_tree=1, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1,
              sample_weight=281     0.474834
9459    1.000000
347     0.679810
3613    0.679810
5408    1.000000
          ...   
9785    1.000000
7763    1.000000
5218    0.679810
1346    0.679810
3582    0.679810
Name: weight, Length: 7110, dtype: float64,
              scale_pos_weight=None, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [68]:
y_preds = xgb.predict(X_test)

In [69]:
cohen_kappa_score(y_test,y_preds,weights='quadratic')

0.48085816969532746

In [66]:
cohen_kappa_score(y_test,y_preds,weights='quadratic')

0.3866078601838173

In [70]:
from catboost import CatBoostClassifier

In [73]:
%%time
cat = CatBoostClassifier(task_type="GPU",n_estimators=500)
cat.fit(X_train, y_train)

Parameters: { task_type } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Wall time: 9min 1s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1, task_type='GPU',
              tree_method='exact', validate_parameters=1, verbosity=None)

In [75]:
y_preds = xgb.predict(X_test)
cohen_kappa_score(y_test,y_preds,weights='quadratic')

0.48085816969532746

In [72]:
y_preds = xgb.predict(X_test)
cohen_kappa_score(y_test,y_preds,weights='quadratic')

0.48085816969532746

In [120]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
%%time
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [None]:
y_preds = rf.predict(X_test)
cohen_kappa_score(y_test,y_preds,weights='quadratic')

In [69]:
from xgboost import XGBRegressor

In [24]:
%%time
xgb = XGBRegressor(n_estimators=1500, eval_metric='rmse')
xgb.fit(X_train, y_train)

Wall time: 2min 26s


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, eval_metric='rmse',
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1500, n_jobs=0,
             num_parallel_tree=1, objective='reg:squarederror', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [26]:
y_preds = xgb.predict(X_test)

In [29]:
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import r2_score

In [30]:
mean_squared_error(y_test,y_preds), r2_score(y_test,y_preds)

(1.0744249741728824, 0.21294389183428275)

In [31]:
y_preds

array([1.8547031, 1.8616464, 1.8666785, ..., 3.6828585, 2.9216523,
       2.5272646], dtype=float32)

In [34]:
iii = y_test.index

In [35]:
y_ts = train['median_relevance'].iloc[iii]

In [38]:
y_pr_clf = []
for i in y_preds:
    if i < 1.5:
        y_pr_clf.append(1)
    elif i < 2.5:
        y_pr_clf.append(2)
    elif i < 3.5:
        y_pr_clf.append(3)
    else:
        y_pr_clf.append(4)

In [41]:
cohen_kappa_score(y_ts,y_pr_clf,weights='quadratic')

0.3953204221946591

In [54]:
sample['prediction'] = y_preds

In [55]:
sample.to_csv('G:/Downloads/crowdflower-search-relevance/sampleSubmission.csv/sampleSubmission.csv', index=False)

In [9]:
from gensim.models import Word2Vec