### Introduction

Build models using Catboost, Lightgbm and NaiveBayes algorithm in Python. Given the text classification problem, we will clean data, create bag of words matrix, tf-idf matrix. 

Next we will create a simple voting ensemble from the predictions generated from these models here.

In [1]:
# Load Libraries
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, make_scorer
import pickle
import matplotlib.pyplot as plt

In [2]:
# load data
train_feats1 = pickle.load(open('./proxy/train_feats1.bin','rb'))
train_feats2 = pickle.load(open('./proxy/train_feats2.bin','rb'))
test_feats1 = pickle.load(open('./proxy/test_feats1.bin','rb'))
test_feats2 = pickle.load(open('./proxy/test_feats2.bin','rb'))
target = pickle.load(open('./proxy/labels.bin','rb'))
User_ID = pickle.load(open('./proxy/userid.bin','rb'))

In [4]:
target.value_counts(normalize=True)

1    0.681213
0    0.318787
Name: Is_Response, dtype: float64

In [2]:
# load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
train.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
0,id10326,The room was kind of clean but had a VERY stro...,Edge,Mobile,not happy
1,id10327,I stayed at the Crown Plaza April -- - April -...,Internet Explorer,Mobile,not happy
2,id10328,I booked this hotel through Hotwire at the low...,Mozilla,Tablet,not happy
3,id10329,Stayed here with husband and sons on the way t...,InternetExplorer,Desktop,happy
4,id10330,My girlfriends and I stayed here to celebrate ...,Edge,Tablet,not happy


In [4]:
train.Is_Response.value_counts()

happy        26521
not happy    12411
Name: Is_Response, dtype: int64

In [5]:
# function to clean data

stops = set(stopwords.words("english"))
def cleanData(text, lowercase = False, remove_stops = False, stemming = False):
    txt = str(text)
    txt = re.sub(r'[^A-Za-z0-9\s]',r'',txt)
    txt = re.sub(r'\n',r' ',txt)
    
    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])
        
    if remove_stops:
        txt = " ".join([w for w in txt.split() if w not in stops])
    
    if stemming:
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in txt.split()])

    return txt

In [6]:
## join data
test['Is_Response'] = np.nan
alldata = pd.concat([train, test]).reset_index(drop=True)

In [7]:
# clean description
alldata['Description'] = alldata['Description'].map(lambda x: cleanData(x, lowercase=True, remove_stops=True, stemming=True))

In [8]:
# initialise the functions - we'll create separate models for each type.
countvec = CountVectorizer(analyzer='word', ngram_range = (1,2), min_df=150, max_features=500)
tfidfvec = TfidfVectorizer(analyzer='word', ngram_range = (1,2), min_df =150, max_features=500)

In [9]:
# create features
bagofwords = countvec.fit_transform(alldata['Description'])
tfidfdata = tfidfvec.fit_transform(alldata['Description'])

In [10]:
# label encode categorical features in data given
cols = ['Browser_Used','Device_Used']

for x in cols:
    lbl = LabelEncoder()
    alldata[x] = lbl.fit_transform(alldata[x])

In [11]:
# create dataframe for features
bow_df = pd.DataFrame(bagofwords.todense())
tfidf_df = pd.DataFrame(tfidfdata.todense())

In [12]:
# set column names
bow_df.columns = ['col'+ str(x) for x in bow_df.columns]
tfidf_df.columns = ['col' + str(x) for x in tfidf_df.columns]

In [13]:
# create separate data frame for bag of words and tf-idf

bow_df_train = bow_df[:len(train)]
bow_df_test = bow_df[len(train):]

tfid_df_train = tfidf_df[:len(train)]
tfid_df_test = tfidf_df[len(train):]

In [14]:
# split the merged data file into train and test respectively
train_feats = alldata[~pd.isnull(alldata.Is_Response)]
test_feats = alldata[pd.isnull(alldata.Is_Response)]

In [15]:
### set target variable

train_feats['Is_Response'] = [1 if x == 'happy' else 0 for x in train_feats['Is_Response']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [16]:
# merge count (bag of word) features into train
train_feats1 = pd.concat([train_feats[cols], bow_df_train], axis = 1)
test_feats1 = pd.concat([test_feats[cols], bow_df_test], axis=1)

test_feats1.reset_index(drop=True, inplace=True)

In [17]:
# merge into a new data frame with tf-idf features
train_feats2 = pd.concat([train_feats[cols], tfid_df_train], axis=1)
test_feats2 = pd.concat([test_feats[cols], tfid_df_test], axis=1)

### NaiveBayes

In [18]:
# let's check cross validation score of the model
# cv score acts a unbiased estimate of models accuracy on unseen data

mod1 = GaussianNB()
target = train_feats['Is_Response']

In [19]:
## Naive Bayes 1
print(cross_val_score(mod1, train_feats1, target, cv=5, scoring=make_scorer(accuracy_score)))

[ 0.78261428  0.77099923  0.7784485   0.77793475  0.78628307]


In [20]:
## Naive Bayes 2 - tfidf is giving higher CV score
print(cross_val_score(mod1, train_feats2, target, cv=5, scoring=make_scorer(accuracy_score)))

[ 0.80559836  0.81274082  0.80863088  0.81274082  0.80362189]


In [None]:
# make our first set of predictions

clf1 = GaussianNB()
clf1.fit(train_feats1, target)

clf2 = GaussianNB()
clf2.fit(train_feats2, target)

In [None]:
preds1 = clf1.predict(test_feats1)
preds2 = clf2.predict(test_feats2)

In [None]:
def to_labels(x):
    if x == 1:
        return "happy"
    return "not_happy"

In [None]:
sub1 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds1})
sub1['Is_Response'] = sub1['Is_Response'].map(lambda x: to_labels(x))

In [None]:
sub2 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds2})
sub2['Is_Response'] = sub2['Is_Response'].map(lambda x: to_labels(x))

In [None]:
sub1 = sub1[['User_ID', 'Is_Response']]
sub2 = sub2[['User_ID', 'Is_Response']]

In [None]:
## write submission files
sub1.to_csv('submissions/sub1_cv.csv', index=False)
sub2.to_csv('submissions/sub2_tf.csv', index=False)

### LightGBM - 1

We are prefering lightgbm over xgboost because of its speed. <br />
In this model, we'll use count features for model training.

In [3]:
import lightgbm as lgb

In [4]:
# set the data in format lgb accepts
d_train = lgb.Dataset(train_feats1, label = target)

In [5]:
## set parameters
## you can tune the parameters can try to better score

params = {'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'num_iterations': 200,
    'metric': 'binary_error',
    'learning_rate': 0.05, 
    'max_depth': 12, 
    'num_leaves': 21, 
    'feature_fraction': 0.5, 
    'bagging_fraction': 0.9, 
    'bagging_freq': 5}

In [6]:
lgb_cv = lgb.cv(params, d_train, num_boost_round=600, nfold= 5, shuffle=True, stratified=True, verbose_eval=20, early_stopping_rounds=40)

[20]	cv_agg's binary_error: 0.194596 + 0.00269656
[40]	cv_agg's binary_error: 0.17641 + 0.00295816
[60]	cv_agg's binary_error: 0.164338 + 0.00282463
[80]	cv_agg's binary_error: 0.15545 + 0.00203493
[100]	cv_agg's binary_error: 0.14944 + 0.00251394
[120]	cv_agg's binary_error: 0.144534 + 0.00221926
[140]	cv_agg's binary_error: 0.140476 + 0.00302015
[160]	cv_agg's binary_error: 0.137239 + 0.00288219
[180]	cv_agg's binary_error: 0.135184 + 0.00259829
[200]	cv_agg's binary_error: 0.13277 + 0.00313745
[220]	cv_agg's binary_error: 0.131126 + 0.00334159
[240]	cv_agg's binary_error: 0.129816 + 0.00411599
[260]	cv_agg's binary_error: 0.129148 + 0.0040012
[280]	cv_agg's binary_error: 0.128069 + 0.00373029
[300]	cv_agg's binary_error: 0.126991 + 0.00342644
[320]	cv_agg's binary_error: 0.126734 + 0.00341319
[340]	cv_agg's binary_error: 0.126066 + 0.00342685
[360]	cv_agg's binary_error: 0.125552 + 0.00321423
[380]	cv_agg's binary_error: 0.124756 + 0.00355743
[400]	cv_agg's binary_error: 0.124371 + 

In [7]:
## get nround value which hd lowest error
nround = lgb_cv['binary_error-mean'].index(np.min(lgb_cv['binary_error-mean']))

In [8]:
## train the model
model = lgb.train(params, d_train, num_boost_round=nround)

In [9]:
## make predictions
preds = model.predict(test_feats1)

In [12]:
# make submission

def to_labels(x):
    if x > 0.:  # cutoff - you can change it and see if accuracy improves or plot AUC curve. 
        return "happy"
    return "not_happy"
sub3 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds})
sub3['Is_Response'] = sub3['Is_Response'].map(lambda x: to_labels(x))
sub3 = sub3[['User_ID','Is_Response']]
sub3.to_csv('submissions/sub3_lgb.csv', index=False) # 0.85518

In [23]:
sub3

Unnamed: 0,User_ID,Is_Response
0,id80132,0
1,id80133,1
2,id80134,1
3,id80135,0
4,id80136,1
5,id80137,1
6,id80138,1
7,id80139,0
8,id80140,1
9,id80141,1


### LightGBM - 2

In this model, we'll use tf-idf features for model training.

In [15]:
# set data format
d_train = lgb.Dataset(train_feats2, label = target)

In [16]:
## set parameters
## you can tune the parameters can try to better score

params = {'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'num_iterations': 200,
    'metric': 'binary_error',
    'learning_rate': 0.05, 
    'max_depth': 12, 
    'num_leaves': 21, 
    'feature_fraction': 0.5, 
    'bagging_fraction': 0.9, 
    'bagging_freq': 5}

In [17]:
## do cross validation to find nround i.e. at this round (iteration) we can expect lowest error
lgb_cv = lgb.cv(params, d_train, num_boost_round=500, nfold= 5, shuffle=True, stratified=True, verbose_eval=20, early_stopping_rounds=40)

[20]	cv_agg's binary_error: 0.190358 + 0.00340262
[40]	cv_agg's binary_error: 0.173405 + 0.00222938
[60]	cv_agg's binary_error: 0.163465 + 0.00241333
[80]	cv_agg's binary_error: 0.154192 + 0.00280347
[100]	cv_agg's binary_error: 0.148156 + 0.00248319
[120]	cv_agg's binary_error: 0.144072 + 0.00268211
[140]	cv_agg's binary_error: 0.140476 + 0.00198786
[160]	cv_agg's binary_error: 0.13783 + 0.00243854
[180]	cv_agg's binary_error: 0.135441 + 0.00255368
[200]	cv_agg's binary_error: 0.133772 + 0.00208681
[220]	cv_agg's binary_error: 0.132308 + 0.00229294
[240]	cv_agg's binary_error: 0.130741 + 0.00207624
[260]	cv_agg's binary_error: 0.129662 + 0.00299886
[280]	cv_agg's binary_error: 0.129225 + 0.00256252
[300]	cv_agg's binary_error: 0.128891 + 0.00302137
[320]	cv_agg's binary_error: 0.128224 + 0.0030393
[340]	cv_agg's binary_error: 0.128198 + 0.00329641


In [18]:
# get nround value
nround = lgb_cv['binary_error-mean'].index(np.min(lgb_cv['binary_error-mean']))

In [19]:
# train model
model = lgb.train(params, d_train, num_boost_round=nround)

In [22]:
# make prediction
preds = model.predict(test_feats2)

In [21]:
# make submission

def to_labels(x):
    if x > 0.73:
        return 1
    return 0

sub4 = pd.DataFrame({'User_ID':User_ID, 'Is_Response':preds})
sub4['Is_Response'] = sub4['Is_Response'].map(lambda x: to_labels(x))
sub4 = sub4[['User_ID','Is_Response']]
sub4.to_csv('./output/sub4.csv', index=False) # 0.84925

In [24]:
submission = pd.read_csv("./output/ensemble.csv")

In [26]:
def to_labels(x):
    if x > 0.73:
        return "happy"
    return "not_happy"

submission['Is_Response'] = submission['Is_Response'].map(lambda x: to_labels(x))
submission.to_csv('./output/final.csv', index=False)

In [27]:
submission

Unnamed: 0,User_ID,Is_Response
0,id100001,not_happy
1,id100002,not_happy
2,id100003,not_happy
3,id100004,happy
4,id100005,happy
5,id100006,happy
6,id100007,happy
7,id100008,happy
8,id100009,happy
9,id100010,not_happy


### CatBoost

Catboost is a new package recently launched by Yandex. It is said that it works well when the data has many categorical features. We'll use it on count data and see it our model improves.

In [24]:
## import library
from catboost import CatBoostClassifier,cv, Pool

In [25]:
## catboost accepts categorical columns as a list of column numbers. In this data, all columns are categorical
cat_cols = [x for x in range(502)] ## 502 == train_feats1.shape[1] 

In [26]:
## set parameters
## you can refer the parameters here: https://tech.yandex.com/catboost/doc/dg/concepts/python-reference_parameters-list-docpage/#python-reference_parameters-list
param = {
    'use_best_model':True,
    'loss_function':'CrossEntropy',
    'eval_metric':'Accuracy',
    'iterations':1000,
    'depth':6,
    'learning_rate':0.03,
    'rsm':0.3,
    'random_seed':2017,
    
    
}

In [None]:
## for doing cross validation, set data in Pool format
my_dt =  Pool(train_feats1, 
           label=target,
           cat_features=cat_cols,
           column_description=None,
           delimiter='\t',
           has_header=None,
           weight=None, 
           baseline=None,
           feature_names=None,
           thread_count=1)

In [None]:
## run cv to get best iteration
ctb_cv = cv(param, my_dt, fold_count=5, random_seed=2017)

In [None]:
# fetch best round
best_round = ctb_cv['b\'Accuracy\'_test_avg'].index(np.max(ctb_cv['b\'Accuracy\'_test_avg']))

In [None]:
## define the classifer model
model = CatBoostClassifier(iterations=best_round, learning_rate=0.03,rsm = 0.3 ,depth=6, eval_metric='Accuracy', random_seed=2017)

In [None]:
## train model
model.fit(my_dt)

In [None]:
## make predictions
preds = model.predict(test_feats1)

In [None]:
## make submission
sub5 = pd.DataFrame({'User_ID':test.User_ID, 'Is_Response':preds})
sub5['Is_Response'] = ['happy' if x == 1 else 'not_happy' for x in sub5['Is_Response']]
sub5 = sub5[['User_ID','Is_Response']]
sub5.to_csv('submissions/sub5_cb.csv', index=False)