In [2]:
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas, xgboost, numpy, string
import pandas as pd
import numpy as np

In [6]:
# Display the installed xgboost version so the run can be reproduced.
xgboost.__version__

'0.81'

In [7]:
# Load the test-set review texts and their labels into single-column frames.
# Each file is read with read_table's default tab separator and no header row;
# the temporary column names 'A' / 'B' are renamed in a later cell.
# `names` must be a list: ('A') is just the string 'A', which newer pandas
# rejects as a non-list-like value.
# NOTE(review): default quoting treats '"' as a quote character -- confirm
# that reviews containing double quotes are parsed one-per-line as intended.
df_test_text = pd.read_table('imdb_test_text.txt', names=['A'])
df_test_labels = pd.read_table('imdb_test_labels.txt', names=['B'])

In [8]:
# Merge review texts with their labels on the shared positional index.
# The two frames must have the same number of rows; otherwise the outer join
# would silently pad the shorter side with NaN instead of failing.
assert len(df_test_text) == len(df_test_labels), (
    "test text and label files have different row counts"
)
df_test = df_test_text.join(df_test_labels, how='outer')
df_test.shape

(25000, 2)

In [9]:
# Give the placeholder columns descriptive names and label the index.
test_column_names = {'A': 'review_text', 'B': 'review_label'}
df_test = df_test.rename(columns=test_column_names).rename_axis('S.No.')
# df_test

In [10]:
# Load the training-set review texts and their labels into single-column frames.
# Each file is read with read_table's default tab separator and no header row;
# the temporary column names 'A' / 'B' are renamed in a later cell.
# `names` must be a list: ('A') is just the string 'A', which newer pandas
# rejects as a non-list-like value.
# NOTE(review): default quoting treats '"' as a quote character -- confirm
# that reviews containing double quotes are parsed one-per-line as intended.
df_train_text = pd.read_table('imdb_train_text.txt', names=['A'])
df_train_labels = pd.read_table('imdb_train_labels.txt', names=['B'])

In [11]:
# Merge review texts with their labels on the shared positional index.
# The two frames must have the same number of rows; otherwise the outer join
# would silently pad the shorter side with NaN instead of failing.
assert len(df_train_text) == len(df_train_labels), (
    "train text and label files have different row counts"
)
df_train = df_train_text.join(df_train_labels, how='outer')

In [12]:
# Give the placeholder columns descriptive names and label the index.
train_column_names = {'A': 'review_text', 'B': 'review_label'}
df_train = df_train.rename(columns=train_column_names).rename_axis('S.No.')
# df_train

In [13]:
# Row/column count of the merged training frame.
df_train.shape

(25000, 2)

In [14]:
# Stack the train and test frames row-wise, then move the 'S.No.' index into
# an ordinary column so the combined frame gets a fresh 0..n-1 index.
df_all = pd.concat([df_train, df_test], axis=0).reset_index()
df_all.shape

(50000, 3)

In [15]:
# Discard the old per-file row numbers that reset_index() turned into a column.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the column
# has already been removed.
df_all = df_all.drop(columns=['S.No.'], errors='ignore')

In [16]:
# Re-split the pooled reviews: 80% for training, 20% held out, with a fixed
# seed so the partition is reproducible.
review_texts = df_all['review_text']
review_labels = df_all['review_label']
X_train, X_test, y_train, y_test = train_test_split(
    review_texts,
    review_labels,
    test_size=0.2,
    random_state=42,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((40000,), (10000,), (40000,), (10000,))

In [17]:
# Distinct label values present in each split (sanity check before encoding).
y_train.unique(), y_test.unique()

(array([ 3,  8,  2,  1, 10,  4,  7,  9], dtype=int64),
 array([ 9, 10,  8,  3,  1,  4,  7,  2], dtype=int64))

In [18]:
# Label-encode the target variable.
# The encoder learns its classes from the training labels only; the test
# labels are mapped with that same fitted encoder.
encoder = LabelEncoder().fit(y_train)
y_train_en = encoder.transform(y_train)
y_test_en = encoder.transform(y_test)

y_train_en.shape, y_test_en.shape

((40000,), (10000,))

In [19]:
# # create a count vectorizer object 
# count_vect = CountVectorizer(analyzer='word', 
#                              token_pattern=r'\w{1,}')
# count_vect.fit(X_train)

# # transform the training and validation data using count vectorizer object
# X_train_count =  count_vect.transform(X_train)
# X_val_count =  count_vect.transform(X_val)

# print(X_train_count.shape, X_val_count.shape)

In [20]:
%%time

# Vocabulary limits shared by all three TF-IDF vectorizers:
#   max_features - cap on the vocabulary size
#   min_df       - drop terms appearing in fewer than 1% of the documents
#   max_df       - drop terms appearing in more than 95% of the documents
tfidf_shared_kwargs = dict(max_features=1000,
                           min_df=0.01,
                           max_df=0.95)

# word level tf-idf (single tokens of one or more word characters)
tfidf_vect = TfidfVectorizer(analyzer='word',
                             token_pattern=r'\w{1,}',
                             **tfidf_shared_kwargs)
X_train_tfidf = tfidf_vect.fit_transform(X_train)

# ngram level tf-idf (word bigrams and trigrams)
tfidf_vect_ngram = TfidfVectorizer(analyzer='word',
                                   token_pattern=r'\w{1,}',
                                   ngram_range=(2, 3),
                                   **tfidf_shared_kwargs)
X_train_tfidf_ngram = tfidf_vect_ngram.fit_transform(X_train)

# characters level tf-idf (character bigrams and trigrams)
# token_pattern is intentionally omitted here: scikit-learn only applies it
# when analyzer == 'word', so passing it alongside analyzer='char' was a no-op.
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char',
                                         ngram_range=(2, 3),
                                         **tfidf_shared_kwargs)
X_train_tfidf_ngram_chars = tfidf_vect_ngram_chars.fit_transform(X_train)


Wall time: 2min 22s


In [21]:
%%time
# Project the held-out reviews with the vectorizers fitted on the training
# split (transform only -- no refitting on the test data).
X_test_tfidf, X_test_tfidf_ngram, X_test_tfidf_ngram_chars = (
    fitted_vectorizer.transform(X_test)
    for fitted_vectorizer in (tfidf_vect,
                              tfidf_vect_ngram,
                              tfidf_vect_ngram_chars)
)

Wall time: 21.4 s


In [22]:
# # Extreme Gradient Boosting on Count Vectors
# accuracy_cv = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
# print ("Xgb, Count Vectors: ", accuracy_cv)

In [23]:
# Inspect the character n-gram TF-IDF matrix (the repr shows shape and stored element count).
X_train_tfidf_ngram_chars

<40000x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 20752370 stored elements in Compressed Sparse Row format>

In [24]:
# Wrap the encoded label arrays in pandas Series.
y_train_en, y_test_en = pd.Series(y_train_en), pd.Series(y_test_en)

In [1]:
# y_train_en.value_counts()

In [2]:
# y_test_en.value_counts()

In [4]:
import multiprocessing

# Use all but one CPU core for model training, but never fewer than one
# worker: on a single-core host cpu_count() - 1 would otherwise be 0.
n_jobs_cnt = max(1, multiprocessing.cpu_count() - 1)
n_jobs_cnt

7

In [5]:
# Base gradient-boosted classifier: 300 boosting rounds, trained with
# n_jobs_cnt parallel threads. The remaining hyper-parameters keep their
# defaults and are tuned by the grid search.
xgb_clf = xgboost.XGBClassifier(
    n_jobs=n_jobs_cnt,
    n_estimators=300,
)

In [28]:
# Hyper-parameter grid for the XGBoost search: 2 x 2 x 2 = 8 candidates.
param_grid_xgb = dict(
    max_depth=[3, 5],
    min_child_weight=[1, 2],
    learning_rate=[0.05, 0.1],
)

In [29]:
# Exhaustive 5-fold cross-validated search over param_grid_xgb, ranked by
# accuracy. Train-fold scores are not computed, and verbose=10 prints
# progress for each fit.
grid_search_options = dict(scoring='accuracy',
                           cv=5,
                           return_train_score=False,
                           verbose=10)
grid_xgb = GridSearchCV(xgb_clf, param_grid_xgb, **grid_search_options)

In [30]:
# Inspect the word n-gram TF-IDF matrix (the repr shows shape and stored element count).
X_train_tfidf_ngram

<40000x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 1999041 stored elements in Compressed Sparse Row format>

In [31]:
%%time
# Run the grid search on the word n-gram TF-IDF features and encoded labels.
grid_xgb.fit(X_train_tfidf_ngram, y_train_en)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] learning_rate=0.05, max_depth=3, min_child_weight=1 .............


  if diff:
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.8min remaining:    0.0s


[CV]  learning_rate=0.05, max_depth=3, min_child_weight=1, score=0.3292515306759965, total= 3.8min
[CV] learning_rate=0.05, max_depth=3, min_child_weight=1 .............


  if diff:
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  8.0min remaining:    0.0s


[CV]  learning_rate=0.05, max_depth=3, min_child_weight=1, score=0.34153961509622593, total= 4.2min
[CV] learning_rate=0.05, max_depth=3, min_child_weight=1 .............


  if diff:
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 11.9min remaining:    0.0s


[CV]  learning_rate=0.05, max_depth=3, min_child_weight=1, score=0.33820772403449567, total= 3.9min
[CV] learning_rate=0.05, max_depth=3, min_child_weight=1 .............


  if diff:
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 15.4min remaining:    0.0s


[CV]  learning_rate=0.05, max_depth=3, min_child_weight=1, score=0.33775165687132674, total= 3.5min
[CV] learning_rate=0.05, max_depth=3, min_child_weight=1 .............


  if diff:
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 19.0min remaining:    0.0s


[CV]  learning_rate=0.05, max_depth=3, min_child_weight=1, score=0.3356258596973865, total= 3.6min
[CV] learning_rate=0.05, max_depth=3, min_child_weight=2 .............


  if diff:
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 22.5min remaining:    0.0s


[CV]  learning_rate=0.05, max_depth=3, min_child_weight=2, score=0.32837685867799576, total= 3.5min
[CV] learning_rate=0.05, max_depth=3, min_child_weight=2 .............


  if diff:
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 26.1min remaining:    0.0s


[CV]  learning_rate=0.05, max_depth=3, min_child_weight=2, score=0.34053986503374156, total= 3.5min
[CV] learning_rate=0.05, max_depth=3, min_child_weight=2 .............


  if diff:
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 29.4min remaining:    0.0s


[CV]  learning_rate=0.05, max_depth=3, min_child_weight=2, score=0.3370828646419198, total= 3.3min
[CV] learning_rate=0.05, max_depth=3, min_child_weight=2 .............


  if diff:
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 33.0min remaining:    0.0s


[CV]  learning_rate=0.05, max_depth=3, min_child_weight=2, score=0.3393772664749281, total= 3.5min
[CV] learning_rate=0.05, max_depth=3, min_child_weight=2 .............


  if diff:


[CV]  learning_rate=0.05, max_depth=3, min_child_weight=2, score=0.33512567212704764, total= 3.5min
[CV] learning_rate=0.05, max_depth=5, min_child_weight=1 .............


  if diff:


[CV]  learning_rate=0.05, max_depth=5, min_child_weight=1, score=0.34074721979257777, total= 4.8min
[CV] learning_rate=0.05, max_depth=5, min_child_weight=1 .............


  if diff:


[CV]  learning_rate=0.05, max_depth=5, min_child_weight=1, score=0.34978755311172205, total= 5.3min
[CV] learning_rate=0.05, max_depth=5, min_child_weight=1 .............


  if diff:


[CV]  learning_rate=0.05, max_depth=5, min_child_weight=1, score=0.34583177102862145, total= 5.4min
[CV] learning_rate=0.05, max_depth=5, min_child_weight=1 .............


  if diff:


[CV]  learning_rate=0.05, max_depth=5, min_child_weight=1, score=0.3463798924596724, total= 5.4min
[CV] learning_rate=0.05, max_depth=5, min_child_weight=1 .............


  if diff:


[CV]  learning_rate=0.05, max_depth=5, min_child_weight=1, score=0.34150306364886834, total= 5.3min
[CV] learning_rate=0.05, max_depth=5, min_child_weight=2 .............


  if diff:


[CV]  learning_rate=0.05, max_depth=5, min_child_weight=2, score=0.3384980632262901, total= 5.3min
[CV] learning_rate=0.05, max_depth=5, min_child_weight=2 .............


  if diff:


[CV]  learning_rate=0.05, max_depth=5, min_child_weight=2, score=0.34916270932266935, total= 5.2min
[CV] learning_rate=0.05, max_depth=5, min_child_weight=2 .............


  if diff:


[CV]  learning_rate=0.05, max_depth=5, min_child_weight=2, score=0.34570678665166854, total= 5.3min
[CV] learning_rate=0.05, max_depth=5, min_child_weight=2 .............


  if diff:


[CV]  learning_rate=0.05, max_depth=5, min_child_weight=2, score=0.3440040015005627, total= 5.2min
[CV] learning_rate=0.05, max_depth=5, min_child_weight=2 .............


  if diff:


[CV]  learning_rate=0.05, max_depth=5, min_child_weight=2, score=0.3417531574340378, total= 5.2min
[CV] learning_rate=0.1, max_depth=3, min_child_weight=1 ..............


  if diff:


[CV]  learning_rate=0.1, max_depth=3, min_child_weight=1, score=0.33774834437086093, total= 3.2min
[CV] learning_rate=0.1, max_depth=3, min_child_weight=1 ..............


  if diff:


[CV]  learning_rate=0.1, max_depth=3, min_child_weight=1, score=0.3537865533616596, total= 3.3min
[CV] learning_rate=0.1, max_depth=3, min_child_weight=1 ..............


  if diff:


[CV]  learning_rate=0.1, max_depth=3, min_child_weight=1, score=0.34908136482939633, total= 3.3min
[CV] learning_rate=0.1, max_depth=3, min_child_weight=1 ..............


  if diff:


[CV]  learning_rate=0.1, max_depth=3, min_child_weight=1, score=0.3496311116668751, total= 3.2min
[CV] learning_rate=0.1, max_depth=3, min_child_weight=1 ..............


  if diff:


[CV]  learning_rate=0.1, max_depth=3, min_child_weight=1, score=0.34500437664124045, total= 3.3min
[CV] learning_rate=0.1, max_depth=3, min_child_weight=2 ..............


  if diff:


[CV]  learning_rate=0.1, max_depth=3, min_child_weight=2, score=0.34337123578658, total= 3.2min
[CV] learning_rate=0.1, max_depth=3, min_child_weight=2 ..............


  if diff:


[CV]  learning_rate=0.1, max_depth=3, min_child_weight=2, score=0.35128717820544864, total= 3.2min
[CV] learning_rate=0.1, max_depth=3, min_child_weight=2 ..............


  if diff:


[CV]  learning_rate=0.1, max_depth=3, min_child_weight=2, score=0.35020622422197223, total= 3.2min
[CV] learning_rate=0.1, max_depth=3, min_child_weight=2 ..............


  if diff:


[CV]  learning_rate=0.1, max_depth=3, min_child_weight=2, score=0.3490058772039515, total= 3.2min
[CV] learning_rate=0.1, max_depth=3, min_child_weight=2 ..............


  if diff:


[CV]  learning_rate=0.1, max_depth=3, min_child_weight=2, score=0.3451294235338252, total= 3.2min
[CV] learning_rate=0.1, max_depth=5, min_child_weight=1 ..............


  if diff:


[CV]  learning_rate=0.1, max_depth=5, min_child_weight=1, score=0.34999375234287144, total= 5.2min
[CV] learning_rate=0.1, max_depth=5, min_child_weight=1 ..............


  if diff:


[CV]  learning_rate=0.1, max_depth=5, min_child_weight=1, score=0.3586603349162709, total= 5.2min
[CV] learning_rate=0.1, max_depth=5, min_child_weight=1 ..............


  if diff:


[CV]  learning_rate=0.1, max_depth=5, min_child_weight=1, score=0.35083114610673666, total= 5.2min
[CV] learning_rate=0.1, max_depth=5, min_child_weight=1 ..............


  if diff:


[CV]  learning_rate=0.1, max_depth=5, min_child_weight=1, score=0.3542578466925097, total= 5.2min
[CV] learning_rate=0.1, max_depth=5, min_child_weight=1 ..............


  if diff:


[CV]  learning_rate=0.1, max_depth=5, min_child_weight=1, score=0.34875578341878205, total= 5.2min
[CV] learning_rate=0.1, max_depth=5, min_child_weight=2 ..............


  if diff:


[CV]  learning_rate=0.1, max_depth=5, min_child_weight=2, score=0.3453704860677246, total= 5.2min
[CV] learning_rate=0.1, max_depth=5, min_child_weight=2 ..............


  if diff:


[CV]  learning_rate=0.1, max_depth=5, min_child_weight=2, score=0.3535366158460385, total= 5.2min
[CV] learning_rate=0.1, max_depth=5, min_child_weight=2 ..............


  if diff:


[CV]  learning_rate=0.1, max_depth=5, min_child_weight=2, score=0.35295588051493565, total= 5.1min
[CV] learning_rate=0.1, max_depth=5, min_child_weight=2 ..............


  if diff:


[CV]  learning_rate=0.1, max_depth=5, min_child_weight=2, score=0.35400775290734027, total= 5.2min
[CV] learning_rate=0.1, max_depth=5, min_child_weight=2 ..............


  if diff:
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed: 173.0min finished


[CV]  learning_rate=0.1, max_depth=5, min_child_weight=2, score=0.3493810178817056, total= 5.2min
Wall time: 2h 59min 33s


GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs=7, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [3, 5], 'min_child_weight': [1, 2], 'learning_rate': [0.05, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='accuracy', verbose=10)

In [33]:
# Best mean cross-validated accuracy and the parameter combination that achieved it.
grid_xgb.best_score_, grid_xgb.best_params_

(0.3525, {'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 1})

In [34]:
# Score the refitted best model on the held-out test split.
best_xgb = grid_xgb.best_estimator_
best_xgb.score(X_test_tfidf_ngram, y_test_en)

  if diff:


0.3495

---

In [96]:
# xgb_clf_dummy = xgboost.XGBClassifier(min_child_weight=2,
#                                       max_depth=5,
#                                       learning_rate=0.1,
#                                       n_estimators=300,
#                                       n_jobs=n_jobs_cnt)

In [None]:
# %%time
# xgb_clf_dummy.fit(X_train_tfidf_ngram,
#                   y_train_en)

In [None]:
# xgb_clf_dummy.score(X_test_tfidf_ngram,
#                     y_test_en)