In [4]:
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas, xgboost, numpy, string
import pandas as pd
import numpy as np

7

In [6]:
# Display the installed xgboost version so the run can be reproduced later
xgboost.__version__

'0.81'

In [7]:
# Load the test-split review texts and their labels into two single-column
# frames ('A' for the text file, 'B' for the label file).
#
# `names` has to be a real list-like. The previous ('A') was just the string
# 'A' (parentheses without a trailing comma do not build a tuple), which
# current pandas rejects with "Names should be an ordered collection."
# `delim_whitespace=False` is no longer passed: it was the default value and
# the keyword itself is deprecated, so parsing behaviour is unchanged
# (read_table keeps splitting on its default tab separator).
# pandas is already imported as `pd` in the notebook's import cell.
df_test_text = pd.read_table('imdb_test_text.txt', names=['A'])
df_test_labels = pd.read_table('imdb_test_labels.txt', names=['B'])

In [8]:
# Pair every review with its label by outer-joining the two frames on
# their shared index, then show the resulting dimensions.
df_test = df_test_text.join(other=df_test_labels, how='outer')
df_test.shape

(25000, 2)

In [9]:
# Swap the placeholder column names for descriptive ones and give the
# index a name.
df_test = (
    df_test
    .rename(columns={'A': 'review_text', 'B': 'review_label'})
    .rename_axis('S.No.')
)

In [10]:
# Load the train-split review texts and their labels into two single-column
# frames ('A' for the text file, 'B' for the label file).
#
# `names` has to be a real list-like. The previous ('A') was just the string
# 'A' (parentheses without a trailing comma do not build a tuple), which
# current pandas rejects with "Names should be an ordered collection."
# `delim_whitespace=False` is no longer passed: it was the default value and
# the keyword itself is deprecated, so parsing behaviour is unchanged.
df_train_text = pd.read_table('imdb_train_text.txt', names=['A'])
df_train_labels = pd.read_table('imdb_train_labels.txt', names=['B'])

In [11]:
# Pair every training review with its label by outer-joining the two
# frames on their shared index.
df_train = df_train_text.join(other=df_train_labels, how='outer')

In [12]:
# Swap the placeholder column names for descriptive ones and give the
# index a name.
df_train = (
    df_train
    .rename(columns={'A': 'review_text', 'B': 'review_label'})
    .rename_axis('S.No.')
)

In [13]:
# Sanity check: (rows, columns) of the assembled training frame
df_train.shape

(25000, 2)

In [14]:
# Stack the train and test rows into one frame. reset_index() replaces the
# repeated per-file index with a fresh one and moves the old index into a
# regular column.
df_all = pd.concat([df_train, df_test], axis=0).reset_index()
df_all.shape

(50000, 3)

In [15]:
# Remove the 'S.No.' column from the combined frame
df_all = df_all.drop(columns=['S.No.'])

In [16]:
# Re-split the combined data 80/20 with a fixed seed so the split is
# reproducible, then show the shape of each part.
review_texts = df_all['review_text']
review_labels = df_all['review_label']
X_train, X_test, y_train, y_test = train_test_split(
    review_texts, review_labels, test_size=0.2, random_state=42
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((40000,), (10000,), (40000,), (10000,))

In [17]:
# Distinct label values present in each split
tuple(labels.unique() for labels in (y_train, y_test))

(array([ 3,  8,  2,  1, 10,  4,  7,  9], dtype=int64),
 array([ 9, 10,  8,  3,  1,  4,  7,  2], dtype=int64))

In [18]:
# Label-encode the target variable.
# The encoder is fitted on the training labels only; the test labels are
# mapped with that same fitted encoder.
encoder = LabelEncoder()
y_train_en, y_test_en = encoder.fit_transform(y_train), encoder.transform(y_test)

tuple(encoded.shape for encoded in (y_train_en, y_test_en))

((40000,), (10000,))

In [19]:
# # create a count vectorizer object 
# count_vect = CountVectorizer(analyzer='word', 
#                              token_pattern=r'\w{1,}')
# count_vect.fit(X_train)

# # transform the training and test data using count vectorizer object
# X_train_count =  count_vect.transform(X_train)
# X_test_count =  count_vect.transform(X_test)

# print(X_train_count.shape, X_test_count.shape)

In [20]:
%%time

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', 
                             token_pattern=r'\w{1,}', 
                             max_features=1000,
                             min_df=0.01, 
                             max_df=0.95)

X_train_tfidf = tfidf_vect.fit_transform(X_train)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', 
                                   token_pattern=r'\w{1,}', 
                                   ngram_range=(2,3), 
                                   max_features=1000,
                                   min_df=0.01, 
                                   max_df=0.95)
X_train_tfidf_ngram = tfidf_vect_ngram.fit_transform(X_train)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', 
                                         token_pattern=r'\w{1,}', 
                                         ngram_range=(2,3), 
                                         max_features=1000,
                                         min_df=0.01, 
                                         max_df=0.95)
X_train_tfidf_ngram_chars = tfidf_vect_ngram_chars.fit_transform(X_train)


Wall time: 2min 22s


In [21]:
%%time
X_test_tfidf = tfidf_vect.transform(X_test)
X_test_tfidf_ngram = tfidf_vect_ngram.transform(X_test)
X_test_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(X_test)

Wall time: 21.4 s


In [22]:
# # Extreme Gradient Boosting on Count Vectors
# accuracy_cv = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
# print ("Xgb, Count Vectors: ", accuracy_cv)

In [23]:
# Inspect the character-level tf-idf matrix (its repr shows shape and sparsity)
X_train_tfidf_ngram_chars

<40000x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 20752370 stored elements in Compressed Sparse Row format>

In [24]:
# Wrap the encoded label arrays in pandas Series
y_train_en, y_test_en = map(pd.Series, (y_train_en, y_test_en))

In [2]:
# y_train_en.value_counts()

In [1]:
# y_test_en.value_counts()

In [3]:
import multiprocessing

# Number of parallel workers: leave one core free for the rest of the system.
# Clamp to at least 1, because on a single-core machine `cpu_count() - 1` is 0,
# and joblib-based consumers of n_jobs reject a worker count of 0.
n_jobs_cnt = max(1, multiprocessing.cpu_count() - 1)
n_jobs_cnt

7

In [27]:
# Base classifier: 300 boosting rounds, trained with n_jobs_cnt parallel
# threads. The remaining hyper-parameters are left at their defaults.
xgb_clf = xgboost.XGBClassifier(n_estimators=300, n_jobs=n_jobs_cnt)

In [28]:
# Candidate hyper-parameter values: 2 x 2 x 2 = 8 combinations
param_grid_xgb = dict(
    max_depth=[3, 5],
    min_child_weight=[1, 2],
    learning_rate=[0.05, 0.1],
)

In [30]:
# Inspect the word n-gram tf-idf matrix (its repr shows shape and sparsity)
X_train_tfidf_ngram

<40000x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 1999041 stored elements in Compressed Sparse Row format>

---

In [5]:
# Stand-alone classifier with one fixed hyper-parameter combination
xgb_dummy_params = dict(
    min_child_weight=2,
    max_depth=5,
    learning_rate=0.1,
    n_estimators=300,
    n_jobs=n_jobs_cnt,
)
xgb_clf_dummy = xgboost.XGBClassifier(**xgb_dummy_params)

In [None]:
%%time
xgb_clf_dummy.fit(X_train_tfidf_ngram,
                  y_train_en)

In [None]:
# Evaluate the fitted model on the held-out word n-gram features
xgb_clf_dummy.score(X_test_tfidf_ngram, y_test_en)