In [4]:
#importing review text and label files to dataframe

import pandas as pd

df_test_text = pd.read_table('imdb_test_text.txt', delim_whitespace=False, names=('A'))
df_test_labels = pd.read_table('imdb_test_labels.txt', delim_whitespace=False, names=('B'))

In [5]:
#merging them on index
df_test=df_test_text.join(df_test_labels, how='outer')
# df_test

In [6]:
#changing column name
df_test=df_test.rename(columns={'A':'review_text',
                          'B':'review_label'})
df_test.index.name='S.No.'
# df_test

In [7]:
df_train_text = pd.read_table('imdb_train_text.txt', delim_whitespace=False, names=('A'))
df_train_labels = pd.read_table('imdb_train_labels.txt', delim_whitespace=False, names=('B'))

In [8]:
#merging them on index
df_train=df_train_text.join(df_train_labels, how='outer')

In [9]:
#changing column name
df_train=df_train.rename(columns={'A':'review_text',
                          'B':'review_label'})
df_train.index.name='S.No.'
# df_train

In [10]:
frames=[df_train, df_test] # concatenating the training and testing datasets
df_total=pd.concat(frames)
# df_total

In [11]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, string

In [12]:
trainDF=pd.DataFrame()
trainDF['text']=df_train['review_text']
trainDF['label']=df_train['review_label']
# trainDF

In [13]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'])

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [17]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [14]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 

In [15]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    #return predictions
    
return metrics.accuracy_score(predictions, valid_y)

In [25]:
# Extereme Gradient Boosting on Count Vectors
accuracy_cv = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
print ("Xgb, Count Vectors: ", accuracy_cv)

Xgb, Count Vectors:  [7 5 0 ... 0 1 5]


  if diff:


In [26]:
# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy_tf = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())
print ("Xgb, WordLevel TF-IDF: ", accuracy_tf)

# Extereme Gradient Boosting on Character Level TF IDF Vectors
accuracy_char = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram_chars.tocsc(), train_y, xvalid_tfidf_ngram_chars.tocsc())
print ("Xgb, CharLevel Vectors: ", accuracy_char)

  if diff:


Xgb, WordLevel TF-IDF:  [7 7 0 ... 0 1 5]
Xgb, CharLevel Vectors:  [7 6 0 ... 0 5 4]


  if diff:


In [27]:
d = {'valid_review': valid_x, 'valid_label': valid_y, 'predicted': accuracy_tf}
df_pred = pd.DataFrame(data=d)
df_pred['predicted'].unique()
df_pred['accuracy']=df_pred['valid_label']-df_pred['predicted']
df_pred['accuracy']=df_pred['accuracy'].abs()
df_pred['accuracy'].mean()

1.676

In [28]:
df_pred

Unnamed: 0_level_0,valid_review,valid_label,predicted,accuracy
S.No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8202,A heartwarming film. The usual superb acting b...,7,7,0
8937,"Yes, I call this a perfect movie. Not one bori...",7,7,0
23736,"This movie was crap with a capital ""C."" The op...",1,0,1
11255,"I had the privilege of watching ""Holly"" at the...",7,7,0
21726,"Sorry Fulci fans, but I could not get through ...",0,0,0
6370,So Mary and Rhoda have aged--who hasn't? I was...,4,7,3
1738,Apart from the usual stereotypes of the thirti...,7,7,0
4101,Those engaging the movie camera so early in th...,4,3,1
19345,I'm not a big fan of rom/coms at the best of t...,3,7,4
657,"This ""tragicomedy"" written by famous Serbian t...",7,7,0
