# IMPORTING USEFUL PYTHON LIBRARIES

In [6]:
#Importing useful libraries in Python
import pandas, xgboost, numpy, string
import numpy as np
import pandas as pd
from sklearn import metrics, naive_bayes
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder

In [7]:
import multiprocessing
# Number of worker processes: all cores but one, so the machine stays responsive.
# Clamped to at least 1 because cpu_count() - 1 is 0 on a single-core host,
# and 0 is not a meaningful worker count (presumably this feeds an n_jobs
# argument later in the notebook -- confirm against the cells that use it).
n_jobs_cnt = max(1, multiprocessing.cpu_count() - 1)
n_jobs_cnt

3

# IMPORTING THE DATA AND DATA PREPROCESSING

In [8]:
#importing test review text and label files to dataframe
import pandas as pd
# Each file is read into a single-column frame ('A' = review text, 'B' = label).
# `names` must be a sequence: ('A') is just the string 'A', which newer pandas
# may reject, so a one-element list is passed instead.
# The explicit delim_whitespace=False was dropped: it is the default and the
# keyword itself is deprecated in recent pandas releases.
df_test_text = pd.read_table('imdb_test_text.txt', names=['A'])
df_test_labels = pd.read_table('imdb_test_labels.txt', names=['B'])

In [9]:
#merging them on index
# Texts and labels are paired purely by row position. If the two files ever
# disagree in length, the outer join would silently pad the shorter side with
# NaN, so fail loudly here instead.
assert len(df_test_text) == len(df_test_labels), "test texts and labels differ in row count"
df_test=df_test_text.join(df_test_labels, how='outer')
df_test.shape

(25000, 2)

In [17]:
#importing train review text and label files to dataframe
# Each file is read into a single-column frame ('A' = review text, 'B' = label).
# `names` must be a sequence: ('A') is just the string 'A', which newer pandas
# may reject, so a one-element list is passed instead.
# The explicit delim_whitespace=False was dropped: it is the default and the
# keyword itself is deprecated in recent pandas releases.
df_train_text = pd.read_table('imdb_train_text.txt', names=['A'])
df_train_labels = pd.read_table('imdb_train_labels.txt', names=['B'])

In [18]:
#merging them on index
# Texts and labels are paired purely by row position. If the two files ever
# disagree in length, the outer join would silently pad the shorter side with
# NaN, so fail loudly here instead.
assert len(df_train_text) == len(df_train_labels), "train texts and labels differ in row count"
df_train=df_train_text.join(df_train_labels, how='outer')

In [118]:
#renaming the placeholder columns of the train frame to review_text / review_label
df_train = df_train.rename({'A': 'review_text', 'B': 'review_label'}, axis='columns')
#labelling the index so it shows up as 'S.No.' when displayed
df_train.index.name = 'S.No.'
df_train.head()

Unnamed: 0_level_0,review_text,review_label
S.No.,Unnamed: 1_level_1,Unnamed: 2_level_1
0,I loved this movie since I was 7 and I saw it ...,10
1,"First things first, Edison Chen did a fantasti...",8
2,"Once again, I was browsing through the discoun...",7
3,"This is a gem, a real piece of Americana for a...",8
4,While I had wanted to se this film since the f...,8


In [119]:
#renaming the placeholder columns of the test frame to review_text / review_label
df_test = df_test.rename({'A': 'review_text', 'B': 'review_label'}, axis='columns')
#labelling the index so it shows up as 'S.No.' when displayed
df_test.index.name = 'S.No.'
df_test.head(10)

Unnamed: 0_level_0,review_text,review_label
S.No.,Unnamed: 1_level_1,Unnamed: 2_level_1
0,not really sure what to make of this movie. ve...,7
1,"If you enjoyed films like Pulp Fiction, Reserv...",10
2,"Okay, here's the deal. There's this American p...",10
3,The BBC surpassed themselves with the boundari...,10
4,"Victor Mature, as a barely civilized and mostl...",8
5,I remember this film as the other person that ...,10
6,I first saw Heimat 2 on BBC2 in the 90's when ...,10
7,A rich experience is to be gained from watchin...,9
8,The second (not animated) movie about the only...,9
9,I first saw this in the theater in 1969 when I...,10


In [120]:
#sanity check: (rows, columns) of the train frame
df_train.shape

(25000, 2)

In [121]:
#stacking the train rows on top of the test rows, then moving the existing
#index into a regular column so the combined frame gets a fresh 0..n-1 index
df_all = pd.concat([df_train, df_test], axis=0).reset_index()
df_all.shape

(50000, 3)

In [122]:
# Dropping the serial-number column that reset_index turned the old index into.
# The frame is reassigned rather than mutated in place, and a missing column is
# ignored, so re-running this cell does not raise a KeyError.
df_all = df_all.drop(columns=['S.No.'], errors='ignore')
df_all.head()


Unnamed: 0,review_text,review_label
0,I loved this movie since I was 7 and I saw it ...,10
1,"First things first, Edison Chen did a fantasti...",8
2,"Once again, I was browsing through the discoun...",7
3,"This is a gem, a real piece of Americana for a...",8
4,While I had wanted to se this film since the f...,8


# PREPARING THE DATASETS FOR MODEL FITTING

In [27]:
#Holding out 20% of the combined reviews for evaluation;
#the fixed random_state makes the shuffle reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df_all['review_text'],
    df_all['review_label'],
    test_size=0.2,
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((40000,), (10000,), (40000,), (10000,))

In [123]:
#listing the distinct label values seen in the train and test targets
tuple(labels.unique() for labels in (y_train, y_test))

(array([ 3,  8,  2,  1, 10,  4,  7,  9]),
 array([ 9, 10,  8,  3,  1,  4,  7,  2]))

In [29]:
# encode the target labels as integer class ids;
# the encoder learns its classes from the training labels only
encoder = LabelEncoder().fit(y_train)

y_train_en = encoder.transform(y_train)
y_test_en = encoder.transform(y_test)

tuple(encoded.shape for encoded in (y_train_en, y_test_en))

((40000,), (10000,))

## IMPLEMENTING TF-IDF VECTORIZER AND COUNT VECTORIZER 

### The vectorizer parameters below were tuned over multiple iterations to maximize model accuracy

### TF-IDF ON WORD LEVEL, NGRAM LEVEL, CHARS LEVEL

In [68]:
# word level tf-idf
%%time

tfidf_vect = TfidfVectorizer(analyzer='word', 
                             token_pattern=r'\w{1,}', 
                             max_features=2000,
                             min_df=0.01, 
                             max_df=0.95)

X_train_tfidf = tfidf_vect.fit_transform(X_train)

In [77]:
%%time
# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', 
                                   token_pattern=r'\w{1,}', 
                                   ngram_range=(2,3), 
                                   max_features=2400,
                                   min_df=0.01, 
                                   max_df=0.95)
X_train_tfidf_ngram = tfidf_vect_ngram.fit_transform(X_train)

In [111]:
%%time
# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', 
                                         token_pattern=r'\w{1,}', 
                                         ngram_range=(2,5), 
                                         max_features=10000,
                                         min_df=0.01, 
                                         max_df=0.95)
X_train_tfidf_ngram_chars = tfidf_vect_ngram_chars.fit_transform(X_train)

In [69]:
#Tranfroming the test data with TF-IDF Vectorizer
%%time
X_test_tfidf = tfidf_vect.transform(X_test)
X_test_tfidf_ngram = tfidf_vect_ngram.transform(X_test)
X_test_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(X_test)

In [34]:
#Wrapping the encoded train/test labels in pandas Series for model fitting
y_train_en, y_test_en = pd.Series(y_train_en), pd.Series(y_test_en)

# IMPLEMENTING COUNT VECTORIZER

In [142]:
# Bag-of-words counts; token_pattern r'\w{1,}' also keeps one-character tokens,
# and max_features caps the vocabulary at 1000 terms.
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=1000, min_df=0.02, max_df=0.95)

# fit_transform learns the vocabulary from the training split and vectorizes it
# in one pass. No separate fit() call is needed beforehand: any earlier fit
# (e.g. on the full dataset) would be overwritten here and only cost time.
X_train_count =  count_vect.fit_transform(X_train)
# the held-out split is encoded with the vocabulary learned from the training split
X_test_count =  count_vect.transform(X_test)

In [40]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False, label_valid=None):
    """Fit ``classifier`` on the training data and return its validation accuracy.

    Parameters
    ----------
    classifier
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    feature_vector_train
        Training features passed to ``classifier.fit``.
    label
        Training labels passed to ``classifier.fit``.
    feature_vector_valid
        Validation features passed to ``classifier.predict``.
    is_neural_net : bool, default False
        When True, the output of ``predict`` is reduced with ``argmax`` over
        its last axis before scoring.
    label_valid : optional
        True labels matching ``feature_vector_valid``. When omitted, the
        notebook-level ``y_test_en`` is used, which is what this function
        always scored against before the parameter existed.

    Returns
    -------
    The value returned by ``metrics.accuracy_score`` for the predictions.
    """
    # Resolve the reference labels up front so a missing global fails before
    # the (potentially slow) fit rather than after it.
    if label_valid is None:
        label_valid = y_test_en

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    # conventional (y_true, y_pred) argument order
    return metrics.accuracy_score(label_valid, predictions)

# FITTING THE MODELS

## Naive Bayes


### TF-IDF VECTORIZER


In [124]:
#Multinomial Naive Bayes scored on the word-level TF-IDF features
accuracy = train_model(
    classifier=naive_bayes.MultinomialNB(),
    feature_vector_train=X_train_tfidf,
    label=y_train_en,
    feature_vector_valid=X_test_tfidf,
)
print ("NB, WordLevel TF-IDF: ", accuracy)

NB, WordLevel TF-IDF:  0.395


In [125]:

#Multinomial Naive Bayes scored on the word n-gram TF-IDF features
accuracy = train_model(
    classifier=naive_bayes.MultinomialNB(),
    feature_vector_train=X_train_tfidf_ngram,
    label=y_train_en,
    feature_vector_valid=X_test_tfidf_ngram,
)
print ("NB, N-Gram Vectors: ", accuracy)

NB, N-Gram Vectors:  0.3769


In [126]:
# Multinomial Naive Bayes scored on the character-level TF-IDF features
accuracy = train_model(
    classifier=naive_bayes.MultinomialNB(),
    feature_vector_train=X_train_tfidf_ngram_chars,
    label=y_train_en,
    feature_vector_valid=X_test_tfidf_ngram_chars,
)
print ("NB, CharLevel Vectors: ", accuracy)

NB, CharLevel Vectors:  0.3895


### COUNT VECTORIZER

In [143]:
# Naive Bayes on Count Vectors
# The printed tag now says "NB": the model fitted here is Multinomial Naive
# Bayes, not a random forest, so the former "RF" tag mislabelled the result.
accuracy = train_model(naive_bayes.MultinomialNB(), X_train_count, y_train_en, X_test_count)
print("NB, Count Vectors: ",accuracy)

RF, Count Vectors:  0.389
