## Connection to Google Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset files stored there can be read like local files.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd drive/MyDrive/Colab Notebooks/Temp

## Importing necessary items

In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Loading Dataset
This is the IMDB movie review dataset, which contains 40,000 reviews for training, 5,000 reviews for validation, and 5,000 reviews for testing.

In [128]:
# Read the three pre-split CSV files from the current working directory.
train, valid, test = map(pd.read_csv, ("Train.csv", "Valid.csv", "Test.csv"))

# Report the number of rows in each split.
print('train size:', train.shape[0])
print('valid size:', valid.shape[0])
print('test size:', test.shape[0])

train size: 40000
valid size: 5000
test size: 5000


In [47]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


## Preprocessing dataset

In [48]:
# Separator-like characters; each occurrence is replaced by a single space.
# Raw string literals are used so that `\[`, `\]` and `\|` reach the regex engine
# as written instead of relying on Python passing unknown string escapes through
# (which triggers an "invalid escape sequence" warning on modern Python).
unwanted_symbols = re.compile(r'[/(){}\[\]\|@,;]')
# Everything except digits, lowercase ASCII letters, space, '#', '+' and '_'
# is deleted outright (no replacement character).
bad_symbols = re.compile(r'[^0-9a-z #+_]')
# NLTK's English stopword list, held as a set for constant-time membership tests.
sw = set(stopwords.words('english'))



def text_prepare(text):
    """Normalize a raw review string into a space-separated string of kept tokens.

    Steps, in order: lowercase the text, turn every character matched by
    ``unwanted_symbols`` into a space, delete every character matched by
    ``bad_symbols``, split on whitespace, and drop tokens found in ``sw``.

    Args:
        text: The review text as a ``str``.

    Returns:
        The cleaned text with remaining tokens joined by single spaces.
    """
    lowered = text.lower()
    separated = unwanted_symbols.sub(' ', lowered)
    cleaned = bad_symbols.sub('', separated)
    # NOTE(review): apostrophes are deleted by ``bad_symbols`` before this filter
    # runs, so stopwords spelled with an apostrophe presumably never match here
    # -- confirm whether that is intended.
    kept_tokens = filter(lambda token: token not in sw, cleaned.split())
    return ' '.join(kept_tokens)

In [49]:
# Clean every training review (surrounding whitespace trimmed first) and
# collect the matching labels as a NumPy array.
X_train = [text_prepare(raw_review.strip()) for raw_review in train.text]
y_train = np.array(train.label)

In [50]:
# Clean every validation review (surrounding whitespace trimmed first) and
# collect the matching labels as a NumPy array.
X_valid = [text_prepare(raw_review.strip()) for raw_review in valid.text]
y_valid = np.array(valid.label)

In [51]:
# Clean every test review (surrounding whitespace trimmed first) and
# collect the matching labels as a NumPy array.
X_test = [text_prepare(raw_review.strip()) for raw_review in test.text]
y_test = np.array(test.label)

## Feature Extraction (bag-of-words method)





In [59]:
from sklearn.feature_extraction.text import CountVectorizer

# Count unigrams and bigrams. The vocabulary is learned from the training
# reviews only; validation and test reviews are projected onto that vocabulary.
vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train_bag = vectorizer.fit_transform(X_train)
X_valid_bag, X_test_bag = map(vectorizer.transform, (X_valid, X_test))

In [60]:
# Term -> column-index mapping learned by the fitted CountVectorizer.
vocab_bag = vectorizer.vocabulary_

## Feature Extraction (Tf-Idf method)

In [112]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_features(X_train, X_valid, X_test):
    """Fit a TF-IDF vectorizer on the training texts and transform all three splits.

    The vectorizer uses unigrams and bigrams (``ngram_range=(1, 2)``) with
    ``min_df=3`` and ``max_df=0.9``. It is fitted on ``X_train`` only;
    ``X_valid`` and ``X_test`` are transformed with the fitted vocabulary.

    Args:
        X_train: Training documents (fitted and transformed).
        X_valid: Validation documents (transformed only).
        X_test: Test documents (transformed only).

    Returns:
        A 5-tuple: the transformed train, validation and test matrices, the
        fitted vectorizer's ``vocabulary_`` mapping, and the vectorizer itself.
    """
    tfidf_vectorizer = TfidfVectorizer(min_df=3, max_df=0.9, ngram_range=(1, 2))

    train_matrix = tfidf_vectorizer.fit_transform(X_train)
    valid_matrix, test_matrix = map(tfidf_vectorizer.transform, (X_valid, X_test))
    return (
        train_matrix,
        valid_matrix,
        test_matrix,
        tfidf_vectorizer.vocabulary_,
        tfidf_vectorizer,
    )

In [113]:
# Build the TF-IDF matrices for all three splits and keep the fitted vectorizer
# so new text can be transformed the same way later on.
(X_train_tfidf, X_valid_tfidf, X_test_tfidf,
 vocab_tfidf, tfidf_vectorizer) = tfidf_features(X_train, X_valid, X_test)
# Inverse lookup: column index -> term.
tfidf_reversed_vocab = dict(zip(vocab_tfidf.values(), vocab_tfidf.keys()))

# Training

## LogisticRegression Classifier

#### Hyperparameter tuning for Logistic Regression (bag-of-words features)

In [91]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# Search space: both penalties crossed with seven values of C (14 candidates).
llc_params = {'penalty': ['l1','l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

llc = LogisticRegression(solver='liblinear')
# Exhaustively evaluate every candidate with 5-fold cross-validation;
# verbose=5 prints a progress line for each individual fit.
grid = GridSearchCV(llc, llc_params, cv=5, verbose = 5)
# NOTE(review): the search is fitted on the validation split only (bag-of-words
# features), not on the training split -- confirm this is the intended protocol.
grid.fit(X_valid_bag, y_valid)
# Last expression of the cell: displays the best parameter combination found.
grid.best_params_

Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV] C=0.001, penalty=l1 .............................................
[CV] ................. C=0.001, penalty=l1, score=0.498, total=   0.1s
[CV] C=0.001, penalty=l1 .............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s


[CV] ................. C=0.001, penalty=l1, score=0.497, total=   0.1s
[CV] C=0.001, penalty=l1 .............................................
[CV] ................. C=0.001, penalty=l1, score=0.497, total=   0.1s
[CV] C=0.001, penalty=l1 .............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s remaining:    0.0s


[CV] ................. C=0.001, penalty=l1, score=0.497, total=   0.1s
[CV] C=0.001, penalty=l1 .............................................
[CV] ................. C=0.001, penalty=l1, score=0.497, total=   0.1s
[CV] C=0.001, penalty=l2 .............................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.5s remaining:    0.0s


[CV] ................. C=0.001, penalty=l2, score=0.784, total=   1.3s
[CV] C=0.001, penalty=l2 .............................................
[CV] ................. C=0.001, penalty=l2, score=0.820, total=   1.6s
[CV] C=0.001, penalty=l2 .............................................
[CV] ................. C=0.001, penalty=l2, score=0.805, total=   1.3s
[CV] C=0.001, penalty=l2 .............................................
[CV] ................. C=0.001, penalty=l2, score=0.816, total=   1.3s
[CV] C=0.001, penalty=l2 .............................................
[CV] ................. C=0.001, penalty=l2, score=0.789, total=   1.2s
[CV] C=0.01, penalty=l1 ..............................................
[CV] .................. C=0.01, penalty=l1, score=0.690, total=   3.7s
[CV] C=0.01, penalty=l1 ..............................................
[CV] .................. C=0.01, penalty=l1, score=0.683, total=   3.7s
[CV] C=0.01, penalty=l1 ..............................................
[CV] .

[Parallel(n_jobs=1)]: Done  70 out of  70 | elapsed:  5.9min finished


{'C': 1000, 'penalty': 'l1'}

In [94]:
# Train on the full training split (bag-of-words features) with the
# hyperparameters reported by the grid search, then predict the test split.
llc = LogisticRegression(solver='liblinear', penalty='l1', C=1000).fit(X_train_bag, y_train)
y_pred = llc.predict(X_test_bag)

#### Evaluation of Logistic Regression Classifier (bag-of-words features)

In [96]:
from sklearn.metrics import classification_report
# Per-class precision/recall/F1 and overall accuracy of the test-set predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      2495
           1       0.89      0.91      0.90      2505

    accuracy                           0.90      5000
   macro avg       0.90      0.90      0.90      5000
weighted avg       0.90      0.90      0.90      5000



#### Hyperparameter tuning for Logistic Regression (Tf-Idf features)

In [80]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# Search space: both penalties crossed with seven values of C (14 candidates).
llc_params = {'penalty': ['l1','l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

llc = LogisticRegression(solver='liblinear')
# Exhaustively evaluate every candidate with 5-fold cross-validation;
# verbose=5 prints a progress line for each individual fit.
grid = GridSearchCV(llc, llc_params, cv=5, verbose = 5)
# NOTE(review): the search is fitted on the validation split only (TF-IDF
# features), not on the training split -- confirm this is the intended protocol.
grid.fit(X_valid_tfidf, y_valid)
# Last expression of the cell: displays the best parameter combination found.
grid.best_params_

Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV] C=0.001, penalty=l1 .............................................
[CV] ................. C=0.001, penalty=l1, score=0.498, total=   0.0s
[CV] C=0.001, penalty=l1 .............................................
[CV] ................. C=0.001, penalty=l1, score=0.497, total=   0.0s
[CV] C=0.001, penalty=l1 .............................................
[CV] ................. C=0.001, penalty=l1, score=0.497, total=   0.0s
[CV] C=0.001, penalty=l1 .............................................
[CV] ................. C=0.001, penalty=l1, score=0.497, total=   0.0s
[CV] C=0.001, penalty=l1 .............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.2s remaining:    0.0s


[CV] ................. C=0.001, penalty=l1, score=0.497, total=   0.1s
[CV] C=0.001, penalty=l2 .............................................
[CV] ................. C=0.001, penalty=l2, score=0.503, total=   0.1s
[CV] C=0.001, penalty=l2 .............................................
[CV] ................. C=0.001, penalty=l2, score=0.510, total=   0.1s
[CV] C=0.001, penalty=l2 .............................................
[CV] ................. C=0.001, penalty=l2, score=0.508, total=   0.1s
[CV] C=0.001, penalty=l2 .............................................
[CV] ................. C=0.001, penalty=l2, score=0.506, total=   0.1s
[CV] C=0.001, penalty=l2 .............................................
[CV] ................. C=0.001, penalty=l2, score=0.506, total=   0.1s
[CV] C=0.01, penalty=l1 ..............................................
[CV] .................. C=0.01, penalty=l1, score=0.498, total=   0.1s
[CV] C=0.01, penalty=l1 ..............................................
[CV] .

[Parallel(n_jobs=1)]: Done  70 out of  70 | elapsed:   25.0s finished


{'C': 100, 'penalty': 'l2'}

In [110]:
# Train on the full training split (TF-IDF features) with the hyperparameters
# reported by the grid search, then predict the test split.
llc = LogisticRegression(solver="liblinear", penalty='l2', C=100).fit(X_train_tfidf, y_train)
y_pred = llc.predict(X_test_tfidf)

#### Evaluation of Logistic Regression Classifier (Tf-Idf features)

In [111]:
# Per-class precision/recall/F1 and overall accuracy of the test-set predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.90      0.91      2495
           1       0.90      0.91      0.91      2505

    accuracy                           0.91      5000
   macro avg       0.91      0.91      0.91      5000
weighted avg       0.91      0.91      0.91      5000



## Decision
From the evaluation above, we can see that on the IMDB dataset the logistic regression classifier performs slightly better when reviews are represented as Tf-Idf features (0.91 accuracy) than as bag-of-words features (0.90 accuracy).

## Predicting the sentiment of a custom review

In [126]:
def predict_sentiment_tfidf(review, clf):
    """Print the predicted sentiment of a single raw review.

    The review is cleaned with ``text_prepare``, vectorized with the
    notebook-level ``tfidf_vectorizer``, and passed to ``clf.predict``.

    Args:
        review: Raw review text.
        clf: Fitted classifier exposing ``predict``. A predicted label of 1 is
            reported as "Positive"; any other label as "Negative".

    Returns:
        None. The result is written to standard output.
    """
    cleaned = text_prepare(review)
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = tfidf_vectorizer.transform([cleaned])
    predicted_label = clf.predict(features)[0]
    print("Positive" if predicted_label == 1 else "Negative")

In [127]:
# Quick check with a hand-written review, using the most recently trained classifier in `llc`.
review = "wow! this movie is awesome!"
predict_sentiment_tfidf(review, llc)

Positive
