In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report 
from sklearn.naive_bayes import ComplementNB 

In [2]:
data = pd.read_csv('imdb_labelled.txt', sep='\t', names=[ 'reviews', 'score'])
data.head()

Unnamed: 0,reviews,score
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [3]:
def get_sentiments(data):
  if data['score'] == 0:
    return 'negative'
  elif data['score'] == 1:
    return 'positive'
  else:
    return 'indifferent'

data['sentiments'] = data.apply(get_sentiments, axis=1)
data.head()

Unnamed: 0,reviews,score,sentiments
0,"A very, very, very slow-moving, aimless movie ...",0,negative
1,Not sure who was more lost - the flat characte...,0,negative
2,Attempting artiness with black & white and cle...,0,negative
3,Very little music or anything to speak of.,0,negative
4,The best scene in the movie was when Gerardo i...,1,positive


In [4]:
data.isnull().sum()

reviews       0
score         0
sentiments    0
dtype: int64

In [5]:
data['length'] = data['reviews'].apply(lambda x: len(str(x).split(' ')))
data.head()

Unnamed: 0,reviews,score,sentiments,length
0,"A very, very, very slow-moving, aimless movie ...",0,negative,15
1,Not sure who was more lost - the flat characte...,0,negative,21
2,Attempting artiness with black & white and cle...,0,negative,33
3,Very little music or anything to speak of.,0,negative,10
4,The best scene in the movie was when Gerardo i...,1,positive,23


In [6]:
data['length'].unique()

array([  15,   21,   33,   10,   23,   22,    5,   17,   12,    8,   13,
          6,   18,   25,   19,  868,   14,   16,    9,    7,   26,   35,
         11,    4,   27,   28,    3,   20,   37,   24,   36,  201, 1393,
         49,   30,  306,   31,   32,   44,   56,   45,   34,   38,   64,
         58,   40,   47,   29,   73,   41,   39,   55,   53,   42,  800])

In [7]:
X = data['reviews']
y = data['score']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 42)

In [9]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/wizardcalidad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/wizardcalidad/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline 
from sklearn.feature_extraction.text import TfidfTransformer
lm = WordNetLemmatizer()

In [11]:
pipeNB = Pipeline([
      ('bow',CountVectorizer(stop_words='english')),
      ('tfidf',TfidfTransformer()),
      ('classifier', ComplementNB())
])

In [12]:
pipeNB.fit(X_train,y_train)

Pipeline(steps=[('bow', CountVectorizer(stop_words='english')),
                ('tfidf', TfidfTransformer()), ('classifier', ComplementNB())])

In [13]:
pred = pipeNB.predict(X_test) #predict testing data

from sklearn.metrics import classification_report
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.81      0.77      0.79        60
           1       0.75      0.79      0.77        53

    accuracy                           0.78       113
   macro avg       0.78      0.78      0.78       113
weighted avg       0.78      0.78      0.78       113



In [14]:
from statistics import mean, stdev 
from sklearn.model_selection import StratifiedKFold 


accuracy = []
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=100) 
skf.get_n_splits(X,y) 

for train_index, test_index in skf.split(X, y):
  X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index] 
  y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

  pipeNB.fit(X_train_fold, y_train_fold)
  result = pipeNB.score(X_test_fold, y_test_fold)
  accuracy.append(result)


# Print the output
print('List of possible accuracy:', accuracy) 
print('\nMaximum Accuracy That can be obtained from this model is:', 
  max(accuracy)*100, '%') 
print('\nMinimum Accuracy:', 
  min(accuracy)*100, '%') 
print('\nOverall Accuracy:', 
  mean(accuracy)*100, '%') 
print('\nStandard Deviation is:', stdev(accuracy)) 

List of possible accuracy: [0.7733333333333333, 0.8, 0.8, 0.76, 0.8266666666666667, 0.76, 0.76, 0.7733333333333333, 0.7837837837837838, 0.7702702702702703]

Maximum Accuracy That can be obtained from this model is: 82.66666666666667 %

Minimum Accuracy: 76.0 %

Overall Accuracy: 78.07387387387388 %

Standard Deviation is: 0.022035814226848343
