# Sentiment analysis uisng Several Techniques on Movie Reviews Dataset

In [4]:
# Load and prepare the dataset
import nltk
from nltk.corpus import movie_reviews
import random

documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

In [5]:
documents_words_list, documents_labels = zip(*documents)

In [6]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
import string

In [7]:
### Removing Stopwords, Removing Punctuations, Lemmatizing 
def preprocess_words_lists(words_lists):
    wordnet_lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english')) 
    processed_words_lists = []
    for words_list in words_lists:
        processed_words_lists.append([wordnet_lemmatizer.lemmatize(word, pos="v") for word in words_list if word 
                                      not in string.punctuation and word not in stop_words])
    return processed_words_lists

In [8]:
preprocessed_words_lists = preprocess_words_lists(documents_words_list)
## Making sentences from words
preprocessed_sentences = [" ".join(words_list) for words_list in preprocessed_words_lists]
## Converting categorical values to numerical values so we can try several ml algorithms
documents_labels = [1 if label == "pos" else -1 for label in documents_labels]
processed_docs = [(sentence, label) for sentence, label in zip(preprocessed_sentences, documents_labels)]

### Knowing about the dataset before running any algorithm is a good practice, let's look into the stats of the data

In [9]:
from collections import Counter
labels_count = Counter(documents_labels)
labels_count

Counter({1: 1000, -1: 1000})

In [10]:
all_words = [word for words_list in preprocessed_words_lists for word in words_list]
print("Total number of different words in this corpus is {}".format(len(set(all_words))))

Total number of different words in this corpus is 32165


In [11]:
most_frequent_words = sorted(dict(Counter(all_words)).items(), key = lambda x:x[1], reverse = True)

In [12]:
most_frequent_words

[('film', 11198),
 ('one', 5852),
 ('movie', 5771),
 ('make', 4327),
 ('like', 3972),
 ('character', 3879),
 ('get', 3759),
 ('see', 3137),
 ('go', 3056),
 ('time', 3036),
 ('even', 2611),
 ('good', 2411),
 ('play', 2360),
 ('take', 2211),
 ('story', 2169),
 ('would', 2109),
 ('much', 2049),
 ('know', 1972),
 ('come', 1970),
 ('also', 1967),
 ('well', 1964),
 ('give', 1925),
 ('two', 1911),
 ('look', 1896),
 ('end', 1853),
 ('seem', 1839),
 ('first', 1836),
 ('--', 1815),
 ('work', 1733),
 ('way', 1693),
 ('plot', 1619),
 ('find', 1602),
 ('life', 1586),
 ('say', 1583),
 ('think', 1573),
 ('really', 1558),
 ('little', 1501),
 ('show', 1490),
 ('people', 1471),
 ('could', 1427),
 ('man', 1409),
 ('star', 1408),
 ('scene', 1397),
 ('bad', 1395),
 ('love', 1392),
 ('never', 1374),
 ('try', 1351),
 ('best', 1336),
 ('new', 1292),
 ('scenes', 1274),
 ('many', 1268),
 ('become', 1263),
 ('action', 1260),
 ('director', 1237),
 ('want', 1236),
 ('watch', 1209),
 ('movies', 1206),
 ('use', 1161

### Apply different Machine Learning Algorithms

In [13]:
## Creating train and test datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(preprocessed_sentences, documents_labels, test_size = 0.05, stratify = documents_labels)

In [14]:
print("Number of Train and Test samples are : {} and {} respectively.".format(len(y_train), len(y_test)))

Number of Train and Test samples are : 1900 and 100 respectively.


In [15]:
## Convert the sentences into bag of words representation as ml algorithms would only work with numericals

from sklearn.feature_extraction.text import CountVectorizer

# Simply leaving the arguments as empty is fine.
vectorizer = CountVectorizer(min_df =0 , max_df = 0.9)

# Compute vocabulary of train data and transform it to bag of words.
X_train_bow = vectorizer.fit_transform(X_train)

# Check fitted vocabulary dictionary.
vocabulary = vectorizer.get_feature_names()
vocabulary_size = len(vocabulary)
print("The vocabulary size after converting to bag of words : {}".format(vocabulary_size))

The vocabulary size after converting to bag of words : 31340


In [16]:
X_test_bow = vectorizer.transform(X_test)

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_curve, auc

clf = LogisticRegression(solver='lbfgs', max_iter = 1000)

# Train
clf.fit(X_train_bow, y_train)
# Test
pred_test = clf.predict(X_test_bow)
print("The accuracy is {}".format(accuracy_score(y_test, pred_test)))

fpr, tpr, thresholds = roc_curve(y_test, pred_test, pos_label=1)
print("LogisticRegression AUC: {0}".format(auc(fpr, tpr)))

The accuracy is 0.88
LogisticRegression AUC: 0.8799999999999999


In [18]:
from sklearn.svm import SVC
clf = SVC(gamma='auto')

# Train
clf.fit(X_train_bow, y_train)
# Test
pred_test = clf.predict(X_test_bow)
print("The accuracy is {}".format(accuracy_score(y_test, pred_test)))

fpr, tpr, thresholds = roc_curve(y_test, pred_test, pos_label=1)
print("SVC AUC: {0}".format(auc(fpr, tpr)))

The accuracy is 0.64
SVC AUC: 0.64


In [19]:
from sklearn.naive_bayes import MultinomialNB
clf =  MultinomialNB()

# Train
clf.fit(X_train_bow, y_train)
# Test
pred_test = clf.predict(X_test_bow)
print("The accuracy is {}".format(accuracy_score(y_test, pred_test)))

fpr, tpr, thresholds = roc_curve(y_test, pred_test, pos_label=1)
print("MultinomialNB AUC: {0}".format(auc(fpr, tpr)))

The accuracy is 0.82
MultinomialNB AUC: 0.8200000000000001


In [20]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators = 200, max_depth=10)

# Train
clf.fit(X_train_bow, y_train)
# Test
pred_test = clf.predict(X_test_bow)
print("The accuracy is {}".format(accuracy_score(y_test, pred_test)))

fpr, tpr, thresholds = roc_curve(y_test, pred_test, pos_label=1)
print("RFC AUC: {0}".format(auc(fpr, tpr)))

The accuracy is 0.92
RFC AUC: 0.9199999999999998
