In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Importing Libraries

In [13]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfTransformer #To conduct Term Frequency Inverse Document Frequency Tranformation
from sklearn.feature_extraction.text import CountVectorizer #To create text document matrix
from sklearn.metrics import accuracy_score, confusion_matrix
import plotly.tools as pytools
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Load Data

0 - Negative

1 - Positive

In [5]:
df=pd.read_csv('/content/drive/MyDrive/Sentiment Analysis/train.csv')

In [6]:
df.shape

(8530, 2)

In [7]:
df.head()

Unnamed: 0,text,label
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


In [8]:
df['label'].unique()

array([1, 0])

-> Binary Classification

# Cleaning the Data

## Stop Words and Stemming

In [15]:
stop_words=set(stopwords.words("english"))
ps=PorterStemmer()

#removes stop words/ digits/ puntuations and stemms the words
def stop_and_stem(phrase):
    global stop_words, ps
    phrase=[ps.stem(word) for word in word_tokenize(phrase.lower()) if not word in stop_words]
    phrase=[re.sub(r'[^\w\s]','',word) for word in phrase]
    return " ".join(phrase)

In [16]:
df['text'] = df['text'].apply(stop_and_stem)

In [17]:
df.head()

Unnamed: 0,text,label
0,rock destin 21st centuri new conan go make spl...,1
1,gorgeou elabor continu lord ring trilog huge c...,1
2,effect tootepid biopic,1
3,sometim like go movi fun wasabi good place start,1
4,emerg someth rare issu movi honest keenli obse...,1


## Converting Text Column to Matrix

In [18]:
count_vector=CountVectorizer()
x_df_counts=count_vector.fit_transform(df['text'])
x_df_counts.shape

(8530, 12757)

## Term Frequency Inverse Document Frequency (TF-IDF)

In [19]:
tfidf_transformer=TfidfTransformer()
x_df_tfidf=tfidf_transformer.fit_transform(x_df_counts)

# Train Test Split

In [20]:
np.random.seed(10)
msk = np.random.rand(len(df)) < 0.8
train = x_df_tfidf[msk]
test = x_df_tfidf[~msk]

In [21]:
train.shape, test.shape

((6889, 12757), (1641, 12757))

# Building Model

## Multinomial Naive Bayes

In [22]:
from sklearn.naive_bayes import MultinomialNB

clf_MNB=MultinomialNB().fit(train, df[msk]['label'])
train_pred_MNB=clf_MNB.predict(train)
test_pred_MNB=clf_MNB.predict(test)

In [23]:
#Model Performance Metrics
accuracy_MNB_train = accuracy_score(df[msk]['label'], train_pred_MNB)
accuracy_MNB_test = accuracy_score( df[~msk]['label'], test_pred_MNB)
print ("Train Accuracy: ",accuracy_MNB_train,"\nTest Accuracy:", accuracy_MNB_test)
confusion_MNB_train=confusion_matrix(df[msk]['label'], train_pred_MNB)
confusion_MNB_test=confusion_matrix(df[~msk]['label'], test_pred_MNB)

Train Accuracy:  0.923501233851067 
Test Accuracy: 0.7666057282145033


## Decision Tree

In [28]:
from sklearn import tree

clf_dtrees=tree.DecisionTreeClassifier().fit(train, df[msk]['label'])
train_pred_dtrees=clf_dtrees.predict(train)
test_pred_dtrees=clf_dtrees.predict(test)

In [29]:
#Model Performance Metrics
accuracy_dtrees_train= accuracy_score( df[msk]['label'], train_pred_dtrees)
accuracy_dtrees_test= accuracy_score( df[~msk]['label'], test_pred_dtrees)
print ("Train Accuracy: ",accuracy_dtrees_train, "\nTest Accuracy:",accuracy_dtrees_test)

confusion_dtrees_train=confusion_matrix(df[msk]['label'], train_pred_dtrees)
confusion_dtrees_test=confusion_matrix(df[~msk]['label'], test_pred_dtrees)

Train Accuracy:  1.0 
Test Accuracy: 0.6343692870201096


## Support Vector Machine

In [24]:
from sklearn import svm 

clf_svm_Linear = svm.LinearSVC().fit(train, df[msk]['label'])
train_pred_svm_linear=clf_svm_Linear.predict(train)
test_pred_svm_linear=clf_svm_Linear.predict(test)

In [25]:
#Model Performance Metrics
accuracy_svm_linear_train= accuracy_score(df[msk]['label'], train_pred_svm_linear)
accuracy_svm_linear_test= accuracy_score( df[~msk]['label'], test_pred_svm_linear)
print ("Train Accuracy: ",accuracy_svm_linear_train,"\nTest Accuracy:", accuracy_svm_linear_test)

confusion_svm_linear_train=confusion_matrix(df[msk]['label'], train_pred_svm_linear)
confusion_svm_linear_test=confusion_matrix(df[~msk]['label'], test_pred_svm_linear)

Train Accuracy:  0.982580926114095 
Test Accuracy: 0.7361365021328459


## Neural Network

In [26]:
from sklearn.neural_network import MLPClassifier

clf_mlp = MLPClassifier(solver='adam',activation='logistic', hidden_layer_sizes=(10),max_iter=400,random_state=1)
clf_mlp=clf_mlp.fit(train, df[msk]['label'])
train_pred_mlp=clf_mlp.predict(train)
test_pred_mlp=clf_mlp.predict(test)

In [27]:
#Model Performance Metrics
accuracy_mlp_train= accuracy_score(df[msk]['label'], train_pred_mlp)
accuracy_mlp_test= accuracy_score( df[~msk]['label'], test_pred_mlp)
print ("Train Accuracy: ",accuracy_mlp_train, "\nTest Accuracy:",accuracy_mlp_test)

confusion_mlp_train=confusion_matrix(df[msk]['label'], train_pred_mlp)
confusion_mlp_test=confusion_matrix(df[~msk]['label'], test_pred_mlp)

Train Accuracy:  0.9998548410509508 
Test Accuracy: 0.716636197440585
