In [33]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime so the dataset under MyDrive is readable.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing Libraries

In [34]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfTransformer #To conduct Term Frequency Inverse Document Frequency Tranformation
from sklearn.feature_extraction.text import CountVectorizer #To create text document matrix
from sklearn.metrics import accuracy_score, confusion_matrix
import plotly.tools as pytools
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
# Fetch the NLTK resources this notebook relies on: the stop-word lists (read by
# stopwords.words below) and the 'punkt' tokenizer data (presumably what
# word_tokenize needs).
# NOTE(review): recent NLTK releases appear to look for 'punkt_tab' instead of
# 'punkt' — confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load Data

0 - Negative

1 - Positive

In [35]:
# Location of the labelled training data on the mounted Drive.
# NOTE(review): this is a hard-coded absolute path; it only resolves when Drive
# is mounted at /content/drive and the file lives in this exact folder.
TRAIN_CSV_PATH = '/content/drive/MyDrive/Sentiment Analysis/train.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

In [36]:
# (rows, columns) of the loaded training data.
df.shape

(8530, 2)

In [37]:
# Preview the first rows of the raw text and its label.
df.head()

Unnamed: 0,text,label
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


In [38]:
# Distinct values present in the label column.
df['label'].unique()

array([1, 0])

The label column contains only two values (0 and 1), so this is a binary classification problem.

# Cleaning the Data

## Stop Words and Stemming

In [39]:
# Shared resources for the cleaning function defined below.
# The stemmer is created once and reused for every token.
ps = PorterStemmer()
# English stop words, held in a set so membership checks are cheap.
stop_words = set(stopwords.words("english"))

def stop_and_stem(phrase):
    """Lower-case, tokenize, drop English stop words, stem, and strip punctuation.

    Processing order (it matters for the result):
      1. ``phrase`` is lower-cased and split into tokens with ``word_tokenize``.
      2. Tokens found in the module-level ``stop_words`` set are dropped.
      3. The remaining tokens are stemmed with the module-level ``ps`` stemmer.
      4. Every character that is neither a word character nor whitespace is
         removed from each stem; stems left empty by this step (tokens that
         were pure punctuation) are discarded so the output contains no stray
         or doubled spaces.

    Digits count as word characters for the regex, so they are kept
    (for example ``"21st"`` survives unchanged).

    Parameters
    ----------
    phrase : str
        Raw text to clean. A non-string value (e.g. a missing value) or a
        blank string is treated as empty text instead of raising.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces, or ``""`` if nothing is left.
    """
    # Guard first: a non-string has no .lower(), and blank text has no tokens.
    if not isinstance(phrase, str) or not phrase.strip():
        return ""
    # Stop words are filtered on the raw token, before stemming alters it.
    stems = [ps.stem(word) for word in word_tokenize(phrase.lower()) if word not in stop_words]
    cleaned = (re.sub(r'[^\w\s]', '', stem) for stem in stems)
    return " ".join(token for token in cleaned if token)

In [40]:
# Replace the raw text with its cleaned form (see stop_and_stem).
# NOTE(review): this overwrites the column it reads from, so re-running the cell
# cleans already-cleaned text; reload df before re-executing.
df['text'] = df['text'].apply(stop_and_stem)

In [41]:
# Preview the text column after cleaning.
df.head()

Unnamed: 0,text,label
0,rock destin 21st centuri s new conan s go ma...,1
1,gorgeous elabor continu lord ring trilog hug...,1
2,effect tootepid biopic,1
3,sometim like go movi fun wasabi good place st...,1
4,emerg someth rare issu movi s honest keenli o...,1


## Converting Text Column to Matrix

In [42]:
# Bag-of-words step: learn the vocabulary from the cleaned text and build the
# document-term count matrix (one row per document, one column per term).
count_vector = CountVectorizer()
x_df_counts = count_vector.fit_transform(df['text'])
# Displayed as (number of documents, vocabulary size).
x_df_counts.shape

(8530, 12911)

## Term Frequency Inverse Document Frequency (TF-IDF)

In [43]:
# Re-weight the raw counts with TF-IDF; the transformer is fitted on the same
# count matrix it transforms.
tfidf_transformer = TfidfTransformer()
x_df_tfidf = tfidf_transformer.fit_transform(x_df_counts)

# Train Test Split

In [44]:
# Random ~80/20 row split. The seed is fixed so the same mask is drawn each time
# this cell runs; every row independently lands in the training set with
# probability 0.8, so the proportions are approximate rather than exact.
# NOTE(review): x_df_tfidf was fitted on the full corpus before this split, so
# the vocabulary and IDF weights have already seen the held-out rows. Fit the
# vectorizer/transformer on the training rows only if an unbiased test score matters.
np.random.seed(10)
msk = np.random.rand(df.shape[0]) < 0.8
train, test = x_df_tfidf[msk], x_df_tfidf[~msk]

In [45]:
# Row/column counts of the two partitions produced by the random mask.
train.shape, test.shape

((6889, 12911), (1641, 12911))

# Building Model

## Multinomial Naive Bayes

In [69]:
from sklearn.naive_bayes import MultinomialNB

# Fit Multinomial Naive Bayes on the training rows; labels are selected with the
# same boolean mask that produced `train`, so rows and labels stay aligned.
clf_MNB = MultinomialNB()
clf_MNB.fit(train, df.loc[msk, 'label'])
# Predict both partitions so train and test accuracy can be compared.
train_pred_MNB, test_pred_MNB = (clf_MNB.predict(part) for part in (train, test))

In [70]:
# Naive Bayes accuracy: rows used for fitting vs. held-out rows.
accuracy_MNB_train = accuracy_score(y_true=df.loc[msk, 'label'], y_pred=train_pred_MNB)
accuracy_MNB_test = accuracy_score(y_true=df.loc[~msk, 'label'], y_pred=test_pred_MNB)

## Decision Tree

In [71]:
from sklearn import tree

# Fit a decision tree on the training rows and predict both partitions.
# random_state is pinned (same value the MLP cell uses) because scikit-learn
# randomizes the feature order when searching for splits. Without a fixed seed,
# ties between equally good splits are broken by the global NumPy RNG, so the
# scores would depend on which cells ran before this one.
clf_dtrees = tree.DecisionTreeClassifier(random_state=1).fit(train, df.loc[msk, 'label'])
train_pred_dtrees = clf_dtrees.predict(train)
test_pred_dtrees = clf_dtrees.predict(test)

In [72]:
# Decision tree accuracy: rows used for fitting vs. held-out rows.
accuracy_dtrees_train = accuracy_score(y_true=df.loc[msk, 'label'], y_pred=train_pred_dtrees)
accuracy_dtrees_test = accuracy_score(y_true=df.loc[~msk, 'label'], y_pred=test_pred_dtrees)

## K-nearest neighbors

In [73]:
from sklearn.neighbors import KNeighborsClassifier

# Fit k-nearest neighbours with the library's default settings on the training
# rows; labels come from the same mask that produced `train`.
clf_KNN = KNeighborsClassifier()
clf_KNN.fit(train, df.loc[msk, 'label'])
# Predict both partitions so train and test accuracy can be compared.
train_pred_KNN, test_pred_KNN = (clf_KNN.predict(part) for part in (train, test))

In [74]:
# K-nearest-neighbours accuracy: rows used for fitting vs. held-out rows.
accuracy_KNN_train = accuracy_score(y_true=df.loc[msk, 'label'], y_pred=train_pred_KNN)
accuracy_KNN_test = accuracy_score(y_true=df.loc[~msk, 'label'], y_pred=test_pred_KNN)

## Support Vector Machine

In [75]:
from sklearn import svm 

# Fit a linear support vector classifier on the training rows and predict both
# partitions.
# random_state is pinned (same value the MLP cell uses): per the scikit-learn
# docs the solver shuffles the data when the dual formulation is used, so an
# unseeded fit can vary with the global RNG state and cell execution order.
clf_svm_Linear = svm.LinearSVC(random_state=1).fit(train, df.loc[msk, 'label'])
train_pred_svm_linear = clf_svm_Linear.predict(train)
test_pred_svm_linear = clf_svm_Linear.predict(test)

In [76]:
# Linear SVM accuracy: rows used for fitting vs. held-out rows.
accuracy_svm_linear_train = accuracy_score(y_true=df.loc[msk, 'label'], y_pred=train_pred_svm_linear)
accuracy_svm_linear_test = accuracy_score(y_true=df.loc[~msk, 'label'], y_pred=test_pred_svm_linear)

## Neural Network

In [77]:
from sklearn.neural_network import MLPClassifier

# Single-hidden-layer perceptron (10 logistic units) trained with Adam.
# hidden_layer_sizes is written as a real one-element tuple: the previous
# `(10)` is just the integer 10 in parentheses, which hides the fact that the
# parameter describes a sequence of layer widths.
clf_mlp = MLPClassifier(
    solver='adam',
    activation='logistic',
    hidden_layer_sizes=(10,),
    max_iter=400,
    random_state=1,  # fixed seed so weight initialisation is repeatable
)
clf_mlp = clf_mlp.fit(train, df.loc[msk, 'label'])
# Predict both partitions so train and test accuracy can be compared.
train_pred_mlp = clf_mlp.predict(train)
test_pred_mlp = clf_mlp.predict(test)

In [78]:
# Neural network accuracy: rows used for fitting vs. held-out rows.
accuracy_mlp_train = accuracy_score(y_true=df.loc[msk, 'label'], y_pred=train_pred_mlp)
accuracy_mlp_test = accuracy_score(y_true=df.loc[~msk, 'label'], y_pred=test_pred_mlp)

# Evaluation

In [79]:
# Collect the per-model scores in one fixed order; `index` supplies the matching
# row labels, so all three lists must list the models in the same sequence.
index = [
    'Multinomial Naive Bayes',
    'Decision Tree',
    'K-nearest neighbors',
    'Support Vector Machine',
    'Neural Network',
]
train_accuracy = [
    accuracy_MNB_train,
    accuracy_dtrees_train,
    accuracy_KNN_train,
    accuracy_svm_linear_train,
    accuracy_mlp_train,
]
test_accuracy = [
    accuracy_MNB_test,
    accuracy_dtrees_test,
    accuracy_KNN_test,
    accuracy_svm_linear_test,
    accuracy_mlp_test,
]

In [80]:
# Side-by-side comparison table: one row per model, one column per partition.
a = {
    'Train Accuracy': train_accuracy,
    'Test Accuracy': test_accuracy,
}
df_a = pd.DataFrame(a, index=index)
df_a

Unnamed: 0,Train Accuracy,Test Accuracy
Multinomial Naive Bayes,0.924227,0.764168
Decision Tree,1.0,0.636807
K-nearest neighbors,0.814487,0.695917
Support Vector Machine,0.982726,0.734918
Neural Network,0.999855,0.716636
