## Importing Libraries

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from collections import Counter
import re

#Text processing
import nltk as nl
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer 

#Feature extraction and data preparation for model
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split

#Model
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neural_network import MLPClassifier

#Evaluation
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

## Downloading Data

In [2]:
#Download resources from nltk
nl.download('wordnet')
nl.download('omw-1.4')
nl.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vivek\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\vivek\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vivek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
stop_words = stopwords.words('english')
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

## Reading and merging data

In [4]:
path = 'sentiment labelled sentences'
files = ['amazon_cells_labelled.txt', 'imdb_labelled.txt', 'yelp_labelled.txt']
dfs = []
for filename in files:
    filepath = os.path.join(path, filename)
    df = pd.read_csv(filepath, sep='\t', header=None, names=['text', 'label'])
    dfs.append(df)

# Merge the datasets
df = pd.concat(dfs)

# Save the merged dataset to a CSV file
df.to_csv('merged_dataset.csv', index=False)

In [5]:
df=pd.read_csv('merged_dataset.csv')
df.head()

Unnamed: 0,text,label
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [6]:
df.shape

(2748, 2)

## Applying Stemming and Lemmatization

In [7]:
def clean_text(text):
    new_text=text.lower()
    clean_text= re.sub("[^a-z]+"," ",new_text)
    clean_text_stop_removed = ""
    for w in clean_text.split(" ")[1:]:
        if not w in stop_words and len(w) > 3:
            clean_text_stop_removed += w
            clean_text_stop_removed += " "
            clean_text_stop_removed=lemmatizer.lemmatize(clean_text_stop_removed)
            clean_text_stop_removed=stemmer.stem(clean_text_stop_removed)
    return clean_text_stop_removed

In [8]:
print(df['text'][4])

The mic is great.


In [9]:
print(clean_text(df['text'][4]))

great 


In [10]:
df["text_clean"] = df.text.apply(clean_text)

In [11]:
df.head()

Unnamed: 0,text,label,text_clean
0,So there is no way for me to plug it in here i...,0,plug unless converter
1,"Good case, Excellent value.",1,case excellent value
2,Great for the jawbone.,1,jawbone
3,Tied to charger for conversations lasting more...,0,charger conversations lasting minutes major pr...
4,The mic is great.,1,great


In [12]:
df_clean = df.drop(df.columns[0:1],axis=1)

In [13]:
df_clean['len'] = df_clean['text_clean'].str.len()

In [14]:
df_clean.head()

Unnamed: 0,label,text_clean,len
0,0,plug unless converter,22
1,1,case excellent value,21
2,1,jawbone,8
3,0,charger conversations lasting minutes major pr...,53
4,1,great,6


In [15]:
x=df_clean['text_clean']
y=df_clean['label']

In [16]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [17]:
print(x_train.shape,x_test.shape)

(2198,) (550,)


In [18]:
words = []
for sentence in x_train:
    words.extend(sentence.split())
vocabulary = sorted(set(words))

## TfidfVectorizer

In [19]:
N = len(x_train)
tf = np.zeros((N, len(vocabulary)))
df2 = np.zeros(len(vocabulary))
for i, sentence in enumerate(x_train):
    words = sentence.split()
    for word in words:
        if word in vocabulary:
            j = vocabulary.index(word)
            tf[i, j] += 1
    for word in set(words):
        if word in vocabulary:
            j = vocabulary.index(word)
            df2[j] += 1

In [20]:
log_tf = np.log(1 + tf)
idf = np.log(N / df2)
feature_values_train = log_tf * idf

In [21]:
N = len(x_test)
tf = np.zeros((N, len(vocabulary)))
df2 = np.zeros(len(vocabulary))
for i, sentence in enumerate(x_test):
    words = sentence.split()
    for word in words:
        if word in vocabulary:
            j = vocabulary.index(word)
            tf[i, j] += 1
          

In [22]:
log_tf = np.log(1 + tf)
feature_values_test = log_tf * idf

In [23]:
count1 = Counter(" ".join(x_train).split()).most_common()
df1 = pd.DataFrame.from_dict(count1)
df1 = df1.rename(columns={0: "words", 1 : "count"})
df1

Unnamed: 0,words,count
0,good,180
1,movie,154
2,great,136
3,film,136
4,phone,135
...,...,...
4123,sympathetic,1
4124,pricey,1
4125,kabuki,1
4126,dual,1


In [24]:
cv_df = pd.DataFrame(feature_values_train,columns = df1.words)
cv_df.head()

words,good,movie,great,film,phone,like,time,food,place,really,...,surprising,decided,verge,wayyy,horrified,sympathetic,pricey,kabuki,dual,purpose
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Multinomial Naive Bayes

In [25]:
clf=MultinomialNB()

In [26]:
clf.fit(feature_values_train,y_train)

MultinomialNB()

In [27]:
y_pred=clf.predict(feature_values_test)


In [28]:
accuracy=accuracy_score(y_test,y_pred)
print('Accuracy:',accuracy)

Accuracy: 0.7181818181818181


## Bernoulli Naive Bayes

In [31]:
clf2 = BernoulliNB()

In [32]:
clf2.fit(feature_values_train,y_train)

BernoulliNB()

In [34]:
y_pred2=clf2.predict(feature_values_test)

In [35]:
accuracy=accuracy_score(y_test,y_pred2)
print('Accuracy:',accuracy)

Accuracy: 0.7527272727272727
