# Importing the libraries

In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Importing the dataset

In [44]:
data = pd.read_csv('Movies_Feedback_NLP.csv')

# The dataset is too large to train, So I just take 10000 rows

In [45]:
negative = data[data['label'] == 0].sample(n = 5000, random_state = 42)
positive = data[data['label'] == 1].sample(n = 5000, random_state = 42)
data = pd.concat([negative,positive], ignore_index = True)

# Data Analysis

In [46]:
data.head()

Unnamed: 0,text,label
0,I went to see this movie with the most positiv...,0
1,"I was not expecting a classic, but at least a ...",0
2,"As a Mystery Science Theatre 3000 fan, I can w...",0
3,"A couple move into their dream home, unaware t...",0
4,Ever wonder why Pacific Islanders seem to auto...,0


In [47]:
data.tail()

Unnamed: 0,text,label
9995,"Based on a William Faulkner short story, Two S...",1
9996,"""Bon Voyage"" has the fast pace that in some wa...",1
9997,"Contrary to some people's summaries, the women...",1
9998,"Hong Kong, the 1920s. A young man from poor be...",1
9999,The first time I watched it was when it came o...,1


In [48]:
data.describe()

Unnamed: 0,label
count,10000.0
mean,0.5
std,0.500025
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [49]:
data.shape

(10000, 2)

In [50]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    10000 non-null  object
 1   label   10000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 156.4+ KB


In [51]:
data.columns

Index(['text', 'label'], dtype='object')

# Cleaning the Data

In [52]:
import nltk
import re
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

In [53]:
from nltk.stem import WordNetLemmatizer
corpus = []
for i in range(0, 10000):
    cleaned_data = re.sub('[^a-zA-Z]', ' ',data['text'][i])
    lemma = WordNetLemmatizer()
    cleaned_data = cleaned_data.lower()
    cleaned_data = cleaned_data.split()
    cleaned_data = [lemma.lemmatize(word) for word in cleaned_data if word != set(stopwords)]
    cleaned_data = ' '.join(cleaned_data)
    corpus.append(cleaned_data)

In [54]:
print(len(corpus[1]))

507


# Creating the bag of words model

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(corpus).toarray()
y = data['label']
x.shape

(10000, 46137)

# Splitting the data into train and test set

In [56]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state = 42)

In [57]:
x_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [58]:
x_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [59]:
y_train

9254    1
1561    0
1670    0
6087    1
6669    1
       ..
5734    1
5191    1
5390    1
860     0
7270    1
Name: label, Length: 8000, dtype: int64

In [60]:
y_test

6252    1
4684    0
1731    0
4742    0
4521    0
       ..
6412    1
8285    1
7853    1
1095    0
6929    1
Name: label, Length: 2000, dtype: int64

# Training the logistic regression model

In [61]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver = 'liblinear', max_iter = 1000)
model.fit(x,y)

# Prediction the model

In [62]:
pred = model.predict(x_test)

Evaluating

In [63]:
from sklearn.metrics import accuracy_score
print(accuracy_score(pred, y_test))

0.9425


In [76]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(pred, y_test)

# Prediction my own feedback

# Cleaning

In [77]:
feedback = input('Enter your feedback')
new_corpus = []
feedback = re.sub('[^a-zA-Z]', ' ', feedback)
feedback = feedback.lower()
feedbak = feedback.split()
feedback = [lemma.lemmatize(word) for word in feedback if word!= set(stopwords)]
feedback = ''.join(feedback)
new_corpus.append(feedback)

Enter your feedback love this


# Creating bag of words model

In [78]:
transformed_new_corpus = vectorizer.transform(new_corpus)

Making predictions

In [79]:
result = model.predict(transformed_new_corpus)
if result == 1:
    print('The feedback was positive about the movie')
else:
    print('The movie watcher gave a negative feedback about the movie')

The feedback was positive about the movie


# The accuracy score was over 94% and our models performed well and good for our own feedback