📌 In this notebook, our primary objective is to categorize movie comments into two distinct classes: positive and negative. To achieve this, we employ various natural language processing and machine learning techniques to analyze and classify the sentiment expressed in the comments accurately.

# Import Necessary Libraries

In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from textblob import TextBlob, Word
from sklearn import model_selection, preprocessing, linear_model
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import nltk
nltk.download("wordnet")
from nltk.corpus import stopwords

[nltk_data] Downloading package wordnet to /root/nltk_data...


# Import Dataset

In [10]:
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/datasets/sentiment-analysis-df.csv")
del df["Unnamed: 0"]
df.head()

Unnamed: 0,text,label
0,A series of escapades demonstrating the adage ...,Negative
1,good for the goose,Pozitive
2,good,Pozitive
3,"the gander , some of which occasionally amuses...",Negative
4,amuses,Pozitive


# Text Preprocessing

In [11]:
def text_preprocessing(dataframe, dependent_var, independent_var):
  # Case Conversion - Uppercase to Lowercase
  dataframe[dependent_var] = dataframe[dependent_var].apply(lambda x: " ".join(x.lower() for x in x.split()))

  # Removing Punctuation Marks
  dataframe[dependent_var] = dataframe[dependent_var].str.replace('[^\w\s]','')

  # Removing Numbers
  dataframe[dependent_var] = dataframe[dependent_var].str.replace('\d','')

  # StopWords
  sw = stopwords.words('english')
  dataframe[dependent_var] = dataframe[dependent_var].apply(lambda x: " ".join(x for x in x.split() if x not in sw))

  # Deletion of Infrequent
  rmv_infrequent = pd.Series(' '.join(dataframe[dependent_var]).split()).value_counts()[-1000:]
  dataframe[dependent_var] = dataframe[dependent_var].apply(lambda x: " ".join(x for x in x.split() if x not in rmv_infrequent))

  # Lemmatize
  dataframe[dependent_var] = dataframe[dependent_var].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

  # train and test split
  train_x, test_x, train_y, test_y = model_selection.train_test_split(dataframe[dependent_var], dataframe[independent_var], random_state=1)

  # Label Encoder
  encoder = preprocessing.LabelEncoder()
  train_y = encoder.fit_transform(train_y)
  test_y = encoder.fit_transform(test_y)

  return dataframe, train_x, test_x, train_y, test_y

In [12]:
df, train_x, test_x, train_y, test_y = text_preprocessing(df, "text", "label")

  dataframe[dependent_var] = dataframe[dependent_var].str.replace('[^\w\s]','')
  dataframe[dependent_var] = dataframe[dependent_var].str.replace('\d','')


In [13]:
df.head()

Unnamed: 0,text,label
0,series demonstrating adage good goose also goo...,Negative
1,good goose,Pozitive
2,good,Pozitive
3,gander occasionally amuses none amount much story,Negative
4,amuses,Pozitive


# Feature Engineering

In [14]:
def create_features_count(train_x, test_x):
  # Count Vectors
  vectorizer = CountVectorizer()
  vectorizer.fit(train_x)
  x_train_count_vectorizer = vectorizer.transform(train_x)
  x_test_count_vectorizer = vectorizer.transform(test_x)

  return x_train_count_vectorizer, x_test_count_vectorizer

In [15]:
x_train_count_vectorizer, x_test_count_vectorizer = create_features_count(train_x, test_x)

In [16]:
def create_features_TFIDF_word(train_x, test_x):
  # TF-IDF word
  tf_idf_word_vectorizer = TfidfVectorizer()
  tf_idf_word_vectorizer.fit(train_x)
  x_train_tf_idf_word = tf_idf_word_vectorizer.transform(train_x)
  x_test_tf_idf_word = tf_idf_word_vectorizer.transform(test_x)

  return x_train_tf_idf_word, x_test_tf_idf_word

In [17]:
x_train_tf_idf_word, x_test_tf_idf_word = create_features_TFIDF_word(train_x, test_x)

In [18]:
def create_features_TFIDF_ngram(train_x, test_x):
  # TF-IDF ngram
  tf_idf_ngram_vectorizer = TfidfVectorizer(ngram_range=(2,3))
  tf_idf_ngram_vectorizer.fit(train_x)
  x_train_tf_idf_ngram = tf_idf_ngram_vectorizer.transform(train_x)
  x_test_tf_idf_ngram = tf_idf_ngram_vectorizer.transform(test_x)

  return x_train_tf_idf_ngram, x_test_tf_idf_ngram

In [19]:
x_train_tf_idf_ngram, x_test_tf_idf_ngram = create_features_TFIDF_ngram(train_x, test_x)

In [20]:
def create_features_TFIDF_chars(train_x, test_x):
  # TF-IDF Characters
  tf_idf_chars_vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(2,3))
  tf_idf_chars_vectorizer.fit(train_x)
  x_train_tf_idf_chars = tf_idf_chars_vectorizer.transform(train_x)
  x_test_tf_idf_chars = tf_idf_chars_vectorizer.transform(test_x)

  return x_train_tf_idf_chars, x_test_tf_idf_chars

In [21]:
x_train_tf_idf_chars, x_test_tf_idf_chars = create_features_TFIDF_chars(train_x, test_x)

# Create Model

In [22]:
def crate_model(train_x, test_x):
  # Count
  x_train_count_vectorizer, x_test_count_vectorizer = create_features_count(train_x, test_x)
  loj_count = linear_model.LogisticRegression(solver='lbfgs', max_iter=1000)
  loj_model_count = loj_count.fit(x_train_count_vectorizer, train_y)
  accuracy_count = model_selection.cross_val_score(loj_model_count, x_test_count_vectorizer, test_y, cv=10).mean()
  print("Accuracy - Count Vectors: %.3f" % accuracy_count)

  # TF-IDF Word
  x_train_tf_idf_word, x_test_tf_idf_word = create_features_TFIDF_word(train_x, test_x)
  loj_word = linear_model.LogisticRegression(solver='lbfgs', max_iter=1000)
  loj_model_word = loj_word.fit(x_train_tf_idf_word, train_y)
  accuracy_word = model_selection.cross_val_score(loj_model_word, x_test_tf_idf_word, test_y, cv=10).mean()
  print("Accuracy - TF-IDF Word: %.3f" % accuracy_word)

  # TF-IDF ngram
  x_train_tf_idf_ngram, x_test_tf_idf_ngram = create_features_TFIDF_ngram(train_x, test_x)
  loj_ngram = linear_model.LogisticRegression(solver='lbfgs', max_iter=1000)
  loj_model_ngram = loj_ngram.fit(x_train_tf_idf_ngram, train_y)
  accuracy_ngram = model_selection.cross_val_score(loj_model_ngram, x_test_tf_idf_ngram, test_y, cv=10).mean()
  print("Accuracy TF-IDF ngram: %.3f" % accuracy_ngram)

  # TF-IDF chars

  loj_chars = linear_model.LogisticRegression(solver='lbfgs', max_iter=1000)
  loj_model_chars = loj_chars.fit(x_train_tf_idf_chars, train_y)
  accuracy_chars = model_selection.cross_val_score(loj_model_chars, x_test_tf_idf_chars, test_y, cv=10).mean()
  print("Accuracy TF-IDF Characters: %.3f" % accuracy_chars)

  return loj_model_count, loj_model_word, loj_model_ngram, loj_model_chars


In [23]:
loj_model_count, loj_model_word, loj_model_ngram, loj_model_chars = crate_model(train_x, test_x)

Accuracy - Count Vectors: 0.837
Accuracy - TF-IDF Word: 0.833
Accuracy TF-IDF ngram: 0.748
Accuracy TF-IDF Characters: 0.781


# Prediction

In [24]:
# Count Vectorizer
def predict_count(train_x, model, new_comment):
  new_comment= pd.Series(new_comment)
  v = CountVectorizer()
  v.fit(train_x)
  new_comment = v.transform(new_comment)
  result = model.predict(new_comment)
  if result==1:
    print("Comment is Pozitive")
  else:
    print("Comment is Negative")

In [25]:
predict_count(train_x, model=loj_model_count, new_comment="this movie was a good movie")

Comment is Pozitive


In [26]:
# TF-IDF Word
def predict_word(train_x, model, new_comment):
  new_comment= pd.Series(new_comment)
  v = TfidfVectorizer()
  v.fit(train_x)
  new_comment = v.transform(new_comment)
  result = model.predict(new_comment)
  if result==1:
    print("Comment is Pozitive")
  else:
    print("Comment is Negative")

In [27]:
predict_word(train_x, model=loj_model_word, new_comment="this movie was a good movie")

Comment is Pozitive


In [28]:
# TF-IDF ngram
def predict_ngram(train_x, model, new_comment):
  new_comment= pd.Series(new_comment)
  v = TfidfVectorizer(ngram_range=(2,3))
  v.fit(train_x)
  new_comment = v.transform(new_comment)
  result = model.predict(new_comment)
  if result==1:
    print("Comment is Pozitive")
  else:
    print("Comment is Negative")

In [30]:
predict_ngram(train_x, model=loj_model_ngram, new_comment="this movie was a good movie")

Comment is Pozitive


In [36]:
# TF-IDF characters
def predict_chars(train_x, model, new_comment):
  new_comment= pd.Series(new_comment)
  v = TfidfVectorizer(analyzer="char", ngram_range=(2,3))
  v.fit(train_x)
  new_comment = v.transform(new_comment)
  result = model.predict(new_comment)
  if result==1:
    print("Comment is Pozitive")
  else:
    print("Comment is Negative")

In [44]:
predict_chars(train_x, model=loj_model_chars, new_comment="this movie was a good movie")

Comment is Negative
