In [5]:
# Sentiment Analysis using "Machine learning".
# This project uses "Natural Language Processing" (NLP) techniques to classify text (e.g., reviews or tweets) as Positive, Negative, or Neutral.
# step 1:
! pip install pandas numpy matplotlib seaborn nltk scikit-learn              # installing Required Libraries
# step 2:
import pandas as pd                                                          # import pandas and call it pd
import numpy as np                                                           # import numpy and call it np
import seaborn as sns                                                        # import seaborn and call it sns (for plotting)
import matplotlib.pyplot as plt                                              # import matpltlib and call it plt(for plotting)
import nltk                                                                  # importing the Natural Language Toolkit library for text processing and sentiment analysis
from nltk.corpus import stopwords                                            # imports a list of common words (like "the", "is", "and") that are usually removed in text preprocessing
from sklearn.model_selection import train_test_split                         # To divide the data into training and testing parts
from sklearn.feature_extraction.text import CountVectorizer                  # To Converts text into numerical data (word counts) so machine learning models can understand it
from sklearn.naive_bayes import MultinomialNB                                # Imports the Naive Bayes algorithm used for text classification (like spam detection or sentiment analysis)
from sklearn.metrics import accuracy_score,classification_report             # Measures how good the model is
# step 3:
nltk.download('stopwords')                                                   # Downloads the list of common stopwords (like "the", "is", "and") used to clean text data
# step 4:
nltk.download('movie_reviews')                                               # Download dataset
from nltk.corpus import movie_reviews                                        # Imports sample movie reviews dataset with labels (positive/negative)
import random                                                                # Used to randomly shuffle the reviews for better training
documents = [(list(movie_reviews.words(fileid)),category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]                  # Load the movie review dataset and prepare it
# step 5:
stop_words= stopwords.words('english')                                       # clean the text data.Loads english stopwords (common words to remove)
def clean_text(text):
  words=text.lower().split()                                                 # convert to lower case and split into words
  words=[word for word in words if word.isalpha()and word not in stop_words]
  return ' '.join(words)                                                     # join words back to one string.
data = pd.DataFrame(documents, columns=['Review', 'Sentiment'])
data['Review'] = [' '.join(words) for words in data['Review']]
data['Cleaned_Review'] = data['Review'].apply(clean_text)                    # This cleans every single review using your clean_text() function
# step 6:
vectorizer = CountVectorizer()
x=vectorizer.fit_transform(data['Cleaned_Review'])                           # Converts text into numbers each word gets a number
y=data['Sentiment']                                                          # target variable (positive or negative)
# step 7:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42) # splitting the data into testing and training
# step 8:
model=MultinomialNB()                                                        # Load "NaiveBayes" model
model.fit(x_train,y_train)                                                   # train the model using train data
# step 9:
# Predict and check accuracy
y_pred=model.predict(x_test)                                                 # Use model to predict label (positive/negative)
print ("Accuracy :",accuracy_score(y_test,y_pred))                           # print how many predictions were correct
print("\n Classification Report:\n",classification_report(y_test,y_pred))
# step 10:
# Give your own reviews to the test model.
my_review = [" The Movie was really fantastic and enjoyable"]
my_review_vector = vectorizer.transform(my_review)                          # clean and convert to numbers
prediction = model.predict(my_review_vector)                                 # predict sentiment
print("prediction:",prediction[0])
# second review
second_review = [" The Movie was too bad"]
second_review_vector = vectorizer.transform(second_review)
prediction = model.predict(second_review_vector)
print("prediction:",prediction[0])



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Accuracy : 0.81

 Classification Report:
               precision    recall  f1-score   support

         neg       0.78      0.86      0.82       199
         pos       0.85      0.76      0.80       201

    accuracy                           0.81       400
   macro avg       0.81      0.81      0.81       400
weighted avg       0.81      0.81      0.81       400

prediction: pos
prediction: neg
