In [1]:
!pip install kaggle



In [2]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json  # Set permissions


In [3]:
# Download the IMDB 50k movie-review archive into the working directory via the Kaggle CLI.
# NOTE(review): presumably authenticates with the ~/.kaggle/kaggle.json token set up in the previous cell — confirm.
!kaggle datasets download lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other


In [4]:
!unzip imdb-dataset-of-50k-movie-reviews.zip -d /content/

Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: /content/IMDB Dataset.csv  


In [5]:
import pandas as pd
import numpy as np


In [7]:
# Load the extracted CSV; the path is the file written by the unzip step into /content/.
df = pd.read_csv('/content/IMDB Dataset.csv')

In [8]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Preprocess the Text Data

In [9]:
!pip install nltk




In [10]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [15]:
# Fetch the NLTK resources used by preprocess_text.
nltk.download('stopwords')  # word lists read through stopwords.words('english')
nltk.download('punkt')  # NOTE(review): presumably the tokenizer models behind word_tokenize — confirm
nltk.download('wordnet')  # NOTE(review): presumably the corpus WordNetLemmatizer looks words up in — confirm
nltk.download('punkt_tab')  # NOTE(review): presumably needed by word_tokenize on newer NLTK releases — confirm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [12]:
# Lazily-built resources shared by every preprocess_text call.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use.

    Construction is deferred to the first call so that defining this cell
    before the NLTK corpora are downloaded does not fail, and the stop-word
    list is read only once instead of once per review.
    """
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalize one raw review into a space-separated string of lemmas.

    Steps: lowercase, strip HTML tags, replace every non-letter character
    with a space, tokenize, drop English stop words, lemmatize, re-join.

    Args:
        text: Raw review text.

    Returns:
        The cleaned review as a single string (empty if nothing survives).
    """
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags. A space is substituted (not an empty string) so that
    # words separated only by a tag, e.g. "good<br />movie", are not glued
    # together into a single bogus token.
    text = re.sub(r'<.*?>', ' ', text)

    # Remove special characters and numbers (text is already lowercase)
    text = re.sub(r'[^a-z]', ' ', text)

    # Tokenization
    words = word_tokenize(text)

    # Remove stopwords, then lemmatize what is left
    stop_words, lemmatizer = _preprocess_resources()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    # Join words back to text
    return ' '.join(words)

In [16]:
# Clean every raw review element-wise and store the result beside the original text
df['cleaned_review'] = df['review'].map(preprocess_text)
df.head()

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching oz episode hoo...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei love time money visually stunnin...


# Convert Sentiments to Numerical Labels

In [17]:
# Encode the string labels as integers (positive -> 1, negative -> 0).
# The guard makes the cell idempotent: mapping an already-encoded column
# through the dictionary again would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})
# Any label other than 'positive'/'negative' becomes NaN in map(); fail loudly here
# rather than later inside model fitting.
assert df['sentiment'].notna().all(), "unexpected value in 'sentiment' column"
df.head()


Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,1,one reviewer mentioned watching oz episode hoo...
1,A wonderful little production. <br /><br />The...,1,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,0,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter mattei love time money visually stunnin...


# TF-IDF Feature Extraction

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
# Initialize TF-IDF Vectorizer
# max_features=5000 caps the vocabulary size, so the feature matrix has at most 5000 term columns.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

In [20]:
# Fit and transform the cleaned reviews
# NOTE(review): the vectorizer is fitted on every row of df, including rows that
# train_test_split later assigns to the test set, so test-set text influences the
# learned vocabulary/weights and the reported metrics may be slightly optimistic.
# Consider splitting first and fitting on the training rows only.
X_tfidf = tfidf_vectorizer.fit_transform(df['cleaned_review'])

In [21]:
# Wrap the TF-IDF matrix in a DataFrame for inspection. The sparse accessor keeps
# the data sparse; calling .toarray() would allocate a dense rows x 5000 float
# matrix (around 2 GB for the 50k reviews) only to preview a few rows.
X_tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    X_tfidf, columns=tfidf_vectorizer.get_feature_names_out()
)


In [22]:
X_tfidf_df.head()

Unnamed: 0,aaron,abandoned,abc,ability,able,absence,absent,absolute,absolutely,absurd,...,yesterday,yet,york,young,younger,youngster,youth,zero,zombie,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.07792,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.223395,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.098995,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Target vector: the sentiment column (encoded as 1 = positive, 0 = negative by the mapping cell).
y = df['sentiment']

# Train-Test Split

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# Split data into training and testing sets
# test_size=0.2 holds out 20% of the rows for evaluation; random_state=42 fixes the
# shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)


# Train a Logistic Regression Model

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [28]:
# Initialize and train model
# LogisticRegression is built with its default hyper-parameters and fitted on the
# TF-IDF training split (X_train) against the 0/1 sentiment labels (y_train).
model = LogisticRegression()
model.fit(X_train, y_train)


# Evaluate the Model

In [29]:
# Predict labels for the held-out test split; compared against y_test in the next cell.
y_pred = model.predict(X_test)

In [30]:
# Accuracy Score: computed from the true test labels and the predictions, printed to 4 decimals
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Classification Report: per-class precision, recall, f1-score and support
print(classification_report(y_test, y_pred))

Model Accuracy: 0.8891
              precision    recall  f1-score   support

           0       0.90      0.87      0.89      4961
           1       0.88      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



# Prediction Function

In [31]:
def predict_sentiment(review, model, vectorizer, preprocess=None):
    """Classify a single raw review as "Positive" or "Negative".

    The review is cleaned with the same routine that produced the training
    text before it is vectorized. Using a separate, simplified cleaner here
    (no stop-word removal, no lemmatization) would feed the model tokens
    that were never produced at training time.

    Args:
        review: Raw review text.
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        preprocess: Optional callable taking the raw review and returning the
            cleaned string. Defaults to the notebook-level ``preprocess_text``.

    Returns:
        "Positive" if the predicted label equals 1, otherwise "Negative".
    """
    if preprocess is None:
        # Looked up at call time so the notebook's training-time cleaner is used.
        preprocess = preprocess_text

    cleaned_review = preprocess(review)
    # Convert text to TF-IDF features; transform takes a collection of documents,
    # hence the one-element list.
    review_tfidf = vectorizer.transform([cleaned_review])
    # Predict sentiment
    prediction = model.predict(review_tfidf)
    # Convert numeric output to label
    return "Positive" if prediction[0] == 1 else "Negative"


In [32]:
# Example usage: classify one hand-written review with the trained model and the
# fitted TF-IDF vectorizer, then print the returned label.
sample_review = "This movie was fantastic! The acting was top-notch."
sentiment = predict_sentiment(sample_review, model, tfidf_vectorizer)
print(f"Predicted Sentiment: {sentiment}")

Predicted Sentiment: Positive
