In [None]:
import kagglehub

# Download the IMDB 50K movie-review dataset via kagglehub and report where the files are
path = kagglehub.dataset_download(
    "lakshmi25npathi/imdb-dataset-of-50k-movie-reviews"
)

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/imdb-dataset-of-50k-movie-reviews


In [None]:
import pandas as pd
import os

# Build the path to the CSV inside the downloaded dataset directory, then load it
csv_file_path = os.path.join(path, "IMDB Dataset.csv")
df = pd.read_csv(filepath_or_buffer=csv_file_path)

In [None]:
# Quick look at the raw data: first rows, column info, and class balance
print("Data Head:")
print(df.head(5))

print("\nData Info:")
df.info()  # prints its report directly

print("\nSentiment Distribution:")
print(df.loc[:, 'sentiment'].value_counts())

Data Head:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB

Sentiment Distribution:
sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [None]:
import re

def preprocess_text(text):
    """
    Cleans and preprocesses a single text string.

    Steps, in order:
    - Replaces HTML tags (e.g. ``<br />``) with a space, so the words on
      either side of a tag are not glued together
    - Removes punctuation and special characters (anything that is neither
      a word character nor whitespace)
    - Converts to lowercase
    - Collapses runs of whitespace into one space and trims both ends

    Args:
        text: The raw text string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    # Swap each HTML tag for a space. Deleting the tag outright would turn
    # "great.<br /><br />The" into "greatthe" once punctuation is removed.
    text = re.sub(r'<.*?>', ' ', text)

    # Remove punctuation and other non-word, non-whitespace characters
    text = re.sub(r'[^\w\s]', '', text)

    # Convert text to lowercase
    text = text.lower()

    # Tidy up the extra spaces left behind by the tag replacement
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Run every raw review through preprocess_text and store the result in 'cleaned_review'.
# This cleaned text is what the Logistic Regression and BERT sections use.
df['cleaned_review'] = df['review'].map(preprocess_text)

# Show the raw and cleaned columns one after the other for comparison
for heading, column in (("Original Review:", 'review'), ("\nCleaned Review:", 'cleaned_review')):
    print(heading)
    print(df[column])

Original Review:
0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

Cleaned Review:
0        one of the other reviewers has mentioned that ...
1        a wonderful little production the filming tech...
2        i thought this was a wonderful way to spend ti...
3        basically theres a family where a little boy j...
4        petter matteis love in the ti

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the VADER lexicon (a one-time download)
nltk.download('vader_lexicon')

# Create the analyzer used to score each text
sia = SentimentIntensityAnalyzer()

# Demonstrate VADER's output on a single sentence
sample_text = "This movie is not just good, it's absolutely FANTASTIC!!!"
print(f"Analyzing sample text: '{sample_text}'")
print(sia.polarity_scores(sample_text))

# Score every raw review; each 'vader_scores' entry is whatever polarity_scores returns
df['vader_scores'] = df['review'].apply(sia.polarity_scores)

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


Analyzing sample text: 'This movie is not just good, it's absolutely FANTASTIC!!!'
{'neg': 0.161, 'neu': 0.47, 'pos': 0.369, 'compound': 0.6244}


In [None]:
# Pull the 'compound' value out of each VADER score dict into its own column
df['vader_compound'] = df['vader_scores'].map(lambda scores: scores['compound'])

# Label a review 'positive' when its compound score is at least 0.05, otherwise 'negative'
df['vader_prediction'] = (df['vader_compound'] >= 0.05).map(
    {True: 'positive', False: 'negative'}
)

# Preview the first rows next to the true labels
print(df.loc[:, ['cleaned_review', 'sentiment', 'vader_compound', 'vader_prediction']].head())

                                      cleaned_review sentiment  \
0  one of the other reviewers has mentioned that ...  positive   
1  a wonderful little production the filming tech...  positive   
2  i thought this was a wonderful way to spend ti...  positive   
3  basically theres a family where a little boy j...  negative   
4  petter matteis love in the time of money is a ...  positive   

   vader_compound vader_prediction  
0         -0.9951         negative  
1          0.9641         positive  
2          0.9605         positive  
3         -0.9213         negative  
4          0.9744         positive  


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# True labels and VADER's predicted labels, taken from the DataFrame columns
y_true_vader, y_pred_vader = df['sentiment'], df['vader_prediction']

# Overall accuracy
vader_accuracy = accuracy_score(y_true=y_true_vader, y_pred=y_pred_vader)
print(f"VADER Accuracy: {vader_accuracy:.4f}")

# Per-class breakdown
print("\nVADER Classification Report:")
print(classification_report(y_true_vader, y_pred_vader))

VADER Accuracy: 0.6974

VADER Classification Report:
              precision    recall  f1-score   support

    negative       0.79      0.54      0.64     25000
    positive       0.65      0.85      0.74     25000

    accuracy                           0.70     50000
   macro avg       0.72      0.70      0.69     50000
weighted avg       0.72      0.70      0.69     50000



In [None]:
# Logistic Regression Sentiment Analysis

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# Encode the target for scikit-learn: 'positive' -> 1, 'negative' -> 0
df['sentiment_numeric'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Features are the cleaned review text; labels are the numeric sentiment
X, y = df['cleaned_review'], df['sentiment_numeric']

# Hold out 20% of the rows for testing, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# TF-IDF vectorisation followed by a Logistic Regression classifier
tfidf_logreg_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('logreg', LogisticRegression(solver='liblinear', random_state=42)),
])

print("Pipeline created successfully.")
print(tfidf_logreg_pipeline)

Pipeline created successfully.
Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('logreg',
                 LogisticRegression(random_state=42, solver='liblinear'))])


In [None]:
# Fit the pipeline on the training split
print("Training the TF-IDF + Logistic Regression pipeline...")
tfidf_logreg_pipeline.fit(X_train, y_train)
print("Training complete.")

# Numeric class -> sentiment label
label_map = {1: "positive", 0: "negative"}

# Single-text check
sample_text = ["I absolutely loved this product!"]
pred = tfidf_logreg_pipeline.predict(sample_text)
print("Predicted sentiment:", label_map[pred[0]])

# Several texts at once
sample_texts = [
    "I absolutely loved this product!",
    "It was terrible and I would not buy it again.",
    "Just okay, nothing special.",
]
preds = tfidf_logreg_pipeline.predict(sample_texts)
labels = list(map(label_map.__getitem__, preds))

for text, sentiment in zip(sample_texts, labels):
    print(f"Text: '{text}' -> Predicted sentiment: {sentiment}")

def predict_sentiment(texts):
    """
    Predicts sentiment labels for new texts with the TF-IDF + Logistic Regression pipeline.

    The pipeline was fitted on the 'cleaned_review' column, i.e. on text that
    had been passed through ``preprocess_text``. The same cleaning is applied
    here so that new inputs are tokenized the same way as the training data
    (for example "it's" becomes "its" in both).

    Args:
        texts: A list of raw text strings, or a single raw string.

    Returns:
        A list of 'positive'/'negative' labels, one per input text.
    """
    # Treat a bare string as a single document
    if isinstance(texts, str):
        texts = [texts]

    cleaned_texts = [preprocess_text(text) for text in texts]
    preds = tfidf_logreg_pipeline.predict(cleaned_texts)
    return [label_map[p] for p in preds]

print(predict_sentiment(["I absolutely loved this product!"]))


# Score the held-out test split
print("Making predictions on the test set...")
y_pred_logreg_tfidf = tfidf_logreg_pipeline.predict(X_test)
print(y_pred_logreg_tfidf)
print("Predictions complete.")

# Accuracy on the test split
logreg_tfidf_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_logreg_tfidf)
print(f"\nTF-IDF + Logistic Regression Accuracy: {logreg_tfidf_accuracy:.4f}")

# Per-class report; class 0 is shown as 'negative' and class 1 as 'positive'
print("\nTF-IDF + Logistic Regression Classification Report:")
print(classification_report(
    y_test,
    y_pred_logreg_tfidf,
    target_names=['negative', 'positive'],
))

Training the TF-IDF + Logistic Regression pipeline...
Training complete.
Predicted sentiment: positive
Text: 'I absolutely loved this product!' -> Predicted sentiment: positive
Text: 'It was terrible and I would not buy it again.' -> Predicted sentiment: negative
Text: 'Just okay, nothing special.' -> Predicted sentiment: negative
['positive']
Making predictions on the test set...
[0 1 1 ... 0 1 0]
Predictions complete.

TF-IDF + Logistic Regression Accuracy: 0.8968

TF-IDF + Logistic Regression Classification Report:
              precision    recall  f1-score   support

    negative       0.90      0.89      0.90      5000
    positive       0.89      0.90      0.90      5000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [None]:
# Using BERT

In [None]:
# Restrict the BERT experiment to the first 30,000 rows.
# NOTE: this rebinds X, y and the train/test split variables used by the TF-IDF section.
X = df['cleaned_review'].head(30000)
y = df['sentiment_numeric'].head(30000)

# 80/20 train/test split of that subset, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
import torch
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, TensorDataset, SequentialSampler
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Use the GPU when one is available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Pre-trained 'bert-base-uncased' tokenizer and model; the model is moved to `device`
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')
model_bert_feature_extractor = BertModel.from_pretrained('bert-base-uncased')
model_bert_feature_extractor = model_bert_feature_extractor.to(device)
model_bert_feature_extractor.eval()  # evaluation mode

def get_bert_embeddings(texts, tokenizer, model, batch_size=64):
    """
    Generates BERT CLS token embeddings for a list of texts.

    Args:
        texts: List of input strings.
        tokenizer: Tokenizer to call on ``texts``. It is called with padding,
            truncation to 256 tokens and ``return_tensors='pt'``, and must
            return a mapping with 'input_ids' and 'attention_mask' tensors.
        model: The model to run on each batch. Its output must expose
            ``last_hidden_state``.
        batch_size: Number of texts passed through the model per forward pass.

    Returns:
        A NumPy array with one row per input text (in input order) holding the
        last hidden state of the first (CLS) token.
    """
    # Run on whatever device the supplied model lives on, so the function does
    # not depend on a module-level `device` variable.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    # Tokenize the input texts
    encoded_inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=256)

    # Create a DataLoader for batching; the sequential sampler keeps input order
    dataset = TensorDataset(encoded_inputs['input_ids'], encoded_inputs['attention_mask'])
    sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)

    all_embeddings = []  # One array of CLS embeddings per batch

    with torch.no_grad():  # Disable gradient calculations
        for batch in dataloader:
            input_ids, attention_mask = tuple(t.to(model_device) for t in batch)
            # Call the model passed in by the caller, not a module-level global
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Extract CLS token embedding (first token)
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            all_embeddings.append(cls_embeddings)

    # Stack all batches vertically
    return np.vstack(all_embeddings)

# Turn the train and test reviews into BERT CLS feature matrices
print("Generating BERT embeddings for the training set...")
X_train_bert_features = get_bert_embeddings(
    X_train.tolist(),
    tokenizer_bert,
    model_bert_feature_extractor,
)

print("Generating BERT embeddings for the test set...")
X_test_bert_features = get_bert_embeddings(
    X_test.tolist(),
    tokenizer_bert,
    model_bert_feature_extractor,
)

print("BERT feature extraction complete.")

# Fit a Logistic Regression classifier on top of the BERT features
print("Training Logistic Regression on BERT features...")
logreg_on_bert = LogisticRegression(max_iter=1000, solver='liblinear')
logreg_on_bert.fit(X_train_bert_features, y_train)
print("Training complete.")

# Predict on the test features, then report accuracy and per-class metrics
y_pred_logreg_bert = logreg_on_bert.predict(X_test_bert_features)
logreg_bert_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_logreg_bert)

print(f"\nLogistic Regression on BERT Features Accuracy: {logreg_bert_accuracy:.4f}")
print("\nLogistic Regression on BERT Features Classification Report:")
print(classification_report(
    y_test,
    y_pred_logreg_bert,
    target_names=['negative', 'positive'],
))


# Function to predict sentiment on new sample texts using BERT + Logistic Regression
def predict_sentiment_bert(texts):
    """
    Given a list of raw texts, output predicted sentiment labels ('positive'/'negative')
    using the BERT embeddings + trained Logistic Regression classifier.

    The classifier was trained on embeddings of the 'cleaned_review' column,
    i.e. text already passed through ``preprocess_text``. Each input is
    cleaned the same way before embedding so that inference inputs match the
    form of the training inputs.

    Args:
        texts: A list of raw text strings, or a single raw string.

    Returns:
        A list of 'positive'/'negative' labels, one per input text.
    """
    # Treat a bare string as a single document
    if isinstance(texts, str):
        texts = [texts]

    cleaned_texts = [preprocess_text(text) for text in texts]
    embeddings = get_bert_embeddings(cleaned_texts, tokenizer_bert, model_bert_feature_extractor)
    preds = logreg_on_bert.predict(embeddings)
    label_map = {1: "positive", 0: "negative"}
    return [label_map[p] for p in preds]

# Try the BERT-based predictor on a handful of hand-written sentences.
# NOTE: this rebinds `sample_texts` from the TF-IDF section.
sample_texts = [
    "I really enjoyed this movie, it was amazing!",
    "The product was awful and broke immediately.",
    "It was okay, not the best but not the worst.",
    "You did a great job on your presentation — it was clear, engaging, and well-organized.",
    "Your report had some important data missing — make sure to double-check next time.",
    "I really appreciate how dependable you are, always meeting deadlines without being reminded.",
    "I noticed you've been arriving late lately; it’s important to be on time for team cohesion.",
    "Your creativity really shines in your design work — it's impressive.",
    "Your tone in the email came off as a bit abrupt — try softening the language next time.",
    "This project turned out even better than expected, thanks to your attention to detail.",
    "The presentation felt rushed and lacked structure — it might help to rehearse in advance.",
    "You handled that difficult situation with professionalism and empathy.",
    "I think we missed the mark on this task due to miscommunication — let's clarify roles moving forward.",
]

# One predicted label per sample text, printed next to the text it belongs to
predicted_sentiments = predict_sentiment_bert(sample_texts)
for text, sentiment in zip(sample_texts, predicted_sentiments):
    print(f"Text: '{text}' -> Predicted sentiment: {sentiment}")


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Generating BERT embeddings for the training set...
Generating BERT embeddings for the test set...
BERT feature extraction complete.
Training Logistic Regression on BERT features...
Training complete.

Logistic Regression on BERT Features Accuracy: 0.8490

Logistic Regression on BERT Features Classification Report:
              precision    recall  f1-score   support

    negative       0.85      0.85      0.85      2997
    positive       0.85      0.85      0.85      3003

    accuracy                           0.85      6000
   macro avg       0.85      0.85      0.85      6000
weighted avg       0.85      0.85      0.85      6000

Text: 'I really enjoyed this movie, it was amazing!' -> Predicted sentiment: positive
Text: 'The product was awful and broke immediately.' -> Predicted sentiment: positive
Text: 'It was okay, not the best but not the worst.' -> Predicted sentiment: positive
Text: 'You did a great job on your presentation — it was clear, engaging, and well-organized.' -> P