In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import warnings
warnings.filterwarnings('ignore')

nltk.download('punkt_tab')
nltk.download('stopwords')


class SentimentAnalyzer:
    def __init__(self):
        self.vectorizer = TfidfVectorizer(max_features=5000)
        self.model = LogisticRegression(max_iter=1000)
        self.stop_words = set(stopwords.words('english'))

    def preprocess_text(self, text):
        # Convert to lowercase
        text = text.lower()
        # Remove special characters and digits
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # Tokenize
        tokens = word_tokenize(text)
        # Remove stopwords
        tokens = [t for t in tokens if t not in self.stop_words]
        return ' '.join(tokens)

    def prepare_data(self, texts, labels):
        # Preprocess all texts
        processed_texts = [self.preprocess_text(text) for text in texts]
        # Convert to TF-IDF features
        X = self.vectorizer.fit_transform(processed_texts)
        return X, labels

    def train(self, X_train, y_train):
        self.model.fit(X_train, y_train)

    def predict(self, texts):
        processed_texts = [self.preprocess_text(text) for text in texts]
        X = self.vectorizer.transform(processed_texts)
        return self.model.predict(X)

    def evaluate(self, X_test, y_test):
        y_pred = self.model.predict(X_test)
        return {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred, average='binary'),
            'recall': recall_score(y_test, y_pred, average='binary')
        }

# Load and prepare IMDb dataset
def load_imdb_data(num_samples=10000):
    from tensorflow.keras.datasets import imdb
    (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)
    word_index = imdb.get_word_index()
    reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

    def decode_review(encoded_text):
        return ' '.join([reverse_word_index.get(i - 3, '?') for i in encoded_text])

    # Convert indices back to words
    X_train_text = [decode_review(x) for x in X_train[:num_samples//2]]
    X_test_text = [decode_review(x) for x in X_test[:num_samples//2]]
    y_train = y_train[:num_samples//2]
    y_test = y_test[:num_samples//2]

    return X_train_text + X_test_text, np.concatenate([y_train, y_test])

# Main execution
def main():
    # Load data
    print("Loading IMDb dataset...")
    texts, labels = load_imdb_data()

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

    # Initialize and train model
    print("Training model...")
    analyzer = SentimentAnalyzer()
    X_train_processed, y_train = analyzer.prepare_data(X_train, y_train)
    X_test_processed, y_test = analyzer.prepare_data(X_test, y_test)
    analyzer.train(X_train_processed, y_train)

    # Evaluate
    metrics = analyzer.evaluate(X_test_processed, y_test)
    print("\nModel Performance:")
    for metric, value in metrics.items():
        print(f"{metric.capitalize()}: {value:.3f}")

    # Example predictions
    sample_texts = [
        "This movie was fantastic! I really enjoyed it.",
        "Terrible waste of time, wouldn't recommend."
    ]
    predictions = analyzer.predict(sample_texts)
    print("\nSample Predictions:")
    for text, pred in zip(sample_texts, predictions):
        print(f"Text: {text}")
        print(f"Sentiment: {'Positive' if pred == 1 else 'Negative'}\n")

if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loading IMDb dataset...
Training model...

Model Performance:
Accuracy: 0.580
Precision: 0.576
Recall: 0.576

Sample Predictions:
Text: This movie was fantastic! I really enjoyed it.
Sentiment: Positive

Text: Terrible waste of time, wouldn't recommend.
Sentiment: Negative



In [7]:
#Improved model
import numpy as np
import pandas as pd
import re
import nltk
import warnings
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
import torch.nn.functional as F

warnings.filterwarnings("ignore")
nltk.download('punkt')
nltk.download('stopwords')

# Preprocessing function with lemmatization
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words]
    return ' '.join(tokens)

# Load IMDb dataset
def load_imdb_data(num_samples=10000):
    from tensorflow.keras.datasets import imdb
    (X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)
    word_index = imdb.get_word_index()
    reverse_word_index = {value: key for key, value in word_index.items()}

    def decode_review(encoded_text):
        return ' '.join([reverse_word_index.get(i - 3, '?') for i in encoded_text])

    X_train_text = [decode_review(x) for x in X_train[:num_samples//2]]
    X_test_text = [decode_review(x) for x in X_test[:num_samples//2]]
    y_train = y_train[:num_samples//2]
    y_test = y_test[:num_samples//2]

    return X_train_text + X_test_text, np.concatenate([y_train, y_test])

# Load dataset
texts, labels = load_imdb_data()
texts = [preprocess_text(text) for text in texts]
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# TF-IDF Vectorization for traditional models
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train Logistic Regression Model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_tfidf, y_train)

# Train Random Forest Model
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train_tfidf, y_train)

# Load Transformer Model (DistilBERT)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

def get_bert_predictions(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    probs = F.softmax(logits, dim=1)
    return probs[:, 1].numpy()  # Probability of positive sentiment

# Get predictions from all models
y_pred_log_reg = log_reg.predict_proba(X_test_tfidf)[:, 1]
y_pred_rf = random_forest.predict_proba(X_test_tfidf)[:, 1]
y_pred_bert = get_bert_predictions(X_test[:100])  # Process in batches if needed

# Combine predictions using soft voting
ensemble_preds = (y_pred_log_reg[:100] + y_pred_rf[:100] + y_pred_bert) / 3
ensemble_preds = np.round(ensemble_preds)

# Evaluate
accuracy = accuracy_score(y_test[:100], ensemble_preds)
precision = precision_score(y_test[:100], ensemble_preds)
recall = recall_score(y_test[:100], ensemble_preds)

print("\nEnsemble Model Performance:")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Ensemble Model Performance:
Accuracy: 0.890
Precision: 0.833
Recall: 0.930


In [8]:
# Prediction on new reviews
def predict_sentiment(reviews):
    reviews = [preprocess_text(review) for review in reviews]
    tfidf_features = vectorizer.transform(reviews)

    log_reg_preds = log_reg.predict_proba(tfidf_features)[:, 1]
    rf_preds = random_forest.predict_proba(tfidf_features)[:, 1]
    bert_preds = get_bert_predictions(reviews)

    ensemble_preds = (log_reg_preds + rf_preds + bert_preds) / 3
    final_preds = ["Positive" if p > 0.5 else "Negative" for p in ensemble_preds]

    return final_preds

# Sample Reviews
sample_reviews = [
    "This movie was absolutely fantastic! I loved the acting and the storyline.",
    "The worst film I have ever seen. Total waste of time.",
    "An average movie with some great performances but a weak script.",
    "Brilliant! A masterpiece that everyone should watch.",
    "Horrible experience. I regret watching it."
]

predictions = predict_sentiment(sample_reviews)
for review, sentiment in zip(sample_reviews, predictions):
    print(f"Review: {review}\nSentiment: {sentiment}\n")


Review: This movie was absolutely fantastic! I loved the acting and the storyline.
Sentiment: Positive

Review: The worst film I have ever seen. Total waste of time.
Sentiment: Negative

Review: An average movie with some great performances but a weak script.
Sentiment: Positive

Review: Brilliant! A masterpiece that everyone should watch.
Sentiment: Positive

Review: Horrible experience. I regret watching it.
Sentiment: Negative

