In [1]:
import pandas as pd
import numpy as np
import joblib
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from gensim.models import Word2Vec, FastText
from sklearn.metrics import classification_report
from scipy.stats import mode

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
# Load the IMDB dataset
df = pd.read_csv('IMDB Dataset.csv')

# Display the first few rows of the dataset
print(df.head())


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [3]:
# Preprocess the reviews
stop_words = set(stopwords.words('english'))

df['review'] = df['review'].apply(lambda x: ' '.join([word for word in word_tokenize(x.lower()) if word.isalnum() and word not in stop_words]))

# Prepare labels
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Split the dataset
from sklearn.model_selection import train_test_split

X = df['review']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")


Training samples: 40000
Test samples: 10000


In [4]:
# Train Bag of Words Model
count_vectorizer = CountVectorizer()
X_train_bow = count_vectorizer.fit_transform(X_train)
X_test_bow = count_vectorizer.transform(X_test)
nb_model_bow = MultinomialNB()
nb_model_bow.fit(X_train_bow, y_train)

# Train TF-IDF Model
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)
nb_model_tfidf = MultinomialNB()
nb_model_tfidf.fit(X_train_tfidf, y_train)

# Train N-gram Model
ngram_vectorizer = CountVectorizer(ngram_range=(1, 2))
X_train_ngram = ngram_vectorizer.fit_transform(X_train)
X_test_ngram = ngram_vectorizer.transform(X_test)
nb_model_ngram = MultinomialNB()
nb_model_ngram.fit(X_train_ngram, y_train)

# Tokenize text for Word2Vec and FastText models
X_train_tokenized = [word_tokenize(text.lower()) for text in X_train]
X_test_tokenized = [word_tokenize(text.lower()) for text in X_test]

# Train Word2Vec Model
word2vec_model = Word2Vec(sentences=X_train_tokenized, vector_size=100, window=5, min_count=1, workers=4)
X_train_w2v = np.array([np.mean([word2vec_model.wv[word] for word in tokens if word in word2vec_model.wv], axis=0) for tokens in X_train_tokenized])
X_test_w2v = np.array([np.mean([word2vec_model.wv[word] for word in tokens if word in word2vec_model.wv], axis=0) for tokens in X_test_tokenized])


In [5]:
# Train FastText Model
fasttext_model = FastText(sentences=X_train_tokenized, vector_size=100, window=5, min_count=1, workers=4)
X_train_ft = np.array([np.mean([fasttext_model.wv[word] for word in tokens if word in fasttext_model.wv], axis=0) for tokens in X_train_tokenized])
X_test_ft = np.array([np.mean([fasttext_model.wv[word] for word in tokens if word in fasttext_model.wv], axis=0) for tokens in X_test_tokenized])


In [6]:
# Save the trained models
joblib.dump(nb_model_bow, 'nb_model_bow.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(ngram_vectorizer, 'ngram_vectorizer.pkl')
joblib.dump(word2vec_model, 'word2vec_model.pkl')
joblib.dump(fasttext_model, 'fasttext_model.pkl')

# Save the vectorizer
joblib.dump(count_vectorizer, 'count_vectorizer.pkl')

print("Models and vectorizer saved successfully!")


Models and vectorizer saved successfully!


In [7]:
# Load the saved models and vectorizer
nb_model_bow = joblib.load('nb_model_bow.pkl')
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')
ngram_vectorizer = joblib.load('ngram_vectorizer.pkl')
word2vec_model = joblib.load('word2vec_model.pkl')
fasttext_model = joblib.load('fasttext_model.pkl')
count_vectorizer = joblib.load('count_vectorizer.pkl')

print("Models and vectorizer loaded successfully!")


Models and vectorizer loaded successfully!


In [15]:
from scipy.stats import mode

def run_sentiment_analysis(sample_text):
    # Prepare input for models
    bow_vector = count_vectorizer.transform([sample_text])
    tfidf_vector = tfidf_vectorizer.transform([sample_text])
    ngram_vector = ngram_vectorizer.transform([sample_text])

    # Word2Vec and FastText require the text to be tokenized
    tokens = word_tokenize(sample_text.lower())
    w2v_vector = np.mean([word2vec_model.wv[word] for word in tokens if word in word2vec_model.wv], axis=0)
    ft_vector = np.mean([fasttext_model.wv[word] for word in tokens if word in fasttext_model.wv], axis=0)

    # Predictions
    predictions = {
        'Naive Bayes (BoW)': nb_model_bow.predict(bow_vector)[0],
        'Naive Bayes (TF-IDF)': nb_model_tfidf.predict(tfidf_vector)[0],  # Assuming nb_model_tfidf is defined
        'Naive Bayes (N-gram)': nb_model_ngram.predict(ngram_vector)[0],  # Assuming nb_model_ngram is defined
        'Word2Vec': 1 if not np.isnan(w2v_vector).any() else None,  # Return None if NaN
        'FastText': 1 if not np.isnan(ft_vector).any() else None,  # Return None if NaN
    }

    # Filter out None values for mode calculation
    valid_predictions = [pred for pred in predictions.values() if pred is not None]

    # Final output: majority vote
    if valid_predictions:
        # Access the mode value directly using .mode, handle scalar and array cases
        final_prediction = mode(valid_predictions).mode
        final_prediction = final_prediction[0] if isinstance(final_prediction, np.ndarray) else final_prediction
    else:
        final_prediction = None  # If no valid predictions, set to None

    return predictions, final_prediction



In [16]:
# Example usage:
sample_text = "This movie was fantastic! I loved it."
predictions, final_output = run_sentiment_analysis(sample_text)

print("Predictions from each model:")
for model, pred in predictions.items():
    if pred is not None:
        print(f"{model}: {'Positive' if pred == 1 else 'Negative'}")
    else:
        print(f"{model}: No Prediction")

if final_output is not None:
    print(f"\nFinal predicted sentiment: {'Positive' if final_output == 1 else 'Negative'}")
else:
    print("\nFinal prediction could not be made.")

Predictions from each model:
Naive Bayes (BoW): Positive
Naive Bayes (TF-IDF): Positive
Naive Bayes (N-gram): Positive
Word2Vec: Positive
FastText: Positive

Final predicted sentiment: Positive
