In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from bs4 import BeautifulSoup
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import VotingClassifier
import pickle
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt


# Read the raw IMDB reviews; point this at wherever the CSV actually lives.
IMDB_data = pd.read_csv("IMDB Dataset.csv")

# NLTK helpers used by the preprocessing step.
# The lemmatizer object is created first; the corpora it and the stop-word
# list rely on are fetched right after.
lemmatizer = nltk.WordNetLemmatizer()
for corpus_name in ('wordnet', "stopwords"):
    nltk.download(corpus_name)

# NOTE(review): stopwords_l and token are not referenced in the visible part
# of this file -- confirm whether they are still needed.
stopwords_l = stopwords.words("english")
token = ToktokTokenizer()

def remove_html(text):
    """Return a cleaned, lemmatized version of a raw review.

    Steps, in order:
      1. Strip HTML markup with BeautifulSoup.
      2. Drop every character that is not an ASCII letter, digit or whitespace.
      3. Split on whitespace and lemmatize each word with the module-level
         ``lemmatizer``.

    Args:
        text: Raw review text, possibly containing HTML tags.

    Returns:
        The processed words joined by single spaces.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Use a space separator so that text on either side of a tag stays two
    # words: without it "good.<br /><br />The" collapses into "good.The" and,
    # once punctuation is removed, into the bogus token "goodThe".
    clean_text = soup.get_text(separator=" ")
    clean_text = re.sub(r'[^A-Za-z0-9\s]+', '', clean_text)
    # split() with no argument also swallows the extra whitespace the
    # separator may have introduced.
    words = clean_text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

# Run every review through the text-cleaning helper and store the result back
cleaned_reviews = IMDB_data['review'].apply(remove_html)
IMDB_data['review'] = cleaned_reviews

# Binarize the sentiment column (positive=1, negative=0)
label_encoder = LabelBinarizer()
encoded_sentiment = label_encoder.fit_transform(IMDB_data['sentiment'])
IMDB_data['sentiment'] = encoded_sentiment

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
train_X, test_X, train_Y, test_Y = train_test_split(
    IMDB_data['review'],
    IMDB_data['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Logistic Regression using CountVectorizer
# max_df must be the float 1.0 ("keep terms that occur in up to 100% of the
# documents"). The integer 1 is interpreted as an absolute document count and
# would throw away every n-gram that appears in more than one review.
countVec = CountVectorizer(min_df=0.0, max_df=1.0, binary=False, ngram_range=(1, 3))
cv_train = countVec.fit_transform(train_X)
# The test split is only transformed, so it reuses the training vocabulary.
cv_test = countVec.transform(test_X)

log_reg_cv = LogisticRegression(max_iter=1000)
log_reg_cv.fit(cv_train, train_Y)

# Save Logistic Regression model and CountVectorizer
# Both objects go into the same file as two consecutive pickles: the model
# first, then the vectorizer. They have to be loaded back in that order.
with open('logistic_reg_countvec.pkl', 'wb') as f:
    pickle.dump(log_reg_cv, f)
    pickle.dump(countVec, f)
print("Logistic Regression with CountVectorizer model saved.")

# Logistic Regression using TfidfVectorizer
# max_df must be the float 1.0 ("keep terms that occur in up to 100% of the
# documents"). The integer 1 is interpreted as an absolute document count and
# would throw away every n-gram that appears in more than one review.
tfidfvec = TfidfVectorizer(min_df=0.0, max_df=1.0, binary=False, ngram_range=(1, 3))
tfidf_train = tfidfvec.fit_transform(train_X)
# The test split is only transformed, so it reuses the training vocabulary.
tfidf_test = tfidfvec.transform(test_X)

log_reg_tfidf = LogisticRegression(max_iter=1000)
log_reg_tfidf.fit(tfidf_train, train_Y)

# Save Logistic Regression model and TfidfVectorizer
# Both objects go into the same file as two consecutive pickles: the model
# first, then the vectorizer. They have to be loaded back in that order.
with open('logistic_reg_tfidf.pkl', 'wb') as f:
    pickle.dump(log_reg_tfidf, f)
    pickle.dump(tfidfvec, f)
print("Logistic Regression with TfidfVectorizer model saved.")

# Multinomial Naive Bayes trained on the CountVectorizer features
nb_model_cv = MultinomialNB().fit(cv_train, train_Y)

# Persist the classifier followed by its vectorizer (two pickles, one file)
with open('nb_model_countvec.pkl', 'wb') as model_file:
    for artifact in (nb_model_cv, countVec):
        pickle.dump(artifact, model_file)
print("Naive Bayes with CountVectorizer model saved.")

# Multinomial Naive Bayes trained on the TF-IDF features
nb_model_tfidf = MultinomialNB().fit(tfidf_train, train_Y)

# Persist the classifier followed by its vectorizer (two pickles, one file)
with open('nb_model_tfidf.pkl', 'wb') as model_file:
    for artifact in (nb_model_tfidf, tfidfvec):
        pickle.dump(artifact, model_file)
print("Naive Bayes with TfidfVectorizer model saved.")

# Hard-voting ensemble over the two Logistic Regression and two Naive Bayes models
# NOTE(review): the ensemble is fitted on tfidf_train only. As far as I can
# tell VotingClassifier refits its own copies of the listed estimators on the
# data passed to fit(), so the '*_cv' members would also end up trained on
# TF-IDF features -- confirm this is what is intended.
ensemble_members = [
    ('log_reg_cv', log_reg_cv),
    ('log_reg_tfidf', log_reg_tfidf),
    ('nb_model_cv', nb_model_cv),
    ('nb_model_tfidf', nb_model_tfidf),
]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='hard')
voting_clf.fit(tfidf_train, train_Y)

# Persist the fitted ensemble
with open('voting_classifier.pkl', 'wb') as model_file:
    pickle.dump(voting_clf, model_file)
print("Voting Classifier model saved.")

# Model Evaluation Function
def evaluate_model(model, test_features, test_labels):
    """Score a fitted model on held-out data.

    Args:
        model: Object exposing ``predict(test_features)``.
        test_features: Feature matrix handed to ``model.predict``.
        test_labels: Ground-truth labels for ``test_features``.

    Returns:
        A ``(accuracy, report)`` tuple: the value of ``accuracy_score`` and the
        output of ``classification_report`` for the predictions.
    """
    predicted = model.predict(test_features)
    return (
        accuracy_score(test_labels, predicted),
        classification_report(test_labels, predicted),
    )

# Each model is scored on the test features that match the representation it
# was trained on; the accuracy line is printed first, then the full report.

# Logistic Regression / CountVectorizer
log_reg_accuracy, log_reg_report = evaluate_model(log_reg_cv, cv_test, test_Y)
print(f"Logistic Regression with CountVectorizer Accuracy: {log_reg_accuracy}", log_reg_report, sep="\n")

# Logistic Regression / TfidfVectorizer
log_reg_tfidf_accuracy, log_reg_tfidf_report = evaluate_model(log_reg_tfidf, tfidf_test, test_Y)
print(f"Logistic Regression with TfidfVectorizer Accuracy: {log_reg_tfidf_accuracy}", log_reg_tfidf_report, sep="\n")

# Naive Bayes / CountVectorizer
nb_accuracy, nb_report = evaluate_model(nb_model_cv, cv_test, test_Y)
print(f"Naive Bayes with CountVectorizer Accuracy: {nb_accuracy}", nb_report, sep="\n")

# Naive Bayes / TfidfVectorizer
nb_tfidf_accuracy, nb_tfidf_report = evaluate_model(nb_model_tfidf, tfidf_test, test_Y)
print(f"Naive Bayes with TfidfVectorizer Accuracy: {nb_tfidf_accuracy}", nb_tfidf_report, sep="\n")

# Voting ensemble (evaluated on the TF-IDF features it was fitted on)
voting_accuracy, voting_report = evaluate_model(voting_clf, tfidf_test, test_Y)
print(f"Voting Classifier Accuracy: {voting_accuracy}", voting_report, sep="\n")

# Decision Tree on CountVectorizer features.
# The vectorizer and both feature matrices are rebuilt here, replacing any
# earlier countVec / cv_train / cv_test.
countVec = CountVectorizer(
    min_df=0.0,
    max_df=1.0,
    binary=False,
    ngram_range=(1, 3),
)
cv_train = countVec.fit_transform(train_X)
cv_test = countVec.transform(test_X)

# Fit the tree with default hyper-parameters
decision_tree = DecisionTreeClassifier().fit(cv_train, train_Y)

# Persist the tree followed by its vectorizer (two pickles, one file)
with open('decision_tree_model.pkl', 'wb') as model_file:
    for artifact in (decision_tree, countVec):
        pickle.dump(artifact, model_file)
print("Decision Tree with CountVectorizer model saved.")

# Score the tree on the held-out split
decision_tree_accuracy, decision_tree_report = evaluate_model(decision_tree, cv_test, test_Y)
print(f"Decision Tree with CountVectorizer Accuracy: {decision_tree_accuracy}", decision_tree_report, sep="\n")

# Decision Tree on TF-IDF features.
# The vectorizer and both feature matrices are rebuilt here, replacing any
# earlier tfidfvec / tfidf_train / tfidf_test.
tfidfvec = TfidfVectorizer(
    min_df=0.0,
    max_df=1.0,
    binary=False,
    ngram_range=(1, 3),
)
tfidf_train = tfidfvec.fit_transform(train_X)
tfidf_test = tfidfvec.transform(test_X)

# Fit the tree with default hyper-parameters
decision_tree_tfidf = DecisionTreeClassifier().fit(tfidf_train, train_Y)

# Persist the tree followed by its vectorizer (two pickles, one file)
with open('decision_tree_tfidf_model.pkl', 'wb') as model_file:
    for artifact in (decision_tree_tfidf, tfidfvec):
        pickle.dump(artifact, model_file)
print("Decision Tree with TfidfVectorizer model saved.")

# Score the tree on the held-out split
decision_tree_tfidf_accuracy, decision_tree_tfidf_report = evaluate_model(decision_tree_tfidf, tfidf_test, test_Y)
print(f"Decision Tree with TfidfVectorizer Accuracy: {decision_tree_tfidf_accuracy}", decision_tree_tfidf_report, sep="\n")


from sklearn.metrics import classification_report
import pandas as pd

def plot_classification_report(report, model_name):
    """Draw a grouped bar chart of per-label precision, recall and F1-score.

    Args:
        report: Mapping from label to a dict with 'precision', 'recall' and
            'f1-score' entries. The 'accuracy', 'macro avg' and
            'weighted avg' entries are ignored.
        model_name: Text placed in front of " Classification Metrics" in the
            chart title.
    """
    summary_entries = ('accuracy', 'macro avg', 'weighted avg')

    # One row per real class label; the summary entries are left out.
    rows = [
        [label, scores['precision'], scores['recall'], scores['f1-score']]
        for label, scores in report.items()
        if label not in summary_entries
    ]
    metrics_frame = pd.DataFrame(rows, columns=['Label', 'Precision', 'Recall', 'F1-Score'])

    # Labels on the x axis, one bar per metric, y axis fixed to [0, 1].
    metrics_frame.plot(x='Label', kind='bar', figsize=(10, 6), ylim=(0, 1), colormap='Set3')
    plt.title(f'{model_name} Classification Metrics')
    plt.ylabel('Score')
    plt.show()

# Create a dictionary of model names and their accuracies
model_accuracies = {
    'Logistic Regression (CountVectorizer)': log_reg_accuracy,
    'Logistic Regression (TfidfVectorizer)': log_reg_tfidf_accuracy,
    'Naive Bayes (CountVectorizer)': nb_accuracy,
    'Decision Tree (CountVectorizer)': decision_tree_accuracy,
}

# Horizontal bar chart with one bar per entry in model_accuracies
bar_labels = list(model_accuracies.keys())
bar_values = list(model_accuracies.values())
bar_colors = ['blue', 'green', 'orange', 'red']

plt.figure(figsize=(10, 6))
plt.barh(bar_labels, bar_values, color=bar_colors)
plt.title('Model Accuracy Comparison')
plt.xlabel('Accuracy')
# The x axis starts at 0.5, which assumes every accuracy is above 50%
plt.xlim(0.5, 1.0)
plt.show()

def plot_confusion_matrix(y_true, y_pred, model_name):
    """Plot the confusion matrix of a binary sentiment classifier.

    The matrix is drawn with matplotlib only. The previous version called
    ``sns.heatmap`` although seaborn is not imported in this file, so the
    function raised ``NameError`` as soon as it was called.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        model_name: Text appended to "Confusion Matrix: " in the title.
    """
    cm = confusion_matrix(y_true, y_pred)
    class_names = ['Negative', 'Positive']

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.imshow(cm, cmap="Blues", aspect="auto")

    ax.set_xticks(range(len(class_names)))
    ax.set_xticklabels(class_names)
    ax.set_yticks(range(len(class_names)))
    ax.set_yticklabels(class_names)

    # Write the integer count into every cell; switch to white text on the
    # darker cells so the number stays readable.
    threshold = cm.max() / 2
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            count = cm[row, col]
            ax.text(
                col,
                row,
                format(count, "d"),
                ha="center",
                va="center",
                color="white" if count > threshold else "black",
            )

    ax.set_title(f"Confusion Matrix: {model_name}")
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    plt.show()


# Per-label metric chart for the Decision Tree trained on CountVectorizer features
decision_tree_predictions = decision_tree.predict(cv_test)
decision_tree_report_dict = classification_report(
    test_Y,
    decision_tree_predictions,
    output_dict=True,  # dict form is what plot_classification_report iterates over
)
plot_classification_report(decision_tree_report_dict, "Decision Tree (CountVectorizer)")



# Step 1: Write the test code to a file named model.py
with open("model.py", "w") as f:
    f.write("""
import pandas as pd
import pytest

@pytest.fixture
def dummy_data():
    data = {'review': ['<html>Good movie!</html>', 'Bad movie, very bad', 'Great movie, I loved it!'], 
            'sentiment': ['positive', 'negative', 'positive']}
    return pd.DataFrame(data)

# Test data loading function
def test_load_data(dummy_data):
    assert len(dummy_data) == 3
    assert 'review' in dummy_data.columns
    assert 'sentiment' in dummy_data.columns
""")

# Step 3: Run pytest
!pytest -v model.py 

# Step 1: Create a test file named test_module.py with the test function and fixture
with open("test_module.py", "w") as f:
    f.write("""
import pandas as pd
import pytest
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

@pytest.fixture
def dummy_data():
    data = {'review': ['<html>Good movie!</html>', 'Bad movie, very bad', 'Great movie, I loved it!'], 
            'sentiment': ['positive', 'negative', 'positive']}
    return pd.DataFrame(data)

def test_logistic_regression_training(dummy_data):
    # Step 1: Vectorize the text data
    countVec = CountVectorizer(min_df=0.0, max_df=1, binary=False, ngram_range=(1, 3))
    cv_train = countVec.fit_transform(dummy_data['review'])

    # Step 2: Train a Logistic Regression model
    log_reg = LogisticRegression(max_iter=1000)
    log_reg.fit(cv_train, [1, 0, 1])
    
    # Step 3: Assert that the model can make predictions on new data
    test_data = countVec.transform(['new review'])
    assert log_reg.predict(test_data).shape == (1,)
""")

# Step 2: Run pytest to validate the test function
!pytest -v test_module.py 


# Step 1: Install pytest and scikit-learn
!pip install pytest scikit-learn

# Step 2: Create your_module.py with remove_html
with open("your_module.py", "w") as f:
    f.write("""
import re

def remove_html(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)
""")

# Step 3: Create test_module.py with test cases
with open("test_module.py", "w") as f:
    f.write("""
import os
import pickle
import pandas as pd
import pytest
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

@pytest.fixture
def dummy_data():
    data = {'review': ['<html>Good movie!</html>', 'Bad movie, very bad', 'Great movie, I loved it!'], 
            'sentiment': ['positive', 'negative', 'positive']}
    return pd.DataFrame(data)

def test_saving_logistic_regression_model(tmpdir, dummy_data):
    from your_module import remove_html

    # Clean data
    dummy_data['review'] = dummy_data['review'].apply(remove_html)

    # Vectorize data
    countVec = CountVectorizer(min_df=0.0, max_df=1, binary=False, ngram_range=(1, 3))
    cv_train = countVec.fit_transform(dummy_data['review'])
    
    # Train Logistic Regression model
    log_reg = LogisticRegression(max_iter=1000)
    log_reg.fit(cv_train, [1, 0, 1])

    # Save the model and vectorizer
    save_path = tmpdir.join('logistic_reg_countvec.pkl')
    with open(save_path, 'wb') as f:
        pickle.dump(log_reg, f)
        pickle.dump(countVec, f)

    # Check if the file is created
    assert os.path.exists(save_path)

    # Load the model and vectorizer back
    with open(save_path, 'rb') as f:
        loaded_model = pickle.load(f)
        loaded_vectorizer = pickle.load(f)

    # Assert that the loaded model and vectorizer are not None
    assert loaded_model is not None
    assert loaded_vectorizer is not None

    # Assert that the loaded model can predict correctly
    test_data = loaded_vectorizer.transform(['new review'])
    assert loaded_model.predict(test_data).shape == (1,)
""")

# Step 4: Run pytest
!pytest -v test_module.py

# Step 2: Create the module file your_module.py with the remove_html function
with open("your_module.py", "w") as f:
    f.write("""
import re

def remove_html(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)
""")

# Step 3: Create a test file named test_module.py with test_remove_html
with open("test_module.py", "w") as f:
    f.write("""
def test_remove_html():
    from your_module import remove_html
    
    html_string = "<p>This is a <strong>test</strong></p>"
    expected_output = "This is a test"
    
    assert remove_html(html_string) == expected_output
    print("Test passed!")
""")

# Step 4: Run pytest
!pytest -v test_module.py 