In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
import requests
from io import StringIO

# Both CSV splits live under the same S3 prefix; only the file name differs.
_DATA_ROOT = (
    "https://ml-course2-upgrad.s3.amazonaws.com"
    "/Naive+Bayes/Naive+Bayes+for+Text+Classification"
)
train_url = f"{_DATA_ROOT}/movie_review_train.csv"
test_url = f"{_DATA_ROOT}/movie_review_test.csv"

# Function to load data from URL
def load_data(url, timeout=30):
    """Download a CSV file over HTTP and parse it into a DataFrame.

    Args:
        url: Address of the CSV file to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` can block indefinitely.

    Returns:
        pandas.DataFrame parsed from the response body.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status, so an error page is never parsed as if it were CSV.
        requests.exceptions.RequestException: On connection failure or
            timeout.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of feeding the error body to read_csv.
    response.raise_for_status()
    csv_data = StringIO(response.text)
    return pd.read_csv(csv_data)

# Download both splits; the training file is fetched first, then the test file
train_data, test_data = [load_data(source) for source in (train_url, test_url)]

# Separate the review text (features) from the class label (target) per split
X_train, y_train = train_data['text'].values, train_data['class'].values
X_test, y_test = test_data['text'].values, test_data['class'].values

# Bag-of-words vectorizer: drops English stop words and keeps only terms whose
# document frequency lies between the min_df and max_df thresholds
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=0.03,
    max_df=0.8,
)

# The vocabulary is learned from the training reviews only; the test reviews
# are then encoded with that same fitted vocabulary
X_train_transformed = vectorizer.fit_transform(X_train)
X_test_transformed = vectorizer.transform(X_test)

# Fit a Bernoulli Naive Bayes model (fit returns the estimator itself) and
# predict a class for every test review
nb_classifier = BernoulliNB().fit(X_train_transformed, y_train)
y_pred = nb_classifier.predict(X_test_transformed)

# Confusion matrix: rows are the actual classes, columns the predicted classes
cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
print(cm)

# Unpack the four cells of the matrix.
# NOTE(review): this treats row/column 0 as the negative class, i.e. it assumes
# the negative label sorts before the positive one - confirm against the data.
TN, FP, FN, TP = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]

# Reviews that are actually negative but were predicted positive
false_positives = FP

print("\nNumber of reviews actually negative but classified as positive (False Positives):", false_positives)


Confusion Matrix:
[[177  23]
 [ 61 139]]

Number of reviews actually negative but classified as positive (False Positives): 23
