# Question 4

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

imdbdata = pd.read_csv('/Users/Shivanibommagani/OneDrive/Desktop/IMDBDataset.csv')
imdbXtr, imdbXte, imdbytr, imdbyte = train_test_split(imdbdata['review'], imdbdata['sentiment'], test_size=0.2, random_state=42)

vect = CountVectorizer(binary=True)
imdbXtr_bow = vect.fit_transform(imdbXtr)
imdbXte_bow = vect.transform(imdbXte)

classifier = LogisticRegression(max_iter=1000)
classifier.fit(imdbXtr_bow, imdbytr)

imdbypred = classifier.predict(imdbXte_bow)

imdbypred_binary = pd.Series(imdbypred).map({'negative': 0, 'positive': 1})

imdbyte_binary = imdbyte.map({'negative': 0, 'positive': 1})

a = metrics.accuracy_score(imdbyte_binary, imdbypred_binary)
p = metrics.precision_score(imdbyte_binary, imdbypred_binary)
r = metrics.recall_score(imdbyte_binary, imdbypred_binary)
f1 = metrics.f1_score(imdbyte_binary, imdbypred_binary)

print(f"Accuracy: {a}")
print(f"Precision: {p}")
print(f"Recall: {r}")
print(f"F1 Score: {f1}")

imdbsample_reviews = imdbXte.sample(n=5)
imdbsample_bow = vect.transform(imdbsample_reviews)
imdbsample_predictions = classifier.predict(imdbsample_bow)

for review, prediction in zip(imdbsample_reviews, imdbsample_predictions):
    print(f"Review: {review}")
    print(f"Prediction: {prediction}\n")


Accuracy: 0.885
Precision: 0.883303765030554
Recall: 0.8892637428061123
F1 Score: 0.8862737341772151
Review: I got this DVD from a friend, who got it from someone else (and that probably keeps going on..) Even the cover of the DVD looks cheap, as is the entire movie. Gunshots and fist fights with delayed sound effects, some of the worst actors I´ve seen in my life, a very simple plot, it made me laugh ´till my stomach hurt! With very few financial resources, I must admit it looked pretty professional. Seen as a movie, it was one of the 13 in a dozen wannabe gangsta flicks nobody´s waiting for. So: if you´re tired and want a cheap laugh, see this movie. If not, throw it out of the window.
Prediction: negative

Review: 'It's easy to kill a monster, but it's hard to kill a human being.'<br /><br />Set in St. Thomas Housing Project and Angola Prison in New Orleans, "Dead Man Walking" is the true story of Helen Prejean (Susan Sarandon), a Louisiana nun Sister who befriended Matthew Poncelet

# Question 11

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

cr_data = pd.read_csv('/Users/Shivanibommagani/OneDrive/Desktop/redmi6.csv', encoding='ISO-8859-1')

vect = CountVectorizer(binary=True)
crX_bow = vect.fit_transform(cr_data['Review Title'])

vocabularysize = len(vect.get_feature_names_out())

numberoffeaturecombinations = 2 ** vocabularysize

print(f"Vocabulary Size: {vocabularysize}")
print(f"Number of Possible Feature Combinations: {numberoffeaturecombinations}")

Vocabulary Size: 261
Number of Possible Feature Combinations: 3705346855594118253554271520278013051304639509300498049262642688253220148477952


# Question 12

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
import numpy as np


data = pd.read_csv('/Users/Shivanibommagani/OneDrive/Desktop/spam_or_not_spam.csv')

data['email'].fillna('', inplace=True)


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data['email'], data['label'], test_size=0.2, random_state=42)
print("Unique classes in y_train:", np.unique(y_train))


vectorizer = CountVectorizer(binary=True, min_df=1, max_df=0.95)

X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

alpha = 1.0  
model = BernoulliNB(alpha=alpha)
model.fit(X_train_bow, y_train)

feature_names = vectorizer.get_feature_names_out()

log_likelihood_spam = model.feature_log_prob_[model.classes_ == model.classes_[1]]
log_likelihood_ham = model.feature_log_prob_[model.classes_ == model.classes_[0]]

print("Shape of log_likelihood_spam:", log_likelihood_spam.shape)
print("Shape of log_likelihood_ham:", log_likelihood_ham.shape)

for i, (spam_log_likelihood, ham_log_likelihood) in enumerate(zip(log_likelihood_spam, log_likelihood_ham)):
    feature = feature_names[i]
    print(f"Feature: {feature}")
    print(f"Log-Likelihood (spam): {spam_log_likelihood}")
    print(f"Log-Likelihood (ham): {ham_log_likelihood}\n")



Unique classes in y_train: [0 1]
Shape of log_likelihood_spam: (1, 30330)
Shape of log_likelihood_ham: (1, 30330)
Feature: __
Log-Likelihood (spam): [-6.00881319 -6.00881319 -4.21705372 ... -5.315666   -5.315666
 -5.315666  ]
Log-Likelihood (ham): [-6.50078904 -6.90625415 -7.59940133 ... -7.59940133 -7.59940133
 -7.59940133]



# Question 13

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

newsdata = pd.read_csv('/Users/Shivanibommagani/OneDrive/Desktop/BBCNewsTrain.csv')

newsXtr, newsXte, newsytr, newsyte = train_test_split(newsdata['Text'], newsdata['Category'], test_size=0.2, random_state=42)

# Vectorize the text data
vect = CountVectorizer(binary=True)
newsXtr_bow = vect.fit_transform(newsXtr)
newsXte_bow = vect.transform(newsXte)


model = BernoulliNB()
model.fit(newsXtr_bow, newsytr)
newssample_reviews = newsXte.sample(n=5)
newssample_bow = vect.transform(newssample_reviews)
newssample_predictions = model.predict(newssample_bow)

for review, prediction in zip(newssample_reviews, newssample_predictions):
    print(f"Review: {review}")
    print(f"Prediction: {prediction}\n")


newsypred = model.predict(newsXte_bow)
newsypred = pd.Series(newsypred).map({'business': 0, 'tech': 1,'politics':2,'sport':3,'entertainment':4})
newsyte= newsyte.map({'business': 0, 'tech': 1,'politics':2,'sport':3,'entertainment':4})

a = accuracy_score(newsyte, newsypred)
p = precision_score(newsyte, newsypred, average=None)  # For binary classification
r = recall_score(newsyte, newsypred, average=None)
f1 = f1_score(newsyte, newsypred, average=None)

print(f"Accuracy: {a}")
print(f"Precision: {p}")
print(f"Recall: {r}")
print(f"F1 Score: {f1}")

print("Classification Report:")
print(classification_report(newsyte, newsypred))


Review: sydney return for henin-hardenne olympic champion justine henin-hardenne will return to action in january s sydney international tournament.  the belgian has not competed since losing her top world ranking at the us open in september  where she was beaten in the fourth round by nadia petrova. she took time out to shake off a virus but will defend her titles in sydney and at the australian open. women s world number one lindsay davenport and french open champion anastasia myskina will also compete.  in the men s event  world number three lleyton hewitt returns to defend his title  along with runner-up carlos moya. moya  spain s davis cup final hero in their recent win over the us  had to retire with an ankle injury in the first set of the final.  tournament director craig watson said:  i had a message relayed to me from him after spain s davis cup victory  saying he was looking forward to trying to make up for his disappointment in the (2004) final. the tournament will take plac