In [4]:
# Import required libraries (Q1)
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score

# Load the training and testing splits from their CSV files (Q2, Q3, Q4)
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("newsgroups_train.csv", "newsgroups_test.csv")
)

# Q5: every distinct category label, in order of first appearance
print("Q5 - Target Categories:", pd.unique(train_data['category']))

# Q9: one row per (numeric target, category name) pair found in the training set
print(
    "\nQ9 - Training Set Target Names:\n",
    train_data.loc[:, ['target', 'category']].drop_duplicates(),
)

# Q10: the 5th training article sits at positional index 4 (0-based)
print("\nQ10 - 5th Training Article:\n", train_data['text'].iloc[4])

# Q11: (rows, columns) of each split
print("\nQ11 - Training Data Shape:", train_data.shape)
print("Q11 - Testing Data Shape:", test_data.shape)

def _vectorize(vectorizer, train_texts, test_texts):
    """Fit *vectorizer* on the training texts and transform both splits.

    The vectorizer is fitted on ``train_texts`` only; ``test_texts`` are
    transformed with the already-fitted vectorizer, so nothing from the test
    split influences the learned features.

    Args:
        vectorizer: An unfitted text vectorizer exposing ``fit_transform``
            and ``transform`` (e.g. ``CountVectorizer``, ``TfidfVectorizer``).
        train_texts: Iterable of training documents.
        test_texts: Iterable of testing documents.

    Returns:
        A ``(X_train, X_test)`` tuple of document-term matrices.
    """
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)
    return X_train, X_test


def _fit_and_score(model, X_train, y_train, X_test, y_test):
    """Train *model* on the training split and evaluate it on the test split.

    Args:
        model: An unfitted classifier exposing ``fit`` and ``predict``.
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Testing feature matrix.
        y_test: True testing labels.

    Returns:
        A ``(y_pred, accuracy)`` tuple: the predicted test labels and the
        accuracy of those predictions against ``y_test``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return y_pred, accuracy_score(y_test, y_pred)


# The labels are identical for every experiment below.
y_train = train_data['target']
y_test = test_data['target']

# Convert training data to numerical format using CountVectorizer (Q13);
# the test data is converted with the same fitted vocabulary (Q15).
vectorizer = CountVectorizer()
X_train_counts, X_test_counts = _vectorize(
    vectorizer, train_data['text'], test_data['text']
)

# Train BernoulliNB (Q14), predict the test labels (Q16) and score them (Q17).
# NOTE: with its default ``binarize=0.0`` BernoulliNB thresholds the counts,
# so each feature is effectively "word present / absent".
bernoulli_nb = BernoulliNB()
y_pred_counts, accuracy_counts = _fit_and_score(
    bernoulli_nb, X_train_counts, y_train, X_test_counts, y_test
)
print("\nQ17 - Test Accuracy (CountVectorizer + BernoulliNB):", accuracy_counts)

# Use TfidfVectorizer and train MultinomialNB (Q18)
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf, X_test_tfidf = _vectorize(
    tfidf_vectorizer, train_data['text'], test_data['text']
)

# Predict and evaluate on test set (Q19)
multinomial_nb = MultinomialNB()
y_pred_tfidf, accuracy_tfidf = _fit_and_score(
    multinomial_nb, X_train_tfidf, y_train, X_test_tfidf, y_test
)
print("\nQ19 - Test Accuracy (TF-IDF + MultinomialNB):", accuracy_tfidf)

# Avoid stopwords and repeat the process (Q20)
tfidf_vectorizer_sw = TfidfVectorizer(stop_words='english')
X_train_tfidf_sw, X_test_tfidf_sw = _vectorize(
    tfidf_vectorizer_sw, train_data['text'], test_data['text']
)

# Predict and evaluate without stopwords
multinomial_nb_sw = MultinomialNB()
y_pred_tfidf_sw, accuracy_tfidf_sw = _fit_and_score(
    multinomial_nb_sw, X_train_tfidf_sw, y_train, X_test_tfidf_sw, y_test
)
print("\nQ20 - Test Accuracy (TF-IDF + MultinomialNB, No Stopwords):", accuracy_tfidf_sw)

Q5 - Target Categories: ['sci.space' 'comp.graphics' 'alt.atheism']

Q9 - Training Set Target Names:
    target       category
0       2      sci.space
1       1  comp.graphics
2       0    alt.atheism

Q10 - 5th Training Article:
 From: henry@zoo.toronto.edu (Henry Spencer)
Subject: Re: TRUE "GLOBE", Who makes it?
Organization: U of Toronto Zoology
Lines: 12

In article <bill.047m@xpresso.UUCP> bill@xpresso.UUCP (Bill Vance) writes:
>It has been known for quite a while that the earth is actually more pear
>shaped than globular/spherical.  Does anyone make a "globe" that is accurate
>as to actual shape, landmass configuration/Long/Lat lines etc.?

I don't think you're going to be able to see the differences from a sphere
unless they are greatly exaggerated.  Even the equatorial bulge is only
about 1 part in 300 -- you'd never notice a 1mm error in a 30cm globe --
and the other deviations from spherical shape are much smaller.
-- 
SVR4 resembles a high-speed collision   | Henry Spencer 