In [None]:
# Import necessary libraries
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Restrict the corpus to the two sports newsgroups we want to tell apart.
sport_groups = ['rec.sport.baseball', 'rec.sport.hockey']

# subset='all' pulls both the predefined train and test portions; the fixed
# random_state keeps the shuffled document order reproducible between runs.
newsgroups = fetch_20newsgroups(
    subset='all', categories=sport_groups, shuffle=True, random_state=42
)

# Labels: 0 (baseball), 1 (hockey)
y = newsgroups.target

# Split the RAW documents first (80% train, 20% test). Splitting before any
# feature extraction guarantees that nothing learned from the test documents
# (vocabulary, document frequencies) can leak into training.
docs_train, docs_test, y_train, y_test = train_test_split(
    newsgroups.data, y, test_size=0.2, random_state=42
)

# Learn the TF-IDF vocabulary and IDF weights from the training documents only,
# then apply that fitted transformation unchanged to the held-out documents.
vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)
X_train = vectorizer.fit_transform(docs_train)  # fit + transform on train
X_test = vectorizer.transform(docs_test)        # transform only on test

# Full feature matrix, kept so that any later code referencing `X` still works.
# It is produced by the train-fitted vectorizer, so it carries no test-set
# statistics either.
X = vectorizer.transform(newsgroups.data)

# Fit a multinomial Naive Bayes classifier on the training features.
# fit() returns the estimator itself, so construction and training can be chained.
model = MultinomialNB().fit(X_train, y_train)

# Classify the held-out documents and measure the fraction labelled correctly.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Report the accuracy rounded to two decimal places.
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.96
