In [1]:
# Real Bernoulli Naive Bayes example on 20 Newsgroups dataset

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split



In [9]:
# 1. Load dataset (let’s use a few related categories for clarity)
categories = ['sci.space', 'talk.politics.misc', 'rec.sport.hockey']
newsgroups = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))

X_texts = newsgroups.data
y_labels = newsgroups.target

# 2. Convert text to binary word features
vectorizer = CountVectorizer(binary=True, stop_words='english', max_features=5000)
X = vectorizer.fit_transform(X_texts)

# 3. Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, y_labels, test_size=0.5, random_state=42)

# 4. Train Bernoulli Naive Bayes model
bnb = BernoulliNB()
bnb.fit(X_train, y_train)

# 5. Predict and evaluate
y_pred = bnb.predict(X_test)

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred, target_names=categories))

# 6. Try a new document
new_docs = [
    "NASA announced a new mission to Mars next year.",
    "The government passed a controversial policy.",
    "The hockey team won the final game 4-1!"
]
X_new = vectorizer.transform(new_docs)
preds = bnb.predict(X_new)

print("\nPredictions:")
for doc, cat in zip(new_docs, preds):
    print(f"  '{doc}' → {categories[cat]}")


✅ Accuracy: 0.8638667632150615

Classification report:
                     precision    recall  f1-score   support

         sci.space       0.79      0.98      0.88       485
talk.politics.misc       0.90      0.86      0.88       486
  rec.sport.hockey       0.95      0.73      0.83       410

          accuracy                           0.86      1381
         macro avg       0.88      0.86      0.86      1381
      weighted avg       0.88      0.86      0.86      1381


Predictions:
  'NASA announced a new mission to Mars next year.' → talk.politics.misc
  'The government passed a controversial policy.' → talk.politics.misc
  'The hockey team won the final game 4-1!' → sci.space
