# Practice Problem 8

##  Implement Naïve Bayes theorem to classify the English text

In [None]:
pip install scikit-learn

In [6]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the 20 Newsgroups dataset (fetch the full dataset first)
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# Take a smaller subset of the dataset for quicker execution
X = newsgroups.data[:500]  # Use the first 500 documents
y = newsgroups.target[:500]  # Corresponding labels

# Convert text data into numerical format using CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X_vectorized = vectorizer.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.3, random_state=42)

# Train the Naïve Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=newsgroups.target_names))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 31.33%

Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.25      0.09      0.13        11
           comp.graphics       0.40      0.25      0.31         8
 comp.os.ms-windows.misc       0.27      0.40      0.32        10
comp.sys.ibm.pc.hardware       0.42      0.50      0.45        10
   comp.sys.mac.hardware       0.21      0.60      0.32         5
          comp.windows.x       0.43      0.38      0.40         8
            misc.forsale       0.67      0.80      0.73         5
               rec.autos       0.50      0.25      0.33         8
         rec.motorcycles       0.50      0.12      0.20         8
      rec.sport.baseball       0.50      0.14      0.22         7
        rec.sport.hockey       0.83      0.45      0.59        11
               sci.crypt       0.25      0.12      0.17         8
         sci.electronics       0.40      0.33      0.36         6
                 sci.med       1.0