In [2]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

# Use a smaller number of samples for faster run time
n_samples = 10000

raw_texts = fetch_20newsgroups('all')


print('\n\n-------------- example training document from 20 newsgroups dataset --------------')
print('\ndocument contents:\n')
print(raw_texts.data[0])

target_index = raw_texts.target[0]
print("\nassociated label: {}\n\n".format(raw_texts.target_names[target_index]))

print('-------------- class names --------------\n')
for x in range(len(raw_texts.target_names)):
  print("{}".format(raw_texts.target_names[x]))

# Learn the vocabulary dictionary and create the word count vectors.  (This is the feature extraction step.)
count_vectorizer = CountVectorizer()
X = count_vectorizer.fit_transform(raw_texts.data)
y = raw_texts.target

# slice in order to use a subset of the total data
X = X[:n_samples]  
y = y[:n_samples]


# split into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y, test_size=0.1)

print('\n\n\n-------------- training data and test data --------------\n')
print("Number of samples:\n  X_train: {}\n  y_train: {}\n  X_test:  {}\n  y_test:  {}\n".format(X_train.shape[0],
                                                                                                y_train.shape[0],
                                                                                                X_test.shape[0],
                                                                                                y_test.shape[0]))

train_samples, n_features = X_train.shape
n_classes = np.unique(y).shape[0]

print("Number of features:       {}\n".format(n_features))
print("Number of unique classes: {}\n\n".format(n_classes))

# Train the classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)

# How well did we do?
score = clf.score(X_test, y_test)

print('-------------- Multinomial Naive Bayes prediction results --------------\n')
print("score: {}\n".format(score))


# Try to predict the category of a new document
documents_to_be_classified = [
    'A lunar rover or Moon rover is a space exploration vehicle (rover) designed to move across the surface of the Moon. Some rovers have been designed to transport members of a human spaceflight crew, such as the U.S. Apollo program''s Lunar Roving Vehicle; others have been partially or fully autonomous robots, such as Soviet Lunokhods and Chinese Yutu. Three countries have had rovers on the Moon: the Soviet Union, the United States and China; Japan and India currently have planned missions.'
]

# Extract the features from our newly-introduced document
X_doc = count_vectorizer.transform(documents_to_be_classified)

predicted_label_index = clf.predict(X_doc)

print('\nnew document contents:\n')
print(documents_to_be_classified[0])
print("\npredicted label: {}".format(raw_texts.target_names[predicted_label_index[0]]))




Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)




-------------- example training document from 20 newsgroups dataset --------------

document contents:

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----






associated label: rec.autos


-------------- class names --------------

alt.atheism
comp.graphics
comp.os.ms-windows.misc
comp.sys.ibm.pc.hardware
comp.sys.mac.hardware
c