In [10]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to four newsgroups so the example stays small.
categories = [
    "alt.atheism",
    "soc.religion.christian",
    "comp.graphics",
    "sci.med",
]

# Training split only; shuffle with a fixed seed so the document order
# (and therefore the previews below) is reproducible.
data = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Category names, indexed by the integer label stored in `data.target`.
print(data.target_names, '\n')

# Integer labels of the first ten documents.
print(data.target[:10], '\n')

# First three lines of the first document as a sample of the raw text.
# A bounded split avoids cutting the whole post into lines just to keep three.
sample_lines = data.data[0].split("\n", 3)[:3]
print(*sample_lines, sep="\n")

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian'] 

[1 1 3 3 3 3 3 2 2 2] 

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


In [11]:
# Translate the first ten integer labels back into their newsgroup names,
# then print one name per line.
first_label_names = [data.target_names[label_id] for label_id in data.target[:10]]
for label_name in first_label_names:
  print(label_name)

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


In [15]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training posts and build the
# document-term count matrix in a single pass. `count_vect` stays fitted
# so that later cells can vectorize new text with the same vocabulary.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(data.data)

# Sanity check on the matrix dimensions: (rows, columns).
n_rows, n_cols = X_train_counts.shape
print((n_rows, n_cols))

(2257, 35788)


In [20]:
from sklearn.feature_extraction.text import TfidfTransformer

# IDF reweighting is switched off (use_idf=False), so despite the class name
# this step produces term-frequency features rather than full tf-idf.
tfidf_transformer = TfidfTransformer(use_idf=False)

# Fit and transform in one call; `tfidf_transformer` remains fitted for
# reuse on unseen documents.
X_train_tf = tfidf_transformer.fit_transform(X_train_counts)

# The transform rescales values only, so the shape should match the counts.
print(X_train_tf.shape)

(2257, 35788)


In [21]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial naive Bayes classifier on the term-frequency features.
clf = MultinomialNB()
clf.fit(X_train_tf, data.target)

# Two unseen snippets to classify.
docs_new = [
  'God is love',
  'OpenGL on the GPU is fast',
]

# New text must pass through the *already fitted* vectorizer and transformer
# (transform only, never fit) so its columns line up with the training matrix.
X_new_counts = count_vect.transform(docs_new)
# The name says "tfidf", but the transformer was built with use_idf=False,
# so these are term-frequency features.
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

# Report each snippet next to the name of its predicted newsgroup.
for position, doc in enumerate(docs_new):
  cat = predicted[position]
  print(f'{doc} => {data.target_names[cat]}')

God is love => soc.religion.christian
OpenGL on the GPU is fast => comp.graphics
