In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline


In [2]:
# The four newsgroups this notebook learns to tell apart.
categories = [
    'rec.motorcycles',
    'sci.electronics',
    'comp.graphics',
    'sci.med',
]

# Load the ready-made training split, restricted to those newsgroups.
# The fixed random_state keeps the shuffled order repeatable.
train_data = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Label names; the integer targets index into this list.
print(train_data.target_names)

# Peek at the first post: its first three lines, then its newsgroup.
first_post_lines = train_data.data[0].split("\n")
print("\n".join(first_post_lines[:3]))
print(train_data.target_names[train_data.target[0]])

# Newsgroups of the first ten training posts, one per line.
for label_index in train_data.target[:10]:
    print(train_data.target_names[label_index])



['comp.graphics', 'rec.motorcycles', 'sci.electronics', 'sci.med']
From: kreyling@lds.loral.com (Ed Kreyling 6966)
Subject: Sun-os and 8bit ASCII graphics
Organization: Loral Data Systems
comp.graphics
comp.graphics
comp.graphics
rec.motorcycles
comp.graphics
sci.med
sci.electronics
sci.electronics
comp.graphics
rec.motorcycles
sci.electronics


In [3]:
# Step 1 - bag of words: CountVectorizer learns a vocabulary from the training
# posts and encodes every post as a row of token counts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_data.data)

# Step 2 - tf-idf: TfidfTransformer turns the raw count matrix into a
# normalized tf-idf representation with the same shape.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)


In [4]:
# k-nearest-neighbours classifier that looks at the 7 closest training posts.
knn = KNeighborsClassifier(n_neighbors=7)

# Fit on the tf-idf features; train_data.target holds the integer label
# assigned to each training post.
# fit() returns the estimator itself, so `clf` is a second name for `knn`.
knn.fit(X_train_tfidf, train_data.target)
clf = knn

# Two hand-written sentences to classify.
docs_new = [
    'I have a Harley Davidson and Yamaha.',
    'I have a GTX 1050 GPU',
]

# Encode them with the vectorizer and transformer that were already fitted on
# the training posts: transform() only, never fit_transform(), so the new
# rows use the training vocabulary and weights.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)



In [5]:
# predict() yields one integer label per input sentence.
predicted = clf.predict(X_new_tfidf)

# Show each sentence next to the newsgroup name of its predicted label.
for position, doc in enumerate(docs_new):
    category = predicted[position]
    print('%r => %s' % (doc, train_data.target_names[category]))


    

'I have a Harley Davidson and Yamaha.' => rec.motorcycles
'I have a GTX 1050 GPU' => sci.med


In [6]:
# Bundle vectorizer -> tf-idf -> classifier into one estimator so raw text can
# be passed straight to fit() and predict().
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # Build a new, unfitted classifier with the same hyper-parameters as `knn`.
    # Placing `knn` itself in the pipeline would make text_clf.fit() refit the
    # very object that `clf` refers to, silently changing the standalone model.
    ('clf', KNeighborsClassifier(**knn.get_params())),
])
# Fit every pipeline step on the raw training posts.
text_clf.fit(train_data.data, train_data.target)

# Load the test split for the same newsgroups.
test_data = fetch_20newsgroups(subset='test',
                               categories=categories, shuffle=True, random_state=42)
docs_test = test_data.data
# Classify the test posts and report the percentage labelled correctly.
predicted = text_clf.predict(docs_test)
print('We got an accuracy of',np.mean(predicted == test_data.target)*100, '% over the test data.')


We got an accuracy of 82.67766497461929 % over the test data.


## Multinomial Naive Bayes

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the tf-idf rows (with their labels) for evaluation; the
# fixed random_state makes the split repeatable.
# NOTE(review): X_train_tfidf comes from a vectorizer and tf-idf transformer
# fitted on every training post, so the held-out rows already contributed to
# the vocabulary and idf weights.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_tfidf,
    train_data.target,
    test_size=0.30,
    random_state=42,
)

for caption, split in (('Training Data Shape:', X_train), ('Testing Data Shape: ', X_test)):
    print(caption, split.shape)

Training Data Shape: (1656, 35653)
Testing Data Shape:  (711, 35653)


In [8]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with its default settings.
# NOTE(review): the `lr_` prefix is misleading for a naive Bayes model; the
# name is kept because later cells refer to it.
lr_model = MultinomialNB()
# Left as the cell's final expression so the fitted estimator is displayed.
lr_model.fit(X=X_train, y=y_train)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [9]:
# Predict a label for every row of the held-out split.
predictions = lr_model.predict(X_test)

In [10]:
from sklearn import metrics

# Confusion matrix for the held-out split: rows are true labels, columns are
# predicted labels.
confusion = metrics.confusion_matrix(y_true=y_test, y_pred=predictions)
print(confusion)

[[161   0   4   2]
 [  0 187   1   1]
 [  5   3 164   0]
 [  3   2   3 175]]


In [11]:
# Per-class precision, recall and F1 for the held-out split.
report = metrics.classification_report(y_true=y_test, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

           0       0.95      0.96      0.96       167
           1       0.97      0.99      0.98       189
           2       0.95      0.95      0.95       172
           3       0.98      0.96      0.97       183

    accuracy                           0.97       711
   macro avg       0.97      0.97      0.97       711
weighted avg       0.97      0.97      0.97       711



In [12]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=predictions)
print(accuracy)

0.9662447257383966


## Test the Accuracy of the Model

In [13]:
# Shape of the training part of the split (rows, feature columns).
X_train.shape

(1656, 35653)

In [14]:
# Shape of the full token-count matrix before the train/test split.
X_train_counts.shape

(2367, 35653)

In [16]:
# Shape of the full tf-idf matrix before the train/test split.
X_train_tfidf.shape

(2367, 35653)

In [20]:
# Vectorizer -> tf-idf -> classifier chained into a single estimator, ending
# in multinomial naive Bayes.
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(pipeline_steps)
# Fit every pipeline step on the raw training posts.
text_clf.fit(train_data.data, train_data.target)

# Load the test split for the same newsgroups.
test_data = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = test_data.data
# Classify the test posts and report the percentage labelled correctly.
predicted = text_clf.predict(docs_test)
accuracy_pct = np.mean(predicted == test_data.target)*100
print('We got an accuracy of', accuracy_pct, '% over the test data.')


We got an accuracy of 91.49746192893402 % over the test data.


## Gaussian Naive Bayes

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the tf-idf rows (with their labels) for evaluation; the
# fixed random_state makes the split repeatable.
# NOTE(review): X_train_tfidf comes from a vectorizer and tf-idf transformer
# fitted on every training post, so the held-out rows already contributed to
# the vocabulary and idf weights.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_tfidf,
    train_data.target,
    test_size=0.30,
    random_state=42,
)

for caption, split in (('Training Data Shape:', X_train), ('Testing Data Shape: ', X_test)):
    print(caption, split.shape)

Training Data Shape: (1656, 35653)
Testing Data Shape:  (711, 35653)


In [29]:
# Dense copy of the training features, used to fit GaussianNB.
# toarray() yields a plain numpy.ndarray; todense() yields a numpy.matrix,
# which recent scikit-learn releases refuse as estimator input.
# NOTE(review): despite the `y` prefix this holds features, not labels; the
# name is kept because a later cell refers to it.
y1 = X_train.toarray()

In [30]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with its default settings; this rebinds `lr_model`,
# replacing whatever model the name held before.
lr_model = GaussianNB()
# Left as the cell's final expression so the fitted estimator is displayed.
lr_model.fit(X=y1, y=y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [32]:
# Dense copy of the held-out features, used for GaussianNB prediction.
# toarray() yields a plain numpy.ndarray; todense() yields a numpy.matrix,
# which recent scikit-learn releases refuse as estimator input.
y2 = X_test.toarray()

In [33]:
# Predict a label for every row of the dense held-out features.
predictions = lr_model.predict(y2)

In [34]:
from sklearn import metrics

# Confusion matrix for the held-out split: rows are true labels, columns are
# predicted labels.
confusion = metrics.confusion_matrix(y_true=y_test, y_pred=predictions)
print(confusion)

[[150   1   9   7]
 [  4 177   3   5]
 [ 18   4 144   6]
 [  1   2   8 172]]


In [35]:
# Per-class precision, recall and F1 for the held-out split.
report = metrics.classification_report(y_true=y_test, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

           0       0.87      0.90      0.88       167
           1       0.96      0.94      0.95       189
           2       0.88      0.84      0.86       172
           3       0.91      0.94      0.92       183

    accuracy                           0.90       711
   macro avg       0.90      0.90      0.90       711
weighted avg       0.90      0.90      0.90       711



In [37]:
# Fraction of held-out rows whose predicted label matches the true label.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=predictions)
print(accuracy)

0.9043600562587905
