In [39]:
import pandas as pd
from sklearn.linear_model import Perceptron
from sklearn.datasets import fetch_20newsgroups
import numpy as np
from sklearn import metrics

In [40]:
categories = ['alt.atheism','talk.religion.misc','comp.graphics','sci.space',]
remove = ('headers', 'footers', 'quotes')
data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=50,
                                remove=remove)

In [41]:
data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=50,
                               remove=remove)
print(data_test)

       'C:\\Users\\Janhavi\\scikit_learn_data\\20news_home\\20news-bydate-test\\talk.religion.misc\\84302',
       'C:\\Users\\Janhavi\\scikit_learn_data\\20news_home\\20news-bydate-test\\talk.religion.misc\\83911',
       ...,
       'C:\\Users\\Janhavi\\scikit_learn_data\\20news_home\\20news-bydate-test\\alt.atheism\\53645',
       'C:\\Users\\Janhavi\\scikit_learn_data\\20news_home\\20news-bydate-test\\comp.graphics\\39042',
       'C:\\Users\\Janhavi\\scikit_learn_data\\20news_home\\20news-bydate-test\\sci.space\\61301'],
      dtype='<U96'), 'target_names': ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc'], 'target': array([3, 3, 3, ..., 0, 1, 2], dtype=int32), 'DESCR': None, 'description': 'the 20 newsgroups by date dataset'}


In [42]:
y_train, y_test = data_train.target, data_test.target

print(y_train, y_test)

[2 3 2 ... 1 0 2] [3 3 3 ... 0 1 2]


In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(data_train.data)
X_test = vectorizer.transform(data_test.data)

In [44]:
weights = np.asarray(X_train.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': vectorizer.get_feature_names(), 'tf-idf': weights})
weights_df.sort_values(by='tf-idf')

Unnamed: 0,term,tf-idf
18542,pmjpeg11,0.000002
12646,imagefx,0.000002
1589,68020,0.000002
6413,compilations,0.000002
10793,fs,0.000002
822,25192,0.000002
24545,tt,0.000002
17066,nutterbrink,0.000002
228,1145040,0.000002
18807,praetzel,0.000002


In [45]:
classifier = Perceptron(max_iter=20)
model = classifier.fit(X_train, y_train)

In [46]:
obtained_y = model.predict(X_test)

In [47]:
metrics.confusion_matrix(y_test, obtained_y)

array([[193,  14,  24,  88],
       [ 10, 340,  23,  16],
       [ 23,  28, 314,  29],
       [ 64,  14,  17, 156]], dtype=int64)

In [48]:
print(metrics.classification_report(y_test, obtained_y))

             precision    recall  f1-score   support

          0       0.67      0.61      0.63       319
          1       0.86      0.87      0.87       389
          2       0.83      0.80      0.81       394
          3       0.54      0.62      0.58       251

avg / total       0.75      0.74      0.74      1353

