In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
newgroups_data = fetch_20newsgroups()

In [4]:
newgroups_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [5]:
print(newgroups_data.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`~sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features      

In [6]:
newgroups_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [7]:
newgroups_data.data[3]

'From: jgreen@amber (Joe Green)\nSubject: Re: Weitek P9000 ?\nOrganization: Harris Computer Systems Division\nLines: 14\nDistribution: world\nNNTP-Posting-Host: amber.ssd.csd.harris.com\nX-Newsreader: TIN [version 1.1 PL9]\n\nRobert J.C. Kyanko (rob@rjck.UUCP) wrote:\n> abraxis@iastate.edu writes in article <abraxis.734340159@class1.iastate.edu>:\n> > Anyone know about the Weitek P9000 graphics chip?\n> As far as the low-level stuff goes, it looks pretty nice.  It\'s got this\n> quadrilateral fill command that requires just the four points.\n\nDo you have Weitek\'s address/phone number?  I\'d like to get some information\nabout this chip.\n\n--\nJoe Green\t\t\t\tHarris Corporation\njgreen@csd.harris.com\t\t\tComputer Systems Division\n"The only thing that really scares me is a person with no sense of humor."\n\t\t\t\t\t\t-- Jonathan Winters\n'

In [8]:
newgroups_data.target[4]

14

In [9]:
np.unique(newgroups_data.target)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [10]:
len(newgroups_data.data),len(newgroups_data.target)

(11314, 11314)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
tfidf_vect = TfidfVectorizer(stop_words='english')

newgroups_data_transformed = tfidf_vect.fit_transform(newgroups_data.data)

In [13]:
newgroups_data_transformed.shape

(11314, 129796)

In [14]:
len(tfidf_vect.get_feature_names())

129796

In [15]:
import random

random.sample(tfidf_vect.vocabulary_.items(),10)

[('sadists', 104095),
 ('currely', 44044),
 ('ein', 50773),
 ('shul', 106920),
 ('striking', 110831),
 ('csp1dwd', 43766),
 ('8je', 21077),
 ('bitched', 33484),
 ('macplus', 78579),
 ('myopia', 85268)]

In [16]:
print(newgroups_data_transformed[0])

  (0, 86416)	0.14330464297977982
  (0, 35135)	0.10188109676312235
  (0, 65968)	0.10658183340971177
  (0, 114195)	0.06002582888934523
  (0, 78809)	0.06524029473980168
  (0, 76578)	0.0752490171119318
  (0, 57203)	0.16977226500364592
  (0, 67023)	0.07965653370342658
  (0, 63238)	0.09086750717799585
  (0, 95944)	0.11792442679286105
  (0, 127721)	0.0660283455431985
  (0, 109044)	0.11811852219269026
  (0, 51651)	0.10581100308545811
  (0, 83103)	0.09633120317294654
  (0, 113755)	0.1926949257821117
  (0, 73061)	0.04662587301170703
  (0, 34131)	0.09493746671845804
  (0, 101175)	0.08899924936054199
  (0, 105907)	0.10749912859686628
  (0, 35560)	0.1446512460011004
  (0, 26070)	0.10385185139503332
  (0, 108033)	0.08197182211166716
  (0, 99619)	0.06171903092868097
  (0, 48552)	0.1263844988551673
  (0, 34943)	0.18203649549572573
  :	:
  (0, 76574)	0.09842306773884467
  (0, 109354)	0.11773212031617089
  (0, 48550)	0.10908149802523066
  (0, 45232)	0.07212208178051426
  (0, 104609)	0.09217540920934716


In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(newgroups_data_transformed,newgroups_data.target,shuffle=True,test_size=0.2)

In [19]:
X_train.shape,y_train.shape

((9051, 129796), (9051,))

In [20]:
X_test.shape, y_test.shape

((2263, 129796), (2263,))

In [21]:
from sklearn.neural_network import MLPClassifier

In [22]:
mlp_clf = MLPClassifier(activation='relu',hidden_layer_sizes=(32,),solver='adam',verbose=True,max_iter=50)

In [23]:
mlp_clf.fit(X_train,y_train)

Iteration 1, loss = 2.90897430
Iteration 2, loss = 2.52674594
Iteration 3, loss = 2.01002200
Iteration 4, loss = 1.46193632
Iteration 5, loss = 0.99636251
Iteration 6, loss = 0.66804633
Iteration 7, loss = 0.45732917
Iteration 8, loss = 0.32390167
Iteration 9, loss = 0.23769358
Iteration 10, loss = 0.18014943
Iteration 11, loss = 0.14027991
Iteration 12, loss = 0.11192833
Iteration 13, loss = 0.09133433
Iteration 14, loss = 0.07588974
Iteration 15, loss = 0.06418386
Iteration 16, loss = 0.05505808
Iteration 17, loss = 0.04786635
Iteration 18, loss = 0.04214089
Iteration 19, loss = 0.03748547
Iteration 20, loss = 0.03372857
Iteration 21, loss = 0.03057891
Iteration 22, loss = 0.02790185
Iteration 23, loss = 0.02570801
Iteration 24, loss = 0.02373562
Iteration 25, loss = 0.02209489
Iteration 26, loss = 0.02063907
Iteration 27, loss = 0.01939770
Iteration 28, loss = 0.01829000
Iteration 29, loss = 0.01738451
Iteration 30, loss = 0.01648222
Iteration 31, loss = 0.01578403
Iteration 32, los

MLPClassifier(hidden_layer_sizes=(32,), max_iter=50, verbose=True)

In [24]:
y_pred = mlp_clf.predict(X_test)

In [25]:
pred_results = pd.DataFrame({'y_test':y_test,'y_pred':y_pred})

pred_results.sample(10)

Unnamed: 0,y_test,y_pred
2244,8,8
1390,15,15
37,14,14
2185,6,6
1195,0,0
1436,4,4
1643,12,12
64,7,7
102,17,17
543,12,12


In [26]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test,y_pred)

0.9297392841361025