In [1]:
import sklearn

import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
newsgroups_data = fetch_20newsgroups()

In [4]:
newsgroups_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [5]:
print(newsgroups_data.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features       

In [6]:
newsgroups_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [7]:
print(newsgroups_data.data[4])

From: jcm@head-cfa.harvard.edu (Jonathan McDowell)
Subject: Re: Shuttle Launch Question
Organization: Smithsonian Astrophysical Observatory, Cambridge, MA,  USA
Distribution: sci
Lines: 23

From article <C5owCB.n3p@world.std.com>, by tombaker@world.std.com (Tom A Baker):
>>In article <C5JLwx.4H9.1@cs.cmu.edu>, ETRAT@ttacs1.ttu.edu (Pack Rat) writes...
>>>errors. ...".  I am wondering what an "expected error" might
>>>be.  Sorry if this is a really dumb question, but
> 
> Parity errors in memory or previously known conditions that were waivered.
>    "Yes that is an error, but we already knew about it"
> I'd be curious as to what the real meaning of the quote is.
> 
> tom


My understanding is that the 'expected errors' are basically
that don't have the right values in yet because they aren't
set till after launch, and suchlike. Rather than fix the code
and possibly introduce new bugs, they just tell the crew

 - Jonathan





In [8]:
print(newsgroups_data.target[4])

14


In [9]:
np.unique(newsgroups_data.target)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [10]:
len(newsgroups_data.data), len(newsgroups_data.target)

(11314, 11314)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
tfidf_vect = TfidfVectorizer(stop_words='english')

newsgroups_data_transformed = tfidf_vect.fit_transform(newsgroups_data.data)

In [13]:
newsgroups_data_transformed.shape

(11314, 129796)

In [14]:
len(tfidf_vect.get_feature_names())

129796

In [15]:
import random

random.sample(tfidf_vect.vocabulary_.items(), 10)

[('2s1ns', 11138),
 ('coudln', 42809),
 ('cdrom2', 38429),
 ('thucydides', 114667),
 ('consignment', 42032),
 ('912', 21581),
 ('16wm', 5459),
 ('2lwie', 10940),
 ('gratitude', 60092),
 ('zb', 128790)]

In [16]:
print(newsgroups_data_transformed[0])

  (0, 75215)	0.38538985156422345
  (0, 122887)	0.282869751755441
  (0, 118013)	0.23076236589534987
  (0, 50455)	0.05948476266845307
  (0, 114439)	0.06768238878777005
  (0, 111094)	0.020865105019220037
  (0, 37722)	0.41534653529092685
  (0, 87451)	0.03885306291479392
  (0, 94962)	0.03754552571724598
  (0, 63970)	0.03857974543636419
  (0, 98748)	0.17501596694257227
  (0, 90192)	0.021706106200820422
  (0, 118714)	0.04039328791909072
  (0, 79519)	0.11911704310036365
  (0, 40939)	0.08497090499024601
  (0, 91885)	0.10797335594250271
  (0, 75888)	0.020933445618156278
  (0, 4605)	0.06897342558445459
  (0, 124627)	0.0967471326603278
  (0, 51714)	0.1460907895102532
  (0, 104609)	0.09217540920934718
  (0, 45232)	0.07212208178051426
  (0, 48550)	0.10908149802523068
  (0, 109354)	0.1177321203161709
  (0, 76574)	0.09842306773884468
  :	:
  (0, 34943)	0.18203649549572576
  (0, 48552)	0.12638449885516734
  (0, 99619)	0.061719030928680974
  (0, 108033)	0.08197182211166718
  (0, 26070)	0.103851851395033

In [17]:
from sklearn.model_selection import train_test_split

In [36]:
x_train, x_test, y_train, y_test = train_test_split(newsgroups_data_transformed, 
                                                    newsgroups_data.target, 
                                                    shuffle=True,
                                                    test_size = 0.2)

In [19]:
x_train.shape, y_train.shape

((9051, 129796), (9051,))

In [20]:
x_test.shape, y_test.shape

((2263, 129796), (2263,))

In [21]:
from sklearn.neural_network import MLPClassifier

In [29]:
mlp_clf = MLPClassifier(activation = 'relu',
                    hidden_layer_sizes= (32,), 
                    solver='adam', 
                    verbose=True,
                    max_iter=50)

In [30]:
mlp_clf.fit(x_train, y_train)

Iteration 1, loss = 2.90795433
Iteration 2, loss = 2.53222252
Iteration 3, loss = 2.00954034
Iteration 4, loss = 1.46034655
Iteration 5, loss = 1.00528596
Iteration 6, loss = 0.68470195
Iteration 7, loss = 0.47570613
Iteration 8, loss = 0.34082672
Iteration 9, loss = 0.25198263
Iteration 10, loss = 0.19165515
Iteration 11, loss = 0.14967723
Iteration 12, loss = 0.11948682
Iteration 13, loss = 0.09746897
Iteration 14, loss = 0.08100922
Iteration 15, loss = 0.06838114
Iteration 16, loss = 0.05854348
Iteration 17, loss = 0.05085615
Iteration 18, loss = 0.04466532
Iteration 19, loss = 0.03971831
Iteration 20, loss = 0.03562180
Iteration 21, loss = 0.03222111
Iteration 22, loss = 0.02937627
Iteration 23, loss = 0.02696730
Iteration 24, loss = 0.02493335
Iteration 25, loss = 0.02320549
Iteration 26, loss = 0.02169038
Iteration 27, loss = 0.02030993
Iteration 28, loss = 0.01915091
Iteration 29, loss = 0.01814736
Iteration 30, loss = 0.01713224
Iteration 31, loss = 0.01643001
Iteration 32, los



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(32,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=50, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=True, warm_start=False)

In [31]:
y_pred = mlp_clf.predict(x_test)

In [32]:
pred_results = pd.DataFrame({'y_test': y_test,
                             'y_pred': y_pred})

pred_results.sample(10)

Unnamed: 0,y_test,y_pred
1185,12,12
1264,15,15
520,17,17
766,0,0
552,6,6
1980,13,13
2109,13,13
1415,12,12
254,6,6
2071,18,18


In [33]:
newsgroups_data.target_names[11]

'sci.crypt'

In [34]:
newsgroups_data_crosstab = pd.crosstab(pred_results.y_test, pred_results.y_pred)

newsgroups_data_crosstab

y_pred,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
y_test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,91,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
1,0,110,6,5,0,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0
2,0,1,94,4,1,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,2,2,95,8,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,1,2,2,102,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,6,2,0,1,103,1,1,0,0,0,1,0,0,0,0,0,0,0,0
6,0,1,2,4,1,0,111,5,0,0,1,0,4,0,0,0,0,0,0,0
7,0,0,0,0,0,0,1,112,1,1,1,0,2,0,1,0,0,0,1,0
8,0,0,0,0,0,0,0,0,107,0,0,0,0,1,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,1,112,1,0,1,0,0,0,0,0,0,0


In [35]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.931064958020327