In [None]:
!pip install keras
!pip install tensorflow
!pip install gensim

In [2]:
import pandas as pd
import numpy as np
from sklearn import datasets
from keras.utils import np_utils

# Load the 20 Newsgroups training split once. `fetch_20newsgroups` defaults to
# subset='train' with every category, so a single call yields both the full category
# list and the training documents (previously the same split was loaded twice).
newsgroups_train = datasets.fetch_20newsgroups(subset='train')
categories = newsgroups_train.target_names

# Kept as an alias for any cell that still refers to the original name.
news_groups = newsgroups_train

# One row per document: the raw message text and its integer class label.
train_df = pd.DataFrame({'text': newsgroups_train.data, 'target': newsgroups_train.target})

# Preview every 1440th row as a small sample spread across the dataset.
train_df[::1440]

Unnamed: 0,text,target
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7
1440,From: sandvik@newton.apple.com (Kent Sandvik)\...,19
2880,From: thad@cup.portal.com (Thad P Floryan)\nSu...,6
4320,From: guyd@austin.ibm.com (Guy Dawson)\nSubjec...,2
5760,From: kxgst1+@pitt.edu (Kenneth Gilbert)\nSubj...,13
7200,From: julie@eddie.jpl.nasa.gov (Julie Kangas)\...,18
8640,From: edm@twisto.compaq.com (Ed McCreary)\nSub...,0
10080,From: sieferme@stein.u.washington.edu (Eric Si...,0


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF representation of the training corpus: the vocabulary and IDF
# weights are learned from, and applied to, the training documents in a single call.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(raw_documents=newsgroups_train.data)

# (number of documents, number of vocabulary terms)
vectors.shape

(11314, 130107)

## Naive Bayes classifier

In [4]:
from sklearn.naive_bayes import MultinomialNB

# Fetch the held-out test split for the same categories, then project it into the
# TF-IDF space learned from the training data (transform only -- no refitting).
newsgroups_test = datasets.fetch_20newsgroups(categories=categories, subset='test')
vectors_test = vectorizer.transform(raw_documents=newsgroups_test.data)

In [5]:
# Train a multinomial Naive Bayes classifier (smoothing alpha = 0.01) on the TF-IDF
# training matrix. `fit` returns the estimator itself, so the call can be chained;
# the trailing expression keeps the estimator repr as the cell output.
clf = MultinomialNB(alpha=0.01).fit(X=vectors, y=newsgroups_train.target)
clf

MultinomialNB(alpha=0.01)

In [6]:
# Predicted class index for every test document
predict_y = clf.predict(X=vectors_test)
predict_y

array([ 7, 11,  0, ...,  9,  3, 15])

In [7]:
# Ground-truth class labels of the test split
Y_index = newsgroups_test.target
Y_index

array([ 7,  5,  0, ...,  9,  6, 15])

In [8]:
# Count the test documents whose predicted label matches / differs from the true label.
is_correct = predict_y == Y_index
print('True:', np.count_nonzero(is_correct))
print('False:', np.count_nonzero(~is_correct))

True: 6291
False: 1241


In [9]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 / support of the predictions against the true labels
print(classification_report(y_true=Y_index, y_pred=predict_y))

              precision    recall  f1-score   support

           0       0.82      0.78      0.80       319
           1       0.69      0.75      0.72       389
           2       0.74      0.63      0.68       394
           3       0.65      0.75      0.69       392
           4       0.83      0.84      0.83       385
           5       0.84      0.78      0.81       395
           6       0.82      0.78      0.80       390
           7       0.89      0.90      0.90       396
           8       0.93      0.96      0.95       398
           9       0.95      0.94      0.95       397
          10       0.95      0.97      0.96       399
          11       0.89      0.93      0.91       396
          12       0.79      0.77      0.78       393
          13       0.89      0.84      0.86       396
          14       0.87      0.91      0.89       394
          15       0.82      0.95      0.88       398
          16       0.76      0.91      0.83       364
          17       0.97    

## Decision Tree Model

In [10]:
from sklearn.tree import DecisionTreeClassifier

# Fit a single decision tree on the TF-IDF training matrix, with a fixed seed so the
# result is reproducible. This rebinds `clf`, replacing the Naive Bayes model fitted
# earlier; the trailing expression keeps the estimator repr as the cell output.
clf = DecisionTreeClassifier(random_state=144).fit(X=vectors, y=newsgroups_train.target)
clf

DecisionTreeClassifier(random_state=144)

In [11]:
# Predicted class index for every test document, from the classifier currently bound to `clf`
predict_y = clf.predict(X=vectors_test)
predict_y

array([ 4, 12, 15, ..., 12, 12, 15])

In [12]:
# Ground-truth class labels of the test split (re-assigns the same array as the earlier cell)
Y_index = newsgroups_test.target
Y_index

array([ 7,  5,  0, ...,  9,  6, 15])

In [13]:
# Count the test documents whose predicted label matches / differs from the true label.
is_correct = predict_y == Y_index
print('True:', np.count_nonzero(is_correct))
print('False:', np.count_nonzero(~is_correct))

True: 4163
False: 3369


In [14]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 / support of the predictions against the true labels
print(classification_report(y_true=Y_index, y_pred=predict_y))

              precision    recall  f1-score   support

           0       0.48      0.47      0.48       319
           1       0.41      0.43      0.42       389
           2       0.52      0.55      0.53       394
           3       0.44      0.41      0.43       392
           4       0.53      0.59      0.56       385
           5       0.51      0.46      0.49       395
           6       0.69      0.72      0.70       390
           7       0.61      0.57      0.59       396
           8       0.73      0.74      0.73       398
           9       0.54      0.53      0.53       397
          10       0.65      0.67      0.66       399
          11       0.77      0.69      0.73       396
          12       0.34      0.36      0.35       393
          13       0.54      0.44      0.49       396
          14       0.66      0.64      0.65       394
          15       0.67      0.73      0.70       398
          16       0.48      0.61      0.54       364
          17       0.74    