In [6]:
# 必要なライブラリのインポート
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint

# Load the train and test subsets of the 20 Newsgroups dataset
# (the same random_state is passed to both calls).
train_set = fetch_20newsgroups(subset='train', random_state=42)
test_set = fetch_20newsgroups(subset='test', random_state=42)

# Split each subset into its raw texts (X) and its category labels (y)
X_train, y_train = train_set.data, train_set.target
X_test, y_test = test_set.data, test_set.target

# Show the category names, then the first training article and its label
print('カテゴリ一覧')
pprint(train_set.target_names)
print('\n')
print('記事その1')
print(f'News Script:\n{X_train[0]}')
print('記事その1のカテゴリ')
print(f'Text Category label: {y_train[0]}')

カテゴリ一覧
['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


記事その1
News Script:
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this ca

In [9]:
# 必要なライブラリのインポート
from sklearn.feature_extraction.text import CountVectorizer

# Convert the texts to a bag-of-words (BoW) representation.
# The vocabulary is learned from the training texts only; the fitted
# vectorizer is then applied to both the training and the test texts.
vectorizer = CountVectorizer(stop_words='english').fit(X_train)
X_train_bow, X_test_bow = (
    vectorizer.transform(texts) for texts in (X_train, X_test)
)

# Display the BoW vector of the first training document:
# first in sparse form, then as a dense array.
first_doc_bow = X_train_bow[0]
print('(テキスト番号, 単語番号)  出現回数')
print(first_doc_bow)
print('\nBoW表現ベクトル')
print(first_doc_bow.toarray())

(テキスト番号, 単語番号)  出現回数
  (0, 4605)	1
  (0, 16574)	1
  (0, 18299)	1
  (0, 26070)	1
  (0, 34131)	1
  (0, 34943)	1
  (0, 35135)	1
  (0, 35560)	1
  (0, 37378)	1
  (0, 37722)	5
  (0, 40939)	1
  (0, 45232)	1
  (0, 48550)	1
  (0, 48552)	1
  (0, 50039)	1
  (0, 50455)	2
  (0, 51651)	1
  (0, 51714)	1
  (0, 57203)	1
  (0, 63238)	1
  (0, 63970)	1
  (0, 65968)	1
  (0, 67023)	1
  (0, 73061)	1
  (0, 74552)	1
  :	:
  (0, 79519)	1
  (0, 83103)	1
  (0, 86416)	1
  (0, 87451)	1
  (0, 90192)	1
  (0, 91885)	1
  (0, 94962)	1
  (0, 95944)	1
  (0, 98748)	1
  (0, 99619)	1
  (0, 101175)	1
  (0, 104609)	1
  (0, 105907)	1
  (0, 108033)	1
  (0, 109044)	1
  (0, 109354)	1
  (0, 111094)	1
  (0, 113755)	1
  (0, 114195)	1
  (0, 114439)	1
  (0, 118013)	2
  (0, 118714)	1
  (0, 122887)	2
  (0, 124627)	1
  (0, 127721)	1

BoW表現ベクトル
[[0 0 0 ... 0 0 0]]


In [10]:
# 必要なライブラリのインポート
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes classifier (alpha=0.4) on the training BoW matrix
mnb = MultinomialNB(alpha=0.4).fit(X_train_bow, y_train)

# Report accuracy on the training and test sets
print(f'Train Accuracy: {mnb.score(X_train_bow, y_train):.3f}')
print(f'Test Accuracy: {mnb.score(X_test_bow, y_test):.3f}')

Train Accuracy: 0.951
Test Accuracy: 0.811


In [11]:
# Refit the classifier with a very small and a very large alpha to compare
# how the smoothing parameter changes train/test accuracy.
# A single loop replaces the previously copy-pasted fit/score code, so adding
# another alpha only means extending the tuple below.
alpha_models = {
    alpha: MultinomialNB(alpha=alpha).fit(X_train_bow, y_train)
    for alpha in (0.001, 100)
}

# Keep the original variable names so any later cell that uses them still works
mnb_small, mnb_large = alpha_models[0.001], alpha_models[100]

# Report accuracy for each alpha (train first, then test)
for alpha, model in alpha_models.items():
    print(f'Train Accuracy(alpha={alpha}): {model.score(X_train_bow, y_train):.3f}')
    print(f'Test Accuracy(alpha={alpha}): {model.score(X_test_bow, y_test):.3f}')


Train Accuracy(alpha=0.001): 0.988
Test Accuracy(alpha=0.001): 0.799
Train Accuracy(alpha=100): 0.747
Test Accuracy(alpha=100): 0.632
