# 文章データの機械学習


### メールの文章のデータをもとに機械学習を行い、メールがスパムメールかを分類するプログラム。

#### ・※スパム・・・無差別送信されるジャンクメール

#### ・※ハム・・・スパムではないメールのこと

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import glob
import collections
from sklearn.feature_extraction import DictVectorizer

# データの読み込み

### ・easy_ham：普通のメール
### ・hard_ham：スパムと区別しづらい普通のメール
### ・spam：スパムメール
### ファイル名の先頭が「0」から始まるファイルを抽出しています。

In [27]:
easy_ham_files = glob.glob("./spam_ham/easy_ham/0*") #フォルダのファイルを列挙するには、glob.globを使う
hard_ham_files = glob.glob("./spam_ham/hard_ham/0*")
spam_files = glob.glob("./spam_ham/spam/0*")

In [28]:
print(len(easy_ham_files))
print(len(hard_ham_files))
print(len(spam_files))

2500
250
500


In [5]:
filenames = easy_ham_files + hard_ham_files + spam_files
len(filenames)

3250

In [6]:
ENCODING = 'latin-1'

In [30]:
with open(filenames[0], 'r', encoding=ENCODING) as f: #最初のファイルの内容を表示
    text = f.read()
    print(text)

From pudge@perl.org  Wed Sep 18 11:50:50 2002
Return-Path: <pudge@perl.org>
Delivered-To: yyyy@localhost.spamassassin.taint.org
Received: from localhost (jalapeno [127.0.0.1])
	by jmason.org (Postfix) with ESMTP id 30EA616F03
	for <jm@localhost>; Wed, 18 Sep 2002 11:50:50 +0100 (IST)
Received: from jalapeno [127.0.0.1]
	by localhost with IMAP (fetchmail-5.9.0)
	for jm@localhost (single-drop); Wed, 18 Sep 2002 11:50:50 +0100 (IST)
Received: from cpu59.osdn.com (slashdot.org [64.28.67.73] (may be forged))
    by dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g8I205C06815 for
    <jm-use-perl@jmason.org>; Wed, 18 Sep 2002 03:00:05 +0100
Received: from [10.2.181.14] (helo=perl.org) by cpu59.osdn.com with smtp
    (Exim 3.35 #1 (Debian)) id 17rU62-00061F-00 for <jm-use-perl@jmason.org>;
    Tue, 17 Sep 2002 21:57:42 -0400
Date: Wed, 18 Sep 2002 02:00:22 +0000
From: pudge@perl.org
Subject: [use Perl] Headlines for 2002-09-18
To: yyyy-use-perl@spamassassin.taint.org
Precedence: list
X-Bulk

In [31]:
with open(filenames[-1], 'r', encoding=ENCODING) as f: #最後のファイルの内容を表示
    text = f.read()
    print(text)

From vinnet@mail.com  Wed Sep 25 17:22:05 2002
Return-Path: <vinnet@mail.com>
Delivered-To: zzzz@localhost.spamassassin.taint.org
Received: from localhost (jalapeno [127.0.0.1])
	by zzzzason.org (Postfix) with ESMTP id AD14B16F03
	for <zzzz@localhost>; Wed, 25 Sep 2002 17:22:04 +0100 (IST)
Received: from jalapeno [127.0.0.1]
	by localhost with IMAP (fetchmail-5.9.0)
	for zzzz@localhost (single-drop); Wed, 25 Sep 2002 17:22:04 +0100 (IST)
Received: from webnote.net (mail.webnote.net [193.120.211.219]) by
    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g8PFMQC01001 for
    <zzzz@jmason.org>; Wed, 25 Sep 2002 16:22:26 +0100
Received: from localhst1916.com ([213.181.64.18]) by webnote.net
    (8.9.3/8.9.3) with SMTP id QAA26122 for <zzzz@spamassassin.taint.org>;
    Wed, 25 Sep 2002 16:23:00 +0100
Message-Id: <200209251523.QAA26122@webnote.net>
From: "MR.VINCENT NNAJI." <vinnet@mail.com>
Reply-To: vinnety@mail.com
To: zzzz@spamassassin.taint.org
Date: Thu, 26 Sep 2002 04:22:43 +0200


# データの前処理(形態素解析と特徴量の作成)

In [9]:
# 文字列の中で使われている単語ごとの数を返す
def get_word_count(text, min_length=3):
    # ノイズの除去：不要と思われる文字を除去する
    for ch in ".,:;!?-+*/=()[]{}<>~^#$@%&'\"_0123456789":
        text = text.replace(ch, ' ')

    # 形態素解析：文章を単語に分割
    _words = text.strip().split()

    # 表記のゆれの補正：
    # 単語のリストを受け取り、指定された文字数以上の単語だけを全て小文字にして返す
    _words = [_word.lower() for _word in _words if len(_word) >= min_length]

    # collections.Counterの戻り値は辞書型のサブクラス
    _count = collections.Counter(_words)

    # 辞書型に変換して返す
    return dict(_count)

In [36]:
with open(filenames[0], 'r', encoding=ENCODING) as f:
    text = f.read()

get_word_count(text)

{'from': 8,
 'pudge': 5,
 'perl': 14,
 'org': 13,
 'wed': 5,
 'sep': 6,
 'return': 1,
 'path': 1,
 'delivered': 1,
 'yyyy': 2,
 'localhost': 5,
 'spamassassin': 2,
 'taint': 2,
 'received': 5,
 'jalapeno': 2,
 'jmason': 3,
 'postfix': 1,
 'with': 4,
 'esmtp': 2,
 'for': 5,
 'ist': 2,
 'imap': 1,
 'fetchmail': 1,
 'single': 1,
 'drop': 1,
 'cpu': 3,
 'osdn': 3,
 'com': 3,
 'slashdot': 1,
 'may': 1,
 'forged': 1,
 'dogma': 1,
 'slashnull': 1,
 'use': 9,
 'helo': 1,
 'smtp': 1,
 'exim': 1,
 'debian': 1,
 'tue': 1,
 'date': 1,
 'subject': 1,
 'headlines': 1,
 'precedence': 1,
 'list': 1,
 'bulkmail': 1,
 'message': 2,
 'daily': 1,
 'headline': 1,
 'mailer': 1,
 'subscribe': 1,
 'the': 1,
 'review': 1,
 'posted': 1,
 'tuesday': 1,
 'september': 1,
 'links': 1,
 'http': 2,
 'article': 1,
 'sid': 1,
 'copyright': 1,
 'all': 1,
 'rights': 1,
 'reserved': 1,
 'you': 3,
 'have': 1,
 'this': 2,
 'because': 1,
 'subscribed': 1,
 'stop': 1,
 'receiving': 1,
 'and': 2,
 'other': 1,
 'messages': 3,
 

In [11]:
word_count_data = []

# 全てのファイルに対して実施
for filename in filenames:
    with open(filename, 'r', encoding=ENCODING) as f:
        text = f.read()
        count = get_word_count(text)
        word_count_data.append(count)

In [12]:
len(word_count_data)

3250

In [13]:
word_count_data[0]

{'from': 8,
 'pudge': 5,
 'perl': 14,
 'org': 13,
 'wed': 5,
 'sep': 6,
 'return': 1,
 'path': 1,
 'delivered': 1,
 'yyyy': 2,
 'localhost': 5,
 'spamassassin': 2,
 'taint': 2,
 'received': 5,
 'jalapeno': 2,
 'jmason': 3,
 'postfix': 1,
 'with': 4,
 'esmtp': 2,
 'for': 5,
 'ist': 2,
 'imap': 1,
 'fetchmail': 1,
 'single': 1,
 'drop': 1,
 'cpu': 3,
 'osdn': 3,
 'com': 3,
 'slashdot': 1,
 'may': 1,
 'forged': 1,
 'dogma': 1,
 'slashnull': 1,
 'use': 9,
 'helo': 1,
 'smtp': 1,
 'exim': 1,
 'debian': 1,
 'tue': 1,
 'date': 1,
 'subject': 1,
 'headlines': 1,
 'precedence': 1,
 'list': 1,
 'bulkmail': 1,
 'message': 2,
 'daily': 1,
 'headline': 1,
 'mailer': 1,
 'subscribe': 1,
 'the': 1,
 'review': 1,
 'posted': 1,
 'tuesday': 1,
 'september': 1,
 'links': 1,
 'http': 2,
 'article': 1,
 'sid': 1,
 'copyright': 1,
 'all': 1,
 'rights': 1,
 'reserved': 1,
 'you': 3,
 'have': 1,
 'this': 2,
 'because': 1,
 'subscribed': 1,
 'stop': 1,
 'receiving': 1,
 'and': 2,
 'other': 1,
 'messages': 3,
 

In [37]:
vec = DictVectorizer()
dataset = vec.fit_transform(word_count_data)

In [15]:
dataset

<3250x121830 sparse matrix of type '<class 'numpy.float64'>'
	with 756721 stored elements in Compressed Sparse Row format>

In [32]:
vec.get_feature_names()

['\x01\x14www',
 '\x13c\x14',
 '\x14\x01\x15',
 '\\\\\\\\',
 '\\\\\\x\\',
 '\\\\cnet',
 '\\backup',
 '\\cf',
 '\\drivers\\version',
 '\\ek',
 '\\fbkillall',
 '\\finame\\fp',
 '\\for',
 '\\foreignbureaus\\archi',
 '\\frac',
 '\\ho',
 '\\home',
 '\\home\\phone',
 '\\incoming',
 '\\iox',
 '\\kor',
 '\\kuh',
 '\\lx',
 '\\marketing\\cartel\\inventory',
 '\\n\\n',
 '\\ncode',
 '\\ndiag',
 '\\nobject\\nq\\x',
 '\\npv',
 '\\nsig',
 '\\oc',
 '\\peh',
 '\\sjfnl',
 '\\sqrt',
 '\\subkey',
 '\\tab',
 '\\work',
 '\\xbbok\\x',
 '\\xe',
 '\\xfd\\x',
 '\\xff',
 '``anthrax',
 '``benefit',
 '``carol',
 '``commercial',
 '``completely',
 '``cost',
 '``driving',
 '``dumbstruck',
 '``elegance',
 '``enron',
 '``explode',
 '``hard',
 '``i',
 '``initially',
 '``intuition',
 '``it',
 '``opted',
 '``paid',
 '``pioneering',
 '``programming',
 '``public',
 '``put',
 '``real',
 '``shall',
 '``she',
 '``the',
 '``there',
 '``toy',
 '``unsolicited',
 '``we',
 '``what',
 '``when',
 '``who',
 '``you',
 '`am',
 '`athalon

In [33]:
type(dataset)

scipy.sparse.csr.csr_matrix

In [38]:
print(dataset[0])

  (0, 1467)	1.0
  (0, 3508)	1.0
  (0, 4099)	2.0
  (0, 5383)	1.0
  (0, 8172)	1.0
  (0, 11719)	1.0
  (0, 12830)	1.0
  (0, 14229)	2.0
  (0, 16271)	3.0
  (0, 17139)	1.0
  (0, 17526)	3.0
  (0, 19268)	1.0
  (0, 19474)	1.0
  (0, 20056)	1.0
  (0, 20405)	1.0
  (0, 22703)	1.0
  (0, 23616)	1.0
  (0, 28179)	2.0
  (0, 29023)	1.0
  (0, 30500)	1.0
  (0, 32096)	5.0
  (0, 32149)	1.0
  (0, 32843)	8.0
  (0, 38967)	1.0
  (0, 39434)	1.0
  :	:
  (0, 90468)	6.0
  (0, 90484)	1.0
  (0, 91428)	1.0
  (0, 91666)	1.0
  (0, 92144)	1.0
  (0, 92148)	1.0
  (0, 92541)	1.0
  (0, 93154)	2.0
  (0, 94405)	1.0
  (0, 94767)	1.0
  (0, 94806)	1.0
  (0, 94807)	1.0
  (0, 96118)	2.0
  (0, 97422)	1.0
  (0, 97535)	1.0
  (0, 97653)	2.0
  (0, 100060)	1.0
  (0, 100063)	1.0
  (0, 103727)	9.0
  (0, 103744)	1.0
  (0, 109276)	5.0
  (0, 110202)	4.0
  (0, 116508)	3.0
  (0, 116526)	3.0
  (0, 117670)	2.0


# 機械学習の実施
### ハム=0、スパム=1

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [20]:
# 目的変数の用意
# ham = 2500 + 250 = 2750、spam = 500
Y = [0]*2750 + [1]*500

In [21]:
X_train, X_test, Y_train, Y_test = train_test_split(dataset, Y, test_size=0.3, random_state=0)
X_train, X_valid, Y_train, Y_valid = train_test_split(X_train, Y_train, test_size=0.3, random_state=0)

In [35]:
logistic_model = LogisticRegression(solver='lbfgs')
logistic_model.fit(X_train, Y_train)
Y_pred = logistic_model.predict(X_valid)

print(classification_report(Y_valid, Y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       587
           1       0.98      0.92      0.95        96

    accuracy                           0.99       683
   macro avg       0.98      0.96      0.97       683
weighted avg       0.99      0.99      0.99       683



# 学習結果の考察

In [23]:
print("datasetの大きさ:", dataset.shape)
print("単語辞書の長さ:", len(vec.get_feature_names()))
print("モデルのパラメータ数:", logistic_model.coef_.shape)

datasetの大きさ: (3250, 121830)
単語辞書の長さ: 121830
モデルのパラメータ数: (1, 121830)


In [39]:
import pandas as pd

coef_df = pd.DataFrame({
    'feature_name': vec.get_feature_names(),
    'coef': logistic_model.coef_[0]
})

coef_df.head()

Unnamed: 0,feature_name,coef
0,www,0.02111037
1,c,2.324806e-07
2,,0.02111037
3,\\\\,-0.0001868804
4,\\\x\,-0.001646597


In [40]:
# スパムらしさに対する寄与が高い単語一覧
coef_df.sort_values('coef', ascending=False).head(20)

Unnamed: 0,feature_name,coef
121017,zzzz,0.800439
109225,webnote,0.331334
92541,smtp,0.303915
121023,zzzzason,0.302887
105963,vip,0.224231
20081,dec,0.223779
116526,your,0.223218
77528,please,0.21675
64650,money,0.213215
116508,you,0.206369


In [26]:
# ハムらしさに対する寄与が高い一覧
coef_df.sort_values('coef').head(20)

Unnamed: 0,feature_name,coef
87944,rssfeeds,-0.383435
67817,netnoteinc,-0.36731
71419,oct,-0.253391
93154,spamassassin,-0.246184
117670,yyyy,-0.238791
114911,yahoogroups,-0.231211
46696,infinetivity,-0.217304
121037,zzzzteana,-0.21344
19474,date,-0.210124
97395,that,-0.201649
