In [17]:
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

入力データの読み込み

In [3]:
dataframe =  pd.read_csv("SMSSpamCollection", sep='\t', header=None)

In [4]:
dataframe.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


PandasのDataFrameに列名をつける

In [5]:
dataframe.columns = ['spam', 'message']

In [6]:
dataframe.head()

Unnamed: 0,spam,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


データに欠損値がないかを確認

In [7]:
dataframe.isnull().any(axis=0)

spam       False
message    False
dtype: bool

すべての列がfalseになっているため、データに欠損値はない

スパムと正常なメッセージとの個数を出す

In [16]:
dataframe['spam'].value_counts()

ham     4825
spam     747
Name: spam, dtype: int64

In [25]:
X = dataframe['message']
y = dataframe['spam']

In [85]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from scipy.sparse import hstack

訓練データとテストデータに分ける

In [71]:
train_x, test_x, train_y, test_y = train_test_split(X, y)

テキストデータをTfidfVectorizerを使って数値データに変換する
( https://narengowda.github.io/logistic-regression-sms-spam-ham-classification-using-tfidf-vectorizer/ ) も参照

In [79]:
vectorizer = TfidfVectorizer(
    sublinear_tf = True,
    strip_accents = 'unicode',
    analyzer = 'word',
    token_pattern = r'\w{1,}',
    stop_words = 'english',
    ngram_range = (1, 1),
    max_features = 10000
    )

In [82]:
vectorizer.fit(dataframe['message'])
train_word_features = word_vectorizer.transform(train_x)
test_word_features = word_vectorizer.transform(test_x)

In [86]:
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 6),
    max_features=50000)
char_vectorizer.fit(dataframe['message'])
train_char_features = char_vectorizer.transform(train_x)
test_char_features = char_vectorizer.transform(test_x)

train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])

ロジスティック回帰を行う

In [95]:
classifier = LogisticRegression(C=0.5, solver='sag')

classifier.fit(train_features, train_y)

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False)

予測する

In [96]:
pred = classifier.predict(test_features)

正解率を判定する

In [97]:
print('Accuracy on Test Set:',accuracy_score(test_y, pred))

Accuracy on Test Set: 0.9698492462311558
