In [1]:
# 機械学習帳 Chap.5
# https://chokkan.github.io/mlnote/classification/01binary.html

In [2]:
include("../src/Helper/data_helper.jl")
include("../src/Classification/logistic_regression.jl")
using LinearAlgebra

In [3]:
# Download the SMS Spam Collection archive and save it as tmp.zip.
using HTTP
resp = HTTP.get("https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip")
open("tmp.zip", "w") do f
    write(f, resp.body)
end

# Extract the archive with 7-Zip (the `7z` executable must be on PATH).
run(`7z e tmp.zip -aoa`)

# Read the extracted file and build the data set.
# Each line is split on a tab into (label, message); the message is tokenized
# and passed to `counter` (not defined in this cell -- presumably provided by
# data_helper.jl; verify). The value of the `do` block is assigned to D
# directly, so no untyped `[]` placeholder or `global` rebinding is needed.
D = open("SMSSpamCollection", "r") do f
    # Split on single spaces and strip trailing periods from every token.
    # NOTE(review): consecutive spaces produce empty-string tokens ("").
    tokenize(s) = [rstrip(t, '.') for t in split(s, ' ')]
    function toTuple(line)
        ss = split(line, "\t")
        return (ss[1], counter(tokenize(ss[2])))
    end
    [toTuple(line) for line in eachline(f)]
end
# Randomly split into training data and test data at a 9:1 ratio
# (`randomsplit` is defined outside this cell).
(Dtrain, Dtest) = randomsplit(D, 0.9)

# Delete the downloaded/extracted files. Base's `rm` is used instead of
# spawning an external `rm` process so the cleanup also works on systems
# without a Unix shell toolset.
foreach(rm, ("SMSSpamCollection", "readme", "tmp.zip"))


7-Zip 19.00 (x64) : Copyright (c) 1999-2018 Igor Pavlov : 2019-02-21

Scanning the drive for archives:
1 file, 203415 bytes (199 KiB)

Extracting archive: tmp.zip
--
Path = tmp.zip
Type = zip
Physical Size = 203415

Everything is Ok

Files: 2
Size:       483775
Compressed: 203415


Process(`[4mrm[24m [4mtmp.zip[24m`, ProcessExited(0))

In [4]:
# Convert the data into a form that can be analyzed.
## Collect the distinct words that appear in the training data.
vocab  = unique(Iterators.flatten(map(d -> keys(d[2]), Dtrain)))
## Convert to matrices via `makeMatrixfromDicts` (defined outside this cell).
## Per the original note the result is N x (M+1), where M = length(vocab)
## and the +1 column is for the constant term.
Xtrain = makeMatrixfromDicts(map(d -> d[2], Dtrain), vocab)
Xtest  = makeMatrixfromDicts(map(d -> d[2], Dtest), vocab)
## Convert the string labels into 0/1 values (spam: 1, ham: 0).
## Anything that is not exactly "ham" is encoded as 1.
label2id(label) = Int(label != "ham")
Ytrain = [label2id(d[1]) for d in Dtrain]
Ytest  = [label2id(d[1]) for d in Dtest]
println("made data")

made data


In [5]:
# Train the logistic-regression model.
N = length(Dtrain)
M = length(vocab)
# The initial weight vector has M+1 entries; the +1 is for the constant term.
w_sgd = logistic_regression(Xtrain, Ytrain, SGD(zeros(M+1), t -> 1/sqrt(1+t), 1e-4, 1_000))

# Evaluate the learned parameters.
## Pair the first M weights with their words and sort ascending by weight.
## (Entry M+1 is left out; presumably that is the constant term -- confirm
## against makeMatrixfromDicts.)
sorted_w = sort([(w_sgd[j], vocab[j]) for j in 1:M])
## The ranges are clamped so a vocabulary with fewer than 20 words does not
## raise a BoundsError.
println("==Top 20 parameters with the smallest value==")
foreach(println, sorted_w[1:min(20, M)])
println("==Top 20 parameters with the largest value==")
foreach(println, sorted_w[max(1, M-19):M])
## Accuracy, precision, recall, and F1 score.
## A test sample is predicted as spam (1) when its linear score is positive.
Ŷtest = map(v -> v > 0 ? 1 : 0, Xtest * w_sgd)
## Confusion-matrix counts. `count` over broadcasted masks replaces a
## top-level loop that mutated globals, which only works under the
## interactive (notebook/REPL) soft-scope rules.
TP = count((Ytest .== 1) .& (Ŷtest .== 1))
FN = count((Ytest .== 1) .& (Ŷtest .!= 1))
FP = count((Ytest .!= 1) .& (Ŷtest .== 1))
TN = count((Ytest .!= 1) .& (Ŷtest .!= 1))
accuracy = (TP+TN)/(TP+TN+FP+FN)
# NOTE: precision/recall evaluate to NaN when their denominator is zero
# (floating-point 0/0), and f1score then becomes NaN as well.
precision = TP/(TP+FP)
recall = TP/(TP+FN)
f1score = 2 * precision * recall / (precision + recall)
println("==Scores==")
println("Accuracy  = $accuracy")
println("Precision = $precision")
println("Recall    = $recall")
println("F1Score   = $f1score")
GC.gc()

==Top 20 parameters with the smallest value==
(-0.8595199199113788, "I")
(-0.7701058923871961, "i")
(-0.678535216586869, "my")
(-0.5666784618868096, "you")
(-0.4937909284872803, "")
(-0.48016851423791856, "me")
(-0.4742823518123879, "doing")
(-0.414702036657556, "lor")
(-0.4081733124150848, "still")
(-0.40037934026150923, "come")
(-0.3997120500354399, "start")
(-0.37707770212725095, "but")
(-0.36335808182639845, "leh")
(-0.3624122809827245, "show")
(-0.3621270999429258, "busy")
(-0.355729004129994, "tv")
(-0.3540251474381818, "Waiting")
(-0.35355339059327373, "report?")
(-0.3452896105303562, "in")
(-0.30204526820869854, "d")
==Top 20 parameters with the largest value==
(0.33879960199348513, "unlimited")
(0.34122812448266093, "Text")
(0.3412696019798457, "CALL")
(0.3450721101021217, "service")
(0.3470880159425676, "contact")
(0.3497646897588685, "for")
(0.35618382016122885, "Please")
(0.364643811049581, "apply")
(0.3698627098893932, "Call")
(0.3978318104341168, "prize")
(0.3996058530128