In [1]:
# 機械学習帳 Chap.5
# https://chokkan.github.io/mlnote/classification/01binary.html

In [2]:
include("../src/data_helper.jl")
include("../src/stochastic_gradient_descent.jl")
using LinearAlgebra

In [3]:
# Download the SMS Spam Collection archive and store it as tmp.zip.
using HTTP
resp = HTTP.get("https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip")
open("tmp.zip", "w") do f
    write(f, resp.body)
end

# Extract the archive with 7-Zip (the `7z` executable must be on PATH).
run(`7z e tmp.zip -aoa`)

# Read the extracted file and build the data set.
# Each line is split on a tab into a label (ss[1]) and a message (ss[2]);
# the message is split on single spaces, trailing '.' is stripped from every
# token, and the tokens are passed to the project helper `counter`.
# The value of the `do` block is bound directly to D, so no untyped `[]`
# placeholder and no `global` reassignment are needed.
D = open("SMSSpamCollection", "r") do f
    tokenize(s) = [rstrip(t, '.') for t in split(s, ' ')]
    function toTuple(line)
        ss = split(line, "\t")
        return (ss[1], counter(tokenize(ss[2])))
    end
    [toTuple(line) for line in eachline(f)]
end
# Randomly split into training data and test data at a 9:1 ratio.
(Dtrain, Dtest) = randomsplit(D, 0.9)

# Delete the downloaded / extracted files.
# Base `rm` is used instead of spawning the external `rm` program so that the
# cleanup also works on platforms without a Unix shell toolchain.
for path in ("SMSSpamCollection", "readme", "tmp.zip")
    rm(path)
end


7-Zip 19.00 (x64) : Copyright (c) 1999-2018 Igor Pavlov : 2019-02-21

Scanning the drive for archives:
1 file, 203415 bytes (199 KiB)

Extracting archive: tmp.zip
--
Path = tmp.zip
Type = zip
Physical Size = 203415

Everything is Ok

Files: 2
Size:       483775
Compressed: 203415


Process(`[4mrm[24m [4mtmp.zip[24m`, ProcessExited(0))

In [4]:
# Convert the data into a form that can be analysed.
## Build the list of distinct words that occur in the training data.
vocab  = unique(Iterators.flatten(map(d -> keys(d[2]), Dtrain)))
## Convert to an N x (M+1) matrix (M = length of the word list above; the +1 is the constant term).
Xtrain = makeMatrixfromDicts(map(d -> d[2], Dtrain), vocab)
Xtest  = makeMatrixfromDicts(map(d -> d[2], Dtest), vocab)
## Map the string labels to a 0/1 vector ("ham" => 0, anything else, i.e. spam => 1).
function label2id(label)
    return label == "ham" ? 0 : 1
end
Ytrain = label2id.(first.(Dtrain))
Ytest  = label2id.(first.(Dtest))
println("made data")

made data


In [5]:
# Train the logistic regression model.
sigmoid(x) = 1 / (1+exp(-x))
# Gradient term for training example i at weights w.
# The row of Xtrain is copied once and reused instead of being sliced twice.
function calcgrad(i, w)
    x = Xtrain[i, :]
    return -(Ytrain[i] - sigmoid(dot(w, x))) * x
end
N = length(Dtrain)
M = length(vocab)
η(t) = 1 / sqrt(1+t)
ϵ = 1e-4
MAX_EPOCHS = 1_000
vectorabs(vector) = sum(abs.(vector))
# The +1 in (M+1) accounts for the constant term.
w_sgd = stochastic_gradient_descent(calcgrad, N, zeros(M+1), η, ϵ, MAX_EPOCHS, vectorabs)

# Evaluate the learned parameters.
## Sort the parameters (constant term excluded) by weight.
sorted_w = sort([(w_sgd[j],vocab[j]) for j in 1:M])
# Clamp to the vocabulary size so fewer than 20 words does not raise a BoundsError.
ntop = min(20, M)
println("==Top 20 parameters with the smallest value==")
foreach(println, sorted_w[1:ntop])
println("==Top 20 parameters with the largest value==")
foreach(println, sorted_w[M-ntop+1:M])
## Accuracy, precision, recall and F1 score.
# A test example is predicted as 1 when its linear score is positive.
Ŷtest = map(v -> v > 0 ? 1 : 0, Xtest * w_sgd)
# Confusion-matrix counts. They are computed with `count` rather than by
# incrementing globals inside a top-level `for` loop, which only works under
# the interactive soft-scope rules and fails when run as a script.
TP = count((Ytest .== 1) .& (Ŷtest .== 1))
FN = count((Ytest .== 1) .& (Ŷtest .!= 1))
FP = count((Ytest .!= 1) .& (Ŷtest .== 1))
TN = count((Ytest .!= 1) .& (Ŷtest .!= 1))
accuracy = (TP+TN)/(TP+TN+FP+FN)
# Note: these ratios evaluate to NaN when their denominator is zero.
precision = TP/(TP+FP)
recall = TP/(TP+FN)
f1score = 2 * precision * recall / (precision + recall)
println("==Scores==")
println("Accuracy  = $accuracy")
println("Precision = $precision")
println("Recall    = $recall")
println("F1Score   = $f1score")
GC.gc()

==Top 20 parameters with the smallest value==
(-0.8371700676461906, "i")
(-0.7232746742968215, "I")
(-0.6417394729600232, "know")
(-0.6258365331145282, "my")
(-0.5682449875509326, "me")
(-0.5422642215945389, "")
(-0.4764589075027166, "it")
(-0.4132434390876563, "but")
(-0.40319255833457146, "u")
(-0.38399481993009976, "dont")
(-0.38040466396017725, "again")
(-0.35780999327095064, "oh")
(-0.35355339059327373, "Me,")
(-0.33202557279674644, "you")
(-0.3212368724861954, "in")
(-0.29760680075349527, "do")
(-0.2962179921615618, "come")
(-0.2876885512402456, "Happy")
(-0.26401794424131575, "It")
(-0.26249926805157875, "5")
==Top 20 parameters with the largest value==
(0.30593573348105674, "with")
(0.3093851270202117, "ur")
(0.312124465549163, "£1000")
(0.3333798605813944, "txt")
(0.3439460211138959, "To")
(0.35353149206794493, "stop")
(0.38912368372474826, "claim")
(0.40743414395851846, "FREE")
(0.4200261025037076, "your")
(0.4381641410805507, "content")
(0.4468322215329658, "our")
(0.5440132