# 第9章 機械学習の適用2 - Web アプリケーション

Web アプリケーションにおける機械学習の応用例
- 入力フォームのバリデーション
- 検索エンジン
- EC サイトのレコメンドシステム
- スパム検出

## 9章のゴール

- トレーニングした機械学習モデルの状態を保存する
- データストレージとして SQLite を使用する
- Web フレームワーク Flask を使って Web アプリケーションを開発する
- 機械学習アプリケーションを Web サーバにデプロイする

## 9.1 学習済みの scikit-learn 推定器をシリアライズする
問題
- 機械学習モデルのトレーニングは計算コストが非常に高いため、新しい予測を行うたびにモデルをトレーニングし直すのは非常に非効率である。

解決策
- pickle モジュールを用いて Python オブジェクトをコンパクトなバイト列にシリアライズし、モデルを永続化する



In [1]:
# Load the NLTK stop words
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Fetch the NLTK stop-word corpus (the recorded output shows it is already
# up to date on this machine).
nltk.download('stopwords')
# English stop words, used to filter the stemmed tokens in the check below.
stop = stopwords.words('english')

# ちゃんと動くか確認
def tokenizer_porter(text):
    """Split *text* on whitespace and return the Porter stem of every token."""
    stem = PorterStemmer().stem
    return list(map(stem, text.split()))

# Stem a sample sentence and drop the stop words. The [-10:] slice keeps at
# most the last ten stemmed tokens before filtering.
[w for w in tokenizer_porter('a runner likse running and runs a lot')[-10:] if w not in stop]


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/s12723/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['runner', 'liks', 'run', 'run', 'lot']

### 前処理・モデルの構築


In [1]:
import nltk
import numpy as np 
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')
# English stop words; tokenizer() filters its tokens against this list.
stop = stopwords.words('english')
# NOTE(review): `porter` is not referenced by any code visible in this
# section - confirm whether it is still needed.
porter = PorterStemmer()

def tokenizer(text):
    """Clean a raw review and split it into stop-word-free tokens.

    The text is stripped of HTML-like tags, lowercased, and every run of
    non-word characters is collapsed into a single space. Emoticons found in
    the lowercased text are appended (with any '-' removed), and the result
    is split on whitespace. Tokens contained in the module-level ``stop``
    list are dropped.

    Args:
        text: Raw review text, possibly containing HTML markup.

    Returns:
        A list of lowercase string tokens.
    """
    # Raw strings keep the regex escapes (\), \( and \W) away from Python's
    # own string-escape processing, which warns about invalid escapes.
    text = re.sub(r'<[^>]*>', '', text)
    # NOTE(review): the pattern is applied to the lowercased text, so the
    # uppercase alternatives D and P can never match.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    # NOTE(review): the emoticons are appended without a separating space, so
    # when the cleaned text ends in a word character the first emoticon fuses
    # with the last word (e.g. "great:)"). Both quirks are left untouched here
    # because changing the tokens changes the hashed features the classifier
    # is trained on.
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

def stream_docs(path):
    """Lazily yield ``(text, label)`` pairs from a review CSV file.

    Each data line is expected to end with ``,<label>`` where the label is an
    integer. Everything before that final comma is returned verbatim as the
    text, so surrounding CSV quotes are kept.

    Args:
        path: Path to a UTF-8 encoded CSV file whose first line is a header.

    Yields:
        Tuples ``(text, label)`` with ``text`` a ``str`` and ``label`` an
        ``int``.

    Raises:
        ValueError: If the part after the last comma of a line is not an
            integer.
    """
    with open(path, 'r', encoding='utf-8') as reviews:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration inside the generator.
        next(reviews, None)
        for line in reviews:
            line = line.rstrip('\n')
            if not line:
                continue  # ignore blank lines, e.g. a trailing one
            # Split on the LAST comma: commas inside the review are kept, and
            # a final line without a trailing newline parses correctly (the
            # fixed offsets line[:-3] / line[-2] would misread it).
            text, _, label = line.rpartition(',')
            yield text, int(label)

def get_minibatch(doc_stream, size):
    """Pull up to ``size`` documents from ``doc_stream``.

    Args:
        doc_stream: Iterator yielding ``(text, label)`` pairs.
        size: Maximum number of documents to take from the stream.

    Returns:
        A ``(docs, y)`` pair of equally long lists holding the texts and
        their labels. If the stream ends early, the partially filled batch is
        returned instead of being discarded. ``(None, None)`` is returned
        only when a non-empty batch was requested and the stream is already
        exhausted.
    """
    docs, y = [], []
    # zip() asks range(size) first and stops as soon as it runs out, so at
    # most `size` items are consumed from the stream.
    for _, (text, label) in zip(range(size), doc_stream):
        docs.append(text)
        y.append(label)
    if size > 0 and not docs:
        return None, None
    return docs, y


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/s12723/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Peek at the first (text, label) pair to check that the file parses as expected.
next(stream_docs(path='datasets/movie_data.csv'))

('"My family and I normally do not watch local movies for the simple reason that they are poorly made, they lack the depth, and just not worth our time.<br /><br />The trailer of ""Nasaan ka man"" caught my attention, my daughter in law\'s and daughter\'s so we took time out to watch it this afternoon. The movie exceeded our expectations. The cinematography was very good, the story beautiful and the acting awesome. Jericho Rosales was really very good, so\'s Claudine Barretto. The fact that I despised Diether Ocampo proves he was effective at his role. I have never been this touched, moved and affected by a local movie before. Imagine a cynic like me dabbing my eyes at the end of the movie? Congratulations to Star Cinema!! Way to go, Jericho and Claudine!!"',
 1)

In [3]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
import pyprind

# Out-of-core training: stream the reviews in fixed-size mini-batches and
# update an SGD classifier (log loss) incrementally with partial_fit.
n_batches, batch_size = 45, 1000

vect = HashingVectorizer(
    decode_error='ignore',
    n_features=2**21,
    preprocessor=None,
    tokenizer=tokenizer,
)
clf = SGDClassifier(loss='log', random_state=1, max_iter=1)
doc_stream = stream_docs(path='datasets/movie_data.csv')
classes = np.array([0, 1])  # label set passed to every partial_fit call

pbar = pyprind.ProgBar(n_batches)
for _ in range(n_batches):
    X_train, y_train = get_minibatch(doc_stream, size=batch_size)
    if not X_train:
        # No batch was returned: stop training early.
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:23


In [4]:
# Evaluate on the next 5000 documents of the same stream, i.e. the ones that
# follow the mini-batches consumed by the training loop.
X_test, y_test = get_minibatch(doc_stream, size=5000)
# X_test is rebound from the list of raw texts to its vectorized form.
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.866


### シリアライズ


In [17]:
import pickle
import os

# Directory that holds the serialized objects.
dest = os.path.join('movieclassifier', 'pkl_objects')

# exist_ok=True avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(dest, exist_ok=True)

# Context managers close (and therefore flush) each file deterministically
# instead of leaving the handle to the garbage collector.
with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as pkl_file:
    pickle.dump(stop, pkl_file, protocol=4)
with open(os.path.join(dest, 'classifier.pkl'), 'wb') as pkl_file:
    pickle.dump(clf, pkl_file, protocol=4)




In [52]:
from sklearn.feature_extraction.text import HashingVectorizer
from pathlib import Path
import re
import os
import pickle

# Base directory used to locate the pickled objects: the current working
# directory. NOTE(review): presumably a standalone module would use
# os.path.dirname(__file__) instead - confirm when moving this out of the
# notebook.
cur_dir = Path().resolve()
# SECURITY: unpickling can execute arbitrary code - only load pickle files
# that come from a trusted source.
with open(cur_dir / 'movieclassifier' / 'pkl_objects' / 'stopwords.pkl', 'rb') as pkl_file:
    stop = pickle.load(pkl_file)

def tokenizer(text):
    """Clean a raw review and split it into stop-word-free tokens.

    The text is stripped of HTML-like tags, lowercased, and every run of
    non-word characters is collapsed into a single space. Emoticons found in
    the lowercased text are appended (with any '-' removed), and the result
    is split on whitespace. Tokens contained in the module-level ``stop``
    list are dropped.

    Args:
        text: Raw review text, possibly containing HTML markup.

    Returns:
        A list of lowercase string tokens.
    """
    # Raw strings keep the regex escapes (\), \( and \W) away from Python's
    # own string-escape processing, which warns about invalid escapes.
    text = re.sub(r'<[^>]*>', '', text)
    # NOTE(review): the pattern is applied to the lowercased text, so the
    # uppercase alternatives D and P can never match.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    # NOTE(review): the emoticons are appended without a separating space, so
    # when the cleaned text ends in a word character the first emoticon fuses
    # with the last word (e.g. "great:)"). Both quirks are left untouched so
    # the tokens stay identical to those the pickled classifier was trained on.
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

# Rebuild the vectorizer with the same settings as in the training cell
# (n_features=2**21, the custom tokenizer) so that the features line up with
# what the pickled classifier was trained on.
vect = HashingVectorizer(decode_error='ignore', n_features=2**21, preprocessor=None, tokenizer=tokenizer)

In [53]:

import pickle
import re
import os
# from vectorizer import vect
# SECURITY: unpickling can execute arbitrary code - only load pickle files
# that come from a trusted source.
# The context manager closes the file as soon as the classifier is loaded.
with open(os.path.join('movieclassifier', 'pkl_objects', 'classifier.pkl'), 'rb') as pkl_file:
    clf = pickle.load(pkl_file)

In [54]:
import numpy as np
# Readable names for the numeric class labels.
label = {0: 'negative', 1: 'positive'}

# Vectorize one sample review.
example = ["I love this movie. It's amazing."]
X = vect.transform(example)

print(clf)
# Predicted class of the single sample, and the largest class probability
# expressed as a percentage.
predicted_class = clf.predict(X)[0]
confidence = np.max(clf.predict_proba(X)) * 100
print('Prediction: %s\nProbability: %.2f%%' % (label[predicted_class], confidence))

SGDClassifier(loss='log', max_iter=1, random_state=1)
Prediction: positive
Probability: 94.47%


### SQLite DB の設定

In [55]:
import sqlite3
import os 

# Create (or reset) the SQLite database that stores the reviews.
conn = sqlite3.connect('reviews.sqlite')
try:
    c = conn.cursor()

    # Start from a clean slate: drop `review_db` if it already exists.
    c.execute('DROP TABLE IF EXISTS review_db')
    # Create the `review_db` table:
    #  - review:    TEXT     review comment
    #  - sentiment: INTEGER  sentiment (0 = negative, 1 = positive)
    #  - date:      TEXT     creation timestamp
    c.execute('CREATE TABLE review_db (review TEXT, sentiment INTEGER, date TEXT)')

    # Insert two sample rows; values are bound through `?` placeholders.
    example1 = 'I love this movie'
    c.execute("INSERT INTO review_db (review, sentiment, date) VALUES (?, ?, DATETIME('now'))", (example1, 1))
    example2 = 'I disliked this movie'
    c.execute("INSERT INTO review_db (review, sentiment, date) VALUES (?, ?, DATETIME('now'))", (example2, 0))

    conn.commit()
finally:
    # Release the connection even if one of the statements above fails.
    conn.close()

In [57]:
# Read back every review dated between a fixed start timestamp and now.
conn = sqlite3.connect('reviews.sqlite')
try:
    c = conn.cursor()

    # `date` is a TEXT column; the recorded output shows values in
    # 'YYYY-MM-DD HH:MM:SS' form, the same form as the lower bound used here.
    c.execute("SELECT * FROM review_db WHERE date BETWEEN '2017-01-01 10:10:10' AND DATETIME('now')")
    results = c.fetchall()
finally:
    # Release the connection even if the query fails.
    conn.close()

print(results)

[('I love this movie', 1, '2021-06-11 05:57:49'), ('I disliked this movie', 0, '2021-06-11 05:57:49')]
