# 資料訓練模組

## 前置套件安裝
在Python3.6, pip3.6 的環境下安裝所需套件

In [1]:
!pip3.6 install pandas
!pip3.6 install numpy
!pip3.6 install scipy
!pip3.6 install sklearn



# 設定訓練集以及測試集大小

In [2]:
# Number of posts used for training (TrainingTask takes the first `training_size` posts)
training_size = 700
# Number of posts used for testing (TrainingTask takes the last `testing_size` posts)
testing_size = 100
# Path of the JSON file the posts are read from
output_file_path="/Users/Steve/PycharmProjects/Phantacy/result_all.json"

# 訓練程式
預設使用 Multinomial Naive Bayes（`MultinomialNB`）作為訓練模型；若要改用 SVM，可呼叫 `get_training_pipeline(classifier=svc)`。

In [3]:
import json
import logging
import threading

import pandas as pd
import jieba
import jieba.analyse
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from IPython.display import display, HTML

# Point jieba at the dictionary file (path is relative to the working directory).
jieba.set_dictionary('dict.txt.big')

# Load the stop words, one per line, into a set: tokenize() tests every token
# for membership, which is O(1) on a set instead of a linear scan of a list.
with open('stop_words.txt', encoding='utf-8') as stop_file:
    stopwords = {line.strip() for line in stop_file}

# Module-level classifier instances; `nb` is the default classifier of
# get_training_pipeline().
svc = SVC()
nb = MultinomialNB()

def tokenize(sentence):
    """Cut `sentence` with jieba and return the tokens not found in `stopwords`."""
    kept = []
    for word in jieba.cut(sentence):
        if word in stopwords:
            continue
        kept.append(word)
    return kept


def read_result(file_name):
    """Return the parsed JSON content of the UTF-8 encoded file `file_name`."""
    with open(file_name, encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


def get_training_pipeline(classifier=nb):
    """Build the text-classification pipeline.

    The pipeline chains a CountVectorizer (tokenizing with `tokenize`), a
    TfidfTransformer and the given classifier.

    Args:
        classifier: Estimator used as the final 'clf' step; defaults to the
            module-level `nb` instance.

    Returns:
        The assembled `Pipeline`.
    """
    vectorizer = CountVectorizer(tokenizer=tokenize)
    steps = [
        ('vect', vectorizer),
        ('tfidf', TfidfTransformer()),
        # A SelectKBest(f_regression, k=10) 'anova' step is currently disabled.
        ('clf', classifier),
    ]
    return Pipeline(steps)


class TrainingTask(threading.Thread):
    """Thread that trains the text classifier and reports its accuracy.

    The task reads the posts from `output_file_path`, fits the pipeline from
    `get_training_pipeline()` on the first `training_size` posts and evaluates
    it on the last `testing_size` posts.

    The train/target lists exist only after `prepare_data()` has run, and
    `predicted` only after `run()` has fitted the model.
    """

    def __init__(self, output):
        """Create the task.

        Args:
            output: Object whose `refresh_text(text)` method receives status
                messages. May be None, in which case messages are printed.
        """
        super().__init__()
        self.is_complete = False
        self.output = output
        self.result_location = output_file_path
        self.training_size = training_size
        self.testing_size = testing_size

    def print_to_output(self, text):
        """Send `text` to the output object, falling back to `print`.

        Args:
            text: Message to show.
        """
        try:
            self.output.refresh_text(text)
        except Exception:
            # Best effort: if the output object is missing (e.g. None) or its
            # refresh fails, still show the message. Catching Exception rather
            # than everything lets SystemExit/KeyboardInterrupt propagate.
            print(text)

    @staticmethod
    def _split(items, training_size, testing_size):
        """Return (first `training_size` items, last `testing_size` items)."""
        head = items[:training_size]
        # items[-0:] is the whole sequence, so an empty test set needs its own case.
        tail = items[-testing_size:] if testing_size != 0 else items[:0]
        return head, tail

    def prepare_data(self):
        """Load the posts and fill the train/target content and reply lists.

        Raises:
            KeyError: If a post lacks 'post_content' or 'fan_page_reply'.
        """
        posts = read_result(self.result_location)
        contents = [post['post_content'] for post in posts]
        replies = [post['fan_page_reply'] for post in posts]

        self.print_to_output(f"訓練集大小: {self.training_size}")
        self.print_to_output(f"測試集大小: {self.testing_size}")

        # The two slices are taken from opposite ends of the same list, so they
        # share posts whenever the dataset is smaller than both sizes combined.
        total = len(posts)
        overlap = min(self.training_size, total) - max(total - self.testing_size, 0)
        if overlap > 0:
            logging.getLogger(__name__).warning(
                "training and testing sets share %d of %d posts; "
                "the reported accuracy will be optimistic",
                overlap, total)

        self.train_contents, self.target_contents = self._split(
            contents, self.training_size, self.testing_size)
        self.train_replies, self.target_replies = self._split(
            replies, self.training_size, self.testing_size)

    def run(self):
        """Train the model, predict the target set and report the results.

        Displays a DataFrame of the predictions next to the stripped post
        contents and outputs the share of predictions equal to the target
        replies as a percentage.
        """
        self.prepare_data()
        self.print_to_output("模型訓練中 請耐心等待")

        pipeline = get_training_pipeline()
        pipeline.fit(self.train_contents, self.train_replies)
        self.predicted = pipeline.predict(self.target_contents)
        # Set as soon as the predictions are available, before the report.
        self.is_complete = True

        self.print_to_output("留言：")
        output_dict = {
            "是否該回應": self.predicted,
            "留言": [content.strip() for content in self.target_contents],
        }

        df = pd.DataFrame(output_dict)
        display(df)
        # Element-wise comparison; the mean of the booleans is the hit ratio.
        rate = np.mean(self.predicted == self.target_replies)
        self.print_to_output(f"準確率: {rate * 100:.2f}%")
    



  if 'order' in inspect.getargspec(np.copy)[0]:


呼叫主程式開始訓練

In [4]:
def main():
    """Create a TrainingTask without an output object and run it in the calling thread."""
    task = TrainingTask(None)
    task.run()


if __name__ == '__main__':
    main()

Building prefix dict from /Users/Steve/PycharmProjects/Phantacy/dict.txt.big ...
Loading model from cache /var/folders/f4/zb5h9lfx05n037z67ppm2dlh0000gn/T/jieba.u06f842be9aa390fb9509e48cb4a603a3.cache


訓練集大小: 700
測試集大小: 100
模型訓練中 請耐心等待


Loading model cost 3.437 seconds.
Prefix dict has been built succesfully.


留言：


Unnamed: 0,是否該回應,留言
0,False,首次發言，如有失禮之處望乞見諒。我是一位交通警隊員，人生至今大概也僅剩下替百姓處理交通事故的...
1,False,關於「颱風假」的問題想請教承辦的長官，由於外勤員警的勤務24小時不間斷，若29日為全日停止上...
2,False,這件衣服是給你們用來服務民眾的，不是供你們拍照打卡的。怎麼現在一堆實習生穿著衣服到處拍照呢這...
3,False,《根本完全敷衍~24小時不變，副署長倒下了，所長大家不想當，2線3以上不算-因爲會影響警政署...
4,False,針對新北市的創新作為：受理住宅竊盜案件到府服務 ...
5,False,◎員警因公受傷案，代表署長慰問案◎臺北市政府警察局大同分局重慶北路派出所警員黃威傑、警員賴世...
6,False,請問署長及各位長官，為何當保全公司報110指出其營業項目之公司警報器響起時，警察人員是否必須...
7,False,想要請最關心基層的署長，颱風夜風大雨大，路上僅剩下往來稀疏的車子，但是這種天氣卻要員警開車去...
8,False,◎慰問員警因救災受傷案◎ 新北市政府警察局淡水分局中山路派出所警員李瑞庭在昨(29)日...
9,False,#合理的懷疑警察的公正度ㄧ名自稱西螺鎮和心派出所警員的林勝智先生和自稱和心所所長的廖毓田先生...


準確率: 62.00%
