# Naive Bayes

In [1]:
import os
import glob
import pandas as pd

def get_data(directrory):
    """Load a labelled text corpus laid out as one sub-directory per class.

    Every immediate sub-directory of ``directrory`` is treated as a class whose
    label is the sub-directory name; every ``.txt`` file inside it (extension
    matched case-insensitively) becomes one sample.

    Parameters
    ----------
    directrory : str
        Root directory of the corpus.

    Returns
    -------
    pandas.DataFrame
        Columns ``news`` (file contents, decoded as UTF-8) and ``ans`` (class
        label), one row per file. Rows are ordered by class directory and then
        by file path, so the result does not depend on the order in which the
        filesystem happens to list entries. Empty if nothing matches.

    Raises
    ------
    UnicodeDecodeError
        If a matched file is not valid UTF-8.
    """
    records = []

    # Sorted so that anything derived from row order (e.g. class ids built from
    # first appearance) is reproducible across machines and operating systems.
    for class_dir in sorted(glob.glob(os.path.join(directrory, "*"))):
        # Stray files next to the class folders are not classes.
        if not os.path.isdir(class_dir):
            continue

        label = os.path.basename(class_dir)

        # Character classes make the extension match case-insensitive even
        # where glob itself is case-sensitive.
        pattern = os.path.join(class_dir, "*.[tT][xX][tT]")
        for path in sorted(glob.glob(pattern)):
            with open(path, "r", encoding="utf-8") as file:
                records.append((file.read(), label))

    return pd.DataFrame(records, columns=["news", "ans"])

In [2]:
# Load the labelled articles of the training and test splits.
DATASET_DIRS = ("news/chinese_news_train", "news/chinese_news_test")
train_df, test_df = map(get_data, DATASET_DIRS)
test_df

Unnamed: 0,news,ans
0,日月光華 -- Traffic_Info精華區文章閱讀\n-----------------...,交通
1,日月光華 -- Traffic_Info精華區文章閱讀\n-----------------...,交通
2,日月光華 -- Traffic_Info精華區文章閱讀\n-----------------...,交通
3,三趟火車停開 乘客可全額退票 \n瀏覽次數：1180 \n 昨日，來自鐵路部門的消息說...,交通
4,日月光華 -- Traffic_Info精華區文章閱讀\n-----------------...,交通
...,...,...
96,最優秀選手無緣亞運會健美賽\n \n \n 健美在亞洲運動會上是“新生兒”——韓國釜...,體育
97,\n各國記者眼中的羽毛球世錦賽\n \n\n-----------------------...,體育
98,\n友好運動會第五天 東道主選手大顯神威\n2001年09月03日02:51:40 新華社 ...,體育
99,不靠技術比運氣\n\n 第二屆奧運會在巴黎舉行，同時這裡也正在舉行國際博覽會，東道主把一些...,體育


In [3]:
# value_counts -> how many times each value occurs
# unique       -> the distinct values, in order of first appearance

# Lookup from category name to integer class id.
news_kind = train_df["ans"].unique()
trans = dict(zip(news_kind, range(len(news_kind))))
print(trans)

# Inverse lookup (class id -> category name), used later to decode predictions.
reverse_trans = dict(enumerate(news_kind))

{'交通': 0, '政治': 1, '教育': 2, '環境': 3, '經濟': 4, '藝術': 5, '計算機': 6, '軍事': 7, '醫藥': 8, '體育': 9}


In [4]:
# Encode the category names as integer class ids.
# Series.map performs a plain dictionary lookup and yields an integer Series
# directly, instead of relying on Series.replace's implicit object -> int
# downcasting (deprecated in recent pandas). A label that is missing from
# `trans` becomes NaN rather than silently staying a string.
y_train = train_df["ans"].map(trans)
y_test  = test_df["ans"].map(trans)

In [5]:
# 下載 jieba大辭典
import os
from urllib.request import urlretrieve

# Local file name of the jieba "big" dictionary.
DICT_PATH = "dict.txt.big"
if not os.path.exists(DICT_PATH):
    print("字典不存在，下載中")
    url = "https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big"

    # Download under a temporary name and rename only on success, so that an
    # interrupted transfer never leaves a truncated file at DICT_PATH (which
    # the existence check above would accept on every later run).
    partial_path = DICT_PATH + ".part"
    try:
        urlretrieve(url, partial_path)
        os.replace(partial_path, DICT_PATH)
    finally:
        # Only still present if the download or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)

In [6]:
# Word-segment the text data with jieba.
import jieba

# Make jieba use the dictionary file at DICT_PATH as its main dictionary.
jieba.set_dictionary(DICT_PATH)

def newscut(s):
    """Segment ``s`` with jieba and return the tokens as one space-separated line.

    The tokens are joined with single spaces first; every carriage return and
    line feed is then removed from the joined string.
    """
    joined = " ".join(jieba.cut(s))
    # Delete "\r" and "\n" in a single pass.
    return joined.translate(str.maketrans("", "", "\r\n"))

# Tokenise every article of both splits with newscut.
x_train_raw = train_df["news"].map(newscut)
x_test_raw  = test_df["news"].map(newscut)

Building prefix dict from d:\Python\GitHube\Machine_Learning\dict.txt.big ...
Loading model from cache C:\Users\yalon\AppData\Local\Temp\jieba.u97a541a3dec7202620b0e6479afd340f.cache
Loading model cost 1.186 seconds.
Prefix dict has been built successfully.


In [7]:
# Turn the tokenised text into per-document term counts: CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# The vocabulary is fitted on the training text only; the test text is then
# transformed with that same fitted vocabulary.
# NOTE(review): CountVectorizer's default token_pattern appears to keep only
# tokens of two or more characters, so single-character words would be
# dropped — confirm this is intended for Chinese text.
vec = CountVectorizer()
x_train = vec.fit_transform(x_train_raw)
x_test  = vec.transform(x_test_raw)

In [8]:
# Inverse vocabulary lookup (feature column index -> token), used later to
# interpret the data.
reverse_voca = dict(zip(vec.vocabulary_.values(), vec.vocabulary_.keys()))

In [9]:
# Train the model: fit a multinomial Naive Bayes classifier (MultinomialNB)
# on the term-count features and the integer class ids.
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB() 
clf.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [10]:
# Predict the class id of every article in the test set: predict
pre = clf.predict(x_test)

In [11]:
# Validate the model: fraction of test articles classified correctly.
from sklearn.metrics import accuracy_score

# Arguments follow the documented (y_true, y_pred) order, consistent with the
# confusion_matrix call; accuracy is symmetric, so the value is unchanged.
accuracy_score(y_test, pre)

1.0

In [12]:
# Validate the model: confusion matrix (rows = true class, columns = predicted class).
from sklearn.metrics import confusion_matrix

# Pin the label set to every known class, in news_kind order. Without
# `labels`, the matrix only covers classes that occur in y_test or pre, so its
# shape would stop matching the row/column captions below whenever a class is
# absent from both.
class_ids = [trans[kind] for kind in news_kind]
mat = confusion_matrix(y_test, pre, labels=class_ids)
c = [f"{s}(預測)" for s in news_kind]
r = [f"{s}(目標)" for s in news_kind]
pd.DataFrame(mat, columns=c, index=r)

Unnamed: 0,交通(預測),政治(預測),教育(預測),環境(預測),經濟(預測),藝術(預測),計算機(預測),軍事(預測),醫藥(預測),體育(預測)
交通(目標),10,0,0,0,0,0,0,0,0,0
政治(目標),0,11,0,0,0,0,0,0,0,0
教育(目標),0,0,10,0,0,0,0,0,0,0
環境(目標),0,0,0,10,0,0,0,0,0,0
經濟(目標),0,0,0,0,10,0,0,0,0,0
藝術(目標),0,0,0,0,0,10,0,0,0,0
計算機(目標),0,0,0,0,0,0,10,0,0,0
軍事(目標),0,0,0,0,0,0,0,10,0,0
醫藥(目標),0,0,0,0,0,0,0,0,10,0
體育(目標),0,0,0,0,0,0,0,0,0,10


## 讓使用者輸入新聞做預測

In [14]:
# Classify one article typed in by the user and show the per-class probabilities.
# Dedicated variable names are used so this cell does not overwrite `pre`
# (the test-set predictions consumed by the evaluation cells) and no name
# changes meaning halfway through the cell.
news_text = input("請輸入新聞：\n")
news_vec = vec.transform([newscut(news_text)])

ans = reverse_trans[clf.predict(news_vec)[0]]
print("\n新聞分類應該是：", ans)

print("新聞各分類百分比：")
class_probas = clf.predict_proba(news_vec)[0]
# predict_proba columns follow clf.classes_, so each probability is paired with
# the class id the classifier reports rather than with a positional assumption.
ranked = sorted(
    ((reverse_trans[class_id], proba) for class_id, proba in zip(clf.classes_, class_probas)),
    key=lambda pair: pair[1],
    reverse=True,
)
for kind, proba in ranked:
    print(kind, ":", round(proba*100, 2), "%")


新聞分類應該是： 經濟
新聞各分類百分比：
經濟 : 100.0 %
政治 : 0.0 %
軍事 : 0.0 %
計算機 : 0.0 %
教育 : 0.0 %
環境 : 0.0 %
藝術 : 0.0 %
交通 : 0.0 %
體育 : 0.0 %
醫藥 : 0.0 %
