In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Two toy sentences for the bag-of-words demo.
texts = [
    "I love programming.",
    "Programming is fun.",
]
# Learn the vocabulary first, then encode the same sentences as a count matrix.
vectorizer = CountVectorizer().fit(texts)
X = vectorizer.transform(texts)

In [None]:
# Show the learned vocabulary, then the dense document-term matrix.
print(vectorizer.get_feature_names_out(), X.toarray(), sep='\n')

['fun' 'is' 'love' 'programming']
[[0 0 1 1]
 [1 1 0 1]]


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same toy sentences, this time weighted with TF-IDF instead of raw counts.
texts = [
    "I love programming.",
    "Programming is fun.",
]
# Fit the vocabulary / IDF weights, then encode the sentences.
vectorizer = TfidfVectorizer().fit(texts)
X = vectorizer.transform(texts)

In [None]:
# Show the vocabulary, then the dense TF-IDF matrix (one row per sentence).
print(vectorizer.get_feature_names_out(), X.toarray(), sep='\n')

['fun' 'is' 'love' 'programming']
[[0.         0.         0.81480247 0.57973867]
 [0.6316672  0.6316672  0.         0.44943642]]


In [None]:
import jieba
# lcut returns the segmentation result directly as a list.
jieba.lcut('太多的巧合就是有緣。')

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.876 seconds.
DEBUG:jieba:Loading model cost 0.876 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


['太多', '的', '巧合', '就是', '有緣', '。']

In [None]:
!wget -O dict.txt https://raw.githubusercontent.com/imchihchao/aop113b/main/materials/04-dict.txt

--2025-06-03 12:05:49--  https://raw.githubusercontent.com/imchihchao/aop113b/main/materials/04-dict.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4154480 (4.0M) [application/octet-stream]
Saving to: ‘dict.txt’


2025-06-03 12:05:49 (49.0 MB/s) - ‘dict.txt’ saved [4154480/4154480]



In [None]:
import jieba
# Traditional Chinese dictionary: make jieba use the downloaded dict.txt
jieba.set_dictionary('dict.txt')

In [None]:
!wget -O stopwords.txt https://raw.githubusercontent.com/imchihchao/aop113b/main/materials/04-stopwords.txt

--2025-06-03 12:06:47--  https://raw.githubusercontent.com/imchihchao/aop113b/main/materials/04-stopwords.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8129 (7.9K) [text/plain]
Saving to: ‘stopwords.txt’


2025-06-03 12:06:47 (14.5 MB/s) - ‘stopwords.txt’ saved [8129/8129]



In [None]:
import jieba
# Load the stop-word list
def load_stopwords(filepath):
    """Read a stop-word file into a set.

    Args:
        filepath: Path to a UTF-8 text file with one stop word per line.

    Returns:
        set: The whitespace-stripped, non-empty lines of the file.
    """
    # 'utf-8-sig' drops a leading byte-order mark if one is present, so the
    # first stop word is not stored with an invisible BOM prefix (which
    # str.strip() does not remove). Files without a BOM decode identically.
    with open(filepath, 'r', encoding='utf-8-sig') as f:
        return {line.strip() for line in f if line.strip()}
# Segment the text and filter out stop words
def jieba_cut_with_stopwords(text, stopwords):
    """Segment ``text`` with jieba and discard unwanted tokens.

    Args:
        text: String passed to ``jieba.cut``.
        stopwords: Container of tokens to exclude (membership-tested).

    Returns:
        list: The tokens, in order, that are not in ``stopwords`` and are
        neither empty nor whitespace-only.
    """
    kept = []
    for token in jieba.cut(text):
        # Skip stop words and blank tokens.
        if token in stopwords or token.strip() == '':
            continue
        kept.append(token)
    return kept
# Usage example: build the stop-word set from the downloaded stopwords.txt
stopwords = load_stopwords("stopwords.txt")


In [None]:
# Segment a sample sentence and drop the stop words; the bare name on the last
# line displays the resulting token list.
filtered_words = jieba_cut_with_stopwords('太多的巧合就是有緣。', stopwords)
filtered_words

Building prefix dict from /content/dict.txt ...
DEBUG:jieba:Building prefix dict from /content/dict.txt ...
Dumping model to file cache /tmp/jieba.ueb620ec8402181953a0299d7957c0d6e.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.ueb620ec8402181953a0299d7957c0d6e.cache
Loading model cost 0.701 seconds.
DEBUG:jieba:Loading model cost 0.701 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


['太多', '巧合', '有緣']

In [None]:
import jieba
# Segment each sentence and re-join its tokens with single spaces, which is the
# whitespace-delimited form the scikit-learn vectorizers consume.
saying = [
    ' '.join(jieba.cut(sentence))
    for sentence in ("努力也是一種天賦。", "太多的巧合就是有緣。")
]
saying

['努力 也 是 一種 天賦 。', '太多 的 巧合 就是 有緣 。']

### 資料收集

In [None]:
import pandas as pd
# Remote CSV of news titles and their categories, loaded into `df`.
url = 'https://raw.githubusercontent.com/imchihchao/aop113b/main/materials/04-ettoday_news.csv'
df = pd.read_csv(filepath_or_buffer=url)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,類別,標題
0,政治,青年座談不被AI取代的關鍵力　葉丙成：只有這兩種人才能真正把握時間紅利
1,政治,蕭美琴直播初體驗曝「咪琴嚴選」　邀網友挺花蓮
2,政治,疑似國軍「在美受訓」照曝光！　陸軍不評論
3,政治,「蔡英文、柯文哲有談NCC」　府：沒談人選、政黨比例更沒私下喬
4,政治,總統府否認蔡英文向柯喬NCC委員　黃國昌嗆：醜聞被揭露還敢說謊


### 資料前處理

#### 資料清理

In [None]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29216 entries, 0 to 29215
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   類別      29216 non-null  object
 1   標題      29216 non-null  object
dtypes: object(2)
memory usage: 456.6+ KB


探索性分析

#### 資料分割

In [None]:
!wget -O dict.txt https://raw.githubusercontent.com/imchihchao/aop113b/main/materials/04-dict.txt
!wget -O stopwords.txt https://raw.githubusercontent.com/imchihchao/aop113b/main/materials/04-stopwords.txt

--2025-06-03 12:10:15--  https://raw.githubusercontent.com/imchihchao/aop113b/main/materials/04-dict.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4154480 (4.0M) [application/octet-stream]
Saving to: ‘dict.txt’


2025-06-03 12:10:16 (46.4 MB/s) - ‘dict.txt’ saved [4154480/4154480]

--2025-06-03 12:10:16--  https://raw.githubusercontent.com/imchihchao/aop113b/main/materials/04-stopwords.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8129 (7.9K) [text/plain]
Saving to: ‘stopwords.txt’


2025-06-03 12:10:16 (29.9 MB/s) - ‘

In [None]:
import jieba
# Point jieba at the downloaded dict.txt before any segmentation happens.
jieba.set_dictionary('dict.txt')
def load_stopwords(filepath):
    """Read a stop-word file into a set.

    Args:
        filepath: Path to a UTF-8 text file with one stop word per line.

    Returns:
        set: The whitespace-stripped, non-empty lines of the file.
    """
    # 'utf-8-sig' drops a leading byte-order mark if one is present, so the
    # first stop word is not stored with an invisible BOM prefix (which
    # str.strip() does not remove). Files without a BOM decode identically.
    with open(filepath, 'r', encoding='utf-8-sig') as f:
        return {line.strip() for line in f if line.strip()}
def jieba_cut_with_stopwords(text, stopwords):
    """Segment ``text`` with jieba and discard unwanted tokens.

    Args:
        text: String passed to ``jieba.cut``.
        stopwords: Container of tokens to exclude (membership-tested).

    Returns:
        list: The tokens, in order, that are not in ``stopwords`` and are
        neither empty nor whitespace-only.
    """
    kept = []
    for token in jieba.cut(text):
        # Skip stop words and blank tokens.
        if token in stopwords or token.strip() == '':
            continue
        kept.append(token)
    return kept

# Build the stop-word set from the downloaded stopwords.txt
stopwords = load_stopwords("stopwords.txt")

In [None]:
def _tokenize_title(title):
    """Return ``title`` segmented by jieba, stop words removed, joined by spaces."""
    return ' '.join(jieba_cut_with_stopwords(title, stopwords))


# NOTE: this overwrites the raw titles in the same column with their
# space-joined token form.
df['標題'] = df['標題'].apply(_tokenize_title)

Building prefix dict from /content/dict.txt ...
DEBUG:jieba:Building prefix dict from /content/dict.txt ...
Dumping model to file cache /tmp/jieba.ueb620ec8402181953a0299d7957c0d6e.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.ueb620ec8402181953a0299d7957c0d6e.cache
Loading model cost 1.094 seconds.
DEBUG:jieba:Loading model cost 1.094 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


In [None]:
# Display the frame to inspect the tokenised titles (the notebook shows a truncated view).
df

Unnamed: 0,類別,標題
0,政治,青年 座談 AI 取代 關鍵 力 葉丙成 這兩種 人才 真正 把握 時間 紅利
1,政治,蕭美琴 直播 體驗 曝 咪琴 嚴選 邀 網友 挺 花蓮
2,政治,疑似 國軍 美 受訓 曝光 陸軍 評論
3,政治,蔡英文 柯文哲 有談 NCC 府 沒談 人選 政黨 比例 私下 喬
4,政治,總統府 否認 蔡英文 柯喬 NCC 委員 黃國昌 嗆 醜聞 揭露 還敢 說謊
...,...,...
29211,生活,瞞 未婚夫 下海 拍片 休息 月 後 I 級 AV 女優 宣布 引退 原因 曝
29212,生活,努力 考上 清大 家人 句 嫁 有錢 工程師 心冷
29213,生活,北榮 藝廊 閃 一週 陳威明 鎮館 寶 古柏 園 象徵 醫者 精神
29214,生活,收到 一整 串 粽子 媽 氣炸 大票 愣 每年 吃到 吐 沒聽過


In [None]:
# Features: the title column; target: the category column.
X = df['標題']
y = df['類別']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### 類別轉換

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the category -> integer mapping from the training labels only, then
# apply that same mapping to both splits.
label_encoder = LabelEncoder().fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The titles are already segmented by jieba and joined with spaces. The default
# token_pattern only keeps tokens of two or more word characters, which would
# silently discard every single-character word in the segmented titles, so use
# a pattern that also accepts one-character tokens.
vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
# Fit the vocabulary / IDF weights on the training titles only, then reuse the
# fitted vectorizer for the test titles.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

特徵縮放

### 模型訓練

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with smoothing parameter alpha=1.0, trained on the
# TF-IDF features and encoded labels.
mnb = MultinomialNB(alpha=1.0)
mnb.fit(X=X_train, y=y_train)

In [None]:
# Score the fitted model on the same data it was trained on.
mnb.score(X_train, y_train)

0.9073677905185692

### 模型評估

In [None]:
from sklearn.metrics import accuracy_score

# Predict the held-out titles and compare against their true encoded labels.
y_pred = mnb.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy", accuracy)

Accuracy 0.7838809034907598


模型調整

### 模型部署

儲存模型

#### 推論預測

In [None]:
# Classify one unseen headline with the fitted pipeline.
text = '桌球/開戰?嗆體育署憑一面之詞霸凌桌協 選訓委員:勿用政治手段處理'
# Apply the same segmentation + stop-word filtering used for the training titles.
tokens = jieba_cut_with_stopwords(text, stopwords)
s = vectorizer.transform([' '.join(tokens)])
# predict returns one encoded label per input row; take the only one.
c = mnb.predict(s)[0]
# Map the integer label back to its category name.
category = label_encoder.classes_[c]
print(category)

政治
