# 匯入套件

In [39]:
# 匯入套件
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import jieba

# 英文部分

In [None]:
# 假設我們有這些英文文本
docs = [
    "The city issued a typhoon warning and announced school closures for tomorrow; several bus routes will run on a reduced schedule.",
    "Due to the storm, multiple flights were canceled; the airport advised travelers to check updates and arrive early for security screening.",
    "Rail operators added extra trains for passenger evacuation, while highways experienced flooding and temporary road closures.",
    "The stock market turned volatile after the typhoon forecast; insurance and shipping stocks fell as investors shifted to safe-haven assets.",
    "Emergency crews inspected rivers and operated pumping stations; residents were warned about landslides and asked to avoid mountain areas."
]

# 建立 TF-IDF 範例
'''
ngram_range 可調整 n-gram 範圍，
(1,1) 表示只使用 unigram，
(2,2) 表示只使用 bigram, 
(1,2) 表示同時使用 unigram 和 bigram
'''
vectorizer = TfidfVectorizer(
    stop_words='english', # 移除英文停用詞
    ngram_range=(1, 2), # 使用單字和雙字詞組合
)

# 進行 TF-IDF 轉換
X = vectorizer.fit_transform(docs)

# 顯示結果
print(vectorizer.get_feature_names_out())
print(X.toarray())

['added' 'added extra' 'advised' 'advised travelers' 'airport'
 'airport advised' 'announced' 'announced school' 'areas' 'arrive'
 'arrive early' 'asked' 'asked avoid' 'assets' 'avoid' 'avoid mountain'
 'bus' 'bus routes' 'canceled' 'canceled airport' 'check' 'check updates'
 'city' 'city issued' 'closures' 'closures tomorrow' 'crews'
 'crews inspected' 'early' 'early security' 'emergency' 'emergency crews'
 'evacuation' 'evacuation highways' 'experienced' 'experienced flooding'
 'extra' 'extra trains' 'fell' 'fell investors' 'flights'
 'flights canceled' 'flooding' 'flooding temporary' 'forecast'
 'forecast insurance' 'haven' 'haven assets' 'highways'
 'highways experienced' 'inspected' 'inspected rivers' 'insurance'
 'insurance shipping' 'investors' 'investors shifted' 'issued'
 'issued typhoon' 'landslides' 'landslides asked' 'market' 'market turned'
 'mountain' 'mountain areas' 'multiple' 'multiple flights' 'operated'
 'operated pumping' 'operators' 'operators added' 'passenger'
 '

In [41]:
# 顯示 dataframe 所有欄位
pd.set_option('display.max_columns', None)

# 透過 pandas 來預覽結果
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
df

Unnamed: 0,added,added extra,advised,advised travelers,airport,airport advised,announced,announced school,areas,arrive,arrive early,asked,asked avoid,assets,avoid,avoid mountain,bus,bus routes,canceled,canceled airport,check,check updates,city,city issued,closures,closures tomorrow,crews,crews inspected,early,early security,emergency,emergency crews,evacuation,evacuation highways,experienced,experienced flooding,extra,extra trains,fell,fell investors,flights,flights canceled,flooding,flooding temporary,forecast,forecast insurance,haven,haven assets,highways,highways experienced,inspected,inspected rivers,insurance,insurance shipping,investors,investors shifted,issued,issued typhoon,landslides,landslides asked,market,market turned,mountain,mountain areas,multiple,multiple flights,operated,operated pumping,operators,operators added,passenger,passenger evacuation,pumping,pumping stations,rail,rail operators,reduced,reduced schedule,residents,residents warned,rivers,rivers operated,road,road closures,routes,routes run,run,run reduced,safe,safe haven,schedule,school,school closures,screening,security,security screening,shifted,shifted safe,shipping,shipping stocks,stations,stations residents,stock,stock market,stocks,stocks fell,storm,storm multiple,temporary,temporary road,tomorrow,tomorrow bus,trains,trains passenger,travelers,travelers check,turned,turned volatile,typhoon,typhoon forecast,typhoon warning,updates,updates arrive,volatile,volatile typhoon,warned,warned landslides,warning,warning announced
0,0.0,0.0,0.0,0.0,0.0,0.0,0.202853,0.202853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.202853,0.202853,0.0,0.0,0.0,0.0,0.202853,0.202853,0.16366,0.202853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.202853,0.202853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.202853,0.202853,0.0,0.0,0.0,0.0,0.0,0.0,0.202853,0.202853,0.202853,0.202853,0.0,0.0,0.202853,0.202853,0.202853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.202853,0.202853,0.0,0.0,0.0,0.0,0.0,0.0,0.16366,0.0,0.202853,0.0,0.0,0.0,0.0,0.0,0.0,0.202853,0.202853
1,0.0,0.0,0.2,0.2,0.2,0.2,0.0,0.0,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.0,0.0,0.0,0.0,0.0
2,0.201411,0.201411,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.162497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.201411,0.201411,0.201411,0.201411,0.201411,0.201411,0.0,0.0,0.0,0.0,0.201411,0.201411,0.0,0.0,0.0,0.0,0.201411,0.201411,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.201411,0.201411,0.201411,0.201411,0.0,0.0,0.201411,0.201411,0.0,0.0,0.0,0.0,0.0,0.0,0.201411,0.201411,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.201411,0.201411,0.0,0.0,0.201411,0.201411,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.186823,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.186823,0.186823,0.0,0.0,0.0,0.0,0.186823,0.186823,0.186823,0.186823,0.0,0.0,0.0,0.0,0.186823,0.186823,0.186823,0.186823,0.0,0.0,0.0,0.0,0.186823,0.186823,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.186823,0.186823,0.0,0.0,0.0,0.0,0.0,0.0,0.186823,0.186823,0.186823,0.186823,0.0,0.0,0.186823,0.186823,0.186823,0.186823,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.186823,0.186823,0.150728,0.186823,0.0,0.0,0.0,0.186823,0.186823,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.19245,0.0,0.0,0.19245,0.19245,0.0,0.19245,0.19245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.19245,0.19245,0.0,0.0,0.19245,0.19245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.19245,0.19245,0.0,0.0,0.0,0.0,0.0,0.0,0.19245,0.19245,0.0,0.0,0.19245,0.19245,0.0,0.0,0.19245,0.19245,0.0,0.0,0.0,0.0,0.19245,0.19245,0.0,0.0,0.0,0.0,0.19245,0.19245,0.19245,0.19245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.19245,0.19245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.19245,0.19245,0.0,0.0


In [42]:
# 使用 TF-IDF 進行搜尋函式範例
def search_tfidf_en(query: str, top_k: int = 3, min_score: float = 0.0):
    '''
    以 TF-IDF + cosine similarity 做簡易搜尋排序。
    - query: 使用者查詢字串
    - top_k: 回傳前 k 筆
    - min_score: 低於此分數不顯示（避免全是 0 的結果）
    '''

    # 將查詢轉換為 TF-IDF 向量
    q_vec = vectorizer.transform([query])

    # 計算查詢向量與文件向量的餘弦相似度
    scores = cosine_similarity(q_vec, X).flatten()

    # 依分數由高到低排序
    ranked_idx = np.argsort(scores)[::-1]

    # 取得前 top_k 筆且分數高於 min_score 的結果
    results = []

    # 回傳 (文件索引, 分數, 內容)
    for idx in ranked_idx[:top_k]:
        if scores[idx] >= min_score:
            results.append((idx, scores[idx], docs[idx]))
    return results

# 查詢範例
queries = [
    "flight canceled airport security",
    "school closure bus schedule",
    "landslide pumping station river",
    "stock market insurance safe haven",
]
for q in queries:
    print(f"Query: {q}")
    for doc_id, score, text in search_tfidf_en(q, top_k=3, min_score=0.0):
        print(f"  score={score:.3f}  doc#{doc_id+1}: {text[:90]}...")

Query: flight canceled airport security
  score=0.400  doc#2: Due to the storm, multiple flights were canceled; the airport advised travelers to check u...
  score=0.000  doc#5: Emergency crews inspected rivers and operated pumping stations; residents were warned abou...
  score=0.000  doc#4: The stock market turned volatile after the typhoon forecast; insurance and shipping stocks...
Query: school closure bus schedule
  score=0.000  doc#5: Emergency crews inspected rivers and operated pumping stations; residents were warned abou...
  score=0.000  doc#4: The stock market turned volatile after the typhoon forecast; insurance and shipping stocks...
Query: landslide pumping station river
  score=0.192  doc#5: Emergency crews inspected rivers and operated pumping stations; residents were warned abou...
  score=0.000  doc#4: The stock market turned volatile after the typhoon forecast; insurance and shipping stocks...
  score=0.000  doc#3: Rail operators added extra trains for passenger evac

# 中文部分

In [43]:
# 假設我們有這些中文文本
documents = [
    "中央氣象署發布颱風警報，台北市宣布明天停班停課，部分路線公車減班。",
    "颱風影響航班取消，高鐵與台鐵加開班次疏運旅客，機場提醒提早報到。",
    "學校公告因豪雨停課，校園進行排水與樹木修剪，家長關心補課安排。",
    "股市受颱風消息影響震盪，部分保險與航運類股下跌，投資人轉向避險。",
    "市政府加強河川巡檢與抽水站運轉，呼籲民眾遠離河堤並注意土石流警戒。"
]

# 建立 TF-IDF 範例
vectorizer = TfidfVectorizer(
    tokenizer=jieba.lcut,  # 使用 jieba 進行中文分詞
    token_pattern=None,  # 使用自訂的分詞器
)

# 進行 TF-IDF 轉換
X = vectorizer.fit_transform(documents)

# 顯示結果
print(vectorizer.get_feature_names_out())
print(X.toarray())

['。' '下跌' '並' '中央' '保險' '修剪' '停班' '停課' '公告' '公車' '加強' '取消' '受' '台' '台北市'
 '呼籲民眾' '因' '土石' '報到' '學校' '安排' '宣布' '家長' '巡檢' '市政府' '影響' '投資人' '抽水站' '排水'
 '提早' '提醒' '旅客' '明天' '校園' '樹木' '機場' '氣象署' '河堤' '河川' '注意' '流' '消息' '減班'
 '班次' '疏運' '發布' '股' '股市' '與' '航班' '航運類' '補課' '警戒' '豪雨' '路線' '轉向' '進行' '運轉'
 '遠離' '避險' '部分' '鐵加開' '關心' '震盪' '風' '風影響' '風警報' '颱' '高鐵' '，']
[[0.1278748  0.         0.         0.26835962 0.         0.
  0.26835962 0.21651085 0.         0.26835962 0.         0.
  0.         0.         0.26835962 0.         0.         0.
  0.         0.         0.         0.26835962 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.26835962 0.         0.         0.
  0.26835962 0.         0.         0.         0.         0.
  0.26835962 0.         0.         0.26835962 0.         0.
  0.         0.         0.         0.         0.         0.
  0.26835962 0.         0.         0.         0.         0.
  0.21651085 0.         0.         0.       

In [44]:
# 透過 pandas 來預覽結果
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
df

Unnamed: 0,。,下跌,並,中央,保險,修剪,停班,停課,公告,公車,加強,取消,受,台,台北市,呼籲民眾,因,土石,報到,學校,安排,宣布,家長,巡檢,市政府,影響,投資人,抽水站,排水,提早,提醒,旅客,明天,校園,樹木,機場,氣象署,河堤,河川,注意,流,消息,減班,班次,疏運,發布,股,股市,與,航班,航運類,補課,警戒,豪雨,路線,轉向,進行,運轉,遠離,避險,部分,鐵加開,關心,震盪,風,風影響,風警報,颱,高鐵,，
0,0.127875,0.0,0.0,0.26836,0.0,0.0,0.26836,0.216511,0.0,0.26836,0.0,0.0,0.0,0.0,0.26836,0.0,0.0,0.0,0.0,0.0,0.0,0.26836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.26836,0.0,0.0,0.0,0.26836,0.0,0.0,0.0,0.0,0.0,0.26836,0.0,0.0,0.26836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.26836,0.0,0.0,0.0,0.0,0.0,0.216511,0.0,0.0,0.0,0.0,0.0,0.26836,0.179724,0.0,0.25575
1,0.12344,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.259053,0.0,0.259053,0.0,0.0,0.0,0.0,0.259053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.259053,0.259053,0.259053,0.0,0.0,0.0,0.259053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.259053,0.259053,0.0,0.0,0.0,0.145946,0.259053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.259053,0.0,0.0,0.0,0.259053,0.0,0.173491,0.259053,0.246881
2,0.12261,0.0,0.0,0.0,0.0,0.257312,0.0,0.207598,0.257312,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.257312,0.0,0.0,0.257312,0.257312,0.0,0.257312,0.0,0.0,0.0,0.0,0.0,0.257312,0.0,0.0,0.0,0.0,0.257312,0.257312,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.144965,0.0,0.0,0.257312,0.0,0.257312,0.0,0.0,0.257312,0.0,0.0,0.0,0.0,0.0,0.257312,0.0,0.0,0.0,0.0,0.0,0.0,0.245221
3,0.12083,0.253574,0.0,0.0,0.253574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.253574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.253574,0.253574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.253574,0.0,0.0,0.0,0.0,0.253574,0.253574,0.142859,0.0,0.253574,0.0,0.0,0.0,0.0,0.253574,0.0,0.0,0.0,0.253574,0.204582,0.0,0.0,0.253574,0.253574,0.0,0.0,0.169822,0.0,0.241659
4,0.123981,0.0,0.260188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.260188,0.0,0.0,0.0,0.0,0.260188,0.0,0.260188,0.0,0.0,0.0,0.0,0.0,0.260188,0.260188,0.0,0.0,0.260188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.260188,0.260188,0.260188,0.260188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.146585,0.0,0.0,0.0,0.260188,0.0,0.0,0.0,0.0,0.260188,0.260188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.123981


In [45]:
# 使用 TF-IDF 進行搜尋函式範例
def search_tfidf_zh(query, top_k=3, min_score=0.0):
    q_vec = vectorizer.transform([query])
    scores = cosine_similarity(q_vec, X).flatten()
    idx = np.argsort(scores)[::-1][:top_k]
    results = []
    for i in idx:
        if scores[i] >= min_score:
            results.append((i, scores[i], documents[i]))
    return results

# 查詢範例
query = "航班取消 機場 提早報到"
results = search_tfidf_zh(query, top_k=3, min_score=0.0)
for rank, score, text in results:
    print(f"Rank {rank}: score={score:.3f}, text={text}")

Rank 1: score=0.579, text=颱風影響航班取消，高鐵與台鐵加開班次疏運旅客，機場提醒提早報到。
Rank 4: score=0.000, text=市政府加強河川巡檢與抽水站運轉，呼籲民眾遠離河堤並注意土石流警戒。
Rank 3: score=0.000, text=股市受颱風消息影響震盪，部分保險與航運類股下跌，投資人轉向避險。
