In [25]:
import os
import jieba
import jieba.analyse
import pandas as pd
import pyodbc
import configparser
from bs4 import BeautifulSoup
import re
import nltk

In [26]:
# Database credentials come from a local INI-style file so nothing secret is
# written into the notebook itself.
config = configparser.ConfigParser()
# ConfigParser.read() returns the list of files it actually parsed and silently
# skips missing ones; without this check a missing file only surfaces later as
# a confusing NoOptionError from config.get().
if not config.read('config.env'):
    raise FileNotFoundError(
        "config.env could not be read; it must define DB_USERNAME, "
        "DB_PASSWORD, DB_NAME and DB_HOST under [DEFAULT]")

db_UserName = config.get('DEFAULT', 'DB_USERNAME')
db_Password = config.get('DEFAULT', 'DB_PASSWORD')
db_Name = config.get('DEFAULT', 'DB_NAME')
db_Host = config.get('DEFAULT', 'DB_HOST')

# ODBC connection string for SQL Server, assembled from the values read above.
# NOTE(review): values are interpolated verbatim; a password containing ';' or
# '}' would presumably need ODBC brace-escaping — confirm against the driver docs.
cnxn_str = ("Driver={ODBC Driver 17 for SQL Server};"
            f"Server={db_Host};"
            f"Database={db_Name};"
            f"UID={db_UserName};"
            f"PWD={db_Password};")

cnxn = pyodbc.connect(cnxn_str)
# Create a cursor from the connection
cursor = cnxn.cursor()

In [27]:
# SQL query text.
# The outer select returns (id, title, context) from the union of three branches:
#   1. pttpost_referendum_3 joined to pttpost on source and id;
#   2. pttpost_referendum_3 joined to pttpostgossing on source and id;
#   3. dcard.dbo.pttpost_referendum_3 joined to dcard.dbo.post on
#      a.source = b.forum and id, with the id converted to varchar and the
#      post's `content` column supplying the third column.
# Every branch drops posts for which some row of `keyword` with source=99 has
# its keyname contained in the post's title or body text.
# The trailing "where 1=1" is an always-true predicate.
query = ("select id,title ,context from ("
         "select a.id,title,context from pttpost_referendum_3 a "
         " inner join pttpost b on a.source=b.source and a.id=b.Id "
         " where not exists (select * from keyword where source=99 and (b.title like '%'+keyname+'%' or b.context like '%'+keyname+'%')) "
         " union all "
         " select a.id,title,context from pttpost_referendum_3 a "
         " inner join pttpostgossing b on a.source=b.source and a.id=b.Id "
         " where not exists (select * from keyword where source=99 and (b.title like '%'+keyname+'%' or b.context like '%'+keyname+'%')) "
         " union all "
         " select convert(varchar,a.id),title,content from dcard.dbo.pttpost_referendum_3 a "
         " inner join dcard.dbo.post b on a.source=b.forum and a.id=b.Id "
         " where not exists (select * from keyword where source=99 and (b.title like '%'+keyname+'%' or b.content like '%'+keyname+'%')) "
         " ) m "
         "where 1=1")

In [28]:
# Execute the query over the open ODBC connection and load every returned row
# into the working DataFrame.
df = pd.read_sql(sql=query, con=cnxn)



In [29]:
# TF-IDF / TextRank parameters shared by the extraction cells.
topK = 50
withWeight = True

# Resource files used by jieba. Raw strings keep the Windows backslashes
# literal: sequences such as "\p", "\d", "\m" and "\s" are invalid escapes in
# ordinary string literals and trigger a warning on recent Python versions.
DICT_PATH = r'C:\project\python\dict.big5.txt'
USER_DICT_PATH = r'C:\project\python\main.txt'
STOP_WORDS_PATH = r'C:\project\python\stopWord2.txt'

# Main segmentation dictionary.
jieba.set_dictionary(DICT_PATH)

# Custom user dictionary.
jieba.load_userdict(USER_DICT_PATH)

# Stop words applied by jieba.analyse during keyword extraction.
jieba.analyse.set_stop_words(STOP_WORDS_PATH)

# The same stop-word file, split on whitespace, for the manual filters used
# by the other cells.
with open(STOP_WORDS_PATH, 'r', encoding='utf-8') as f:
    stop_words = f.read().split()



Building prefix dict from C:\project\python\dict.big5.txt ...
Loading model from cache C:\Users\HuanChen\AppData\Local\Temp\jieba.u7bf78fb8a3e5c528afaa2a9a1de33675.cache
Loading model cost 1.237 seconds.
Prefix dict has been built successfully.


In [30]:
def PrintKeyWord(col1, col2, resultString, top_n=50, frame=None):
    """Print the highest-weighted keywords pooled from two keyword columns.

    Each cell of ``col1`` and ``col2`` must be an iterable of
    ``(keyword, weight)`` pairs. The weights of identical keywords are summed
    over all rows and both columns, and the ``top_n`` keywords with the
    largest totals are printed as a list.

    Args:
        col1: Name of the first keyword column.
        col2: Name of the second keyword column.
        resultString: Label printed in front of the result.
        top_n: Number of keywords to print (default 50, the previous
            hard-coded value).
        frame: DataFrame to read from; when omitted the notebook-level ``df``
            is used, as before.

    Returns:
        None. Progress messages and the result are written to stdout.
    """
    source = df if frame is None else frame

    # Collect every (keyword, weight) pair, row by row, col1 before col2.
    print('get all key words')
    keywords = [
        (keyword, weight)
        for first, second in zip(source[col1], source[col2])
        for pairs in (first, second)
        for keyword, weight in pairs
    ]

    # Convert to a DataFrame.
    print('轉換為dataframe')
    keywords_df = pd.DataFrame(keywords, columns=['keyword', 'weight'])

    # Merge identical keywords by summing their weights.
    keywords_grouped = keywords_df.groupby(
        ['keyword']).agg({'weight': 'sum'}).reset_index()

    # Sort by total weight, largest first.
    print('按權重從大到小排序')
    keywords_sorted = keywords_grouped.sort_values('weight', ascending=False)

    # Keep the top_n keywords (the message reads "50" for the default).
    print(f'取出前{top_n}個關鍵詞')
    top_keywords = keywords_sorted.head(top_n)['keyword'].tolist()

    # Print the result.
    print(f'{resultString} result:')
    print(top_keywords)

In [31]:
def pagerank(graph, weight=None, alpha=0.85, max_iter=100, tol=1e-6, weight_args=None):
    """Score the nodes of ``graph`` with damped PageRank iteration.

    Args:
        graph: Mapping ``node -> iterable of neighbor nodes``. Every neighbor
            must itself be a key of ``graph``.
        weight: Callable ``weight(source, target, graph, weight_args)`` giving
            the share of ``source``'s score that flows to ``target``. Defaults
            to the notebook's ``uniform_weight``.
        alpha: Damping factor.
        max_iter: Maximum number of iterations.
        tol: Stop once the summed absolute change of all scores between two
            successive iterations drops below this value.
        weight_args: Opaque extra argument forwarded to ``weight``.

    Returns:
        dict mapping each node to its score; ``{}`` for an empty graph.
    """
    if weight is None:
        weight = uniform_weight

    node_count = len(graph)
    if node_count == 0:
        return {}

    teleport = (1 - alpha) / node_count
    scores = {node: 1.0 / node_count for node in graph}

    for _ in range(max_iter):
        updated = {}
        for node in graph:
            incoming = 0.0
            for neighbor in graph[node]:
                # The contribution travels neighbor -> node, so it has to be
                # normalised over the *neighbor's* edges. Normalising over
                # `node` instead turns the update into a plain average of the
                # neighbor scores, for which the uniform start vector is
                # already a fixed point (every node would keep score 1/N).
                incoming += weight(neighbor, node, graph, weight_args) * scores[neighbor]
            updated[node] = alpha * incoming + teleport

        # Compare like with like: damped scores of this iteration against the
        # damped scores of the previous one.
        delta = sum(abs(updated[node] - scores[node]) for node in graph)
        scores = updated
        if delta < tol:
            break
    return scores

In [32]:
# Default edge-weight function.
def uniform_weight(x, y, graph, weight_args):
    """Return the reciprocal of the number of neighbors of ``x``.

    ``y`` and ``weight_args`` are accepted for signature compatibility and
    are not used. Raises ``KeyError`` if ``x`` is not in ``graph`` and
    ``ZeroDivisionError`` if ``x`` has no neighbors.
    """
    degree = len(graph[x])
    return 1.0 / degree


In [33]:
# Hand-rolled text-rank keyword extraction for a single document.
def get_keywords_textrank(content):
    """Rank the words of ``content`` with ``pagerank`` over a co-occurrence graph.

    The text is segmented with jieba; stop words and tokens that are not made
    purely of characters in the range U+4E00..U+9FA5 are discarded. Two
    surviving words are linked when they are at most five positions apart.

    Returns:
        A list of ``(word, score)`` tuples, highest score first, truncated to
        the notebook-level ``topK``.
    """
    chinese_only = '^[\u4e00-\u9fa5]+$'
    tokens = []
    for token in jieba.lcut(content):
        if token in stop_words:
            continue
        if re.match(chinese_only, token):
            tokens.append(token)

    # Undirected co-occurrence graph: word -> set of words seen within the window.
    window = 5
    total = len(tokens)
    cooccurrence = {}
    for left, head in enumerate(tokens):
        cooccurrence.setdefault(head, set())
        for right in range(left + 1, min(left + window + 1, total)):
            tail = tokens[right]
            cooccurrence.setdefault(tail, set())
            cooccurrence[head].add(tail)
            cooccurrence[tail].add(head)

    ranked = pagerank(cooccurrence, weight=None, alpha=0.85,
                      max_iter=100, tol=1e-6, weight_args=None)

    # Highest score first, then keep the leading topK entries.
    by_score = sorted(ranked.items(), key=lambda item: item[1], reverse=True)
    return list(by_score[:topK])


In [34]:
# Tokenise one string for the term-frequency count.
def process_text(text):
    """Segment ``text`` with jieba and return the tokens that are not stop words.

    Returns:
        list of tokens, in the order jieba produced them.
    """
    return [token for token in jieba.cut(text) if token not in stop_words]

In [35]:
# Strip HTML markup: parse each value as HTML and keep only its text content.
print('remove html tag')


def _visible_text(markup):
    """Return the text of ``markup`` as extracted by BeautifulSoup's html.parser."""
    return BeautifulSoup(markup, "html.parser").get_text()


for column in ('context', 'title'):
    df[column] = df[column].apply(_visible_text)

remove html tag




In [36]:
# Remove special symbols: drop every character that is neither a word
# character nor whitespace.
print('remove special word')
non_word_symbol = re.compile(r'[^\w\s]')
for column in ('context', 'title'):
    df[column] = df[column].apply(lambda text: non_word_symbol.sub('', text))

remove special word


In [37]:
# Remove HTML tags with a regex: delete every '<...>' span.
# NOTE(review): once the cell that strips all non-word, non-space characters
# has run, no '<' or '>' is left, so this pass likely matches nothing —
# confirm before removing it.
print('去除HTML tag')
html_tag = re.compile(r'<[^<]+?>')
for column in ('context', 'title'):
    df[column] = df[column].apply(lambda text: html_tag.sub('', text))

去除HTML tag


In [38]:
# Remove punctuation and digits: delete every run of characters outside the
# range U+4E00..U+9FA5, so only those characters remain (whitespace is removed too).
print('去除標點符號及數字')
non_chinese_run = re.compile(r'[^\u4e00-\u9fa5]+')
for column in ('context', 'title'):
    df[column] = df[column].apply(lambda text: non_chinese_run.sub('', text))

去除標點符號及數字


In [39]:
# "Remove stop words" step.
# NOTE(review): this does more than filter stop words — each text is replaced
# by the tags that jieba.analyse.extract_tags returns for it (limited by
# topK), joined with single spaces, so the original word sequence is not kept.
print('remove stop words')


def _top_tags_without_stop_words(text):
    """Return the extracted tags of ``text`` that are not stop words, space-joined."""
    tags = jieba.analyse.extract_tags(text, topK=topK, withWeight=False)
    return ' '.join([tag for tag in tags if tag not in stop_words])


df['context'] = df['context'].apply(_top_tags_without_stop_words)
df['title'] = df['title'].apply(_top_tags_without_stop_words)

remove stop words


In [40]:
print('start analyze title stop word')


def _segment_title(text):
    """Segment ``text`` with jieba and join the tokens with single spaces."""
    return ' '.join(jieba.cut(text))


# Word-segment the title of every record.
df['title_cut'] = df['title'].apply(_segment_title)

start analyze title stop word


In [41]:
print('start analyze context  stop word')


def _segment_context(text):
    """Segment ``text`` with jieba and join the tokens with single spaces."""
    return ' '.join(jieba.cut(text))


# Word-segment the body text of every record.
df['context_cut'] = df['context'].apply(_segment_context)

start analyze context  stop word


In [42]:
print('start analyze title tf-idf')


def _title_tfidf(text):
    """Return jieba's TF-IDF tags for ``text`` using the notebook's topK/withWeight."""
    return jieba.analyse.extract_tags(text, topK=topK, withWeight=withWeight)


# TF-IDF keywords of every segmented title.
df['title_keywords'] = df['title_cut'].apply(_title_tfidf)

start analyze title tf-idf


In [43]:
print('start analyze context tf-idf')


def _context_tfidf(text):
    """Return jieba's TF-IDF tags for ``text`` using the notebook's topK/withWeight."""
    return jieba.analyse.extract_tags(text, topK=topK, withWeight=withWeight)


# TF-IDF keywords of every segmented body text.
df['context_keywords'] = df['context_cut'].apply(_context_tfidf)

start analyze context tf-idf


In [44]:
# Pool the TF-IDF keywords of titles and bodies and print the overall ranking.
PrintKeyWord(col1='title_keywords', col2='context_keywords', resultString='tf-idf')

get all key words
轉換為dataframe
按權重從大到小排序
取出前50個關鍵詞
tf-idf result:
['問卦', '新聞', '台灣', '中國', '八卦', '爆卦', '公投', '核四', '蔡英文', '停電', '黑特', '美國', '總統', '地震', '發文', '相關', '個月', '也算', '前請', '柯文哲', '台灣人', '政治', '台電', '塔綠班', '缺電', '國家', '對岸', '網軍', '四個', '禁止', '不同意', '媒體', '發電', '高雄', '鄉民', '核廢料', '請注意', '藻礁', '記者', '中共', '萊豬', '保護', '重啟', '反核', '選舉', '中國人', '台北', '政府', '投票', '環境']


In [45]:
# Add the text-rank keywords of every segmented body text to the DataFrame.
# jieba's built-in textrank is used; the hand-written get_keywords_textrank
# is an alternative that is left disabled here.
print('start analyze context textrank')


def _context_textrank(text):
    """Return jieba's TextRank keywords for ``text``, restricted to the listed POS tags."""
    return jieba.analyse.textrank(
        text, topK=topK, withWeight=withWeight, allowPOS=('ns', 'n', 'vn', 'v'))


df['tr_content_keywords'] = df['context_cut'].apply(_context_textrank)


start analyze context textrank


In [46]:
print('start analyze title textrank')


def _title_textrank(text):
    """Return jieba's TextRank keywords for ``text``, restricted to the listed POS tags."""
    return jieba.analyse.textrank(
        text, topK=topK, withWeight=withWeight, allowPOS=('ns', 'n', 'vn', 'v'))


# jieba's built-in textrank is used; the hand-written get_keywords_textrank
# is an alternative that is left disabled here.
df['tr_title_keywords'] = df['title_cut'].apply(_title_textrank)


start analyze title textrank


In [47]:
# Pool the text-rank keywords of bodies and titles and print the overall ranking.
PrintKeyWord(col1='tr_content_keywords', col2='tr_title_keywords', resultString='text-rank')

get all key words
轉換為dataframe
按權重從大到小排序
取出前50個關鍵詞
text-rank result:
['問卦', '八卦', '政治', '禁止', '政府', '公投', '好像', '台北', '日本', '不用', '支持', '朋友', '人民', '地震', '民主', '投票', '影片', '事情', '文章', '地方', '核能', '事件', '小孩', '希望', '世界', '垃圾', '小弟', '公司', '同路人', '代表', '分享', '安安', '台北市', '能源', '爆卦', '原本', '民意', '不到', '工作', '政策', '留言', '道歉', '想到', '方式', '抹黑', '肥宅', '缺水', '原因', '理由', '蟑螂']


In [48]:
print('start analyze title tf')
# Tokenise the body texts, then the titles, and concatenate the two lists
# (bodies first) into a single corpus.
corpus_context = list(map(process_text, df['context']))
corpus_title = list(map(process_text, df['title']))
corpus = corpus_context + corpus_title

# Count how often each token occurs across the whole corpus.
word_freq = nltk.FreqDist(
    token for document in corpus for token in document)

# Take 51 entries rather than 50: one extra to make up for the space token.
top_words = word_freq.most_common(51)

# Print the words only, one per line; the counts are not shown.
for word, _freq in top_words:
    print(f"{word}")


start analyze title tf
 
問卦
台灣
新聞
八卦
中國
政治
發文
相關
個月
也算
禁止
前請
美國
記者
媒體
公投
總統
完整
政府
核四
國家
蔡英文
台灣人
新聞標題
請注意
報導
中共
停電
鄉民
對岸
東西
台電
爆卦
日本
環境
好像
柯文哲
台北
選舉
保護
也有
支持
塔綠班
不用
發電
社會
地震
不同意
自己的
民眾
