In [1]:
import os
import jieba
import jieba.analyse
import pandas as pd
import pyodbc
import configparser
from bs4 import BeautifulSoup
import re
import nltk

In [2]:
# Database credentials are kept in an INI-style file outside the notebook.
CONFIG_PATH = 'config.env'

config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty result means no settings were loaded at all.
# Fail here with a clear message instead of a later NoOptionError.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(f"Could not read configuration file {CONFIG_PATH!r}")

db_UserName = config.get('DEFAULT', 'DB_USERNAME')
db_Password = config.get('DEFAULT', 'DB_PASSWORD')
db_Name = config.get('DEFAULT', 'DB_NAME')
db_Host = config.get('DEFAULT', 'DB_HOST')

# ODBC connection string for SQL Server, assembled from the settings above.
# NOTE(review): values are interpolated verbatim; a password containing ';'
# would presumably need ODBC brace-escaping -- confirm against the driver docs.
cnxn_str = ("Driver={ODBC Driver 17 for SQL Server};"
            f"Server={db_Host};"
            f"Database={db_Name};"
            f"UID={db_UserName};"
            f"PWD={db_Password};")

cnxn = pyodbc.connect(cnxn_str)
# Create a cursor from the connection
cursor = cnxn.cursor()

In [3]:
# SQL query.
# The inner query is a UNION ALL of three SELECTs, each inner-joining a
# pttpost_referendum_2 table to a post table (pttpost, pttpostgossing and
# dcard.dbo.post). Each branch drops rows whose title or body text contains
# any `keyname` from `keyword` rows with source=99.
# The third branch converts the id to varchar and reads the body from
# `content`; the outer SELECT exposes the columns as id, title, context.
# The trailing "where 1=1" does not filter anything.
query = ("select id,title ,context from ("
         "select a.id,title,context from pttpost_referendum_2 a "
         " inner join pttpost b on a.source=b.source and a.id=b.Id "
         " where not exists (select * from keyword where source=99 and (b.title like '%'+keyname+'%' or b.context like '%'+keyname+'%')) "
         " union all "
         " select a.id,title,context from pttpost_referendum_2 a "
         " inner join pttpostgossing b on a.source=b.source and a.id=b.Id "
         " where not exists (select * from keyword where source=99 and (b.title like '%'+keyname+'%' or b.context like '%'+keyname+'%')) "
         " union all "
         " select convert(varchar,a.id),title,content from dcard.dbo.pttpost_referendum_2 a "
         " inner join dcard.dbo.post b on a.source=b.forum and a.id=b.Id "
         " where not exists (select * from keyword where source=99 and (b.title like '%'+keyname+'%' or b.content like '%'+keyname+'%')) "
         " ) m "
         "where 1=1")

# Run the query on the open connection and load the result into a DataFrame.
df = pd.read_sql(query, cnxn)



In [5]:
def _load_lexicon(path):
    """Read a one-word-per-line UTF-8 text file and return its words as a set.

    Surrounding whitespace is stripped from every line and blank lines are
    dropped, so the empty string never ends up in the lexicon.
    """
    with open(path, encoding='utf-8', mode='r') as f:
        return {word for word in (line.strip() for line in f) if word}


# The lexicons are only used for membership tests, once per token of every
# post, so they are stored as sets (constant-time lookup) rather than lists.
positive_words = _load_lexicon('hownet_postive.txt')
negative_words = _load_lexicon('hownet_negative.txt')

In [6]:
# Extract the negative-lexicon words from a piece of text.
def process_text_negative(text):
    """Return the tokens of ``text`` that appear in ``negative_words``.

    The text is segmented with ``jieba.cut`` and every token found in the
    module-level ``negative_words`` collection is kept, in order of
    occurrence; a word that occurs several times is returned several times.

    Args:
        text: The string to analyse. ``None`` or a float NaN (missing values
            in a DataFrame column) is treated as empty text.

    Returns:
        list: The matching tokens; an empty list when ``text`` is missing.
    """
    # Missing values are not text and cannot be segmented; ``text != text``
    # is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return []

    # Segment with jieba and keep only the tokens listed in the lexicon.
    return [token for token in jieba.cut(text) if token in negative_words]

In [7]:
# Extract the positive-lexicon words from a piece of text.
def process_text_postive(text):
    """Return the tokens of ``text`` that appear in ``positive_words``.

    The text is segmented with ``jieba.cut`` and every token found in the
    module-level ``positive_words`` collection is kept, in order of
    occurrence; a word that occurs several times is returned several times.

    Args:
        text: The string to analyse. ``None`` or a float NaN (missing values
            in a DataFrame column) is treated as empty text.

    Returns:
        list: The matching tokens; an empty list when ``text`` is missing.
    """
    # Missing values are not text and cannot be segmented; ``text != text``
    # is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return []

    # Segment with jieba and keep only the tokens listed in the lexicon.
    return [token for token in jieba.cut(text) if token in positive_words]

In [8]:
# Replace jieba's main dictionary with dict.big5.txt.
# Raw strings are used because these Windows paths contain backslash pairs
# (\p, \d, \m) that are invalid escape sequences in a normal string literal
# and trigger a warning on recent Python versions.
# NOTE(review): absolute local paths; consider making them configurable.
jieba.set_dictionary(r'C:\project\python\dict.big5.txt')

# Load the user-defined dictionary on top of the main one.
jieba.load_userdict(r'C:\project\python\main.txt')

Building prefix dict from C:\project\python\dict.big5.txt ...
Loading model from cache C:\Users\HuanChen\AppData\Local\Temp\jieba.u7bf78fb8a3e5c528afaa2a9a1de33675.cache
Loading model cost 1.326 seconds.
Prefix dict has been built successfully.


In [9]:
print('start analyze emotion word in context')
# Apply both lexicon filters to every value of the `context` column,
# producing one list of matched words per row.
context_texts = df['context']
corpus_context_negative = list(map(process_text_negative, context_texts))
corpus_context_postive = list(map(process_text_postive, context_texts))

start analyze emotion word in context


In [10]:
print('start analyze emotion word in title')
# Apply both lexicon filters to every value of the `title` column,
# producing one list of matched words per row.
title_texts = df['title']
corpus_title_negative = list(map(process_text_negative, title_texts))
corpus_title_postive = list(map(process_text_postive, title_texts))

start analyze emotion word in title


In [11]:
# Combine the context-derived and title-derived word lists into a single
# corpus per polarity (context entries first, then title entries).
corpus_postive = [*corpus_context_postive, *corpus_title_postive]
corpus_negative = [*corpus_context_negative, *corpus_title_negative]

In [12]:
# Count, with nltk.FreqDist, every matched positive word that is longer than
# one character.
positive_hits = (
    token
    for tokens in corpus_postive
    for token in tokens
    if len(token) > 1
)
word_freq = nltk.FreqDist(positive_hits)

# Keep the 20 most frequent words.
top_words = word_freq.most_common(20)

# Print the ranking, one "word: count" line per entry.
print("正面詞彙")
for word, freq in top_words:
    print(f"{word}: {freq}")

正面詞彙
完整: 22097
其實: 12296
根本: 10852
可能: 9619
支持: 7732
完全: 7418
希望: 6973
同意: 6323
需要: 5760
民主: 4949
決定: 4179
安全: 4010
一定: 3826
接受: 3718
剛剛: 3673
喜歡: 3593
經濟: 3561
自由: 3443
通過: 3223
負責: 3015


In [13]:
# Count, with nltk.FreqDist, every matched negative word that is longer than
# one character.
negative_hits = (
    token
    for tokens in corpus_negative
    for token in tokens
    if len(token) > 1
)
word_freq = nltk.FreqDist(negative_hits)

# Keep the 20 most frequent words.
top_words = word_freq.most_common(20)

# Print the ranking, one "word: count" line per entry.
print("負面詞彙")
for word, freq in top_words:
    print(f"{word}: {freq}")

負面詞彙
攻擊: 4074
不同意: 3960
活動: 3304
質疑: 3104
錯誤: 2917
嚴重: 2542
隨便: 2394
擔心: 2073
智障: 2003
可憐: 1936
簡單: 1864
不行: 1786
似乎: 1747
抗議: 1616
背後: 1509
不爽: 1486
不滿: 1408
懷疑: 1369
危險: 1317
厲害: 1281
