In [3]:
import os
import jieba
import jieba.analyse
import pandas as pd
import pyodbc
import configparser
from bs4 import BeautifulSoup
import re
import nltk

In [4]:
# Load database settings from the [DEFAULT] section of config.env
# (path is relative to the notebook's working directory).
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the
# list of files it actually parsed. Fail fast here so a missing file is
# reported as such instead of as a confusing "no option" error below.
if not config.read('config.env'):
    raise FileNotFoundError(
        "config.env not found or unreadable; cannot load database settings"
    )
db_UserName = config.get('DEFAULT', 'DB_USERNAME')
db_Password = config.get('DEFAULT', 'DB_PASSWORD')
db_Name = config.get('DEFAULT', 'DB_NAME')
db_Host = config.get('DEFAULT', 'DB_HOST')

# ODBC connection string for SQL Server; the first part is a plain (non-f)
# literal so the braces around the driver name are kept verbatim.
cnxn_str = ("Driver={ODBC Driver 17 for SQL Server};"
            f"Server={db_Host};"
            f"Database={db_Name};"
            f"UID={db_UserName};"
            f"PWD={db_Password};")

# Open the connection used by the query cell below.
cnxn = pyodbc.connect(cnxn_str)
# Create a cursor from the connection
cursor = cnxn.cursor()

In [5]:
# SQL query text.
# Builds one result set (id, title, context) as the UNION ALL of three
# selects, each joining pttpost_referendum_1 to a post table:
#   1. pttpost            (joined on source and id)
#   2. pttpostgossing     (joined on source and id)
#   3. dcard.dbo.post     (joined on source=forum and id; id is converted to
#      varchar and the `content` column fills the `context` position --
#      presumably so the column types line up across the union; verify)
# Every branch drops posts whose title or body contains any `keyname` from
# `keyword` rows with source=99. The trailing "where 1=1" is a no-op filter.
query = ("select id,title ,context from ("
         "select a.id,title,context from pttpost_referendum_1 a "
         " inner join pttpost b on a.source=b.source and a.id=b.Id "
         " where not exists (select * from keyword where source=99 and (b.title like '%'+keyname+'%' or b.context like '%'+keyname+'%')) "
         " union all "
         " select a.id,title,context from pttpost_referendum_1 a "
         " inner join pttpostgossing b on a.source=b.source and a.id=b.Id "
         " where not exists (select * from keyword where source=99 and (b.title like '%'+keyname+'%' or b.context like '%'+keyname+'%')) "
         " union all "
         " select convert(varchar,a.id),title,content from dcard.dbo.pttpost_referendum_1 a "
         " inner join dcard.dbo.post b on a.source=b.forum and a.id=b.Id "
         " where not exists (select * from keyword where source=99 and (b.title like '%'+keyname+'%' or b.content like '%'+keyname+'%')) "
         " ) m "
         "where 1=1")

# Run the query over the open connection and load the rows into a DataFrame.
df = pd.read_sql(query, cnxn)



In [6]:
# Load the NTUSD sentiment lexicons (one word per line).
# - 'utf-8-sig' drops a leading byte-order mark when the file has one, so the
#   first entry is not stored with an invisible prefix; files without a BOM
#   are read exactly as with plain 'utf-8'.
# - Blank lines are skipped so the empty string never becomes a lexicon entry.
# - The lexicons are kept as sets: the only uses visible in this notebook are
#   `word in positive_words` / `word in negative_words` checks performed for
#   every token, and a set answers those without scanning the whole lexicon.
with open('NTUSD_positive_unicode.txt', encoding='utf-8-sig', mode='r') as f:
    positive_words = {line.strip() for line in f if line.strip()}

with open('NTUSD_negative_unicode.txt', encoding='utf-8-sig', mode='r') as f:
    negative_words = {line.strip() for line in f if line.strip()}

In [7]:
# Extract the negative-sentiment words contained in a piece of text.
def process_text_negative(text):
    """Tokenize ``text`` with jieba and keep only negative-lexicon words.

    Args:
        text: The text to analyze. Values that are not text (for example
            ``None`` or ``NaN``, which is how NULL database columns show up
            in a DataFrame) and empty text are treated as having no words.

    Returns:
        list: The tokens, in order of appearance and with repeats kept,
        that are present in the module-level ``negative_words`` collection.
    """
    # Guard against missing values: they are not text, so there is nothing
    # to segment. Empty input would produce no tokens anyway.
    if not isinstance(text, (str, bytes)) or not text:
        return []

    # Segment with jieba, then keep only tokens found in the lexicon.
    return [word for word in jieba.cut(text) if word in negative_words]

In [8]:
# Extract the positive-sentiment words contained in a piece of text.
def process_text_postive(text):
    """Tokenize ``text`` with jieba and keep only positive-lexicon words.

    Args:
        text: The text to analyze. Values that are not text (for example
            ``None`` or ``NaN``, which is how NULL database columns show up
            in a DataFrame) and empty text are treated as having no words.

    Returns:
        list: The tokens, in order of appearance and with repeats kept,
        that are present in the module-level ``positive_words`` collection.
    """
    # Guard against missing values: they are not text, so there is nothing
    # to segment. Empty input would produce no tokens anyway.
    if not isinstance(text, (str, bytes)) or not text:
        return []

    # Segment with jieba, then keep only tokens found in the lexicon.
    return [word for word in jieba.cut(text) if word in positive_words]

In [9]:
# Point jieba at the main segmentation dictionary (the original note calls
# this the HowNet lexicon). Raw strings are used because the Windows paths
# contain backslashes: in a normal literal, sequences such as "\p" and "\d"
# are invalid escapes that Python warns about. The path values are unchanged.
jieba.set_dictionary(r'C:\project\python\dict.big5.txt')

# Load the user-defined word list on top of the main dictionary.
jieba.load_userdict(r'C:\project\python\main.txt')

Building prefix dict from C:\project\python\dict.big5.txt ...
Loading model from cache C:\Users\HuanChen\AppData\Local\Temp\jieba.u7bf78fb8a3e5c528afaa2a9a1de33675.cache
Loading model cost 0.696 seconds.
Prefix dict has been built successfully.


In [10]:
print('start analyze emotion word in context')
# Apply both sentiment filters to every entry of the 'context' column,
# producing one list of matched words per post.
corpus_context_negative = list(map(process_text_negative, df['context']))
corpus_context_postive = list(map(process_text_postive, df['context']))

start analyze emotion word in context


In [11]:
print('start analyze emotion word in title')
# Apply both sentiment filters to every entry of the 'title' column,
# producing one list of matched words per post.
corpus_title_negative = list(map(process_text_negative, df['title']))
corpus_title_postive = list(map(process_text_postive, df['title']))

start analyze emotion word in title


In [12]:
# Combine the body and title results: body lists first, then title lists.
corpus_postive = [*corpus_context_postive, *corpus_title_postive]
corpus_negative = [*corpus_context_negative, *corpus_title_negative]

In [13]:
# Count occurrences of every matched positive word with nltk.FreqDist,
# flattening the per-post token lists into one stream.
word_freq = nltk.FreqDist(
    token
    for tokens in corpus_postive
    for token in tokens
)
# Keep the 20 most frequent words.
top_words = word_freq.most_common(20)

# Print the heading followed by one "word: count" line per entry.
print("正面詞彙")
for word, freq in top_words:
    print(f"{word}: {freq}")

正面詞彙
完整: 17166
很多: 8403
知道: 5785
希望: 4817
進行: 4144
同意: 3979
發展: 3746
當然: 3609
成為: 3471
安全: 3122
決定: 3072
才能: 2770
增加: 2728
能力: 2705
通過: 2654
接受: 2603
說明: 2483
相信: 2437
喜歡: 2330
自由: 2307


In [14]:
# Count occurrences of every matched negative word with nltk.FreqDist,
# flattening the per-post token lists into one stream.
word_freq = nltk.FreqDist(
    token
    for tokens in corpus_negative
    for token in tokens
)
# Keep the 20 most frequent words.
top_words = word_freq.most_common(20)

# Print the heading followed by one "word: count" line per entry.
print("負面詞彙")
for word, freq in top_words:
    print(f"{word}: {freq}")

負面詞彙
沒有: 19347
不是: 13276
問題: 9093
不要: 7664
不會: 7236
不知道: 7080
不能: 5209
無法: 5112
東西: 4777
要求: 4274
刪除: 3970
超過: 3339
反對: 2806
垃圾: 2802
怎樣: 2740
突然: 2719
不到: 2608
不同意: 2587
攻擊: 2418
禁止: 2410
