In [1]:
import re
import numpy as np
import pandas as pd
import jieba
import openpyxl
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Pin jieba's prefix-dictionary cache to a fixed file name.
# NOTE(review): the logged output of the tokenisation cells shows this same cache
# file being dumped after building from the default dictionary and later loaded
# while the dictionary is beefnpork.txt, so the cache looks shared across
# dictionaries -- confirm a stale cache cannot mask a dictionary change.
jieba.dt.cache_file = 'jieba.cache.lmspeech'

In [2]:
# Load the speech records; missing cells become empty strings so that every
# entry of the "Text" column can be treated as a string downstream.
SPEECH_XLSX = "C:/Users/User/Documents/Operation Holy Grail/Phase 4/Data/Pork n beef import/Ma/readfile_py/7th_sweh01.xlsx"
data = pd.read_excel(SPEECH_XLSX).fillna("")
# One raw speech text per row, in file order.
lmspeech = data["Text"].tolist()
data.head()

Unnamed: 0,Name,Sitting,Date,Type,Text,KMTchair
0,孔文吉,立法院第7屆第4會期社會福利及衛生環境委員會第11次全體委員會議,981026,1質詢,\n主席、各位列席官員、各位同仁。我要先跟各位委員說明一下，談到開放美國牛肉進口，行政院從來...,1
1,孔文吉,立法院第7屆第4會期社會福利及衛生環境委員會第11次全體委員會議,981029,1質詢,\n當然反對。\n\n這個案子與第五案內容相同，請主席直接處理。\n\n有關「衛生署所屬業務...,1
2,王幸男,立法院第7屆第4會期社會福利及衛生環境委員會第11次全體委員會議,981026,1質詢,\n他們已經宣布了，連協商現在都要……\n\n主席，本席是很尊重妳的，今天……\n\n不是一...,1
3,田秋堇,立法院第7屆第4會期社會福利及衛生環境委員會第11次全體委員會議,981026,1質詢,\n主席、各位列席官員、各位同仁。老實說，本席實在不太想質詢署長，因為我不相信署長會做出這麼...,1
4,田秋堇,立法院第7屆第4會期社會福利及衛生環境委員會第11次全體委員會議,981029,1質詢,\n主席、各位列席官員、各位同仁。我針對食品附履歷提供一點意見請大家參考，因為本案內文是要求...,1


In [3]:
# Load the custom dictionary and stop words, then segment every speech.
jieba.load_userdict('beefnpork.txt')
with open('stopwords.txt', encoding='UTF-8') as f:
    # One stop word per line; drop the newline and any embedded spaces.
    stop_words = [line.replace('\n', '').replace(' ', '') for line in f]
# Set copy for O(1) membership tests; the list above is kept under its original name.
stop_word_set = set(stop_words)
# Matches every character outside the CJK range U+4E00..U+9FA5.
rule = re.compile(r"[^\u4e00-\u9fa5]")
# For each speech: strip non-Chinese characters, segment with jieba, drop stop
# words, and re-join the remaining tokens with single spaces.
lmspeech = [
    ' '.join(
        word
        for word in jieba.cut(rule.sub('', speech))
        if word.strip() not in stop_word_set
    )
    for speech in lmspeech
]

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\User\AppData\Local\Temp\jieba.cache.lmspeech
Loading model cost 0.695 seconds.
Prefix dict has been built successfully.


In [4]:
# Bag of words over the whole corpus.
# min_df / max_df are passed as floats (0.0 and 1.0).
cv = CountVectorizer(min_df=0.0, max_df=1.0)

# Fit the vocabulary and keep the document-term counts as a dense array.
cv_matrix = cv.fit_transform(lmspeech).toarray()
cv_matrix

array([[0, 1, 0, ..., 0, 2, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [5]:
# Build an LDA topic model on term counts.
n_topics = 3
n_top_words = 20
# Tokens are runs of 2-6 CJK characters; only the 500 most frequent are kept.
tf_vectorizer = CountVectorizer(token_pattern='[\u4e00-\u9fff]{2,6}',max_features=500)
tf = tf_vectorizer.fit_transform(lmspeech)
lda = LDA(n_components=n_topics, max_iter=100,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(tf)
# get_feature_names() was removed from recent scikit-learn releases; prefer the
# replacement when it exists and fall back to the old call otherwise.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    feature_names = tf_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = tf_vectorizer.get_feature_names()
# Topic-by-term weights of the fitted model (variable name kept as in the original).
compoments = lda.components_

In [6]:
# Model performance on the bag-of-words matrix.
# NOTE(review): fit_transform refits `lda` on cv_matrix, replacing the fit that
# was done on `tf` in the model-building cell -- confirm this is intended.
doc_topic_matrix = lda.fit_transform(cv_matrix)
model_score = lda.score(cv_matrix)
model_perplexity = lda.perplexity(cv_matrix)
print(model_score)
print(model_perplexity)

-87060.8767454497
2076.126253540552


In [9]:
# Per-document topic weights as a table. Column labels are '1'..'K', derived
# from the matrix width so the cell keeps working if the number of topics changes.
topic_labels = [str(k) for k in range(1, doc_topic_matrix.shape[1] + 1)]
doc_topic_df = pd.DataFrame(doc_topic_matrix, columns=topic_labels)
doc_topic_df

Unnamed: 0,1,2,3
0,0.001075,0.001075,0.997851
1,0.022854,0.02283,0.954315
2,0.0016,0.0016,0.9968
3,0.000649,0.000649,0.998702
4,0.000373,0.000373,0.999253
5,0.001291,0.001291,0.997418
6,0.000638,0.000638,0.998724
7,0.001694,0.001694,0.996612
8,0.021108,0.021106,0.957786
9,0.00261,0.00261,0.99478


In [10]:
# Topic-by-term weight matrix of the currently fitted `lda` model.
topic_word_matrix = lda.components_

In [11]:
# Get all unique words in the corpus (the vocabulary learned by `cv`).
# get_feature_names() was removed from recent scikit-learn releases; prefer the
# replacement when it exists and fall back to the old call otherwise.
if hasattr(cv, 'get_feature_names_out'):
    vocab = cv.get_feature_names_out().tolist()
else:
    vocab = cv.get_feature_names()
# One row per topic, one column per vocabulary term.
pd.DataFrame(topic_word_matrix, columns=vocab)

Unnamed: 0,一下子,一事,一人,一件,一份,一位,一來,一例,一共,一再,...,黑手,黑箱,黑臉,黑鍋,黑龍,默契,點半還,點名,黨團,鼓勵
0,0.346146,0.34315,0.345312,0.341469,0.342716,0.343304,0.344599,0.342832,0.343279,0.344365,...,0.343151,0.342705,0.343742,0.343673,0.342838,0.342622,0.343183,0.344148,0.3419,0.34229
1,0.342905,0.34208,0.340994,0.342728,0.3438,0.349472,0.341547,0.343122,0.34422,0.345554,...,0.342624,0.344744,0.343053,0.343552,0.342195,0.342102,0.341416,0.344569,0.343736,0.346662
2,3.297132,1.32769,1.327611,2.313467,4.286742,4.283694,3.298435,2.314393,2.314864,6.253895,...,1.326123,1.329553,1.328293,1.32971,1.326431,1.330097,1.33047,2.310439,2.315343,5.267552


In [12]:
# Export the topic-by-term weights to Excel (row index omitted).
# NOTE(review): the same file name is written again later with the TF-IDF
# scores, so this export would be overwritten -- confirm the intended file name.
df = pd.DataFrame(topic_word_matrix, columns=vocab)
df.to_excel("tfidftestfile.xlsx",index=False)

In [2]:
# Legislative Yuan, 7th term, 4th session: Social Welfare and Environmental
# Hygiene Committee, 11th plenary committee meeting.
SPEECH_XLSX_01 = "C:/Users/User/Documents/Operation Holy Grail/Phase 4/Data/Pork n beef import/Ma/readfile_py/7th_sweh01.xlsx"
# Missing cells become empty strings so the text column is uniformly string-like.
data01 = pd.read_excel(SPEECH_XLSX_01).fillna("")
# One raw speech text per row, in file order.
lmspeech = data01['Text'].tolist()
data01.head()

Unnamed: 0,Name,Sitting,Date,Type,Text,KMTchair
0,孔文吉,立法院第7屆第4會期社會福利及衛生環境委員會第11次全體委員會議,981026,1質詢,\n主席、各位列席官員、各位同仁。我要先跟各位委員說明一下，談到開放美國牛肉進口，行政院從來...,1
1,王幸男,立法院第7屆第4會期社會福利及衛生環境委員會第11次全體委員會議,981026,1質詢,\n他們已經宣布了，連協商現在都要……\n\n主席，本席是很尊重妳的，今天……\n\n不是一...,1
2,田秋堇,立法院第7屆第4會期社會福利及衛生環境委員會第11次全體委員會議,981026,1質詢,\n主席、各位列席官員、各位同仁。老實說，本席實在不太想質詢署長，因為我不相信署長會做出這麼...,1
3,江義雄,立法院第7屆第4會期社會福利及衛生環境委員會第11次全體委員會議,981029,1質詢,\n主席、各位列席官員、各位同仁。國安會蘇起秘書長認為我們與美國簽訂的議定書效力比國內法律還...,1
4,吳育昇,立法院第7屆第4會期社會福利及衛生環境委員會第11次全體委員會議,981029,1質詢,\n主席、各位列席官員、各位同仁。\n\n\n你有為有守，依你的個性，你可以這樣表達。以上是...,1


In [3]:
# Configure jieba, load stop words, then segment every speech.
# NOTE(review): the same file is registered both as the main dictionary and as
# a user dictionary, and the log printed by this cell shows the prefix dict
# being loaded from the shared cache file -- confirm that replacing the main
# dictionary is intended and that the cache matches it.
jieba.set_dictionary('beefnpork.txt')
jieba.load_userdict('beefnpork.txt')
with open('stopwords.txt', encoding='UTF-8') as f:
    # One stop word per line; drop the newline and any embedded spaces.
    stop_words = [line.replace('\n', '').replace(' ', '') for line in f]
# Set copy for O(1) membership tests; the list above is kept under its original name.
stop_word_set = set(stop_words)
# Matches every character outside the CJK range U+4E00..U+9FA5.
rule = re.compile(r"[^\u4e00-\u9fa5]")
# For each speech: strip non-Chinese characters, segment with jieba, drop stop
# words, and re-join the remaining tokens with single spaces.
lmspeech = [
    ' '.join(
        word
        for word in jieba.cut(rule.sub('', speech))
        if word.strip() not in stop_word_set
    )
    for speech in lmspeech
]

Building prefix dict from c:\venv\workspace\source\Text mining\beefnpork.txt ...
Loading model from cache C:\Users\User\AppData\Local\Temp\jieba.cache.lmspeech
Loading model cost 0.636 seconds.
Prefix dict has been built successfully.


In [4]:
import pyLDAvis
import pyLDAvis.sklearn

# Count-based document-term matrix: tokens are runs of 2-6 CJK characters,
# capped at 500 features.
tf_vectorizer = CountVectorizer(token_pattern='[\u4e00-\u9fff]{2,6}', max_features=500)
dtm_tf = tf_vectorizer.fit_transform(lmspeech)

# TF-IDF document-term matrix built with the same vectoriser settings.
shared_params = tf_vectorizer.get_params()
tfidf_vectorizer = TfidfVectorizer(**shared_params)
dtm_tfidf = tfidf_vectorizer.fit_transform(lmspeech)

# One LDA model per representation (3 topics on counts, 20 topics on TF-IDF).
lda_tf = LDA(n_components=3, random_state=0).fit(dtm_tf)
lda_tfidf = LDA(n_components=20, random_state=0).fit(dtm_tfidf)

# Interactive visualisation of the count-based model.
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [28]:
# TF-IDF score of every vocabulary term, one row per input speech record.
# get_feature_names() was removed from recent scikit-learn releases; prefer the
# replacement when it exists and fall back to the old call otherwise.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    tfidf_terms = tfidf_vectorizer.get_feature_names()
tfidfscore = pd.DataFrame(dtm_tfidf.toarray(), columns=tfidf_terms)
# Export without the row index.
tfidfscore.to_excel("tfidftestfile.xlsx",index=False)

In [8]:
# Bag-of-words counts over the tokenised speeches; the matrix is left in the
# sparse form returned by fit_transform and displayed as the cell output.
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(lmspeech)
cv_matrix

<33x3659 sparse matrix of type '<class 'numpy.int64'>'
	with 7003 stored elements in Compressed Sparse Row format>

In [11]:
# Vocabulary kept by the count vectoriser. get_feature_names() was removed from
# recent scikit-learn releases; prefer the replacement when it exists and fall
# back to the old call otherwise.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    feature_names = tf_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = tf_vectorizer.get_feature_names()

['一再',
 '一句',
 '一天',
 '一直',
 '一致',
 '一般',
 '一項',
 '上任',
 '上面',
 '下台',
 '不一樣',
 '不住',
 '不准',
 '不吃',
 '不好意思',
 '不宜',
 '不帶',
 '不必',
 '不想',
 '不應',
 '不敢',
 '不然',
 '不用',
 '不管',
 '不行',
 '不足',
 '不過',
 '世界',
 '並沒有',
 '中國',
 '主導',
 '主決議',
 '主管',
 '主要',
 '之一',
 '之下',
 '之前',
 '事件',
 '事先',
 '事實',
 '交換',
 '人家',
 '人民',
 '什麼樣',
 '代罪羔羊',
 '代表',
 '以上',
 '以下',
 '以前',
 '以及',
 '作成',
 '你講',
 '使用',
 '例如',
 '依照',
 '保證',
 '保障',
 '信心',
 '修改',
 '修正',
 '修法',
 '停止',
 '健康',
 '內容',
 '內臟',
 '全世界',
 '全國',
 '全牛',
 '全部',
 '八道',
 '公告',
 '公布',
 '公聽會',
 '其中',
 '具有',
 '准許',
 '出口',
 '分級',
 '刊登',
 '利益',
 '制定',
 '制度',
 '前線',
 '加強',
 '動物',
 '半天',
 '協商',
 '協定',
 '協議',
 '危險',
 '即使',
 '原則',
 '原因',
 '反對',
 '受到',
 '只是',
 '召開',
 '可不可以',
 '台灣',
 '台灣人',
 '台美',
 '司長',
 '各部',
 '合約',
 '同一',
 '名字',
 '含有',
 '吳敦義院',
 '吳院',
 '品質',
 '哪些',
 '哪裡',
 '問到',
 '問是',
 '問衛',
 '喪權辱國',
 '單位',
 '嚴格',
 '嚴重',
 '回事',
 '回來',
 '回答',
 '困難',
 '國人',
 '國內',
 '國安',
 '國家',
 '國會',
 '國民',
 '國民黨',
 '國賠',
 '國際',
 '國際法',
 '地步',
 '執政',
 '基因',
 '堅持',
 '報告',
 '增訂',
 '壓力',
 

In [None]:
# Computing topic probabilities for every document.
# lda_tf was fitted on the term-count matrix dtm_tf, so it must be applied to
# that same representation; feeding it the TF-IDF matrix mixes two different
# input scales and distorts the inferred topic mixtures.
topic_values = lda_tf.transform(dtm_tf)
topic_values.shape

In [None]:
# For each legislator's row, take the topic with the highest probability
# (i.e. which stance category the speech falls into).
data01['Topic'] = np.argmax(topic_values, axis=1)
data01.head()

In [None]:
# Topic probability of every topic for each legislator's row
# (one column per topic, in the order returned by the model).
df = pd.DataFrame(topic_values)
df.head()