In [6]:
# -*- coding: utf-8 -*-
# 斷詞工作
import jieba
cstr = "蘋果執行長庫克日前赴中國大陸訪問，這是庫克第8次訪問大陸"

In [7]:
lw = jieba.lcut(cstr)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.935 seconds.
Prefix dict has been built succesfully.


In [8]:
print (lw)

['蘋果', '執行長', '庫克', '日前', '赴', '中國大陸', '訪問', '，', '這是', '庫克', '第', '8', '次訪', '問大陸']


In [9]:
# 計算詞彙的頻率 ：詞頻
from collections import Counter
counter = Counter()

In [10]:
counter.update(lw)

In [11]:
counter

Counter({'蘋果': 1,
         '執行長': 1,
         '庫克': 2,
         '日前': 1,
         '赴': 1,
         '中國大陸': 1,
         '訪問': 1,
         '，': 1,
         '這是': 1,
         '第': 1,
         '8': 1,
         '次訪': 1,
         '問大陸': 1})

#### 能夠計算個別詞彙在個別檔案的頻率，我們就得到定義 term frequence (TF) 的基礎
#### document frequence (DF) 是詞彙出現在不同篇文件的次數，因此能夠計算詞彙在文件中的頻率，自然也能夠計算 DF
#### 有了 DF ，我們就可以定義 inverse document frequency (IDF)
#### AI Cup 團隊的投影片，以 TF 和 IDF 的乘積來計算個別詞彙貢獻到個別檔案的分數
#### 參考AI Cup 的技術投影片：https://ppt.cc/fOKdMx

In [12]:
# 載入和檢視 AI Cup 團隊幫大建立的 TF-IDF 大表格
# load inverted file
# 注意：你必須把 AI Cup 團隊所提供的 inverted_file.json 放在與這一個 ipynb 檔案的同一個資料夾
#       否則你就必須修改檔案的路徑
import json
with open('./inverted_file.json',"r", encoding='utf-8') as f:
	invert_file = json.load(f)


In [13]:
# 查閱 "反對" 的資料
print(invert_file["反對"])

{'idf': 18.570102135561747, 'docs': [{'news_000007': 2}, {'news_000016': 1}, {'news_000022': 1}, {'news_000036': 1}, {'news_000071': 1}, {'news_000091': 1}, {'news_000094': 1}, {'news_000099': 2}, {'news_000110': 1}, {'news_000111': 1}, {'news_000115': 1}, {'news_000121': 1}, {'news_000135': 1}, {'news_000148': 1}, {'news_000230': 1}, {'news_000240': 1}, {'news_000279': 2}, {'news_000306': 1}, {'news_000314': 1}, {'news_000329': 1}, {'news_000339': 1}, {'news_000378': 1}, {'news_000388': 2}, {'news_000389': 1}, {'news_000401': 1}, {'news_000409': 1}, {'news_000411': 1}, {'news_000445': 1}, {'news_000447': 1}, {'news_000458': 1}, {'news_000469': 2}, {'news_000475': 2}, {'news_000505': 1}, {'news_000516': 1}, {'news_000542': 1}, {'news_000549': 1}, {'news_000573': 2}, {'news_000576': 2}, {'news_000594': 2}, {'news_000595': 1}, {'news_000602': 1}, {'news_000619': 1}, {'news_000646': 1}, {'news_000649': 1}, {'news_000652': 1}, {'news_000686': 1}, {'news_000697': 1}, {'news_000712': 1}, {'n

In [14]:
# "反對" 的 IDF
print(invert_file["反對"]["idf"])

18.570102135561747


In [15]:
# 這一個指令列印所有用到 "反對" 的報導的文件編號和詞頻；
# 因為有1000000篇文件，所以你會看到很大量的資料
print(invert_file["反對"]["docs"])

[{'news_000007': 2}, {'news_000016': 1}, {'news_000022': 1}, {'news_000036': 1}, {'news_000071': 1}, {'news_000091': 1}, {'news_000094': 1}, {'news_000099': 2}, {'news_000110': 1}, {'news_000111': 1}, {'news_000115': 1}, {'news_000121': 1}, {'news_000135': 1}, {'news_000148': 1}, {'news_000230': 1}, {'news_000240': 1}, {'news_000279': 2}, {'news_000306': 1}, {'news_000314': 1}, {'news_000329': 1}, {'news_000339': 1}, {'news_000378': 1}, {'news_000388': 2}, {'news_000389': 1}, {'news_000401': 1}, {'news_000409': 1}, {'news_000411': 1}, {'news_000445': 1}, {'news_000447': 1}, {'news_000458': 1}, {'news_000469': 2}, {'news_000475': 2}, {'news_000505': 1}, {'news_000516': 1}, {'news_000542': 1}, {'news_000549': 1}, {'news_000573': 2}, {'news_000576': 2}, {'news_000594': 2}, {'news_000595': 1}, {'news_000602': 1}, {'news_000619': 1}, {'news_000646': 1}, {'news_000649': 1}, {'news_000652': 1}, {'news_000686': 1}, {'news_000697': 1}, {'news_000712': 1}, {'news_000727': 3}, {'news_000731': 1},

In [16]:
# 如果只看前面五筆資料的話，會比較務實一些
print(invert_file["反對"]["docs"][0:5])

[{'news_000007': 2}, {'news_000016': 1}, {'news_000022': 1}, {'news_000036': 1}, {'news_000071': 1}]


In [17]:
print(invert_file["大埔"]["docs"][0:5])

[{'news_000329': 1}, {'news_000531': 2}, {'news_000623': 1}, {'news_001170': 1}, {'news_001282': 2}]


In [18]:
print(invert_file["水庫"]["docs"][0:5])

[{'news_000019': 4}, {'news_000056': 1}, {'news_000264': 5}, {'news_000889': 1}, {'news_001104': 1}]


#### 以下是一個小習題
#### 找到 "同意動物實驗"的文章，然後計算它們的 TFIDF
#### 以下的指令，只是完成以上這一個工作的局部指令，剩下的可以讓大家當作練習

In [19]:
# 參考 QS_1.csv 的 q_18
query = "同意動物實驗"
query_words = jieba.lcut(query)
query_words

['同意', '動物', '實驗']

In [20]:
#這一些文件有 "同意"
agree = [key  for item in invert_file["同意"]["docs"] for key in item]

In [21]:
animal = [key for item in invert_file["動物"]["docs"] for key in item]

In [22]:
experiment = [key for item in invert_file["實驗"]["docs"] for key in item]

In [42]:
#這一些文件有 "同意"、"動物"、"實驗"
dictionary = {}
agree_animal_experiment = list(set(agree) & set(animal) & set(experiment))
agree_animal_experiment

['news_079704',
 'news_073299',
 'news_030419',
 'news_089851',
 'news_014723',
 'news_070108',
 'news_083881',
 'news_039141',
 'news_002403',
 'news_041429',
 'news_020350',
 'news_061334',
 'news_042031',
 'news_066862',
 'news_047161',
 'news_034307',
 'news_052038',
 'news_036196',
 'news_093241',
 'news_072195',
 'news_082397',
 'news_009667',
 'news_010409',
 'news_080912',
 'news_077583',
 'news_079400',
 'news_098664',
 'news_020924',
 'news_081384',
 'news_081850',
 'news_012178']

In [27]:
for document_count_dict in invert_file["同意"]['docs']:
    for doc, doc_tf in document_count_dict.items():
        if doc in agree_animal_experiment:
            if doc in dictionary:
                dictionary[doc] += invert_file["同意"]["idf"] * doc_tf;
            else:
                dictionary[doc] = invert_file["同意"]["idf"] * doc_tf;
            print(doc,": IDF",invert_file["同意"]["idf"], ", TF: ", doc_tf)

news_002403 : IDF 12.825445684237527 , TF:  1
news_009667 : IDF 12.825445684237527 , TF:  1
news_010409 : IDF 12.825445684237527 , TF:  1
news_012178 : IDF 12.825445684237527 , TF:  1
news_014723 : IDF 12.825445684237527 , TF:  1
news_020350 : IDF 12.825445684237527 , TF:  1
news_020924 : IDF 12.825445684237527 , TF:  2
news_030419 : IDF 12.825445684237527 , TF:  1
news_034307 : IDF 12.825445684237527 , TF:  3
news_036196 : IDF 12.825445684237527 , TF:  1
news_039141 : IDF 12.825445684237527 , TF:  1
news_041429 : IDF 12.825445684237527 , TF:  1
news_042031 : IDF 12.825445684237527 , TF:  2
news_047161 : IDF 12.825445684237527 , TF:  1
news_052038 : IDF 12.825445684237527 , TF:  1
news_061334 : IDF 12.825445684237527 , TF:  1
news_066862 : IDF 12.825445684237527 , TF:  1
news_070108 : IDF 12.825445684237527 , TF:  1
news_072195 : IDF 12.825445684237527 , TF:  1
news_073299 : IDF 12.825445684237527 , TF:  1
news_077583 : IDF 12.825445684237527 , TF:  1
news_079400 : IDF 12.8254456842375

In [28]:
for document_count_dict in invert_file["動物"]['docs']:
    for doc, doc_tf in document_count_dict.items():
        if doc in agree_animal_experiment:
            if doc in dictionary:
                dictionary[doc] += invert_file["動物"]["idf"] * doc_tf;
            else:
                dictionary[doc] = invert_file["動物"]["idf"] * doc_tf;
            print(doc,": IDF",invert_file["動物"]["idf"], ", TF: ", doc_tf)

news_002403 : IDF 33.44481605351171 , TF:  4
news_009667 : IDF 33.44481605351171 , TF:  1
news_010409 : IDF 33.44481605351171 , TF:  1
news_012178 : IDF 33.44481605351171 , TF:  1
news_014723 : IDF 33.44481605351171 , TF:  1
news_020350 : IDF 33.44481605351171 , TF:  1
news_020924 : IDF 33.44481605351171 , TF:  1
news_030419 : IDF 33.44481605351171 , TF:  1
news_034307 : IDF 33.44481605351171 , TF:  1
news_036196 : IDF 33.44481605351171 , TF:  8
news_039141 : IDF 33.44481605351171 , TF:  1
news_041429 : IDF 33.44481605351171 , TF:  11
news_042031 : IDF 33.44481605351171 , TF:  1
news_047161 : IDF 33.44481605351171 , TF:  2
news_052038 : IDF 33.44481605351171 , TF:  2
news_061334 : IDF 33.44481605351171 , TF:  1
news_066862 : IDF 33.44481605351171 , TF:  3
news_070108 : IDF 33.44481605351171 , TF:  1
news_072195 : IDF 33.44481605351171 , TF:  3
news_073299 : IDF 33.44481605351171 , TF:  6
news_077583 : IDF 33.44481605351171 , TF:  3
news_079400 : IDF 33.44481605351171 , TF:  1
news_0797

In [29]:
for document_count_dict in invert_file["實驗"]['docs']:
    for doc, doc_tf in document_count_dict.items():
        if doc in agree_animal_experiment:
            if doc in dictionary:
                dictionary[doc] += invert_file["實驗"]["idf"] * doc_tf;
            else:
                dictionary[doc] = invert_file["實驗"]["idf"] * doc_tf;
            print(doc,": IDF",invert_file["實驗"]["idf"], ", TF: ", doc_tf)

news_002403 : IDF 54.67468562055768 , TF:  3
news_009667 : IDF 54.67468562055768 , TF:  3
news_010409 : IDF 54.67468562055768 , TF:  1
news_012178 : IDF 54.67468562055768 , TF:  2
news_014723 : IDF 54.67468562055768 , TF:  1
news_020350 : IDF 54.67468562055768 , TF:  4
news_020924 : IDF 54.67468562055768 , TF:  2
news_030419 : IDF 54.67468562055768 , TF:  5
news_034307 : IDF 54.67468562055768 , TF:  1
news_036196 : IDF 54.67468562055768 , TF:  13
news_039141 : IDF 54.67468562055768 , TF:  2
news_041429 : IDF 54.67468562055768 , TF:  10
news_042031 : IDF 54.67468562055768 , TF:  5
news_047161 : IDF 54.67468562055768 , TF:  5
news_052038 : IDF 54.67468562055768 , TF:  1
news_061334 : IDF 54.67468562055768 , TF:  3
news_066862 : IDF 54.67468562055768 , TF:  5
news_070108 : IDF 54.67468562055768 , TF:  4
news_072195 : IDF 54.67468562055768 , TF:  1
news_073299 : IDF 54.67468562055768 , TF:  12
news_077583 : IDF 54.67468562055768 , TF:  2
news_079400 : IDF 54.67468562055768 , TF:  1
news_07

In [41]:
# 參考 AI Cup 的技術投影片的說明，計算各篇有 "同意"、"動物"、"實驗" 的分數，並且將這一些文件排序
result = sorted(dictionary, key=lambda x: dictionary[x], reverse= True)
#print(result)
for item in result:
    print(item, dictionary[item])
    

news_036196 991.1548871795811
news_041429 927.4652784784432
news_073299 869.590569452
news_079704 624.7476492288235
news_066862 386.53332194756103
news_047161 353.0885058940494
news_089851 353.0885058940494
news_042031 332.4691355247752
news_030419 319.64368984053766
news_002403 310.62876675995744
news_020350 264.96900421998
news_070108 264.96900421998
news_082397 260.3751029408288
news_077583 222.509265085888
news_009667 210.2943185994223
news_061334 210.2943185994223
news_093241 210.2943185994223
news_081850 189.0644490323763
news_020924 168.44507866310212
news_072195 167.8345794653303
news_012178 155.61963297886462
news_039141 155.61963297886462
news_052038 134.38976341181862
news_034307 126.59583872678198
news_010409 100.94494735830692
news_014723 100.94494735830692
news_079400 100.94494735830692
news_080912 100.94494735830692
news_081384 100.94494735830692
news_083881 100.94494735830692
news_098664 100.94494735830692
