In [1]:
from elasticsearch import Elasticsearch
import functools as f
from janome.tokenizer import Tokenizer
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import *
from janome.tokenfilter import *
import math

In [2]:
# Shared Elasticsearch client; scroll() below sends every request through it.
# NOTE(review): the endpoint is hard-coded to a local node on port 9201 —
# confirm it before running this notebook in another environment.
es = Elasticsearch("http://127.0.0.1:9201")
def scroll(index, doc_type, query_body, page_size=100, scroll='2m'):
    """Iterate over every page of a search using the Elasticsearch scroll API.

    All requests go through the module-level ``es`` client.

    Args:
        index: Name of the index to search.
        doc_type: Document type passed through to ``es.search``.
        query_body: Query DSL dictionary sent as the request body.
        page_size: Maximum number of hits requested per page.
        scroll: Keep-alive for the server-side scroll context (for example
            ``'2m'``). It is applied to the initial search and to every
            follow-up scroll request.

    Yields:
        Tuples ``(total_pages, page_counter, scroll_size, page)`` where
        ``total_pages`` is ``ceil(total hits / page_size)``, ``page_counter``
        is the zero-based page number, ``scroll_size`` is the number of hits
        on this page and ``page`` is the raw response dictionary.
    """
    page = es.search(index=index, doc_type=doc_type, scroll=scroll, size=page_size, body=query_body)
    sid = page['_scroll_id']

    total_hits = page['hits']['total']
    # Elasticsearch 7+ reports the total as {'value': N, 'relation': ...}
    # instead of a bare integer.
    if isinstance(total_hits, dict):
        total_hits = total_hits['value']
    total_pages = math.ceil(total_hits / page_size)

    page_counter = 0
    try:
        while True:
            hits = page['hits']['hits']
            # An empty page marks the end; stop before asking for another one.
            if not hits:
                break
            yield total_pages, page_counter, len(hits), page
            # Fetch the next page with the caller's keep-alive, not a fixed one.
            page = es.scroll(scroll_id=sid, scroll=scroll)
            page_counter += 1
            # The scroll id may change between requests; always use the latest.
            sid = page['_scroll_id']
    finally:
        # Release the server-side scroll context, including when the consumer
        # stops early and the generator is closed.
        es.clear_scroll(scroll_id=sid)

In [3]:
def wordToDicGen(tokenizer, char_filters, token_filters, stop_words):
    """Build a function that counts the base forms of the tokens in a text.

    Args:
        tokenizer: Tokenizer handed to the ``Analyzer``.
        char_filters: Character filters handed to the ``Analyzer``.
        token_filters: Token filters handed to the ``Analyzer``.
        stop_words: Container of base forms that must not be counted.

    Returns:
        A function ``wordToDic(text)`` that returns a dict mapping each
        token's ``base_form`` to the number of times it occurs in ``text``,
        skipping every base form found in ``stop_words``.
    """
    # Built once and reused for every text instead of once per call. Keyword
    # arguments are used because newer janome releases declare these
    # parameters keyword-only; older releases accept the same names.
    analyzer = Analyzer(char_filters=char_filters, tokenizer=tokenizer, token_filters=token_filters)

    def wordToDic(text):
        counts = dict()
        for token in analyzer.analyze(text):
            base_form = token.base_form
            if base_form in stop_words:
                continue
            counts[base_form] = counts.get(base_form, 0) + 1
        return counts

    return wordToDic

In [4]:
# Text-analysis pipeline used by the cells below to turn a document into
# word counts.
char_filters = [UnicodeNormalizeCharFilter()]
tokenizer = Tokenizer()
# Token filters run in list order. The part-of-speech stop list drops symbols
# (記号), particles (助詞) and auxiliary verbs (助動詞); each tag is listed once.
token_filters = [CompoundNounFilter(), POSStopFilter(['記号', '助詞', '助動詞']), LowerCaseFilter()]
# Base forms that are never counted.
stop_words = {"する", "れる", "こと", "いる", "行う", "できる", "場合", "の", "ない", "みる", "使う", "より", "なる"}
wordToDic = wordToDicGen(tokenizer, char_filters, token_filters, stop_words)

In [5]:
class TfIdf():
    """Accumulate document frequencies and score documents with TF-IDF.

    ``doc_num`` is the number of documents added so far and ``word_counter``
    maps each word to the number of added documents that contain it.
    """

    def __init__(self):
        # State is per instance. A dict defined on the class would be shared
        # by every TfIdf object, so a freshly created instance would inherit
        # the document frequencies of earlier ones while doc_num restarts at 0.
        self.doc_num = 0
        self.word_counter = dict()

    def addDocument(self, wordCountDic):
        """Register one document.

        Args:
            wordCountDic: Mapping of word -> occurrences in the document. Only
                the keys are used: each distinct word raises its document
                frequency by one.
        """
        self.doc_num += 1
        for key in wordCountDic:
            self.word_counter[key] = self.word_counter.get(key, 0) + 1

    def answer(self, wordCountDic):
        """Score every word of a document.

        Args:
            wordCountDic: Mapping of word -> occurrences in the document.

        Returns:
            Dict mapping each word to ``tf * idf``; empty for an empty input.

        Raises:
            KeyError: If a word was never seen by ``addDocument``.
        """
        # sum() returns 0 for an empty document, where reduce() without an
        # initial value would raise.
        document_words_sum = sum(wordCountDic.values())
        return {
            word: self.tf(document_words_sum, word_num) * self.idf(word)
            for word, word_num in wordCountDic.items()
        }

    def tf(self, document_words_sum, word_num):
        """Return the term frequency: occurrences divided by the document total."""
        return word_num / document_words_sum

    def idf(self, word):
        """Return ``ln(doc_num / document frequency of word) + 1``.

        Raises:
            KeyError: If ``word`` was never seen by ``addDocument``.
        """
        return math.log(self.doc_num / self.word_counter[word]) + 1

In [6]:
# First pass over the index: feed the word counts of every document into a
# fresh TfIdf so that document frequencies are known before scoring.
tfidf = TfIdf()
index, doc_type = 'fess.20180701', 'doc'
query = {"query": {"match_all": {}}}
page_size = 30

# Running total of the hits processed so far.
currrent_volume = 0
for total_pages, page_counter, page_items, page_data in scroll(index, doc_type, query, page_size=page_size):
    contents = (hit['_source']['content'] for hit in page_data['hits']['hits'])
    for word_counts in map(wordToDic, contents):
        tfidf.addDocument(word_counts)
    currrent_volume = currrent_volume + page_items

In [7]:
# Second pass: print the URL and the ten highest-scoring words of each
# document, stopping once ten hits have been processed.
page_size = 1

currrent_volume = 0
for total_pages, page_counter, page_items, page_data in scroll(index, doc_type, query, page_size=page_size):
    for hit in page_data['hits']['hits']:
        source = hit['_source']
        print(source['url'])
        scores = tfidf.answer(wordToDic(source['content']))
        # Highest score first; ties keep their original order.
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        for k, v in ranked[:10]:
            print(k, v)
    currrent_volume = currrent_volume + page_items
    if not currrent_volume < 10:
        break

https://gigazine.net/news/20180706-china-ap1000-epr/
原子力産業 0.023317488109853554
遅れる 0.01886515867249546
ap1000 0.018473965793423773
新型原子炉 0.017488116082390166
原子力発電所 0.017488116082390166
wh 0.017488116082390166
東芝 0.017488116082390166
アメリカ 0.015243532167490574
中国 0.014023731901107834
最 0.013274455410293236
https://gigazine.net/news/20180706-virus-desides-mining-or-ransomware/
ファイル 0.03633063623601737
カスペルスキー 0.03265374682807244
マイニングマルウェア 0.026122997462457952
コンピューター 0.021134973450343948
侵入 0.019592248096843467
身代金 0.019592248096843467
暗号化 0.017287991110333946
ランサムウェア 0.016922932055221898
マルウェア 0.015427253535004832
攻撃 0.015087760100809063
https://gigazine.net/news/20180706-london-police-facial-recognition/
afs 0.03595993869441478
運用 0.024244676575199242
犯罪容疑者 0.023973292462943184
イギリス 0.019090283152333857
98% 0.01797996934720739
テスト 0.01797996934720739
ロンドン警視庁 0.01797996934720739
警察 0.016303568082312883
顔認識機能 0.01581388440795756
監視カメラ 0.014546805945119546
https://gigazine.net/news/2018

In [16]:
# Scratch cell: small sample dict used by the experiment in the next cell.
a = dict(b=1, c=2)

In [20]:
# Scratch cell: show the values of the sample dict as a list.
[*a.values()]

[1, 2]

3