In [2]:
import math
import numpy as np
import tensorflow as tf
from janome.tokenizer import Tokenizer
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import *
from janome.tokenfilter import *

In [3]:
def wordToDicGen(tokenizer, char_filters, token_filters):
    """Build a function that maps raw text to the base forms it contains.

    Args:
        tokenizer: janome ``Tokenizer`` used to split the text.
        char_filters: list of janome char filters applied before tokenizing.
        token_filters: list of janome token filters applied after tokenizing.

    Returns:
        A function ``wordToDic(text)`` returning a dict whose keys are the
        ``base_form`` of every token the analyzer yields for ``text``.
        Every value is 1, i.e. the dict only records presence.
    """
    # Build the analyzer once and reuse it for every text instead of
    # re-creating it on each call.  The arguments are passed by keyword
    # because newer janome releases declare them keyword-only; keyword
    # passing also works with the older positional signature.
    analyzer = Analyzer(char_filters=char_filters,
                        tokenizer=tokenizer,
                        token_filters=token_filters)

    def wordToDic(text):
        """Return ``{base_form: 1}`` for every token found in ``text``."""
        return {token.base_form: 1 for token in analyzer.analyze(text)}

    return wordToDic

In [4]:
def keyWithIndex(dic: dict):
    """Number the keys of ``dic`` by their position in sorted order.

    Args:
        dic: mapping whose keys are mutually orderable; values are ignored.

    Returns:
        A new dict mapping each key to its 0-based rank among the sorted keys.
    """
    return {key: rank for rank, key in enumerate(sorted(dic.keys()))}

In [5]:
def dicToVec(index_dic, target_dic):
    """Encode the keys of ``target_dic`` as a 0/1 presence vector.

    Args:
        index_dic: mapping from key to its position in the output vector.
        target_dic: mapping (or any iterable of keys) to encode; keys that
            are missing from ``index_dic`` are ignored.

    Returns:
        A 1-D float ``numpy`` array of length ``len(index_dic)`` holding 1.0
        at the position of every key of ``target_dic`` found in
        ``index_dic`` and 0.0 everywhere else.
    """
    resultVec = np.zeros(len(index_dic))
    # The order in which positions are set does not affect the result, so the
    # keys are visited directly: no sort cost, and no TypeError when the keys
    # cannot be ordered against each other.
    for key in target_dic:
        if key in index_dic:
            resultVec[index_dic[key]] = 1.0
    return resultVec

In [6]:
# Text-analysis pipeline shared by every cell below.
char_filters = [UnicodeNormalizeCharFilter()]
tokenizer = Tokenizer()
# Stop list of part-of-speech tags: 記号 (symbols), 助詞 (particles) and
# 助動詞 (auxiliary verbs).  The original list named 助動詞 twice; the
# duplicate entry is dropped.
# NOTE(review): if the second entry was meant to be a different tag, add it here.
token_filters = [
    CompoundNounFilter(),
    POSStopFilter(['記号', '助詞', '助動詞']),
    LowerCaseFilter(),
]
wordToDic = wordToDicGen(tokenizer, char_filters, token_filters)

In [7]:
def mergeCountDic(someDic, newDic):
    """Add one to the counter of every key that appears in ``newDic``.

    Args:
        someDic: running counter dict; it is updated in place.
        newDic: mapping whose keys are counted once each (values are ignored).

    Returns:
        ``someDic`` itself, after the update.
    """
    for word in newDic:
        someDic[word] = someDic.get(word, 0) + 1
    return someDic

In [8]:
def dicIdfFilter(dic, doc_num, min_count=2, min_idf=1.5):
    """Keep the keys whose document count and IDF are both high enough.

    The IDF of a key is ``ln(doc_num / count)``.

    Args:
        dic: mapping from key to the number of documents containing it.
        doc_num: total number of documents.
        min_count: smallest document count a key needs to be kept
            (default 2, the previously hard-coded value).  Should be >= 1.
        min_idf: smallest IDF a key needs to be kept (default 1.5, the
            previously hard-coded value).

    Returns:
        A new dict mapping every kept key to its IDF.
    """
    resultDic = dict()
    for key, count in dic.items():
        # Check the count first: keys below the threshold are dropped anyway,
        # and skipping them avoids dividing by a zero count.
        if count < min_count:
            continue
        idf = math.log(doc_num / count)
        if idf >= min_idf:
            resultDic[key] = idf
    return resultDic

In [9]:
def textToVec(textList: list, wordToDic):
    """Turn a list of texts into a vocabulary index and a presence matrix.

    Args:
        textList: list of raw texts, one per document.
        wordToDic: function mapping a text to a dict keyed by its words.

    Returns:
        A tuple ``(indexDic, matrix)``.  ``indexDic`` maps each word kept by
        ``dicIdfFilter`` to its column index; ``matrix`` stacks one
        ``dicToVec`` row per text, in the order of ``textList``.
    """
    # One word dict per document, in input order.
    per_text_words = [wordToDic(text) for text in textList]

    # Count in how many documents each word occurs.
    doc_counts = dict()
    for words in per_text_words:
        doc_counts = mergeCountDic(doc_counts, words)

    # Keep only the words passing the IDF filter and give each a column.
    indexDic = keyWithIndex(dicIdfFilter(doc_counts, len(textList)))

    rows = [dicToVec(indexDic, words) for words in per_text_words]
    return indexDic, np.vstack(rows)

In [10]:
# Read news/keizai_01.txt ... news/keizai_10.txt into a list of strings.
keizaiNews = list()
for i in range(10):
    # The context manager closes each file as soon as it has been read.
    with open("news/keizai_" + '{:02}'.format(i + 1) + ".txt", "r") as news_file:
        keizaiNews.append(news_file.read())

In [11]:
# Read news/spo_01.txt ... news/spo_10.txt into a list of strings.
spoNews = list()
for i in range(10):
    # The context manager closes each file as soon as it has been read.
    with open("news/spo_" + '{:02}'.format(i + 1) + ".txt", "r") as news_file:
        spoNews.append(news_file.read())

In [12]:
# Combined corpus: all keizai articles first, then all spo articles.
someList = [*keizaiNews, *spoNews]

In [13]:
# Build the vocabulary index and the per-document presence matrix for the corpus.
indexDic, textVec = textToVec(someList, wordToDic)

In [14]:
def input_fn_gen(textVec, num_epochs=30):
    """Create an estimator ``input_fn`` that feeds ``textVec`` as one batch.

    Args:
        textVec: array-like of document vectors; converted to a float32 tensor.
        num_epochs: value forwarded to ``tf.train.limit_epochs``, i.e. how
            many times the tensor may be produced (default 30, the previously
            hard-coded value).

    Returns:
        A zero-argument function that returns the epoch-limited tensor.
    """
    def input_fn():
        """Return ``textVec`` as an epoch-limited float32 tensor."""
        return tf.train.limit_epochs(
            tf.convert_to_tensor(textVec, dtype=tf.float32),
            num_epochs=num_epochs)
    return input_fn

In [15]:
def train_model(num_clusters, input_fn, num_iterations=10):
    """Train a full-batch k-means estimator and return it with its centers.

    Args:
        num_clusters: number of clusters to fit.
        input_fn: estimator input function supplying the points.
        num_iterations: how many times ``kmeans.train`` is called
            (default 10, the previously hard-coded value).  Must be >= 1.

    Returns:
        A tuple ``(kmeans, cluster_centers)``: the trained estimator and the
        value of ``kmeans.cluster_centers()`` after the last training call.

    Raises:
        ValueError: if ``num_iterations`` is smaller than 1.
    """
    if num_iterations < 1:
        raise ValueError("num_iterations must be at least 1")
    kmeans = tf.contrib.factorization.KMeansClustering(
        num_clusters=num_clusters, use_mini_batch=False)
    for _ in range(num_iterations):
        kmeans.train(input_fn)
    # Only the final centers are returned, so read them once after training
    # instead of after every iteration.
    cluster_centers = kmeans.cluster_centers()
    return kmeans, cluster_centers

In [16]:
def showResult(kmeans, cluster_centers, input_fn, textVec):
    """Print the predicted cluster of every row of ``textVec``.

    Args:
        kmeans: trained estimator exposing ``predict_cluster_index``.
        cluster_centers: accepted for call compatibility; not used here.
        input_fn: input function passed to ``predict_cluster_index``.
        textVec: iterable of points; only its length/order is used.
    """
    # Materialize all predictions, then look them up by row position.
    assignments = list(kmeans.predict_cluster_index(input_fn))
    for position, _ in enumerate(textVec):
        print('index:', position, 'is in cluster', assignments[position])

In [17]:
# Read news/enta_01.txt ... news/enta_20.txt into a list of strings.
entaNews = list()
for i in range(20):
    # The context manager closes each file as soon as it has been read.
    with open("news/enta_" + '{:02}'.format(i + 1) + ".txt", "r") as news_file:
        entaNews.append(news_file.read())

In [18]:
# Reload the keizai articles, this time 20 files: news/keizai_01.txt ... news/keizai_20.txt.
keizaiNews = list()
for i in range(20):
    # The context manager closes each file as soon as it has been read.
    with open("news/keizai_" + '{:02}'.format(i + 1) + ".txt", "r") as news_file:
        keizaiNews.append(news_file.read())

In [19]:
# Reload the spo articles, this time 20 files: news/spo_01.txt ... news/spo_20.txt.
spoNews = list()
for i in range(20):
    # The context manager closes each file as soon as it has been read.
    with open("news/spo_" + '{:02}'.format(i + 1) + ".txt", "r") as news_file:
        spoNews.append(news_file.read())

In [20]:
# Combined corpus: enta articles, then keizai articles, then spo articles.
someList = [*entaNews, *keizaiNews, *spoNews]

In [21]:
# Sanity check: total number of articles in the combined corpus.
len(someList)

60

In [22]:
# Rebuild the vocabulary index and presence matrix for the three-category corpus.
indexDic, textVec = textToVec(someList, wordToDic)

In [23]:
# Display the word -> column-index mapping produced by textToVec.
indexDic

{'!': 0,
 '!」': 1,
 ')': 2,
 '-final': 3,
 '0-0': 4,
 '1': 5,
 '10年': 6,
 '1人': 7,
 '1次リーグd組': 8,
 '1試合': 9,
 '2': 10,
 '2008年': 11,
 '2017年': 12,
 '2018': 13,
 '2018年9月27日': 14,
 '22日': 15,
 '25日': 16,
 '26日': 17,
 '27日': 18,
 '2人': 19,
 '2戦': 20,
 '2戦目': 21,
 '2点': 22,
 '2節': 23,
 '2試合': 24,
 '2連勝': 25,
 '3': 26,
 '3戦': 27,
 '3月': 28,
 '4': 29,
 '6月22日': 30,
 '6月26日': 31,
 '6月27日': 32,
 '6月28日': 33,
 '7月': 34,
 '7月1日': 35,
 '<7201>': 36,
 '<7203>': 37,
 '?': 38,
 '?」': 39,
 'a': 40,
 'and': 41,
 'fifaワールドカップ': 42,
 'game': 43,
 'goal': 44,
 'images': 45,
 'in': 46,
 'joy-con': 47,
 'news': 48,
 'npc': 49,
 'of': 50,
 'remix-』': 51,
 'the': 52,
 'to': 53,
 'wonderful': 54,
 'world': 55,
 'w杯': 56,
 'w杯ロシア大会': 57,
 'w杯出場': 58,
 'あなた': 59,
 'あの': 60,
 'ありがとう': 61,
 'いい': 62,
 'いう': 63,
 'いく': 64,
 'いける': 65,
 'いただく': 66,
 'いっぱい': 67,
 'いろいろ': 68,
 'うまい': 69,
 'うれしい': 70,
 'お': 71,
 'おる': 72,
 'かける': 73,
 'く': 74,
 'くださる': 75,
 'くる': 76,
 'くれる': 77,
 'こう': 78,
 'こうした': 79,
 'ここ': 80,
 'こ

In [24]:
# Cluster the document vectors into 3 clusters and print the cluster
# assigned to each document (rows follow the order of someList).
input_fn = input_fn_gen(textVec)
kmeans, cluster_centers =  train_model(3, input_fn)
showResult(kmeans, cluster_centers, input_fn, textVec)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/jt/w9n_rnhj61xgp3xvr_8mtjvr0000gn/T/tmptn76lsy2', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x132370a90>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/jt/