In [2]:
import numpy as np
import tensorflow as tf
from janome.tokenizer import Tokenizer
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import *
from janome.tokenfilter import *

In [3]:
num_points = 100
dimensions = 2
# Draw the demo points from a dedicated, seeded generator so that a fresh
# "Restart & Run All" reproduces the same data set. The global numpy RNG is
# left untouched.
points_seed = 0
points = np.random.RandomState(points_seed).uniform(
    0, 1000, [num_points, dimensions])

def input_fn():
    """Return the notebook-level `points` as a float32 tensor limited to one epoch."""
    all_points = tf.convert_to_tensor(points, dtype=tf.float32)
    return tf.train.limit_epochs(all_points, num_epochs=1)

num_clusters = 5
# K-means estimator from tf.contrib with mini-batch updates switched off.
kmeans = tf.contrib.factorization.KMeansClustering(
    num_clusters=num_clusters, use_mini_batch=False)

# train
# Every pass calls train() once on input_fn, then prints how far the centers
# moved since the previous pass and the score reported for the same input.
num_iterations = 10
previous_centers = None
for _ in range(num_iterations):
    kmeans.train(input_fn)
    cluster_centers = kmeans.cluster_centers()
    if previous_centers is not None:
        # Skipped on the first pass: there is no earlier value to compare with.
        print('delta:', cluster_centers - previous_centers)
    previous_centers = cluster_centers
    print ('score:', kmeans.score(input_fn))
# `cluster_centers` keeps the value fetched in the final pass.
print ('cluster centers:', cluster_centers)

# map the input points to their clusters
# The predictions are materialized into a list so that they can be looked up
# by the position of each point.
cluster_indices = list(kmeans.predict_cluster_index(input_fn))
for i, point in enumerate(points):
    cluster_index = cluster_indices[i]
    # `cluster_centers` is the value left by the last training iteration.
    center = cluster_centers[cluster_index]
    print('point:', point, 'is in cluster', cluster_index, 'centered at', center)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/jt/w9n_rnhj61xgp3xvr_8mtjvr0000gn/T/tmp5bets6i0', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1265a9198>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/jt/

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-06-24-09:18:26
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/jt/w9n_rnhj61xgp3xvr_8mtjvr0000gn/T/tmp5bets6i0/model.ckpt-11
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [1/1]
INFO:tensorflow:Finished evaluation at 2018-06-24-09:18:26
INFO:tensorflow:Saving dict for global step 11: global_step = 11, loss = 3635154.5, score = 3635154.5
score: 3635154.5
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/jt/w9n_rnhj61xgp3xvr_8mtjvr0000gn/T/tmp5bets6i0/model.ckpt-11
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 12 into /var/folders/jt/w9n_rnhj61xgp3xvr_8mtjvr0000gn/T/tmp5

point: [300.66443043 203.94495023] is in cluster 3 centered at [234.63187 296.65164]
point: [156.8933003  702.85193433] is in cluster 4 centered at [240.76158 793.7662 ]
point: [333.34591539 850.50346304] is in cluster 4 centered at [240.76158 793.7662 ]
point: [133.60802799 901.31350314] is in cluster 4 centered at [240.76158 793.7662 ]
point: [454.764597   108.19056484] is in cluster 3 centered at [234.63187 296.65164]
point: [606.07332228 770.36250466] is in cluster 2 centered at [631.6959 792.5859]
point: [317.51971795 428.66086761] is in cluster 3 centered at [234.63187 296.65164]
point: [645.97581135 416.87560987] is in cluster 1 centered at [760.9083  284.38184]
point: [334.60620818 997.82731587] is in cluster 4 centered at [240.76158 793.7662 ]
point: [ 26.89179815 230.92718909] is in cluster 3 centered at [234.63187 296.65164]
point: [724.89691852 882.67078483] is in cluster 2 centered at [631.6959 792.5859]
point: [608.49712623 708.60297369] is in cluster 2 centered at [631.6

In [3]:
def wordToDicGen(tokenizer, char_filters, token_filters):
    """Build a function that maps a text to the base forms it contains.

    Args:
        tokenizer: tokenizer handed to the janome ``Analyzer``.
        char_filters: character filters handed to the ``Analyzer``.
        token_filters: token filters handed to the ``Analyzer``.

    Returns:
        A function ``wordToDic(text)`` that returns a dict keyed by the
        ``base_form`` of every token the analyzer yields for ``text``. Every
        value is 1, so the dict only records presence, not frequency.
    """
    # Build the analyzer once and reuse it for every text instead of
    # re-creating it on each call. The arguments are passed by keyword so the
    # call does not depend on the constructor's positional order (newer
    # janome releases accept them as keywords only).
    analyzer = Analyzer(
        char_filters=char_filters,
        tokenizer=tokenizer,
        token_filters=token_filters,
    )

    def wordToDic(text):
        return {token.base_form: 1 for token in analyzer.analyze(text)}

    return wordToDic

In [4]:
def keyWithIndex(dic: dict):
    """Number the keys of `dic` in sorted order, starting at 0.

    Returns a new dict mapping each key to its position; the values of `dic`
    are ignored.
    """
    return {key: position for position, key in enumerate(sorted(dic.keys()))}

In [5]:
def dicToVec(index_dic, target_dic):
    """Encode the keys of `target_dic` as a 0/1 vector over `index_dic`.

    Args:
        index_dic: mapping from key to its position in the output vector.
        target_dic: iterable of keys (typically a dict) to mark as present.

    Returns:
        A numpy array of length ``len(index_dic)`` holding 1.0 at the position
        of every key that appears in both arguments and 0.0 elsewhere. Keys
        missing from `index_dic` are ignored.
    """
    resultVec = np.zeros(len(index_dic))
    # The order in which keys are visited cannot change the result, so the
    # keys are walked directly rather than sorted first. This also avoids
    # failing on keys that cannot be ordered against each other.
    for key in target_dic:
        if key in index_dic:
            resultVec[index_dic[key]] = 1.0
    return resultVec

In [6]:
# Text-analysis pipeline shared by the cells below.
char_filters = [UnicodeNormalizeCharFilter()]
tokenizer = Tokenizer()
# POS tags handed to POSStopFilter. Each tag is listed once; the original
# list repeated '助動詞'.
stop_pos_tags = ['記号', '助詞', '助動詞']
token_filters = [CompoundNounFilter(), POSStopFilter(stop_pos_tags), LowerCaseFilter()]
wordToDic = wordToDicGen(tokenizer, char_filters, token_filters)

In [7]:
def mergeCountDic(someDic, newDic):
    """Add one to the count in `someDic` for every key of `newDic`.

    Keys not yet present start at 1. `someDic` is updated in place and also
    returned; the values of `newDic` are not read.
    """
    for word in newDic:
        someDic[word] = someDic.get(word, 0) + 1
    return someDic

In [8]:
def dicCountFilter(min, max, dic):
    """Select the keys of `dic` whose value v satisfies ``min <= v < max``.

    Returns a new dict that maps every selected key to 1. Note that the
    parameter names shadow the built-in `min` and `max` inside this function.
    """
    return {
        key: 1
        for key, count in dic.items()
        if count >= min and count < max
    }

In [9]:
def textToVec(textList: list, wordToDic, min_count=4, max_count=8):
    """Turn a list of texts into 0/1 presence vectors over a shared vocabulary.

    Args:
        textList: the texts to vectorize.
        wordToDic: function mapping one text to a dict keyed by its words.
        min_count: a word is kept only if it occurs in at least this many
            texts (default 4, the value that was previously hard-coded).
        max_count: a word is kept only if it occurs in fewer than this many
            texts (default 8, the value that was previously hard-coded).

    Returns:
        A tuple ``(indexDic, matrix)``. ``indexDic`` maps every kept word to
        its column, and ``matrix`` has one row per text with 1.0 in the
        columns of the kept words that text contains. For an empty `textList`
        the matrix has shape ``(0, len(indexDic))``.
    """
    textCountDic = dict()
    textDicList = list()
    for text in textList:
        textDic = wordToDic(text)
        # Each text contributes at most 1 per word, so the totals count texts.
        textCountDic = mergeCountDic(textCountDic, textDic)
        textDicList.append(textDic)
    indexDic = keyWithIndex(dicCountFilter(min_count, max_count, textCountDic))
    if not textDicList:
        # np.vstack raises on an empty sequence; return an empty matrix instead.
        return indexDic, np.zeros((0, len(indexDic)))
    return indexDic, np.vstack([dicToVec(indexDic, textDic) for textDic in textDicList])

In [10]:
def input_fn_gen(textVec, num_epochs=30):
    """Create an estimator input function that serves `textVec`.

    Args:
        textVec: the data to feed; converted to a float32 tensor on every
            call of the returned function.
        num_epochs: how many epochs the returned function limits the data to
            (default 30, the value that was previously hard-coded).

    Returns:
        A zero-argument function that returns the epoch-limited tensor.
    """
    def input_fn():
        # The tensor is created inside input_fn, i.e. each time it is called.
        return tf.train.limit_epochs(
            tf.convert_to_tensor(textVec, dtype=tf.float32), num_epochs=num_epochs)
    return input_fn

In [11]:
def train_model(num_clusters, input_fn, num_iterations=10):
    """Fit a k-means estimator and return it with its final cluster centers.

    Args:
        num_clusters: number of clusters passed to ``KMeansClustering``.
        input_fn: input function passed to every ``train`` call.
        num_iterations: how many times ``train`` is called (default 10, the
            value that was previously hard-coded). Must be at least 1.

    Returns:
        A tuple ``(kmeans, cluster_centers)`` where ``cluster_centers`` is
        what ``kmeans.cluster_centers()`` returns after the last ``train``.

    Raises:
        ValueError: if `num_iterations` is smaller than 1.
    """
    if num_iterations < 1:
        raise ValueError('num_iterations must be at least 1')
    kmeans = tf.contrib.factorization.KMeansClustering(num_clusters=num_clusters, use_mini_batch=False)
    # train
    for _ in range(num_iterations):
        kmeans.train(input_fn)
    # Only the centers after the final iteration are returned, so they are
    # fetched once here rather than after every iteration.
    cluster_centers = kmeans.cluster_centers()
    return kmeans, cluster_centers

In [12]:
def showResult(kmeans, cluster_centers, input_fn, textVec):
    """Print the predicted cluster index for every row of `textVec`.

    `cluster_centers` is accepted for the caller's convenience but is not
    read by this function.
    """
    # Materialize the predictions so each row can look up its own entry.
    assignments = list(kmeans.predict_cluster_index(input_fn))
    for row, _ in enumerate(textVec):
        print('index:', row, 'is in cluster', assignments[row])

In [13]:
# Load news/keizai_01.txt .. news/keizai_10.txt, one string per file.
keizaiNews = list()
for i in range(10):
    # The context manager closes each file handle as soon as it has been read;
    # the bare open(...).read() used before never closed it explicitly.
    with open("news/keizai_"  + '{:02}'.format(i+1) + ".txt", "r") as news_file:
        keizaiNews.append(news_file.read())

In [14]:
# Load news/spo_01.txt .. news/spo_10.txt, one string per file.
spoNews = list()
for i in range(10):
    # The context manager closes each file handle as soon as it has been read;
    # the bare open(...).read() used before never closed it explicitly.
    with open("news/spo_"  + '{:02}'.format(i+1) + ".txt", "r") as news_file:
        spoNews.append(news_file.read())

In [15]:
# Combined corpus: every keizaiNews entry first, followed by every spoNews entry.
someList = [*keizaiNews, *spoNews]

In [16]:
# Sanity check: number of texts in the combined corpus.
len(someList)

20

In [17]:
# Vectorize the corpus: indexDic maps word -> column, textVec has one row per text.
indexDic, textVec = textToVec(someList, wordToDic)

In [18]:
# Inspect the selected vocabulary and the column assigned to each word.
indexDic

{'いう': 0,
 'いく': 1,
 'おる': 2,
 'かける': 3,
 'この': 4,
 'せる': 5,
 'そして': 6,
 'その': 7,
 'できる': 8,
 'もの': 9,
 'よう': 10,
 'られる': 11,
 'わけ': 12,
 'チーム': 13,
 '一方': 14,
 '中': 15,
 '今': 16,
 '出る': 17,
 '出場': 18,
 '受ける': 19,
 '同じ': 20,
 '多く': 21,
 '大会': 22,
 '強い': 23,
 '思う': 24,
 '日本': 25,
 '決める': 26,
 '決勝トーナメント': 27,
 '発表': 28,
 '第': 29,
 '結果': 30,
 '行う': 31,
 '試合': 32,
 '語る': 33,
 '選手': 34,
 '開く': 35,
 '開幕': 36}

In [21]:
# Row 1 comes from keizaiNews (the first ten entries of someList).
textVec[1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0.])

In [22]:
# Row 11 comes from spoNews (entries 10-19 of someList).
textVec[11]

array([1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 0., 0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1.,
       1., 0., 0.])

In [23]:
# Row 12 also comes from spoNews (entries 10-19 of someList).
textVec[12]

array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 0.,
       0., 0., 0.])

In [19]:
# Cluster the text vectors into two groups and print each row's cluster.
# NOTE: this rebinds the notebook-level names `input_fn`, `kmeans` and
# `cluster_centers` that the random-points demo cell defined earlier.
input_fn = input_fn_gen(textVec)
kmeans, cluster_centers =  train_model(2, input_fn)
showResult(kmeans, cluster_centers, input_fn, textVec)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/jt/w9n_rnhj61xgp3xvr_8mtjvr0000gn/T/tmpvbjwpsvv', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12c8b2320>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/jt/