In [62]:
import re
import urllib.request
import zipfile
from lxml import etree
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd

In [63]:
data = [
    "action","arcade & rhythm","fighting & martial arts", "first-person shooter","hack & slash","platformer & runner",
    "third-person shooter", "shmup","role-playing","action rpg","adventure rpg","jrpg","party-based","rogue-like","strategy rpg",
    "turn-based","strategy","card & board","city & settlement","grand & 4x","military","real-time strategy","tower defense","turn-based strategy",
    "adventure","adventure rpg","casual","hidden object","metroidvania","puzzle","story-rich","visual novel","simulation","building & automation",
    "dating","farming & crafting","hobby & job","life & immersive","sandbox & physics","space & flight","sports & racing","all sports","fishing & hunting",
    "individual sports","racing","racing sim","sports sim","team sports"
]
data = list(set(data))

In [64]:
normalized_text = []
for string in data:
     tokens = re.sub(r"[^a-z0-9]+", " ", string.lower())
     normalized_text.append(tokens)

# 각 문장에 대해서 NLTK를 이용하여 단어 토큰화를 수행.
# result = [word_tokenize(sentence) for sentence in normalized_text]
result = [[sentence] for sentence in normalized_text]

In [65]:
result

[['visual novel'],
 ['first person shooter'],
 ['fighting martial arts'],
 ['military'],
 ['arcade rhythm'],
 ['sports racing'],
 ['racing sim'],
 ['rogue like'],
 ['action rpg'],
 ['all sports'],
 ['third person shooter'],
 ['grand 4x'],
 ['hack slash'],
 ['jrpg'],
 ['story rich'],
 ['card board'],
 ['sandbox physics'],
 ['strategy rpg'],
 ['puzzle'],
 ['real time strategy'],
 ['hidden object'],
 ['platformer runner'],
 ['strategy'],
 ['simulation'],
 ['adventure'],
 ['life immersive'],
 ['hobby job'],
 ['dating'],
 ['city settlement'],
 ['building automation'],
 ['casual'],
 ['fishing hunting'],
 ['party based'],
 ['turn based strategy'],
 ['role playing'],
 ['sports sim'],
 ['farming crafting'],
 ['racing'],
 ['individual sports'],
 ['tower defense'],
 ['space flight'],
 ['action'],
 ['metroidvania'],
 ['turn based'],
 ['team sports'],
 ['shmup'],
 ['adventure rpg']]

In [66]:
print('총 샘플의 개수 : {}'.format(len(result)))

총 샘플의 개수 : 47


In [67]:
from gensim.models import FastText
model = FastText(result,min_count=1,workers=4, sg=1)
model.build_vocab(corpus_iterable=result)
model.train(corpus_iterable=result, total_examples=len(result), epochs=10)

(114, 470)

In [68]:
model.wv.most_similar("all sports")

[('individual sports', 0.4489245116710663),
 ('team sports', 0.3898385465145111),
 ('sports sim', 0.2855340540409088),
 ('arcade rhythm', 0.2211601585149765),
 ('turn based strategy', 0.2140343338251114),
 ('sports racing', 0.20826105773448944),
 ('visual novel', 0.1984509378671646),
 ('racing', 0.16095691919326782),
 ('card board', 0.14919812977313995),
 ('sandbox physics', 0.147965207695961)]

In [69]:
# 단어 유사도
model.wv.similarity('sports racing', 'all sports')

0.20826103

In [70]:
from sklearn.cluster import KMeans

word_vectors = model.wv.vectors # 어휘의 feature vector


In [71]:
word_vectors.shape

(47, 100)

In [85]:
num_clusters = int(word_vectors.shape[0]/4) # 어휘 크기의 1/5나 평균 5단어
print(num_clusters)
num_clusters = int(num_clusters)

11


In [86]:
kmeans_clustering = KMeans(n_clusters=num_clusters)
idx = kmeans_clustering.fit_predict(word_vectors)

idx = list(idx)
names = model.wv.index_to_key
word_centroid_map = {names[i]: idx[i] for i in range(len(names))}



In [87]:
for c in range(num_clusters):
    # 클러스터 번호를 출력
    print("\ncluster {}".format(c))
    
    words = []
    cluster_values = list(word_centroid_map.values())
    for i in range(len(cluster_values)):
        if (cluster_values[i] == c):
            words.append(list(word_centroid_map.keys())[i])            
    print(words)


cluster 0
['sports racing', 'sports sim', 'racing']

cluster 1
['rogue like', 'platformer runner', 'simulation', 'space flight', 'farming crafting', 'role playing', 'party based', 'fishing hunting', 'building automation', 'dating', 'hobby job']

cluster 2
['action']

cluster 3
['hack slash', 'first person shooter', 'tower defense', 'casual']

cluster 4
['jrpg']

cluster 5
['puzzle']

cluster 6
['team sports']

cluster 7
['hidden object', 'real time strategy', 'sandbox physics', 'card board', 'story rich', 'third person shooter', 'all sports', 'racing sim', 'arcade rhythm', 'military', 'fighting martial arts', 'turn based', 'metroidvania', 'individual sports', 'turn based strategy', 'city settlement', 'life immersive', 'visual novel']

cluster 8
['grand 4x']

cluster 9
['adventure rpg', 'strategy rpg', 'strategy', 'action rpg', 'adventure']

cluster 10
['shmup']
