In [71]:
import re
import urllib.request
import zipfile
from lxml import etree
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd

In [72]:
data = [
    "action","arcade & rhythm","fighting & martial arts", "first-person shooter","hack & slash","platformer & runner",
    "third-person shooter", "shmup","role-playing","action rpg","adventure rpg","jrpg","party-based","rogue-like","strategy rpg",
    "turn-based","strategy","card & board","city & settlement","grand & 4x","military","real-time strategy","tower defense","turn-based strategy",
    "adventure","adventure rpg","casual","hidden object","metroidvania","puzzle","story-rich","visual novel","simulation","building & automation",
    "dating","farming & crafting","hobby & job","life & immersive","sandbox & physics","space & flight","sports & racing","all sports","fishing & hunting",
    "individual sports","racing","racing sim","sports sim","team sports"
]
data = list(set(data))

In [73]:
normalized_text = []
for string in data:
     tokens = re.sub(r"[^a-z0-9]+", " ", string.lower())
     normalized_text.append(tokens)

# 각 문장에 대해서 NLTK를 이용하여 단어 토큰화를 수행.
# result = [word_tokenize(sentence) for sentence in normalized_text]
result = [[sentence] for sentence in normalized_text]

In [74]:
result

[['building automation'],
 ['life immersive'],
 ['military'],
 ['real time strategy'],
 ['racing'],
 ['city settlement'],
 ['strategy'],
 ['visual novel'],
 ['farming crafting'],
 ['sandbox physics'],
 ['all sports'],
 ['role playing'],
 ['space flight'],
 ['platformer runner'],
 ['team sports'],
 ['metroidvania'],
 ['tower defense'],
 ['fighting martial arts'],
 ['adventure rpg'],
 ['individual sports'],
 ['grand 4x'],
 ['story rich'],
 ['rogue like'],
 ['simulation'],
 ['racing sim'],
 ['party based'],
 ['dating'],
 ['adventure'],
 ['first person shooter'],
 ['hidden object'],
 ['puzzle'],
 ['sports sim'],
 ['third person shooter'],
 ['hobby job'],
 ['hack slash'],
 ['turn based strategy'],
 ['jrpg'],
 ['card board'],
 ['sports racing'],
 ['arcade rhythm'],
 ['action rpg'],
 ['strategy rpg'],
 ['fishing hunting'],
 ['shmup'],
 ['casual'],
 ['turn based'],
 ['action']]

In [75]:
print('총 샘플의 개수 : {}'.format(len(result)))

총 샘플의 개수 : 47


In [76]:
from gensim.models import Word2Vec
model = Word2Vec(result,min_count=1,workers=4, sg=1)
model.build_vocab(corpus_iterable=result)
model.train(corpus_iterable=result, total_examples=len(result), epochs=10)

(114, 470)

In [77]:
model.wv['all sports']

array([ 7.0887972e-03, -1.5679300e-03,  7.9474989e-03, -9.4886590e-03,
       -8.0294991e-03, -6.6403709e-03, -4.0034545e-03,  4.9892161e-03,
       -3.8135587e-03, -8.3199050e-03,  8.4117772e-03, -3.7470020e-03,
        8.6086961e-03, -4.8957514e-03,  3.9185942e-03,  4.9220170e-03,
        2.3926091e-03, -2.8188038e-03,  2.8491246e-03, -8.2562361e-03,
       -2.7655398e-03, -2.5911583e-03,  7.2490061e-03, -3.4634031e-03,
       -6.5997029e-03,  4.3404270e-03, -4.7448516e-04, -3.5975564e-03,
        6.8824720e-03,  3.8723124e-03, -3.9002013e-03,  7.7188847e-04,
        9.1435025e-03,  7.7546560e-03,  6.3618720e-03,  4.6673026e-03,
        2.3844899e-03, -1.8416261e-03, -6.3712932e-03, -3.0181051e-04,
       -1.5653884e-03, -5.7228567e-04, -6.2628710e-03,  7.4340473e-03,
       -6.5914928e-03, -7.2392775e-03, -2.7571463e-03, -1.5154004e-03,
       -7.6357173e-03,  6.9824100e-04, -5.3261113e-03, -1.2755442e-03,
       -7.3651113e-03,  1.9605684e-03,  3.2731986e-03, -2.3138524e-05,
      

In [78]:
model.wv.most_similar("all sports")

[('shmup', 0.2685850262641907),
 ('turn based strategy', 0.14284475147724152),
 ('space flight', 0.12813477218151093),
 ('role playing', 0.10941850394010544),
 ('visual novel', 0.1088901236653328),
 ('simulation', 0.10807985067367554),
 ('racing sim', 0.10195513069629669),
 ('rogue like', 0.09932279586791992),
 ('fighting martial arts', 0.09614861756563187),
 ('individual sports', 0.08635812252759933)]

In [79]:
# 단어 유사도
# model.wv.similarity('sportsracing', 'allsports')

In [80]:
from sklearn.cluster import KMeans

word_vectors = model.wv.vectors # 어휘의 feature vector


In [81]:
word_vectors.shape

(47, 100)

In [82]:
num_clusters = int(word_vectors.shape[0]/5) # 어휘 크기의 1/5나 평균 5단어
print(num_clusters)
num_clusters = int(num_clusters)

9


In [83]:
kmeans_clustering = KMeans(n_clusters=num_clusters)
idx = kmeans_clustering.fit_predict(word_vectors)

idx = list(idx)
names = model.wv.index_to_key
word_centroid_map = {names[i]: idx[i] for i in range(len(names))}



In [84]:
for c in range(num_clusters):
    # 클러스터 번호를 출력
    print("\ncluster {}".format(c))
    
    words = []
    cluster_values = list(word_centroid_map.values())
    for i in range(len(cluster_values)):
        if (cluster_values[i] == c):
            words.append(list(word_centroid_map.keys())[i])            
    print(words)


cluster 0
['sandbox physics', 'casual', 'building automation']

cluster 1
['role playing', 'grand 4x', 'fighting martial arts', 'tower defense', 'city settlement', 'real time strategy', 'military', 'strategy rpg', 'jrpg', 'hack slash', 'puzzle']

cluster 2
['platformer runner', 'strategy', 'racing', 'story rich', 'simulation', 'arcade rhythm', 'sports racing', 'party based']

cluster 3
['all sports', 'turn based', 'turn based strategy', 'action rpg']

cluster 4
['individual sports', 'team sports', 'visual novel', 'card board', 'first person shooter', 'dating']

cluster 5
['metroidvania', 'rogue like', 'life immersive', 'sports sim', 'hidden object']

cluster 6
['fishing hunting', 'racing sim']

cluster 7
['adventure rpg', 'space flight', 'shmup', 'hobby job', 'third person shooter']

cluster 8
['action', 'farming crafting', 'adventure']
