In [1]:
from keybert import KeyBERT
from kiwipiepy import Kiwi
from transformers import BertModel
import pandas as pd
from collections import defaultdict
import glob
import math
from tqdm import tqdm
import numpy as np

In [2]:
model = BertModel.from_pretrained('skt/kobert-base-v1')
kw_model = KeyBERT(model)
# keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 1), stop_words=None, top_n=10)
# keywords

In [3]:
kiwi = Kiwi()

In [4]:
# 명사 추출 함수
def noun_extractor(text):
    results = []
    result = kiwi.analyze(text)
    for token, pos, _, _ in result[0][0]:
        if len(token) != 1 and pos.startswith('N') or pos.startswith('SL'):
            results.append(token)
    return results

In [5]:
review_files = glob.glob('./DF/*csv')
review_files

['./DF\\AIGames.DSFMK.csv',
 './DF\\cjenm.ModooMarbleKakao.csv',
 './DF\\cjenm.monster.csv',
 './DF\\cjenm.sknights.csv',
 './DF\\Danalgames.CobFighter.csv',
 './DF\\dddgame.sd3.csv',
 './DF\\fincon.hh.csv',
 './DF\\fincon.hh2.csv',
 './DF\\fingertips.kof98umol.csv',
 './DF\\flerogames.ES.csv',
 './DF\\ftt.hero_gl_4kakao.csv',
 './DF\\funnypack.rfkakao.csv',
 './DF\\gamevil.psrforkakao.csv',
 './DF\\grampus.cookingadventureforkakao.csv',
 './DF\\joymax.candypang2.csv',
 './DF\\joymax.gostop.csv',
 './DF\\kakaogames.archewar.csv',
 './DF\\kakaogames.ares.csv',
 './DF\\kakaogames.eversoul.csv',
 './DF\\kakaogames.friendsKing.csv',
 './DF\\kakaogames.friendsScpuzzle.csv',
 './DF\\kakaogames.friendsTower.csv',
 './DF\\kakaogames.friendsTown.csv',
 './DF\\kakaogames.gdtskr.csv',
 './DF\\kakaogames.grdchase.csv',
 './DF\\kakaogames.hellobt21.csv',
 './DF\\kakaogames.lmzgplay.csv',
 './DF\\kakaogames.moonlight.csv',
 './DF\\kakaogames.myhome.csv',
 './DF\\kakaogames.nikki.csv',
 './DF\\kakaog

In [6]:
li = []
stop_words = ["게임", "이거", "하나", "유저", "나름", "업데이트", "접속", "플레이"]
for path in tqdm(review_files):
    d = defaultdict(int)
    df = pd.read_csv(path)
    for review in df['reviews']:
        if type(review) == str:
            nouns = noun_extractor(review)
            text = ' '.join(nouns)
            keywords = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 1), stop_words=stop_words, top_n=20)
            for key, val in keywords:
                d[key] += val
        elif math.isnan(review):
            continue
    # li.append(sorted(d.items(), key= lambda x: -x[1])[:10])
    li.append(sorted(d.items(), key= lambda x: -x[1])) # 전체를 다 주는경우 
    

100%|██████████| 91/91 [39:54<00:00, 26.32s/it]


In [7]:
li

[[('재미', 20.0958),
  ('선수', 13.8591),
  ('시간', 12.5131),
  ('나름', 7.0),
  ('처음', 6.1943),
  ('재밋', 5.7599),
  ('축구', 4.2936),
  ('재밋어요', 3.9778000000000002),
  ('그래픽', 3.1656),
  ('플레이', 3.1583),
  ('최고', 3.1231),
  ('킬링타임', 3.0),
  ('감독', 2.8066),
  ('업뎃', 2.7363),
  ('시작', 2.7119),
  ('생각', 2.7088),
  ('적응', 2.4588),
  ('진행', 2.4396),
  ('터치', 2.3418),
  ('fm', 2.3023),
  ('자동', 2.1298),
  ('과금', 2.1244),
  ('조아요', 2.0),
  ('good', 1.9992),
  ('만족', 1.9937),
  ('재밋네', 1.9868000000000001),
  ('리얼', 1.9042),
  ('화면', 1.8942999999999999),
  ('잼나요', 1.8809),
  ('조작', 1.8357),
  ('킬링', 1.7932000000000001),
  ('단순', 1.7351999999999999),
  ('정도', 1.6633),
  ('초반', 1.6472),
  ('매니저', 1.6205),
  ('느낌', 1.5967),
  ('심심풀이', 1.5857999999999999),
  ('부분', 1.5790000000000002),
  ('필요', 1.5756000000000001),
  ('전술', 1.4945),
  ('예전', 1.4926),
  ('이름', 1.4901),
  ('오랜만', 1.4047),
  ('기능', 1.3860000000000001),
  ('영입', 1.3814),
  ('경기', 1.3517000000000001),
  ('업데이트', 1.3138),
  ('최근', 1.299100000000

In [8]:
d_idf = defaultdict(int)

for game_keywords in li:
    for key, _ in game_keywords:
        d_idf[key] += 1
    

In [9]:
tf_idf = []

N = len(li)

for i in range(len(li)):
    n_tuple = []
    for j in range(len(li[i])):
        n_tuple.append([li[i][j][0], li[i][j][1] * math.log(N / d_idf[li[i][j][0]] + 1)])
        
    n_tuple.sort(key= lambda x : -x[1])
    only_keywords = []
    for key in n_tuple[:10]:
        only_keywords.append(key[0])
    tf_idf.append(only_keywords)

In [10]:
print(tf_idf)

[['선수', '축구', '재미', '감독', 'fm', '시간', '재밋네', '나름', '영입', '전술'], ['행템', '모마', '마블', '주사위', '유저', '아이템', '캐릭터', '행운', '카드', '확률'], ['신장', '유저', '업데이트', '대륙', '몬스터', '캐릭터', '복귀', '옛날', '초월', '미나'], ['세븐나이츠', '리부트', '대장간', '유저', '신화', '복귀', '각성', '영웅', '업데이트', '생각'], ['오락실', '추억', '베이비', '파이터', '재미', '옛날', '실행', '오랜만', '스트레스', '컴온'], ['장수', '삼국지', '디펜스', '나팔', '루비', '군웅', '마음의소리', '도감', '소탕', '업데이트'], ['히어로', '헬히', '영정', '캐럿', '유저', '헬로', '접속', '영웅', '정지', '영구'], ['히어로', '헬로', '재미', '영웅', '폭군', '캐릭터', '시간', '그래픽', '모험', '킬링타임'], ['자질', '격투', '접속', '업데이트', '대전', '킹오파', '서버', '다이아', '현지', '유저'], ['통신', '하트', '상태', '에브리타운', '접속', '재미', '퍼즐', '캐릭터', '시간', '종료'], ['영웅', '초코', '업데이트', '화면', '환생', '바하무트', '실행', '접속', '복귀', '주년'], ['낚시', '물고기', '바다', '어부', '수족관', '라이벌', '피싱', '손맛', '접속', '장비'], ['유저', '퍼스트', '별되', '업데이트', '복귀', '퍼임', '인피니티', '신위', '동료', '캐릭터'], ['광고', '럭키', '업데이트', '레스토랑', '식당', '음식', '요리', '마리셰', '음식점', '다이아'], ['시간', '캔디팡', '하트', '업데이트', '재미', '챔피언스리그', '리그', '중독', '스테이지', '아이템'

In [11]:
df = pd.DataFrame(columns=['game_name','keywords'])
df['game_name'] = review_files
df['keywords'] = tf_idf
df

Unnamed: 0,game_name,keywords
0,./DF\AIGames.DSFMK.csv,"[선수, 축구, 재미, 감독, fm, 시간, 재밋네, 나름, 영입, 전술]"
1,./DF\cjenm.ModooMarbleKakao.csv,"[행템, 모마, 마블, 주사위, 유저, 아이템, 캐릭터, 행운, 카드, 확률]"
2,./DF\cjenm.monster.csv,"[신장, 유저, 업데이트, 대륙, 몬스터, 캐릭터, 복귀, 옛날, 초월, 미나]"
3,./DF\cjenm.sknights.csv,"[세븐나이츠, 리부트, 대장간, 유저, 신화, 복귀, 각성, 영웅, 업데이트, 생각]"
4,./DF\Danalgames.CobFighter.csv,"[오락실, 추억, 베이비, 파이터, 재미, 옛날, 실행, 오랜만, 스트레스, 컴온]"
...,...,...
86,./DF\SuperAwesome.HKFKakao.csv,"[키티, 헬로키티, 쿠로미, 캐릭터, 산리오, 사과, 재미, 시간, 프렌즈, 친구]"
87,./DF\webzen.mua2.google.csv,"[엔젤, 웹젠, 접속, 과금, 아크, 보스, 재미, 생각, 현지, 그래픽]"
88,./DF\wemade.mir4.csv,"[미르4, 그래픽, 미르, 채집, 업데이트, 플레이, 접속, 컨텐츠, 토벌, 진행]"
89,./DF\wemade.mirm.csv,"[미르, 그래픽, 리니지, 진행, 사냥, 재미, 화신, 버그, 과금, 유저]"


In [12]:
df.to_csv('game_keywords.csv', index=False)