In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("0425_ENG_final_drama_reviews.csv")

In [3]:
drama_titles = df.groupby('title',as_index=False).count().title
drama_titles

0                          18어게인
1                  365:운명을거스르는1년
2                       60일지정생존자
3                           D.P.
4     검색어를입력하세요WWW
                 ...            
87                  하늘에서내리는일억개의별
88                          하이에나
89                      한번다녀왔습니다
90                          해피니스
91                         호텔델루나
Name: title, Length: 92, dtype: object

In [4]:
reviews = df.reviews.to_list()
sentences = ",".join(reviews)
reviews_by_drama = []
for title in titles:
    rev_by_drama = df[df.title == title].reviews
    reviews_by_drama.append(','.join(rev_by_drama))

In [5]:
len(reviews)

182687

# 필요 함수들

In [None]:
# pip install contextualized_topic_models

In [None]:
# pip install tensorflow

In [6]:
import re
import pandas as pd
import numpy as np
import string
from tensorflow.keras.preprocessing.text import text_to_word_sequence
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
# from contextualized_topic_models.models.ctm import CombinedTM
# from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation, bert_embeddings_from_list
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
# from konlpy.tag import Okt
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [None]:
from nltk.corpus import stopwords
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
stop_words = set(stopwords.words('english'))
stop_words.update(('\r\n', '\n\r', 'ever', 'much', 'look', 'squid', 'show', 'thing', "i've", 'anything', 'something', "show's",
                   'www', 'soompi' 'com', 'instagram', 'youtube', 'https', 'mydramalist', 'twitter', 'episode', 'comment', 'scene',
                   'version', "he's", 'gonna', 'series', 'watch', 'everything', 'something', "can't", 'list', 'dramas', 'drama',
                   'wait', 'preview', 'someone', 'everyone', 'dont', 'think', 'season', 'anyone', 'something', 'anything', 'nothing', 'world',
                   'status', 'week', 'name', 'cause', 'time', 'en', 'org', 'wikipedia', 'wiki', 'pbs', 'twimg', 'year', 'point', 'please', 'today',
                   'haha', 'case', 'guess', 'reason', 'person', 'moment', 'sense', 'kinda', 'part', 'movie', 'school', 'start', 'work', 'lead', 'kind',
                   'rate', 'rating', 'rate', 'men', 'example', 'idea', 'half', 'review', 'genre', 'side', "that's", "they're", 'till', 'tell', 'phone',
                   'section', 'number', 'company', 'line', "there's", 'male', 'team', 'rating', 'baby', 'course', 'care', 'cute', 'question', 'help', 'group',
                   'hand', 'spoiler', 'hate', 'need', 'mess', 'change', 'drop', 'date', 'netflix', 'yeah', 'daon', 'park', 'thank', 'lmao', 'damn', "i'll",
                   'kang', 'shinwoo', 'taekyung', 'mean', 'woman', 'hope', 'read', 'fact', 'opinion', 'stuff', 'feel', 'kdrama', 'talk', 'song', 'hype',
                   'title', 'type'))

# 사용 함수

In [8]:
def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('V'):
        return 'v'
    elif pos_tag.startswith('N'):
        return 'n'
    elif pos_tag.startswith('J'):
        return 'a'
    elif pos_tag.startswith('R'):
        return 'r'
    else:
        return None

# CustomTokenizer class setting

In [9]:
class CustomTokenizer:
    def __init__(self, tagger):
        self.tagger = tagger
    def __call__(self, a):
        # a = ' '.join(a)
        word_tokens = self.tagger(a)
        
        words = []
        for i in word_tokens:
            text = re.sub('[^a-zA-Z0-9\']','',i).strip()
            text = re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`…》]','', text)
            if(text != ''):
                words.append(text)
        
        tag_words = nltk.pos_tag(words)
        pos_words = [word for word in tag_words if word[1][0] in {'N'}] #'V','N','J','R'

        temp_list = []
        for token, pos_tag in pos_words:
            tag = get_wordnet_pos(pos_tag)
            if tag != None:
                temp_list.append((token, get_wordnet_pos(pos_tag)))
        lemma = WordNetLemmatizer()
        token_final = [lemma.lemmatize(token, pos=tag) for token, tag in temp_list]
        long_words = [i for i in token_final if len(i) > 2]
        results = [w for w in long_words if w not in stop_words]
        
        return results

# 작업

In [10]:
def tfidf_vectorizing(reviews):
    ngram_range = (1,2)

    custom_tokenizer = CustomTokenizer(text_to_word_sequence)

    tfidf = TfidfVectorizer(tokenizer=custom_tokenizer,ngram_range = ngram_range,\
        stop_words = stop_words, max_df=10, min_df=2 ,max_features=5000).fit(reviews)
    candidates = tfidf.get_feature_names()
    
    return candidates

# 임베딩

In [13]:
def embedding(sentences,candidates):
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

    doc_embedding = model.encode([sentences])
    candidate_embeddings = model.encode(candidates)
    # drama_embedding = model.encode(reviews_by_drama)

    return doc_embedding, candidate_embeddings

In [81]:
drama_embedding.shape

(92, 768)

# 코사인 유사도 함수

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
top_n = 500
diversity = 0.6

def mmr(doc_embedding, candidate_embeddings, words, top_n, diversity):
    word_doc_distances = cosine_similarity(candidate_embeddings,doc_embedding)
    word_distances = cosine_similarity(candidate_embeddings) 

    keywords_idx = [np.argmax(word_doc_distances)]
    candidates_idx = [i for i in range(len(candidates)) if i != keywords_idx[0]]
    
    for _ in range(top_n-1):
        try:
        # 후보 키워드들의 문서유사도 값
            candidate_similarities = word_doc_distances[candidates_idx, :]
            # 후보 키워드와 가장 유사한 키워드
            target_similarities = np.max(word_distances[candidates_idx][:,keywords_idx], axis=1) 

            mmr = (1-diversity) * candidate_similarities - diversity * target_similarities.reshape(-1,1)
            mmr_idx = candidates_idx[np.argmax(mmr)]

            keywords_idx.append(mmr_idx)
            candidates_idx.remove(mmr_idx)
        except:
            break

    # 본 단어 임베딩벡터에서 추출키워드에 해당하는 벡터만
    keywords_vector = candidate_embeddings[keywords_idx][:] 
    keywords = [words[idx] for idx in keywords_idx]
    
    return keywords, keywords_vector


In [91]:
# for idx, drama in enumerate(drama_embedding):
#     keywords, keywords_vector = mmr(drama.reshape(1,-1) ,candidate_embeddings,candidates,top_n,diversity)
#     print(titles[idx])
#     print(keywords[:10],'\n')

18어게인
['character heartwarming', 'relationship flaw', 'grandma killer', 'chemistry butterfly', 'eatmelonn', 'rain way', 'playbook hospital', 'pain guilt', 'people battle', 'crime love'] 

365:운명을거스르는1년
['charm actor', 'mom tae', 'disease zombie', 'fencing match', 'dramaaddictscorner com', 'sad bittersweet', 'story heartwarming', "jin's life", 'girlfriend love', 'cop killer'] 

60일지정생존자
['boring people', 'hyun dream', 'chemistry butterfly', 'brother murder', 'happiness zombie', "drama's success", 'romance conflict', 'game commentary', 'anxiety attack', 'kdramas ending'] 

D.P.
['pain actor', 'girl chemistry', 'happiness zombie', 'brother hyun', 'story heartwarming', 'cinematography transition', 'dramaaddictscorner com', 'killer mom', 'chicken restaurant', 'misunderstanding couple'] 

검색어를입력하세요WWW
['vibe couple', 'korea zombie', 'plot inconsistency', 'girl tae', 'chicken restaurant', 'dramaaddictscorner com', 'job dream', 'taste preference', 'actor soo', 'chemistry butterfl

### keyword Dataframe 생성

In [17]:
import pickle

# drama titles in csv
drama_titles = df.groupby('title',as_index=False).count().title

output = pd.DataFrame(None,columns=drama_titles)

top_n = 500
diversity = 0.6

for i,title in tqdm(enumerate(drama_titles)):
    reviews = df[df.title == title].reviews.to_list()

    # tfidf custom vectorizing
    candidates = tfidf_vectorizing(reviews) # reviews = list

    # embedding
    sentences = ','.join(reviews)
    doc_embedding, candidate_embeddings = embedding(sentences, candidates) 

    # keywords extraction

    keywords, keywords_vector = mmr(doc_embedding,candidate_embeddings,candidates,top_n,diversity)

    with open(f'{i}_{title}_.pickle', 'wb') as f:
        pickle.dump(keywords, f, pickle.HIGHEST_PROTOCOL)
        pickle.dump(keywords_vector, f, pickle.HIGHEST_PROTOCOL)

    output[title] = keywords[:7]
    print(title, keywords[:7])

output = output.T

1it [00:28, 28.84s/it]

18어게인 ['character heartwarming', 'relationship misunderstanding', 'prison playbook', 'story chemistry', 'eighteen tearjerker', 'pandemic', 'family sacrifice']


2it [00:51, 25.07s/it]

365:운명을거스르는1년 ['officer cuteness', 'tae sister', "foreshadowing pen's", 'killing trophy', 'chemistry thriller', 'quackery plot', 'actor actress']


3it [01:07, 21.18s/it]

60일지정생존자 ['awkward romance', 'survivor day', 'hee actor', 'lazy', 'people politics', 'production cinematography', 'antagonist']


4it [01:29, 21.24s/it]

D.P. ['fun pain', 'actress jeon', 'chemistry actor', 'bullying soldier', 'midnight runner', 'army deserter', 'spotify com']


5it [01:54, 22.59s/it]

검색어를입력하세요WWW ['romance irritating', 'sunbae life', 'jung actor', 'reservation honest', 'pianist girl', 'spotify com', 'story development']


6it [02:18, 23.12s/it]

무브투헤븐:나는유품정리사입니다 ["asperger's character", 'parent love', 'cage fighting', 'memory creepy', 'lee jehoon', 'girl butterfly', 'development story']


7it [02:31, 19.93s/it]

보좌관:세상을움직이는사람들 ['judgemental politician', 'romance story', 'corruption politics', 'kdramas', 'tae jun', 'coherent party', 'power struggle']


8it [03:39, 35.18s/it]

갯마을차차차 ['love actor', 'death grandma', 'thought rice', 'shopaholic donator', 'hypocrisy', 'july', 'wedding dress']


9it [04:36, 42.15s/it]

괴물 ['viewer suspense', "sister's body", 'soul mechanic', 'buzzkpop com', 'one fishing', 'supermarket owner', 'cop murder']


10it [04:54, 34.51s/it]

구경이 ['disgust rationalisation', 'gay energy', 'mouse chase', 'actress character', 'fun crime', 'cup tea', 'student killer']


11it [05:26, 33.74s/it]

그녀의사생활 ['love acting', 'birth nonsense', 'band night', 'police mother', 'gay baker', 'life romcom', 'remorse bullcrap']


12it [05:42, 28.32s/it]

그림자미녀 ['character creepy', 'eps day', 'relationship mom', 'surgery', 'sleepless', 'screenwriter', 'online life']


13it [06:01, 25.56s/it]

기름진멜로 ['woo romance', 'gangster cook', 'evil mother', 'writer incarnate', 'horse cancer', 'trust commenters', 'payback cutest']


14it [06:36, 28.47s/it]

김비서가왜그럴까 ['suspense story', 'min chemistry', 'trailer judge', 'healer game', 'brother girl', 'pollen allergy', 'lack plot']


15it [06:58, 26.55s/it]

나빌레라 ['happiness sadness', 'taemin', 'dancer lot', 'dad plot', 'manga butterfly', 'nature ups', 'experience grandma']


16it [07:38, 30.47s/it]

나의아저씨 ['love betrayal', 'grocery', 'drinking dialogue', 'job sun', 'girl hardship', 'people wiretapping', 'value people']


17it [07:56, 26.91s/it]

낭만닥터김사부 ['hospital rivalry', 'rerun', 'greedy', 'pacing lot', 'doldam crew', 'felt romance', 'mother']


18it [08:14, 23.98s/it]

내뒤에테리우스 ['mind romance', 'sandwich', 'actor spy', 'family comedy', 'nobody director', 'tae agent', 'mix thriller']


19it [08:27, 20.91s/it]

녹두꽃 ['story romance', 'brother hyun', 'rougthless politician', 'watching', 'teacher', 'thief', 'politician intensity']


20it [08:53, 22.33s/it]

단하나의사랑 ['weird angel', 'ballet bottom', 'character budget', "kim dan's", 'tragedy tragedy', 'overdone', 'aunt cousin']


21it [09:15, 22.29s/it]

대박부동산 ['pain agony', 'actress jang', 'business ghost', 'chemistry cast', 'rest plot', 'yugami blood', 'dramacool']


22it [10:07, 31.26s/it]

도시남녀의사랑법 ['lovestruck acting', 'girl month', 'relationship fear', 'domesticviolence ptsd', 'jaewon alcoholism', 'flashback beach', 'medicalnewstoday com']


23it [10:33, 29.52s/it]

동백꽃필무렵 ['love acting', 'son dam', 'plot lol', 'boyfriend wife', 'mind mom', 'bloom blend', 'life lesson']


24it [10:49, 25.65s/it]

라이브 ['character importance', 'chemistry wook', "'guys' dream", 'spotify com', 'pacing', 'mother alcoholic', 'soundtrack romance']


25it [11:09, 23.76s/it]

라이프온마스 ['thriller suspense', 'brother min', 'victim manicure', 'hospital', 'love jung', 'prison playbook', 'dream hallucination']


26it [11:31, 23.32s/it]

라켓소년단 ['badminton fun', 'scriptwriter', 'family dynamic', "i'm rewatch", 'share indonesia', 'student baseball', 'lover korean']




런온 ['romance boring', 'development chemistry', 'filter people', 'reality film', 'prison playbook', 'pacing storytelling', 'politics family']


28it [13:01, 32.62s/it]

로맨스는별책부록 ['romance heartwarming', 'shame dan', 'bookworm book', 'chemistry brother', 'break motherhood', 'eun character', 'villain refreshing']


29it [13:30, 31.46s/it]

로스쿨 ['fun law', 'sister people', 'doctor exam', 'mastermind murder', 'chill goosebump', 'script storyline', "drama's ost"]


30it [13:46, 26.77s/it]

마녀식당으로오세요 ['witch wish', 'abuse tension', 'hyun actor', 'food vibe', 'life lesson', 'chemistry character', 'plot gist']


31it [14:59, 40.74s/it]

마우스 ['killer coincidence', 'surgery writer', 'girl family', 'lack empathy', 'prey vlogger', 'train thought', 'character chemistry']


32it [15:17, 33.97s/it]

멜로가체질 ["jung's brother", 'idiocy heartfelt', 'character problem', 'girl friendship', 'drunk', 'comedy challenge', 'boyfriend restaurant']


33it [15:51, 33.89s/it]

모범택시 ['love taxi', 'heist', 'story vigilante', 'hacker girl', 'stunt interview', 'revenge romance', 'superhero']


34it [16:32, 35.90s/it]

미스터션샤인 ['happiness tragedy', 'tae actress', 'butcher', 'goosebump', 'screenwriter couple', 'turbulent history', 'experience cinematography']


35it [16:51, 30.94s/it]

미스티 ['cast superb', 'abortion', "wook's father", 'business law', 'misty evil', 'people dream', 'studio wife']


36it [17:08, 26.82s/it]

미치지않고서야 ['favorite plot', 'negativity workforce', 'politics story', 'choi miss', 'applause job', 'insanity challenge', 'dinner']


37it [17:32, 26.04s/it]

백일의낭군님 ['love acting', 'shopping king', 'king louie', 'day burn', "seo's brother", 'music rewatch', 'wonder emptiness']


38it [17:49, 23.14s/it]

본대로말하라 ['wit villain', 'teen college', 'fight police', 'herring', 'manga murder', 'couple revelation', 'moonoverstar']


39it [18:20, 25.52s/it]

부부의세계 ['cheat wife', 'audacity taeoh', 'train wreck', 'calm storm', 'prison playbook', 'dad lot', "shy drama's"]


40it [18:40, 23.98s/it]

뷰티인사이드 ['power actress', 'dream priest', 'com dramafoxblog', 'plot twist', 'intense boring', 'love chemistry', 'rewatch']


41it [19:41, 35.08s/it]

비밀의숲 ['stranger misogyny', 'television teamwork', 'repeat essay', 'perfect policewoman', "simok yeojin's", 'prison playbook', 'ordeal evidence']


42it [20:55, 46.53s/it]

사내맞선 ['cringe acting', 'ceo girl', 'restaurant kiss', 'grandfather blind', 'zombie crime', 'scissors game', 'breakup trope']


43it [22:03, 53.09s/it]

사랑의불시착 ['love cast', 'accident korea', 'wife antagonist', 'philosopher tomato', 'businessinsider tvns', 'yawn', 'life lesson']


44it [22:18, 41.58s/it]

사의찬미 ['tragedy cheating', 'bgm album', 'typewriter', 'lack chemistry', 'rewatch', 'script love', 'pressure society']


45it [23:43, 54.72s/it]

사이코지만괜찮아 ['character heartache', 'kim moon', 'love chemistry', 'prison playbook', "ji's acting", 'photo shoot', 'butterfly killer']


46it [24:07, 45.36s/it]

서른아홉 ['friend actress', 'execution lot', "trailer netflix's", 'chemistry son', 'heartache', 'jeon haircut', 'thirty subtitleshttps']


47it [24:27, 38.04s/it]

서른이지만열일곱입니다 ['character charm', 'cry laugh', 'development story', 'bus accident', 'chemistry music', 'cinderella knight', 'hae sun']


48it [25:44, 49.67s/it]

세빛남고학생회 ['shyness insecurity', 'interview endgame', 'taedaon taeshinwoo', 'chemistry butterfly', 'feeling home', 'fond soundtrack', 'destiny']


49it [26:02, 40.02s/it]

소년심판 ['acting lot', 'end rape', 'lesson backstories', 'kim hyesoo', 'ruling juvenile', 'binge day', 'fan law']


50it [26:26, 35.34s/it]

손theguest ['supernatural', 'character development', 'bos fight', 'actor lot', 'yang finale', 'chemistry love', 'filming fear']


51it [26:40, 29.01s/it]

술꾼도시여자들 ['girl friendship', 'heist', 'lot drinking', 'kdramas', 'comedy amount', 'slice life', 'sickness health']


52it [28:09, 46.81s/it]

스물다섯스물하나 ['relationship cringe', 'fullhouse manhwa', "heedo's daughter", 'endgame writer', 'color theory', 'evolution character', 'problem goblin']


53it [28:33, 40.15s/it]

스카이캐슬 ['character manipulative', 'nature parent', 'sky university', 'money flower', 'homeroom', 'badass bunny', 'romance romance']


54it [28:51, 33.23s/it]

스토브리그 ['baseball problem', 'passion leadership', 'prison playbook', 'imdb', 'script draggy', 'cinematography shot', 'league viewer']


55it [29:56, 42.76s/it]

슬기로운의사생활 ['love writer', 'marathoner', 'scare ikjun', 'spam bot', 'kingdom lockdown', 'dinner mother', 'people plot']


56it [30:12, 34.89s/it]

시를잊은그대에게 ['lot comedy', 'nurse trainee', 'heart sore', 'woo alligator', 'income dream', 'kim dae', 'relationship romance']


57it [30:53, 36.77s/it]

시맨틱에러 ['cuteness overload', "director's cut", 'jaeyoung restaurant', 'manhwa novel', 'chemistry butterfly', 'fight minute', 'stalking']


58it [31:14, 31.87s/it]

아는와이프 ['heart cuteness', 'husband divorce', 'management fault', 'actor korea', "'destiny'", 'lot cast', 'fraud money']


59it [32:21, 42.46s/it]

악의꽃 ['suspense thrill', "'mother' flower", 'lawless lawyer', 'hospital accident', 'trailer mind', 'chemistry actor', 'cake acting']


60it [32:41, 35.88s/it]

악의마음을읽는자들 ['korean mindhunter', 'day crime', 'lullaby', 'handsome actor', 'nonfiction', 'others mind', 'procedure profiling']


61it [32:59, 30.39s/it]

안녕나야 ['fun character', 'truck doom', 'wedding day', "drama's", 'actor chemistry', 'development lukewarm', 'story romance']


62it [34:05, 41.12s/it]

어느날우리집현관으로멸망이들어왔다 ['disappoint story', 'actress chemistry', 'plot goblin', 'soul actor', 'family watching', 'angel death', 'wrenching romance']


63it [34:59, 44.87s/it]

어쩌다발견한하루 ['romance acting', 'mind day', 'webtoon bittersweet', 'others manhwa', 'brother fight', 'stage lot', 'twist dan']


64it [35:31, 41.05s/it]

오월의청춘 ['love actor', 'tae girl', 'hymn death', 'storytelling', 'selfish', 'student law', 'month romance']


65it [36:47, 51.65s/it]

오징어게임 ['character favorite', 'runner hunger', 'capitalism illusion', 'game anxiety', 'film parasite', 'twist finale', 'senseless violence']


66it [37:58, 57.30s/it]

옷소매붉은끝동 ['politics romance', "i'm rewatch", 'sad actor', 'palace ritual', 'story joseon', 'power revenge', 'machine translation']


67it [38:13, 44.63s/it]

왓쳐 ['suspense plot', 'kdramas', 'police corruption', 'acting class', 'skill', 'boyfriend', 'league watcher']


68it [38:32, 37.06s/it]

왕이된남자 ['romance lot', 'saeguk king', 'assassin', 'genius', 'story acting', 'paranoid king', 'barren seon']


69it [38:46, 30.07s/it]

우리들의블루스 ['cast vibe', 'daughter golf', 'bullying', 'romance couple', 'camellia bloom', 'director writer', 'jung eun']


70it [39:01, 25.56s/it]

우수무당가두심 ['fantasy', 'kim sae', 'teenager master', 'chemistry character', "i'm eps", 'shamanism', 'vampire slayer']


71it [39:15, 22.24s/it]

우아한친구들 ['jealousy', 'tae hwan', 'owl thanks', 'shower', "mom's death", 'dramaboy', 'revenge jung']


72it [39:32, 20.48s/it]

원더우먼 ['romance thriller', 'snippet', 'daughter arsonist', 'surgery face', 'intelligence', 'shitty law', 'everytime']


73it [39:52, 20.38s/it]

월간집 ['jung favourite', 'cast ending', 'struggle lesson', 'cutie pie', 'writer twist', 'rewatch', 'housing crisis']


74it [40:21, 22.94s/it]

유미의세포들 ['romance lot', 'day cell', 'devil judge', "yumi's life", 'hyun acting', 'worry hunger', "characters' thought"]


75it [40:41, 21.98s/it]

으라차차와이키키 ['love comedy', 'rewatch', 'fortune mother', 'tiring development', 'lot bakery', 'drunk', 'plot twist']


76it [41:01, 21.61s/it]

의사요한 ['love doctor', 'bait viewer', "sung's acting", 'badass bunny', 'death pain', 'writer problem', 'vampire detective']


77it [41:22, 21.20s/it]

이구역의미친X ['rewatch', 'comedy illness', 'romance pacing', 'management paranoia', 'pickpocket', 'girl dog', 'covenience store']


78it [41:46, 22.05s/it]

이리와안아줘 ['killer brother', 'chatterbox girl', 'preference melodrama', 'home dog', 'parent lack', 'moon lover', 'tension suspense']


79it [42:14, 23.80s/it]

인간수업 ['thriller twist', 'girl parent', 'class acting', 'eat dog', 'bullying sex', 'thought day', 'lawless lawyer']


80it [42:28, 20.91s/it]

자백 ['perfection actress', 'war son', 'plot twist', 'flower evil', "ho's chemistry", 'hyun heart', 'rest cast']


81it [42:41, 18.76s/it]

작은신의아이들 ['mystery plot', 'byul popeye', 'dramamilk com', 'dyingtobeinkorea', 'lot people', 'grandmother', 'rest']


82it [43:26, 26.60s/it]

지옥 ['horror demon', "jungmin's", 'expectation brainwash', 'politics abortion', 'father destiny', 'magic', 'nature event']


83it [43:49, 25.57s/it]

카이로스 ['romance vibe', 'tantrum home', 'kairos plot', 'logic', 'effect thriller', 'staff cast', 'wife bitch']


84it [44:21, 27.26s/it]

킹덤 ['palace intrigue', 'dyingtobeinkorea zombie', 'actress bae', 'physician father', 'day break', 'prison playbook', 'cinematography acting']


85it [44:34, 23.17s/it]

태종이방원 ['animal cruelty', 'entertainment http', 'death vain', "i'm dragon", 'politics', 'stunt injury', 'badass bunny']


86it [44:47, 20.08s/it]

트랩 ['thriller', 'medicine', 'sung', 'ceo hong', 'psychopath', 'taste', 'trap']


87it [45:01, 18.11s/it]

트레이서 ['villain backstories', 'tax office', 'people tracer', 'mbc', 'cliche politician', 'eps', 'luck']


88it [45:28, 20.93s/it]

하늘에서내리는일억개의별 ['thriller mystery', 'one kimura', 'literature theater', 'pasta meat', 'fan ending', 'incest korean', 'jin forgiving']


89it [45:46, 20.04s/it]

하이에나 ['romance lot', 'hyena giriboy', 'jerk career', 'actress kim', 'character morality', 'badass chemistry', 'drunk driving']


90it [46:04, 19.46s/it]

한번다녀왔습니다 ['fun family', 'yongju market', 'miscarriage fault', 'character chemistry', 'acting actor', 'hee writer', 'job mom']




해피니스 ['happiness thriller', 'mother food', 'doctor yeon', 'reaction pandemic', 'theyre zombie', 'detective baseball', 'cure blood']


92it [47:45, 31.14s/it]

호텔델루나 ['love acting', 'breakfast', 'kdramas ghost', 'hotel eternity', 'chemistry pain', 'thought story', 'sister sister']





In [23]:
output=output.T

In [25]:
output.to_csv('0506_ENG_Keywords.csv')

In [None]:
# with open('title.pickle', 'rb') as f:
#     keywords = pickle.load(f)
#     keywords_vector = pickle.load(f)

# KMEANS - 작품별 연관키워드 뽑을 때 사용

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

X = keywords_vector

true_k = 6
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X.T)

print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = keywords # = candidates
for i in range(true_k):
    print("["),
    for ind in order_centroids[i, :20]:
        print(f'"{terms[ind]}"', end=",")
    print("],\n")

In [None]:
x = list(range(1,11))
y = []
for i in x:
    model = KMeans(n_clusters=i, init='k-means++', max_iter=100, n_init=1)
    model.fit(X.T)
    y.append(model.inertia_)

import matplotlib.pyplot as plt

plt.plot(x,y,label='linear')
plt.show();