In [26]:
import pandas as pd
import numpy as np

df = pd.read_csv("0505_ENG_Korean.csv")

In [27]:
drama_titles = df.groupby('title',as_index=False).count().title
drama_titles

0                          18어게인
1                  365:운명을거스르는1년
2                       60일지정생존자
3                           D.P.
4     검색어를입력하세요WWW
                 ...            
87                  하늘에서내리는일억개의별
88                          하이에나
89                      한번다녀왔습니다
90                          해피니스
91                         호텔델루나
Name: title, Length: 92, dtype: object

In [28]:
reviews = df.reviews.to_list()
sentences = ",".join(reviews)
reviews_by_drama = []
for title in titles:
    rev_by_drama = df[df.title == title].reviews
    reviews_by_drama.append(','.join(rev_by_drama))

In [29]:
len(reviews)

27699

# 필요 함수들

In [None]:
# pip install contextualized_topic_models

In [None]:
# pip install tensorflow

In [33]:
import re
import pandas as pd
import numpy as np
import string
from tensorflow.keras.preprocessing.text import text_to_word_sequence
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
# from contextualized_topic_models.models.ctm import CombinedTM
# from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation, bert_embeddings_from_list
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
# from konlpy.tag import Okt
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [None]:
from nltk.corpus import stopwords
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
stop_words = set(stopwords.words('english'))
stop_words.update(('\r\n', '\n\r', 'ever', 'much', 'look', 'squid', 'show', 'thing', "i've", 'anything', 'something', "show's",
                   'www', 'soompi' 'com', 'instagram', 'youtube', 'https', 'mydramalist', 'twitter', 'episode', 'comment', 'scene',
                   'version', "he's", 'gonna', 'series', 'watch', 'everything', 'something', "can't", 'list', 'dramas', 'drama',
                   'wait', 'preview', 'someone', 'everyone', 'dont', 'think', 'season', 'anyone', 'something', 'anything', 'nothing', 'world',
                   'status', 'week', 'name', 'cause', 'time', 'en', 'org', 'wikipedia', 'wiki', 'pbs', 'twimg', 'year', 'point', 'please', 'today',
                   'haha', 'case', 'guess', 'reason', 'person', 'moment', 'sense', 'kinda', 'part', 'movie', 'school', 'start', 'work', 'lead', 'kind',
                   'rate', 'rating', 'rate', 'men', 'example', 'idea', 'half', 'review', 'genre', 'side', "that's", "they're", 'till', 'tell', 'phone',
                   'section', 'number', 'company', 'line', "there's", 'male', 'team', 'rating', 'baby', 'course', 'care', 'cute', 'question', 'help', 'group',
                   'hand', 'spoiler', 'hate', 'need', 'mess', 'change', 'drop', 'date', 'netflix', 'yeah', 'daon', 'park', 'thank', 'lmao', 'damn', "i'll",
                   'kang', 'shinwoo', 'taekyung', 'mean', 'woman', 'hope', 'read', 'fact', 'opinion', 'stuff', 'feel', 'kdrama', 'talk', 'song', 'hype',
                   'title', 'type'))

# 사용 함수

In [35]:
def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('V'):
        return 'v'
    elif pos_tag.startswith('N'):
        return 'n'
    elif pos_tag.startswith('J'):
        return 'a'
    elif pos_tag.startswith('R'):
        return 'r'
    else:
        return None

# CustomTokenizer class setting

In [36]:
class CustomTokenizer:
    def __init__(self, tagger):
        self.tagger = tagger
    def __call__(self, a):
        # a = ' '.join(a)
        word_tokens = self.tagger(a)
        
        words = []
        for i in word_tokens:
            text = re.sub('[^a-zA-Z0-9\']','',i).strip()
            text = re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`…》]','', text)
            if(text != ''):
                words.append(text)
        
        tag_words = nltk.pos_tag(words)
        pos_words = [word for word in tag_words if word[1][0] in {'N'}] #'V','N','J','R'

        temp_list = []
        for token, pos_tag in pos_words:
            tag = get_wordnet_pos(pos_tag)
            if tag != None:
                temp_list.append((token, get_wordnet_pos(pos_tag)))
        lemma = WordNetLemmatizer()
        token_final = [lemma.lemmatize(token, pos=tag) for token, tag in temp_list]
        long_words = [i for i in token_final if len(i) > 2]
        results = [w for w in long_words if w not in stop_words]
        
        return results

# 작업

In [37]:
def tfidf_vectorizing(reviews):
    ngram_range = (1,2)

    custom_tokenizer = CustomTokenizer(text_to_word_sequence)

    tfidf = TfidfVectorizer(tokenizer=custom_tokenizer,ngram_range = ngram_range,\
        stop_words = stop_words, max_df=10, min_df=2 ,max_features=5000).fit(reviews)
    candidates = tfidf.get_feature_names()
    
    return candidates

# 임베딩

In [38]:
def embedding(sentences,candidates):
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

    doc_embedding = model.encode([sentences])
    candidate_embeddings = model.encode(candidates)
    # drama_embedding = model.encode(reviews_by_drama)

    return doc_embedding, candidate_embeddings

# 코사인 유사도 함수

In [39]:
from sklearn.metrics.pairwise import cosine_similarity
top_n = 500
diversity = 0.7

def mmr(doc_embedding, candidate_embeddings, words, top_n, diversity):
    word_doc_distances = cosine_similarity(candidate_embeddings,doc_embedding)
    word_distances = cosine_similarity(candidate_embeddings) 

    keywords_idx = [np.argmax(word_doc_distances)]
    candidates_idx = [i for i in range(len(candidates)) if i != keywords_idx[0]]
    
    for _ in range(top_n-1):
        try:
        # 후보 키워드들의 문서유사도 값
            candidate_similarities = word_doc_distances[candidates_idx, :]
            # 후보 키워드와 가장 유사한 키워드
            target_similarities = np.max(word_distances[candidates_idx][:,keywords_idx], axis=1) 

            mmr = (1-diversity) * candidate_similarities - diversity * target_similarities.reshape(-1,1)
            mmr_idx = candidates_idx[np.argmax(mmr)]

            keywords_idx.append(mmr_idx)
            candidates_idx.remove(mmr_idx)
        except:
            break

    # 본 단어 임베딩벡터에서 추출키워드에 해당하는 벡터만
    keywords_vector = candidate_embeddings[keywords_idx][:] 
    keywords = [words[idx] for idx in keywords_idx]
    
    return keywords, keywords_vector


In [31]:
# for idx, drama in enumerate(drama_embedding):
#     keywords, keywords_vector = mmr(drama.reshape(1,-1) ,candidate_embeddings,candidates,top_n,diversity)
#     print(titles[idx])
#     print(keywords[:10],'\n')

### keyword Dataframe 생성

In [40]:
import pickle

# drama titles in csv
drama_titles = df.groupby('title',as_index=False).count().title

output = pd.DataFrame(None,columns=drama_titles)

top_n = 500
diversity = 0.6

for i,title in tqdm(enumerate(drama_titles)):
    reviews = df[df.title == title].reviews.to_list()

    # tfidf custom vectorizing
    candidates = tfidf_vectorizing(reviews) # reviews = list

    # embedding
    sentences = ','.join(reviews)
    doc_embedding, candidate_embeddings = embedding(sentences, candidates) 

    # keywords extraction

    keywords, keywords_vector = mmr(doc_embedding,candidate_embeddings,candidates,top_n,diversity)

    with open(f'drama_{i}.pickle', 'wb') as f:
        pickle.dump(keywords, f, pickle.HIGHEST_PROTOCOL)
        pickle.dump(keywords_vector, f, pickle.HIGHEST_PROTOCOL)

    output[title] = keywords[:7]
    print(title, keywords[:7])

output.to_csv('0506_theme_Keywords.csv')

1it [00:16, 16.81s/it]

18어게인 ['pain stunning', 'hyun actor', 'society mother', 'taste plot', 'husband lot', 'child dream', "episode's epilogue"]


2it [00:30, 15.00s/it]

365:운명을거스르는1년 ['character suspense', 'hyun joon', 'man chemistry', 'sweet loophole', 'universe travel', 'romance day', 'soundtrack']


3it [00:44, 14.62s/it]

60일지정생존자 ['awkward romance', 'survivor day', 'thug threat', 'ost cinematography', 'hee president', 'sigh popularity', 'fonds politics']


4it [01:00, 15.25s/it]

D.P. ['korea admire', 'abuse bullying', 'cinemaescapist com', 'prison playbook', 'husband', 'hae rain', 'character chemistry']


5it [01:17, 15.70s/it]

검색어를입력하세요WWW ['power romance', 'job sunbae', 'yall hating', 'mistake life', 'kdramas female', 'story business', 'jeon hye']


6it [01:32, 15.53s/it]

무브투헤븐:나는유품정리사입니다 ["asperger's character", 'love story', 'cage fighting', 'character development', 'jin hee', 'death lot', 'sang uncle']


7it [01:45, 14.68s/it]

보좌관:세상을움직이는사람들 ['judgemental politician', 'relationship', 'corruption politics', 'tae', 'coherent party', 'actor', 'enemy']


8it [02:22, 21.63s/it]

갯마을차차차 ['romance lot', 'dentist life', 'power outage', 'spotify com', 'people demon', 'character introspection', "yumi's"]


9it [02:45, 22.05s/it]

괴물 ['creepy smile', 'han actor', 'soup', 'end cast', 'detective sister', 'deer farm', 'percussion night']


10it [02:58, 19.30s/it]

구경이 ['excitement mystery', 'female perpetrator', 'kdramas', 'soundtrack context', 'rest cast', 'backstory', 'cat mouse']


11it [03:15, 18.77s/it]

그녀의사생활 ['favorite story', "jae wook's", 'soundtrack rewatch', 'mother accident', 'shower', 'character development', 'history kdramas']


12it [03:28, 16.87s/it]

그림자미녀 ['character story', 'victory guy', 'lamb yang', 'darker beauty', 'online friend', 'chemistry', 'surgery']


13it [03:41, 15.68s/it]

기름진멜로 ['romance food', 'jung ryeo', 'depth', 'writer lot', 'chemistry poong', 'jealousy', 'sweetness hour']


14it [03:58, 16.23s/it]

김비서가왜그럴까 ['chemistry insane', 'trailer judge', 'surgery actress', 'mix cheer', 'couple backstory', 'pacing', 'cup tea']


15it [04:13, 15.72s/it]

나빌레라 ['poetry compassionate', 'cinematography misstep', 'wife deok', 'amount practice', 'people dream', 'history lot', 'actor hwan']


16it [04:38, 18.59s/it]

나의아저씨 ['relationship feeling', 'poverty office', 'intelligent', 'pain life', 'politics way', 'mom grandma', 'acting rest']


17it [04:52, 17.31s/it]

낭만닥터김사부 ['doctor patient', 'bit romance', 'people pacing', 'jung bum', 'character problem', 'bone', 'burn']


18it [05:05, 16.04s/it]

내뒤에테리우스 ['charm', 'hiatus kdramas', 'action comedy', 'jeju island', 'parent', 'air spy', 'soundtrack']


19it [05:19, 15.25s/it]

녹두꽃 ['favorite', 'donghak war', 'people battle', 'character plot', 'forestella lyric', 'sympathy', 'revolution country']


20it [05:35, 15.62s/it]

단하나의사랑 ['favorite plot', 'kim dan', 'angel pray', 'diety', 'actress casting', 'snippet ballet', 'scream']


21it [05:51, 15.75s/it]

대박부동산 ['glad plot', 'exorcist ghost', 'daebak realty', 'restaurant', 'fantasy television', 'chemistry cast', 'antagonist']


22it [06:11, 17.03s/it]

도시남녀의사랑법 ['love jae', 'documentary life', 'acting cast', 'awareness dysfunction', 'jaewon kdramas', 'healer play', 'chemistry actor']


23it [06:27, 16.59s/it]

동백꽃필무렵 ['outcast dongbaek', 'love love', 'murder suspense', 'struggle mother', 'drunk', 'son dam', 'baseball star']


24it [06:41, 15.90s/it]

라이브 ['character importance', 'wife jang', 'roller coaster', 'soo actor', 'bit romance', 'pacing', 'police corruption']


25it [06:56, 15.60s/it]

라이프온마스 ['theory dream', 'manicure murder', 'water officer', 'han tae', 'rewatch', 'winner colleague', 'gang']


26it [07:11, 15.52s/it]

라켓소년단 ['lot fan', 'boy badminton', 'screenwriter', 'yoon dam', 'august prisonplaybook', 'kdramas romance', 'comedy taste']


27it [07:41, 19.78s/it]

런온 ['romance feeling', 'misaeng kdramas', 'filter people', 'dad pain', 'mother sister', 'athleticism writer', 'film plotless']


28it [08:06, 21.23s/it]

로맨스는별책부록 ['romance corny', 'job publishing', 'dan eun', 'mother divorcee', 'homeless', 'actor lot', 'burn']


29it [08:23, 20.05s/it]

로스쿨 ['law jargon', 'pretending dan', 'sister', 'nature perfect', 'goosebump', 'murder vibe', 'mind blowing']


30it [08:36, 17.96s/it]

마녀식당으로오세요 ['magic', 'customer wish', 'hiyo cook', 'bullying situation', 'pacing', 'taiwan myvideo', 'assistant storyline']


31it [09:02, 20.36s/it]

마우스 ['girl mouse', 'psychopath remorse', 'pairing grandmother', 'transplant theory', 'acting cast', 'murder korea', 'tantrum']


32it [09:16, 18.47s/it]

멜로가체질 ['dialogue humor', 'shampoo ost', 'life problem', 'actress', 'scent flower', 'cinematography', 'genius']


33it [09:35, 18.51s/it]

모범택시 ['taxi revenge', 'versatility actor', 'bullying cast', 'rainbow squad', 'vigilante law', "could've chemistry", 'shark lady']


34it [09:59, 20.27s/it]

미스터션샤인 ['happiness tragedy', 'tae actress', 'butcher', 'turbulent history', 'history cinematography', 'character writer', 'boring chemistry']


35it [10:13, 18.22s/it]

미스티 ['wook love', 'news anchor', 'murder thriller', 'studio bouquet', 'actress baeksang', 'character lot', 'wife man']


36it [10:25, 16.57s/it]

미치지않고서야 ['insanity', 'director lesson', 'acting seo', 'ladylord', 'love character', 'office cast', 'kdramas']


37it [10:41, 16.26s/it]

백일의낭군님 ['story joy', 'assassin brother', 'motivation shopping', 'exo kpop', 'viewer drought', 'music rewatch', 'character chemistry']


38it [10:54, 15.23s/it]

본대로말하라 ['fan crime', 'jang hyuk', 'method gun', 'actress', 'pacing', 'strength wit', 'doctor']


39it [11:08, 15.06s/it]

부부의세계 ['suspense', 'wife sun', 'therapy', 'sex violence', 'eps', 'money father', 'parent revenge']


40it [11:23, 15.01s/it]

뷰티인사이드 ['power actress', 'dream priest', 'com dramafoxblog', 'plot twist', 'love chemistry', 'suffers prosopagnosia', 'focus chracters']


41it [12:16, 26.46s/it]

비밀의숲 ['stranger misogyny', 'television teamwork', 'repeat essay', 'perfect policewoman', "simok yeojin's", 'prison playbook', 'ordeal evidence']


42it [12:45, 27.11s/it]

사내맞선 ['cringe acting', 'ceo girl', 'love chemistry', 'slice life', 'brother manhwa', 'family chicken', 'manga night']


43it [13:32, 33.20s/it]

사랑의불시착 ['love cast', 'korean amnesia', 'story businesswoman', 'philosopher tomato', 'heartbreak train', 'legend sea', 'music rewatch']


44it [13:46, 27.16s/it]

사의찬미 ['star lover', 'deok woo', 'pain', 'literature student', 'melodrama', 'story heart', 'cheating']


45it [14:42, 35.81s/it]

사이코지만괜찮아 ['jung favourite', 'story cry', 'restaurant food', 'identity besides', 'mother overthought', 'lot acting', 'journey healing']


46it [14:55, 29.18s/it]

서른아홉 ['character lot', 'noona romance', 'chanyoung cancer', 'life death', 'onlykdrama com', 'boyfriend', 'hangover']


47it [15:10, 24.76s/it]

서른이지만열일곱입니다 ['character charm', 'cry', 'soundtrack ost', 'dream', 'pacing story', 'romcoms twist', 'heartwarming end']


48it [15:36, 25.10s/it]

세빛남고학생회 ['woo crush', 'chef', "watching afraid'", 'dildo', 'plot development', 'desk light', 'tie family']


49it [15:49, 21.71s/it]

소년심판 ['storytelling cinematography', 'kim hyesoo', 'juvenile delinquency', 'judge max', 'society lot', 'punishment belief', 'pacing']


50it [16:04, 19.68s/it]

손theguest ['bloodcurdling demon', 'cinematography quality', 'seo yoon', 'character plot', 'creepy', 'romance chemistry', 'lacklustre acting']


51it [16:18, 17.73s/it]

술꾼도시여자들 ['drinking culture', 'wife plenty', 'slice life', 'comedy', 'yoga instructor', 'eun husband', 'witness loving']


52it [16:57, 24.11s/it]

스물다섯스물하나 ['dream relationship', 'chemistry taeri', 'rainstorm tumblr', 'hindsight', 'heedo dad', 'twenty eps', 'fencing rivalry']


53it [17:12, 21.65s/it]

스카이캐슬 ['parent lot', 'university korea', 'masterpiece acting', 'anxiety', 'cup ramen', 'prison playbook', 'people plot']


54it [17:26, 19.25s/it]

스토브리그 ["drama' depth", 'day baseball', 'sport anime', 'business', 'novice pity', 'politics baseball', 'prison playbook']


55it [17:49, 20.35s/it]

슬기로운의사생활 ['mind story', 'crush songhwa', 'editing mediocre', 'love alarm', 'mother others', 'cast chemistry', 'lot food']


56it [18:02, 18.22s/it]

시를잊은그대에게 ['character problem', 'life kdramas', 'couple love', 'korean', 'therapist', 'twist', 'cast']


57it [18:25, 19.63s/it]

시맨틱에러 ['cuteness overload', 'ghost doctor', 'chemistry butterfly', 'unassuming plot', 'sangwoo trash', 'adaption manhwa', 'jaeyoung match']


58it [18:38, 17.63s/it]

아는와이프 ['character mistake', 'wife seo', 'chemistry cast', 'perfection', 'college boy', 'people choice', 'actor korea']


59it [19:03, 19.95s/it]

악의꽃 ['mind thriller', 'village mother', 'cake acting', 'couple chemistry', 'scriptwriter', 'war arrow', 'evil flower']


60it [19:18, 18.35s/it]

악의마음을읽는자들 ["korea's profiler", 'suspense crime', 'cup tea', 'adaptation book', 'killer chun', 'psychology criminal', 'whodunnits']


61it [19:31, 16.83s/it]

안녕나야 ['hyun lot', 'friend family', 'growth chef', 'dream', 'bullying', 'thriller', 'actress']


62it [19:59, 19.99s/it]

어느날우리집현관으로멸망이들어왔다 ['romance doom', 'chemistry skill', 'comeback lot', 'criticism people', 'story magic', 'love human', 'burn enemy']


63it [20:20, 20.26s/it]

어쩌다발견한하루 ['plot favorite', 'stage kyung', 'others manhwa', 'rewatch', 'dan couple', 'manga awareness', 'actress lot']


64it [20:37, 19.30s/it]

오월의청춘 ['love hyun', 'uprising pain', 'memoir', 'actor skill', 'student law', 'noona', 'lyric']


65it [21:33, 30.34s/it]

오징어게임 ["'squid game'", 'quality script', 'kpop idol', 'quentin tarantino', 'brother nitpicks', 'snowpiercer parasite', 'thug greed']


66it [22:26, 37.30s/it]

옷소매붉은끝동 ['intrigue romance', 'kdramas king', 'history lesson', 'scriptwriter jung', "i'm rewatch", 'death heart', 'power life']


67it [22:39, 29.90s/it]

왓쳐 ['thriller fan', 'seo joon', 'disappoint', 'cast', 'attack', 'story tragedy', 'ceo man']


68it [22:52, 24.89s/it]

왕이된남자 ['politics', 'strength weakness', 'kdramas romance', 'production storyline', 'taste', 'death king', 'actress']


69it [23:05, 21.18s/it]

우리들의블루스 ['people problem', 'jeju island', 'actor slice', 'love story', 'chemistry', 'vibe', 'twist']


70it [23:19, 19.06s/it]

우수무당가두심 ['fantasy', 'kim bok', 'student teacher', 'eps', 'shaman', 'binge', 'ghost hyun']


71it [24:00, 25.79s/it]

우아한친구들 ['boring', 'sky castle', 'people slice', 'kim hee', 'writer', 'murder mystery', 'wife']


72it [24:27, 26.09s/it]

원더우먼 ['opera plot', 'boring', 'sister law', 'soo jung', 'serial thingy', 'surgery', 'politics']


73it [24:44, 23.21s/it]

월간집 ['jung favourite', 'car taxi', 'plot twist', "witch's restaurant", 'box enemy', 'rest cast', 'soundtrack']


74it [25:00, 21.20s/it]

유미의세포들 ['story romance', 'day actor', "animation yumi's", 'worry hunger', 'goblin', 'flaw people', 'star cell']


75it [25:14, 18.98s/it]

으라차차와이키키 ['humor romance', 'production', 'zombie', 'rewatch', 'medicine', 'actress', 'crazy']


76it [25:29, 17.67s/it]

의사요한 ['love doctor', 'punch', 'badass bunny', 'sung character', 'pain death', 'actor lot', 'politics patient']


77it [25:43, 16.76s/it]

이구역의미친X ['storyline fun', 'therapist', 'watcher', 'snippet', 'screenwriter', 'enemy lover', 'meal']


78it [25:58, 16.04s/it]

이리와안아줘 ['killer father', 'character chemistry', 'rest', 'actress', 'police', 'suspense', 'hyun moo']


79it [26:15, 16.40s/it]

인간수업 ['plot twist', 'eat dog', 'hyun actress', 'business security', 'fast cinematography', 'pressure parent', 'fight brawl']


80it [26:28, 15.48s/it]

자백 ['perfection actress', 'chemistry father', 'rewatch', 'story cast', 'hyun', 'corruption', 'eps']


81it [26:41, 14.69s/it]

작은신의아이들 ['ghost', 'dan', 'twist', 'romance', 'prosecutor', 'fear', 'others']


82it [27:01, 16.17s/it]

지옥 ['hellbound thriller', 'psychology', 'ninja', 'cult people', 'development pacing', 'throat safety', 'acting script']


83it [27:14, 15.23s/it]

카이로스 ['thriller romance', 'food', 'character plan', 'hole perfectly', 'kim seo', 'goblin', 'idol actor']


84it [27:32, 16.10s/it]

킹덤 ['history joseon', 'addition thriller', 'fan actress', 'zombie train', 'court intrigue', 'nature morality', 'cgi makeup']


85it [27:44, 15.06s/it]

태종이방원 ['animal abuse', 'joseon', 'dragon', 'death', 'celebrity', 'news horse', 'cgi']


86it [27:57, 14.46s/it]

트랩 ['thriller', 'fan', 'trap', 'lee', 'character', 'plot', 'lot']


87it [28:11, 14.34s/it]

트레이서 ['cliche politician', 'streaming', 'dramacool', 'tracer', 'pirate google', 'korean', 'plot']


88it [28:28, 14.85s/it]

하늘에서내리는일억개의별 ['thriller mystery', "kdrama's", 'incest korean', 'jin forgiving', 'brother acting', 'antihero light', 'hoshi story']


89it [28:45, 15.51s/it]

하이에나 ['romance badass', 'hyena giriboy', 'reality yoon', 'lot criminal', 'chemistry chart', 'morality', 'comedy character']


90it [29:21, 21.71s/it]

한번다녀왔습니다 ['favourite family', 'noona', 'lee jung', 'acting cast', 'rollercoaster', 'sin', 'kid story']


91it [29:41, 21.17s/it]

해피니스 ['happiness zombie', 'problem couple', 'gym trainer', 'cure vaccine', 'character wise', 'chemistry villain', 'selfishness']


92it [30:13, 19.71s/it]

호텔델루나 ['love acting', 'lamb', 'wish hotel', 'luna fighting', 'story development', "jang's snark", 'alldramaaddict']





In [41]:
output.to_csv('0506_theme_Keywords.csv')

In [None]:
# with open('title.pickle', 'rb') as f:
#     keywords = pickle.load(f)
#     keywords_vector = pickle.load(f)

# KMEANS - 작품별 연관키워드 뽑을 때 사용

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

X = keywords_vector

true_k = 6
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X.T)

print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = keywords # = candidates
for i in range(true_k):
    print("["),
    for ind in order_centroids[i, :20]:
        print(f'"{terms[ind]}"', end=",")
    print("],\n")

In [None]:
x = list(range(1,11))
y = []
for i in x:
    model = KMeans(n_clusters=i, init='k-means++', max_iter=100, n_init=1)
    model.fit(X.T)
    y.append(model.inertia_)

import matplotlib.pyplot as plt

plt.plot(x,y,label='linear')
plt.show();