In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

In [3]:
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.test.utils import datapath

In [4]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

In [5]:
pd.set_option('display.max.colwidth', None)
pd.set_option('display.max_rows', 100)

### 함수모음

In [15]:
stop_words = set(stopwords.words('english'))
stop_words.update(('\r\n', '\n\r', 'ever', 'much', 'look', 'squid', 'show', 'thing', "i've", 'anything', 'something', "show's",
                   'www', 'soompi' 'com', 'instagram', 'youtube', 'https', 'mydramalist', 'twitter', 'episode', 'comment', 'scene',
                   'version', "he's", 'gonna', 'series', 'watch', 'everything', 'something', "can't", 'list', 'dramas', 'drama',
                   'wait', 'preview', 'someone', 'everyone', 'dont', 'think', 'season', 'anyone', 'something', 'anything', 'nothing', 'world',
                   'status', 'week', 'name', 'cause', 'time', 'en', 'org', 'wikipedia', 'wiki', 'pbs', 'twimg', 'year', 'point', 'please', 'today',
                   'haha', 'case', 'guess', 'reason', 'person', 'moment', 'sense', 'kinda', 'part', 'movie', 'school', 'start', 'work', 'lead', 'kind',
                   'rate', 'rating', 'rate', 'men', 'example', 'idea', 'half', 'review', 'genre', 'side', "that's", "they're", 'till', 'tell', 'phone',
                   'section', 'number', 'company', 'line', "there's", 'male', 'team', 'rating', 'baby', 'course', 'care', 'cute', 'question', 'help', 'group',
                   'hand', 'spoiler', 'hate', 'need', 'mess', 'change', 'drop', 'date', 'netflix', 'yeah', 'daon', 'park', 'thank', 'lmao', 'damn', "i'll",
                   'kang', 'shinwoo', 'taekyung', 'mean', 'woman', 'hope', 'read', 'fact', 'opinion', 'stuff', 'feel', 'kdrama', 'talk', 'song', 'hype',
                   'title', 'type'))

In [16]:
def bigram_filter(bigram):
    tag = nltk.pos_tag(bigram)
    if tag[0][1] not in ['JJ', 'NN'] and tag[1][1] not in ['NN']:
        return False
    if bigram[0] in stop_words or bigram[1] in stop_words:
        return False
    if 'n' in bigram or 't' in bigram:
        return False
    if 'PRON' in bigram:
        return False
    return True

In [17]:
def replace_ngram(x):
    # for gram in trigrams:
    #     x = x.replace(gram, '_'.join(gram.split()))
    for gram in bigrams:
        x = x.replace(gram, '_'.join(gram.split()))
    return x

In [18]:
def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('V'):
        return 'v'
    elif pos_tag.startswith('N'):
        return 'n'
    elif pos_tag.startswith('J'):
        return 'a'
    elif pos_tag.startswith('R'):
        return 'r'
    else:
        return None

In [19]:
# Filter for only nouns
def noun_only(x):
    pos_comment = nltk.pos_tag(x)
    filtered = [word[0] for word in pos_comment if word[1] in ['NN']]
    # to filter both noun and verbs
    #filtered = [word[0] for word in pos_comment if word[1] in ['NN','VB', 'VBD', 'VBG', 'VBN', 'VBZ']]
    return filtered

### 데이터프레임

In [38]:
df = pd.read_csv('/Users/suchan/study/파이널 프로젝트/0425_ENG_final_drama_reviews.csv')

In [21]:
titles = df['title']
df_lower = df['reviews'].str.lower()
df_lower = pd.DataFrame(df_lower)
df_lower['title'] = titles

### bigram

In [22]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_documents([comment.split() for comment in df_lower.reviews])
# Filter only those that occur at least 50 times
finder.apply_freq_filter(50)
bigram_scores = finder.score_ngrams(bigram_measures.pmi)

In [23]:
bigram_pmi = pd.DataFrame(bigram_scores)
bigram_pmi.columns = ['bigram', 'pmi']
bigram_pmi.sort_values(by='pmi', axis = 0, ascending = False, inplace = True)

In [24]:
filtered_bigram = bigram_pmi[bigram_pmi.apply(lambda bigram:\
                                              bigram_filter(bigram['bigram'])\
                                              and bigram.pmi > 5, axis = 1)][:500]

bigrams = [
    ' '.join(x) for x in filtered_bigram.bigram.values
    if len(x[0]) > 3 or len(x[1]) > 3
]

### 전처리

In [25]:
reviews_w_ngrams = df_lower.copy()

In [26]:
reviews_w_ngrams.reviews = reviews_w_ngrams.reviews.map(lambda x: replace_ngram(x))

In [27]:
reviews_w_ngrams = reviews_w_ngrams.reviews.map(lambda x: [word for word in x.split()\
                                                 if word not in stop_words and len(word) > 3])

In [28]:
word_list = reviews_w_ngrams.to_list()

In [29]:
lemma = WordNetLemmatizer()
word_list_lemma = []
for ls in word_list:
    word_lem = []
    tag_words = nltk.pos_tag(ls)
    pos_words = []
    for word in tag_words:
        if word[1] in ['NN']:  #'VB', 'VBP', 'JJ'
            pos_words.append(word)
    temp_list = []
    for token, pos_tag in pos_words:
        tag = get_wordnet_pos(pos_tag)
        if tag != None:
            temp_list.append((token, get_wordnet_pos(pos_tag)))
    token_final = [lemma.lemmatize(token, pos=tag) for token, tag in temp_list]
    word_list_lemma.append(token_final)

In [30]:
word_list_lemma = pd.Series(word_list_lemma)

In [31]:
final_reviews = word_list_lemma.map(noun_only)

In [39]:
df['preprocessed_data'] = final_reviews

### Dictionary 생성

In [73]:
dictionary = corpora.Dictionary(final_reviews)
dictionary.filter_extremes(no_below=20, no_above=0.5)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in final_reviews]

### LDA 모델 불러오기

In [33]:
from gensim.test.utils import datapath

In [51]:
temp_file = datapath("/Users/suchan/study/파이널 프로젝트/ENG_LDA_1/ldamodel")
lda = LdaModel.load(temp_file)

In [58]:
temp_file = datapath("/Users/suchan/study/파이널 프로젝트/ENG_LDA_BEST/lda_model")
lda2 = LdaModel.load(temp_file)

### LDA 시각화

In [66]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

In [74]:
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda, doc_term_matrix, dictionary)
pyLDAvis.display(vis)

ValidationError: 
 * Not all rows (distributions) in topic_term_dists sum to 1.

### 드라마별 리뷰 합치기

In [41]:
drama_list = df.title.unique().tolist()

In [44]:
from tqdm import tqdm

each_drama_reviews_dict = {'title' : [], 'preprocessed_reviews' : []}
for drama in tqdm(drama_list):
    drama_review_list = df[df['title'] == drama].preprocessed_data.tolist()
    reviews_join = sum(drama_review_list,[])
    each_drama_reviews_dict['title'].append(drama)
    each_drama_reviews_dict['preprocessed_reviews'].append(reviews_join)

100%|██████████| 92/92 [00:06<00:00, 13.61it/s]


In [45]:
df_each_drama_reviews = pd.DataFrame(each_drama_reviews_dict)

In [46]:
df_each_drama_reviews

Unnamed: 0,title,preprocessed_reviews
0,악의마음을읽는자들,"[monster, gaze, nonfiction, book, profiler, order, story, development, superb, writer, thrill, killer, analysis, personality, study, prevent, production, attention, detail, killer, serial_killer, interview, breakdown, bromance, kook, kook, shot, love, crime, waste, romance, press, accurate, chun, profiler, tone, heart, suspense, mind, psychology, force, crime, period, profiler, character, clueless, lightheartedness, detective, crime, comedic_relief, yeon, crime, superb, character, chemistry, moody, connection, hidden, indie, yeon, ethereal, angel, threat, mind, hunter, history, culture, production, deja, agent, youtu, discussion, plbdkvgnp, ozvhaefw, rliyg, klyn, darkness, cannot, drive, darkness, action, adaptation, nonfiction, writer, studio, wavve, charge, debut, master, composer, gaemi, music, production, winter, depiction, life, sequence, reality, extent, nonfiction, ...]"
1,라이프온마스,"[friendship, actor, student, doctor, trash, who's, coma, research, focus, paying_attention, game, romance, role, romance, tunnel, signal, choice, stay, imho, relationship, connection, coma, solve, place, explain, reality, dream, romance, romance, kiss, romance, tagso, cejj, thanksdid, wife, life, peak, twist, depression, loneliness, coma, dream, peace, mind, ambiguity, conclusion, imagination, fantasy, wish, radio, violence, violence, wondering, violence, scary, violence, blood, blood, rjei, family, blood, blood, autumn, gore, reality, lil'bit, dream, coma, state, jump, suicide, wake, coma, brain_surgery, energy, drive, serial_killer, coma, state, month, brain_surgery, energy, month, story, reality, life, sooo, patient, country, connect, life, thriller, mind, life, mind, research, character, story, coma, top_notch, ...]"
2,옷소매붉은끝동,"[choice, life, script, consort, slow_burn, romance, action, story, court, life, passionate, relationship, crown_prince, deok_im, place, gender, journey, journey, stand, character, manipulation, burden, ghost, duty, bound, sensitivity, darkness, intelligent, deok_im, identity, autonomy, desire, society, freedom, micro, body_language, product, experience, story, glory, chemistry, romance, tension, culmination, intimate, skirt, theme, power, imbalance, viewer, context, reality, trust, worth, deok_im, lifetime, lifetime, discover, deok_im, freedom, choice, deal, story, decision, empowerment, journey, story, director, tender, ethereal, color, direction, cinematography, costume, design, quality, cast, storyline, marvel, firm, story, history, distortion, mass, appeal, mouth, heart, palpitate, emotion, ache, leftover, life, story, happiness, story, exploration, heart, saga, quote, oscar, ...]"
3,이구역의미친X,"[comedy, scroll, context, trailer, squeeze, burst, laughter, context, comedy, comedy, scar, throw, wrath, obsession, paranoia, rage, value, schedule, edit, crossroad, life, story, progress, tomorrow, edit, direction, tension, dosage, vulnerability, water, extent, story, face, test, relationship, story, music, music, value, release, tension, story, relationship, place, bound, relationship, relationship, relationship, period, deserve, conclusion, pace, closure, hyun_bin, suzy, popularity, problem, focus, story, execution, suggestion, management, paranoia, post, stress, disorder, jung, yeon, pleasant_surprise, kyung, pain, heartbreak, mind's, hour, story, cannot, play, game, office, craziness, waterworks, fragility, experience, oh's, trust, kyung's, paranoia, life, family, tender, cannot, association, apartment, tune, action, road, recovery, anger, recovery, journey, ...]"
4,어느날우리집현관으로멸망이들어왔다,"[story, tearjerker, class, lineup, chemistry, storyline, arrangement, production, development, dull, climax, death, female, contract, sort, destroyer, relationship, swore, life, cliche, love, anticipate, parting, fate, flow, story, development, story, deity, girl, climax, expectation, candy, floss, shell, deity, character, couple, couple, triangle, role, poker, face, confession, sarcasm, joke, guk's, bo_young's, expression, voice, figure, pace, story, catch, view, area, criticism, piece, compare, reception, doom, service, controversy, criticism, contingency, storyline, waste, issue, diehard, attraction, hyuk, screen, bo_young, kyung, performance, charm, baritone, style, voice, hyun, shin, hyun, star, reception, problem, note, doom, service, trope, brother, doom, myul_mang, screenwriter, support, storyline, asset, plot, character, growth, investment, ...]"
5,스물다섯스물하나,"[path, five_twenty, story, mystery, reply, husband, min_chae, father, wasnt, yijin, answer, timeline, daughter, theme, joo_hyuk, multi, actor, hyun, wook, breath, fresh_air, coach, mother, role, portrayal, college, student, student, affair, heedo, adult, life, issue, portray, hee_do's, length, solve, goodness, madeup, attention, haywire, love, subtle, story, character, reveal, husband, character, husband, hee_do's, mother, character, cant, hee_do's, daughter, daughter, presence, possibility, endgame, heart, closure, rewatch, sorry, chance, shine, stage, subscription, story, place, hour, youth, deal, five_twenty, development, place, story, justice, reaction, stark, contrast, story, message, friendship, childhood, none, adulthood, adult, life, contrast, development, taste, mouth, audience, mystery, concept, friendship, couple's, romance, place, chunk, ...]"
6,무브투헤븐:나는유품정리사입니다,"[portrayal, asperger's, mother, cannot, stress, therapy, mother, ru's, father, life, approach, instance, child, ability, determine, autism, story, focus, relationship, uncle, story, heart, je_hoon, actor, range, taxi_driver, autism, plot, twist, cliffhanger, life, life, demise, message, life, death, life, misery, journey, journey, kdramas, plot, synopsis, communicate, living, heart, sure, life, guise, acceptance, direction, cinematography, crew, happiness, production, casting, choice, character, joon, bring, opportunity, seung, navillera, slice, life, twist, what's, writer's, mind, move, heaven, society, ride, emotion, singel, life, nuance, death, message, story, violence, regret, ability, competent, respect, assistance, society, pause, humor, spot, singel, flawless, fall, love, romance, memory, story, message, mind, rewatch, ...]"
7,호텔델루나,"[fantasy, shot, disappoint, doubt, love, cinematography, storyline, cast, kdramaland, plot, romance, plot, plenty, edge, character, man_weol's, story, humour, love', love, life, family, convince, receive, surgery, lifespan, amount, quality, life, place, state, argue, rest, death, analogy, effort, cinematography, cinematography, top_notch, camerawork, life, viewing_experience, describe, production, man_weol's, count, performance, mister', place, character, man_weol, performance, nature, complexity, character, character, focus, growth, wardrobe, clothing, furthermore, effort, chemistry, screen, relationship, cast, comedic_relief, story, soundtrack, soundtrack, complement, hotel_del_luna's, soundtrack, heize, level, board, call, hour, heck, ride, void, fill, credit, credit, worth, concept, advocate, system, ghost, plot, imbalance, plot, filler, classify, ghost, story, plot, hour, story, history, ...]"
8,하늘에서내리는일억개의별,"[note, melodrama, story, melodrama, mystery, romance, tragedy, antihero, story, mystery, melodrama, suspense, spark, script, communication, plot, progress, story, character, explanation, entertainment, execution, story, chemistry, happiness, role, character, dialogue, element, kang's, strength, character, ability, endure, move, jung, pain, role, kang's, brother, jin_gook, emotion, character, love, baek, seung, mention, hyun, jang, surprise, character, script, stance, story, music, sinister, music, melodrama, romance, yeon, sung, addition, sung, none, jung, relationship, romance, thriller, mystery, note, tragedy, melodrama, plot, suspense, romance, psychological_thriller, tragedy, expression, smirk, absolute, jung, chemistry, banter, emotional_rollercoaster, range, relationship, scream, screen, sung, brother, role, jang, chemistry, jin_gook, darkness, depth, cast, story, confusion, hesitation, ...]"
9,기름진멜로,"[character, love, share, junho, gosh, actor, lover, character, chemistry, abortion, storyline, window, president, husband, divorce, jerk, tsundure, jerk, gang, mind, interest, gang, junho, didnt, syndrome, rage, food, garbage, comedy, bromance, romance, fire, food, wont, regret, chemistry, onion, lift, family, treat, porcelain, doll, immature, food, fierce, food, plate, husband, chef, station, golf, control, penalty, golf, ball, face, learn, husband, relationship, attraction, conversation, response, wonder, understand, approach, relationship, push, pull, right, hold, jang_hyuk, message, fortune, cookie, soul, mate, jang_hyuk, friendship, comedy, chemistry, romance, ground, actor, actress, junho, ehem, huehue, actor, chemistry, comedy, jung, ryeo, won's, romance, disappoint, wall, romance, food, motivation, place, ...]"


### 토픽 비중 뽑기

In [47]:
drama_corpus = [dictionary.doc2bow(text) for text in df_each_drama_reviews['preprocessed_reviews']]

In [65]:
lda2[drama_corpus[6]]

[(0, 0.4818499), (1, 0.13419317), (2, 0.21252574), (3, 0.17143117)]

In [52]:
topic_weight_dict = {'title' : [], 'Topic0' : [], 'Topic1' : [], 'Topic2' : [], 'Topic3' : []}
for i in tqdm(range(len(drama_corpus))):
    review = drama_corpus[i]
    vector = lda[review]
    vector_topic_count = len(vector)
    topic_weight_dict['title'].append(df_each_drama_reviews['title'][i])
    for j in range(vector_topic_count):
        if vector[j][0] == 0:
            topic_weight_dict['Topic0'].append(float(vector[0][1]))
        elif vector[j][0] == 1:
            topic_weight_dict['Topic1'].append(float(vector[1][1]))
        elif vector[j][0] == 2:
            topic_weight_dict['Topic2'].append(float(vector[2][1]))
        else:
            topic_weight_dict['Topic3'].append(float(vector[3][1]))

100%|██████████| 92/92 [00:00<00:00, 598.68it/s]


In [53]:
df_topic_weight = pd.DataFrame(topic_weight_dict)

In [54]:
df_topic_weight

Unnamed: 0,title,Topic0,Topic1,Topic2,Topic3
0,악의마음을읽는자들,0.177125,0.206077,0.30946,0.307338
1,라이프온마스,0.173199,0.263278,0.247229,0.316294
2,옷소매붉은끝동,0.180467,0.283315,0.225748,0.31047
3,이구역의미친X,0.181837,0.245857,0.213315,0.358991
4,어느날우리집현관으로멸망이들어왔다,0.175621,0.273826,0.206356,0.344197
5,스물다섯스물하나,0.220461,0.291277,0.193932,0.294329
6,무브투헤븐:나는유품정리사입니다,0.198478,0.355192,0.205681,0.240649
7,호텔델루나,0.202815,0.273846,0.235664,0.287676
8,하늘에서내리는일억개의별,0.190681,0.273497,0.22832,0.307502
9,기름진멜로,0.186665,0.258003,0.267549,0.287783


In [None]:
df_topic_weight.to_csv("0502")