In [2]:
from konlpy.tag import Kkma
from konlpy.tag import Okt
import csv
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from pandas import DataFrame
from multipledispatch import dispatch

class SentenceTokenizer(object):
    """Split Korean text into sentences and reduce each sentence to its nouns.

    Sentence splitting is delegated to the Kkma tagger and noun extraction
    to the Okt tagger (both imported from ``konlpy.tag``).
    """

    def __init__(self, stopwords_path='C:/Users/user/stopwords.txt'):
        """Create the taggers and load the stop-word list.

        Args:
            stopwords_path: UTF-8 text file whose first line holds the stop
                words separated by commas. Defaults to the path that was
                previously hard-coded, so existing callers are unaffected.
        """
        self.kkma = Kkma()
        self.okt = Okt()
        # Stop-word dictionary (still being extended).
        with open(stopwords_path, 'r', encoding='utf-8') as f:
            first_line = f.readline()
        # Strip every entry: otherwise the last word keeps the trailing
        # newline (and words after ", " keep a leading space) and can never
        # match a noun. An empty file simply yields no stop words.
        self.stopwords = [word.strip() for word in first_line.split(",")
                          if word.strip()]

    def text2sentences(self, text):
        """Split *text* into sentences, folding short fragments backwards.

        A sentence of 10 characters or fewer is appended (after a space) to
        the closest preceding sentence that is still kept, and its own slot
        is replaced by ``''``. The returned list therefore has one entry per
        sentence reported by Kkma, with ``''`` marking merged fragments.

        A short sentence with nothing before it is kept as-is; indexing
        ``idx - 1`` at position 0 would wrap around to the end of the list.

        Args:
            text: Raw text to split.

        Returns:
            List of sentence strings (possibly containing ``''`` entries).
        """
        sentences = list(self.kkma.sentences(text))
        last_kept = None  # index of the most recent non-merged sentence
        for idx, sentence in enumerate(sentences):
            if len(sentence) <= 10 and last_kept is not None:
                sentences[last_kept] += ' ' + sentence
                sentences[idx] = ''
            else:
                last_kept = idx
        return sentences

    def get_nouns(self, sentences):
        """Return one space-joined noun string per non-empty sentence.

        Nouns that are stop words or only one character long are dropped.
        A sentence whose nouns are all filtered out still contributes an
        empty string, so the output stays aligned with the non-empty input
        sentences.

        Args:
            sentences: Iterable of sentences; ``''`` entries are skipped.

        Returns:
            List of strings, each holding the kept nouns of one sentence.
        """
        stopwords = set(self.stopwords)  # O(1) membership inside the loop
        return [
            ' '.join(noun for noun in self.okt.nouns(str(sentence))
                     if noun not in stopwords and len(noun) > 1)
            for sentence in sentences
            if sentence != ''
        ]


class GraphMatrix(object):
    """Build a word-by-word similarity graph from noun sentences."""

    def __init__(self):
        """Create the vectorizers and an empty sentence-graph placeholder."""
        self.graph_sentence = []
        self.cnt_vec = CountVectorizer()
        self.tfidf = TfidfVectorizer()

    def build_words_graph(self, sentence):
        """Return the word graph and an index-to-word lookup.

        Args:
            sentence: Iterable of strings (one space-separated noun string
                per sentence) passed to ``CountVectorizer.fit_transform``.

        Returns:
            Tuple ``(graph, idx2word)``: ``graph`` is the product of the
            column-normalised count matrix's transpose with itself (one
            row/column per vocabulary word); ``idx2word`` maps each
            vocabulary index back to its word.
        """
        counts = self.cnt_vec.fit_transform(sentence).toarray().astype(float)
        # axis=0 normalises each word column rather than each sentence row.
        normalized_counts = normalize(counts, axis=0)
        word_to_idx = self.cnt_vec.vocabulary_
        idx_to_word = {idx: word for word, idx in word_to_idx.items()}
        return np.dot(normalized_counts.T, normalized_counts), idx_to_word

#TextRank
class Rank(object):
    """Score the nodes of a weighted graph by solving a damped linear system."""

    def get_ranks(self, graph, d=0.85):
        """Return a rank score for every node of *graph*.

        The computation works on a float copy, so the caller's matrix is
        left untouched and integer-typed input is accepted.

        Steps: zero the diagonal, scale every column with a non-zero sum so
        it sums to 1, multiply by ``-d``, put 1 on the diagonal, then solve
        ``A x = (1 - d) * ones``.

        Args:
            graph: Square 2-D array-like of edge weights.
            d: Damping factor.

        Returns:
            Dict mapping node index to its rank score.

        Raises:
            numpy.linalg.LinAlgError: If the resulting system is singular.
        """
        A = np.array(graph, dtype=float)  # copy: never mutate the input
        matrix_size = A.shape[0]

        np.fill_diagonal(A, 0)
        link_sums = A.sum(axis=0)
        linked = link_sums != 0  # columns with no links stay all-zero
        A[:, linked] /= link_sums[linked]
        A *= -d
        np.fill_diagonal(A, 1)

        B = (1 - d) * np.ones((matrix_size, 1))
        ranks = np.linalg.solve(A, B)
        return {idx: r[0] for idx, r in enumerate(ranks)}

class TextRank(object):
    """Extract the top-ranked keywords (nouns) from a piece of text."""

    def __init__(self, text):
        """Tokenize *text*, build the word graph and rank its words.

        Args:
            text: Raw text to analyse.
        """
        self.sent_tokenize = SentenceTokenizer()
        self.sentences = self.sent_tokenize.text2sentences(text)

        self.nouns = self.sent_tokenize.get_nouns(self.sentences)

        self.graph_matrix = GraphMatrix()
        self.words_graph, self.idx2word = self.graph_matrix.build_words_graph(self.nouns)

        self.rank = Rank()
        self.word_rank_idx = self.rank.get_ranks(self.words_graph)
        # Vocabulary indices ordered from highest to lowest rank score.
        self.sorted_word_rank_idx = sorted(self.word_rank_idx,
                                           key=lambda k: self.word_rank_idx[k],
                                           reverse=True)

    def keywords(self, word_num=5):
        """Return the *word_num* highest-ranked words, best first.

        Uses the ranking computed once in ``__init__`` instead of solving
        the rank system again on every call.

        Args:
            word_num: Maximum number of keywords to return.

        Returns:
            List of at most *word_num* words (fewer if the vocabulary is
            smaller).
        """
        return [self.idx2word[idx]
                for idx in self.sorted_word_rank_idx[:word_num]]
    
# Title/keyword table used by get_key(); the code below reads its '제목' and
# '키워드' columns and the column at position 8.
# NOTE(review): column 8 is presumably the '키워드' column — confirm against the CSV.
data = pd.read_csv('통합_keyword.csv', low_memory=False)
# NOTE(review): this module-level `row` is not read anywhere at module level
# in the visible code — confirm before removing.
row=[]
#print(df.tail(5))
def _keyword_text(title):
    """Return the column-8 text of the first row of ``data`` titled *title*."""
    # Use positional matches so the lookup does not depend on the index
    # labels happening to equal row positions.
    positions = np.flatnonzero((data['제목'] == title).values)
    if positions.size == 0:
        raise IndexError("title not found in data: {!r}".format(title))
    return data.iloc[positions[0], 8]


def get_key(*titles, top_n=6):
    """Print keyword-based recommendations for one or more titles.

    The column-8 texts of the given titles are combined, TextRank keywords
    are extracted from the combined text, and the rows of ``data`` whose
    '키워드' text is most similar (TF-IDF cosine similarity) to those
    keywords are printed.

    Printed, in order: the tab-joined input titles, the extracted keyword
    list, the recommended titles, and their similarity scores.

    Replaces the five fixed-arity overloads with one variadic function:
      * rows carrying one of the input titles are excluded explicitly,
        instead of skipping a fixed number of top-ranked positions (which
        assumed the inputs always rank first and used the same offset for
        4 and 5 titles);
      * only the query row's similarities are computed, not the full
        N x N matrix;
      * no ``DataFrame.append`` (removed in pandas 2.0);
      * source texts are joined with a space so the last word of one text
        is not glued to the first word of the next.

    Args:
        *titles: One or more values of the '제목' column.
        top_n: Number of recommendations to print (keyword-only).

    Raises:
        ValueError: If no title is given.
        IndexError: If a title does not occur in ``data``.
    """
    if not titles:
        raise ValueError("get_key() requires at least one title")

    source_text = ' '.join(_keyword_text(title) for title in titles)
    print("\t".join(titles))

    query_keywords = str(TextRank(source_text).keywords())
    print(query_keywords)

    # Fit TF-IDF on every row's keyword text plus the query as the last doc.
    n_rows = len(data)
    corpus = data['키워드'].tolist() + [query_keywords]
    tfidf_matrix = TfidfVectorizer().fit_transform(corpus)
    # Slices (not integer indexing) keep both operands two-dimensional.
    similarities = cosine_similarity(tfidf_matrix[n_rows:],
                                     tfidf_matrix[:n_rows]).ravel()

    # Stable sort: equally similar rows keep their original row order.
    ranked = np.argsort(-similarities, kind='stable')
    is_source = data['제목'].isin(titles).values
    web_indices = np.array(
        [pos for pos in ranked if not is_source[pos]][:top_n], dtype=int)

    print(data['제목'].iloc[web_indices])
    print(str(similarities[web_indices]))
    
# Demo runs: each tuple is one get_key() call; a blank-line separator is
# printed between consecutive runs (not after the last one).
title_groups = [
    ('부마님 거기 있어줄래요',),
    ('방과 후 전쟁활동', '데드데이즈(DEAD DAYS)'),
    ('방과 후 전쟁활동', '데드데이즈(DEAD DAYS)', '하이브 3'),
    ('방과 후 전쟁활동', '데드데이즈(DEAD DAYS)', '하이브 3', '개장수'),
    ('부마님 거기 있어줄래요', '공주님 마음대로!', '너와 사는 오늘', '우리 내일 이혼해요', 'N번째 연애'),
]
for position, group in enumerate(title_groups):
    if position:
        print("\n")
    get_key(*group)

부마님 거기 있어줄래요
['공주', '당나라', '사랑', '남자', '시대극']
2191    공주전쟁 [연재]
6952         공주전쟁
8187      유니크한 그녀
225          연애혁명
2332         천일야화
5333           여혜
Name: 제목, dtype: object
[0.29040308 0.29040308 0.28992228 0.27999852 0.26825437 0.26214608]


방과 후 전쟁활동	데드데이즈(DEAD DAYS)
['드라마', '바이러스', '생존', '스릴러', '스토리']
871         의도적 외면
1433    심연의 하늘 시즌4
1049            개미
631         사람의 조각
86             신도림
2443       정해진 첫사랑
Name: 제목, dtype: object
[0.49092318 0.48192445 0.4818589  0.43274684 0.4147308  0.39273807]


방과 후 전쟁활동	데드데이즈(DEAD DAYS)	하이브 3
['곤충', '드라마', '바이러스', '전쟁', '생존']
2443    정해진 첫사랑
1211        개장수
236        정글쥬스
4018     스킵과 로퍼
86          신도림
5312    리턴 서바이벌
Name: 제목, dtype: object
[0.31611797 0.31422747 0.30547959 0.2889469  0.26522074 0.25215486]


방과 후 전쟁활동	데드데이즈(DEAD DAYS)	하이브 3	개장수
['곤충', '바이러스', '생존', '수가', '스릴러']
871          의도적 외면
1433     심연의 하늘 시즌4
1486        하이브 1~2
1792              연
6282             파동
475     어느날 갑자기 서울은
Name: 제목, dtype: object
[0.325773