In [1]:
from konlpy.tag import Hannanum
from konlpy.utils import pprint
import csv
import pandas as pd

lezhin_df = pd.read_csv('레진코믹스.csv')

In [2]:
from newspaper import Article
from konlpy.tag import Kkma
from konlpy.tag import Okt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
import numpy as np

num_summary=3

class SentenceTokenizer(object):
    def __init__(self):
        self.kkma = Kkma()
        self.okt = Okt()
        self.stopwords =['이', '있', '하', '것', '들', '그', '되', '수', '이', '보', '않', '없', '나', '사람', '주', '아니', '등', '같',
                         '우리', '때', '년', '가', '한', '지', '대하', '오', '말', '일', '그렇', '위하', '때문', '그것', '두', '말하', 
                         '알', '그러나', '받', '못하', '일', '그런', '또', '문제', '더', '사회', '많', '그리고', '좋', '크', '따르', 
                         '중', '나오', '가지', '씨', '시키', '만들', '지금', '생각하', '그러', '속', '하나', '집', '살', '모르', '적', 
                         '월', '데', '자신', '안', '어떤', '내', '내', '경우', '명', '생각', '시간', '그녀', '다시', '이런', '앞', 
                         '보이', '번', '나', '다른', '어떻', '여자', '개', '전', '들', '사실', '이렇', '점', '싶', '말', '정도', 
                         '좀', '원', '잘', '통하', '소리', '놓', '레진', '연재', '공개', '독점', '인기', '수도', '없는', '제작', '작가',
                         '수상', '작', '수상작', '보고', '고', '해도', '통해', '위해', '귀염둥이', '관계']
    def url2sentences(self, url):
        article = Article(url, language='ko')
        article.download()
        article.parse()
        sentences = self.kkma.sentences(article.text)
        
        for idx in range(0, len(sentences)):
            if len(sentences[idx]) <= 10:
                sentences[idx-1] += (' ' + sentences[idx])
                sentences[idx] = ''        
        return sentences
  
    def text2sentences(self, text):
        sentences = self.kkma.sentences(text)      
        for idx in range(0, len(sentences)):
            if len(sentences[idx]) <= 10:
                sentences[idx-1] += (' ' + sentences[idx])
                sentences[idx] = ''
        return sentences

    def get_nouns(self, sentences):
        nouns = []
        for sentence in sentences:
            if sentence != '':
                nouns.append(' '.join([noun for noun in self.okt.nouns(str(sentence)) 
                                       if noun not in self.stopwords and len(noun) > 1]))
        return nouns


class GraphMatrix(object):
    def __init__(self):
        self.tfidf = TfidfVectorizer()
        self.cnt_vec = CountVectorizer()
        self.graph_sentence = []
    def build_sent_graph(self, sentence):
        tfidf_mat = self.tfidf.fit_transform(sentence).toarray()
        self.graph_sentence = np.dot(tfidf_mat, tfidf_mat.T)
        return self.graph_sentence
    def build_words_graph(self, sentence):
        cnt_vec_mat = normalize(self.cnt_vec.fit_transform(sentence).toarray().astype(float), axis=0)
        vocab = self.cnt_vec.vocabulary_
        return np.dot(cnt_vec_mat.T, cnt_vec_mat), {vocab[word] : word for word in vocab}

#TextRank
class Rank(object):
    def get_ranks(self, graph, d=0.85): 
        A = graph
        matrix_size = A.shape[0]
        for id in range(matrix_size):
            A[id, id] = 0 
            link_sum = np.sum(A[:,id])
            if link_sum != 0:
                A[:, id] /= link_sum
            A[:, id] *= -d
            A[id, id] = 1
        B = (1-d) * np.ones((matrix_size, 1))
        ranks = np.linalg.solve(A, B) 
        return {idx: r[0] for idx, r in enumerate(ranks)}

class TextRank(object):
    def __init__(self, text):
        self.sent_tokenize = SentenceTokenizer()
        self.sentences = self.sent_tokenize.text2sentences(text)
        
        self.nouns = self.sent_tokenize.get_nouns(self.sentences)
        
        self.graph_matrix = GraphMatrix()
        self.sent_graph = self.graph_matrix.build_sent_graph(self.nouns)
        self.words_graph, self.idx2word = self.graph_matrix.build_words_graph(self.nouns)

        self.rank = Rank()
        self.sent_rank_idx = self.rank.get_ranks(self.sent_graph)
        self.sorted_sent_rank_idx = sorted(self.sent_rank_idx, key=lambda k: self.sent_rank_idx[k], reverse=True)

        self.word_rank_idx = self.rank.get_ranks(self.words_graph)
        self.sorted_word_rank_idx = sorted(self.word_rank_idx, key=lambda k: self.word_rank_idx[k], reverse=True)
    
    def summarize(self, sent_num=3):
        summary = []
        index=[]
        for idx in self.sorted_sent_rank_idx[:sent_num]:
            index.append(idx)
        index.sort()

        for idx in index:
            summary.append(self.sentences[idx])
        return summary

    def keywords(self, word_num=10):
        rank = Rank()
        rank_idx = rank.get_ranks(self.words_graph)
        sorted_rank_idx = sorted(rank_idx, key=lambda k: rank_idx[k], reverse=True)

        keywords = []
        index=[]
        
        for idx in sorted_rank_idx[:word_num]:
            index.append(idx)

        for idx in index:
            keywords.append(self.idx2word[idx])

        return keywords
title_list=[]
keyword_list=[]
for i in lezhin_df.index:
    text = lezhin_df.loc[i, '키워드'] + lezhin_df.loc[i, '스토리']
    #print(text)
    textrank = TextRank(text)
    title_list.append(lezhin_df.loc[i, '제목'])
    keyword_list.append(textrank.keywords())

In [3]:
web_data = pd.DataFrame()
web_data['제목'] = title_list
web_data['키워드'] = keyword_list
web_data.to_csv('레진코믹스_sim.csv', encoding='utf-8-sig')

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

data = pd.read_csv('레진코믹스_sim.csv', low_memory=False)
data.head(10)

Unnamed: 0.1,Unnamed: 0,제목,키워드
0,0,우리사이느은,"['로맨스', '연인', '모두', '원색', '파스텔', '사이', '시작', '..."
1,1,아기가 생겼어요,"['결혼', '로맨스', '희원', '아기', '아슬아슬', '이자', '아빠', ..."
2,2,애프터 커튼콜,"['연극', '매체', '영상', '뮤지컬', '또한', '모든', '이의', '이..."
3,3,대표님의 삐뚤어진 사랑 [연재],"['집착', '계약', '강재', '원래', '조여', '주변인', '분투', '영..."
4,4,너와 사는 오늘,"['원영', '연애', '교내', '도희', '라이', '로맨스', '삼각', '소..."
5,5,"대표님, 사모님이 도망가요","['과연', '그때', '남자', '모든', '불행', '요구', '행운', '가족..."
6,6,LOVE WINS,"['독자', '동성혼', '소식', '정신', '취해', '해지', '혼인신고', ..."
7,7,사랑할 수 없는 그녀,"['꽁꽁', '남동생', '리기', '시작', '시절', '이웃집', '재회', '..."
8,8,우리 내일 이혼해요,"['이혼', '갑자기', '고모', '여름', '제안', '집착', '하리', '결..."
9,9,안나 이야기,"['대장간', '드라마', '로맨스', '모험', '도적', '단골', '로부터',..."


In [5]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(data['키워드'])
print('TF-IDF 행렬 :',tfidf_matrix.shape)

TF-IDF 행렬 : (1240, 3918)


In [6]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(data['키워드'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print('코사인 유사도 행렬:',cosine_sim.shape)
print(cosine_sim)

코사인 유사도 행렬: (1240, 1240)
[[1.         0.00966836 0.         ... 0.01816084 0.         0.        ]
 [0.00966836 1.         0.06915898 ... 0.01586295 0.         0.        ]
 [0.         0.06915898 1.         ... 0.         0.         0.        ]
 ...
 [0.01816084 0.01586295 0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [7]:
title_to_index = dict(zip(data['제목'], data.index))

index = title_to_index['부마님 거기 있어줄래요']

In [8]:
import sys
import numpy as np
np.set_printoptions(threshold=sys.maxsize)

def get_recommendations(text, cosine_sim=cosine_sim):
    index = title_to_index[text]

    sim = sorted(list(enumerate(cosine_sim[index])), key=lambda x: x[1], reverse=True)

    web_indices = [index[0] for index in sim[1:6]]
    
    print(data['제목'].iloc[web_indices])
    print(str(cosine_sim[index][web_indices]))

# 코사인 유사도

In [9]:
get_recommendations('부마님 거기 있어줄래요')

438       황자님 거기 있어 줄래요
608           공주님 마음대로!
218     로젠 블러드 ~배덕의 저택~
116    환생하여 의녀가 되다 [연재]
314           공주전쟁 [연재]
Name: 제목, dtype: object
[0.28261898 0.12906286 0.12116387 0.11590916 0.11003447]
