In [125]:
from konlpy.tag import Kkma
from konlpy.tag import Okt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
import numpy as np
import pandas as pd

# Choose the CSV file that keywords will be extracted from.
webtoon_df = pd.read_csv(filepath_or_buffer='네이버웹툰_완결.csv')

In [126]:
class SentenceTokenizer(object):
    """Split text into sentences and reduce each sentence to its nouns.

    Attributes:
        kkma: ``Kkma`` tagger, used for sentence splitting.
        okt: ``Okt`` tagger, used for noun extraction.
        stopwords: list of words that are dropped from the noun output.
    """

    def __init__(self,
                 stopwords_path='C:/Users/user/CodingWorkspace/stopwords.txt',
                 encoding=None):
        """Create the taggers and load the stopword list.

        Args:
            stopwords_path: text file whose first line is a comma-separated
                list of stopwords. Defaults to the path the notebook used
                originally, so existing ``SentenceTokenizer()`` calls behave
                the same.
            encoding: encoding passed to ``open``; ``None`` keeps the
                platform default, as before.
        """
        self.kkma = Kkma()
        self.okt = Okt()
        # Stopword dictionary (still being extended by the author).
        with open(stopwords_path, 'r', encoding=encoding) as f:
            # readline() returns '' for an empty file instead of raising the
            # IndexError that readlines()[0] would.
            first_line = f.readline()
        # Strip each entry: a trailing newline or a space after a comma would
        # otherwise keep that stopword from ever matching a noun.
        self.stopwords = [word.strip() for word in first_line.split(",")
                          if word.strip()]

    def text2sentences(self, text):
        """Split ``text`` with ``Kkma.sentences()`` and merge short fragments.

        A sentence of 10 characters or fewer is appended (after a space) to
        the closest preceding sentence that was kept, and its own slot becomes
        ``''``. The returned list therefore has the same length as the Kkma
        output, with ``''`` placeholders where fragments were merged away.

        A short sentence with nothing before it is kept unchanged. Previously
        index ``-1`` wrapped around to the last sentence, which erased the
        text entirely when it was the only sentence.
        """
        sentences = self.kkma.sentences(text)
        anchor = None  # index of the most recent sentence that was kept
        for idx in range(len(sentences)):
            if len(sentences[idx]) <= 10 and anchor is not None:
                sentences[anchor] += ' ' + sentences[idx]
                sentences[idx] = ''
            else:
                anchor = idx
        return sentences

    def get_nouns(self, sentences):
        """Return one space-joined noun string per non-empty sentence.

        Nouns come from ``Okt.nouns()``; stopwords and single-character nouns
        are removed. Sentences equal to ``''`` are skipped, so the result can
        be shorter than ``sentences``.
        """
        stopwords = set(self.stopwords)  # O(1) membership inside the loop
        nouns = []
        for sentence in sentences:
            # Compare by value: `is not ''` tests object identity and raises
            # a SyntaxWarning on Python 3.8+.
            if sentence != '':
                nouns.append(' '.join(
                    noun for noun in self.okt.nouns(str(sentence))
                    if noun not in stopwords and len(noun) > 1))  # drop 1-char nouns
        return nouns

  if sentence is not '':


In [127]:
class GraphMatrix(object):
    """Build a word-to-word weight matrix from space-joined noun strings."""

    def __init__(self):
        # Only cnt_vec is used by build_words_graph; the other two attributes
        # are created here but not read anywhere in this class.
        self.graph_sentence = []
        self.cnt_vec = CountVectorizer()
        self.tfidf = TfidfVectorizer()

    def build_words_graph(self, sentence):
        """Return ``(graph, idx2word)`` for an iterable of documents.

        ``sentence`` is fitted with the CountVectorizer, the dense count
        matrix is scaled by ``normalize(..., axis=0)``, and the graph is the
        product of that matrix's transpose with itself (one row/column per
        vocabulary entry). ``idx2word`` maps each vocabulary column index back
        to its word.
        """
        counts = self.cnt_vec.fit_transform(sentence)
        dense_counts = counts.toarray().astype(float)
        scaled = normalize(dense_counts, axis=0)
        word_graph = np.dot(scaled.T, scaled)
        index_to_word = {
            column: word for word, column in self.cnt_vec.vocabulary_.items()
        }
        return word_graph, index_to_word

In [128]:
class Rank(object):
    """Score graph nodes by solving the damped-rank linear system."""

    def get_ranks(self, graph, d=0.85):
        """Return ``{node_index: score}`` for a square weight matrix.

        Args:
            graph: square array-like of edge weights. It is not modified.
            d: damping factor.

        For every column the diagonal is zeroed, the column is divided by its
        sum (skipped when the sum is 0) and multiplied by ``-d``, and the
        diagonal is set to 1. The system ``A x = (1 - d) * ones`` is then
        solved with ``np.linalg.solve``.
        """
        # Work on a float copy. The original normalised `graph` in place,
        # overwriting the caller's matrix, and the in-place division below
        # fails on integer arrays.
        A = np.array(graph, dtype=float)
        matrix_size = A.shape[0]
        for col in range(matrix_size):
            A[col, col] = 0  # ignore self-links when normalising
            link_sum = np.sum(A[:, col])
            if link_sum != 0:
                A[:, col] /= link_sum
            A[:, col] *= -d
            A[col, col] = 1

        B = (1 - d) * np.ones((matrix_size, 1))
        ranks = np.linalg.solve(A, B)
        return {idx: r[0] for idx, r in enumerate(ranks)}

In [129]:
class TextRank(object):
    """Rank the nouns of a text and expose the top-scoring ones as keywords.

    Attributes set by ``__init__``:
        sent_tokenize: tokenizer used for sentence splitting / noun extraction.
        sentences: output of ``text2sentences(text)``.
        nouns: output of ``get_nouns(sentences)``.
        words_graph, idx2word: output of ``GraphMatrix.build_words_graph``.
        word_rank_idx: ``{word_index: score}`` from ``Rank.get_ranks``.
        sorted_word_rank_idx: word indices ordered by descending score.
    """

    def __init__(self, text, tokenizer=None):
        """Tokenize ``text``, build the word graph and rank its words.

        Args:
            text: raw text to analyse.
            tokenizer: optional object providing ``text2sentences`` and
                ``get_nouns``. When omitted a new ``SentenceTokenizer`` is
                created, as before; passing one lets callers reuse a single
                tokenizer across many texts.
        """
        self.sent_tokenize = tokenizer if tokenizer is not None else SentenceTokenizer()
        self.sentences = self.sent_tokenize.text2sentences(text)

        self.nouns = self.sent_tokenize.get_nouns(self.sentences)

        self.graph_matrix = GraphMatrix()
        self.rank = Rank()
        if any(self.nouns):
            self.words_graph, self.idx2word = self.graph_matrix.build_words_graph(self.nouns)
            self.word_rank_idx = self.rank.get_ranks(self.words_graph)
        else:
            # No nouns were extracted, so there is nothing to vectorise.
            # Use an empty graph so keywords() returns [] instead of the
            # constructor failing inside the vectoriser.
            self.words_graph, self.idx2word = np.zeros((0, 0)), {}
            self.word_rank_idx = {}
        self.sorted_word_rank_idx = sorted(
            self.word_rank_idx, key=lambda k: self.word_rank_idx[k], reverse=True)

    def keywords(self, word_num=5):
        """Return up to ``word_num`` words, highest rank first.

        Uses the ranking computed in ``__init__`` rather than solving the
        rank system a second time on every call.
        """
        return [self.idx2word[idx] for idx in self.sorted_word_rank_idx[:word_num]]

In [130]:
# Extract keywords from the crawled CSV: one title and one keyword list per row.
title_list = []
keyword_list = []

# Missing cells are read as NaN (a float); replace them with '' so the string
# concatenation below cannot raise TypeError.
genres = webtoon_df['장르'].fillna('').astype(str)
stories = webtoon_df['스토리'].fillna('').astype(str)

# Keywords are taken from genre + story.
for title, genre, story in zip(webtoon_df['제목'], genres, stories):
    # Join with a space so the last genre word and the first story word are
    # not fused into a single token.
    text = genre + ' ' + story

    textrank = TextRank(text)

    title_list.append(title)
    keyword_list.append(textrank.keywords())

In [131]:
# Pair every title with its keyword list and write the table to CSV.
web_data = pd.DataFrame({'제목': title_list, '키워드': keyword_list})
web_data.to_csv(path_or_buf='네이버_완결_keyword.csv', encoding='utf-8-sig')

In [132]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the keyword CSV and preview its first ten rows.
data = pd.read_csv(filepath_or_buffer='네이버_완결_keyword.csv', low_memory=False)
data.head(n=10)

Unnamed: 0.1,Unnamed: 0,제목,키워드
0,0,파이어스,"['공익', '근무', '나방', '액션', '이야기']"
1,1,바로 보지 않는,"['필요', '희진', '감치', '결심', '드라마']"
2,2,인간졸업,"['드라마', '라면', '세상', '영재', '소경']"
3,3,불어오는 밤,"['가문', '동해', '원수', '파발', '불명']"
4,4,배틀트레인,"['발전', '배틀', '스포츠', '지하철', '토너먼트']"
5,5,장난감,"['인생', '방식', '분풀이', '비밀', '세상']"
6,6,[영화원작] 모럴센스,"['로맨스', '아주', '남자', '사이', '오해']"
7,7,후작님을 녹이는 방법,"['얼굴', '결혼', '후작', '에이', '프레이']"
8,8,2022 서브병에 빠지다!,"['서브', '개그', '로맨스', '매력', '발산']"
9,9,강림전기 개정기,"['액션', '차지', '얼굴', '우등', '강림']"
