In [16]:
import pandas as pd
import numpy as np

# Local directories (relative to the notebook) that the data files are read from and written to
dataPath = './datasets/'
titlePath = './data/titledata/'
keyPath = './data/keyworddata/'
storyPath = './data/storydata/'
allPath =  './data/all/'

# MeCabでSummary用に形態素解析

In [None]:
import MeCab

# Word segmentation with MeCab; only nouns, verbs, adjectives and adverbs are kept.

# Shared MeCab tagger, created on first use so it is not rebuilt for every sentence.
_MECAB_TAGGER = None


def _get_mecab_tagger():
  """Return the shared MeCab tagger, instantiating it on the first call."""
  global _MECAB_TAGGER
  if _MECAB_TAGGER is None:
    _MECAB_TAGGER = MeCab.Tagger()
  return _MECAB_TAGGER


def MorphologicalAnalysis(sentences, tagger=None):
  """Reduce a text to its content words, sentence by sentence.

  The text is split on the Japanese full stop "。". Each piece is parsed with
  MeCab and only the surface forms whose first feature field (part of speech)
  is 名詞, 動詞, 形容詞 or 副詞 are kept. The kept words of a piece are joined
  with single spaces and terminated with "。"; the pieces are then joined with
  single spaces.

  Pieces that yield no kept word (for example the empty piece that follows a
  trailing "。") are dropped instead of being emitted as a bare "。".

  Args:
    sentences: Text to analyse, or a missing value (NaN / None / pd.NA).
    tagger: Optional object with a MeCab-compatible ``parse(text)`` method.
      Defaults to a shared ``MeCab.Tagger()``.

  Returns:
    The filtered text, or ``np.nan`` when ``sentences`` is missing.
  """
  # pd.isna also catches NaN objects other than the np.nan singleton and None;
  # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
  if pd.isna(sentences):
    return np.nan
  if tagger is None:
    tagger = _get_mecab_tagger()
  kept_pos = ('名詞', '動詞', '形容詞', '副詞')
  result = []
  for text in sentences.split("。"):
    words = []
    # The final output line is dropped (the "EOS" marker in MeCab's default format).
    for line in tagger.parse(text).splitlines()[:-1]:
      # partition() tolerates lines without a tab; they simply match no POS.
      surface, _, feature = line.partition('\t')
      if feature.split(',')[0] in kept_pos:
        words.append(surface)
    if words:
      result.append(" ".join(words) + "。")
  return ' '.join(result)

# Run the MeCab analysis on every cleaned story, keep the result in the
# 'summary_mecab' column and save the frame next to the input file.
data = pd.read_csv(f'{storyPath}cleaning.csv', index_col=0)
data = data.assign(summary_mecab=data['cleaning'].apply(MorphologicalAnalysis))
data.to_csv(f'{storyPath}summary_mecab.csv')

# Summary実行

In [32]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

def makeSummary(corpus, sentences_count=3):
    """Build an extractive summary of ``corpus`` with sumy's LexRank.

    Args:
        corpus: Text to summarise (a string, or an iterable of strings that
            is concatenated), or a missing value (NaN / None / pd.NA).
        sentences_count: Number of sentences requested from the summarizer.
            Defaults to 3.

    Returns:
        The selected sentences concatenated into one string, or ``np.nan``
        when ``corpus`` is missing.
    """
    # pd.isna also catches NaN objects other than the np.nan singleton and
    # None; strings/lists/tuples are excluded so they go straight to join().
    if not isinstance(corpus, (str, list, tuple)) and pd.isna(corpus):
        return np.nan
    # Re-tokenize the concatenated corpus with sumy's Japanese tokenizer
    # (tinysegmenter, per the original author's note).
    parser = PlaintextParser.from_string(''.join(corpus), Tokenizer('japanese'))
    summarizer = LexRankSummarizer()
    # A space is otherwise counted as a word, so exclude it as a stop word.
    summarizer.stop_words = [' ']
    summary = summarizer(document=parser.document, sentences_count=sentences_count)
    return ''.join(map(str, summary))


# Summarise every MeCab-tokenised story and save the result.
# index_col=0 restores the index written by the previous to_csv call; without
# it the old index comes back as an 'Unnamed: 0' column that would be written
# into summary.csv and every file derived from it.
data = pd.read_csv(storyPath+'summary_mecab.csv', index_col=0)
data['summary'] = data['summary_mecab'].apply(makeSummary)
data.to_csv(storyPath + 'summary.csv')


# SudachiPy にて形態素解析

In [None]:
from sudachipy import tokenizer
from sudachipy import dictionary

# Module-level Sudachi tokenizer, built once from the default dictionary and reused by AnalysisBySudachi
tokenizer_obj = dictionary.Dictionary().create()

# Word segmentation with Sudachi; only nouns, verbs and adjectives are kept.
def AnalysisBySudachi(text):
  """Tokenize ``text`` with Sudachi and return its normalized content words.

  The text is split in mode C. Each non-empty surface is tokenized again on
  its own to obtain its normalized form (reducing spelling variation, e.g.
  打込む -> 打ち込む, かつ丼 -> カツ丼). The normalized form is tokenized once
  more, and when the first resulting morpheme is a 名詞, 動詞 or 形容詞 its
  dictionary form is kept.

  NOTE(review): only the first morpheme of each re-tokenized string is used,
  so a form that splits into several morphemes is truncated. Reading
  normalized_form()/part_of_speech() from the original morphemes may be the
  intended behaviour -- confirm before changing, as it would alter the output.

  Args:
    text: Text to analyse, or a missing value (NaN / None / pd.NA).

  Returns:
    The kept words joined by single spaces, or ``np.nan`` when ``text`` is
    missing.
  """
  # pd.isna also catches NaN objects other than the np.nan singleton and None;
  # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
  if pd.isna(text):
    return np.nan
  mode = tokenizer.Tokenizer.SplitMode.C  # mode C: the longest split unit
  target_pos = ("名詞", "動詞", "形容詞")
  surfaces = [m.surface() for m in tokenizer_obj.tokenize(text, mode)]
  word_list = []
  for surface in surfaces:
    if surface == "":  # empty surfaces were observed in the split result; skip them
      continue
    surface_tokens = tokenizer_obj.tokenize(surface, mode)
    if not surface_tokens:
      continue
    normalized = surface_tokens[0].normalized_form()
    # Tokenize the normalized form once and reuse the morpheme for both the
    # part-of-speech test and the dictionary form.
    normalized_tokens = tokenizer_obj.tokenize(normalized, mode)
    if not normalized_tokens:
      continue
    head = normalized_tokens[0]
    if head.part_of_speech()[0] in target_pos:
      word_list.append(head.dictionary_form())
  return " ".join(word_list)  # space-separated words

# Run the Sudachi analysis on every summary, keep the result in the
# 'summary_sudachi' column and save the frame next to the input file.
data = pd.read_csv(f'{storyPath}summary.csv', index_col=0)
data = data.assign(summary_sudachi=data['summary'].apply(AnalysisBySudachi))
data.to_csv(f'{storyPath}summary_sudachi.csv')