## ライブラリの読み込みとパス設定

In [1]:
import pandas as pd
import numpy as np

# Local directories: intermediate story-text CSVs and the raw input datasets.
storyPath = 'data/storydata/'
dataPath = "datasets/"

## データの読み込み

In [5]:
# Load the train/test tables and the sample submission from dataPath.
train = pd.read_csv(f'{dataPath}train.csv')
test = pd.read_csv(f'{dataPath}test.csv')
sub = pd.read_csv(f'{dataPath}sample_submission.csv')

# Stack train on top of test so both parts get identical preprocessing.
# The row labels of each part are kept as they are (no re-indexing).
data = pd.concat((train, test), sort=False)

# Save only the 'story' column (with its index) as the pipeline's starting point.
data['story'].to_csv(f'{storyPath}origin.csv')

## 文字列のクリーニング処理

In [11]:
from mojimoji import zen_to_han
import re

#unicode正規化
import unicodedata
def normalize_unicode(text):
  """Return *text* converted to Unicode NFKC normal form."""
  normalized = unicodedata.normalize('NFKC', text)
  return normalized

#全角を半角に変更
# def zen_han(text):
#   for i in range(len(text)):
#     text = zen_to_han(text)
#   return text

#数字を全て'0'に変更
# def num_zero(text):
#   return re.sub(r'[0-9]+', "0", text)

#アルファベットを小文字に変換
# def lower_text(text):
#   return text.lower()

# Remove URLs.
def delete_url(text):
  """Return *text* with every http:// or https:// URL removed."""
  url_pattern = re.compile(r'https?://[\w/:%#\$&\?\(\)~\.=\+\-]+')
  return url_pattern.sub('', text)

# Replace symbols and special characters with the Japanese comma.
def delete_symbol(text):
  """Return *text* with every character outside the allowed set replaced by '、'.

  The allowed set is ASCII digits and letters, hiragana, full-width and
  half-width katakana, kanji in the 一-龥 range, and the characters ー 。 、.
  Each disallowed character is replaced individually, so a run of symbols
  becomes a run of commas.
  """
  disallowed = re.compile('[^0-9a-zA-Zぁ-んァ-ヿｦ-ﾟ一-龥ー。、]')
  return disallowed.sub('、', text)

# Full cleaning pipeline for a single text.
def clean_text(text):
  """Normalize *text*, strip URLs, then replace symbols.

  Args:
    text: Raw string, or a missing value (None / NaN).

  Returns:
    The cleaned string. A missing value is returned unchanged so that it
    stays missing, matching how the tokenizing functions in this file
    pass missing values through.
  """
  # A NaN from pandas is a float; unicodedata.normalize would raise TypeError on it.
  if pd.isna(text):
    return text
  # Order matters: NFKC first so full-width URL characters become ASCII, and
  # URLs are removed before delete_symbol turns ':' and '/' into '、'.
  text = normalize_unicode(text)
  text = delete_url(text)
  text = delete_symbol(text)
  return text

# Clean the 'story' column (not a title column) and store the result in 'cleaning'.
data = pd.read_csv(f'{storyPath}origin.csv', index_col=0)
data['cleaning'] = data['story'].apply(clean_text)
data.to_csv(f'{storyPath}cleaning.csv')



## 形態素解析

In [12]:
import MeCab

# Shared MeCab tagger, created on first use so it is built once instead of
# once per row.
_mecab_tagger = None


def _get_mecab_tagger():
  """Return the shared MeCab.Tagger, creating it on the first call."""
  global _mecab_tagger
  if _mecab_tagger is None:
    _mecab_tagger = MeCab.Tagger()
  return _mecab_tagger


# Split text into words with MeCab, keeping only nouns, verbs and adjectives.
def MorphologicalAnalysis(text):
  """Return the nouns, verbs and adjectives of *text*, joined by spaces.

  Args:
    text: String to analyse, or a missing value (None / NaN).

  Returns:
    Space-joined surface forms of the kept tokens, or ``np.nan`` when
    *text* is missing so the value stays missing in the DataFrame.
  """
  # pd.isna catches every NaN object, not only the np.nan singleton, and
  # np.nan replaces the np.NaN alias that NumPy 2.0 removed.
  if pd.isna(text):
    return np.nan
  kept_pos = ('名詞', '動詞', '形容詞')
  words = []
  for line in _get_mecab_tagger().parse(text).splitlines():
    # Token lines are expected as "surface<TAB>feature"; lines without a tab
    # (such as the final terminator line) carry no token and are skipped.
    surface, tab, feature = line.partition('\t')
    if not tab:
      continue
    # The first comma-separated feature field is used as the part of speech.
    if feature.split(',')[0] in kept_pos:
      words.append(surface)
  return ' '.join(words)

# Tokenize the cleaned text with MeCab and store the result in 'mecab'.
data = pd.read_csv(f'{storyPath}cleaning.csv', index_col=0)
data['mecab'] = data['cleaning'].apply(MorphologicalAnalysis)
data.to_csv(f'{storyPath}mecab.csv')

In [None]:
from sudachipy import tokenizer
from sudachipy import dictionary

# Build one Sudachi tokenizer at module level; AnalysisbySudachi reuses it on every call.
tokenizer_obj = dictionary.Dictionary().create()

# Split text into words with Sudachi, keeping only nouns, verbs and adjectives.
def AnalysisbySudachi(text):
  """Return the dictionary forms of the nouns, verbs and adjectives in *text*.

  Each surface from the mode-C tokenization of *text* is tokenized again on
  its own to obtain a normalized form (reducing spelling variation, e.g.
  打込む -> 打ち込む). That normalized form is tokenized once more, and only
  its first morpheme is inspected: if its top-level part of speech is kept,
  its dictionary form is added to the result.

  NOTE(review): surfaces are re-tokenized in isolation instead of reading the
  normalized form / part of speech from the original morpheme. This is kept
  as-is to preserve the output; confirm whether it is intended.

  Args:
    text: String to analyse, or a missing value (None / NaN).

  Returns:
    Space-joined dictionary forms, or ``np.nan`` when *text* is missing.
  """
  # pd.isna catches every NaN object, not only the np.nan singleton, and
  # np.nan replaces the np.NaN alias that NumPy 2.0 removed.
  if pd.isna(text):
    return np.nan
  mode = tokenizer.Tokenizer.SplitMode.C  # mode C: the longest split units
  kept_pos = ("名詞", "動詞", "形容詞")
  surfaces = [m.surface() for m in tokenizer_obj.tokenize(text, mode)]
  word_list = []
  for surface in surfaces:
    if surface == "":  # the tokenizer occasionally yields empty surfaces
      continue
    normalized = tokenizer_obj.tokenize(surface, mode)[0].normalized_form()
    # Tokenize the normalized form once and reuse the morpheme for both the
    # part-of-speech check and the dictionary form.
    head = tokenizer_obj.tokenize(normalized, mode)[0]
    if head.part_of_speech()[0] in kept_pos:
      word_list.append(head.dictionary_form())
  return " ".join(word_list)

# Tokenize the cleaned text with Sudachi and store the result in 'sudachi'.
data = pd.read_csv(f'{storyPath}cleaning.csv', index_col=0)
data['sudachi'] = data['cleaning'].apply(AnalysisbySudachi)
data.to_csv(f'{storyPath}sudachi.csv')


In [11]:
# Reload the Sudachi output. index_col=0 restores the saved row index, as the
# other reloads of storyPath files do, instead of reading it back as an extra
# unnamed column.
data = pd.read_csv(storyPath+'sudachi.csv', index_col=0)

from collections import Counter

# Corpus-wide word frequencies, filled in by CountStoryWords.
Cnt = Counter()


def CountStoryWords(text):
  """Add the whitespace-separated words of *text* to the global ``Cnt``.

  Args:
    text: Space-joined tokens, or a missing value (None / NaN).

  Returns:
    The number of words counted from *text*; 0 for a missing value, which
    leaves ``Cnt`` untouched.
  """
  # pd.isna catches every NaN object, not only the np.nan singleton.
  if pd.isna(text):
    return 0
  words = text.split()
  Cnt.update(words)
  return len(words)

# Feed every row of the 'sudachi' column to the counter; the tallies end up in Cnt.
t = data['sudachi'].map(CountStoryWords)

In [20]:
# The ranking is computed here but neither stored nor printed.
Cnt.most_common()
# Print the frequency of each of the two words of interest.
for keyword in ('ファンタジー', '現実'):
  print(Cnt[keyword])
# data = pd.read_csv(storyPath + 'count.csv')
# data['count'].sum()

904
1248


## 学習済みモデルによる分散表現

In [21]:
import gensim

# Location of the pretrained word vectors, stored in word2vec text format.
# NOTE(review): the earlier comment called this chiVe data, but the file is
# named entity_vector.model.txt; confirm which embedding is intended.
model_path = './jalang/entity_vector.model.txt'
# Load the vectors as gensim KeyedVectors (binary=False is the default, spelled out here).
model2 = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=False)

In [23]:
# Similarity between the two word vectors; kept as the cell's last bare expression so its value is echoed.
model2.similarity('ファンタジー','現実')

0.426629

In [16]:
from tqdm import tqdm

# Sentence embedding: the mean of the vectors of the words the model knows.
def getSentenceVector(text, model=None):
  """Return the average word vector of the space-separated words in *text*.

  Args:
    text: Space-joined tokens, or a missing value (None / NaN).
    model: Object exposing ``key_to_index``, ``get_vector(key=...)`` and
      ``vector_size`` (a gensim KeyedVectors). Defaults to the module-level
      ``model2``, the only model this file loads.

  Returns:
    A 1-D array of length ``model.vector_size``. It is all NaN when *text*
    is missing or none of its words are in the model's vocabulary.
  """
  if model is None:
    model = model2
  # Take the dimension from the model rather than hard-coding 200.
  dim = model.vector_size
  if pd.isna(text):
    return np.full(dim, np.nan)
  vectors = [
    model.get_vector(key=w)
    for w in text.split(' ')
    if w in model.key_to_index
  ]
  if not vectors:
    return np.full(dim, np.nan)
  return np.array(vectors).mean(axis=0)

# Stack the per-row sentence vectors into a single array.
def makeVectorArrayList(data, column='mecab'):
  """Return the sentence vectors of ``data[column]`` as one NumPy array.

  Args:
    data: DataFrame holding space-joined tokens in *column*.
    column: Name of the token column; defaults to 'mecab'. Pass 'sudachi'
      to embed the Sudachi tokenization instead.

  Returns:
    An array with one row per row of *data*, each row being the result of
    getSentenceVector for that text. An empty frame yields an empty array.
  """
  # Iterate the single column directly instead of materializing every row
  # with iterrows.
  return np.array([getSentenceVector(text) for text in data[column]])

# Embed every MeCab-tokenized story and save the vectors, one row per story.
data = pd.read_csv(f'{storyPath}mecab.csv', index_col=0)
vec_list_array = makeVectorArrayList(data)
# The vector table gets a fresh 0..n-1 index; the index of `data` is not carried over.
vec_data = pd.DataFrame(vec_list_array, dtype='float')
vec_data.to_csv(f'{storyPath}vec_learnedModel.csv')
