## データの読み込み

In [23]:
import pandas as pd
import numpy as np

# Local directories: input dataset CSVs and intermediate title CSVs.
dataPath = "datasets/"
titlePath = "titledata/"

## Titleを抽出してtitle_origin.csvに保存

In [24]:
# Load the train, test and sample-submission CSVs from the dataset directory.
train = pd.read_csv(f'{dataPath}train.csv')
test = pd.read_csv(f'{dataPath}test.csv')
sub = pd.read_csv(f'{dataPath}sample_submission.csv')

# Stack train on top of test; the original row labels of each frame are kept.
data = pd.concat([train, test], sort=False)

# Persist only the title column so the text-processing steps can reload it.
data['title'].to_csv(f'{titlePath}title_origin.csv')

# train = data[:len(train)]
# test = data[len(train):]

## 文字列のクリーニング前処理

In [83]:
from mojimoji import zen_to_han
import re

# Remove symbols and special characters: only ASCII digits and letters,
# hiragana, half-width katakana, kanji and the long-vowel mark are kept.
def delete_symbol(text):
  """Return ``text`` with every character outside the kept set removed."""
  not_kept = re.compile('[^0-9a-zA-Zぁ-んｦ-ﾟ一-龥ー]')
  return not_kept.sub('', text)

# Convert full-width characters to half-width.
def zen_han(text):
  """Return ``text`` with full-width characters converted to half-width.

  ``zen_to_han`` converts the whole string in one call, so it is applied
  exactly once. Repeating it once per character gives the same result while
  making the cost quadratic in the length of the text.
  """
  return zen_to_han(text)

# Collapse every run of ASCII digits into a single '0'.
def num_zero(text):
  """Return ``text`` with each run of consecutive digits 0-9 replaced by "0"."""
  digit_run = re.compile(r'[0-9]+')
  return digit_run.sub("0", text)

# Convert alphabetic characters to lowercase.
def lower_text(text):
  """Return a lowercased copy of ``text``."""
  lowered = text.lower()
  return lowered

# Run the full text-cleaning pipeline on one value.
def clean_text(text):
  """Normalize a string for later tokenization.

  Steps, in order: full-width to half-width, digit runs to "0", lowercase,
  then removal of every character ``delete_symbol`` does not keep.

  Non-string values (for example the NaN pandas uses for a missing cell) are
  returned unchanged instead of raising inside the string helpers.
  """
  if not isinstance(text, str):
    return text
  for step in (zen_han, num_zero, lower_text, delete_symbol):
    text = step(text)
  return text

# Reload the saved titles, clean each one into a 'cleaning' column, and save.
data = pd.read_csv(f'{titlePath}title_origin.csv', index_col=0)
data['cleaning'] = data['title'].apply(clean_text)
data.to_csv(f'{titlePath}cleaning.csv')


True


## 形態素解析

In [84]:
import MeCab

# Split text into words with MeCab, keeping only nouns, verbs and adjectives.
_KEPT_POS = frozenset(['名詞', '動詞', '形容詞'])  # noun, verb, adjective
_TAGGER = None  # shared MeCab tagger, created on first use


def _get_tagger():
  """Return the shared MeCab tagger, constructing it on first use."""
  global _TAGGER
  if _TAGGER is None:
    _TAGGER = MeCab.Tagger()
  return _TAGGER


def MorphologicalAnalysis(text):
  """Return the nouns, verbs and adjectives of ``text`` joined by spaces.

  Args:
    text: String to tokenize. Any non-string value (NaN, None) is treated as
      a missing value.

  Returns:
    The surface forms of the kept tokens joined by single spaces, or
    ``np.nan`` when ``text`` is missing.
  """
  # Missing values stay missing. A type check is used because an identity
  # test against np.nan misses other NaN objects, and the np.NaN alias no
  # longer exists in NumPy 2.0.
  if not isinstance(text, str):
    return np.nan
  words = []
  for line in _get_tagger().parse(text).splitlines():
    surface, tab, feature = line.partition('\t')
    if not tab:
      # Lines without a tab (the trailing "EOS" marker) carry no token.
      continue
    if feature.split(',', 1)[0] in _KEPT_POS:
      words.append(surface)
  return ' '.join(words)

# Reload the cleaned titles, store the tokenized text in 'mecab', and save.
data = pd.read_csv(f'{titlePath}cleaning.csv', index_col=0)
data['mecab'] = data['cleaning'].apply(MorphologicalAnalysis)
data.to_csv(f'{titlePath}mecab.csv')

Unnamed: 0,title,cleaning,mecab
3147,・・・。,,
3468,,,
4207,∞,,
6943,××,,
8502,αφεσις,,
8706,。。。,,
19790,▲◆〇●●▼▼▼△,,
20666,•,,
20722,〆,,
24531,**,,
