In [3]:
# Mount Google Drive at /content/drive so that the corpus files stored on
# Drive can be read through ordinary file paths in the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd

# データ読み込み

In [13]:
# Directory on the mounted Google Drive that holds the corpus CSV files.
# Kept in one place so the data location can be changed with a single edit.
CORPUS_DIR = '/content/drive/MyDrive/Learning/EasyJapanese/analysis/corpus'

# Load the three corpus files into DataFrames.
T23_train = pd.read_csv(CORPUS_DIR + '/T23_train.csv')
T23_test = pd.read_csv(CORPUS_DIR + '/T23_test.csv')
T15 = pd.read_csv(CORPUS_DIR + '/T15.csv')

# データ前処理

In [None]:
# https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja から引用・一部改変
from __future__ import unicode_literals
import re
import unicodedata
 
def unicode_normalize(cls, s):
    """Apply NFKC normalization only to runs of characters selected by ``cls``.

    Args:
        cls: Body of a regular-expression character class (without the
            surrounding brackets) selecting the characters to normalize.
        s: Text to process.

    Returns:
        ``s`` in which every maximal run of ``cls`` characters has been
        NFKC-normalized; all other text is left untouched, except that every
        fullwidth hyphen-minus in the result is replaced by ASCII ``-``.
    """
    run_pattern = re.compile('([{}]+)'.format(cls))

    def nfkc_run(match):
        # Normalize one whole run at once so that sequences such as a
        # halfwidth kana followed by its voicing mark can compose.
        return unicodedata.normalize('NFKC', match.group(1))

    normalized = run_pattern.sub(nfkc_run, s)
    # This replacement is unconditional: it also hits fullwidth
    # hyphen-minus characters that were outside the selected runs.
    return normalized.replace('－', '-')
 
def remove_extra_spaces(s):
    """Collapse runs of spaces and drop single spaces that touch Japanese text.

    Runs of ASCII spaces and ideographic (fullwidth, U+3000) spaces are first
    collapsed to one ASCII space. A remaining single space is then removed
    when it sits between two characters of the Japanese-side blocks listed
    below, or between such a character and a Basic Latin character (in either
    order). A space between two Basic Latin characters is kept.

    Args:
        s: Text to clean.

    Returns:
        The text with the superfluous spaces removed.
    """
    # U+3000 must be collapsed explicitly: it lies inside the
    # '\u3000-\u303F' block below, so without this step it would be treated
    # as a CJK character and would never be removed.
    s = re.sub('[ \u3000]+', ' ', s)
    blocks = ''.join(('\u4E00-\u9FFF',  # CJK UNIFIED IDEOGRAPHS
                      '\u3040-\u309F',  # HIRAGANA
                      '\u30A0-\u30FF',  # KATAKANA
                      '\u3000-\u303F',  # CJK SYMBOLS AND PUNCTUATION
                      '\uFF00-\uFFEF'   # HALFWIDTH AND FULLWIDTH FORMS
                      ))
    basic_latin = '\u0000-\u007F'

    def remove_space_between(cls1, cls2, s):
        """Delete a single space found between a ``cls1`` and a ``cls2`` character."""
        p = re.compile('([{}]) ([{}])'.format(cls1, cls2))
        # One substitution pass consumes the right-hand character of each
        # match, so overlapping cases like "a b c" need repeated passes.
        while p.search(s):
            s = p.sub(r'\1\2', s)
        return s

    s = remove_space_between(blocks, blocks, s)
    s = remove_space_between(blocks, basic_latin, s)
    s = remove_space_between(basic_latin, blocks, s)
    return s
 
def normalize_neologd(s):
    """Normalize text with the (partly modified) mecab-ipadic-NEologd recipe.

    The steps run in a fixed order:
      1. strip surrounding whitespace;
      2. NFKC-normalize fullwidth alphanumerics and halfwidth kana;
      3. unify look-alike hyphens, long-vowel marks and tildes;
      4. widen ASCII / halfwidth punctuation via a translation table;
      5. remove superfluous spaces with ``remove_extra_spaces``;
      6. NFKC-normalize the widened punctuation again, leaving the
         characters that are absent from the class (＝ ・ 「 」) as they are;
      7. turn the curly quotes produced in step 4 into ASCII quotes.

    Args:
        s: Raw text.

    Returns:
        The normalized text.
    """
    text = unicode_normalize('０-９Ａ-Ｚａ-ｚ｡-ﾟ', s.strip())

    # Each family of look-alike characters collapses to one representative.
    # The order matters: a later class may contain characters an earlier
    # step would otherwise have produced or consumed.
    for pattern, unified in (
        ('[˗֊‐‑‒–⁃⁻₋−]+', '-'),    # normalize hyphens
        ('[﹣－ｰ—―─━ー]+', 'ー'),   # normalize choonpus
        ('[~∼∾〜〰～]+', '〜'),     # normalize tildes (modified by Isao Sonobe)
    ):
        text = re.sub(pattern, unified, text)

    # Character-by-character mapping from narrow marks to their wide forms.
    narrow_marks = '!"#$%&\'()*+,-./:;<=>?@[¥]^_`{|}~｡､･｢｣'
    wide_marks = '！”＃＄％＆’（）＊＋，－．／：；＜＝＞？＠［￥］＾＿｀｛｜｝〜。、・「」'
    widen_table = dict(zip(map(ord, narrow_marks), map(ord, wide_marks)))
    text = text.translate(widen_table)

    text = remove_extra_spaces(text)
    text = unicode_normalize('！”＃＄％＆’（）＊＋，－．／：；＜＞？＠［￥］＾＿｀｛｜｝〜', text)  # keep ＝,・,「,」

    # The curly quotes are mapped to ASCII quotes by explicit substitution.
    for pattern, ascii_quote in (('[’]', '\''), ('[”]', '"')):
        text = re.sub(pattern, ascii_quote, text)
    return text

In [None]:
import re
import numpy as np
import pickle
from tqdm import tqdm
 
# Matches one markup tag: "<", any characters other than ">", then ">".
tag_regex = re.compile(r"<[^>]*?>")

# HTML entities decoded by normalize_text, in application order.
# "&amp;" is deliberately last: decoding it first would turn double-escaped
# text such as "&amp;lt;" into "&lt;" and then, wrongly, into "<".
_ENTITY_REPLACEMENTS = (
    ("&quot;", "\""),
    ("&lt;", "<"),
    ("&gt;", ">"),
    ("&nbsp;", " "),
    ("&amp;", "&"),
)


def normalize_text(text):
    """Normalize one text field for use as model input or target.

    Tabs become spaces, the text is normalized with ``normalize_neologd``,
    markup tags are stripped, and a small fixed set of HTML entities is
    decoded exactly once. Tags are stripped before entities are decoded, so
    escaped angle brackets in the text are not mistaken for tags.

    Args:
        text: Raw string.

    Returns:
        The cleaned string.
    """
    text = text.replace("\t", " ")
    text = normalize_neologd(text)
    text = tag_regex.sub("", text)
    for entity, char in _ENTITY_REPLACEMENTS:
        text = text.replace(entity, char)
    return text
 
# Build the training pairs: original Japanese -> Easy Japanese.
#
# T23_train only has the columns ID, #日本語(原文), #やさしい日本語,
# #英語(原文) and #固有名詞. Columns such as 'body', 'title' or
# 'keyword_str_1' do not exist in it and raise KeyError when accessed, so
# the pairs are built from the two columns that the row filter checks.
# NOTE(review): no task prefix is prepended to "text"; add one here if the
# downstream model expects it.
SOURCE_COLUMN = '#日本語(原文)'
TARGET_COLUMN = '#やさしい日本語'

all_data = []
count = 0  # not used in this cell; kept in case another cell reads it

for source_text, target_text in zip(T23_train[SOURCE_COLUMN], T23_train[TARGET_COLUMN]):
    # pd.isna covers None as well as NaN; an identity test against np.nan
    # can miss NaN values that are not the np.nan singleton.
    if pd.isna(source_text) or pd.isna(target_text):
        continue
    # Skip rows where either side is an empty string.
    if not source_text or not target_text:
        continue
    all_data.append({
        "text": normalize_text(source_text),
        "response": normalize_text(target_text),
    })


In [15]:
# Show the column names, non-null counts and dtypes of the training corpus.
T23_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34300 entries, 0 to 34299
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ID        34300 non-null  object
 1   #日本語(原文)  34300 non-null  object
 2   #やさしい日本語  34300 non-null  object
 3   #英語(原文)   34300 non-null  object
 4   #固有名詞     2812 non-null   object
dtypes: object(5)
memory usage: 1.3+ MB
