In [2]:
# Switch the working directory to the `spell_correction` directory two levels up.
# NOTE: the path is relative, so re-running this cell without restarting the
# kernel resolves it against the already-changed directory.
# Presumably this is what makes the `spell_correction` package importable
# in the following cell — TODO confirm.
%cd ../../spell_correction

/Users/tyoyo/lab/spell_correction


In [13]:
from spell_correction import MeCabTokenizer
from smart_open import open
import pandas as pd

In [16]:
# Stream the JWTD v1.0 training TSV over HTTPS (`open` is smart_open's here)
# and parse it with pandas. The file is read without a header row, so the
# columns get the integer labels 0, 1, ...
with open("https://storage.googleapis.com/tyoyo/jwtd/v1.0/train.tsv") as train_tsv:
    df = pd.read_csv(train_tsv, header=None, sep="\t")

In [20]:
# Tokenizer imported from the project's `spell_correction` package.
tokenizer = MeCabTokenizer()

# Tokenize every entry of column 0, giving one token sequence per row.
# NOTE(review): the `src_`/`tgt_` naming suggests column 0 is the source side
# and column 1 the target side of each pair — confirm against the dataset.
src_tokens = df[0].apply(tokenizer.tokenize)

In [22]:
# Sequence length (number of tokens) of each column-0 entry.
src_lens = src_tokens.map(len)

In [26]:
# Summary statistics (count, mean, std, min, quartiles, max) of the
# column-0 token counts.
src_lens.describe()

count    500071.000000
mean         35.509662
std          20.428800
min           1.000000
25%          21.000000
50%          31.000000
75%          46.000000
max         155.000000
Name: 0, dtype: float64

In [29]:
# Fraction of column-0 sequences that have at most 100 tokens.
src_lens.le(100).mean()

0.9895334862449532

In [30]:
# Fraction of column-0 sequences that have at most 64 tokens.
src_lens.le(64).mean()

0.9062313151532483

In [31]:
# Repeat the length analysis for column 1: tokenize each entry, count the
# tokens per entry, and display the summary statistics of those counts.
tgt_tokens = df[1].apply(tokenizer.tokenize)
tgt_lens = tgt_tokens.map(len)
tgt_lens.describe()

count    500071.000000
mean         35.384329
std          20.411678
min           1.000000
25%          21.000000
50%          31.000000
75%          46.000000
max         155.000000
Name: 1, dtype: float64

In [32]:
# Fractions of column-1 sequences with at most 100 and at most 64 tokens,
# printed one per line.
print(tgt_lens.le(100).mean(), tgt_lens.le(64).mean(), sep="\n")

0.9896214737507274
0.9069352151994416


## 結論
最大系列長(トークン数)を64にすると、約9%の文章が入りきらない(収まるのは約91%)。
最大系列長を100にすれば約99%の文章が収まるので、最大系列長は100が妥当。

In [35]:
from collections import Counter

In [36]:
# Token-frequency counter: maps a token's text to its number of occurrences.
counter = Counter()

In [39]:
from itertools import chain

# Count how often each token text occurs across both the column-0 and the
# column-1 token sequences.
# The counter is rebuilt from scratch rather than updated in place, so
# re-running this cell yields the same counts instead of adding every token
# a second time. A generator feeds the Counter directly, which also avoids
# materialising a temporary list per sentence.
counter = Counter(
    token.text
    for tokens in chain(src_tokens, tgt_tokens)
    for token in tokens
)


In [40]:
# All (token text, count) pairs, ordered from most to least frequent.
freqs = counter.most_common()

In [46]:
# Number of distinct token texts seen, i.e. the full vocabulary size.
len(freqs)

292673

In [42]:
# The ten most frequent tokens with their counts.
freqs[:10]

[('の', 1756226),
 ('、', 1607767),
 ('に', 1145275),
 ('を', 1038435),
 ('。', 942591),
 ('は', 896739),
 ('た', 888769),
 ('が', 883025),
 ('で', 763508),
 ('て', 692960)]

In [41]:
# Tokens at 0-based frequency ranks 10,000-10,009: shows how often a token
# occurs at the cut-off of a 10k-entry vocabulary.
freqs[10000:10010]

[('しも', 241),
 ('ぶつけ', 241),
 ('江戸川', 241),
 ('禅', 241),
 ('務', 241),
 ('任官', 241),
 ('よっ', 241),
 ('テナント', 241),
 ('絶た', 241),
 ('キラ', 241)]

In [44]:
# Tokens at 0-based frequency ranks 20,000-20,009 (cut-off of a 20k vocabulary).
freqs[20000:20010]

[('Air', 88),
 ('倣い', 88),
 ('ねずみ', 88),
 ('ワンセグ', 88),
 ('貧', 88),
 ('カイジ', 88),
 ('噴水', 88),
 ('Type', 88),
 ('皆様', 88),
 ('無口', 88)]

In [43]:
# Tokens at 0-based frequency ranks 30,000-30,009 (cut-off of a 30k vocabulary).
freqs[30000:30010]

[('保科', 46),
 ('選ら', 46),
 ('印可', 46),
 ('INOJO', 46),
 ('相乗', 45),
 ('ョ', 45),
 ('容易く', 45),
 ('公邸', 45),
 ('ボラ', 45),
 ('幼馴染み', 45)]

## 語彙数を上位3万語程度に制限すると、誤字由来のトークン(例: 頻度46の「選ら」)が語彙外(OOV)になってしまう。

In [47]:
# Tokens at 0-based frequency ranks 50,000-50,009 (cut-off of a 50k vocabulary).
freqs[50000:50010]

[('ヒエラルキー', 18),
 ('鉄山', 18),
 ('笛吹川', 18),
 ('松之助', 18),
 ('イーゴリ', 18),
 ('水防', 18),
 ('BEGIN', 18),
 ('尾関', 18),
 ('達彦', 18),
 ('あそび', 18)]

In [48]:
# Tokens at 0-based frequency ranks 100,000-100,009 (cut-off of a 100k vocabulary).
freqs[100000:100010]

[('みさき公園', 6),
 ('ガルビオン', 6),
 ('かたく', 6),
 ('押しやる', 6),
 ('おびき出し', 6),
 ('Flag', 6),
 ('CECH', 6),
 ('バイカンフー', 6),
 ('経と', 6),
 ('無鉛', 6)]