In [1]:
from module import Example

In [7]:
import os
import glob
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import functools as ft
import operator as op
import unicodedata

In [3]:
# All data files below are addressed relative to this directory.
# The author's original location stays the default; set the KANJI_DATA_DIR
# environment variable to run the notebook from another checkout or machine.
DATA_DIR = os.environ.get(
    'KANJI_DATA_DIR',
    '/Users/sheg/Documents/projects/anki-flashcards/japanese/data',
)
os.chdir(DATA_DIR)

# Загружаем Радикалы

У оригинального файла `japanese-radicals.csv` отсутствовало чтение для радикала 乙, оно добавлено вручную при редактировании этого файла

In [6]:
# Source column position -> short column name. Only these six columns of the
# ';'-separated file are kept; header=0 discards the file's own header row in
# favour of the names given here.
radic_columns = {
    0: 'strokes',
    1: 'radic',
    3: 'en',
    4: 'reading',
    6: 'position',
    8: 'important',
}

radic = pd.read_csv(
    'japanese-radicals.csv',
    sep=';',
    header=0,
    usecols=list(radic_columns),
    names=list(radic_columns.values()),
    dtype={'strokes': 'Int8'},
)

In [7]:
# Turn the 'important' column into a boolean flag: the marker string
# 'Important' becomes True, a missing marker (NaN) becomes False.
# The result is assigned back rather than replaced in place on the
# attribute-accessed column, so the update is not lost when pandas hands out
# a copy of the column (Copy-on-Write).
radic['important'] = radic['important'].replace({np.nan: False, 'Important': True})
radic = radic.reset_index(drop=True)

In [8]:
from lxml import html, etree

In [9]:
from requests import get

In [23]:
# Download the Wikipedia list of radicals. The timeout keeps the cell from
# hanging indefinitely on a stalled connection, and raise_for_status() stops
# here on an HTTP error instead of letting an error page be parsed below.
page = get('https://en.wikipedia.org/wiki/List_of_kanji_radicals_by_stroke_count',
           timeout=30)
page.raise_for_status()

In [24]:
# Compile the selector for the first table inside the article body, parse the
# downloaded markup, and keep the first element the selector returns.
xpath = etree.XPath('//*[@id="mw-content-text"]/div/table[1]')
document = html.fromstring(page.text)
matches = xpath(document)
table = matches[0]

In [25]:
# Serialise the lxml table element back to markup and let pandas parse it,
# using the first column as the index; read_html returns a list of frames,
# of which the first is kept. The name `table` is reused for the DataFrame.
table = pd.read_html( etree.tostring(table), index_col=0 )[0]

In [26]:
# First data column of the Wikipedia table, named 'radic'.
# A Series name lives in the lowercase `.name` attribute; assigning to
# `.Name` only creates an unrelated attribute, so rename() is used instead.
kangxi = table.iloc[:, 0].rename('radic')

In [27]:
# Each cell looks like "radical (variant (variant ...)": drop the trailing ')'
# characters, then split on " (" so the radical and its variants become a list.
# rstrip() takes a plain set of characters, so ')' needs no escaping; the split
# pattern is a regular expression and is written as a raw string to avoid the
# invalid "\(" escape sequence in a normal string literal.
kangxi = kangxi.str.rstrip(')').str.split(r' \(')

In [28]:
def f(x):
    """Build one record from a split cell.

    x is the list produced by the split above: the first item is the
    radical, any further items are its variants. Variants are joined with
    ',' into 'var'; when there are none, 'var' is NaN.
    """
    return {
        'radic': x[0],
        'var': np.nan if len(x) == 1 else ','.join(x[1:]),
    }


kangxi = kangxi.apply(f)

In [29]:
# Expand the Series of {'radic': ..., 'var': ...} dicts into a DataFrame
# with one column per key.
kangxi = pd.DataFrame( list(kangxi) )

In [32]:
def _nfkc_if_text(cell):
    """NFKC-normalise string cells; pass any other value (e.g. NaN) through."""
    if not isinstance(cell, str):
        return cell
    return unicodedata.normalize('NFKC', cell)


# Unicode normalisation of every cell of the table
kangxi = kangxi.applymap(_nfkc_if_text)

In [34]:
# Apply the same NFKC normalisation to the radical column of the CSV table,
# so both tables use the same code points when they are joined on 'radic'.
to_nfkc = ft.partial(unicodedata.normalize, 'NFKC')
radic.radic = radic.radic.apply(to_nfkc)

In [57]:
# Left join on the radical character: every row of the Wikipedia table is
# kept, and the CSV columns are filled in where the radical matches.
# The name `radic` now refers to the merged table.
radic = kangxi.merge(radic, on='radic', how='left')

In [62]:
# Keep only the first row for each radical character.
radic = radic.drop_duplicates(subset='radic')

In [64]:
# Re-index the rows 1..214. Because the range length is hard-coded, this
# assignment fails unless the table holds exactly 214 rows
# (presumably the 214 Kangxi radicals — TODO confirm).
radic.index = range(1, 215)

In [66]:
# Persist the finished radicals table in the working directory.
radic.to_pickle('radic.pkl')

# Кандзи

## Данные KANJIDIC

In [67]:
from lxml import etree

In [68]:
# Parse the KANJIDIC2 XML file and keep its root element.
kanjidic = etree.parse('kanjidic2.xml').getroot()

In [69]:
# Replace the root element with the list of its direct <character> children.
# The name is reused, so this cell cannot be re-run on its own: the second
# time `kanjidic` is a list, which has no .xpath().
kanjidic = kanjidic.xpath('character')

In [70]:
# Output column name -> XPath (relative to one <character> element) of the
# element(s) whose text fills that column. The two reading entries select by
# r_type ('ja_on' / 'ja_kun'); 'en_kd' selects <meaning> elements that carry
# no m_lang attribute (presumably the English meanings — TODO confirm).
xpaths = {
    'kanji': 'literal',
    'radic': 'radical/rad_value',
    'grade': 'misc/grade',
    'strokes': 'misc/stroke_count',
    'freq_kd': 'misc/freq',
    'jlpt': 'misc/jlpt',
    'on_kd': "reading_meaning/rmgroup/reading[@r_type='ja_on']",
    'kun_kd': "reading_meaning/rmgroup/reading[@r_type='ja_kun']",
    'en_kd': 'reading_meaning/rmgroup/meaning[not(@m_lang)]'
}

In [71]:
# Empty frame with one column per extracted field.
df = pd.DataFrame(columns=xpaths.keys())

In [72]:
# One record per <character>: for every output column, the list of text nodes
# selected by that column's XPath (possibly empty, possibly several items).
# The records are collected first and the frame is built once; growing a
# DataFrame row by row copies the whole frame on every iteration, and
# DataFrame.append no longer exists in current pandas.
records = [
    {col: character.xpath(xpath_ + '/text()') for col, xpath_ in xpaths.items()}
    for character in kanjidic
]
df = pd.DataFrame(records, columns=list(xpaths))

In [73]:
# Every 'kanji' cell is a list of text nodes; keep its first item.
first_item = op.itemgetter(0)
df.kanji = df.kanji.apply(first_item)

In [74]:
def _sole_item_or_nan(items):
    """Return the only item of a one-element list, NaN for any other length."""
    if len(items) != 1:
        return np.nan
    return items[0]


df.grade = df.grade.apply(_sole_item_or_nan)

In [75]:
# The grade values come from XML text nodes; cast them to float
# (missing grades stay NaN).
df.grade = df.grade.astype(float)

In [76]:
# Primary-school (kyoiku) grade: keep the grade when it is 1-6, otherwise
# leave the value missing. Works directly on the 'grade' column instead of
# calling a Python function on every row of the whole frame.
df['kyoiku'] = (
    df['grade']
    .where(df['grade'].isin([1, 2, 3, 4, 5, 6]))
    .astype('Int8')
    .astype('category')
)

In [77]:
# Kanji for everyday use (joyo); kanji outside this list are usually
# annotated with furigana. Flag is True when the grade is 1-6 or 8.
# isin() tests the column as a whole; a missing grade yields False.
df['joyo'] = df['grade'].isin([1, 2, 3, 4, 5, 6, 8])

In [78]:
# Kanji allowed in personal names (jinmeiyo): True when the grade is
# 1-6, 8, 9 or 10. isin() tests the column as a whole; a missing grade
# yields False.
df['jinmeiyo'] = df['grade'].isin([1, 2, 3, 4, 5, 6, 8, 9, 10])

In [79]:
# No longer needed: kyoiku / joyo / jinmeiyo were derived from it above
del df['grade']

In [80]:
# A cell is a list of text nodes: take the value when there is exactly one,
# otherwise NaN; then go through float to the nullable Int32 dtype.
freq_raw = df.freq_kd.apply(lambda items: np.nan if len(items) != 1 else items[0])
df.freq_kd = freq_raw.astype(float).astype('Int32')

In [81]:
# A cell is a list of text nodes: take the value when there is exactly one,
# otherwise NaN; then go through float to the nullable Int8 dtype.
jlpt_raw = df.jlpt.apply(lambda items: np.nan if len(items) != 1 else items[0])
df.jlpt = jlpt_raw.astype(float).astype('Int8')

In [87]:
# Convert every item of each radical-number list from text to int.
df.radic = df.radic.apply(lambda values: list(map(int, values)))

In [88]:
# Convert every item of each stroke-count list from text to int.
df.strokes = df.strokes.apply(lambda values: list(map(int, values)))

In [95]:
# Rename the table: from here on it is `kanji`, and the name `df` is freed
# for the next data source.
kanji = df
del df

In [96]:
# Preview the first rows of the KANJIDIC-derived table.
kanji.head()

Unnamed: 0,kanji,radic,strokes,freq_kd,jlpt,on_kd,kun_kd,en_kd,kyoiku,joyo,jinmeiyo
0,亜,"[7, 1]",[7],1509.0,1.0,[ア],[つ.ぐ],"[Asia, rank next, come after, -ous]",,True,True
1,唖,[30],[10],,,"[ア, アク]",[おし],"[mute, dumb]",,False,False
2,娃,[38],[9],,,"[ア, アイ, ワ]",[うつく.しい],[beautiful],,False,True
3,阿,[170],[8],1126.0,1.0,"[ア, オ]","[おもね.る, くま]","[Africa, flatter, fawn upon, corner, nook, rec...",,False,True
4,哀,"[30, 8]",[9],1715.0,1.0,[アイ],"[あわ.れ, あわ.れむ, かな.しい]","[pathetic, grief, sorrow, pathos, pity, sympat...",,True,True


## Добавим переводы кандзи на русский, взятые с сайта [nippon.temerov.org](http://nippon.temerov.org)

In [97]:
# Translation files: every file in kyoiku/ whose name matches *.*sv
# (e.g. .tsv or .csv), followed by joyo.tsv.
temerov_files = glob.glob('kyoiku/*.*sv') + ['joyo.tsv']

In [98]:
# Empty frame that fixes the two column names used for every translation file.
df = pd.DataFrame(columns=['kanji', 'ru_tm'])

In [99]:
# Read the first two tab-separated columns of every translation file under
# the column names of `df`, then stack all of them below `df` in one step.
# A single concat replaces the per-file DataFrame.append, which copied the
# accumulated frame on every iteration and no longer exists in current pandas.
frames = []
for file in temerov_files:
    tmp = pd.read_csv(file, delimiter='\t',
                      usecols=[0, 1], header=0,
                      names=df.columns)
    frames.append(tmp)

df = pd.concat([df] + frames, ignore_index=True)

In [100]:
# Outer join on the kanji character: rows present in only one of the two
# tables are kept, with the other table's columns left missing.
kanji = kanji.merge(df, on='kanji', how='outer')

In [101]:
# Drop the temporaries of the translation-loading step.
del df, tmp

In [102]:
# Preview the table after adding the Russian translations (ru_tm).
kanji.head()

Unnamed: 0,kanji,radic,strokes,freq_kd,jlpt,on_kd,kun_kd,en_kd,kyoiku,joyo,jinmeiyo,ru_tm
0,亜,"[7, 1]",[7],1509.0,1.0,[ア],[つ.ぐ],"[Asia, rank next, come after, -ous]",,True,True,Азия
1,唖,[30],[10],,,"[ア, アク]",[おし],"[mute, dumb]",,False,False,
2,娃,[38],[9],,,"[ア, アイ, ワ]",[うつく.しい],[beautiful],,False,True,
3,阿,[170],[8],1126.0,1.0,"[ア, オ]","[おもね.る, くま]","[Africa, flatter, fawn upon, corner, nook, rec...",,False,True,
4,哀,"[30, 8]",[9],1715.0,1.0,[アイ],"[あわ.れ, あわ.れむ, かな.しい]","[pathetic, grief, sorrow, pathos, pity, sympat...",,True,True,печальный


## Добавим переводы на английский, он, кун и примеры из проекта *KanjiAlive*

In [103]:
def parse_ka_examples(s):
    """Parse one 'examples' cell into a list of Example objects.

    Parameters
    ----------
    s : str
        Text of a Python literal: a sequence of (jp, en) pairs, where jp is
        the written form followed by its reading in full-width parentheses,
        e.g. 'word（reading）'. A missing cell (None or NaN) is accepted too.

    Returns
    -------
    list
        One Example(kanji=..., kana=..., en=...) per pair, in input order;
        an empty list for a missing cell.

    Raises
    ------
    ValueError, SyntaxError
        If the text is not a plain Python literal, or a pair does not have
        the expected shape.
    """
    # Local import: `ast` is needed only here and is not among the
    # notebook's top-level imports.
    import ast

    # A missing cell read by pandas arrives as NaN (a float), not as text.
    if s is None or (isinstance(s, float) and s != s):
        return []

    # The text comes from a data file, so it is parsed as a literal only;
    # unlike eval(), literal_eval() never executes code found in the input.
    pairs = ast.literal_eval(s)
    examples = []

    for jp, en in pairs:
        # Split on the last opening full-width parenthesis, so the written
        # form may itself contain one.
        kanji, kana = jp.rsplit('（', 1)
        kana = kana.rstrip('）')

        examples.append(Example(kanji=kanji, kana=kana, en=en))

    return examples

In [104]:
# Source column position -> short column name; only these five columns of
# the KanjiAlive export are read, and the file's own header row is replaced.
ka_columns = {
    0: 'kanji',
    3: 'en_ka',
    5: 'kun_ka',
    7: 'on_ka',
    9: 'examples_ka',
}

df = pd.read_csv(
    'ka_data.csv',
    header=0,
    usecols=list(ka_columns),
    names=list(ka_columns.values()),
)

In [105]:
# Turn the textual examples of every row into a list of Example objects.
df.examples_ka = df.examples_ka.apply(parse_ka_examples)

In [106]:
# Kun and on readings are stored as one '、'-separated string per kanji;
# split each of the two columns into lists.
for reading_col in ('kun_ka', 'on_ka'):
    df[reading_col] = df[reading_col].str.split('、')

In [107]:
# Outer join on the kanji character, adding the KanjiAlive columns
# (en_ka, kun_ka, on_ka, examples_ka); unmatched rows from either side are kept.
kanji = kanji.merge(df, on='kanji', how='outer')

In [108]:
# Preview the table after adding the KanjiAlive columns.
kanji.head()

Unnamed: 0,kanji,radic,strokes,freq_kd,jlpt,on_kd,kun_kd,en_kd,kyoiku,joyo,jinmeiyo,ru_tm,en_ka,kun_ka,on_ka,examples_ka
0,亜,"[7, 1]",[7],1509.0,1.0,[ア],[つ.ぐ],"[Asia, rank next, come after, -ous]",,True,True,Азия,,,,
1,唖,[30],[10],,,"[ア, アク]",[おし],"[mute, dumb]",,False,False,,,,,
2,娃,[38],[9],,,"[ア, アイ, ワ]",[うつく.しい],[beautiful],,False,True,,,,,
3,阿,[170],[8],1126.0,1.0,"[ア, オ]","[おもね.る, くま]","[Africa, flatter, fawn upon, corner, nook, rec...",,False,True,,,,,
4,哀,"[30, 8]",[9],1715.0,1.0,[アイ],"[あわ.れ, あわ.れむ, かな.しい]","[pathetic, grief, sorrow, pathos, pity, sympat...",,True,True,печальный,,,,


In [109]:
# Checkpoint the combined table on disk and free the in-memory copy;
# the next section reloads it from the pickle.
kanji.to_pickle('kanji_full.pkl')
del kanji

## Последние штрихи

In [110]:
# Reload the checkpoint written above (a pickle produced by this notebook).
kanji_full = pd.read_pickle('kanji_full.pkl')

### Темеров

Приведём в порядок, для начала разрежем переводы-строки Темерова на списки

In [111]:
# A dot occurs in only two translations, so it is left untouched.
# Rows without a translation count as "no dot".
ru_has_dot = kanji_full['ru_tm'].str.contains('[.]', na=False)
kanji_full[ru_has_dot]

Unnamed: 0,kanji,radic,strokes,freq_kd,jlpt,on_kd,kun_kd,en_kd,kyoiku,joyo,jinmeiyo,ru_tm,en_ka,kun_ka,on_ka,examples_ka
23,宛,[40],[8],,,[エン],"[あ.てる, -あて, -づつ, あたか.も]","[address, just like, fortunately]",,True,True,адресовано ...,,,,
57,畏,[102],[9],2389.0,,[イ],"[おそ.れる, かしこま.る, かしこ, かしこ.し]","[fear, majestic, graciously, be apprehensive]",,True,True,искренне Ваш ...,,,,


In [112]:
# Commas separate the individual senses: split on commas, which also wraps
# single-sense translations into one-element lists
kanji_full.ru_tm = kanji_full.ru_tm.str.split(',')

### Разрежем переводы KanjiAlive

In [113]:
# Likewise, dots occur only inside an ellipsis (a single card), while commas
# delimit the senses. Rows without a KanjiAlive translation count as "no dot".
en_has_dot = kanji_full['en_ka'].str.contains('[.]', na=False)
kanji_full[en_has_dot]

Unnamed: 0,kanji,radic,strokes,freq_kd,jlpt,on_kd,kun_kd,en_kd,kyoiku,joyo,jinmeiyo,ru_tm,en_ka,kun_ka,on_ka,examples_ka
41,以,[9],[5],126,3,[イ],[もっ.て],"[by means of, because, in view of, compared with]",4,True,True,[от],"to the ... of, by means of","[も, もって]",[イ],"[Example(kanji='以上', kana='いじょう', en='more tha..."


In [114]:
# Split the KanjiAlive translations on commas into lists of senses.
kanji_full.en_ka = kanji_full.en_ka.str.split(',')

### Заменим пустые списки `[]` в колонках на `np.nan`

In [115]:
# Replace every empty list cell with NaN, column by column.
# The assignment goes through .loc on the frame itself: the chained form
# frame[col][mask] = value may write to a temporary copy, which is what
# pandas' SettingWithCopyWarning is about.
for col in kanji_full.columns:
    # Plain boolean mask built by iteration, so it contains no missing values
    # whatever the column dtype is.
    is_empty_list = np.array(
        [isinstance(cell, list) and len(cell) == 0 for cell in kanji_full[col]],
        dtype=bool,
    )
    # Columns without empty lists are left completely untouched.
    if is_empty_list.any():
        kanji_full.loc[is_empty_list, col] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Заменим float NaN на np.nan

In [116]:
# NOTE(review): this fills missing values with np.nan in place. Presumably the
# intent is that every missing cell holds the np.nan object itself, for the
# later `is not np.nan` checks — TODO confirm that it has this effect.
kanji_full.fillna(np.nan, inplace=True)

### Нормализуем символы

In [12]:
# NFKC-normalise the kanji characters themselves.
to_nfkc = ft.partial(unicodedata.normalize, 'NFKC')
kanji_full.kanji = kanji_full.kanji.apply(to_nfkc)

In [13]:
# Overwrite the checkpoint with the cleaned-up table.
kanji_full.to_pickle('kanji_full.pkl')

# Выберем нужное подмножество данных

In [14]:
# Load the cleaned-up checkpoint (a pickle produced by this notebook).
kanji_full = pd.read_pickle('kanji_full.pkl')

In [15]:
def _is_missing(value):
    """Return True for None or for a float NaN, whichever object carries it."""
    return value is None or (isinstance(value, float) and value != value)


# Output column -> how to obtain it from a row of kanji_full:
#   * a string names the source column to copy as is;
#   * a callable receives the row and returns the cell value.
# 'on' and 'kun' take the KanjiAlive reading when the row has one and fall
# back to the KANJIDIC reading otherwise. Missing values are detected by value:
# an identity test against np.nan is true only for that one object and misses
# NaNs created elsewhere (e.g. floats restored from a pickle).
get_rules = {
    'kanji': 'kanji',
    'radic': 'radic',
    'strokes': 'strokes',
    'freq': 'freq_kd',
    'jlpt': 'jlpt',
    'on': lambda x: x.on_kd if _is_missing(x.on_ka) else x.on_ka,
    'kun': lambda x: x.kun_kd if _is_missing(x.kun_ka) else x.kun_ka,
    'en_kd': 'en_kd',
    'en_ka': 'en_ka',
    'ru': 'ru_tm',
    'examples': 'examples_ka',
    'kyoiku': 'kyoiku',
    'joyo': 'joyo',
    'jinmeiyo': 'jinmeiyo'
}

In [16]:
# Assemble the final table column by column, in the order of get_rules:
# a callable rule is evaluated on every row of kanji_full, any other rule is
# the name of the source column to copy.
kanji = pd.DataFrame()

for col_name, rule in get_rules.items():
    if callable(rule):
        kanji[col_name] = kanji_full.apply(rule, axis=1)
    else:
        kanji[col_name] = kanji_full[rule]

In [17]:
# Order the rows by the 'freq' column, ascending.
kanji = kanji.sort_values('freq')

In [18]:
# Persist the selected subset in the working directory.
kanji.to_pickle('kanji.pkl')