In [0]:
# Colab bootstrap: mount Google Drive and make the dataset folder the working
# directory, so the relative paths used by later cells ('./raw_data.csv',
# '../vocab/...') resolve against it.
import os

from google.colab import drive

gdrive_root = '/content/gdrive'
data_dir = '/content/gdrive/My Drive/finch/tensorflow1/free_chat/chinese_qingyun/data'

drive.mount(gdrive_root)
os.chdir(data_dir)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
from collections import Counter
from pathlib import Path
from zh_wiki import zh2Hant

import numpy as np
import random
import re

In [0]:
# Clean ./raw_data.csv into '<SEP>'-joined source/target pairs, split them into
# ./train.txt and ./test.txt, and write the character vocabulary to
# ../vocab/char.txt (three special tokens first, then characters by frequency).

TEST_FRACTION = 0.03  # probability that a usable pair is routed to test.txt
MIN_CHAR_FREQ = 5     # characters seen fewer times than this are left out of the vocabulary
SPLIT_SEED = 0        # fixed seed so the train/test split is identical on every re-run
# Any line containing one of these substrings is dropped entirely.
BLOCKED_SUBSTRINGS = ('我的粉丝也不是', 'qq', '菲菲', '飲水得喇')

Path('../vocab').mkdir(exist_ok=True)
char_counter = Counter()
src_lens, tgt_lens = [], []

# Translation table mapping each single-character value of zh2Hant back to its
# single-character key (presumably traditional -> simplified -- confirm against zh_wiki).
# Multi-character entries cannot be expressed in a translate table and are ignored.
trantab = str.maketrans({t: s for s, t in zh2Hant.items() if len(s) == 1 and len(t) == 1})

# Dedicated generator: reproducible split without touching the global `random` state.
split_rng = random.Random(SPLIT_SEED)
skipped_malformed = 0

with open('./raw_data.csv', encoding='utf-8') as f, \
     open('./train.txt', 'w', encoding='utf-8') as f_tr, \
     open('./test.txt', 'w', encoding='utf-8') as f_te:
  for line in f:
    line = line.rstrip().lower()
    # Non-greedy: strip each {...} placeholder on its own. The greedy form
    # deleted everything between the first '{' and the last '}' on the line.
    line = re.sub(r'\{.*?\}', ' ', line)
    line = line.replace('★', ' ')
    line = re.sub(r'\s+', ' ', line)
    # The block list is checked before translation, so it sees the original characters.
    if any(bad in line for bad in BLOCKED_SUBSTRINGS):
      continue
    line = line.translate(trantab)
    parts = line.split(' | ')
    if len(parts) != 2:
      # No separator (e.g. an empty reply stripped down to 'text |') or more
      # than one: skip the line instead of aborting the run with ValueError.
      skipped_malformed += 1
      continue
    src, tgt = parts[0].strip(), parts[1].strip()
    if not src or not tgt:
      continue
    out_file = f_te if split_rng.random() < TEST_FRACTION else f_tr
    out_file.write(src+'<SEP>'+tgt+'\n')
    # Counter.update on a str counts its individual characters.
    char_counter.update(src)
    char_counter.update(tgt)
    src_lens.append(len(src))
    tgt_lens.append(len(tgt))

if skipped_malformed:
  print('Skipped', skipped_malformed, 'lines without exactly one " | " separator')

# Report 0.0 rather than dividing by zero when no pair survived filtering.
print('Source Average Length', sum(src_lens)/len(src_lens) if src_lens else 0.0)
print('Target Average Length', sum(tgt_lens)/len(tgt_lens) if tgt_lens else 0.0)

# Special tokens take the first three ids; the rest follow in descending frequency.
chars = ['<pad>', '<start>', '<end>'] + [char for char, freq in char_counter.most_common() if freq >= MIN_CHAR_FREQ]
print(len(chars), 'Chars')
with open('../vocab/char.txt', 'w', encoding='utf-8') as f:
  for c in chars:
    f.write(c+'\n')

Source Average Length 6.726910849536641
Target Average Length 10.60665363797653
3475 Chars


In [0]:
# Build the character embedding matrix: one row per line of ../vocab/char.txt
# plus one trailing row for unknown characters, filled from the vectors in
# ../vocab/cc.zh.300.vec wherever the token matches a vocabulary entry.
# Rows with no match (including the extra unknown row) stay all-zero.

EMBED_DIM = 300  # components expected after the token on each vector line

# Vocabulary entry -> row index (the 0-based line number in char.txt).
char2idx = {}
with open('../vocab/char.txt', encoding='utf-8') as f:
  for i, line in enumerate(f):
    # Strip only the newline so an entry that is itself whitespace survives.
    char2idx[line.rstrip('\n')] = i

embedding = np.zeros((len(char2idx)+1, EMBED_DIM)) # + 1 for unknown word

found = set()       # vocabulary entries that received a pre-trained vector
bad_rows = 0        # matching tokens whose row did not have EMBED_DIM components
with open('../vocab/cc.zh.300.vec', encoding='utf-8') as f:
  for i, line in enumerate(f):
    if i == 0:
      # The first line is skipped rather than parsed as a vector
      # (presumably the "<count> <dim>" header of the .vec format -- confirm).
      continue
    if i % 100000 == 0:
      print('- At line {}'.format(i))
    sp = line.rstrip().split(' ')
    word, vec = sp[0], sp[1:]
    if word not in char2idx:
      continue
    if len(vec) != EMBED_DIM:
      # A short or long row would otherwise abort the whole pass with a
      # broadcast error on assignment; skip it and report the total below.
      bad_rows += 1
      continue
    embedding[char2idx[word]] = np.asarray(vec, dtype='float32')
    found.add(word)

# Count distinct entries so a token repeated in the vector file is not counted twice.
count = len(found)
if bad_rows:
  print('Skipped %d matching rows without %d components' % (bad_rows, EMBED_DIM))
print("[%d / %d] characters have found pre-trained values"%(count, len(char2idx)))
np.save('../vocab/char.npy', embedding)
print('Saved ../vocab/char.npy')

- At line 100000
- At line 200000
- At line 300000
- At line 400000
- At line 500000
- At line 600000
- At line 700000
- At line 800000
- At line 900000
- At line 1000000
- At line 1100000
- At line 1200000
- At line 1300000
- At line 1400000
- At line 1500000
- At line 1600000
- At line 1700000
- At line 1800000
- At line 1900000
- At line 2000000
[3417 / 3475] characters have found pre-trained values
Saved ../vocab/char.npy
