In [1]:
import os
import re
import pickle
import zipfile
import tarfile
import collections
import numpy as np
from tqdm import tqdm
from opencc import OpenCC

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as Func
import torch.optim as optim
from torch.utils.data.dataset import Dataset
from torch.nn.utils.rnn import pad_sequence
from torch.autograd import Variable

In [3]:
# --- Unpack the raw archives -------------------------------------------------
file_path = './data/couplet.tar.gz'
# NOTE(review): extractall() trusts the member paths stored in the archive.
# Only run this on archives from a trusted source.
with tarfile.open(file_path, 'r:gz') as tar:  # context manager closes the handle
    tar.extractall('./data/')

with zipfile.ZipFile('./data/CWE02.zip', 'r') as zip_ref:
    zip_ref.extractall('./data/')

# Number of characters each half of a couplet must have to be kept.
COUPLET_LEN = 7


def _read_lines(path):
    """Return all lines of the text file at ``path`` (trailing newlines kept).

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform's default encoding.
    """
    with open(path, encoding='utf-8') as f:
        return f.readlines()


def _normalize(lines, converter):
    """Remove the space separators from every line, then run ``converter.convert`` on it."""
    return [converter.convert(line.replace(' ', '')) for line in lines]


def _aligned_pairs(first_lines, second_lines, line_pattern, length):
    """Keep only the couplets whose two halves both pass the filter.

    A couplet is kept when both halves match ``line_pattern`` and both matched
    strings have exactly ``length`` characters. The two halves are always kept
    or dropped together, so index ``i`` of the first returned list corresponds
    to index ``i`` of the second one.

    Returns:
        (kept_first, kept_second): two lists of equal length.
    """
    kept_first, kept_second = [], []
    for first, second in zip(first_lines, second_lines):
        first_match = line_pattern.search(first)
        second_match = line_pattern.search(second)
        if first_match is None or second_match is None:
            # At least one half contains a character outside the allowed set.
            continue
        # group(0) excludes the trailing newline because `$` matches before it.
        first_text = first_match.group(0)
        second_text = second_match.group(0)
        if len(first_text) == length and len(second_text) == length:
            kept_first.append(first_text)
            kept_second.append(second_text)
    return kept_first, kept_second


# --- Load and normalise the raw text ----------------------------------------
cc = OpenCC('s2tw')
train_in = _normalize(_read_lines('./data/couplet/train/in.txt'), cc)
train_out = _normalize(_read_lines('./data/couplet/train/out.txt'), cc)
test_in = _normalize(_read_lines('./data/couplet/test/in.txt'), cc)
test_out = _normalize(_read_lines('./data/couplet/test/out.txt'), cc)

# --- Keep only clean couplets of COUPLET_LEN characters ----------------------
# A line qualifies only if it consists entirely of characters in this class.
pattern = re.compile('^[\u4e00-\u9fa5_a-zA-Z0-9]+$')
train_in_7words, train_out_7words = _aligned_pairs(train_in, train_out, pattern, COUPLET_LEN)
test_in_7words, test_out_7words = _aligned_pairs(test_in, test_out, pattern, COUPLET_LEN)

# --- Cache the filtered data -------------------------------------------------
for _payload, _path in (
    (train_in_7words, './data/train_in_7words'),
    (train_out_7words, './data/train_out_7words'),
    (test_in_7words, './data/test_in_7words'),
    (test_out_7words, './data/test_out_7words'),
):
    with open(_path, 'wb') as f:
        pickle.dump(_payload, f)

In [4]:
# Reload the four cached sentence lists written by the preprocessing cell.
# NOTE: pickle.load must only be used on files this notebook produced itself.
cached_paths = (
    './data/train_in_7words',
    './data/train_out_7words',
    './data/test_in_7words',
    './data/test_out_7words',
)

restored = []
for cached_path in cached_paths:
    with open(cached_path, 'rb') as f:
        restored.append(pickle.load(f))

# Unpack in the same order as `cached_paths`.
train_in_7words, train_out_7words, test_in_7words, test_out_7words = restored

In [61]:
class words_dict():
    """Token <-> id vocabulary with frequency counts and fixed-length encoding.

    Three reserved tokens always exist: ``_sos_`` (id 0), ``_eos_`` (id 1) and
    ``_unk_`` (id 2). Other tokens receive ids in the order they are first
    seen by :meth:`add_word`.

    Attributes:
        word_count: token -> number of times it was passed to ``add_word``.
        id_to_word: id -> token.
        word_to_id: token -> id.
        n_words: current vocabulary size, reserved tokens included.
        remain_id: unused by this class; kept for compatibility.
        max_len: length of every sequence produced by ``predict``.
    """

    # Reserved tokens, listed in id order.
    SPECIAL_TOKENS = ('_sos_', '_eos_', '_unk_')
    SOS_ID = 0
    EOS_ID = 1
    UNK_ID = 2

    def __init__(self):
        self.word_count = collections.defaultdict(int)
        self.id_to_word = {0: '_sos_', 1: '_eos_', 2: '_unk_'}
        self.word_to_id = {'_sos_': 0, '_eos_': 1, '_unk_': 2}
        self.n_words = 3
        self.remain_id = []
        self.max_len = 9

    def add_word(self, tokens):
        """Register every token in ``tokens`` and bump its frequency count.

        Args:
            tokens: iterable of tokens (a plain string is treated as a
                sequence of single-character tokens).
        """
        for token in tokens:
            # Membership test rather than truthiness of .get(): id 0 is a
            # valid id and must not be mistaken for "missing".
            if token not in self.word_to_id:
                self.word_to_id[token] = self.n_words
                self.id_to_word[self.n_words] = token
                self.n_words += 1
            self.word_count[token] += 1

    def predict(self, tokens):
        """Encode ``tokens`` as a list of exactly ``max_len`` ids.

        The result starts with the ``_sos_`` id, followed by at most
        ``max_len - 2`` token ids (unknown tokens become the ``_unk_`` id),
        and is padded to ``max_len`` with the ``_eos_`` id, so the final
        position always holds ``_eos_``. Extra tokens are truncated.

        Args:
            tokens: iterable of tokens.

        Returns:
            list[int] of length ``max_len``.
        """
        capacity = self.max_len - 2  # slots left between _sos_ and the final _eos_
        t_sen = [self.SOS_ID] + (self.max_len - 1) * [self.EOS_ID]
        for idx, token in enumerate(tokens):
            if idx >= capacity:
                break
            t_sen[idx + 1] = self.word_to_id.get(token, self.UNK_ID)
        return t_sen

    def sort_dict(self, drop_ratio=0.8):
        """Drop the least frequent tokens and renumber the rest densely.

        ``int(n_words * drop_ratio)`` of the least frequent non-reserved tokens
        are removed (ties are broken by first-seen order). The surviving
        tokens are then renumbered ``0 .. n_words - 1`` in their original
        relative order, which keeps the reserved tokens at ids 0, 1 and 2.
        ``id_to_word`` is rebuilt from scratch so that no id from the old
        numbering survives, and dropped tokens are removed from
        ``word_count`` so the method can safely be called again.

        Args:
            drop_ratio: fraction of the current vocabulary size to remove;
                must be within ``[0, 1]``. Defaults to 0.8.

        Raises:
            ValueError: if ``drop_ratio`` is outside ``[0, 1]``.
        """
        if not 0 <= drop_ratio <= 1:
            raise ValueError(f'drop_ratio must be within [0, 1], got {drop_ratio}')

        n_drop = int(self.n_words * drop_ratio)
        # Reserved tokens are never candidates for removal.
        candidates = [
            (word, count)
            for word, count in self.word_count.items()
            if word not in self.SPECIAL_TOKENS
        ]
        candidates.sort(key=lambda item: item[1])  # stable: ties keep insertion order
        for word, _ in candidates[:n_drop]:
            self.word_to_id.pop(word, None)
            del self.word_count[word]

        kept_words = list(self.word_to_id)
        print(f'Word count after reduce: {len(kept_words)}')

        # Rebuild both directions together so they can never disagree.
        self.word_to_id = {word: new_id for new_id, word in enumerate(kept_words)}
        self.id_to_word = {new_id: word for new_id, word in enumerate(kept_words)}
        self.n_words = len(kept_words)
        return

In [62]:
def transform(data, vocab=None):
    """Encode every sentence in ``data`` and stack the results into one tensor.

    Args:
        data: iterable of sentences; each one is passed to ``vocab.predict``.
        vocab: object exposing ``predict(sentence) -> list[int]``. When
            omitted, the notebook-level ``dictionary`` is looked up at call
            time, which is the original behaviour.

    Returns:
        A ``torch.long`` tensor with one row per sentence. All rows must have
        the same length for the tensor to be built.
    """
    encoder = dictionary if vocab is None else vocab
    encoded = [encoder.predict(sent) for sent in data]
    # Explicit dtype keeps an empty input integer-typed instead of float.
    return torch.tensor(encoded, dtype=torch.long)

In [64]:
# Build the vocabulary from every sentence of all four splits, in this order:
# train inputs, train outputs, test inputs, test outputs.
dictionary = words_dict()

for corpus in (train_in_7words, train_out_7words, test_in_7words, test_out_7words):
    for sentence in corpus:
        # add_word iterates its argument, so a whole sentence registers
        # its characters one by one.
        dictionary.add_word(sentence)

print(f'Total {len(dictionary.word_to_id)} words')

# Prune rare tokens, then persist the reduced vocabulary.
dictionary.sort_dict()
with open('data/train_dict_cut.pkl', 'wb') as dict_file:
    pickle.dump(dictionary, dict_file, protocol=pickle.HIGHEST_PROTOCOL)

# Encode each split as an id tensor using the reduced vocabulary.
train_in_7words_id, train_out_7words_id, test_in_7words_id, test_out_7words_id = [
    transform(split)
    for split in (train_in_7words, train_out_7words, test_in_7words, test_out_7words)
]

# Save the encoded tensors next to the raw data.
for encoded, target_path in (
    (train_in_7words_id, './data/train_in_7words_id.trc'),
    (train_out_7words_id, './data/train_out_7words_id.trc'),
    (test_in_7words_id, './data/test_in_7words_id.trc'),
    (test_out_7words_id, './data/test_out_7words_id.trc'),
):
    torch.save(encoded, target_path)

Total 7285 words
Word count after reduce: 1457


In [66]:
# Spot-check: display the token that the vocabulary currently maps id 1235 to.
dictionary.id_to_word[1235]

'避'

In [9]:
# Width of every vector in the GloVe file and of the special-token vectors.
EMB_DIM = 300

# token -> float32 vector of length EMB_DIM
embeddings_dict = {}
# Decode explicitly as UTF-8 rather than with the platform default.
# NOTE(review): assumes the .vec file is UTF-8 encoded — confirm.
with open("./data/Glove_CNA_ASBC_300d.vec", 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        # Skip anything that is not "<token> <EMB_DIM floats>", e.g. an
        # optional "<count> <dim>" header line or a blank line.
        if len(values) != EMB_DIM + 1:
            continue
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

# Special tokens get uniform random vectors in [0, 1). A fixed seed makes the
# cell reproducible, and float32 matches the dtype of the loaded vectors.
_special_rng = np.random.RandomState(0)
for _special in ('_sos_', '_eos_', '_unk_'):
    embeddings_dict[_special] = _special_rng.rand(EMB_DIM).astype("float32")

with open('./data/emb_glove_300.pkl', 'wb') as f:
    pickle.dump(embeddings_dict, f)

In [67]:
# id -> embedding vector for every token in the vocabulary.
# Iterate `word_to_id` rather than `id_to_word`: sort_dict() renumbers the
# values of word_to_id densely, whereas id_to_word may still hold entries
# under ids from the old numbering, which would add bogus rows here.
unk_vector = embeddings_dict['_unk_']  # fallback for tokens without a GloVe vector
mapping_dict = {
    idx: embeddings_dict.get(word, unk_vector)
    for word, idx in dictionary.word_to_id.items()
}

In [68]:
# Persist the id -> embedding table with the newest pickle protocol available.
with open('data/glove_id_to_emb.pkl', 'wb') as out_file:
    pickle.dump(mapping_dict, out_file, protocol=pickle.HIGHEST_PROTOCOL)