In [None]:
from tqdm import tqdm

import tensorflow as tf
import numpy as np
import pandas as pd
import itertools
import pprint
import pickle

# Load the labelled pairs and print the mean of the 'label' column and the row count.
ori_train_csv = pd.read_csv('./data/train.csv')
print(ori_train_csv['label'].mean())
print(len(ori_train_csv))

# Hold out the last 10% of rows (file order, no shuffling) as the validation split.
thres = int(len(ori_train_csv) * 0.9)
train_csv, val_csv = ori_train_csv.iloc[:thres], ori_train_csv.iloc[thres:]
print(len(train_csv), len(val_csv))
print(train_csv['label'].mean(), val_csv['label'].mean())

# The test pairs are loaded as-is.
test_csv = pd.read_csv('./data/test.csv')

In [None]:
def glance(d, n=1):
    """Return a new dict with the first ``n`` items of ``d``.

    Items are taken in the mapping's iteration order; if ``d`` holds fewer
    than ``n`` items, all of them are returned.

    Args:
        d: Mapping to sample from (anything exposing ``.items()``).
        n: Number of leading items to keep. Defaults to 1.

    Returns:
        dict: The first ``n`` key/value pairs of ``d``.
    """
    # Honour ``n``: the slice length used to be hard-coded to 1.
    return dict(itertools.islice(d.items(), n))


def fn(path):
    """Parse a question file into token lookups and per-question lengths.

    The first line of the file is skipped as a header. Every following
    non-blank line must hold exactly three comma-separated fields: a question
    id, a whitespace-separated word-token string and a whitespace-separated
    char-token string.

    Args:
        path: Path of the text file to read.

    Returns:
        tuple: ``(q2w, q2c, w_lens, c_lens)`` where ``q2w`` / ``q2c`` map each
        question id to its list of word / char tokens, and ``w_lens`` /
        ``c_lens`` list the token counts in file order.

    Raises:
        ValueError: If a non-blank data line does not split into exactly
            three comma-separated fields.
    """
    _q2w, _q2c = {}, {}
    _w_lens, _c_lens = [], []

    with open(path) as f:
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(f, None)
        for line in f:
            # A blank line (e.g. a trailing newline at EOF) carries no record
            # and would otherwise break the three-way unpack below.
            if not line.strip():
                continue

            qid, words, chars = line.split(',')

            words_sp = words.split()
            chars_sp = chars.split()  # split() also drops the trailing newline

            _q2w[qid] = words_sp
            _q2c[qid] = chars_sp

            _w_lens.append(len(words_sp))
            _c_lens.append(len(chars_sp))

    return _q2w, _q2c, _w_lens, _c_lens


# Parse the raw question file into token lookups and per-question token counts.
_q2w, _q2c, _w_lens, _c_lens = fn('./data/question.csv')

# Report the average and maximum number of word / char tokens per question.
print("Avg Word Len: %.1f | Max Word Len: %d" % (
    sum(_w_lens) / len(_w_lens),
    max(_w_lens),
))
print("Avg Char Len: %.1f | Max Char Len: %d" % (
    sum(_c_lens) / len(_c_lens),
    max(_c_lens),
))

# Both lookups are filled from the same lines, so their sizes must agree.
assert len(_q2w) == len(_q2c), "len(q2w): %d, len(q2c): %d" % (
    len(_q2w),
    len(_q2c),
)

# Show one entry of each lookup.
pprint.pprint(glance(_q2w))
pprint.pprint(glance(_q2c))

In [None]:
# Remove every occurrence of 'W0', 'L0', 'W' and 'L' (in that order) from the
# raw file text and write the result to a new file.
# NOTE(review): the replacements run over the whole text, header and id column
# included -- confirm neither contains an upper-case 'W' or 'L'.
with open('./data/question.csv') as f:
    st = (
        f.read()
        .replace('W0', '')
        .replace('L0', '')
        .replace('W', '')
        .replace('L', '')
    )
with open('./question_cleaned.csv', 'w') as f:
    f.write(st)

In [None]:
def save_obj(obj, path):
    """Pickle ``obj`` into the file at ``path`` with the highest protocol.

    Args:
        obj: Any picklable object.
        path: Destination file; it is opened in binary write mode, so existing
            content is replaced.
    """
    with open(path, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(path):
    """Read back and return the object pickled in the file at ``path``.

    Note: unpickling can execute arbitrary code, so only load files that this
    notebook (or another trusted source) produced.

    Args:
        path: File to read; it is opened in binary read mode.

    Returns:
        The unpickled object.
    """
    with open(path, 'rb') as source:
        restored = pickle.load(source)
    return restored

# Re-parse the prefix-stripped question file.
q2w, q2c, w_lens, c_lens = fn('./question_cleaned.csv')

# Report the average and maximum number of word / char tokens per question.
print("Avg Word Len: %.1f | Max Word Len: %d" % (
    sum(w_lens) / len(w_lens),
    max(w_lens),
))
print("Avg Char Len: %.1f | Max Char Len: %d" % (
    sum(c_lens) / len(c_lens),
    max(c_lens),
))

# Both lookups are filled from the same lines, so their sizes must agree.
assert len(q2w) == len(q2c), "len(q2w): %d, len(q2c): %d" % (
    len(q2w),
    len(q2c),
)

# Persist both lookups so later cells can reload them without re-parsing.
save_obj(q2w, './q2w.pkl')
save_obj(q2c, './q2c.pkl')

In [None]:
# Reload the pickled word / char lookups.
q2w, q2c = load_obj('./q2w.pkl'), load_obj('./q2c.pkl')

# Show one entry of each lookup.
pprint.pprint(glance(q2w))
pprint.pprint(glance(q2c))

In [None]:
# Read the embedding file: on every line the first whitespace-separated field
# is skipped and the remaining fields are parsed as floats.
embed_vals = []
with open('./data/word_embed.txt') as f:
    for line in f:
        line_sp = line.split()
        # A blank line would add an empty row and make the matrix ragged.
        if not line_sp:
            continue
        embed_vals.append([float(num) for num in line_sp[1:]])
embed_vals = np.asarray(embed_vals, dtype=np.float32)

# The padding id is the index of the all-zero row appended below.
PAD_INT = embed_vals.shape[0]
# Take the row width from the loaded matrix instead of hard-coding 300, so the
# cell also works with embeddings of another dimensionality.
zeros = np.zeros((1, embed_vals.shape[1]), dtype=np.float32)
embed_vals = np.concatenate([embed_vals, zeros])
print(embed_vals.shape)
np.save('./word_embed.npy', embed_vals)

In [None]:
# Maximum number of word tokens kept per question.
w_max_len = 20


def fn1(str_li, int_li):
    """Copy the first ``w_max_len`` tokens of ``str_li`` into ``int_li`` as ints.

    ``int_li`` is modified in place: position ``k`` receives
    ``int(str_li[k])`` for every token kept. Positions beyond the number of
    tokens copied are left untouched.

    Args:
        str_li: List of tokens, each convertible with ``int()``.
        int_li: Pre-sized list that receives the converted values.
    """
    for pos, token in enumerate(str_li[:w_max_len]):
        int_li[pos] = int(token)

def train_fn(csv, path):
    """Write labelled question pairs to a TFRecord file.

    Each row of ``csv.values`` is unpacked as ``(label, q1_id, q2_id)``. Both
    question ids are looked up in the module-level ``q2w`` mapping, and their
    tokens are converted to ints by ``fn1`` into lists of length ``w_max_len``
    pre-filled with ``PAD_INT``. One ``tf.train.Example`` with int64 features
    ``input1``, ``input2`` and ``label`` is written per row.

    Args:
        csv: Table exposing ``.values`` and ``len()``; in this notebook a
            pandas DataFrame.
        path: Destination TFRecord file.

    Raises:
        KeyError: If a question id is missing from ``q2w``.
    """
    # The context manager closes the writer on exit, even if a row fails
    # midway; the writer used to be left open.
    with tf.python_io.TFRecordWriter(path) as writer:
        for arr_line in tqdm(csv.values, total=len(csv), ncols=70):
            q1_id_int, q2_id_int = [PAD_INT]*w_max_len, [PAD_INT]*w_max_len

            label, q1_id, q2_id = arr_line
            fn1(q2w[q1_id], q1_id_int)
            fn1(q2w[q2_id], q2_id_int)

            example = tf.train.Example(
                features = tf.train.Features(
                     feature = {
                       'input1': tf.train.Feature(
                           int64_list=tf.train.Int64List(value=q1_id_int)),
                       'input2': tf.train.Feature(
                           int64_list=tf.train.Int64List(value=q2_id_int)),
                       'label': tf.train.Feature(
                           int64_list=tf.train.Int64List(value=[label])),
                       }))
            serialized = example.SerializeToString()
            writer.write(serialized)

def test_fn(csv, path):
    """Write unlabelled question pairs to a TFRecord file.

    Each row of ``csv.values`` is unpacked as ``(q1_id, q2_id)``. Both
    question ids are looked up in the module-level ``q2w`` mapping, and their
    tokens are converted to ints by ``fn1`` into lists of length ``w_max_len``
    pre-filled with ``PAD_INT``. One ``tf.train.Example`` with int64 features
    ``input1`` and ``input2`` is written per row.

    Args:
        csv: Table exposing ``.values`` and ``len()``; in this notebook a
            pandas DataFrame.
        path: Destination TFRecord file.

    Raises:
        KeyError: If a question id is missing from ``q2w``.
    """
    # The context manager closes the writer on exit, even if a row fails
    # midway; the writer used to be left open.
    with tf.python_io.TFRecordWriter(path) as writer:
        for arr_line in tqdm(csv.values, total=len(csv), ncols=70):
            q1_id_int, q2_id_int = [PAD_INT]*w_max_len, [PAD_INT]*w_max_len

            q1_id, q2_id = arr_line
            fn1(q2w[q1_id], q1_id_int)
            fn1(q2w[q2_id], q2_id_int)

            example = tf.train.Example(
                features = tf.train.Features(
                     feature = {
                       'input1': tf.train.Feature(
                           int64_list=tf.train.Int64List(value=q1_id_int)),
                       'input2': tf.train.Feature(
                           int64_list=tf.train.Int64List(value=q2_id_int)),
                       }))
            serialized = example.SerializeToString()
            writer.write(serialized)
        
# Only the validation split is exported by this cell; the train and test
# exports below are commented out.
# NOTE(review): presumably those two files were written in an earlier run --
# confirm, or re-enable both lines so a fresh kernel regenerates them.
#train_fn(train_csv, './train_word.tfrecord')
#test_fn(test_csv, './test_word.tfrecord')
train_fn(val_csv, './val_word.tfrecord')