# Environment Setup

In [1]:
import time
import numpy as np
import pandas as pd
#pd.options.display.max_columns = None
#pd.options.display.mpl_style = 'default'

import re
import os
import sys
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import tensorflow as tf

from scipy import sparse

## Total Running Time: Approximately 15 mins 

## Import raw data
* Dataset link: https://biendata.com/competition/zhihu/data/

In [2]:
t_0 = time.time()

df_train = pd.read_csv('data\question_train_set.txt', sep="\t", header = None, names = ['question_id', 'title_char_id','title_word_id','desc_char_id','desc_word_id'])
df_train_topic = pd.read_csv('data\question_topic_train_set.txt', sep="\t", header = None, names=['question_id', 'topic'])

df_train = pd.merge(df_train, df_train_topic, how='left', on='question_id')
# some of the title char/word are empty， here I removed all the questions which have NAN word title
#df_train = df_train[df_train['title_word_id'].notnull()]
df_eval = pd.read_csv('data\question_eval_set.txt', sep="\t", header = None, names = ['question_id', 'title_char_id','title_word_id','desc_char_id','desc_word_id'])

print('loading time: ',time.time() - t_0)

loading time:  50.23342728614807


In [3]:
t_0 = time.time()

df_train['title_char_len'] = df_train['title_char_id'].map(lambda x: len(x.split(',')) if type(x) != float else 0)
df_train['title_word_len'] = df_train['title_word_id'].map(lambda x: len(x.split(',')) if type(x) != float else 0)
df_train['desc_char_len'] = df_train['desc_char_id'].map(lambda x: len(x.split(',')) if type(x) != float else 0)
df_train['desc_word_len'] = df_train['desc_word_id'].map(lambda x: len(x.split(',')) if type(x) != float else 0)

df_eval['title_char_len'] = df_eval['title_char_id'].map(lambda x: len(x.split(',')) if type(x) != float else 0)
df_eval['title_word_len'] = df_eval['title_word_id'].map(lambda x: len(x.split(',')) if type(x) != float else 0)
df_eval['desc_char_len'] = df_eval['desc_char_id'].map(lambda x: len(x.split(',')) if type(x) != float else 0)
df_eval['desc_word_len'] = df_eval['desc_word_id'].map(lambda x: len(x.split(',')) if type(x) != float else 0)

print('time: ',time.time() - t_0)

time:  50.858458042144775


In [5]:
train_length = [df_train['title_char_len'].values, df_train['title_word_len'].values,\
                df_train['desc_char_len'].values, df_train['desc_word_len'].values]

eval_length = [df_eval['title_char_len'].values, df_eval['title_word_len'].values,\
                df_eval['desc_char_len'].values, df_eval['desc_word_len'].values]

np.save('preprocessed_data/train_length', train_length)
np.save('preprocessed_data/test_length', eval_length)

### Preprocessing data

In [9]:
def data_transform(file):
    data = []
    length = len(file)
    title_char = file['title_char_id'].values
    title_word = file['title_word_id'].values
    for i in range(length):
        instance = {"title_char": title_char[i], "title_word": title_word[i]}
        data.append(instance)
    return data

In [10]:
train = data_transform(df_train)
test = data_transform(df_eval)

In [11]:
def title_word_pipeline(data, max_sent_len=35):
    
    out_sentences = np.full([len(data),  max_sent_len], 0, dtype=np.int32)
    for i,instance in enumerate(data):
        sentence = instance['title_word']
        if pd.isnull(sentence):
            sents = [0]
        else:
            sents=  re.sub("[w]", "", sentence)
            sents = eval(sents) 
            if type(sents) == int: 
                sents = [sents]
        if len(sents) <= max_sent_len:
            out_sentences[i, 0:len(sents)] = sents
        else:
            out_sentences[i, 0: max_sent_len] = sents[:max_sent_len]

    return out_sentences

def title_char_pipeline(data, max_sent_len=50):
    
    out_sentences = np.full([len(data),  max_sent_len], 0, dtype=np.int32)
    for i,instance in enumerate(data):
        sentence = instance['title_char']
        if pd.isnull(sentence):
            sents = [0]
        else:
            sents=  re.sub("[c]", "", sentence)
            sents = eval(sents) 
            if type(sents) == int: 
                sents = [sents]
        if len(sents) <= max_sent_len:
            out_sentences[i, 0:len(sents)] = sents
        else:
            out_sentences[i, 0: max_sent_len] = sents[:max_sent_len]

    return out_sentences

In [12]:
t_0 = time.time()

train_title_word = title_word_pipeline(train)
train_title_char = title_char_pipeline(train)

print('loading time: ',time.time() - t_0)

loading time:  243.6631543636322


In [13]:
t_0 = time.time()

test_title_word = title_word_pipeline(test)
test_title_char = title_char_pipeline(test)

print('loading time: ',time.time() - t_0)

loading time:  17.917680263519287


In [14]:
np.save('preprocessed_data/train_title_char', train_title_char)
np.save('preprocessed_data/train_title_word', train_title_word)
np.save('preprocessed_data/test_title_char', test_title_char)
np.save('preprocessed_data/test_title_word', test_title_word)

In [21]:
np.max([np.amax(train_title_char),np.amax(test_title_char)]),np.max([np.amax(train_title_word),np.amax(test_title_word)])

(18377, 1138582)

### Char/Word Embedding

In [19]:
import collections
import operator
import random

t_0 = time.time()
word_dict= collections.defaultdict(list)
file= open('data/char_embedding.txt', 'r', encoding='utf-8')

for line in file:
    line = line.rstrip().split(' ')
    word_dict[line[0]]=[float(i) for i in line[1:]]
word_dict=dict(word_dict)

vocab_size = np.max([np.amax(train_title_char),np.amax(test_title_char)]) + 1

embedding_list=[]
OOV_vector  = [random.uniform(-0.5, 0.5) for i in range(256)]
for i in range(vocab_size):
    if i == 0:
        embedding_list.append(np.array([0 for i in range(256)], dtype='f'))
    elif 'c%d' % i in word_dict:
        embedding_list.append(word_dict['c%d' % i])
    else:
        embedding_list.append(OOV_vector)
        
C = np.array(embedding_list)
print('loading time: ',time.time() - t_0)

loading time:  1.6255218982696533


In [22]:
C .shape

(18378, 256)

In [12]:
t_0 = time.time()
word_dict= collections.defaultdict(list)
file= open('data/word_embedding.txt', 'r', encoding='utf-8')

for line in file:
    line = line.rstrip().split(' ')
    word_dict[line[0]]=[float(i) for i in line[1:]]
word_dict=dict(word_dict)


vocab_size = np.max([np.amax(train_title_word),np.amax(test_title_word)]) + 1

embedding_list=[]
OOV_vector  = [random.uniform(-0.5, 0.5) for i in range(256)]
for i in range(vocab_size):
    if i == 0:
        embedding_list.append(np.array([0 for i in range(256)], dtype='f'))
    elif 'w%d' % i in word_dict:
        embedding_list.append(word_dict['w%d' % i])
    else:
        embedding_list.append(OOV_vector)
       
W = np.array(embedding_list)
print('loading time: ',time.time() - t_0)

loading time:  54.12321472167969


In [25]:
np.save('preprocessed_data/word_embedding', W)
np.save('preprocessed_data/char_embedding', C)

### Topics Encoder

In [9]:
topic = pd.read_csv('data/topic.txt', sep="\t", header = None,\
                    names=['topic_id', 'parents_id','title_char_id','title_word_id','desc_char_id','desc_word_id'])

* transform topics as a list of list

In [4]:
t_0 = time.time()

df_train_topic['topic'] = df_train_topic['topic'].map(lambda x: eval(x))
df_train_topic['topic'] = df_train_topic['topic'].map(lambda x: [x] if type(x) is int else list(x))

train_topics = df_train_topic['topic'].values

print('loading time: ',time.time() - t_0)

loading time:  33.20693302154541


* encoding original labels (too large for multilabel binarizer) into smaller integer

In [5]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(list(topic['topic_id'].values))

LabelEncoder()

In [6]:
t_0 = time.time()

encoded_labels = [list(label_encoder.transform(i)) for i in train_topics]

print('loading time: ',time.time() - t_0)

loading time:  210.7425651550293


In [7]:
label_encoder.classes_[1401] == df_train_topic['topic'][0][1]

True

In [77]:
label_dictionary = {key: value for (key, value) in enumerate(label_encoder.classes_)}

np.save('preprocessed_data/label_dictionary', label_dictionary) 

* encoding the multi-labels

In [8]:
from sklearn.preprocessing import MultiLabelBinarizer

multilabel_encoder = MultiLabelBinarizer(sparse_output= True)
multilabel_encoder.fit(encoded_labels)

MultiLabelBinarizer(classes=None, sparse_output=True)

In [28]:
final_labels = multilabel_encoder.transform(encoded_labels)

In [33]:
np.save('preprocessed_data/labels', final_labels)

* check if labels match

In [75]:
checking_index = 5

for i, j in enumerate(list(final_labels[checking_index].toarray()[0])):
    if j == 1:
        print(label_dictionary [i])

4195795391451929480
4351331710881888756


In [76]:
df_train_topic['topic'][checking_index]

[4351331710881888756, 4195795391451929480]

In [103]:
train_title_char.shape[0]/200

14999.835