In [None]:
import os
import pickle
import re

import numpy as np
import pandas as pd
from tokenizers import BertWordPieceTokenizer
from tqdm import tnrange, tqdm_notebook
from tqdm.notebook import tqdm


In [None]:
def save_pickle(obj, filepath):
    """Serialize ``obj`` with pickle and write it to ``filepath``.

    The destination is opened in binary write mode, so an existing file at
    that path is overwritten.
    """
    with open(filepath, 'wb') as handle:
        pickle.dump(obj, handle)

def load_pickle(filepath):
    """Read the file at ``filepath`` and return the object unpickled from it."""
    # NOTE(review): unpickling can execute arbitrary code; only use this on
    # files produced by a trusted source (e.g. this notebook's save_pickle).
    with open(filepath, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

## Gather Data

In [None]:
def _read_raw_lines(path):
    """Return every line of the file at ``path`` as raw bytes.

    The file is opened inside a ``with`` block so the handle is closed as
    soon as the lines have been read.
    """
    with open(path, 'rb') as fh:
        return fh.readlines()


# Lines are kept as bytes here; they are decoded in a later cell.
data_1 = _read_raw_lines('../data/train_1000.label')
data_2 = _read_raw_lines('../data/train_2000.label')
data_3 = _read_raw_lines('../data/train_3000.label')
data_4 = _read_raw_lines('../data/train_4000.label')
data_5 = _read_raw_lines('../data/train_5500.label')

In [None]:
def decodeLines(b):
    """Decode ``b`` with its default ``.decode()`` and return the text.

    Returns ``None`` instead of raising when the call fails for any reason
    (for example, bytes that are not valid in the default encoding, or an
    object that has no ``decode`` method).
    """
    try:
        text = b.decode()
    except Exception:
        # Best-effort: the caller filters out the None results.
        return None
    return text

In [None]:
# Collect the decoded text of every line from the five raw files, in file
# order. Each line is decoded exactly once; lines for which decodeLines
# returns None (decode failure) are dropped.
data = []
for raw_lines in (data_1, data_2, data_3, data_4, data_5):
    for raw_line in raw_lines:
        decoded = decodeLines(raw_line)
        if decoded is not None:
            data.append(decoded)

In [None]:
# Total number of successfully decoded lines gathered from the five label files.
len(data)

15447

In [None]:
# Peek at the first two decoded lines to see the raw line layout.
data[:2]

['DESC:manner How did serfdom develop in and then leave Russia ?\n',
 'ENTY:cremat What films featured the character Popeye Doyle ?\n']

In [None]:
# The first whitespace-separated token carries the label; splitting it on ':'
# yields the (class, sub-class) pair.
data[0].split()[0].split(':')[0], data[0].split()[0].split(':')[1]

('DESC', 'manner')

In [None]:
# Everything after the label token, re-joined with single spaces, is the question text.
" ".join(data[0].split()[1:])

'How did serfdom develop in and then leave Russia ?'

In [None]:
# Parse every decoded line into a record with its class, sub-class and
# question text. Each line is split on whitespace once: the first token is
# the "<class>:<sub-class>" label and the remaining tokens form the question.
# `tqdm` is tqdm.notebook.tqdm, the replacement for the deprecated tqdm_notebook.
dfFormat = []
for line in tqdm(data):
    tokens = line.split()
    label_parts = tokens[0].split(':')
    dfFormat.append({
        "class": label_parts[0],
        "sub-class": label_parts[1],
        "question": " ".join(tokens[1:]),
    })

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=15447.0), HTML(value='')))




In [None]:
# One record is built per entry of `data`, so this should equal len(data).
len(dfFormat)

15447

In [None]:
# Inspect the first two parsed records.
dfFormat[:2]

[{'class': 'DESC',
  'question': 'How did serfdom develop in and then leave Russia ?',
  'sub-class': 'manner'},
 {'class': 'ENTY',
  'question': 'What films featured the character Popeye Doyle ?',
  'sub-class': 'cremat'}]

In [None]:
# Build the table from the list of records. The default index already runs
# 0..len(dfFormat)-1, so no hand-built index list is needed.
df = pd.DataFrame(dfFormat)

In [None]:
# Preview the parsed table (labels are still strings at this point).
df.head()

Unnamed: 0,class,sub-class,question
0,DESC,manner,How did serfdom develop in and then leave Russ...
1,ENTY,cremat,What films featured the character Popeye Doyle ?
2,DESC,manner,How can I find a list of celebrities ' real na...
3,ENTY,animal,What fowl grabs the spotlight after the Chines...
4,ABBR,exp,What is the full form of .com ?


In [None]:
# Persist the table before the label columns are replaced by integer ids;
# index=False leaves the row index out of the CSV.
df.to_csv('../data/question-class-subclass-df.csv', index = False)

## Data to Tokens

In [None]:
# Distinct values of the 'class' column, as a plain Python list.
uniqueClasses = df['class'].unique().tolist()

In [None]:
# Show the distinct coarse class labels.
uniqueClasses

['DESC', 'ENTY', 'ABBR', 'HUM', 'NUM', 'LOC']

In [None]:
# Two lookup tables over the class labels: position -> label, and its
# inverse label -> position. Positions follow the order of uniqueClasses.
idxtoclass = dict(enumerate(uniqueClasses))
classtoidx = {label: position for position, label in idxtoclass.items()}
classtoidx, idxtoclass

({'ABBR': 2, 'DESC': 0, 'ENTY': 1, 'HUM': 3, 'LOC': 5, 'NUM': 4},
 {0: 'DESC', 1: 'ENTY', 2: 'ABBR', 3: 'HUM', 4: 'NUM', 5: 'LOC'})

In [None]:
# Persist both directions of the class <-> index mapping.
save_pickle(classtoidx, '../data/class2idx.pkl')
save_pickle(idxtoclass, '../data/idx2class.pkl')

In [None]:
# Distinct values of the 'sub-class' column, as a plain Python list.
subclassUnique = df['sub-class'].unique().tolist()

In [None]:
# Number of distinct sub-class labels.
len(subclassUnique)

47

In [None]:
# Two lookup tables over the sub-class labels: position -> label, and its
# inverse label -> position. Positions follow the order of subclassUnique.
idxtosubclass = dict(enumerate(subclassUnique))
subclasstoidx = {label: position for position, label in idxtosubclass.items()}

In [None]:
# Persist both directions of the sub-class <-> index mapping.
save_pickle(subclasstoidx, '../data/subclass2idx.pkl')
save_pickle(idxtosubclass, '../data/idx2subclass.pkl')

In [None]:
# Replace the string class labels with their integer ids. The mapping is
# computed into a temporary and validated before assignment, so re-running
# this cell when the column already holds ids fails loudly instead of
# silently overwriting the column with NaN.
class_ids = df['class'].map(classtoidx)
assert class_ids.notna().all(), (
    "df['class'] contains values missing from classtoidx; "
    "rebuild df before re-running this cell"
)
df['class'] = class_ids

In [None]:
# Check the 'class' column after mapping labels to integer ids.
df.head()

Unnamed: 0,class,sub-class,question
0,0,manner,How did serfdom develop in and then leave Russ...
1,1,cremat,What films featured the character Popeye Doyle ?
2,0,manner,How can I find a list of celebrities ' real na...
3,1,animal,What fowl grabs the spotlight after the Chines...
4,2,exp,What is the full form of .com ?


In [None]:
# Replace the string sub-class labels with their integer ids. The mapping is
# computed into a temporary and validated before assignment, so re-running
# this cell when the column already holds ids fails loudly instead of
# silently overwriting the column with NaN.
subclass_ids = df['sub-class'].map(subclasstoidx)
assert subclass_ids.notna().all(), (
    "df['sub-class'] contains values missing from subclasstoidx; "
    "rebuild df before re-running this cell"
)
df['sub-class'] = subclass_ids

In [None]:
# Check the 'sub-class' column after mapping labels to integer ids.
df.head()

Unnamed: 0,class,sub-class,question
0,0,0,How did serfdom develop in and then leave Russ...
1,1,1,What films featured the character Popeye Doyle ?
2,0,0,How can I find a list of celebrities ' real na...
3,1,2,What fowl grabs the spotlight after the Chines...
4,2,3,What is the full form of .com ?


In [None]:
# WordPiece tokenizer loaded from the custom vocabulary file, with
# lowercasing and accent stripping switched on.
tokenizer = BertWordPieceTokenizer(
    '../data/bert-word-piece-custom-wikitext-vocab-10k-vocab.txt',
    lowercase=True,
    strip_accents=True,
)

In [None]:
# Tokenize every question and collect one record per row, plus the token
# count of each question in `toklengths`.
# The three columns are iterated in lockstep instead of using iterrows, which
# builds a Series per row and yields an index this loop never used.
# `tqdm` is tqdm.notebook.tqdm, the replacement for the deprecated
# tqdm_notebook; `total` is passed because zip() has no length.
outputs = []
toklengths = []
rows = zip(df['question'], df['class'], df['sub-class'])
for question, class_id, subclass_id in tqdm(rows, total=len(df)):
    toks = tokenizer.encode(question).ids
    toklengths.append(len(toks))
    outputs.append({
        "question": question,
        "question-tokens": toks,
        "question-class": class_id,
        "question-subclass": subclass_id,
    })

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=15447.0), HTML(value='')))




In [None]:
# Length, in token ids, of the longest tokenized question.
max(toklengths)

52

In [None]:
# outputs[:2]

In [None]:
# Persist the list of per-question records (text, token ids, class id, sub-class id).
save_pickle(outputs, '../data/tokenized_questions_classes_subclasses_dict.pkl')