# Text processor

### Import libraries

In [0]:
import pandas as pd
import numpy as np
import json
import psycopg2
import re
from sklearn.model_selection import train_test_split
import html
import spacy
from pathlib import Path
import collections
import datetime as dt
import pickle
import torch

#### Import ulangel library

In [0]:
from ulangel.data.text_processor import text_proc

### Import your text

In [0]:
# Load the raw corpus; replace the placeholder below with the path to your CSV file.
text_df = pd.read_csv('your text path')

#### If necessary, combine columns to make the whole text

In [0]:
def get_txt(df):
    """Build the full text from the author, title and description fields.

    Each field is prefixed with its marker (``<Author>``, ``<Title>``,
    ``<Description>``) and the three parts are concatenated.

    Missing values are treated as empty strings: plain string concatenation
    with NaN would turn the whole text of that row into NaN. Non-string
    values are converted with ``str``.

    Args:
        df: a DataFrame (or a single row) exposing ``author``, ``title`` and
            ``description``.

    Returns:
        A Series of texts for a DataFrame, or a single string for a row.
    """
    def _as_text(value):
        # Column of a DataFrame: blank out missing cells, then force str.
        if isinstance(value, pd.Series):
            return value.fillna('').astype(str)
        # Scalar taken from a single row.
        return '' if pd.isna(value) else str(value)

    return (
        '<Author> ' + _as_text(df.author)
        + ' <Title> ' + _as_text(df.title)
        + ' <Description> ' + _as_text(df.description)
    )

In [0]:
# Store the combined text in a new 'text' column; this is the column the text processor reads.
text_df['text'] = get_txt(text_df)

### text processor
The **text_proc** function of ulangel.data.text_processor performs the following processing steps:
1. Replace HTML special characters and emoji
2. Replace word repetitions and add `xxwrep` ahead: word word word -> xxwrep 3 word
3. Replace character repetitions and add `xxrep` ahead: cccc -> xxrep 4 c
4. Add spaces around /,@,#,:
5. Remove multiple spaces and keep just one
6. Tokenize the text
7. Replace tokens with all letters in capitals by their lower case and add `xxup` ahead: GOOD JOB -> xxup good xxup job
8. Replace tokens with the first letter in capital by their lower case and add `xxmaj` ahead: We -> xxmaj we
9. Add `xbos` at the beginning and `xfld` at the end of the text

The method **get_all** applies **text_proc** line by line and returns a list of lists of tokens.

In [0]:
def get_all(df):
    """Apply ``text_proc`` to every value of ``df['text']`` and collect the output.

    Progress is printed every 5000 processed rows, followed by the total
    time. Rows are counted as they are processed rather than read from the
    index label, so reporting also works when the index is not a contiguous
    range of integers and no report is printed right after the first row.

    Args:
        df: DataFrame with a ``text`` column.

    Returns:
        The list accumulated from the successive ``text_proc`` results.
    """
    tok = []
    time_begin = dt.datetime.now()
    time_boucle = time_begin
    # NOTE(review): spaCy v3 removed shortcut links such as 'en'; on v3+ a
    # full pipeline name (e.g. 'en_core_web_sm') is required here.
    tokenizer = spacy.load('en').tokenizer
    for n_done, text in enumerate(df['text'], start=1):
        # `+=` extends `tok` with the items of what text_proc returns.
        # NOTE(review): for the result to hold one token list per row (the
        # caller assigns it to a DataFrame column), text_proc must return a
        # list wrapping a single token list -- confirm against ulangel.
        tok += text_proc(text, tokenizer)
        if n_done % 5000 == 0:
            time_end = dt.datetime.now()
            print('Time for 5000 lines: ' + str(time_end - time_boucle))
            time_boucle = time_end
    time_end = dt.datetime.now()
    print('Total Time: ' + str(time_end - time_begin))
    return tok

#### apply text processor

In [0]:
# Run the text processor over the whole corpus (prints timing information while it runs).
tok_lm = get_all(text_df)

In [0]:
# Attach the tokens to the DataFrame; tok_lm must contain exactly one entry per row of text_df.
text_df['tok'] = tok_lm

### Create the dictionary itos (integer to string) of the corpus and the inverse dictionary stoi (string to integer)

In [0]:
# Count the occurrences of every token across all rows of the corpus.
freq = collections.Counter(tok for toks in text_df['tok'].values for tok in toks)
# Upper bound on the number of corpus tokens considered for the dictionary.
max_vocab = 60000
# A token is kept only if it occurs strictly more than min_freq times.
min_freq = 100
# Index 0 is '_unk_' and index 1 is '_pad_'; corpus tokens follow by decreasing frequency.
itos = ['_unk_', '_pad_'] + [tok for tok, count in freq.most_common(max_vocab) if count > min_freq]
# Tokens missing from the dictionary resolve to 0, the index of '_unk_'.
stoi = collections.defaultdict(lambda: 0, ((tok, idx) for idx, tok in enumerate(itos)))

### Save the dictionary

In [0]:
# Use a context manager so the file is flushed and closed once the dump is done.
with open('../itos.pkl', 'wb') as f:
    pickle.dump(itos, f)

### Word (Token) Numericalization

In [0]:
# Replace every token by its integer id. stoi is a defaultdict, so an unknown
# token yields 0 (and is added to stoi with that value as a side effect of the lookup).
text_df['ids'] = text_df['tok'].apply(lambda toks: [stoi[t] for t in toks])

### Labels Numericalization

In [0]:
def num_labels(df, label_map=None):
    """Add an integer ``type_nb`` column derived from the ``label`` column.

    The DataFrame is modified in place and nothing is returned.

    Args:
        df: DataFrame with a ``label`` column.
        label_map: optional dict mapping each label to its integer code.
            Defaults to ``{'cat1': 0, 'cat2': 1, 'cat3': 2, 'cat4': 3}``.

    Note:
        A label that is absent from the mapping gets the code 0, which is
        also the code of 'cat1' in the default mapping.
    """
    if label_map is None:
        label_map = {'cat1': 0, 'cat2': 1, 'cat3': 2, 'cat4': 3}
    # One dictionary lookup per row replaces the row-by-row if/elif chain.
    codes = [label_map.get(label, 0) for label in df['label']]
    # Reuse df's own index so the values are assigned positionally.
    df['type_nb'] = pd.Series(codes, index=df.index, dtype='int64')

In [0]:
# Adds the integer 'type_nb' column to text_df in place.
num_labels(text_df)

### Divide data into training set and validation set

In [0]:
# Keep 20% of the rows for validation. train_test_split is imported directly
# at the top of the notebook; the bare name `sklearn` is never imported, so
# the qualified form would raise a NameError.
trn_lm, val_lm = train_test_split(text_df, test_size=0.2)

### Save datasets into json files

In [0]:
# Serialize the training set with pandas (one entry per row, keyed by index label),
# then parse it back so that it can be re-written as indented JSON.
trn_lm_J = json.loads(trn_lm.to_json(orient='index'))
with open('../trn_lm_tok.json', 'w') as f:
    json.dump(trn_lm_J, f, indent=4)

In [0]:
# Same export for the validation set: pandas JSON keyed by index label, re-written indented.
val_lm_J = json.loads(val_lm.to_json(orient='index'))
with open('../val_lm_tok.json', 'w') as f:
    json.dump(val_lm_J, f, indent=4)

### Save tokens into npy files

In [0]:
# Save the token column of each split.
# NOTE(review): presumably each cell holds a Python list of tokens, in which case
# the array has object dtype and np.load needs allow_pickle=True to read it back -- confirm.
np.save('../trn_lm_tok.npy', trn_lm['tok'].values)
np.save('../val_lm_tok.npy', val_lm['tok'].values)

### Save indexes into npy files

In [0]:
# Save the id column of each split. Each cell is a Python list of ints, so the
# array has object dtype and np.load needs allow_pickle=True to read it back.
np.save('../trn_lm_ids.npy', trn_lm['ids'].values)
np.save('../val_lm_ids.npy', val_lm['ids'].values)

### Save labels into npy files

In [0]:
# Save the integer label codes ('type_nb') of each split.
np.save('../trn_lm_labels.npy', trn_lm['type_nb'].values)
np.save('../val_lm_labels.npy', val_lm['type_nb'].values)

# Match our itos to the pretrained wt103 model

### get the pretrained model parameters

In [0]:
# Load the pretrained weights; they are accessed by parameter name below.
# NOTE(review): torch.load unpickles the file and can therefore execute arbitrary
# code; only load weights obtained from a trusted source.
wgts = torch.load('../wt103/fwd_wt103.h5') # , map_location='cpu' if on a cpu

In [0]:
# Load the vocabulary that ships with the pretrained model.
# NOTE(review): pickle.load can execute arbitrary code; only open a file from a trusted source.
with open('../wt103/itos_wt103.pkl', 'rb') as f:
    itos_wt103 = pickle.load(f)
# Reverse lookup: token -> position in itos_wt103; tokens absent from it resolve to -1.
stoi_wt103 = collections.defaultdict(lambda:-1, {v:k for k,v in enumerate(itos_wt103)})

### define the size of our own model

In [0]:
# vs: size of our own vocabulary (number of entries in itos).
vs = len(itos)
# em_sz: number of columns of the new embedding matrix built below; it has to
# match the width of the pretrained embedding rows copied into it.
em_sz = 400
# NOTE(review): nh and nl are not used in this notebook; presumably the hidden
# size and the number of layers of the language model -- confirm.
nh = 1150
nl = 3

### Map the pretrained wt103 weights onto our own vocabulary

In [0]:
# Pretrained embedding matrix as a NumPy array. `.cpu()` is a no-op for a tensor
# already on the CPU and avoids the error `.numpy()` raises on a CUDA tensor when
# the weights were loaded onto a GPU.
enc_wgts = wgts['0.encoder.weight'].cpu().numpy()
# Mean of all pretrained rows, used as the initial embedding of tokens unknown to wt103.
row_m = enc_wgts.mean(0)

In [0]:
# Embedding matrix for our own vocabulary, one row per entry of itos.
new_w = np.zeros((vs, em_sz), dtype=np.float32)
for i, w in enumerate(itos):
    r = stoi_wt103[w]
    if r >= 0:
        # Token known to the pretrained model: reuse its embedding row.
        new_w[i] = enc_wgts[r]
    else:
        # Token unknown to the pretrained model: start from the mean embedding.
        new_w[i] = row_m

In [0]:
# Replace the three vocabulary-sized weight matrices with the remapped embeddings.
# The last two receive their own copy of new_w so the tensors do not share memory.
wgts['0.encoder.weight'] = torch.FloatTensor(new_w)
wgts['0.encoder_with_dropout.embed.weight'] = torch.FloatTensor(np.copy(new_w))
wgts['1.decoder.weight'] = torch.FloatTensor(np.copy(new_w))

In [0]:
# Save the adapted weights (pretrained parameters with the remapped embedding matrices).
torch.save(wgts, '../model_after_corresponding.h5')