This notebook completes the last step of data preprocessing before the data is fed to a model. The preprocessing steps include:
- Select the label to train
- Train/Test Set Split
- Tokenization
- Token to Sequence
- Pad the Sequence

In [1]:
import os
import pickle
import pandas as pd
import numpy as np
import pickle
from importlib import reload
from helpers import constants; reload(constants)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from helpers.helper_functions import *

In [2]:
# Run configuration, all sourced from helpers.constants

# -- sequence / vocabulary limits
max_word_no = constants.MAX_NUM_WORDS  # number of unique words to keep (i.e. rows in the embedding matrix)
max_len = constants.MAX_SEQUENCE_LENGTH  # maximum number of words of a post to use

# -- which label to train on and how large the sample set should be
select_label = constants.SELECT_LABEL
sample_rate = constants.SAMPLE_RATE

# -- input locations
raw_data_path = constants.RAW_TEXT_DIR
base_data_path = constants.BASE_DATA_DIR
intermediate_path = constants.ITM_DATA_DIR

# -- output locations written by this notebook
tokenizer_path = constants.TOKEN_DIR
model_data_path = constants.PRCD_DATA_DIR
sample_data_path = constants.SAMPLE_DATA_DIR

In [3]:
# Load the pickled post DataFrame from the intermediate data directory.
# os.path.join keeps the path valid whether or not intermediate_path ends with a separator.
post_path = os.path.join(intermediate_path, 'post_df_short.pickle')
# The path is passed positionally: the first parameter of pd.read_pickle is named
# `filepath_or_buffer` in current pandas, so the old `path=` keyword raises TypeError there.
# NOTE: unpickling executes code stored in the file -- only load files this project produced.
df_raw = pd.read_pickle(post_path)

In [4]:
def train_test_split(df, ratio=0.8, seed=None):
    """Randomly split ``df`` into training and test posts/targets.

    Every row is assigned to the training set independently with probability
    ``ratio``, so the resulting sizes are only approximately
    ``ratio * len(df)`` and ``(1 - ratio) * len(df)``.

    Args:
        df: DataFrame exposing ``post`` and ``target`` columns.
        ratio: Probability that a row lands in the training set.
        seed: Optional integer seed. When given, the split is reproducible and
            the global NumPy random state is not consumed. When ``None``
            (the default) the global ``np.random`` state is used.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` of pandas Series; the
        original row index is preserved.
    """
    if seed is None:
        draws = np.random.rand(len(df))
    else:
        # A private generator keeps seeded splits from disturbing global state.
        draws = np.random.RandomState(seed).rand(len(df))
    in_train = draws < ratio
    # Copy so the returned Series never alias the caller's frame.
    train, test = df[in_train].copy(), df[~in_train].copy()
    return (train.post, train.target, test.post, test.target)

In [5]:
def sample_data(data, label, ratio=0.1, seed=None):
    """Draw a random subset of aligned rows from ``data`` and ``label``.

    Rows are drawn without replacement, so the subset holds no duplicate rows
    as long as ``ratio <= 1``. If ``ratio > 1`` asks for more rows than exist,
    indices are drawn with replacement instead.

    Args:
        data: Array indexable by an integer index array along axis 0.
        label: Array with the same first-dimension length as ``data``.
        ratio: Fraction of rows to draw; ``int(len * ratio)`` rows are returned.
        seed: Optional integer seed for a reproducible draw. ``None`` (the
            default) uses the global ``np.random`` state.

    Returns:
        ``(sample_rows, sample_labels)``, or ``None`` when ``data`` and
        ``label`` differ in length -- callers that unpack the result should
        check for that first.

    Raises:
        ValueError: if ``ratio`` yields a negative sample size.
    """
    num_full = data.shape[0]
    if num_full != label.shape[0]:
        return None

    num_sample = int(num_full * ratio)
    if num_sample < 0:
        raise ValueError("ratio must be non-negative")

    rng = np.random if seed is None else np.random.RandomState(seed)
    if num_sample <= num_full:
        # Prefix of a permutation == sampling without replacement.
        picks = rng.permutation(num_full)[:num_sample]
    else:
        # More rows requested than available: duplicates are unavoidable.
        picks = rng.randint(0, num_full, size=num_sample)
    return (data[picks], label[picks])

In [6]:
# Build the modelling frame from the raw posts for the selected label.
# NOTE(review): prep_dataset is not defined in this notebook -- presumably it comes from the
# helpers.helper_functions star import and returns the `post`/`target` columns used below; confirm.
df = prep_dataset(df_raw,'post', select_label)

In [7]:
# Peek at the first rows to sanity-check the prepared columns.
df.head()

Unnamed: 0,post,target
0,Hello to anyone who's reading this. I've be...,1
1,"Gosh, Jer. This is a blog. Not a chatroom. ...",1
2,Hey! people please come here! I'm dying......,1
3,"Is that ""mandough"" the mando I know? Or so...",1
4,"Omigodessess!!!! What the fork! Hey, doofu...",1


In [8]:
# Keep roughly 95% of the posts for training and hold out the rest for testing.
# Rows are assigned at random one by one, so the exact split sizes vary between runs.
x_train, y_train, x_test, y_test = train_test_split(df, ratio=0.95)

In [16]:
# Fit the tokenizer on the training posts only (the test posts are not used for fitting).
# This cell does not need to be re-run if the fitted tokenizer has already been pickled.
# NOTE(review): the train/test split above is random, so a tokenizer reloaded from an earlier
# run was fitted on a different training subset than the current x_train.
tokenizer = Tokenizer(num_words=max_word_no)
tokenizer.fit_on_texts(x_train.tolist())

# Save the fitted tokenizer so it does not have to be re-fitted every time.
with open(tokenizer_path, 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [3]:
# Read the fitted tokenizer back from its pickle; this replaces any `tokenizer` already in memory.
# pickle.load executes code embedded in the file, so only load a file this project wrote itself.
with open(tokenizer_path, 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [10]:
# Convert every post into a list of integer word indices using the fitted tokenizer.
seq_train = tokenizer.texts_to_sequences(x_train.tolist()) # turn words into indices
seq_test = tokenizer.texts_to_sequences(x_test.tolist()) # turn words into indices
# seq_train and seq_test are nested lists [[...], [...], ..., [...]]; each inner list is one post converted to indices

In [11]:
word_index = tokenizer.word_index # a dictionary of the form {'word': word_index, ...}
# NOTE(review): word_index presumably lists every token seen while fitting, not only the
# max_word_no most frequent ones -- confirm against the Keras Tokenizer documentation.
print('Found %s unique tokens.' % len(word_index))

Found 520667 unique tokens.


In [12]:
# Bring every index sequence to a common length of max_len.
data_train = pad_sequences(seq_train, maxlen = max_len)
data_test = pad_sequences(seq_test, maxlen = max_len)

# One-hot encode the targets. The class count is fixed from both splits together: left to
# infer it per call, to_categorical sizes each matrix from that split's own largest label,
# so a split missing the highest class would get a narrower label matrix than the other.
n_label_classes = int(max(np.max(y_train), np.max(y_test))) + 1
labels_train = to_categorical(np.asarray(y_train), num_classes=n_label_classes)
labels_test = to_categorical(np.asarray(y_test), num_classes=n_label_classes)
print('Shape of training data:', data_train.shape)
print('Shape of training label:', labels_train.shape)

Shape of training data: (449782, 500)
Shape of training label: (449782, 2)


In [13]:
# Draw a small random subset of each split so a model can be smoke-tested quickly.
data_train_sample, labels_train_sample = sample_data(data_train, labels_train, ratio=sample_rate)
data_test_sample, labels_test_sample = sample_data(data_test, labels_test, ratio=sample_rate)

In [14]:
# Persist the full padded train/test arrays and their one-hot labels.
# The context manager flushes and closes the file even if pickling fails part-way.
with open(model_data_path, 'wb') as model_data_file:
    pickle.dump((data_train, labels_train, data_test, labels_test), model_data_file)

In [15]:
# Persist the sampled train/test subsets in the same tuple layout as the full data.
# The context manager flushes and closes the file even if pickling fails part-way.
with open(sample_data_path, 'wb') as sample_data_file:
    pickle.dump((data_train_sample, labels_train_sample, data_test_sample, labels_test_sample), sample_data_file)

In [4]:
# Show the loaded tokenizer object itself. Tokenizer has no `word` attribute
# (the vocabulary lives in `tokenizer.word_index`), so `tokenizer.word` would raise AttributeError.
tokenizer

<keras_preprocessing.text.Tokenizer at 0x19b86e97978>