# Vectorize, Split and Export Data

## Read Data, Map Labels & Verify Indices

In [54]:
import pathlib
import pandas as pd
import random

# All locations are derived from the parent of the current working directory.
BASE_DIR = pathlib.Path('.').resolve().parent

# <BASE_DIR>/datasets/exports holds the input CSV and the files written later.
DATASET_DIR = BASE_DIR.joinpath('datasets')
EXPORT_DIR = DATASET_DIR.joinpath('exports')
# Create the export folder (and any missing parents); no error if it exists.
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

# Input: the labelled spam dataset CSV.
SPAM_DATASET_PATH = EXPORT_DIR.joinpath('spam-dataset.csv')

# Outputs: pickled training bundle and the tokenizer serialized as JSON.
METADATA_EXPORT_PATH = EXPORT_DIR.joinpath('spam-metadata.pkl')
TOKENIZER_EXPORT_PATH = EXPORT_DIR.joinpath('spam-tokenizer.json')

In [55]:
# Load the spam dataset CSV from the export folder into a DataFrame.
# NOTE(review): the preview output lists an "Unnamed: 0" column, which suggests
# the CSV was written together with its index; index_col=0 may be wanted here.
# Confirm against the step that produced the file.
df = pd.read_csv(SPAM_DATASET_PATH)

In [3]:
# Preview the first three rows to check the loaded columns.
df.head(3)

Unnamed: 0,label,text,source
0,ham,"Go until jurong point, crazy.. Available only ...",sms-spam
1,ham,Ok lar... Joking wif u oni...,sms-spam
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,sms-spam


In [4]:
# Pull the 'label' and 'text' columns out as plain Python lists, in row order.
labels, texts = (df[column].tolist() for column in ('label', 'text'))

In [5]:
# Class name -> integer class id.
label_legend = dict(ham=0, spam=1)
# Reverse lookup keyed by the class id rendered as a string ('0', '1').
label_legend_inverted = {str(class_id): name for name, class_id in label_legend.items()}
label_legend_inverted

{'0': 'ham', '1': 'spam'}

In [6]:
# Translate every string label to its integer class id via label_legend;
# an unknown label raises KeyError.
label_as_int = list(map(label_legend.__getitem__, labels))
label_as_int[:10]

[0, 0, 1, 0, 0, 1, 0, 0, 1, 1]

In [12]:
# Spot-check one random row: the extracted lists and the integer encoding must
# still line up with the DataFrame they were derived from.
# randrange excludes its upper bound. random.randint(0, len(labels)) includes
# it, so it could return len(labels) and raise IndexError on the lookups below.
random_idx = random.randrange(len(labels))
sample_row = df.iloc[random_idx]

assert texts[random_idx] == sample_row.text
assert labels[random_idx] == sample_row.label
# label_legend_inverted is keyed by the class id as a string, hence str(...).
assert label_legend_inverted[str(label_as_int[random_idx])] == sample_row.label

## Tokenize Text & Padding Sequence

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [15]:
# Vocabulary cap passed to the Keras Tokenizer as `num_words`; it is also
# exported with the training bundle under the 'max_words' key.
MAX_NUM_WORDS = 280

In [16]:
# Fit the tokenizer on the whole corpus, then encode every text as a list of
# integer word ids. Fitting has to happen before texts_to_sequences is called.
# NOTE(review): the tokenizer is fitted on all texts before the train/test
# split, so its vocabulary also reflects rows that end up in the test set.
# Confirm this is intended.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [19]:
# Inspect the encoded form of the first three texts.
sequences[:3]

[[55, 66, 10, 123, 143, 204, 169, 77, 68, 187],
 [64, 8],
 [59, 10, 25, 4, 2, 211, 95, 2, 2, 110, 104]]

In [35]:
# Keep a reference to the fitted tokenizer's word index for inspection
# (presumably a word -> integer id mapping; verify against the Keras docs).
word_index = tokenizer.word_index
# word_index  # uncomment to display it

In [36]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [37]:
# Target sequence length passed to pad_sequences as `maxlen`; it is also
# exported with the training bundle under the 'max_seq_length' key.
MAX_SEQ_LENGTH = 300

In [38]:
# Pad the encoded texts to MAX_SEQ_LENGTH entries each; the sample row printed
# afterwards shows zeros filled in at the front of a short sequence.
X = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH)

In [40]:
# Inspect the first padded row.
X[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

## Vectorize Categorical Labels

In [41]:
import numpy as np
from tensorflow.keras.utils import to_categorical

In [44]:
# Convert the integer class ids to a NumPy array and one-hot encode them
# (one column per class, as the sample output shows).
labels_as_int_array = np.asarray(label_as_int)
Y = to_categorical(labels_as_int_array)

In [45]:
Y[:3] # Position 0 set to 1 signals ham (label_legend maps 'ham' to 0); position 1 signals spam

array([[1., 0.],
       [1., 0.],
       [0., 1.]], dtype=float32)

## Split & Export Vectorized Datasets

In [49]:
from sklearn.model_selection import train_test_split

In [50]:
# Hold out 33% of the rows as the test set; random_state is pinned to 42 so
# reruns produce the same split.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [51]:
import pickle

In [56]:
# Bundle the split arrays with the settings and label maps used to build them,
# so everything can be pickled as a single object.
training_data = dict(
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
    y_test=y_test,
    max_words=MAX_NUM_WORDS,
    max_seq_length=MAX_SEQ_LENGTH,
    label_legend=label_legend,
    label_legend_inverted=label_legend_inverted,
)

# Serialize the fitted tokenizer to JSON and write it next to the dataset.
# write_text is kept as the last expression so its return value (the number of
# characters written) remains the cell's output.
tokenizer_json = tokenizer.to_json()
TOKENIZER_EXPORT_PATH.write_text(tokenizer_json)

1090335

In [57]:
# Write the training bundle to the metadata pickle file in binary mode.
with METADATA_EXPORT_PATH.open(mode='wb') as metadata_file:
    pickle.dump(training_data, metadata_file)