## Load Data

In [1]:
import os
import pathlib
import pandas as pd
import random

In [2]:
# When True, the project root is the parent of the current working directory;
# otherwise the working directory itself is used as the root.
USE_PROJECT_ROOT = True
BASE_DIR = pathlib.Path().resolve()
BASE_DIR = BASE_DIR.parent if USE_PROJECT_ROOT else BASE_DIR

# Directory layout: <root>/datasets/exports/
DATASET_DIR = BASE_DIR.joinpath("datasets")
EXPORT_DIR = DATASET_DIR.joinpath("exports")

# Input CSV read by this notebook and the pickle file it writes.
DATASET_CSV_PATH = EXPORT_DIR.joinpath('spam-dataset.csv')
TRAINING_DATA_PATH = EXPORT_DIR.joinpath('spam-training-data.pkl')

print("BASE_DIR is", BASE_DIR)

BASE_DIR is /Users/XiaoboTang/Public/ml_api


In [3]:
RUN_DATASET_PREPARE = False
if RUN_DATASET_PREPARE:
    SOURCE_NB = pathlib.Path('Download, View, Combine & Save Datasets.ipynb')
    if SOURCE_NB.exists():
        %run './{SOURCE_NB}'
    else:
        print("The data preparation notebook does not exist.")

In [4]:
# Fail fast with a specific exception type when the exported CSV is missing.
# FileNotFoundError is a subclass of Exception, so any existing
# `except Exception` handling keeps working.
if not DATASET_CSV_PATH.exists():
    raise FileNotFoundError(f"{DATASET_CSV_PATH} not found.")

In [5]:
# pandas accepts path-like objects directly, so no str() conversion is needed.
df = pd.read_csv(DATASET_CSV_PATH)

In [6]:
# Preview the first three rows of the loaded dataset.
df.head(n=3)

Unnamed: 0,label,text,source
0,ham,"Go until jurong point, crazy.. Available only ...",sms-spam
1,ham,Ok lar... Joking wif u oni...,sms-spam
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,sms-spam


## Convert Lists & Verify Indices

In [7]:
# Extract the 'label' and 'text' columns as plain Python lists (same row order).
labels, texts = (df[column].tolist() for column in ('label', 'text'))

In [8]:
# Label name -> integer class id.
labels_legend = dict(ham=0, spam=1)
# Reverse lookup; keys are the class ids rendered as strings ('0', '1').
labels_legend_inverted = {str(class_id): name for name, class_id in labels_legend.items()}

Alternatively, the legend can be built automatically from the labels found in the data, which is more convenient when there are more than two labels.

In [9]:
# Build the label -> integer legend from the labels present in the data.
# Iteration order of a set of strings can differ between interpreter runs
# (string hashing is randomized), so the unique labels are sorted first to
# give every label the same integer id on every run. `key=str` keeps the
# sort well-defined even if the labels are not all strings.
legend = {f"{x}": i for i, x in enumerate(sorted(set(labels), key=str))}
# Reverse lookup; keys are the integer ids rendered as strings.
legend_inverted = {f'{v}': k for k, v in legend.items()}

In [10]:
# Translate every label into its integer class id via the manual legend.
labels_as_int = list(map(lambda label: labels_legend[str(label)], labels))

In [11]:
# Spot-check one random row: the extracted lists and the integer encoding must
# still line up with the DataFrame they were derived from.
# randrange excludes its upper bound. randint(0, len(labels)) includes it, so
# it could return len(labels) and trigger an IndexError on the lookups below.
random_idx = random.randrange(len(labels))
print('Random Index', random_idx)

assert texts[random_idx] == df.iloc[random_idx].text
assert labels[random_idx] == df.iloc[random_idx].label
assert labels_legend_inverted[str(labels_as_int[random_idx])] == df.iloc[random_idx].label

Random Index 5073


## Tokenize Text & Pad Sequences

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [13]:
# Passed as `num_words` to the Keras Tokenizer and exported with the training
# data under the 'max_words' key.
MAX_NUM_WORDS = 280

In [14]:
# Fit the tokenizer on the full corpus, then encode each text as a list of
# integer token ids.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)

# Vocabulary mapping learned during fitting.
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)

print(f'Found {len(word_index)} unique tokens.')

Found 12077 unique tokens.


In [15]:
# There must be exactly one token sequence and one integer label per text.
assert len(sequences) == len(texts) and len(texts) == len(labels_as_int)

In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [17]:
# Passed as `maxlen` to pad_sequences and exported with the training data
# under the 'max_sequence' key.
MAX_SEQUENCE_LENGTH = 280

In [18]:
# Bring every token sequence to the same length so they form one 2-D array.
X = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
)

## Vectorize Categorical Labels

In [19]:
import numpy as np
from tensorflow.keras.utils import to_categorical

In [20]:
# Convert the list of integer class ids to an array, then one-hot encode it.
labels_as_int_array = np.array(labels_as_int)
y = to_categorical(labels_as_int_array)

## Split & Export Vectorized Datasets

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
# Hold out 33% of the samples for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [23]:
import pickle

In [24]:
# Bundle everything needed for training and later inference: the split
# arrays, the preprocessing limits, both label lookups, and the fitted tokenizer.
training_data = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    max_words=MAX_NUM_WORDS,
    max_sequence=MAX_SEQUENCE_LENGTH,
    legend=labels_legend,
    labels_legend_inverted=labels_legend_inverted,
    tokenizer=tokenizer,
)

In [25]:
# Serialize the bundled training data to the export path.
with TRAINING_DATA_PATH.open('wb') as out_file:
    pickle.dump(training_data, out_file)

In [26]:
# Read the exported file back as a round-trip check.
# NOTE: pickle.load can execute arbitrary code; only load files you trust
# (here, the file this notebook has just written).
data = {}

with TRAINING_DATA_PATH.open('rb') as in_file:
    data = pickle.load(in_file)