In [None]:
import pandas as pd
import keras
import numpy as np
import tensorflow as tf

from keras import models, layers, optimizers

# Resolve the TPU cluster attached to this runtime and report its worker addresses.
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tpu_workers = tpu.cluster_spec().as_dict()['worker']
print('Running on TPU ', tpu_workers)

# Connect to the cluster and initialize the TPU system before building the
# distribution strategy that the model is later created under.
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
tpu_strategy = tf.distribute.TPUStrategy(tpu)

Running on TPU  ['10.98.108.178:8470']


In [None]:
# Download the SMS training and validation TSV files from the freeCodeCamp CDN
# into the current working directory (-q silences wget's output).
!wget -q https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget -q https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

# Paths of the downloaded files, relative to the working directory.
# Note: the "valid" file is the split this notebook treats as its test set.
train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [None]:
def retrieve_data(file_path):
    """Load a tab-separated SMS file and return its texts and numeric labels.

    The file is read without a header row as two columns: the label
    ('ham' or 'spam') followed by the message text.

    Args:
        file_path: Path of the TSV file to read.

    Returns:
        A ``(features, labels)`` tuple of lists: ``features`` holds the message
        texts and ``labels`` holds 0 for 'ham' and 1 for 'spam', in file order.

    Raises:
        ValueError: If the label column contains a value other than
            'ham' or 'spam'.
    """
    column_names = ['labels', 'text']
    label_to_id = {'ham': 0, 'spam': 1}

    df = pd.read_csv(file_path, sep='\t', header=None, names=column_names)

    # Assign the encoded column back instead of calling replace(inplace=True)
    # on df['labels']: that chained in-place call operates on a temporary
    # Series under pandas Copy-on-Write and would leave the labels as strings.
    encoded = df['labels'].map(label_to_id)
    if encoded.isna().any():
        unknown = sorted(set(df.loc[encoded.isna(), 'labels'].astype(str)))
        raise ValueError(f"unexpected label(s) in {file_path!r}: {unknown}")
    df['labels'] = encoded.astype(int)

    features = df['text'].tolist()
    labels = df['labels'].tolist()

    return features, labels

# Read both splits from disk as parallel lists of message texts and labels.
train_features, train_labels = retrieve_data(file_path=train_file_path)
test_features, test_labels = retrieve_data(file_path=test_file_path)

In [None]:
# Create the tokenizer with '<OOV>' as its out-of-vocabulary token and fit it
# on the training texts only, so the test texts do not influence the vocabulary.
# NOTE(review): keras.preprocessing.text.Tokenizer is a legacy API — confirm it
# is still available in the installed Keras version before upgrading.
tokenizer = keras.preprocessing.text.Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(train_features)

In [None]:
def preprocess_data(features, labels, maxlen=190):
    """Turn raw texts into padded integer sequences and labels into an array.

    The texts are converted to token-id sequences with the module-level
    ``tokenizer`` (which must already be fitted), then every sequence is
    padded or truncated at its end to ``maxlen`` entries.

    Args:
        features: List of text messages.
        labels: List of labels, one per message.
        maxlen: Length every sequence is padded or truncated to. Defaults to
            190; per the original note, the longest text message in the train
            and test data has a length of 189 tokens.

    Returns:
        A ``(features, labels)`` tuple of numpy arrays.
    """
    sequences = tokenizer.texts_to_sequences(features)

    # padding='post' / truncating='post' both act on the end of each sequence;
    # the output is a numpy array
    padded = keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=maxlen, padding='post', truncating='post'
    )
    labels = np.array(labels)

    return padded, labels

x_train, y_train = preprocess_data(train_features, train_labels)
x_test, y_test = preprocess_data(test_features, test_labels)

# Keras does not shuffle a tf.data.Dataset passed to fit(), so shuffle here
# (before batching, with a buffer covering the whole training set) to avoid
# presenting the examples in file order every epoch. prefetch() overlaps input
# preparation with training steps.
batch_size = 32
train_dataset = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(buffer_size=len(x_train))
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Model hyperparameters.
max_sequence_length = 190
embedding_dim = 100
# One more than the number of words the tokenizer learned
# (presumably to leave id 0 for padding — confirm against the Tokenizer docs).
vocab_size = len(tokenizer.word_index) + 1

# Build and compile inside the strategy scope so the model's variables are
# created under the TPU distribution strategy.
with tpu_strategy.scope():
    model = models.Sequential([
        layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length),
        layers.LSTM(units=50, return_sequences=True),
        layers.GlobalMaxPooling1D(),
        layers.Dense(units=1, activation='sigmoid'),
    ])

    model.compile(
        optimizer=optimizers.Adam(0.001),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

In [None]:
# Train for 5 epochs on the batched training dataset; the History object
# returned by fit() is this cell's displayed value.
model.fit(train_dataset, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7bd2af9702b0>

In [None]:
# Score the trained model on the held-out arrays; the displayed result is
# [loss, accuracy], matching the loss and the single metric passed to compile().
model.evaluate(x_test, y_test)



[0.045941855758428574, 0.9899425506591797]