In [1]:
# import libraries
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization, Input, Dense, GlobalAveragePooling1D

print(tf.__version__)

2023-03-12 10:13:38.022748: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-12 10:13:38.245845: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


2.11.0


## Load data

In [None]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

In [3]:
# load the data into dataframes
train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"
df_train = pd.read_table('train-data.tsv', names=['ham_or_spam', 'sms'])
df_val = pd.read_table('valid-data.tsv', names=['ham_or_spam', 'sms'])

# check the shapes
print(f'training dataset shape: {df_train.shape}')
print(f'validation dataset shape: {df_val.shape}')

training dataset shape: (4179, 2)
validation dataset shape: (1392, 2)


## Pre-process data

In [4]:
# create a label column with 0's for ham and 1's for spam
df_train['label'] = df_train['ham_or_spam'].map({'ham': 0, 'spam': 1})
df_val['label'] = df_val['ham_or_spam'].map({'ham': 0, 'spam': 1})

In [5]:
# check the class distribution of the training data
df_train['ham_or_spam'].value_counts()

ham     3619
spam     560
Name: ham_or_spam, dtype: int64

In [6]:
# set up class weights to account for the renatively small number of positive samples
neg = (df_train['label'] == 0).sum()
pos = (df_train['label'] == 1).sum()

# calculate weights using the formula recommended in Tensorflow documentation
# https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#class_weights
weight_for_0 = (1 / neg) * (len(df_train) / 2.0)
weight_for_1 = (1 / pos) * (len(df_train) / 2.0)

class_weight = {0: weight_for_0, 1: weight_for_1}
print(class_weight)

{0: 0.5773694390715667, 1: 3.7312499999999997}


In [7]:
# set up train and validation sets as numpy arrays
train_texts = df_train['sms'].to_numpy()
train_labels = df_train['label'].to_numpy()
val_texts = df_val['sms'].to_numpy()
val_labels = df_val['label'].to_numpy()

## Set up text processing layers

In [8]:
# max number of words to use in our vocab
max_vocab_length = 10000

# max tokens in each sequence (sequences will be truncated or padded to this length)
output_sequence_length = 30

# create a text vectorizer layer to convert text to sequences of integers
text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode='int',
                                    output_sequence_length=output_sequence_length)

# Fit the text vectorizer to the training text
text_vectorizer.adapt(train_texts)

2023-03-12 10:18:57.265524: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
# Create an embedding layer to convert integer sequences to dense vectors
embedding = tf.keras.layers.Embedding(input_dim=max_vocab_length,
                                      output_dim=128, 
                                      input_length=output_sequence_length)

## Build & fit the model

In [10]:
# build the model
inputs = Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs) # turn the input text into numbers
x = embedding(x) # create an embedding of those numbers
x = GlobalAveragePooling1D()(x) # condense the embeddings (1 vector per token) down to one vector
outputs = Dense(1, activation='sigmoid')(x) # create an output layer
model = tf.keras.Model(inputs, outputs)


In [11]:
# view a summary of the model
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVec  (None, 30)               0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 30, 128)           1280000   
                                                                 
 global_average_pooling1d (G  (None, 128)              0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 1,280,129
Trainable params: 1,280,129
Non-train

In [12]:
# Compile the model
model.compile(loss='binary_crossentropy',
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

In [13]:
# Fit the model
history = model.fit(train_texts,
                    train_labels,
                    epochs=10,
                    validation_data=(val_texts, val_labels),
                    class_weight=class_weight)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Evaluate the model

In [14]:
# Evaluate the model
model.evaluate(val_texts, val_labels)



[0.06043415889143944,
 0.9863505959510803,
 0.9666666388511658,
 0.9304812550544739]

In [15]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):
  # get the prediction and round it to 0 or 1
  prediction = round(model.predict([pred_text])[0][0]) 

  if prediction == 1:
    return [prediction, 'spam']
  else:
    return [prediction, 'ham']

pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
print(prediction)

[0, 'ham']


In [16]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()


You passed the challenge. Great job!
