<a href="https://colab.research.google.com/github/zGamingTechz/FreeCodeCamp-Machine-Learning-with-Python-Solutions/blob/main/fcc_sms_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# import libraries

In [None]:
!pip install --upgrade tensorflow
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds

# get data files


In [None]:
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [None]:
# Both files are header-less, tab-separated tables; name the two columns
# directly while loading each split.
column_names = ['label', 'text']
train_data = pd.read_csv(train_file_path, sep="\t", header=None).set_axis(column_names, axis=1)
test_data = pd.read_csv(test_file_path, sep="\t", header=None).set_axis(column_names, axis=1)

Visualising the data

In [None]:
# A notebook cell only echoes its final expression, so the intermediate
# summaries are passed to display() explicitly; otherwise they are computed
# and silently thrown away.

# Missing-value count per column for each split.
display(train_data.isna().sum())
display(test_data.isna().sum())

# Share of each label among all training rows.
display(train_data['label'].value_counts()/train_data.shape[0])

# The same label distribution as a pie chart with whole-number percentages.
plt.rcParams["figure.figsize"] = [8,10]
train_data.label.value_counts().plot(kind='pie', autopct='%1.0f%%')

In [None]:
# Encode the string labels as integer codes using ONE category order shared by
# both splits. Encoding each split independently (astype('category') on each)
# derives the codes from whatever labels that split happens to contain, so the
# same label could receive different codes in train and test.
# The order is the sorted set of training labels, which is what
# astype('category') produces for the training split, so y_train is unchanged.
# A test label never seen in training is encoded as -1.
label_dtype = pd.CategoricalDtype(categories=sorted(train_data['label'].dropna().unique()))
y_train = train_data['label'].astype(label_dtype).cat.codes
y_test  = test_data['label'].astype(label_dtype).cat.codes

Preprocessing

In [None]:
import re
import nltk
from nltk import stem
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [None]:
# Fetch the NLTK resources used by the preprocessing cells: the stop-word
# lists and the WordNet corpus (presumably required by WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Read the corpus through a private alias. Previously the resulting set was
# bound to the name `stopwords`, which shadowed the imported corpus module and
# made this cell raise AttributeError ('set' has no attribute 'words') when it
# was run a second time.
from nltk.corpus import stopwords as _stopwords_corpus

# Snowball stemmer instance; no other cell visible in this notebook uses it.
stemmer = stem.SnowballStemmer('english')
# Lemmatizer used by clean_messages.
lemmatizer = WordNetLemmatizer()
# English stop words as a set. The name `stopwords` is kept because
# clean_messages looks it up as a global.
stopwords = set(_stopwords_corpus.words('english'))

In [None]:
def clean_messages(text):
  """Normalise one raw message into a space-separated string of lemmas.

  Steps: replace every run of characters that are neither whitespace nor
  word characters with a single space, lowercase, split on whitespace, drop
  tokens found in the global `stopwords` set, lemmatize the rest with the
  global `lemmatizer`, and join the result with single spaces.
  """
  without_punctuation = re.sub(r'([^\s\w])+'," ",text)
  tokens = without_punctuation.lower().split()

  kept = []
  for token in tokens:
    if token in stopwords:
      continue
    kept.append(lemmatizer.lemmatize(token))

  return " ".join(kept)

# Cleaned message text for each split; clean_messages is applied element-wise.
X_train = train_data['text'].apply(clean_messages)
X_test = test_data['text'].apply(clean_messages)

Vectorizing

In [None]:
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

# Build the vocabulary from the cleaned training text only (num_words=1000).
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(X_train)

# Turn each training message into a list of integer token ids and show the
# first one as a sanity check.
train_token_ids = tokenizer.texts_to_sequences(X_train)
print(train_token_ids[0])

# Bring every id list to a fixed length of 500 and preview the first rows.
X_train_sequence = sequence.pad_sequences(train_token_ids, maxlen=500)
X_train_sequence[:5]

In [None]:
# Encode the cleaned test messages with the tokenizer fitted on the training
# text, then pad to the same length of 500 used for the training sequences.
X_test_sequence = sequence.pad_sequences(
  tokenizer.texts_to_sequences(X_test),
  maxlen=500,
)

# Building The Model

In [None]:
# Keep top 1000 frequently occurring words
# Vocabulary size: passed as input_dim to the Embedding layer below. It matches
# the num_words=1000 given to the Tokenizer in the vectorizing cell.
max_words = 1000

# Sequence length: used as the model's Input shape. It matches the maxlen=500
# given to pad_sequences when the train/test sequences were built.
max_len = 500
# Shell command: prints metadata of the installed tensorflow package.
!pip show tensorflow

In [None]:
# Binary classifier: token ids -> embeddings -> LSTM -> dense head -> sigmoid.
# The input shape is declared once, on the Input layer. The former
# `input_shape=` argument on Embedding was redundant after an explicit Input
# layer and is a deprecated layer argument in current Keras, so it is dropped.
model = tf.keras.models.Sequential([
  # Each sample is a sequence of max_len integer token ids.
  tf.keras.layers.Input(shape=(max_len,)),
  # Look up a 50-dimensional vector for each of the max_words possible ids.
  tf.keras.layers.Embedding(input_dim=max_words, output_dim=50),
  tf.keras.layers.LSTM(64),
  tf.keras.layers.Dense(256, activation='relu'),
  tf.keras.layers.Dropout(0.5),
  # Single sigmoid unit: one score in [0, 1] per message.
  tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.summary()

In [None]:
from keras.optimizers import RMSprop
from keras.callbacks import EarlyStopping

In [None]:
# Early stopping watches the validation loss; min_delta is the smallest change
# that is counted as an improvement.
early_stopping = EarlyStopping(monitor = 'val_loss', min_delta = 0.0001)

model.compile(
  optimizer = RMSprop(),
  loss = 'binary_crossentropy',
  metrics = ['accuracy'],
)

# Hold out 20% of the training data for validation during fitting.
history = model.fit(
  X_train_sequence,
  y_train,
  epochs = 10,
  batch_size = 128,
  validation_split = 0.2,
  callbacks = [early_stopping],
)

In [None]:
# Score the trained model on the sequences built from valid-data.tsv.
# NOTE(review): the model was compiled with a loss and metrics=['accuracy'], so
# evaluate() presumably returns [loss, accuracy] rather than a single accuracy
# value despite the variable name - confirm before indexing into it.
accuracy = model.evaluate(X_test_sequence, y_test, verbose = 2)

Preprocessing new messages for prediction

In [None]:
def pre_process(pred_text):
  """Turn raw message text into padded token-id sequences for the model.

  Args:
    pred_text: a pandas Series, list, or other iterable of message strings.
      A single string is also accepted and treated as one message.

  Returns:
    The array produced by pad_sequences, one row per message, each row
    max_len ids long.
  """
  # A bare string would otherwise be iterated character by character.
  if isinstance(pred_text, str):
    pred_text = [pred_text]

  cleaned = [clean_messages(message) for message in pred_text]
  token_ids = tokenizer.texts_to_sequences(cleaned)
  # Use max_len (the model's Input length) instead of repeating the literal 500.
  return sequence.pad_sequences(token_ids, maxlen=max_len)

Predicting a message

In [None]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):
  """Classify a single message as 'ham' or 'spam'.

  Args:
    pred_text: the raw message string.

  Returns:
    A two-element list [score, label]: score is the model's sigmoid output
    as a plain Python float, and label is 'ham' when score < 0.5, otherwise
    'spam'.
  """
  features = pre_process(pd.Series([pred_text]))
  # predict() returns one row per input message; take the only value of the
  # only row. float() converts the NumPy scalar so the returned list matches
  # the documented example of a plain float.
  score = float(model.predict(features)[0][0])
  label = 'ham' if score < 0.5 else 'spam'
  return [score, label]

# Quick manual check: classify one sample message and print [score, label].
pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
print(prediction)

In [None]:
# Run this cell to test your function and model. Do not modify contents.
# Calls predict_message on seven fixed messages and prints whether every
# predicted label matches the expected one.
def test_predictions():
  # Sample messages, paired by position with the labels in test_answers.
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  # Only the label (index 1 of the returned list) is checked, not the score.
  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()