<a href="https://colab.research.google.com/github/thirza258/freecodecamp_rock_paper_scissors/blob/main/Copy_of_fcc_sms_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import libraries
try:
  # Install the nightly TensorFlow build before importing tensorflow below.
  # NOTE(review): the package is not version-pinned, and a failing `!pip` shell
  # command presumably does not raise a Python exception, so this try/except
  # likely never triggers — confirm before relying on it.
  !pip install tf-nightly
except Exception:
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

In [None]:
# get data files
# Download the SMS training and validation TSV files into the working directory.
# NOTE(review): re-running this cell downloads both files again — confirm that a
# repeated download does not leave stale copies behind.
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

# Local file names of the two downloads above. The validation split is the file
# that the rest of the notebook loads as its test set.
train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [None]:
import pandas as pd

# Load the training split. The TSV has no header row (every line is
# "<label>\t<message>"), so header=None is required: without it pandas consumes the
# first record as column names and that training example is silently dropped.
# The column names are assigned directly instead of renaming them afterwards.
df_train = pd.read_csv(train_file_path, sep='\t', header=None, names=['class', 'messages'])

# Display the first rows to verify the column names and contents
df_train.head()



In [None]:

# Load the validation split used as the test set. The TSV has no header row (every
# line is "<label>\t<message>"), so header=None is required: without it pandas
# consumes the first record as column names and that example is silently dropped.
# The column names are assigned directly instead of renaming them afterwards.
df_test = pd.read_csv(test_file_path, sep='\t', header=None, names=['class', 'messages'])

# Display the first rows to verify the column names and contents
df_test.head()

In [None]:
from tensorflow.keras import layers, Sequential

# Spam classifier: token ids -> embeddings -> average over the sequence -> small MLP
# -> single sigmoid output.
# The Embedding layer no longer receives `input_length`: recent Keras releases flag
# that argument as deprecated, and the sequence length is inferred from the data the
# model is first called on.
model = Sequential([
    # Embedding layer to convert token ids to dense vectors; input_dim only needs to
    # exceed the largest token id that is fed to the model.
    layers.Embedding(input_dim=10000, output_dim=32),
    layers.GlobalAveragePooling1D(),  # Average the embeddings over the sequence axis
    layers.Dense(64, activation='relu'),  # First hidden layer
    layers.Dropout(0.3),  # Dropout to prevent overfitting
    layers.Dense(32, activation='relu'),  # Second hidden layer
    layers.Dense(1, activation='sigmoid')  # Output layer with sigmoid for binary classification
])

In [None]:
# Encode the string labels as integers (ham -> 0, spam -> 1).
# The mapping is applied only while the column is still non-numeric. Re-running this
# cell is therefore a no-op instead of mapping the already-encoded 0/1 values to NaN.
label_to_id = {'ham': 0, 'spam': 1}
for frame in (df_train, df_test):
    if not pd.api.types.is_numeric_dtype(frame['class']):
        frame['class'] = frame['class'].map(label_to_id)

In [None]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from imblearn.over_sampling import ADASYN
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

# Assuming df_train is your DataFrame with 'messages' and 'class' columns

# Step 1: Report the label balance before any resampling
class_proportions = df_train['class'].value_counts(normalize=True)
print("Original class proportions:", class_proportions, sep="\n")

# Step 2: Fit the vocabulary on the training messages, then encode and pad them
tokenizer = Tokenizer(oov_token="<OOV>", num_words=1000)
tokenizer.fit_on_texts(df_train['messages'])
sequences = tokenizer.texts_to_sequences(df_train['messages'])
padded_sequences = pad_sequences(sequences, padding='post', maxlen=20)

# Step 3: Features are the padded id sequences, the target is the 'class' column
X_train, y_train = padded_sequences, df_train['class']

# Steps 4-5: Build the ADASYN sampler for the minority class and resample the data
adasyn = ADASYN(random_state=42, sampling_strategy='minority')
X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)

# Step 6: Report the label balance after resampling
resampled_class_proportions = pd.Series(y_resampled).value_counts(normalize=True)
print("\nResampled class proportions:", resampled_class_proportions, sep="\n")

# Step 7: Combine the resampled data back into a DataFrame
# Invert the tokenizer vocabulary (word -> id) so id sequences can be rendered as text
reverse_tokenizer = {v: k for k, v in tokenizer.word_index.items()}


def sequence_to_text(sequence):
    """Render a padded sequence of token ids as a space-separated string.

    Id 0 is the padding value and never appears in ``tokenizer.word_index``, so it
    is skipped; previously every padded position was rendered as '<OOV>'.
    Any other id missing from the vocabulary is rendered as '<OOV>'.

    Args:
        sequence: Iterable of integer token ids.

    Returns:
        The words for the non-padding ids, joined by single spaces.
    """
    return ' '.join(reverse_tokenizer.get(i, '<OOV>') for i in sequence if i != 0)


resampled_messages = [sequence_to_text(seq) for seq in X_resampled]

# Create the final DataFrame with messages and class
resampled_df = pd.DataFrame({
    'messages': resampled_messages,
    'class': y_resampled
})

# Display the first few rows of the resampled DataFrame
resampled_df.head()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train on the resampled data for 10 epochs with progress output enabled
model.fit(
    x=X_resampled,
    y=y_resampled,
    epochs=10,
    verbose=1,
)

In [None]:
# 1. Encode the test messages with the tokenizer fitted on the training data
test_sequences = tokenizer.texts_to_sequences(df_test['messages'])
test_padded_sequences = pad_sequences(test_sequences, padding='post', maxlen=20)

# 2. Run the model on the encoded test messages
predictions = model.predict(test_padded_sequences)

# 3. Threshold the model outputs at 0.5 to obtain 0/1 labels
predicted_labels = (predictions > 0.5).astype(int)

# 4. Put the true and the predicted labels side by side
comparison_df = pd.DataFrame({
    'Actual': df_test['class'],
    'Predicted': predicted_labels.flatten(),
})

# 5. Preview the first few rows of the comparison
comparison_df.head()

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of the 'Actual' vs. 'Predicted' columns of comparison_df
conf_matrix = confusion_matrix(comparison_df['Actual'], comparison_df['Predicted'])

# Draw it as an annotated Seaborn heatmap and label the axes on the returned Axes
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Ham', 'Spam'],
    yticklabels=['Ham', 'Spam'],
).set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')
plt.show()

In [None]:
def predict_message(pred_text):
    """Classify a single message as 'ham' or 'spam'.

    The text is encoded with the notebook-level `tokenizer`, padded to length 20
    and scored by the notebook-level `model`. The chosen label is also printed.

    Args:
        pred_text: The message to classify.

    Returns:
        A two-element list ``[probability, label]``: the model output as a float,
        and 'spam' when that value is above 0.5, otherwise 'ham'.
    """
    # Encode the text as a batch of one and pad it
    encoded = pad_sequences(
        tokenizer.texts_to_sequences([pred_text]),
        maxlen=20,
        padding='post',
    )

    # First (and only) output of the first (and only) batch entry
    probability = model.predict(encoded)[0][0]

    if probability > 0.5:
        label = 'spam'
    else:
        label = 'ham'
    print(label)

    return [float(probability), label]

# Example usage
# Classify one sample message and print the returned [probability, label] list.
# predict_message itself also prints the label, so the label appears twice in the output.
pred_text = "how are you doing today?"
prediction = predict_message(pred_text)
print(prediction)


In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  """Check predict_message against seven messages with known labels.

  Each message is passed to predict_message and the second element of the
  returned list is compared with the expected 'ham'/'spam' answer. A pass
  message is printed only when every prediction matches; otherwise a
  keep-trying message is printed. Nothing is returned.
  """
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  # Expected label for each entry of test_messages, in the same order
  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  # A single mismatch fails the whole check; the loop still runs every message
  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()
