#Data Loading and Pre-processing

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, LSTM, Dense, Bidirectional
from sklearn.metrics import classification_report


# Read spam.csv and keep only the two columns used below: 'v1' (renamed to
# 'label') and 'v2' (renamed to 'text'). Failures are reported and re-raised
# so the notebook stops instead of continuing without data.
try:
    df = pd.read_csv('/content/spam.csv', encoding='latin1')
    df = df[['v1', 'v2']]
    df = df.rename(columns={'v1': 'label', 'v2': 'text'})
    print("Data loaded successfully from spam.csv")
except FileNotFoundError:
    print("Error: spam.csv not found.")
    raise
except Exception as e:
    # Covers everything else raised while loading or selecting columns
    # (e.g. a missing 'v1'/'v2' column).
    print(f"Error loading data: {e}")
    raise


# Quick look at the first rows and at how many samples each label has.
print(df.head())

print(df['label'].value_counts())


# Hold out 20% of the rows for testing with a fixed seed for reproducibility.
# Stratifying on the label keeps the label proportions the same in the train
# and test sets, so a skewed class distribution (see the counts printed
# above) cannot leave one split under-represented by chance.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Tokenize Data and Prepare Train-Test Splits

In [None]:

# Tokenizer configured with num_words=max_words and an explicit "<UNK>"
# out-of-vocabulary token; it is fitted on the training texts only.
max_words = 1000
tokenizer = Tokenizer(num_words=max_words, oov_token="<UNK>")
tokenizer.fit_on_texts(X_train)

# Every sequence is padded/truncated at the end ('post') to max_len entries.
max_len = 20
_pad_options = {'maxlen': max_len, 'padding': 'post', 'truncating': 'post'}

# Texts -> integer sequences -> fixed-length arrays, for each split in turn.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_seq, **_pad_options)

X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_seq, **_pad_options)


# Replace the string labels with integers: 'ham' -> 0, 'spam' -> 1.
_label_to_id = {'ham': 0, 'spam': 1}
y_train = y_train.map(_label_to_id)
y_test = y_test.map(_label_to_id)

# Report the shape of every array/series produced above.
for _caption, _data in (
    ("Shape of X_train_padded:", X_train_padded),
    ("Shape of X_test_padded:", X_test_padded),
    ("Shape of y_train:", y_train),
    ("Shape of y_test:", y_test),
):
    print(_caption, _data.shape)


# Load GloVe and Create Embedding Layer

In [None]:

import tensorflow as tf
import numpy as np
import gensim.downloader as api


# Load the "glove-wiki-gigaword-100" vectors through gensim's downloader and
# take the embedding width from the loaded model. Any failure is reported
# and re-raised.
try:
    glove_model = api.load("glove-wiki-gigaword-100")
    embedding_dim = glove_model.vector_size
    print("GloVe embeddings loaded successfully!")
except Exception as e:
    print(f"Error loading GloVe embeddings: {e}")
    raise


# Number of rows in the embedding matrix: capped at max_words, or the
# tokenizer's vocabulary size + 1 when that is smaller.
word_index = tokenizer.word_index
num_words = min(max_words, len(word_index) + 1)

# Start from all zeros; rows for words without a GloVe vector stay zero.
embedding_matrix = np.zeros((num_words, embedding_dim))
for token, row in word_index.items():
    if row >= num_words:
        # Index falls outside the matrix, so there is no row to fill.
        continue
    try:
        embedding_matrix[row] = glove_model[token]
    except KeyError:
        # Token is not in the GloVe vocabulary; keep the zero row.
        continue

# Embedding layer initialised from the matrix above and frozen
# (trainable=False) so training does not update the pretrained vectors.
_glove_initializer = tf.keras.initializers.Constant(embedding_matrix)
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=embedding_dim,
    embeddings_initializer=_glove_initializer,
    input_length=max_len,
    trainable=False,
)


#Define Model

In [None]:
import tensorflow as tf


# Binary classifier stacked on top of the embedding layer built earlier:
# embeddings -> bidirectional LSTM -> 1D convolution -> global max pooling
# -> single sigmoid unit.
model = Sequential()
model.add(embedding_layer)
# return_sequences=True keeps one output per timestep so the Conv1D layer
# that follows has a sequence to convolve over.
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()


#Train Model

In [None]:

# Training settings.
batch_size = 32
epochs = 10

# Fit on the padded training sequences; 10% of the training data is set
# aside for validation via validation_split. The returned History object is
# kept in `history`.
history = model.fit(
    x=X_train_padded,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)


#Results and Evaluations

In [None]:

# Loss and accuracy (the metric passed to compile) on the held-out test set.
loss, accuracy = model.evaluate(X_test_padded, y_test)
for _caption, _value in (('Test Loss:', loss), ('Test Accuracy:', accuracy)):
    print(_caption, _value)


# Threshold the model outputs at 0.5 to obtain 0/1 predictions, then print
# the per-class precision/recall/F1 report.
_raw_outputs = model.predict(X_test_padded)
y_pred = (_raw_outputs > 0.5).astype(int)
print(classification_report(y_test, y_pred))
