In [None]:
import numpy as np 
import pandas as pd 

In [None]:
!pip install tensorflow -q

In [None]:
import tensorflow as tf
# Short alias for the Keras API that ships with TensorFlow; later cells build
# the model through this name.
keras = tf.keras

# Record the TensorFlow release in the notebook output for reproducibility.
print("TensorFlow version:", tf.__version__)

In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Tunable settings shared by the tokenizer, the padding step and the model.
hyperparams = dict(
    vocab_size=1000,  # Tokenizer num_words and Embedding input dimension
    dim=30,           # Embedding output dimension
    max_len=515,      # length every sequence is padded to / model input length
)

In [None]:
import re
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
# Download every NLTK resource the cleaning step relies on, not just WordNet:
#   - wordnet                          -> WordNetLemmatizer
#   - stopwords                        -> stopwords.words('english')
#   - punkt / punkt_tab                -> word_tokenize
#   - averaged_perceptron_tagger(_eng) -> pos_tag (method='lemmatize_pos')
# Both the older and the newer resource ids are requested because the id that
# is looked up depends on the installed NLTK release; an id the index does not
# know is expected to be reported by the downloader rather than raise.
for resource in (
    'wordnet',
    'stopwords',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)
# Workaround kept for hosted environments where the WordNet archive is present
# but not unpacked (presumably Kaggle -- verify before enabling):
# ! unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/



In [None]:
# Load the raw CSV (latin1-encoded), keep only the two columns used below and
# give them descriptive names: v1 -> label, v2 -> message.
data = (
    pd.read_csv("../data/spamDetection/spam.csv", encoding='latin1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

In [None]:
import seaborn as sns
# Bar chart of how many messages fall under each label value.
sns.countplot(data=data, x='label')

### Prepare Y

In [None]:
# Encode the label column with LabelEncoder. The fitted encoder is kept in
# `le` and the original column in `y`; `Y` holds the encoded targets.
le = LabelEncoder()
y = data['label']
Y = le.fit(y).transform(y)

### Clean X

In [None]:
# Initialize lemmatizer and stemmer once; every clean_message call reuses them.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Token-normalisation strategies accepted by clean_message.
_CLEAN_METHODS = ('stem', 'lemmatize', 'lemmatize_pos')

# First letter of the tag from nltk.pos_tag -> pos code handed to the lemmatizer.
_TAG_PREFIX_TO_POS = {'V': 'v', 'N': 'n', 'J': 'a', 'R': 'r'}

# English stop words, filled in on first use by _get_stop_words().
_STOP_WORDS = None


def _get_stop_words():
    """Return the English stop-word set, reading the NLTK corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def clean_message(message, method):
    """Normalise one raw message into a space-separated string of cleaned tokens.

    Steps: lowercase the text, drop every character that is not a-z or
    whitespace, tokenize with NLTK, discard single-character tokens and English
    stop words, then normalise each remaining token with the chosen method.

    Args:
        message: Raw message text (a str).
        method: 'stem' (PorterStemmer), 'lemmatize' (WordNetLemmatizer with its
            default part of speech) or 'lemmatize_pos' (WordNetLemmatizer using
            a part of speech derived from the tag nltk.pos_tag gives each token;
            tags not starting with V/N/J/R are treated as nouns).

    Returns:
        The processed tokens joined by single spaces ('' if no token survives).

    Raises:
        ValueError: If method is not one of the three supported names.
    """
    # Reject an unknown method up front instead of after doing all the work.
    if method not in _CLEAN_METHODS:
        raise ValueError("Invalid method. Choose from 'stem', 'lemmatize', or 'lemmatize_pos'.")

    # Lowercase first so the a-z filter below keeps letters of either case.
    message = message.lower()

    # Remove everything except letters and whitespace (digits, punctuation, ...).
    message = re.sub(r'[^a-z\s]', '', message)

    # Tokenize, then drop single-character words and stop words.
    stop_words = _get_stop_words()
    tokens = [
        word
        for word in nltk.word_tokenize(message)
        if len(word) > 1 and word not in stop_words
    ]

    # Apply the selected method for word processing.
    if method == 'stem':
        processed_tokens = [stemmer.stem(word) for word in tokens]
    elif method == 'lemmatize':
        processed_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    else:  # 'lemmatize_pos'
        processed_tokens = [
            lemmatizer.lemmatize(word, pos=_TAG_PREFIX_TO_POS.get(tag[:1], 'n'))
            for word, tag in nltk.pos_tag(tokens)
        ]

    # Join the tokens back into a single string.
    return ' '.join(processed_tokens)

In [None]:
# Clean every message, using stemming as the token-normalisation method.
# Series.apply forwards the extra keyword argument to clean_message.
X = data['message'].apply(clean_message, method='stem')

### Split dataset

In [None]:
# Hold out 30% of the samples for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1234,
)

### Preprocess X

In [None]:
# Build the vocabulary from the training messages only, so nothing from the
# test split influences the word index.
tokenizer = Tokenizer(num_words=hyperparams['vocab_size'])
tokenizer.fit_on_texts(X_train)


def prepare_input(inputx):
    """Turn cleaned messages into fixed-length integer sequences.

    Each text in `inputx` is mapped to word indices with the fitted module-level
    `tokenizer`, then padded at the front ('pre') to hyperparams['max_len'].

    Returns:
        The padded sequences produced by pad_sequences.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(inputx),
        maxlen=hyperparams['max_len'],
        padding='pre',
    )


# NOTE: this replaces the text Series with its padded form, so the cell cannot
# be re-run without re-running the split cell first.
X_train = prepare_input(X_train)

### Build Model

In [None]:
def build_model():
    """Build and compile the LSTM spam classifier.

    Architecture: integer token ids of length hyperparams['max_len'] ->
    Embedding(hyperparams['vocab_size'], hyperparams['dim']) -> LSTM(256) ->
    Dense(1, sigmoid).

    The model is compiled with the Adam optimizer and binary cross-entropy,
    the loss that matches a single sigmoid output trained on 0/1 labels, and
    reports accuracy. The layer summary is printed as a side effect.

    Returns:
        The compiled keras Sequential model.
    """
    model = keras.models.Sequential([
        # Declare the input explicitly instead of passing input_shape= to the
        # first layer.
        keras.Input(shape=(hyperparams['max_len'],), dtype='int32'),
        keras.layers.Embedding(hyperparams['vocab_size'], hyperparams['dim']),
        keras.layers.LSTM(256),
        keras.layers.Dense(1, activation='sigmoid'),
    ])

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # summary() prints the table itself and returns None, so it is not wrapped
    # in print() (which would emit a stray "None").
    model.summary()
    return model


model = build_model()

### Train Model

In [None]:
# Train for 10 epochs with batches of 128 samples, setting aside 30% of the
# training data for validation.
model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=10,
    validation_split=0.3,
)

### Evaluate Model

In [None]:
# Apply the same tokenizing/padding to the held-out messages, then score them.
X_test = prepare_input(X_test)
# NOTE(review): despite its name, `acc` presumably holds [loss, accuracy],
# since the model is compiled with one loss and one metric -- confirm.
acc = model.evaluate(x=X_test, y=y_test)
acc