# Installing Required Libraries

In [None]:
# Install the packages listed in ../requirements.txt via the %pip magic,
# which targets the environment of the running kernel.
%pip install -r "../requirements.txt"

## Notebook configuration

In [None]:
# Data location and schema.
# Forward slashes are used (as in MODEL_WEIGHT_PATH) so the path resolves on
# Windows, Linux and macOS alike.
TRAINING_FILEPATH = "Text Moderation/Dataset/messages.csv"  # Path to the training data CSV file
# The CSV is read with generated column names C0, C1, ..., so "C3" is the column
# at zero-based position 3 (the 4th column) and "C5" the one at position 5 (the 6th).
INPUT_DATA_COLUMN_NAME = "C3"  # Column holding the message text
TARGET_DATA_COLUMN_NAME = "C5"  # Column holding the target label
MAX_COLUMN_TO_READ = 100  # Number of generated column names (C0..C99); default is 100 and working fine
MODEL_WEIGHT_PATH = r"../weights/spam_model_weights/Spam_Model_weight"  # Default path to save weights


# Model training parameters
NUM_OF_DATASET_SPLIT = 5  # Number of stratified K-fold splits
EPOCHS = 5  # Training epochs run for each fold
BATCH_SIZE = 128  # Batch size passed to model.fit

# Import necessary libraries

In [None]:
# DATA operations library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Text preprocessing library
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Deep learning model library
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification, pipeline
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight

# Utils Function

1. convert_to_number(series): Convert each value of a string series to an integer, with error handling for values that cannot be converted.
2. remove_html_tags_special_character(col: pd.Series): Remove unnecessary HTML tags and special characters from the input data.
3. remove_punctuations(text): Remove all punctuation from the input text.
4. remove_stopwords(text: str): Remove all English stop words from the input text.
5. balance_data(df: pd.DataFrame, y_column_name: str): Balance the input DataFrame by downsampling the larger class; y_column_name is the name of the target column.

In [None]:
def convert_to_number(series):
    """Convert every element of ``series`` to ``int`` in place.

    Arguments:
    series: mutable, position-indexable sequence (e.g. a list) of raw values.

    Returns:
    The same ``series`` object. Each element is replaced by ``int(element)``,
    or by ``None`` when the element cannot be converted.
    """
    for position, raw_value in enumerate(series):
        try:
            series[position] = int(raw_value)
        except (ValueError, TypeError, OverflowError):
            # ValueError: non-numeric text or NaN; TypeError: None and other
            # unsupported types; OverflowError: +/- infinity.
            series[position] = None
    return series


def remove_html_tags_special_character(col: pd.Series) -> pd.Series:
    """Strip a fixed set of HTML tags and special characters from text values.

    The tags p, ul, li, br, strong, span, a, em and div are removed in their
    opening (with or without attributes), closing and self-closing forms. A
    literal backslash followed by ``n`` and the ``~`` character are removed too.
    Matches are removed wherever they occur inside a value, not only when the
    whole value equals the tag.

    Arguments:
    col: Series of raw text. Non-string entries are left untouched.

    Returns:
    A new Series with the matches removed; ``col`` itself is not modified.
    """
    tag_names = ("p", "ul", "li", "br", "strong", "span", "a", "em", "div")
    # \b stops "p" from matching longer tag names such as <pre>; [^>]* swallows
    # optional attributes and the " /" of self-closing tags.
    pattern = r"</?(?:" + "|".join(tag_names) + r")\b[^>]*>|\\n|~"
    # With regex=True, Series.replace substitutes inside each string value.
    return col.replace(to_replace=pattern, value="", regex=True)

# Characters treated as punctuation (the ASCII set from the string module).
punctuations_list = string.punctuation


def remove_punctuations(text):
    """Return ``str(text)`` with every character of ``punctuations_list`` deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(punctuations_list))
    return str(text).translate(deletion_table)


def remove_stopwords(text, stop_words=None):
    """Lower-case ``text`` and drop stop words from it.

    Arguments:
    text: value to clean; it is converted with ``str()`` first.
    stop_words: optional iterable of lower-case words to drop. When omitted,
        the NLTK English stop-word list is used.

    Returns:
    The remaining lower-cased words joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # A set gives constant-time membership tests instead of scanning a list
    # once per word.
    excluded = set(stop_words)
    # Presumably "br" is what a leftover <br> tag looks like once punctuation
    # has been stripped. Only that exact token is dropped, so ordinary words
    # that merely contain "br" (e.g. "library") are kept.
    excluded.add("br")

    kept_words = [word for word in str(text).lower().split() if word not in excluded]
    return " ".join(kept_words)

def balance_data(df,y_column_name):
    """Downsample the larger class so labels 0 (ham) and 1 (spam) are equally frequent.

    Arguments:
    df: DataFrame containing the label column.
    y_column_name: name of the label column; rows with other labels are dropped.

    Returns:
    A new DataFrame with a fresh 0..n-1 index: label-0 rows first, then label-1
    rows. Sampling uses a fixed random_state of 42.
    """
    labels = df[y_column_name]
    negatives, positives = df[labels == 0], df[labels == 1]
    n_negatives, n_positives = len(negatives), len(positives)

    if n_positives > n_negatives:
        positives = positives.sample(n=n_negatives, random_state=42)
    else:
        # Also taken when both classes are the same size.
        negatives = negatives.sample(n=n_positives, random_state=42)

    return pd.concat([negatives, positives], ignore_index=True)

# Preprocess function to clean the dataset for model training

In [None]:
def preprocess(df,X_column_name="C3",y_column_name="C5",split_ratio=[0.1,0.2]):
    """
    Clean the raw data so it can be used for model training.

    Arguments:
    df: DataFrame of raw data
    X_column_name: name of the input (text) column in df
    y_column_name: name of the target (label) column in df
    split_ratio: not used by this function; kept so existing callers keep working

    Returns:
    A new DataFrame (the caller's frame is not modified) holding only the two
    requested columns. It keeps the original row index and contains only rows
    whose label is 0 or 1 and whose text is not missing. Labels are stored as
    integers; the text has been passed through
    remove_html_tags_special_character, had newlines and question marks
    removed, and then been passed through remove_punctuations and
    remove_stopwords.
    """
    # Work on an explicit copy: assigning into a plain column slice of the
    # caller's frame triggers SettingWithCopyWarning and may not take effect.
    df = df[[X_column_name, y_column_name]].copy()

    # Target value preprocessing: labels that cannot be converted become NaN.
    # The Series is built on df.index so every value stays on its own row even
    # when the index is not 0..n-1.
    labels = convert_to_number(df[y_column_name].to_list())
    df[y_column_name] = pd.Series(labels, index=df.index, dtype="float64")

    # Keep binary labels only (this also drops NaN and negative values) and
    # discard rows without text, which str() would otherwise turn into "nan".
    usable_rows = df[y_column_name].isin([0, 1]) & df[X_column_name].notna()
    df = df.loc[usable_rows].copy()
    df[y_column_name] = df[y_column_name].astype(int)

    # Input value preprocessing
    # Step 1: remove HTML tags and extra special characters. Results are
    # assigned back explicitly; an in-place replace on df[column] would act on
    # a temporary Series.
    text = remove_html_tags_special_character(df[X_column_name].copy())
    text = text.replace(to_replace='\n', value='', regex=True)
    text = text.replace(to_replace='\\?', value='', regex=True)

    # Step 2: NLP text preprocessing
    text = text.apply(remove_punctuations).apply(remove_stopwords)
    df[X_column_name] = text
    return df

In [None]:
# Preprocessing raw data
# The CSV is read without a header; columns get the generated names C0..C{MAX_COLUMN_TO_READ - 1}.
raw_df = pd.read_csv(
    TRAINING_FILEPATH,
    encoding="UTF-8",
    names=[f"C{i}" for i in range(MAX_COLUMN_TO_READ)],
)
df = preprocess(raw_df, INPUT_DATA_COLUMN_NAME, TARGET_DATA_COLUMN_NAME)
# Give the frame a 0..n-1 index (the previous index is kept as an "index" column).
df = df.reset_index()

# np.unique returns the labels sorted, and compute_class_weight returns one
# weight per entry of `classes` in that same order. Zipping the two pairs every
# label with its own weight no matter which class appears first in the data.
label_values = np.unique(df[TARGET_DATA_COLUMN_NAME])
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=label_values,
    y=df[TARGET_DATA_COLUMN_NAME],
)
class_weights = {
    int(label): float(weight)
    for label, weight in zip(label_values, balanced_weights)
}

# Text encoding using the BERT uncased tokenizer

In [None]:
# Load the pretrained tokenizer of the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode(texts, tokenizer, max_len):
    """Tokenize texts into fixed-length input-id and attention-mask arrays.

    Arguments:
    texts: iterable of strings to tokenize.
    tokenizer: callable tokenizer (e.g. a Hugging Face tokenizer) returning a
        mapping with 'input_ids' and 'attention_mask'.
    max_len: length every sequence is padded or truncated to.

    Returns:
    Tuple ``(input_ids, attention_masks)`` of NumPy arrays with one row per text.
    """
    input_ids, attention_masks = [], []
    for text in texts:
        encoding = tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_len,
            # padding='max_length' replaces the deprecated pad_to_max_length=True.
            padding='max_length',
            # Truncate explicitly so longer texts still give rows of max_len.
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
        )
        input_ids.append(encoding['input_ids'])
        attention_masks.append(encoding['attention_mask'])

    return np.array(input_ids), np.array(attention_masks)

In [None]:
# A fixed random_state makes the shuffled folds identical on every run, so
# results can be reproduced after restarting the kernel.
skf = StratifiedKFold(n_splits=NUM_OF_DATASET_SPLIT, shuffle=True, random_state=42)
MAX_LENGTH = 100  # Sequence length passed to encode() as max_len
# Load the pre-trained model
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')



# Model Training Loop

In [None]:
# NOTE(review): the same `model` instance is fitted in every fold, so from the
# second fold on the validation rows were already used for training in an
# earlier fold and the printed metrics are optimistic. Re-create the model per
# fold if unbiased cross-validation scores are needed.
text_column = df[INPUT_DATA_COLUMN_NAME]
label_column = df[TARGET_DATA_COLUMN_NAME]

for train_idx, val_idx in skf.split(text_column, label_column):
    # StratifiedKFold yields positional indices, so rows are selected with
    # .iloc; label-based df[col][idx] only works while the index is 0..n-1.
    X_train, X_val = text_column.iloc[train_idx], text_column.iloc[val_idx]
    y_train = label_column.iloc[train_idx].to_numpy()
    y_val = label_column.iloc[val_idx].to_numpy()

    train_input_ids, train_attention_mask = encode(X_train, tokenizer, MAX_LENGTH)
    val_input_ids, val_attention_mask = encode(X_val, tokenizer, MAX_LENGTH)

    # Fine-tune the model; a fresh optimizer is created and compiled for each fold.
    optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
    # The loss is built with from_logits=True, i.e. it expects raw logits.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

    history = model.fit(
        [train_input_ids, train_attention_mask],
        y_train,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_data=([val_input_ids, val_attention_mask], y_val),
        class_weight=class_weights
    )

    # Evaluate the model on this fold's validation split
    preds = model.predict([val_input_ids, val_attention_mask])
    # The first element of the prediction output is used as the per-class
    # scores; argmax over axis 1 picks the predicted label for each row.
    preds = np.argmax(preds[0], axis=1)
    acc = accuracy_score(y_val, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(y_val, preds, average="binary")
    print(f"Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1 score: {f1:.4f}")

# Saving the Model Weights

In [None]:
# Write the weights of the trained model to MODEL_WEIGHT_PATH and report the location.
model.save_weights(MODEL_WEIGHT_PATH)
print(f"Spam Model weights are saved at {MODEL_WEIGHT_PATH}")