In [4]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import DistilBertTokenizerFast

KeyboardInterrupt: 

In [29]:
# ----------------------------
# 1. Load dataset
# ----------------------------
df = pd.read_csv("Dataset_5971.csv")  # replace with your CSV path

# ----------------------------
# 2. Binary classification labels
# ----------------------------
# Normalise LABEL: trim stray whitespace and lowercase, so variants such as
# "Spam" or " ham" are still recognised by the filter below instead of being
# silently dropped.
df['LABEL'] = df['LABEL'].str.strip().str.lower()

# Keep only relevant labels. `.copy()` makes the filtered frame an independent
# object, so adding a column to it below cannot trigger pandas'
# SettingWithCopyWarning (assignment on a slice of another frame).
df = df[df['LABEL'].isin(['ham', 'smishing', 'spam'])].copy()

# Map labels to binary: ham=0, smishing/spam=1
label_map = {'ham': 0, 'smishing': 1, 'spam': 1}
df['label_encoded'] = df['LABEL'].map(label_map)

# ----------------------------
# 3. Text cleaning function
# ----------------------------
def clean_text(text):
    """Normalise one raw message into a lowercase, punctuation-free string.

    The steps run in this order:
      1. lowercase the text;
      2. replace anything starting with "http" (up to the next whitespace)
         with the token "url";
      3. replace standalone runs of 10 or more digits with the token "phone";
      4. replace whitespace-delimited chunks containing "@" with "email";
      5. drop every character that is not an ASCII letter, a digit or
         whitespace;
      6. collapse whitespace runs to a single space and trim both ends.

    Placeholders are inserted before punctuation is stripped, so the
    URL/e-mail patterns still see the ":", "/", "." and "@" they rely on.

    Args:
        text: The raw message. Non-string values are tolerated: ``None`` and
            float NaN (what pandas yields for an empty CSV cell) are treated
            as an empty message; anything else is converted with ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    if not isinstance(text, str):
        # Missing cells would otherwise crash on `.lower()` with
        # AttributeError and abort the whole `.apply` pass.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    text = text.lower()  # lowercase
    text = re.sub(r"http\S+", "url", text)  # replace URLs
    text = re.sub(r"\b\d{10,}\b", "phone", text)  # replace long numbers (phones)
    text = re.sub(r"\S+@\S+", "email", text)  # replace emails
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # remove punctuation/special chars
    text = re.sub(r"\s+", " ", text).strip()  # remove extra spaces
    return text

# Run the cleaner over every raw message and keep the result in its own column.
df['cleaned_text'] = df['TEXT'].apply(clean_text)

# ----------------------------
# 4. Train/test split
# ----------------------------
# Hold out 20% of the rows for testing. The split is stratified on the binary
# label so both parts keep the same class ratio, and the seed is fixed so the
# split is reproducible.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['label_encoded'],
    stratify=df['label_encoded'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# ----------------------------
# 5a. Random Forest preprocessing (TF-IDF)
# ----------------------------
# Unigram + bigram TF-IDF, capped at 5000 features, with English stop words
# removed and sublinear term-frequency scaling enabled.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    stop_words='english',
    sublinear_tf=True,
)

# The vocabulary and IDF weights are learned from the training texts only;
# the test texts are merely projected onto that fitted vocabulary.
X_train_rf, X_test_rf = tfidf.fit_transform(X_train_text), tfidf.transform(X_test_text)

In [31]:
# ----------------------------
# 5b. LSTM preprocessing (token sequences)
# ----------------------------
max_words = 5000  # tokenizer vocabulary cap (num_words)
max_len = 100     # every sequence is padded/truncated to this many tokens

# The word index is built from the training texts only; words the tokenizer
# does not know are mapped to the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=max_words)
tokenizer.fit_on_texts(X_train_text)

# Texts -> lists of integer word ids (train first, then test).
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train_text, X_test_text)
)

# Pad short sequences and cut long ones, both at the end ('post').
X_train_lstm, X_test_lstm = (
    pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')
    for seq in (X_train_seq, X_test_seq)
)

In [32]:
# ----------------------------
# 5c. DistilBERT preprocessing
# ----------------------------
pretrained_model = "distilbert-base-uncased"
tokenizer_bert = DistilBertTokenizerFast.from_pretrained(pretrained_model)

max_len_bert = 128  # max length for BERT


def _encode_for_bert(texts):
    """Tokenize an iterable of strings into fixed-width DistilBERT encodings.

    Every sequence is truncated and padded to exactly `max_len_bert` tokens.
    `padding='max_length'` is used instead of `padding=True` because the
    latter pads only to the longest sequence of the *current* call, which can
    give the train and test arrays different widths.

    Args:
        texts: Iterable of already-cleaned message strings.

    Returns:
        The tokenizer output; its 'input_ids' and 'attention_mask' entries are
        read below.
    """
    return tokenizer_bert(
        list(texts),
        truncation=True,
        padding='max_length',
        max_length=max_len_bert,
    )


train_encodings = _encode_for_bert(X_train_text)
test_encodings = _encode_for_bert(X_test_text)

# Convert to numpy arrays (for TensorFlow/Keras)
X_train_bert = {
    'input_ids': np.array(train_encodings['input_ids']),
    'attention_mask': np.array(train_encodings['attention_mask'])
}

X_test_bert = {
    'input_ids': np.array(test_encodings['input_ids']),
    'attention_mask': np.array(test_encodings['attention_mask'])
}

# Labels for all models: plain float32 numpy arrays.
# NOTE: this rebinds y_train / y_test (pandas Series after the split) to
# numpy arrays; re-running the cell is harmless because the conversion is
# idempotent.
y_train = np.array(y_train, dtype='float32')
y_test = np.array(y_test, dtype='float32')

In [33]:
# ----------------------------
# 6. Summary
# ----------------------------
# One line per model input: caption followed by the training array's shape.
print(
    *(
        f"{caption} {features.shape}"
        for caption, features in (
            ("Random Forest input shape:", X_train_rf),
            ("LSTM input shape:", X_train_lstm),
            ("DistilBERT input_ids shape:", X_train_bert['input_ids']),
            ("DistilBERT attention_mask shape:", X_train_bert['attention_mask']),
        )
    ),
    sep="\n",
)

# Count of training samples per class; labels are cast to int for bincount.
class_counts = np.bincount(y_train.astype(int))
print("Labels distribution:", class_counts)

Random Forest input shape: (4776, 5000)
LSTM input shape: (4776, 100)
DistilBERT input_ids shape: (4776, 128)
DistilBERT attention_mask shape: (4776, 128)
Labels distribution: [3875  901]


In [34]:
# Sanity check on row counts at each stage. The CSV is read again here so the
# "original" figure reflects the file itself rather than the filtered frame.
print(
    *(
        f"{caption} {value}"
        for caption, value in (
            ("Original dataset:", len(pd.read_csv("Dataset_5971.csv"))),
            ("After filtering labels:", len(df)),
            ("Training set:", len(X_train_text)),
            ("Test set:", len(X_test_text)),
            ("Label counts:", df['label_encoded'].value_counts()),
        )
    ),
    sep="\n",
)


Original dataset: 5971
After filtering labels: 5971
Training set: 4776
Test set: 1195
Label counts: label_encoded
0    4844
1    1127
Name: count, dtype: int64
