<a href="https://colab.research.google.com/github/surya211099/DetectingPhishingEmail/blob/main/TF-IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import re

df = pd.read_csv("/content/drive/MyDrive/Dessertation/updatedfinal_dataset.csv")


#Subject column

def clean_subject(text):
    text = str(text).lower()                       # lowercase
    text = re.sub(r"[^a-z0-9 ]", " ", text)       # remove special characters
    text = re.sub(r"\s+", " ", text)              # remove extra spaces
    return text.strip() if text.strip() else "empty"

df['subject'] = df['subject'].apply(clean_subject)


#Body column

def clean_body(text):
    text = str(text).lower()
    text = re.sub(r"http\S+", " url ", text)      # replace URLs with token "url"
    text = re.sub(r"\S+@\S+", " email ", text)   # replace email addresses with token "email"
    text = re.sub(r"[^a-z0-9 ]", " ", text)      # remove other special characters
    text = re.sub(r"\s+", " ", text)             # remove extra spaces
    return text.strip() if text.strip() else "empty"

df['body'] = df['body'].apply(clean_body)


#Sender and Receiver

def extract_domain(email):
    email = str(email)
    match = re.search(r"@(\S+)", email)
    return match.group(1).lower() if match else "unknown"

df['sender'] = df['sender'].apply(extract_domain)
df['receiver'] = df['receiver'].apply(extract_domain)


#Date
df['date'] = pd.to_datetime(df['date'], errors='coerce')  # convert to datetime
df['timestamp'] = df['date'].apply(lambda x: x.timestamp() if pd.notnull(x) else 0)
   # convert to UNIX timestamp
df.drop('date', axis=1, inplace=True)


#Url and Label

df['urls'] = pd.to_numeric(df['urls'], errors='coerce').fillna(0).astype(int)
df['label'] = pd.to_numeric(df['label'], errors='coerce').fillna(0).astype(int)

#cleaned CSV

df.to_csv("phishing_cleaned.csv", index=False)



  df['date'] = pd.to_datetime(df['date'], errors='coerce')  # convert to datetime


In [3]:
df=pd.read_csv("phishing_cleaned.csv")
df

Unnamed: 0,subject,receiver,sender,body,urls,label,timestamp
0,never agree to be a loser,gvc.ceas-challenge.cc,iworld.de>,buck up your troubles caused by small dimensio...,1,1,1.217979e+09
1,befriend jenna jameson,gvc.ceas-challenge.cc,icable.ph>,upgrade your sex and pleasures with these tech...,1,1,1.217979e+09
2,cnn com daily top 10,gvc.ceas-challenge.cc,universalnet.psi.br>,the daily top 10 from cnn com top videos and s...,1,1,1.218011e+09
3,re svn commit r619753 in spamassassin trunk li...,spamassassin.apache.org>,pobox.com>,would anyone object to removing so from this l...,1,0,1.217979e+09
4,specialpricespharmmoreinfo,gvc.ceas-challenge.cc,loanofficertool.com>,welcomefastshippingcustomersupport url,1,1,1.217979e+09
...,...,...,...,...,...,...,...
45816,job offer,yahoo.co.jp,phreego.com>,sunoco sun oil company ltd trusty kojimachi bl...,0,1,1.188223e+09
45817,urgent response needed,aclweb.org,yahoo.fr,from the ministre de la construction mohamed n...,0,1,1.188425e+09
45818,from james bongani,latinmail.com,latinmail.com>,james bongani st edde eglise rue 11 ave 27 vri...,1,1,1.188741e+09
45819,hello,m,yahoo.co.uk>,vincent cheung foreign operations department h...,0,1,1.189108e+09


In [4]:
df=pd.read_csv("phishing_cleaned.csv")
cols=list(df.columns)
col_to_move=cols.pop(5)
new_position=6 if len(cols)>=6 else len(cols)
cols.insert(new_position,col_to_move)
df=df[cols]
df.to_csv("phishing_cleaned",index=False)

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split


df = pd.read_csv("phishing_cleaned.csv")

# Columns
text_columns = ['subject', 'body', 'sender', 'receiver']  # text to vectorize
numeric_columns = ['urls', 'timestamp']                   # numeric features
target_column = 'label'                                  # target

#Convert text columns to TF-IDF

tfidf_vectorizers = {}
tfidf_matrices = []

for col in text_columns:
    vectorizer = TfidfVectorizer(max_features=2000, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df[col])
    tfidf_vectorizers[col] = vectorizer
    tfidf_matrices.append(tfidf_matrix)

#Combine all text features

X_text = hstack(tfidf_matrices)  # sparse matrix for text features

#numeric features

X_numeric = df[numeric_columns].values  # dense numeric features
from scipy.sparse import csr_matrix
X = hstack([X_text, csr_matrix(X_numeric)])  # combine text + numeric

# Target

y = df[target_column].values

#Optional: Split for ML

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Feature matrix shape:", X.shape)
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

# -------------------------------
# 7️⃣ Optional: Save numeric features to CSV
# -------------------------------
# Convert sparse to dense if memory allows
import numpy as np
X_dense = X.toarray()

# Generate column names for text features
feature_names = []
for col, vectorizer in tfidf_vectorizers.items():
    feature_names.extend([f"{col}_{feat}" for feat in vectorizer.get_feature_names_out()])

# Add numeric columns
feature_names.extend(numeric_columns)

# Create DataFrame
df_features = pd.DataFrame(X_dense, columns=feature_names)
df_features[target_column] = y

# Save to CSV
df_features.to_csv("/content/drive/MyDrive/Dessertation/TFIDF_dataset.csv", index=False)
print("Numeric ML-ready dataset saved to TFIDF_dataset.csv")

Feature matrix shape: (45821, 7928)
Training set shape: (36656, 7928)
Test set shape: (9165, 7928)
Numeric ML-ready dataset saved to TFIDF_dataset.csv


In [6]:
df=pd.read_csv("/content/drive/MyDrive/Dessertation/TFIDF_dataset.csv")
df

Unnamed: 0,subject_00,subject_000,subject_01,subject_02,subject_03,subject_04,subject_05,subject_06,subject_07,subject_08,...,receiver_zoo,receiver_zooko,receiver_zpr,receiver_ztreet,receiver_zurich,receiver_zzn,receiver_zzzzason,urls,timestamp,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.217979e+09,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.217979e+09,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.218011e+09,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.217979e+09,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.217979e+09,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.188223e+09,1
45817,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.188425e+09,1
45818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.188741e+09,1
45819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.189108e+09,1
