In [None]:
import pandas as pd
# Read the three raw CSV exports (URL, e-mail and web-page datasets) in one pass;
# the paths are unpacked in the same order as the target variable names.
url_df, email_df, html_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"C:\Users\Sushmaja\new_data_urls.csv",
        r"C:\Users\Sushmaja\Phishing_Email.csv",
        r"C:\Users\Sushmaja\dataset_phishing.csv",
    )
)


In [None]:
# Align the three datasets on the shared "content" / "label" column names.
# DataFrame.rename skips mapping keys that are not present in the frame, so an
# entry naming a column the CSV does not have is a silent no-op.
url_column_names = {"url": "content", "Class": "label"}            # URL dataset
email_column_names = {"Email Text": "content", "Label": "label"}   # Email dataset
html_column_names = {"html": "content", "class": "label"}          # HTML/Webpage dataset

url_df = url_df.rename(columns=url_column_names)
email_df = email_df.rename(columns=email_column_names)
html_df = html_df.rename(columns=html_column_names)


In [None]:
# Add a constant "type" column to each frame recording which dataset it is.
for source_frame, source_name in (
    (url_df, "url"),
    (email_df, "email"),
    (html_df, "html"),
):
    source_frame["type"] = source_name


In [None]:
# Print the column index of each dataset, in url / email / html order.
for frame in (url_df, email_df, html_df):
    print(frame.columns)


In [None]:
# Rename each dataset's target column to the shared name "label".

# URL dataset: "status" -> "label"
url_df = url_df.rename({"status": "label"}, axis="columns")

# Email dataset: "Email Type" -> "label"
email_df = email_df.rename({"Email Type": "label"}, axis="columns")

# HTML/Web dataset: "status" -> "label"
html_df = html_df.rename({"status": "label"}, axis="columns")


In [None]:
# Remove the "Unnamed: 0" column from the email dataset.
# errors="ignore" keeps this cell re-runnable: without it a second execution
# raises KeyError because the column has already been dropped.
email_df = email_df.drop(columns=["Unnamed: 0"], errors="ignore")


In [None]:
# Expose the HTML dataset's "url" column under the shared name "content".
html_df = html_df.rename({"url": "content"}, axis="columns")


In [None]:
# Print the three column indexes, one per line, in url / email / html order.
print(url_df.columns, email_df.columns, html_df.columns, sep="\n")


In [None]:
# Canonical label encoding: 1 = phishing, 0 = legitimate.
# String keys are matched after strip() + lower(); see _encode_labels below.
label_map = {
    "phishing": 1,
    "phishing email": 1,   # presumably the email dataset's wording -- TODO confirm
    "legitimate": 0,
    "benign": 0,
    "safe email": 0,       # presumably the email dataset's wording -- TODO confirm
    1: 1,
    0: 0
}


def _encode_labels(raw_labels, dataset_name):
    """Encode a label Series to 0/1 using ``label_map``.

    String values are stripped and lower-cased before the lookup so that
    differences in case or surrounding whitespace do not produce NaN.
    Non-string values (already-numeric labels, missing values) are looked up
    unchanged.

    Parameters
    ----------
    raw_labels : pandas.Series
        The label column as loaded from the dataset.
    dataset_name : str
        Name used in the warning message to identify the dataset.

    Returns
    -------
    pandas.Series
        Encoded labels; values without an entry in ``label_map`` are NaN.
    """
    normalised = raw_labels.map(
        lambda value: value.strip().lower() if isinstance(value, str) else value
    )
    encoded = normalised.map(label_map)

    # Report values that were present but had no mapping, so they are not
    # silently carried forward as NaN.
    unmapped = normalised[encoded.isna() & raw_labels.notna()].unique()
    if len(unmapped) > 0:
        print(
            f"WARNING: {dataset_name}: label values {list(unmapped)!r} "
            "are not in label_map and were encoded as NaN"
        )
    return encoded


url_df["label"] = _encode_labels(url_df["label"], "url_df")
email_df["label"] = _encode_labels(email_df["label"], "email_df")
html_df["label"] = _encode_labels(html_df["label"], "html_df")


In [None]:
import pandas as pd

# Stack the three datasets into one frame with a fresh 0..n-1 index.
combined_df = pd.concat(
    [url_df, email_df, html_df],
    ignore_index=True
)

# Shuffle the rows. A fixed seed makes the row order (and the CSV written from
# this frame) identical on every run instead of changing per execution.
RANDOM_SEED = 42
combined_df = (
    combined_df
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)


In [None]:
# Write the combined frame to a CSV in the working directory, without the index.
combined_df.to_csv(path_or_buf="combined_phishing_dataset.csv", index=False)


In [None]:
import re

def clean_email(text):
    """Normalise an e-mail body to lower-case, letters-only, single-spaced text.

    Steps, in order: lower-case the text, delete every run starting with
    ``http`` up to the next whitespace, replace each character that is not
    ``a``-``z`` or a space with a space, then collapse whitespace runs and trim.

    Parameters
    ----------
    text : object
        The raw e-mail body. Non-string values are converted with ``str()``.
        ``None`` and float NaN are treated as an empty body.

    Returns
    -------
    str
        The cleaned text; ``""`` for missing input.
    """
    # Missing values would otherwise be stringified into the tokens "none"/"nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)        # strip URLs
    text = re.sub(r"[^a-z ]", " ", text)       # keep only letters and spaces
    text = re.sub(r"\s+", " ", text).strip()   # collapse whitespace
    return text

# Clean the "content" of e-mail rows only and store it in "clean_content";
# the boolean mask is built once and reused for both the read and the write.
is_email = combined_df["type"] == "email"
combined_df.loc[is_email, "clean_content"] = (
    combined_df.loc[is_email, "content"].apply(clean_email)
)


In [None]:
# Replace missing labels with 0.
# NOTE(review): 0 is the "legitimate" code in label_map, so any row whose label
# failed to map is counted as legitimate here -- confirm this is intended.
combined_df["label"] = combined_df["label"].fillna(0)

# Every int64/float64 column gets its missing values replaced with 0.
numeric_dtypes = ["int64", "float64"]
numeric_cols = combined_df.select_dtypes(include=numeric_dtypes).columns
combined_df[numeric_cols] = combined_df[numeric_cols].fillna(0)


In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Work on an explicit copy so the scaled values never write through to
# combined_df.
url_html_df = combined_df[
    combined_df["type"].isin(["url", "html"])
].copy()

# numeric_cols was selected from every int64/float64 column and therefore can
# include "label"; the target must stay 0/1, so it is excluded from scaling.
feature_cols = numeric_cols.drop("label", errors="ignore")

if len(feature_cols) > 0:
    # Replace the columns wholesale (rather than writing through .loc) so the
    # float result does not have to be stored into integer-typed columns.
    url_html_df[feature_cols] = scaler.fit_transform(
        url_html_df[feature_cols]
    )


In [None]:
import urllib.parse

def extract_url_features(url):
    """Compute simple lexical features for one URL.

    Parameters
    ----------
    url : object
        The URL. Non-string values are converted with ``str()``; ``None`` and
        float NaN are treated as an empty URL.

    Returns
    -------
    dict
        ``url_length``  - number of characters in the URL
        ``dot_count``   - number of ``.`` characters
        ``digit_count`` - number of digit characters
        ``has_https``   - 1 if the URL scheme is ``https``, else 0
        ``has_ip``      - 1 if four dot-separated digit groups occur anywhere
                          in the URL, else 0
    """
    # Normalise once so every feature is computed on the same string; the raw
    # value may be a non-string, on which len()/count()/re.search would fail.
    if url is None or (isinstance(url, float) and url != url):
        url = ""
    url = str(url)

    try:
        scheme = urllib.parse.urlparse(url).scheme
    except ValueError:
        # urlparse rejects some malformed hosts (e.g. an unbalanced "[");
        # fall back to the text before "://" so one bad row does not abort
        # the whole feature extraction.
        scheme = url.partition("://")[0].lower()

    return {
        "url_length": len(url),
        "dot_count": url.count("."),
        "digit_count": sum(c.isdigit() for c in url),
        "has_https": 1 if scheme == "https" else 0,
        "has_ip": 1 if re.search(r"\d+\.\d+\.\d+\.\d+", url) else 0
    }

# Build one feature dict per URL row, then expand the dicts into a DataFrame
# with one column per feature. The resulting frame has a fresh 0..n-1 index.
url_contents = combined_df.loc[combined_df["type"] == "url", "content"]
url_features = pd.DataFrame(list(url_contents.apply(extract_url_features)))


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Select the e-mail rows and vectorise their cleaned text with TF-IDF over
# unigrams and bigrams, capped at 5000 features.
email_rows = combined_df["type"] == "email"
text_df = combined_df[email_rows]

tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_text = tfidf.fit_transform(text_df["clean_content"])


In [None]:
# Rebuild the TF-IDF vectoriser, this time with English stop words removed,
# and refit it on the cleaned e-mail text. y_text holds the matching labels.
tfidf_settings = {
    "max_features": 5000,
    "ngram_range": (1, 2),
    "stop_words": "english",
}
tfidf = TfidfVectorizer(**tfidf_settings)

y_text = text_df["label"]
X_text = tfidf.fit_transform(text_df["clean_content"])


In [None]:
# Display the column index of the URL-type rows of the combined frame.
combined_df.loc[combined_df["type"] == "url"].columns


In [None]:
# Build the numeric feature matrix and label vector for the url/html rows.
# url_html_df is re-derived from combined_df here, so it holds the values
# currently stored in combined_df rather than any earlier url_html_df contents.
is_url_or_html = combined_df["type"].isin(["url", "html"])
url_html_df = combined_df[is_url_or_html]

# All int64/float64 columns except the target.
numeric_cols = (
    url_html_df
    .select_dtypes(include=["int64", "float64"])
    .columns
    .drop("label", errors="ignore")
)

y_numeric = url_html_df["label"]
X_numeric = url_html_df[numeric_cols].values


In [None]:
# Show the class counts and the distinct values of the e-mail labels.
print("Text labels value counts:", y_text.value_counts(), sep="\n")
print("\nUnique values:", y_text.unique(), sep="\n")


In [None]:
# Class counts of the "label" column across the whole combined frame.
combined_df.loc[:, "label"].value_counts()


In [None]:
# Class counts of the "label" column restricted to the e-mail rows.
combined_df.loc[combined_df["type"] == "email", "label"].value_counts()


In [None]:
# Bare list literal: running this cell only displays the list below; it does
# not read from or modify any DataFrame.
# NOTE(review): looks like a pasted record of column names from an earlier
# inspection -- confirm whether this cell is still needed.
['Unnamed: 0', 'content', 'Email Type', 'type']


In [None]:
# Overwrites the "label" column of combined_df with None for every row.
# NOTE(review): this discards all labels currently stored in combined_df, so a
# top-to-bottom run leaves the frame without labels from this point on.
# Presumably a reset before re-deriving the labels -- confirm the intent, or
# remove the cell.
combined_df["label"] = None


In [None]:
# Print the column index of the combined frame.
all_columns = combined_df.columns
print(all_columns)
