In [1]:
# Import
import pandas as pd
from datasets import DatasetDict, Dataset
import kagglehub

In [2]:
# Data preparation
# Source dataset: https://www.kaggle.com/datasets/taruntiwarihp/phishing-site-urls/data
# Fetch the latest version through kagglehub and report where the files were placed.
dataset_handle = "taruntiwarihp/phishing-site-urls"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/phishing-site-urls


In [3]:
import os

# List the files in the downloaded dataset directory so the CSV file name can
# be confirmed before loading it.
# Use the directory returned by kagglehub (`path`) instead of a hardcoded cache
# location: the download directory differs between environments, so a fixed
# "/root/.cache/..." path only works on the machine it was written for.
dataset_path = path
files = os.listdir(dataset_path)
print("Files in dataset directory:")
for file in files:
    print(f"- {file}")

Files in dataset directory:
- phishing_site_urls.csv


In [4]:
# Load the raw URL/label table from the directory kagglehub downloaded to.
# Building the location from `path` (rather than a hardcoded absolute cache
# path) keeps this cell working wherever the dataset was stored.
df = pd.read_csv(os.path.join(path, "phishing_site_urls.csv"))

In [5]:
# Preview the first five rows to check the column names and label values
df.head(n=5)

Unnamed: 0,URL,Label
0,nobell.it/70ffb52d079109dca5664cce6f317373782/...,bad
1,www.dghjdgf.com/paypal.co.uk/cycgi-bin/webscrc...,bad
2,serviciosbys.com/paypal.cgi.bin.get-into.herf....,bad
3,mail.printakid.com/www.online.americanexpress....,bad
4,thewhiskeydregs.com/wp-content/themes/widescre...,bad


In [6]:
# Build a small, class-balanced phishing-URL dataset and split it into
# train / validation / test sets wrapped in a Hugging Face DatasetDict.

# Drop rows with missing values. URLs may repeat in the raw file, so duplicate
# URLs are dropped as well; otherwise the same URL could be sampled twice and
# end up in more than one split (leaking train data into validation/test).
df = df.dropna().drop_duplicates(subset='URL')

# Create one dataframe per class
df_safe = df[df['Label'] == 'good']
df_not_safe = df[df['Label'] == 'bad']

# Number of samples to keep from each class
num_samples = 1500

# Sample the same number of rows from each class to ensure a 50-50 split
df_safe_sample = df_safe.sample(num_samples, random_state=42)
df_not_safe_sample = df_not_safe.sample(num_samples, random_state=42)

# Replace the string "Label" column with a Boolean flag "isPhishing"
df_safe_sample = df_safe_sample.assign(isPhishing=False)
df_safe_sample = df_safe_sample.drop('Label', axis=1)
df_not_safe_sample = df_not_safe_sample.assign(isPhishing=True)
df_not_safe_sample = df_not_safe_sample.drop('Label', axis=1)

# Concatenate the samples to create a new balanced dataset.
# Columns are selected and renamed by name (not by position) so an extra or
# reordered column in the CSV cannot silently end up labelled 'text'/'labels'.
# The Boolean flag is converted to int (False -> 0, True -> 1).
balanced_df = (
    pd.concat([df_safe_sample, df_not_safe_sample])[['URL', 'isPhishing']]
    .rename(columns={'URL': 'text', 'isPhishing': 'labels'})
    .astype({'labels': int})
)

# Shuffle the balanced dataset
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into train, validation, and test sets (70% train, 15% validation, 15% test)
train_frac = 0.7
valid_frac = 0.15
test_frac = 0.15
assert abs(train_frac + valid_frac + test_frac - 1.0) < 1e-9, "split fractions must sum to 1"

# Define train and validation size; the test set takes whatever rows remain
train_size = int(train_frac * len(balanced_df))
valid_size = int(valid_frac * len(balanced_df))

# Create train, validation, and test dataframes
train_df = balanced_df.iloc[:train_size]
valid_df = balanced_df.iloc[train_size:train_size + valid_size]
test_df = balanced_df.iloc[train_size + valid_size:]

# Every row must land in exactly one split
assert len(train_df) + len(valid_df) + len(test_df) == len(balanced_df)

# Convert the pandas DataFrames to Hugging Face Datasets.
# preserve_index=False keeps the pandas index out of the dataset features.
train_ds = Dataset.from_pandas(train_df, preserve_index=False)
valid_ds = Dataset.from_pandas(valid_df, preserve_index=False)
test_ds = Dataset.from_pandas(test_df, preserve_index=False)

# Combine into a DatasetDict
dataset_dict = DatasetDict({
    'train': train_ds,
    'validation': valid_ds,
    'test': test_ds
})

In [7]:
# Display the DatasetDict to check the split names, features, and row counts
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 2100
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 450
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 450
    })
})

In [8]:
# Log in to the Hugging Face Hub before uploading the dataset.
# In a notebook, login() renders an interactive widget for entering credentials.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:
# Upload all three splits to the Hugging Face Hub under the target repository
hub_repo_id = "Vyshnev/phishing-data-classification"
dataset_dict.push_to_hub(hub_repo_id)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]