In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from cuml import DBSCAN
from sklearn.decomposition import PCA
from sklearn.manifold import MDS, TSNE
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
import os

In [2]:
# Load the phishing e-mail dataset, then build a feature-only table
# (the 'label' column removed) rescaled column-wise with MinMaxScaler.
raw_data = pd.read_csv("data/email_phishing_data.csv")
no_label_data = raw_data.drop(columns='label')

# Fit the scaler first, then transform the very same frame; the scaled array
# is wrapped back into a DataFrame so the original column names are kept.
scaler = MinMaxScaler()
scaler.fit(no_label_data)
data = pd.DataFrame(
    scaler.transform(no_label_data),
    columns=no_label_data.columns,
)

We will then use DBSCAN to see what clusters it makes. Using DBSCAN is expensive, so we reduce the number of columns using PCA. Values for DBSCAN params `min_samples` and `eps` were chosen by just messing around with DBSCAN on 30% of the data and general advice:
- An `eps` value higher than 0.14 causes all of the data to be grouped into a single cluster, so 0.14 was used. Halving it in an attempt to get more clusters did not change the result drastically.
- `min_samples` was chosen from general advice found online: start with a `min_samples` of $2 \cdot d$, where $d$ is the number of features. For a 4D PCA decomposition, `min_samples` therefore started at 8. There wasn't enough time to really test different values.

The `cuml` library's implementation of DBSCAN is used because the alternative is too slow for our large dataset. `cuml` is an NVIDIA library that uses CUDA for GPU acceleration.

In [3]:
%%capture
# above used to suppress annoying CUDA warnings about integer sizing

store_at = 'data_with_dbscan_cluster_df.pkl'
if os.path.exists(store_at):
    data_after_dbscan = pd.read_pickle(store_at)
else:
    pca = PCA(n_components=4, random_state=0)
    # Commented out is for sklearn DBSCAN which was too slow, we use cuml library for GPU acceleration.
    # dbscan = DBSCAN(n_jobs=4) # limit for available RAM when using sklearn DBSCAN
    dbscan = DBSCAN(min_samples=8, eps=0.14, random_state=0)
    data_after_dbscan = data.copy()
    data_after_dbscan['cluster'] = dbscan.fit_predict(pca.fit_transform(data_after_dbscan))
    data_after_dbscan['label'] = raw_data['label']
    # storing data (it is expensive to run dbscan)
    data_after_dbscan.to_pickle(store_at);

# Investigating Phishing Email / Total Ratio per Cluster

In [9]:
# Baseline: share of rows labelled 1 (phishing) in the whole dataset.
dataset_ratio = int(raw_data['label'].eq(1).sum()) / len(raw_data)
print(f'Original percentage of phishing emails is {dataset_ratio:.2%}')

# The same share inside every DBSCAN cluster, visited in the order in which
# the cluster ids first appear in the 'cluster' column.
cluster_labels = data_after_dbscan['cluster']
for c in cluster_labels.unique():
    if c != -1:  # -1 marks DBSCAN noise points, which are not a real cluster
        cluster_data = data_after_dbscan.loc[cluster_labels == c]
        # Counting True values equals the length of the filtered frame.
        num_phishing = int(cluster_data['label'].eq(1).sum())
        cluster_ratio = num_phishing / len(cluster_data)
        print(f'For cluster {c}, percentage is {cluster_ratio:.2%}: {num_phishing} out of {len(cluster_data)} points.')

Original percentage of phishing emails is 1.32%
For cluster 0, percentage is 1.25%: 5276 out of 421818 points.
For cluster 1, percentage is 1.41%: 1173 out of 83228 points.
For cluster 2, percentage is 2.52%: 386 out of 15342 points.
For cluster 3, percentage is 2.57%: 82 out of 3194 points.
For cluster 4, percentage is 2.84%: 29 out of 1022 points.
For cluster 5, percentage is 1.82%: 3 out of 165 points.
For cluster 6, percentage is 0.00%: 0 out of 48 points.
For cluster 7, percentage is 0.00%: 0 out of 11 points.
