In [1]:
# Standard imports
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import DBSCAN, KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import MDS, TSNE
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

In [2]:
# Load the phishing e-mail dataset; `raw_data` keeps every column, including 'label'.
raw_data = pd.read_csv("data/email_phishing_data.csv")

# Feature-only view: the 'label' column is removed so it is not scaled or embedded.
no_label_data = raw_data.drop(columns='label')

# Fit a MinMaxScaler on the features and wrap the scaled values back into a
# DataFrame that carries the original feature column names.
scaler = MinMaxScaler()
data = pd.DataFrame(
    data=scaler.fit_transform(no_label_data),
    columns=no_label_data.columns,
)

# t-SNE Tests Below

In [18]:
def _perplexity_tag(perplexity):
    """Return the text used for `perplexity` in the plot title and output file names."""
    value = float(perplexity)
    # Integer-valued perplexities (the expected case: 10.0, 50.0, 725.0, ...) keep the
    # short '10' / '725' form. Fractional values keep their decimals so that, e.g.,
    # 10.4 and 10.0 do not end up sharing (and overwriting) the same output files.
    return f'{value:.0f}' if value.is_integer() else repr(value)


def make_tsne_plot(raw_data, data, perplexity=30.0):  # perplexity default in scikit-learn
    """Run t-SNE on `data`, then save the embedding and a label-coloured scatter plot.

    Parameters
    ----------
    raw_data : DataFrame-like
        Must provide a 'label' column with one entry per row of `data`; the labels
        are stored next to the embedding and used to colour the scatter plot.
    data : array-like
        Feature matrix handed to ``TSNE.fit_transform``.
    perplexity : float, default 30.0
        t-SNE perplexity. Normally an integer-valued float.

    Returns
    -------
    None
        Results are written to the current working directory:
        ``t-SNE_data_p-<perplexity>.pkl`` (DataFrame with columns 'x', 'y', 'label')
        and ``t-SNE_clustering_minmax_p-<perplexity>.png``.

    Raises
    ------
    KeyError
        If `raw_data` has no 'label' column.
    ValueError
        If `raw_data` and `data` have a different number of rows.
    """
    # Validate the inputs up front: the fit below can run for hours, so a missing
    # column or a row-count mismatch should surface before that time is spent.
    labels = raw_data['label']
    n_rows = data.shape[0] if hasattr(data, 'shape') else len(data)
    if len(labels) != n_rows:
        raise ValueError(
            f"raw_data has {len(labels)} labels but data has {n_rows} rows"
        )

    tag = _perplexity_tag(perplexity)

    tsne = TSNE(random_state=0, perplexity=perplexity)
    tsne_vals = tsne.fit_transform(data)
    data_tsne = pd.DataFrame(data={'x': tsne_vals[:, 0], 'y': tsne_vals[:, 1], 'label': labels})

    # Persist the embedding before plotting, so that a failure while drawing or
    # saving the figure does not discard the expensive fit.
    data_tsne.to_pickle(f't-SNE_data_p-{tag}.pkl')

    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        ax.scatter(data_tsne['x'], data_tsne['y'], c=data_tsne['label'])
        ax.set_title(f't-SNE Reduction on MinMaxScaler (perplexity={tag})')
        ax.set_xlabel('x')
        ax.set_ylabel('y')
        ax.grid(True)
        fig.savefig(f't-SNE_clustering_minmax_p-{tag}.png')
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close(fig)

Note that we have already tried the default perplexity, and the result is very poor: the embedding is just a single lump. The tests below check whether other perplexity values give us more than just a lump.

The value of 725 (the last one below) was obtained from [this Cross Validated (Stack Exchange) answer](https://stats.stackexchange.com/a/564304), which said that a good perplexity for t-SNE is $N^{1/2}$. Our dataset has $N = 524846$ rows, so $N^{1/2} \approx 724.5$, which we round up to 725.

In [None]:
# Perplexity sweep: one t-SNE run per value, in this order. The trailing notes are
# the wall-clock times observed for each run.
perplexities = (
    10.0,   # took 10m
    50.0,   # took 15m
    725.0,  # took 2h15m
)
for perplexity in perplexities:
    make_tsne_plot(raw_data, data, perplexity)
# Nothing is displayed inline; every run saves its output to the appropriate PNG.