In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
from sklearn.neighbors import kneighbors_graph

### Please install Plotly on your machine to see the plots in this notebook.

In [2]:
# Load the dataset from a CSV file resolved relative to the working directory,
# then preview its first five rows as the cell output.
data = pd.read_csv(filepath_or_buffer="fraud_cluster.csv")
data.head(n=5)

Unnamed: 0,P1,P2,P3,P4,P5,P6,P7,P8,P9
0,0.595378,-0.531958,0.679654,-0.126799,0.432046,0.988092,-0.029813,0.768742,-0.054167
1,0.982237,0.991481,0.337646,0.228144,0.920032,0.999985,-0.032259,2.161651,-0.05435
2,0.996162,0.893987,0.767413,0.60684,0.970808,0.882602,-0.032267,0.369607,-0.054157
3,0.999928,0.922748,-0.444438,-0.371287,0.528038,-0.221645,-0.032692,-1.065439,0.381914
4,0.985838,0.937512,0.699592,0.585263,0.838804,0.999602,-0.033713,2.08972,-0.054379


In [3]:
# Display the (rows, columns) dimensions of the loaded frame.
data.shape

(1163, 9)

In [4]:
# Feature matrix as a NumPy array; every column of `data` is included.
X = data.to_numpy()

## Exploratory Data Analysis

In [5]:
# Build a profiling report for `data`, write it to an HTML file, and leave the
# report object as the cell's final expression so the notebook displays it.
from pandas_profiling import ProfileReport

profile = ProfileReport(
    data,
    title="Pandas Profiling Report",
    progress_bar=False,
)
profile.to_file("data_analysis.html")
profile



## Visualize entire data with t-SNE

In [6]:
# Embed the feature matrix into three dimensions with t-SNE so it can be drawn
# as a 3-D scatter plot. The random seed is fixed so reruns use the same seed.
# NOTE(review): newer scikit-learn releases appear to rename `n_iter` to
# `max_iter` — confirm against the installed version before upgrading.
tsne_model = TSNE(
    n_components=3,
    perplexity=40,
    init='pca',
    n_iter=500,
    random_state=23,
)
new_values = tsne_model.fit_transform(X)

In [7]:
def plot_data(new_values, labels):
    """Show an interactive 3-D scatter plot of points coloured by label.

    Parameters
    ----------
    new_values : array-like
        Point coordinates; must be convertible to a DataFrame with exactly
        three columns, which become the ``x``, ``y`` and ``z`` axes.
    labels : array-like
        One value per point, stored in a ``class`` column and used as the
        colour dimension.
        NOTE(review): numeric labels appear to be drawn with a continuous
        colour bar by plotly express; pass strings if discrete cluster
        colours are wanted — confirm.

    Returns
    -------
    None
        The figure is displayed via ``show()`` and not returned.
    """
    frame = pd.DataFrame(new_values, columns=['x', 'y', 'z'])
    frame['class'] = labels

    figure = px.scatter_3d(frame, x='x', y='y', z='z', opacity=1.0, color='class')

    # Small markers keep dense regions of the point cloud readable.
    marker_style = dict(size=2)
    figure.update_traces(marker=marker_style, selector=dict(mode='markers'))

    # Fixed canvas size with a bottom margin only.
    margins = {'l': 0, 'r': 0, 'b': 100, 't': 0}
    figure.update_layout(margin=margins, width=800, height=600)

    figure.show()

In [8]:
# Plot the whole embedding as one group: every point gets the same label (1.0).
plot_data(new_values, np.ones(new_values.shape[0]))

## K-Means Clustering

In [9]:
# Partition the original feature matrix into 3 clusters and colour the t-SNE
# embedding by cluster assignment.
# n_init is pinned to 10 (the long-standing default) because scikit-learn 1.4
# changed the default to 'auto'; leaving it implicit lets the same seed yield
# different clusters depending on the installed library version.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42).fit(X)
plot_data(new_values, kmeans.labels_)

In [10]:
# Partition the original feature matrix into 4 clusters and colour the t-SNE
# embedding by cluster assignment.
# n_init is pinned to 10 (the long-standing default) because scikit-learn 1.4
# changed the default to 'auto'; leaving it implicit lets the same seed yield
# different clusters depending on the installed library version.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42).fit(X)
plot_data(new_values, kmeans.labels_)

In [11]:
# Partition the original feature matrix into 5 clusters and colour the t-SNE
# embedding by cluster assignment.
# n_init is pinned to 10 (the long-standing default) because scikit-learn 1.4
# changed the default to 'auto'; leaving it implicit lets the same seed yield
# different clusters depending on the installed library version.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42).fit(X)
plot_data(new_values, kmeans.labels_)