# Working with fraud datasets for EDA

In [None]:
import glob
import numpy as np
import pandas as pd
import pyspark
import urllib
import pylab as pl
import seaborn as sns

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.ml.feature import *

In [None]:
# Create (or reuse) the Spark session that the later cells read data through.
spark = (
    SparkSession.builder
    .appName('eda')
    .getOrCreate()
)

In [None]:
# List everything under dataset/ (files downloaded beforehand from Kaggle and
# similar sources).
glob.glob(pathname="dataset/*")

In [None]:
# Load the 2023 credit-card CSV, taking column names from the header row.
# inferSchema makes Spark read the file a second time, but in exchange the
# numerical columns are detected as such.
data = spark.read.csv(
    path='dataset/creditcard_2023.csv',
    inferSchema=True,
    header=True,
)
data.printSchema()

## Clustering

In [None]:
# Bare expression: as the last statement of the cell, the notebook displays
# the DataFrame's repr for a quick look.
data

In [None]:
# Build the feature vector column.
# The slice skips the first column and the last two columns.
# NOTE(review): the second dataset further down slices [1:-1] instead;
# confirm that leaving out the second-to-last column here is intentional.
feature_cols = data.columns[1:-2]
assembler = VectorAssembler(outputCol='features', inputCols=feature_cols)
data = assembler.transform(data)
data.show(n=10)

In [None]:
# Narrow the frame to the assembled vector column plus the Class column.
data = data.select('features', 'Class')
data.show(n=10)

In [None]:
# Cluster the feature vectors into three groups with k-means.
# The seed pins the random centroid initialisation so that re-running the
# notebook starts from the same initial state instead of a fresh random one.
km = pyspark.ml.clustering.KMeans(k=3, seed=42)
xs = data.select(['features'])
clustering = km.fit(xs)
# transform() adds the 'prediction' column that the following cells group
# and summarise.
labels = clustering.transform(xs)

In [None]:
# Count how many rows fall under each 'prediction' value.
(
    labels
    .groupBy('prediction')
    .count()
    .show()
)

In [None]:
# Summary statistics of the 'prediction' column on its own.
labels.select('prediction').summary().show(n=10)

In [None]:
# 80/20 random split of the data.
# Without an explicit seed, randomSplit picks a new random one on every call,
# so the split (and the summaries printed from it) would differ between
# notebook runs. Pinning the seed keeps the split repeatable.
train, test = data.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Print summary statistics for the training split.
train.summary().show()

In [None]:
# Print summary statistics for the test split.
test.summary().show()

## PCA

In [None]:
# Standardise the feature vectors ahead of PCA: the scaler is fitted with
# both centring (withMean) and scaling (withStd) switched on.
xs = data.select('features')

scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol='features',
    outputCol='scaledFeatures',
).fit(xs)

# Apply the fitted scaler and peek at the first few rows.
xs_scaled = scaler.transform(xs)
xs_scaled.show(n=6)

In [None]:
# Fit PCA on the scaled features, keeping n_components components, then
# project the data and report the explained variance of each component.
n_components = 2
pca = PCA(
    inputCol='scaledFeatures',
    outputCol='pcaFeatures',
    k=n_components,
).fit(xs_scaled)

xs_pca = pca.transform(xs_scaled)
print('Explained Variance Ratio', pca.explainedVariance.toArray())
xs_pca.show(n=6)

In [None]:
# Pull the projected points back to the local process and pack them into a
# single NumPy array.
local_xs = np.array(
    xs_pca.rdd.map(lambda row: row['pcaFeatures']).collect()
)

In [None]:
# Collect the Class column locally and flatten it to a 1-D array.
# NOTE(review): this is a separate collect from the one that produced
# local_xs; the scatter plot pairs the two arrays by position, which assumes
# both collects return rows in the same order — confirm.
local_y = np.array(data.select('Class').collect()).ravel()

In [None]:
# Scatter the two PCA components against each other, coloured by Class.
pl.scatter(
    x=local_xs[:, 0],
    y=local_xs[:, 1],
    c=local_y,
    alpha=0.3,
    s=8,
)
pl.tight_layout()

## Second dataset

In [None]:
# Load the second dataset; column names come from the header row and column
# types are inferred from the contents.
data = spark.read.csv(
    path='dataset/detect/creditcard_train.csv',
    inferSchema=True,
    header=True,
)
data.printSchema()

In [None]:
# Assemble every column except the first and the last into a single
# 'features' vector column.
feature_cols = data.columns[1:-1]
assembler = VectorAssembler(outputCol='features', inputCols=feature_cols)
data = assembler.transform(data)
data.show(n=10)

In [None]:
# Scale -> PCA -> collect for the second dataset.
#
# The Class column is carried alongside the features through every step so
# that the projected points and their labels can be pulled back in ONE
# collect. Collecting them separately pairs the two arrays only by position
# (relying on both jobs returning rows in the same order) and runs the
# pipeline twice. As a result, the show() previews below also list Class.
xs = data.select(['features', 'Class'])

# Standardise the feature vectors; the scaler only reads its inputCol.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
    withMean=True,
    withStd=True,
).fit(xs)

xs_scaled = scaler.transform(xs)
xs_scaled.show(6)

# Project the scaled features onto n_components principal components.
n_components = 2
pca = PCA(
    k=n_components,
    inputCol='scaledFeatures',
    outputCol='pcaFeatures',
).fit(xs_scaled)

xs_pca = pca.transform(xs_scaled)
print('Explained Variance Ratio', pca.explainedVariance.toArray())
xs_pca.show(6)

# pull back to "local machine": each collected row holds a point together
# with its own label, so local_xs[i] and local_y[i] always belong together.
rows = xs_pca.select('pcaFeatures', 'Class').collect()
local_xs = np.array([row['pcaFeatures'].toArray() for row in rows])
local_y = np.array([row['Class'] for row in rows])

In [None]:
# Scatter the two PCA components of the second dataset, coloured by Class.
pl.scatter(
    x=local_xs[:, 0],
    y=local_xs[:, 1],
    c=local_y,
    alpha=0.3,
    s=8,
)
pl.tight_layout()

## Third dataset
This dataset is more complex: it includes text features.

In [None]:
# Load the insurance training set; column names come from the header row and
# column types are inferred from the contents.
data = spark.read.csv(
    path='dataset/insurance/train.csv',
    inferSchema=True,
    header=True,
)
data.printSchema()