In [1]:
%load_ext nb_black
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

# Reuse the active SparkSession if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

<IPython.core.display.Javascript object>

In [2]:
# Bare expression: show the session object as this cell's output.
spark

<IPython.core.display.Javascript object>

In [3]:
import os
from os.path import isfile, join

# Absolute path of the directory the notebook is executed from.
loc = os.path.abspath("")
# Data directory. The trailing slash is intentional: file names are appended
# directly to this string when building paths.
data_loc = loc + "/data/"

<IPython.core.display.Javascript object>

In [4]:
# Load the transactions CSV: the first row supplies the column names and the
# column types are inferred from the data.
df = spark.read.options(header=True, inferSchema=True).csv(
    f"{data_loc}Synthetic Financial Data.csv"
)

<IPython.core.display.Javascript object>

In [5]:
# Print the column names and inferred types of the loaded frame.
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- step: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- newbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)



<IPython.core.display.Javascript object>

In [6]:
# Preview the first two rows of the raw frame.
df.show(n=2)

+---+----+-------+--------+----------+-------------+--------------+-----------+--------------+--------------+-------+
|_c0|step|   type|  amount|  nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|
+---+----+-------+--------+----------+-------------+--------------+-----------+--------------+--------------+-------+
|  0| 688|CASH_IN|23557.12|C867750533|       8059.0|      31616.12|C1026934669|     169508.66|     145951.53|      0|
|  1| 274|PAYMENT| 6236.13|C601099070|          0.0|           0.0| M701283411|           0.0|           0.0|      0|
+---+----+-------+--------+----------+-------------+--------------+-----------+--------------+--------------+-------+
only showing top 2 rows



<IPython.core.display.Javascript object>

In [7]:
# Keep only the transaction type, the amount, the originator balances and the
# fraud label; every other column is dropped from here on.
df = df.select("type", "amount", "oldbalanceOrg", "newbalanceOrig", "isFraud")

<IPython.core.display.Javascript object>

In [8]:
# Preview the first two rows after the column projection.
df.show(n=2)

+-------+--------+-------------+--------------+-------+
|   type|  amount|oldbalanceOrg|newbalanceOrig|isFraud|
+-------+--------+-------------+--------------+-------+
|CASH_IN|23557.12|       8059.0|      31616.12|      0|
|PAYMENT| 6236.13|          0.0|           0.0|      0|
+-------+--------+-------------+--------------+-------+
only showing top 2 rows



<IPython.core.display.Javascript object>

### Train/test split

In [9]:
# 70/30 random split; the fixed seed keeps the split repeatable across runs.
train, test = df.randomSplit(weights=[0.7, 0.3], seed=7)

<IPython.core.display.Javascript object>

In [10]:
# Report the size of each split; every count() call is its own Spark action.
print(f"Train set length: {train.count()} records")
print(f"Test set length: {test.count()} records")

Train set length: 445207 records
Test set length: 191055 records


<IPython.core.display.Javascript object>

In [11]:
# Preview the first two rows of the training split.
train.show(n=2)

+-------+------+-------------+--------------+-------+
|   type|amount|oldbalanceOrg|newbalanceOrig|isFraud|
+-------+------+-------------+--------------+-------+
|CASH_IN|  1.91|     316450.0|     316451.91|      0|
|CASH_IN| 93.31|    170084.31|     170177.62|      0|
+-------+------+-------------+--------------+-------+
only showing top 2 rows



<IPython.core.display.Javascript object>

### Dtypes
In this dataset, every string-typed column is treated as a categorical feature. In general, however, a numeric column may need to be treated as categorical (or vice versa), so we need to identify carefully which columns are numeric and which are categorical.

In [12]:
# List of (column name, type name) pairs; used below to separate feature kinds.
train.dtypes

[('type', 'string'),
 ('amount', 'double'),
 ('oldbalanceOrg', 'double'),
 ('newbalanceOrig', 'double'),
 ('isFraud', 'int')]

<IPython.core.display.Javascript object>

In [13]:
# Categorical features: every string-typed column of the training split.
catCols = [name for name, dtype in train.dtypes if dtype == "string"]
# Numeric features: every double-typed column except the label `isFraud`.
# The logical `and` is used (not the bitwise `&`): both operands are plain
# booleans, and `and` is the boolean operator and short-circuits.
numCols = [
    name for name, dtype in train.dtypes if dtype == "double" and name != "isFraud"
]

<IPython.core.display.Javascript object>

In [14]:
# Sanity-check the numeric and categorical feature lists built from train.dtypes.
print(numCols)
print(catCols)

['amount', 'oldbalanceOrg', 'newbalanceOrig']
['type']


<IPython.core.display.Javascript object>

In [15]:
# Preview the first five rows of the training split before scaling.
train.show(n=5)

+-------+------+-------------+--------------+-------+
|   type|amount|oldbalanceOrg|newbalanceOrig|isFraud|
+-------+------+-------------+--------------+-------+
|CASH_IN|  1.91|     316450.0|     316451.91|      0|
|CASH_IN| 93.31|    170084.31|     170177.62|      0|
|CASH_IN|166.91|   1491268.35|    1491435.26|      0|
|CASH_IN|191.92|      62016.0|      62207.92|      0|
|CASH_IN|300.22|     253339.0|     253639.22|      0|
+-------+------+-------------+--------------+-------+
only showing top 5 rows



<IPython.core.display.Javascript object>

### StandardScaler

In [16]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

# Pack the numeric columns into a single vector column, `SS_features`, which is
# the input format the scalers below expect.
vector_assembler = (
    VectorAssembler().setInputCols(numCols).setOutputCol("SS_features")
)

<IPython.core.display.Javascript object>

In [17]:
# Add the `SS_features` vector column (assembled from numCols) to the training
# split. The result is kept under its own name and is the input both scalers
# below are fit on.
temp_train = vector_assembler.transform(train)

<IPython.core.display.Javascript object>

In [20]:
# Show two assembled rows in full, without truncating the vector column.
temp_train.show(n=2, truncate=False)

+-------+------+-------------+--------------+-------+---------------------------+
|type   |amount|oldbalanceOrg|newbalanceOrig|isFraud|SS_features                |
+-------+------+-------------+--------------+-------+---------------------------+
|CASH_IN|1.91  |316450.0     |316451.91     |0      |[1.91,316450.0,316451.91]  |
|CASH_IN|93.31 |170084.31    |170177.62     |0      |[93.31,170084.31,170177.62]|
+-------+------+-------------+--------------+-------+---------------------------+
only showing top 2 rows



<IPython.core.display.Javascript object>

In [21]:
# withMean=False / withStd=True are the library defaults, spelled out here:
# each feature is divided by its standard deviation but is NOT centered.
standard_scaler = StandardScaler(
    withMean=False,
    withStd=True,
    inputCol="SS_features",
    outputCol="scaled",
)

<IPython.core.display.Javascript object>

In [22]:
# Fit on the assembled training features and keep the fitted model, so the same
# training statistics can later be applied to other data (e.g. the `test` split)
# instead of being discarded.
standard_scaler_model = standard_scaler.fit(temp_train)
# NOTE(review): this rebinds `train` to a frame that already carries
# `SS_features` and `scaled`. Re-running the VectorAssembler transform cell
# after this one is expected to fail on the existing output column — confirm,
# and consider a dedicated name for the scaled frame.
train = standard_scaler_model.transform(temp_train)

<IPython.core.display.Javascript object>

In [24]:
# Inspect the first five scaled vectors in full.
train.select("scaled").show(n=5, truncate=False)

+-----------------------------------------------------------------+
|scaled                                                           |
+-----------------------------------------------------------------+
|[3.1846821026279503E-6,0.10907656108945796,0.10773023478117204]  |
|[1.5558255863676128E-4,0.058626044019824,0.057933842008098736]   |
|[2.7830119882179643E-4,0.514022509968558,0.5077317141827913]     |
|[3.2000219326510797E-4,0.021376179530806838,0.021177542669432357]|
|[5.005786706025986E-4,0.08732294804816618,0.08634680928395519]   |
+-----------------------------------------------------------------+
only showing top 5 rows



<IPython.core.display.Javascript object>

### MinMaxScaler

In [25]:
from pyspark.ml.feature import MinMaxScaler

<IPython.core.display.Javascript object>

In [26]:
# Re-check the unscaled input vectors that the next scaler will be fit on.
temp_train.select("SS_features").show(n=2)

+--------------------+
|         SS_features|
+--------------------+
|[1.91,316450.0,31...|
|[93.31,170084.31,...|
+--------------------+
only showing top 2 rows



<IPython.core.display.Javascript object>

In [27]:
# min=0.0 / max=1.0 are the library defaults, spelled out here: each feature is
# rescaled linearly into the [0, 1] range.
minmax_scaler = MinMaxScaler(
    min=0.0,
    max=1.0,
    inputCol="SS_features",
    outputCol="scaled",
)

<IPython.core.display.Javascript object>

In [28]:
# Fit on the assembled training features and keep the fitted model, so the same
# training min/max can later be applied to other data (e.g. the `test` split)
# instead of being discarded.
minmax_scaler_model = minmax_scaler.fit(temp_train)
# NOTE(review): this rebinds `train` again, replacing whatever frame was
# previously bound to that name. `scaled` now holds the min-max values computed
# from `SS_features`.
train = minmax_scaler_model.transform(temp_train)

<IPython.core.display.Javascript object>

In [29]:
# Inspect the first five min-max scaled vectors in full.
train.select("scaled").show(n=5, truncate=False)

+-------------------------------------------------------------------+
|scaled                                                             |
+-------------------------------------------------------------------+
|[2.8295977163785848E-8,0.0066879764610618736,0.0072436809390222685]|
|[1.382354779661182E-6,0.0035946274661903957,0.003895417734221212]  |
|[2.4727128525693697E-6,0.03151704099202585,0.0341394089378311]     |
|[2.843227192289937E-6,0.0013106700844026328,0.0014239583017850081] |
|[4.447653541419784E-6,0.005354164223949926,0.0058058792670977275]  |
+-------------------------------------------------------------------+
only showing top 5 rows



<IPython.core.display.Javascript object>