### Load relevant packages and data

In [73]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import functions as F
from IPython.core.display import HTML
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline 
from pyspark.ml.classification import LogisticRegression 
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator 

  
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [74]:
# Create SparkSession
spark = SparkSession.builder\
          .appName("wb")\
          .config("spark.executor.memory", "10g")\
          .config("spark.executor.cores", "12")\
          .getOrCreate()

# Read CSV File
data = spark.read.options(header='True', inferSchema='True', delimiter=',').csv("C:/Users/timur/WB Datasets/exercise_3_creditcard.csv")

In [75]:
data.show()

+----+------------------+-------------------+------------------+-------------------+-------------------+-------------------+--------------------+-------------------+------------------+-------------------+------------------+------------------+-------------------+-------------------+-------------------+-------------------+--------------------+-------------------+-------------------+-------------------+--------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+-------------------+------+-----+
|Time|                V1|                 V2|                V3|                 V4|                 V5|                 V6|                  V7|                 V8|                V9|                V10|               V11|               V12|                V13|                V14|                V15|                V16|                 V17|                V18|                V19|                V20|                 V

### Summarise data

In [76]:
data.describe().show()

+-------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+--------------------+
|summary|              Time|                  V1|                  V2|                  V3|                  V4|                  V5|                  V6|                  V7|                  V8|                  V9|                 V10|                 V11|                 V12|                 V13|                 V14|                 V15|

### Perform data quality checks

In [64]:
# Calculate how many null values exist in data 
data.select(*(F.sum(F.col(c).isNull().cast("int")).alias(c) for c in data.columns)).show()

+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+------+-----+
|Time| V1| V2| V3| V4| V5| V6| V7| V8| V9|V10|V11|V12|V13|V14|V15|V16|V17|V18|V19|V20|V21|V22|V23|V24|V25|V26|V27|V28|Amount|Class|
+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+------+-----+
|   0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|     0|    0|
+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+------+-----+



In [67]:
# Find transactions with invalid amounts (in this case any amount less than or equal to 0)
zero_value_transactions = data.filter(F.col("Amount")<=0)
print("Number of transactions with invalid amounts: "+str(zero_value_transactions.count()))

Number of transactions with invalid amounts: 1825


In [79]:
# Remove rows where the amount is 0, as there shouldn't be 0 amount transactions (this is just to demonstrate filtering in spark)
data = data.filter(F.col("Amount")>0)
data.show()

+----+------------------+-------------------+------------------+-------------------+-------------------+-------------------+--------------------+-------------------+------------------+-------------------+------------------+------------------+-------------------+-------------------+-------------------+-------------------+--------------------+-------------------+-------------------+-------------------+--------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+-------------------+------+-----+--------------------+
|Time|                V1|                 V2|                V3|                 V4|                 V5|                 V6|                  V7|                 V8|                V9|                V10|               V11|               V12|                V13|                V14|                V15|                V16|                 V17|                V18|                V19|                V

In [80]:
# Remove columns with null values
data = data.na.drop() 
data.show()

+----+------------------+-------------------+------------------+-------------------+-------------------+-------------------+--------------------+-------------------+------------------+-------------------+------------------+------------------+-------------------+-------------------+-------------------+-------------------+--------------------+-------------------+-------------------+-------------------+--------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+-------------------+------+-----+--------------------+
|Time|                V1|                 V2|                V3|                 V4|                 V5|                 V6|                  V7|                 V8|                V9|                V10|               V11|               V12|                V13|                V14|                V15|                V16|                 V17|                V18|                V19|                V

### Generate summary statistics 

In [81]:
# Summarise transaction amounts and counts by class. 
data.groupBy("Class").agg(F.count(F.col("Amount")).alias("count"), F.avg(F.col("Amount")).alias("avgAmount"), F.sum(F.col("Amount")).alias("sumAmount")).show()

+-----+------+------------------+--------------------+
|Class| count|         avgAmount|           sumAmount|
+-----+------+------------------+--------------------+
|    1|   465|129.30746236559142|   60127.97000000001|
|    0|282517| 88.85292580623285|2.5102462039999485E7|
+-----+------+------------------+--------------------+



### Build ML model to predict whether a transaction is a fraudulent or not

In [82]:
# Build logistic regression 

# Create a log transformation of the transaction amount
data = data.withColumn("Log Amount", F.log(F.col("Amount")))

# Given we have a highly imbalanced dataset we will under-sample the majority class. 
under_sample = data.filter(F.col("Class")==1).union(data.filter(F.col("Class")==0).sample(fraction=0.05, seed=1))

# Reshuffle data to ensure classes are evenly distributed across nodes
under_sample = under_sample.sample(fraction=1.0, seed=1)

under_sample.show()

+------+-------------------+-----------------+------------------+----------------+------------------+-------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+-----------------+--------------------+------------------+-----------------+------------------+-------------------+-------------------+------------------+--------------------+--------------------+-------------------+------------------+------------------+-------------------+------------------+------+-----+-----------------+
|  Time|                 V1|               V2|                V3|              V4|                V5|                 V6|                V7|                 V8|                V9|               V10|               V11|               V12|               V13|              V14|                 V15|               V16|              V17|               V18|                V19|                V20|               V21|           

In [83]:
# Vectorise the data into a new column "features", this will be the input feature set for the logistic regression
assembler = VectorAssembler(inputCols= data.columns[1:-3]+["Log Amount"],  outputCol='Features')

# Create logistic regression model
log_reg = LogisticRegression(featuresCol='Features', labelCol='Class') 
  
# Create the pipeline
pipeline = Pipeline(stages=[ assembler, log_reg]) 

# Create train and test dataset
train_data, test_data = data.randomSplit([0.7, .3]) 
  
# Fit the model to the training dataset
fit_model = pipeline.fit(train_data) 
  
# Append the predicted values to the test dataset
results = fit_model.transform(test_data) 
  
# Showing the results 
results.select("Class", "Probability", "prediction").show() 

+-----+--------------------+----------+
|Class|         Probability|prediction|
+-----+--------------------+----------+
|    0|[0.99987727574199...|       0.0|
|    0|[0.99964521171474...|       0.0|
|    0|[0.99967152196828...|       0.0|
|    0|[0.99991777310013...|       0.0|
|    0|[0.99996627983024...|       0.0|
|    0|[0.99905191061912...|       0.0|
|    0|[0.99826959547526...|       0.0|
|    0|[0.99952182010483...|       0.0|
|    0|[0.99946556097778...|       0.0|
|    0|[0.99997596868919...|       0.0|
|    0|[0.99977403208890...|       0.0|
|    0|[0.99969600671827...|       0.0|
|    0|[0.99904200395831...|       0.0|
|    0|[0.99963463569892...|       0.0|
|    0|[0.99986202501376...|       0.0|
|    0|[0.99968910302848...|       0.0|
|    0|[0.99945345151052...|       0.0|
|    0|[0.99959185709763...|       0.0|
|    0|[0.99806072104782...|       0.0|
|    0|[0.99883858860945...|       0.0|
+-----+--------------------+----------+
only showing top 20 rows



In [84]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='Class') 
  
# Get the area under curve
print(evaluator.evaluate(results))

0.7968098811877672


In [87]:
# 

tpr = results.filter( (F.col("prediction")==F.col("Class")) & (F.col("Class")==1)).count()/results.filter(F.col("Class")==1).count()

print("The true positive rate is " +str(tpr)+"\n")

tnr = results.filter( (F.col("prediction")==F.col("Class")) & (F.col("Class")==0)).count()/results.filter(F.col("Class")==0).count()
print("The true negative rate is " +str(tnr)+"\n")


fnr = results.filter( (F.col("prediction")!=F.col("Class")) & (F.col("Class")==1)).count()/results.filter(F.col("Class")==1).count()
print("The false negative rate is " +str(fnr)+"\n")


fpr = results.filter( (F.col("prediction")!=F.col("Class")) & (F.col("Class")==0)).count()/results.filter(F.col("Class")==0).count()
print("The false positive rate is " +str(fpr)+"\n")


The true positive rate is 0.59375

The true negative rate is 0.9998697623755343

The false negative rate is 0.40625

The false positive rate is 0.00013023762446572976

