In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, count, when
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col, monotonically_increasing_id, lit, date_add, explode
import numpy as np
import matplotlib.pyplot as plt
import warnings
import plotly.express as px
from pyspark.ml.evaluation import  MulticlassClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier,GBTClassifier,MultilayerPerceptronClassifier
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
# Define categorical and numerical columns
warnings.filterwarnings('ignore')

In [2]:
sc = SparkContext(master = 'local')
spark = SparkSession.builder \
          .appName("Python Spark Classifier") \
          .getOrCreate()

25/02/21 15:38:15 WARN Utils: Your hostname, Khim3 resolves to a loopback address: 127.0.1.1; using 10.0.120.234 instead (on interface wlo1)
25/02/21 15:38:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/21 15:38:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
df  = spark.read.csv('data.csv', header=True, inferSchema=True)
df.show(5)

                                                                                

+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|    type|  amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|   1| PAYMENT| 9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|
|   1| PAYMENT| 1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|
|   1|TRANSFER|   181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|             0|
|   1|CASH_OUT|   181.0| C840083671|        181.0|           0.0|  C38997010|       21182.0|           0.0|      1|             0|
|   1| PAYMENT|11668.14|C2048537720|      41554.0|      29885.86|M1230701703|      

## EDA

### Schema of the data

In [4]:
df.printSchema()

root
 |-- step: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- newbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)
 |-- isFlaggedFraud: integer (nullable = true)



## Check Summary of the data

In [5]:
df.summary().show()

25/02/21 15:38:29 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/02/21 15:38:33 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors

+-------+------------------+--------+-----------------+-----------+-----------------+------------------+-----------+------------------+------------------+--------------------+--------------------+
|summary|              step|    type|           amount|   nameOrig|    oldbalanceOrg|    newbalanceOrig|   nameDest|    oldbalanceDest|    newbalanceDest|             isFraud|      isFlaggedFraud|
+-------+------------------+--------+-----------------+-----------+-----------------+------------------+-----------+------------------+------------------+--------------------+--------------------+
|  count|           6362620| 6362620|          6362620|    6362620|          6362620|           6362620|    6362620|           6362620|           6362620|             6362620|             6362620|
|   mean|243.39724563151657|    NULL|179861.9035491287|       NULL|833883.1040744764| 855113.6685785812|       NULL|1100701.6665196533|1224996.3982019224|0.001290820448180152| 2.51468734577894E-6|
| stddev|142.33

                                                                                

In [6]:
duplicate_count = df.groupBy(df.columns).count().where("count > 1").select(count("*")).collect()[0][0]
print(f"Number of duplicate rows: {duplicate_count}")
df.select([sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]).show()

                                                                                

Number of duplicate rows: 0




+----+----+------+--------+-------------+--------------+--------+--------------+--------------+-------+--------------+
|step|type|amount|nameOrig|oldbalanceOrg|newbalanceOrig|nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+----+------+--------+-------------+--------------+--------+--------------+--------------+-------+--------------+
|   0|   0|     0|       0|            0|             0|       0|             0|             0|      0|             0|
+----+----+------+--------+-------------+--------------+--------+--------------+--------------+-------+--------------+



                                                                                

In [7]:
df = (
    df.withColumnRenamed('nameOrig', 'origin')
       .withColumnRenamed('oldbalanceOrg', 'sender_old_balance')
       .withColumnRenamed('newbalanceOrig', 'sender_new_balance')
       .withColumnRenamed('nameDest', 'destination')
       .withColumnRenamed('oldbalanceDest', 'receiver_old_balance')
       .withColumnRenamed('newbalanceDest', 'receiver_new_balance')
       .withColumnRenamed('isFraud', 'isfraud')
)
df.columns

['step',
 'type',
 'amount',
 'origin',
 'sender_old_balance',
 'sender_new_balance',
 'destination',
 'receiver_old_balance',
 'receiver_new_balance',
 'isfraud',
 'isFlaggedFraud']

In [8]:
# drop column isFlaggedFraud

df = df.drop('isFlaggedFraud')

In [9]:
df.groupBy("isfraud", "type").count().orderBy("isfraud", "type").show()



+-------+--------+-------+
|isfraud|    type|  count|
+-------+--------+-------+
|      0| CASH_IN|1399284|
|      0|CASH_OUT|2233384|
|      0|   DEBIT|  41432|
|      0| PAYMENT|2151495|
|      0|TRANSFER| 528812|
|      1|CASH_OUT|   4116|
|      1|TRANSFER|   4097|
+-------+--------+-------+



                                                                                

## Visualize the data

In [10]:
df = df.withColumn(
    "type2",
    when((col("origin").contains("C")) & (col("destination").contains("C")), "CC")
    .when((col("origin").contains("C")) & (col("destination").contains("M")), "CM")
    .when((col("origin").contains("M")) & (col("destination").contains("C")), "MC")
    .when((col("origin").contains("M")) & (col("destination").contains("M")), "MM")
    .otherwise(None)
)
df.show(5)

+----+--------+--------+-----------+------------------+------------------+-----------+--------------------+--------------------+-------+-----+
|step|    type|  amount|     origin|sender_old_balance|sender_new_balance|destination|receiver_old_balance|receiver_new_balance|isfraud|type2|
+----+--------+--------+-----------+------------------+------------------+-----------+--------------------+--------------------+-------+-----+
|   1| PAYMENT| 9839.64|C1231006815|          170136.0|         160296.36|M1979787155|                 0.0|                 0.0|      0|   CM|
|   1| PAYMENT| 1864.28|C1666544295|           21249.0|          19384.72|M2044282225|                 0.0|                 0.0|      0|   CM|
|   1|TRANSFER|   181.0|C1305486145|             181.0|               0.0| C553264065|                 0.0|                 0.0|      1|   CC|
|   1|CASH_OUT|   181.0| C840083671|             181.0|               0.0|  C38997010|             21182.0|                 0.0|      1|   CC|

In [11]:
fraud_trans = df.filter(col("isfraud") == 1)
valid_trans = df.filter(col("isfraud") == 0)

# Count occurrences of each type2 category for fraud transactions
print("Number of fraud transactions according to type are below:")
fraud_trans.groupBy("type2").agg(count("*").alias("count")).orderBy(col("count").desc()).show()

# Count occurrences of each type2 category for valid transactions
print("Number of valid transactions according to type are below:")
valid_trans.groupBy("type2").agg(count("*").alias("count")).orderBy(col("count").desc()).show()

Number of fraud transactions according to type are below:


                                                                                

+-----+-----+
|type2|count|
+-----+-----+
|   CC| 8213|
+-----+-----+

Number of valid transactions according to type are below:




+-----+-------+
|type2|  count|
+-----+-------+
|   CC|4202912|
|   CM|2151495|
+-----+-------+



                                                                                

In [12]:
# Drop 'origin' and 'destination' columns
df = df.drop("origin", "destination")

# Show the updated DataFrame
df.show()


+----+--------+---------+------------------+------------------+--------------------+--------------------+-------+-----+
|step|    type|   amount|sender_old_balance|sender_new_balance|receiver_old_balance|receiver_new_balance|isfraud|type2|
+----+--------+---------+------------------+------------------+--------------------+--------------------+-------+-----+
|   1| PAYMENT|  9839.64|          170136.0|         160296.36|                 0.0|                 0.0|      0|   CM|
|   1| PAYMENT|  1864.28|           21249.0|          19384.72|                 0.0|                 0.0|      0|   CM|
|   1|TRANSFER|    181.0|             181.0|               0.0|                 0.0|                 0.0|      1|   CC|
|   1|CASH_OUT|    181.0|             181.0|               0.0|             21182.0|                 0.0|      1|   CC|
|   1| PAYMENT| 11668.14|           41554.0|          29885.86|                 0.0|                 0.0|      0|   CM|
|   1| PAYMENT|  7817.71|           5386

In [13]:
df.printSchema()

root
 |-- step: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- sender_old_balance: double (nullable = true)
 |-- sender_new_balance: double (nullable = true)
 |-- receiver_old_balance: double (nullable = true)
 |-- receiver_new_balance: double (nullable = true)
 |-- isfraud: integer (nullable = true)
 |-- type2: string (nullable = true)



In [14]:
label_indexer = StringIndexer(inputCol="isfraud", outputCol="label")
df = label_indexer.fit(df).transform(df)

categorical_cols = ["type", "type2"]
numerical_cols = ["amount", "sender_old_balance", "sender_new_balance",
                  "receiver_old_balance", "receiver_new_balance"]

# Pipeline stages
stages = []

# StringIndexer and OneHotEncoder for categorical features
for col in categorical_cols:
    indexer = StringIndexer(inputCol=col, outputCol=f"indexed_{col}")
    encoder = OneHotEncoder(inputCol=f"indexed_{col}", outputCol=f"encoded_{col}")
    stages.extend([indexer, encoder])

# Assemble all features into a single vector
assembler_inputs = [f"encoded_{col}" for col in categorical_cols] + numerical_cols
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")
stages.append(assembler)


                                                                                

In [15]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Define Naïve Bayes model
nb_classifier = NaiveBayes(labelCol="isfraud", featuresCol="features", modelType="multinomial")

# Add model to the pipeline
nb_pipeline = Pipeline(stages=stages + [nb_classifier])

# Split data into training and test sets
training, test = df.randomSplit([0.8, 0.2], seed=42)

# Hyperparameter tuning grid
nb_param_grid = ParamGridBuilder() \
    .addGrid(nb_classifier.smoothing, [0.0, 1.0, 10.0]) \
    .build()

# Cross-validation
nb_cv = CrossValidator(
    estimator=nb_pipeline,
    estimatorParamMaps=nb_param_grid,
    evaluator=MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="isfraud", metricName="f1"),
    numFolds=3
)

# Train model
nb_cv_model = nb_cv.fit(training)

# Make predictions
nb_predictions = nb_cv_model.transform(test)

# Evaluate metrics
nb_metrics = {
    "F1-Score": MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="isfraud", metricName="f1").evaluate(nb_predictions),
    "Accuracy": MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="isfraud", metricName="accuracy").evaluate(nb_predictions),
    "Precision": MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="isfraud", metricName="weightedPrecision").evaluate(nb_predictions),
    "Recall": MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="isfraud", metricName="weightedRecall").evaluate(nb_predictions),
}

# Print metrics
print("\n🔍 Naïve Bayes Metrics:")
for metric, value in nb_metrics.items():
    print(f"{metric}: {value:.4f}")


25/02/21 15:42:04 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


🔍 Naïve Bayes Metrics:
F1-Score: 0.9549
Accuracy: 0.9161
Precision: 0.9981
Recall: 0.9161


                                                                                