In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import rand

import pyspark.sql.types as tp

In [2]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("CC Fraud Detection via MLP") \
    .config("spark.executor.extraJavaOptions", "-Dcom.github.fommil.netlib.BLAS=com.github.fommil.netlib.F2jBLAS") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/18 17:08:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Schema to change datatype to float to take less memory

In [3]:
my_schema = tp.StructType([
    tp.StructField(name = "distance_from_home", dataType=tp.FloatType(), nullable=True),
    tp.StructField(name = "distance_from_last_transaction", dataType=tp.FloatType(), nullable=True),
    tp.StructField(name = "ratio_to_median_purchase_price", dataType=tp.FloatType(), nullable=True),
    tp.StructField(name = "repeat_retailer", dataType=tp.FloatType(), nullable=True),
    tp.StructField(name = "used_chip", dataType=tp.FloatType(), nullable=True),
    tp.StructField(name = "used_pin_number", dataType=tp.FloatType(), nullable=True),
    tp.StructField(name = "online_order", dataType=tp.FloatType(), nullable=True),
    tp.StructField(name = "fraud", dataType=tp.FloatType(), nullable=True)
])

In [4]:
df = spark.read.csv('card_transdata.csv',schema= my_schema,header= True)
df.printSchema()
df.show(1)
print("# of columns:",len(df.columns))
print("# of rows:", df.count())

root
 |-- distance_from_home: float (nullable = true)
 |-- distance_from_last_transaction: float (nullable = true)
 |-- ratio_to_median_purchase_price: float (nullable = true)
 |-- repeat_retailer: float (nullable = true)
 |-- used_chip: float (nullable = true)
 |-- used_pin_number: float (nullable = true)
 |-- online_order: float (nullable = true)
 |-- fraud: float (nullable = true)

+------------------+------------------------------+------------------------------+---------------+---------+---------------+------------+-----+
|distance_from_home|distance_from_last_transaction|ratio_to_median_purchase_price|repeat_retailer|used_chip|used_pin_number|online_order|fraud|
+------------------+------------------------------+------------------------------+---------------+---------+---------------+------------+-----+
|         57.877857|                       0.31114|                       1.94594|            1.0|      1.0|            0.0|         0.0|  0.0|
+------------------+----------------

In [5]:
df.createOrReplaceTempView("card_data")

Check if there are any rows with NULL values

In [6]:
query = "SELECT * \
         FROM card_data \
         WHERE distance_from_home IS NULL \
            OR distance_from_last_transaction IS NULL \
            OR ratio_to_median_purchase_price IS NULL \
            OR repeat_retailer IS NULL \
            OR used_chip IS NULL \
            OR used_pin_number IS NULL \
            OR online_order IS NULL"
missing_data = spark.sql(query)

missing_data.show()

+------------------+------------------------------+------------------------------+---------------+---------+---------------+------------+-----+
|distance_from_home|distance_from_last_transaction|ratio_to_median_purchase_price|repeat_retailer|used_chip|used_pin_number|online_order|fraud|
+------------------+------------------------------+------------------------------+---------------+---------+---------------+------------+-----+
+------------------+------------------------------+------------------------------+---------------+---------+---------------+------------+-----+



Shuffle and split dataset

In [7]:
df = df.orderBy(rand(seed = 42))
train_data, test_data = df.randomSplit([.8, .2], seed=42)
print("Total # of rows:", df.count())
print("Training # of rows:", train_data.count())
print("Testing # of rows:", test_data.count())

Total # of rows: 1000000


                                                                                

Training # of rows: 800330




Testing # of rows: 199749


                                                                                

Fraud column is the label col, while the rest are feature cols
PySpark ML algorithms expect input data in a single vector 

In [8]:
feature = ["distance_from_home", "distance_from_last_transaction",
                "ratio_to_median_purchase_price", "repeat_retailer",
                "used_chip", "used_pin_number", "online_order"]
vectorAssembler = VectorAssembler(inputCols=feature, outputCol="features")

Training data and testing data into a single vector

In [9]:
train_data = vectorAssembler.transform(train_data).select("features", "fraud")
test_data = vectorAssembler.transform(test_data).select("features", "fraud")


In [10]:
train_data.show(2, False)
test_data.show(2, False)

                                                                                

+-------------------------------------------------------------------------+-----+
|features                                                                 |fraud|
+-------------------------------------------------------------------------+-----+
|(7,[0,1,2],[0.0304512158036232,14.45405101776123,0.4479562044143677])    |0.0  |
|(7,[0,1,2],[0.030510978773236275,0.2940792441368103,0.38904786109924316])|0.0  |
+-------------------------------------------------------------------------+-----+
only showing top 2 rows





+----------------------------------------------------------------------------+-----+
|features                                                                    |fraud|
+----------------------------------------------------------------------------+-----+
|(7,[0,1,2],[0.03167009726166725,0.5118361115455627,2.4324018955230713])     |0.0  |
|[0.040437523275613785,5.006600379943848,0.08538206666707993,0.0,1.0,0.0,1.0]|0.0  |
+----------------------------------------------------------------------------+-----+
only showing top 2 rows



                                                                                

Train Model 1

In [11]:
layers1 = [len(feature), 32, 16, 2] # 2 hidden layers with 32, 16 neurons respectively

In [12]:
mlp1 = MultilayerPerceptronClassifier(featuresCol="features", labelCol="fraud", \
                                      layers=layers1, seed=42, \
                                      maxIter=1000, solver='gd')

In [13]:
model1 = mlp1.fit(train_data)

25/02/18 17:09:07 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS

Evaluation Metric

In [None]:
m1_pred = model1.transform(test_data)
evaluator = MulticlassClassificationEvaluator(labelCol="fraud", predictionCol="prediction", metricName="accuracy")
test_accuracy1 = evaluator.evaluate(m1_pred)

                                                                                

In [22]:
print(f"Model 1 test accuracy: {test_accuracy1*100}%")


Model 1 test accuracy: 91.23641406379069%


Train Model 2

In [16]:
layers2 = [len(feature), 32, 2] # 1 hidden layers with 32 neurons 

In [None]:
mlp2 = MultilayerPerceptronClassifier(featuresCol="features", labelCol="fraud", \
                                      layers=layers2, seed=42, \
                                      maxIter=1000, solver='gd')

In [18]:
model2 = mlp2.fit(train_data)



In [19]:
m2_pred = model2.transform(test_data)
evaluator = MulticlassClassificationEvaluator(labelCol="fraud", predictionCol="prediction", metricName="accuracy")
test_accuracy2 = evaluator.evaluate(m2_pred)

                                                                                

In [23]:
print(f"Model 2 test accuracy: {test_accuracy2*100}%")


Model 2 test accuracy: 91.70592269607573%
