In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import rand

import pyspark.sql.types as tp

In [2]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("CC Fraud Detection via Logistic Regression") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/18 23:33:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
my_schema = tp.StructType([
    tp.StructField(name = "distance_from_home", dataType=tp.FloatType(), nullable=True),
    tp.StructField(name = "distance_from_last_transaction", dataType=tp.FloatType(), nullable=True),
    tp.StructField(name = "ratio_to_median_purchase_price", dataType=tp.FloatType(), nullable=True),
    tp.StructField(name = "repeat_retailer", dataType=tp.FloatType(), nullable=True),
    tp.StructField(name = "used_chip", dataType=tp.FloatType(), nullable=True),
    tp.StructField(name = "used_pin_number", dataType=tp.FloatType(), nullable=True),
    tp.StructField(name = "online_order", dataType=tp.FloatType(), nullable=True),
    tp.StructField(name = "fraud", dataType=tp.FloatType(), nullable=True)
])

In [4]:
df = spark.read.csv('card_transdata.csv',schema= my_schema,header= True)
df.printSchema()
df.show(1)
print("# of columns:",len(df.columns))
print("# of rows:", df.count())

root
 |-- distance_from_home: float (nullable = true)
 |-- distance_from_last_transaction: float (nullable = true)
 |-- ratio_to_median_purchase_price: float (nullable = true)
 |-- repeat_retailer: float (nullable = true)
 |-- used_chip: float (nullable = true)
 |-- used_pin_number: float (nullable = true)
 |-- online_order: float (nullable = true)
 |-- fraud: float (nullable = true)

+------------------+------------------------------+------------------------------+---------------+---------+---------------+------------+-----+
|distance_from_home|distance_from_last_transaction|ratio_to_median_purchase_price|repeat_retailer|used_chip|used_pin_number|online_order|fraud|
+------------------+------------------------------+------------------------------+---------------+---------+---------------+------------+-----+
|         57.877857|                       0.31114|                       1.94594|            1.0|      1.0|            0.0|         0.0|  0.0|
+------------------+----------------

[Stage 1:>                                                          (0 + 8) / 8]

# of rows: 1000000


                                                                                

In [5]:
df.createOrReplaceTempView("card_data")

Check if there are any rows with NULL values

In [6]:
query = "SELECT * \
         FROM card_data \
         WHERE distance_from_home IS NULL \
            OR distance_from_last_transaction IS NULL \
            OR ratio_to_median_purchase_price IS NULL \
            OR repeat_retailer IS NULL \
            OR used_chip IS NULL \
            OR used_pin_number IS NULL \
            OR online_order IS NULL"
missing_data = spark.sql(query)

missing_data.show()

+------------------+------------------------------+------------------------------+---------------+---------+---------------+------------+-----+
|distance_from_home|distance_from_last_transaction|ratio_to_median_purchase_price|repeat_retailer|used_chip|used_pin_number|online_order|fraud|
+------------------+------------------------------+------------------------------+---------------+---------+---------------+------------+-----+
+------------------+------------------------------+------------------------------+---------------+---------+---------------+------------+-----+



Shuffle and split dataset

In [7]:
df = df.orderBy(rand(seed = 42))
train_data, test_data = df.randomSplit([.8, .2], seed=42)
print("Total # of rows:", df.count())
print("Training # of rows:", train_data.count())
print("Testing # of rows:", test_data.count())

Total # of rows: 1000000


                                                                                

Training # of rows: 800330




Testing # of rows: 199749


                                                                                

Fraud column is the label col, while the rest are feature cols
PySpark ML algorithms expect input data in a single vector 

In [8]:
feature = ["distance_from_home", "distance_from_last_transaction",
                "ratio_to_median_purchase_price", "repeat_retailer",
                "used_chip", "used_pin_number", "online_order"]
vectorAssembler = VectorAssembler(inputCols=feature, outputCol="features")

Training data and testing data into a single vector

In [9]:
train_data = vectorAssembler.transform(train_data).select("features", "fraud")
test_data = vectorAssembler.transform(test_data).select("features", "fraud")


In [10]:
sample_row = train_data.select("features", "fraud").first()
print(sample_row)
sample_row = test_data.select("features", "fraud").first()
print(sample_row)

                                                                                

Row(features=SparseVector(7, {0: 0.0305, 1: 14.4541, 2: 0.448}), fraud=0.0)




Row(features=SparseVector(7, {0: 0.0317, 1: 0.5118, 2: 2.4324}), fraud=0.0)


                                                                                

Train Model

In [11]:
lr = LogisticRegression(featuresCol="features", labelCol="fraud", maxIter=1000, regParam=0.0)
lr_model = lr.fit(train_data)

25/02/18 23:33:56 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS

In [12]:
train_pred = lr_model.transform(train_data)
test_pred = lr_model.transform(test_data)

In [None]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="fraud", \
    predictionCol="prediction", \
    metricName="accuracy"
)

train_acc = evaluator.evaluate(train_pred)
test_acc = evaluator.evaluate(test_pred)

print(f"Train Accuracy: {train_acc}")
print(f"Test Accuracy: {test_acc}")



Train Accuracy: 0.9587577080951399
Test Accuracy: 0.9585999379174301


                                                                                

In [14]:
spark.stop()