In [117]:
# libraries
import warnings
# import findspark
import pandas as pd
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder, StandardScaler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import SparkSession
from pyspark.ml.feature import Bucketizer

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [118]:
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [119]:
spark_df = spark.read.csv('train.csv', inferSchema=True, header=True)
spark_df.show(5)

+------+--------------------+--------------------+------------+-----------+--------+----------+--------------------+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|TARGET|              CST_NR|               CC_NR|DAY_OF_MONTH|DAY_OF_WEEK|TXN_TIME|TXN_SOURCE|             TXN_TRM|           TXN_ENTRY|TXN_AMNT|                CITY|             COUNTRY|             MC_NAME|               MC_ID|            MCC_CODE|
+------+--------------------+--------------------+------------+-----------+--------+----------+--------------------+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|   0.0|c64007916f81cb954...|bf4e876751b878515...|         8.0|        4.0|  1800.0|         O|4a1917677530762f2...|12d7720f7273e2a1c...|   540.0|8c1e38274fba4c211...|368a2b6e4893c8ab0...|09adaccd88433a530...|e0d92b0729b140698...|9bcde6cbc90213

In [120]:
print("Shape: ", (spark_df.count(), len(spark_df.columns)))

Shape:  (607507, 15)


In [121]:
spark_df.printSchema() #types of Variables

root
 |-- TARGET: double (nullable = true)
 |-- CST_NR: string (nullable = true)
 |-- CC_NR: string (nullable = true)
 |-- DAY_OF_MONTH: double (nullable = true)
 |-- DAY_OF_WEEK: double (nullable = true)
 |-- TXN_TIME: double (nullable = true)
 |-- TXN_SOURCE: string (nullable = true)
 |-- TXN_TRM: string (nullable = true)
 |-- TXN_ENTRY: string (nullable = true)
 |-- TXN_AMNT: double (nullable = true)
 |-- CITY: string (nullable = true)
 |-- COUNTRY: string (nullable = true)
 |-- MC_NAME: string (nullable = true)
 |-- MC_ID: string (nullable = true)
 |-- MCC_CODE: string (nullable = true)



In [122]:
spark_df.describe(["TXN_TRM"]).show()

+-------+--------------------+
|summary|             TXN_TRM|
+-------+--------------------+
|  count|              607507|
|   mean|                null|
| stddev|                null|
|    min|4a1917677530762f2...|
|    max|4a1917677530762f2...|
+-------+--------------------+



In [123]:
spark_df.groupby("TXN_TRM").count().show() #all values are equal, therefore TXN_TRM should be dropped.

+--------------------+------+
|             TXN_TRM| count|
+--------------------+------+
|4a1917677530762f2...|607507|
+--------------------+------+



In [124]:
columns_to_drop = ['TXN_TRM', "CST_NR"]  # I also dropped CST_NR because it's highly correlated with CC_NR.
spark_df = spark_df.drop(*columns_to_drop)

In [125]:
spark_df.show(5)

+------+--------------------+------------+-----------+--------+----------+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|TARGET|               CC_NR|DAY_OF_MONTH|DAY_OF_WEEK|TXN_TIME|TXN_SOURCE|           TXN_ENTRY|TXN_AMNT|                CITY|             COUNTRY|             MC_NAME|               MC_ID|            MCC_CODE|
+------+--------------------+------------+-----------+--------+----------+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|   0.0|bf4e876751b878515...|         8.0|        4.0|  1800.0|         O|12d7720f7273e2a1c...|   540.0|8c1e38274fba4c211...|368a2b6e4893c8ab0...|09adaccd88433a530...|e0d92b0729b140698...|9bcde6cbc90213586...|
|   0.0|933c28d9d4c6fb6db...|        11.0|        7.0|  1404.0|         O|12b4164904d6ecac8...| 49490.0|8c1e38274fba4c211...|368a2b6e4893c8ab0...|0b0b39198d9a5f

In [126]:
spark_df.select("TARGET").distinct().show()

+------+
|TARGET|
+------+
|   0.0|
|   1.0|
+------+



In [127]:
num_cols = [col[0] for col in spark_df.dtypes if col[1] != 'string']
spark_df.select(num_cols).describe().show() 

+-------+--------------------+-----------------+------------------+------------------+------------------+
|summary|              TARGET|     DAY_OF_MONTH|       DAY_OF_WEEK|          TXN_TIME|          TXN_AMNT|
+-------+--------------------+-----------------+------------------+------------------+------------------+
|  count|              607507|           607507|            607507|            607507|            607391|
|   mean|0.006064127656142234|10.64453413705521| 4.031207870855809|1434.6514673904992|479.29211990299495|
| stddev| 0.07763609942149745| 5.74144331272658|1.9863533004906548| 596.4313362735044|10471.584699228084|
|    min|                 0.0|              1.0|               1.0|               0.0|               0.0|
|    max|                 1.0|             20.0|               7.0|            2359.0|         4792970.0|
+-------+--------------------+-----------------+------------------+------------------+------------------+



In [128]:
cat_cols = [col[0] for col in spark_df.dtypes if col[1] == 'string']
spark_df.select(cat_cols).describe().show() 

+-------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|summary|               CC_NR|TXN_SOURCE|           TXN_ENTRY|                CITY|             COUNTRY|             MC_NAME|               MC_ID|            MCC_CODE|
+-------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  count|              607507|    607507|              607507|              607507|              607507|              607507|              607507|              607507|
|   mean|                null|      null|                null|                null|                null|                null|                null|                null|
| stddev|                null|      null|                null|                null|                null|                null|                null|              

In [129]:
from pyspark.sql.functions import when, count, col
spark_df.select([count(when(col(c).isNull(), c)).alias(c) for c in spark_df.columns]).toPandas().T

# there are some missing values on TXN_AMNT feature. 

Unnamed: 0,0
TARGET,0
CC_NR,0
DAY_OF_MONTH,0
DAY_OF_WEEK,0
TXN_TIME,0
TXN_SOURCE,0
TXN_ENTRY,0
TXN_AMNT,116
CITY,0
COUNTRY,0


In [130]:
spark_df = spark_df.fillna({'TXN_AMNT':'0'})  # I fill them with 0.

In [131]:
spark_df.select([count(when(col(c).isNull(), c)).alias(c) for c in spark_df.columns]).toPandas().T # zero missing values+

Unnamed: 0,0
TARGET,0
CC_NR,0
DAY_OF_MONTH,0
DAY_OF_WEEK,0
TXN_TIME,0
TXN_SOURCE,0
TXN_ENTRY,0
TXN_AMNT,0
CITY,0
COUNTRY,0


In [132]:
spark_df = spark_df.toDF(*[c.lower() for c in spark_df.columns])
spark_df.show(5)

+------+--------------------+------------+-----------+--------+----------+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|target|               cc_nr|day_of_month|day_of_week|txn_time|txn_source|           txn_entry|txn_amnt|                city|             country|             mc_name|               mc_id|            mcc_code|
+------+--------------------+------------+-----------+--------+----------+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+
|   0.0|bf4e876751b878515...|         8.0|        4.0|  1800.0|         O|12d7720f7273e2a1c...|   540.0|8c1e38274fba4c211...|368a2b6e4893c8ab0...|09adaccd88433a530...|e0d92b0729b140698...|9bcde6cbc90213586...|
|   0.0|933c28d9d4c6fb6db...|        11.0|        7.0|  1404.0|         O|12b4164904d6ecac8...| 49490.0|8c1e38274fba4c211...|368a2b6e4893c8ab0...|0b0b39198d9a5f

In [133]:
spark_df = spark_df.drop('mc_name')  # this feature is highly correlated with mc_id, so we don't need it.

In [134]:
spark_df = spark_df.withColumn("TXN_AMNT", spark_df["TXN_AMNT"].cast("integer")) #float to integer

In [135]:
# indexing all categorical columns in the dataset
from pyspark.ml.feature import StringIndexer
indexer1 = StringIndexer(inputCol="cc_nr", outputCol="cc_nr_index")
indexer2 = StringIndexer(inputCol="txn_source", outputCol="txn_source_index")
indexer3 = StringIndexer(inputCol="txn_entry", outputCol="txn_entry_index")
indexer4 = StringIndexer(inputCol="city", outputCol="city_index")
indexer5 = StringIndexer(inputCol="country", outputCol="country_index")
indexer6 = StringIndexer(inputCol="mc_id", outputCol="mc_id_label")
indexer7 = StringIndexer(inputCol="mcc_code", outputCol="mcc_code_label")

In [137]:
#One Hot Encoding day features
encoder = OneHotEncoder(inputCols=["day_of_month", "day_of_week"], outputCols=["day_of_month_ohe", "day_of_week_ohe"])
spark_df = encoder.fit(spark_df).transform(spark_df)

In [142]:
train_data, test_data = spark_df.randomSplit([0.75, 0.25])  #train_test_split

In [140]:
spark_df = spark_df.drop("day_of_month", "day_of_week")  # we already encoded them, so we dont need their original forms.

In [145]:
#vector assembling
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=['txn_time',
                                       #"txn_amnt",
 'day_of_month_ohe',
 'day_of_week_ohe'],outputCol='features')

In [146]:
#DecisionTreeAlgorithm
from pyspark.ml.classification import DecisionTreeClassifier
dtc = DecisionTreeClassifier(labelCol='target',featuresCol='features')

In [147]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[indexer1, indexer2, indexer3, indexer4, indexer5, indexer6, indexer7, assembler, dtc])

In [153]:
# splitting training and validation data
train_data,val_data = spark_df.randomSplit([0.7,0.3])

# training model pipeline with data
model = pipeline.fit(train_data)

In [159]:
# making prediction on model with validation data
dtc_predictions = model.transform(val_data)

# Select example rows to display.
dtc_predictions.select("prediction","probability", "target", "features").show(5)

+----------+--------------------+------+--------------------+
|prediction|         probability|target|            features|
+----------+--------------------+------+--------------------+
|       0.0|[0.99397231923425...|   0.0|(28,[0,8,24],[162...|
|       0.0|[0.99397231923425...|   0.0|(28,[0,15,24],[93...|
|       0.0|[0.99397231923425...|   0.0|(28,[0,7,23],[210...|
|       0.0|[0.99397231923425...|   0.0|(28,[0,4,27],[103...|
|       0.0|[0.99397231923425...|   0.0|(28,[0,20,22],[30...|
+----------+--------------------+------+--------------------+
only showing top 5 rows



In [160]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
acc_evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction", metricName="accuracy")
dtc_acc = acc_evaluator.evaluate(dtc_predictions)
print('A Decision Tree algorithm had an accuracy of: {0:2.2f}%'.format(dtc_acc*100))

A Decision Tree algorithm had an accuracy of: 99.39%
