In [1]:
import pandas as pd
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import Window
import pyspark.sql.functions as F
from pyspark.ml.feature import VectorAssembler, StandardScaler, OneHotEncoder, StringIndexer
from pyspark.ml.classification import LogisticRegression, GBTClassifier, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline

In [2]:
from mmlspark.lightgbm import LightGBMClassifier

In [19]:
# Obtain the Spark session. `getOrCreate()` returns the already-running session
# when one exists (e.g. a shell or kernel that pre-defines `spark`), so it is
# harmless there, and it lets this notebook run from a fresh kernel where
# `spark` would otherwise be undefined.
# NOTE(review): the LightGBM stage used later needs the MMLSpark package to be
# available to the session; if this kernel creates the session itself, confirm
# that is configured.
spark = SparkSession.builder.getOrCreate()

# Read the CSV at `file_path`: first line is the header, column types are inferred.
file_path = "data/creditcard.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Preview the first five rows as a pandas frame (limit(5) already caps the row count).
df.limit(5).toPandas()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [20]:
# Same five-row peek, flipped so every column becomes a row; easier to read
# for a frame this wide.
first_rows = df.take(5)
pd.DataFrame(first_rows, columns=df.columns).T

Unnamed: 0,0,1,2,3,4
Time,0.0,0.0,1.0,1.0,2.0
V1,-1.35981,1.19186,-1.35835,-0.966272,-1.15823
V2,-0.0727812,0.266151,-1.34016,-0.185226,0.877737
V3,2.53635,0.16648,1.77321,1.79299,1.54872
V4,1.37816,0.448154,0.37978,-0.863291,0.403034
V5,-0.338321,0.0600176,-0.503198,-0.0103089,-0.407193
V6,0.462388,-0.0823608,1.8005,1.2472,0.0959215
V7,0.239599,-0.078803,0.791461,0.237609,0.592941
V8,0.0986979,0.0851017,0.247676,0.377436,-0.270533
V9,0.363787,-0.255425,-1.51465,-1.38702,0.817739


In [21]:
# Print the column names and the types Spark inferred while reading the CSV.
df.printSchema()

root
 |-- Time: decimal(10,0) (nullable = true)
 |-- V1: double (nullable = true)
 |-- V2: double (nullable = true)
 |-- V3: double (nullable = true)
 |-- V4: double (nullable = true)
 |-- V5: double (nullable = true)
 |-- V6: double (nullable = true)
 |-- V7: double (nullable = true)
 |-- V8: double (nullable = true)
 |-- V9: double (nullable = true)
 |-- V10: double (nullable = true)
 |-- V11: double (nullable = true)
 |-- V12: double (nullable = true)
 |-- V13: double (nullable = true)
 |-- V14: double (nullable = true)
 |-- V15: double (nullable = true)
 |-- V16: double (nullable = true)
 |-- V17: double (nullable = true)
 |-- V18: double (nullable = true)
 |-- V19: double (nullable = true)
 |-- V20: double (nullable = true)
 |-- V21: double (nullable = true)
 |-- V22: double (nullable = true)
 |-- V23: double (nullable = true)
 |-- V24: double (nullable = true)
 |-- V25: double (nullable = true)
 |-- V26: double (nullable = true)
 |-- V27: double (nullable = true)
 |-- V28: double

In [22]:
# Label balance: number of rows for each value of the `Class` column.
class_counts = df.groupBy("Class").count()
class_counts.show()

+-----+------+
|Class| count|
+-----+------+
|    1|   492|
|    0|284315|
+-----+------+



In [23]:
# Model inputs are the columns V1..V28 plus Amount (Time is not used as a feature).
v_columns = ["V%d" % idx for idx in range(1, 29)]
feature_cols = v_columns + ["Amount"]

# Pack the input columns into the single vector column named "features".
assembler = VectorAssembler(outputCol="features", inputCols=feature_cols)

# Pipeline stages start with the assembler; later cells add the classifier.
stages = [assembler]

In [24]:
# LightGBM hyper-parameters, keyed by LightGBM-style snake_case names.
# The classifier cell looks values up in this dict by key.
# NOTE(review): presumably the result of an earlier tuning run; provenance is
# not shown in this notebook.
best_params = dict(
    bagging_fraction=0.8,
    bagging_freq=1,
    eval_metric='binary_error',
    feature_fraction=0.944714847210862,
    lambda_l1=1.0,
    lambda_l2=45.0,
    learning_rate=0.1,
    loss_function='binary_error',
    max_bin=60,
    max_depth=58,
    metric='binary_error',
    num_iterations=379,
    num_leaves=850,
    objective='binary',
    random_state=7,
    verbose=None,
)

In [25]:
# LightGBM classifier over the assembled "features" vector, predicting "Class".
# Every tuned value is read from `best_params` so the dict stays the single
# source of truth; previously several of them were repeated here as literals
# and could silently drift from the dict.
lgb = LightGBMClassifier(
    featuresCol='features',
    labelCol='Class',
    objective=best_params["objective"],
    # The label counts shown earlier are heavily skewed (492 vs 284315 rows).
    isUnbalance=True,
    learningRate=best_params["learning_rate"],
    numIterations=best_params["num_iterations"],
    numLeaves=best_params["num_leaves"],
    maxDepth=best_params["max_depth"],
    maxBin=best_params["max_bin"],
    baggingFraction=best_params["bagging_fraction"],
    baggingFreq=best_params["bagging_freq"],
    baggingSeed=best_params["random_state"],
    featureFraction=best_params["feature_fraction"],
    lambdaL1=best_params["lambda_l1"],
    lambdaL2=best_params["lambda_l2"],
    # NOTE(review): no validation column is configured in this notebook;
    # confirm that early stopping actually takes effect with this setup.
    earlyStoppingRound=100,
)

# Rebuild the stage list instead of appending to it, so re-running this cell
# cannot add a second classifier stage to the pipeline.
stages = [assembler, lgb]

In [26]:
# Chain the stages into one estimator. Despite the name, this is the
# not-yet-fitted Pipeline; the fitted result is produced later by `.fit(...)`.
pipelineModel = Pipeline().setStages(stages)

In [27]:
# Random 80/20 hold-out split; the fixed seed keeps the split repeatable.
split_weights = [0.8, 0.2]
train, test = df.randomSplit(split_weights, seed=7)

In [28]:
# Number of rows that landed in the training split.
train.count()

227940

In [29]:
# Number of rows that landed in the hold-out (test) split.
test.count()

56867

In [30]:
# Fit the whole pipeline (feature assembly, then the LightGBM classifier) on
# the training split; `model` is the fitted pipeline used for scoring below.
model = pipelineModel.fit(train)

In [31]:
# Score the hold-out split with the fitted pipeline. The later cells read the
# `Class`, `prediction` and `probability` columns from this frame.
preds = model.transform(test)

In [34]:
# Show the label next to the model output for the first 20 scored rows,
# without truncating the vector column.
# NOTE(review): in the saved output the `probability` entries fall outside
# [0, 1] (e.g. 14.2 / -13.2), so they do not look like calibrated
# probabilities; confirm what this column holds before thresholding on it.
scored_view = preds.select("Class", "prediction", "probability")
scored_view.show(n=20, truncate=False)

+-----+----------+----------------------------------------+
|Class|prediction|probability                             |
+-----+----------+----------------------------------------+
|0    |0.0       |[14.216923922937408,-13.216923922937408]|
|0    |0.0       |[12.52037084264169,-11.52037084264169]  |
|0    |0.0       |[11.883296627729981,-10.883296627729981]|
|0    |0.0       |[14.084807429723773,-13.084807429723773]|
|0    |0.0       |[15.026599524111603,-14.026599524111603]|
|0    |0.0       |[12.486209620503447,-11.486209620503447]|
|0    |0.0       |[12.55522450190023,-11.55522450190023]  |
|0    |0.0       |[11.778570726099733,-10.778570726099733]|
|0    |0.0       |[13.576808370547326,-12.576808370547326]|
|0    |0.0       |[13.807849468334608,-12.807849468334608]|
|0    |0.0       |[13.767023685574399,-12.767023685574399]|
|0    |0.0       |[15.009274770523993,-14.009274770523993]|
|0    |0.0       |[12.28362392424323,-11.28362392424323]  |
|0    |0.0       |[12.88129007079969,-11

In [35]:
# Score the hold-out predictions by area under the ROC curve, using "Class"
# as the label column.
binaryEvaluator = BinaryClassificationEvaluator(labelCol="Class", metricName="areaUnderROC")
test_auc = binaryEvaluator.evaluate(preds)
print("Test Area Under ROC: {}".format(test_auc))

Test Area Under ROC: 0.9367749389900486


In [36]:
# Confusion-matrix counts from a single aggregation. The previous version ran
# five separate count() actions, each of which re-executed the scoring job.
cell_counts = {
    (row["Class"], row["prediction"]): row["count"]
    for row in preds.groupBy("Class", "prediction").count().collect()
}

# Keys hold the raw column values (e.g. (0, 0.0)). Python treats 1 and 1.0 as
# the same dict key, so the integer lookups below match float predictions.
# A label/prediction pair that never occurs simply counts as zero.
tp = cell_counts.get((1, 1), 0)
tn = cell_counts.get((0, 0), 0)
fp = cell_counts.get((0, 1), 0)
fn = cell_counts.get((1, 0), 0)

# Every scored row falls in exactly one group, so the group sizes add up to
# the total row count of `preds`.
total = sum(cell_counts.values())

print("True Positives:", tp)

print("True Negatives:", tn)

print("False Positives:", fp)

print("False Negatives:", fn)

print("Total", total)

# Each ratio falls back to 0.0 when its denominator is zero (no positive
# labels, or no positive predictions) instead of raising ZeroDivisionError.
r = float(tp) / (tp + fn) if (tp + fn) else 0.0

print("recall", r)

p = float(tp) / (tp + fp) if (tp + fp) else 0.0

print("precision", p)

f1 = 2 * p * r / (p + r) if (p + r) else 0.0

print("f1", f1)

True Positives: 80
True Negatives: 56745
False Positives: 20
False Negatives: 22
Total 56867
recall 0.7843137254901961
precision 0.8
f1 0.792079207920792
