In [76]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier, LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.functions import col


In [77]:
# Obtain (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("Fraud Detection Modeling")
    .getOrCreate()
)

# Read the feature table (parquet) that every later cell models on.
input_path = "../data/features/fraud_features_v2.parquet"
df_model = spark.read.parquet(input_path)


### Data Splitting (Time-Based Split)

Train: Jan → May (≈82.5% of rows)

Validation: June (≈16.4% of rows)

Test: July 1–2 (≈1.1% of rows)

The dataset spans from January 1, 2023 to July 2, 2023.
To prevent temporal leakage, a time-based split was applied. Data from January to May was used for training, June for validation, and early July for final testing. This setup simulates a real-world fraud detection scenario where models are evaluated on future transactions.

In [78]:
# Chronological cut-offs compared against TX_DATE:
#   train = before val_start, validation = [val_start, test_start), test = from test_start on.
val_start = "2023-06-01"
test_start = "2023-07-01"
tx_date = col("TX_DATE")

train_df = df_model.filter(tx_date < val_start)
val_df = df_model.filter((tx_date >= val_start) & (tx_date < test_start))
test_df = df_model.filter(tx_date >= test_start)

# Report the size of each split; every count() runs a Spark job.
for split_label, split_df in (("Train:", train_df), ("Val:  ", val_df), ("Test: ", test_df)):
    print(split_label, split_df.count())


Train: 1447243
Val:   287840
Test:  19072


In [79]:
# Model input columns, grouped by kind. The concatenation order below is the
# order in which they are handed to the VectorAssembler.
amount_cols = ["TX_AMOUNT", "LOG_TX_AMOUNT"]
# NOTE(review): TX_TIME_SECONDS / TX_TIME_DAYS look like elapsed-time counters;
# if so, validation/test values fall outside the training range under the
# time-based split — confirm they are intended as model inputs.
time_cols = ["TX_TIME_SECONDS", "TX_TIME_DAYS", "TX_HOUR"]
flag_cols = ["IS_NIGHT", "IS_WEEKEND"]

feature_cols = [*amount_cols, *time_cols, *flag_cols]


In [80]:
# Pack the columns listed in feature_cols into a single vector column named
# "features"; both model pipelines below use this stage as their first step.
# (VectorAssembler is imported with the other pyspark.ml.feature names in the
# notebook's import cell, so all dependencies are declared in one place.)
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features"
)


In [81]:
# Standardise the assembled vector (centre on the mean, scale to unit std)
# and write the result to "scaled_features".
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="features",
    outputCol="scaled_features",
)

# Logistic regression reads the standardised vector and predicts TX_FRAUD.
lr = LogisticRegression(labelCol="TX_FRAUD", featuresCol="scaled_features")

# Stage order: assemble -> scale -> classify.
lr_stages = [assembler, scaler, lr]
pipeline = Pipeline(stages=lr_stages)


In [82]:
# Fit the logistic-regression pipeline on the training split, then score the
# validation split.
lr_model = pipeline.fit(train_df)
val_pred = lr_model.transform(val_df)

# Preview the label next to the model's probability vector and prediction.
lr_preview_cols = ["TX_FRAUD", "probability", "prediction"]
val_pred.select(*lr_preview_cols).show(5)


                                                                                

+--------+--------------------+----------+
|TX_FRAUD|         probability|prediction|
+--------+--------------------+----------+
|       0|[0.99978536182453...|       0.0|
|       1|[0.24325566066173...|       1.0|
|       0|[0.99999033257916...|       0.0|
|       0|[0.99998703625320...|       0.0|
|       0|[0.95695074777610...|       0.0|
+--------+--------------------+----------+
only showing top 5 rows


In [83]:

# Gradient-boosted trees trained on the assembler's "features" column
# directly (this pipeline has no scaling stage).
gbt = GBTClassifier(
    featuresCol="features",
    labelCol="TX_FRAUD",
    maxDepth=4,
    maxIter=20,        # default 20
    subsamplingRate=0.8,
    stepSize=0.1,
    # subsamplingRate < 1 makes training stochastic; set the seed explicitly
    # so the fit does not depend on the session's default seed.
    seed=42,
)

# Stage order: assemble -> classify. Fitting runs on the training split only.
gbt_pipeline = Pipeline(stages=[assembler, gbt])
gbt_model = gbt_pipeline.fit(train_df)


                                                                                

In [84]:
# Score the validation split with the fitted GBT pipeline and preview the
# label next to the probability vector and prediction.
pred = gbt_model.transform(val_df)

gbt_preview_cols = ["TX_FRAUD", "probability", "prediction"]
pred.select(*gbt_preview_cols).show(5)


+--------+--------------------+----------+
|TX_FRAUD|         probability|prediction|
+--------+--------------------+----------+
|       0|[0.95325922538474...|       0.0|
|       1|[0.04364652142728...|       1.0|
|       0|[0.95343301850236...|       0.0|
|       0|[0.95343301850236...|       0.0|
|       0|[0.95317046835999...|       0.0|
+--------+--------------------+----------+
only showing top 5 rows


In [85]:
# Save models and predictions for evaluation notebook
lr_model.write().overwrite().save("../models/lr_model")
gbt_model.write().overwrite().save("../models/gbt_model")

# Save predictions (optional, can also recreate them in evaluation notebook)
val_pred.write.mode('overwrite').parquet("../data/predictions/lr_predictions.parquet")
pred.write.mode('overwrite').parquet("../data/predictions/gbt_predictions.parquet")

print("Models and predictions saved successfully!")


                                                                                

Models and predictions saved successfully!


                                                                                