# Imports and installing packages

In [1]:
!git clone https://github.com/youcef-ely/credit_card_transactions_fraud_detection.git

Cloning into 'credit_card_transactions_fraud_detection'...
remote: Enumerating objects: 42, done.[K
remote: Counting objects: 100% (42/42), done.[K
remote: Compressing objects: 100% (35/35), done.[K
remote: Total 42 (delta 17), reused 24 (delta 6), pack-reused 0 (from 0)[K
Unpacking objects: 100% (42/42), 6.10 MiB | 4.79 MiB/s, done.


In [2]:
import os

os.chdir("credit_card_transactions_fraud_detection")

In [3]:
!pip install pip upgrade
!pip install -r requirements.txt --quiet

[31mERROR: Could not find a version that satisfies the requirement upgrade (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for upgrade[0m[31m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
beatrix-jupyterlab 2024.66.154055 requires jupyterlab~=3.6.0, but you have jupyterlab 4.3.1 which is incompatible.
bigframes 0.22.0 requires google-cloud-bigquery[bqstorage,pandas]>=3.10.0, but you have google-cloud-bigquery 2.34.4 which is incompatible.
bigframes 0.22.0 requires google-cloud-storage>=2.0.0, but you have google-cloud-storage 1.44.0 which is incompatible.
bigframes 0.22.0 requires pandas<2.1.4,>=1.5.0, but you have pandas 2.2.2 which is incompatible.
bigframes 0.22.0 requires sqlglot<20,>=19.9.0, but you have sqlglot 26.0.0 which is incompatible.
cesium 0.12.3 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which 

In [4]:
import warnings

warnings.filterwarnings('error', category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

In [5]:
import tensorflow as tf

gpu_devices = tf.config.list_physical_devices('GPU')
print(f"Number of available GPUs: {len(gpu_devices)}")

Number of available GPUs: 0


In [7]:
from src.metrics import *
from src.processing import *

In [34]:
import os
import time
import numpy as np
import pretty_errors
from tqdm import tqdm
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from sklearn.metrics import classification_report
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler, OneHotEncoder, StringIndexer
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Setting and data processing

In [8]:
import logging
from pyspark.sql import SparkSession

# Create Spark session
spark = SparkSession.builder \
    .appName("credit_card_fraud_analysis") \
    .config("spark.driver.memory", "32g") \
    .config("spark.executor.memory", "32g") \
    .config("spark.debug.maxToStringFields", "100") \
    .getOrCreate()

# Set Spark log level to ERROR to suppress most logs
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/18 15:21:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [9]:
data = spark.read.csv("/kaggle/input/processed-data/processed_data", header=True, inferSchema=True)
data.show(5)

                                                                                

+--------------------+-------------+------+------+----------------+-----+-------+---------+--------+--------------------+------------------+-----------+--------+-------+---+-----------+----------+-----------------+------------------+-----------------+-----------+
|            merchant|     category|   amt|gender|            city|state|    lat|     long|city_pop|                 job|         merch_lat| merch_long|is_fraud|portion|age|trans_month|trans_hour|trans_day_of_week|trans_day_of_month|trans_day_of_year|distance_km|
+--------------------+-------------+------+------+----------------+-----+-------+---------+--------+--------------------+------------------+-----------+--------+-------+---+-----------+----------+-----------------+------------------+-----------------+-----------+
|fraud_Raynor, Rei...|gas_transport| 61.15|     M|   West Hartford|   VT|43.7185| -72.4439|     140|Development worke...|         44.208355| -72.133812|       0|  train| 35|          8|        10|            

In [10]:
# Split data into training and testing sets
train_data = data.where(col('portion') == 'train')
test_data = data.where(col('portion') == 'test')
del data

In [11]:
train_sample = train_data.sampleBy('is_fraud', fractions={0: 0.10, 1: 1.0}, seed=123)
print('trainset proportions: ')
calculate_distribution(train_sample, 'is_fraud')

print('testset proportions: ')
calculate_distribution(test_data, 'is_fraud')

del train_data

trainset proportions: 




+--------+------+------------------+
|is_fraud| count|        proportion|
+--------+------+------------------+
|       1|  7506|0.0550903125894502|
|       0|128743|0.9449096874105498|
+--------+------+------------------+

testset proportions: 




+--------+------+--------------------+
|is_fraud| count|          proportion|
+--------+------+--------------------+
|       1|  2145|0.003859864427885...|
|       0|553574|  0.9961401355721147|
+--------+------+--------------------+



                                                                                

In [12]:
cols_to_drop = ['portion', 'lat', 'long', 'merch_lat', 'merch_long', 'distance_km', 'state', 'city']
train_sample = train_sample.drop(*cols_to_drop)

### Process categorical columns

In [13]:
from pyspark.ml.feature import VectorAssembler, OneHotEncoder, StringIndexer, StandardScaler

categorical_columns = [col for col, dtype in train_sample.dtypes if dtype == 'string']
categorical_columns

['merchant', 'category', 'gender', 'job']

In [14]:
categorical_index_cols = [f"{col}_index" for col in categorical_columns]
categorical_ohe_cols = [f"{col}_ohe" for col in categorical_columns]

string_indexer = StringIndexer(
    inputCols=categorical_columns,
    outputCols=categorical_index_cols,
    handleInvalid='keep'
)

one_hot_encoder = OneHotEncoder(
    inputCols=categorical_index_cols,
    outputCols=categorical_ohe_cols,
    #handleInvalid='keep'
)


### Process numerical columns

In [15]:
numerical_columns = [col for col in train_sample.columns if col not in categorical_columns and col not in ['is_fraud']]
numerical_columns

['amt',
 'city_pop',
 'age',
 'trans_month',
 'trans_hour',
 'trans_day_of_week',
 'trans_day_of_month',
 'trans_day_of_year']

In [16]:
log_transformed_cols = [f"ln_{col}" for col in numerical_columns]

log_transformer = LogTransformer(
    input_cols=numerical_columns,
    output_cols=log_transformed_cols
)

pairwise_transformer = PairwiseTransformer(inputCols=numerical_columns)

In [17]:
assembler_input_cols = (
    numerical_columns +
    log_transformer.get_output_cols() +
    pairwise_transformer.getOutputCols() +
    categorical_ohe_cols
)

assembler = VectorAssembler(
    inputCols=assembler_input_cols,
    outputCol="final_features",
    handleInvalid='skip'
)

# Pipeline training

In [18]:
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier
from xgboost.spark import SparkXGBClassifier


gbt_classifier = GBTClassifier(featuresCol='final_features',
                               labelCol='is_fraud',
                               seed=0)

xgboost =  SparkXGBClassifier(features_col='final_features',
                              label_col='is_fraud',
                              num_workers=len(gpu_devices) if gpu_devices else os.cpu_count(),
                              device='cuda' if gpu_devices else 'cpu',
                              tree_method='hist',
                              verbosity=0,
                              seed=0)

models = {'XGBoost': xgboost,
          'GBTClassifier': gbt_classifier,
          }

In [19]:
from pyspark.ml import Pipeline

trained_models = dict()
train_sample = train_sample.cache()
for model_name, model in tqdm(models.items(), desc='Training models'):

    pl = Pipeline(stages=[
        string_indexer,
        one_hot_encoder,
        log_transformer,
        pairwise_transformer,
        assembler,
        model])

    trained_models[model_name] = pl.fit(train_sample)

[15:22:17] Task 1 got rank 1                                        (0 + 4) / 4]
[15:22:17] Task 2 got rank 2
[15:22:17] Task 0 got rank 0
[15:22:17] Task 3 got rank 3
Training models: 100%|██████████| 2/2 [01:59<00:00, 59.54s/it]                  


In [20]:
models_results = generate_model_results(trained_models, test_data)

                                                                                


Area under ROC curve for GBTClassifier: 0.8996


Classification report for GBTClassifier: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    535609
           1       0.60      0.80      0.69      1957

    accuracy                           1.00    537566
   macro avg       0.80      0.90      0.84    537566
weighted avg       1.00      1.00      1.00    537566



Generating model results:  50%|█████     | 1/2 [01:12<01:12, 72.79s/it]


Area under ROC curve for XGBoost: 0.9235


Classification report for XGBoost: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    535609
           1       0.63      0.85      0.73      1957

    accuracy                           1.00    537566
   macro avg       0.82      0.92      0.86    537566
weighted avg       1.00      1.00      1.00    537566



Generating model results: 100%|██████████| 2/2 [01:12<00:00, 36.40s/it]


In [21]:
final_model_name = 'GBTClassifier'
final_model = models[final_model_name]

# Optimizing

In [22]:
evaluator = MulticlassClassificationEvaluator(labelCol='is_fraud', metricName='f1')

gbt_param_dict = {
    'maxDepth': [5, 10],                 # Reduced range to balance tree depth.
    'maxBins': [32],                     # Retained the standard value for bins.
    'stepSize': [0.05, 0.1],             # Reduced learning rates to test fewer options.
    'maxIter': [50],                     # Limiting to 50 iterations (smaller number of trees).
    'subsamplingRate': [0.7, 1.0],       # Fraction of data used for each tree (narrowed range).
    'minInstancesPerNode': [5],          # Retained minimum instances per node.
    'minInfoGain': [0.01]                # Retained minimum gain required for split.
}


pipeline = Pipeline(stages=[
                    string_indexer,
                    one_hot_encoder,
                    log_transformer,
                    pairwise_transformer,
                    assembler,
                    final_model
    ])

In [23]:
from src.genetic_algorithm import GeneticAlgorithm

genetic_algorithm = GeneticAlgorithm(pipeline=pipeline,
                                     evaluator=evaluator,
                                     train_data=train_sample,
                                     parameters_ranges=gbt_param_dict if final_model_name == 'GBTClassifier' else xgb_param_dict,
                                     size_of_population=6,
                                     time_limit=30,
                                     score_limit=1,
                                     num_folds=3)

In [24]:
best_solution = genetic_algorithm.run()

2024-12-18 15:25:05,222 - INFO - Starting Genetic Algorithm...
2024-12-18 15:25:05,225 - INFO - Generation 0 - Evaluating population...
2024-12-18 15:34:01,341 - INFO - Closing down clientserver connection           
2024-12-18 15:34:01,342 - INFO - Closing down clientserver connection
2024-12-18 15:34:01,939 - INFO - Closing down clientserver connection
2024-12-18 15:34:01,940 - INFO - Closing down clientserver connection
2024-12-18 15:34:05,649 - INFO - Closing down clientserver connection           
2024-12-18 15:34:05,651 - INFO - Closing down clientserver connection
2024-12-18 15:34:11,984 - INFO - Closing down clientserver connection           
2024-12-18 15:34:11,985 - INFO - Closing down clientserver connection
2024-12-18 15:39:34,785 - INFO - Closing down clientserver connection           
2024-12-18 15:39:34,792 - INFO - Closing down clientserver connection
2024-12-18 15:39:49,445 - INFO - Closing down clientserver connection           
2024-12-18 15:39:49,448 - INFO - Closin

# Evaluation final model and saving

In [29]:
best_solution

{'maxDepth': 10,
 'maxBins': 32,
 'stepSize': 0.05,
 'maxIter': 50,
 'subsamplingRate': 0.7,
 'minInstancesPerNode': 5,
 'minInfoGain': 0.01}

In [25]:
classifier = models['GBTClassifier']
classifier.setParams(**best_solution)

pipeline = Pipeline(stages=[string_indexer,
                      one_hot_encoder,
                      log_transformer,
                      pairwise_transformer,
                      assembler,
                      classifier])

In [35]:
param_grid = ParamGridBuilder().build() 

cv_validator = CrossValidator(
    estimator=pipeline,
    evaluator=None,  # We'll set the evaluator dynamically in the loop
    numFolds=3,
    estimatorParamMaps=param_grid
)

print("Performing 3 Folds Cross-Validation on train sample:\n")

# Loop through each metric and evaluate
for metric_name in tqdm(["f1", 'weightedRecall', 'weightedPrecision', 'accuracy', 'precisionByLabel', 'recallByLabel', 'fMeasureByLabel']):
    start_time = time.time()  
    
    evaluator = MulticlassClassificationEvaluator(labelCol='is_fraud', metricName=metric_name)
    
    # Set the evaluator for cross-validation
    cv_validator.setEvaluator(evaluator)
    
    # Fit the model and evaluate
    cv_model = cv_validator.fit(train_sample)
    
    # Get the score from the best model
    score = evaluator.evaluate(cv_model.bestModel.transform(train_sample))
    
    elapsed_time = time.time() - start_time  # Time taken to evaluate the metric
    print(f"Metric: {metric_name:<20} | Score: {score:.4f} | Time: {elapsed_time:.2f}s")

Performing 3 Folds Cross-Validation on train sample:



2024-12-18 16:56:39,411 - INFO - Closing down clientserver connection           
2024-12-18 16:56:39,414 - INFO - Closing down clientserver connection

Metric: f1                   | Score: 0.9841 | Time: 240.25s


2024-12-18 17:00:52,175 - INFO - Closing down clientserver connection           
2024-12-18 17:00:52,179 - INFO - Closing down clientserver connection

Metric: weightedRecall       | Score: 0.9851 | Time: 252.83s


2024-12-18 17:04:59,347 - INFO - Closing down clientserver connection           
2024-12-18 17:04:59,347 - INFO - Closing down clientserver connection

Metric: weightedPrecision    | Score: 0.9851 | Time: 247.42s


2024-12-18 17:09:12,741 - INFO - Closing down clientserver connection           
2024-12-18 17:09:12,745 - INFO - Closing down clientserver connection

Metric: accuracy             | Score: 0.9851 | Time: 253.67s


2024-12-18 17:13:33,559 - INFO - Closing down clientserver connection           
2024-12-18 17:13:33,560 - INFO - Closing down clientserver connection

Metric: precisionByLabel     | Score: 0.9852 | Time: 260.38s


2024-12-18 17:17:46,031 - INFO - Closing down clientserver connection           
2024-12-18 17:17:46,034 - INFO - Closing down clientserver connection

Metric: recallByLabel        | Score: 0.9993 | Time: 252.85s


2024-12-18 17:22:00,433 - INFO - Closing down clientserver connection           
2024-12-18 17:22:00,437 - INFO - Closing down clientserver connection

Metric: fMeasureByLabel      | Score: 0.9922 | Time: 253.96s


100%|██████████| 7/7 [29:21<00:00, 251.63s/it]                                  


In [36]:
trained_model = pl.fit(train_sample)
train_results = evaluate_model(trained_model, train_sample, '')

print(f"""Classication report for final model on train data:\n
        {train_results['classification_report']}""")
print(f"AUC for final model on train data: {train_results['roc']}")

                                                                                

Classication report for final model on train data:

                      precision    recall  f1-score   support

           0       0.99      1.00      0.99    124575
           1       0.98      0.73      0.84      6871

    accuracy                           0.99    131446
   macro avg       0.98      0.86      0.91    131446
weighted avg       0.99      0.99      0.98    131446

AUC for final model on train data: 0.8637899482604121


In [38]:
test_results = evaluate_model(trained_model, test_data, '')

print(f"Classication report for final model on test data:\n{test_results['classification_report']}")
print(f"AUC for final model on test data: {test_results['roc']}")

                                                                                

Classication report for final model on test data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    535609
           1       0.74      0.71      0.72      1957

    accuracy                           1.00    537566
   macro avg       0.87      0.86      0.86    537566
weighted avg       1.00      1.00      1.00    537566

AUC for final model on test data: 0.8551815061806164


In [44]:
trained_model.write().overwrite().save('saved_models/GBTClassifier')