# Model Payment Failures at TrueLayer

## 1. Setting up the environment, importing libraries and loading the data

In [5]:
import tpot
from helper import get_env_variable


import pandas as pd
import logging
from dotenv import load_dotenv
import os

# Load variables defined in a .env file into the process environment.
load_dotenv()

# DATA_PATH must be set (typically via .env); a missing variable raises
# KeyError here, which is preferable to silently reading the wrong file.
path = os.environ['DATA_PATH']

# Configure logging
logging.basicConfig(level=logging.INFO)

# Read the payments CSV. Failures are logged AND re-raised: swallowing them
# would leave `truelayer_data` undefined and make every later cell fail with
# an unrelated-looking NameError.
try:
    truelayer_data = pd.read_csv(path)
except FileNotFoundError:
    logging.error("The specified CSV file was not found.")
    raise
except Exception as e:
    logging.error(f"An error occurred: {e}")
    raise
else:
    logging.info("Data loaded successfully.")



INFO:root:Data loaded successfully.


In [6]:
# Preview the first five rows of the loaded payments frame.
truelayer_data.head(n=5)

Unnamed: 0,id,bank_id,currency,status,api_version,failure_reason,failure_stage,customer_id,vertical,connectivity_type,amount_in_currency,country_id,createdat_ts,lastupdatedat_ts,initiated_at,executed_at,failed_at,authorizing_at,authorized_at,settled_at
0,3c6646ef17a52ac2e35fdb6a15aa44db6d85985b,b5ab8f51d35b64af079485e5bbbc335213f3a0ae,CAD,Executed,v3,,,365a3d703f257f52adde00af51c7f4897a6f9c6a,vertical 2,type 8,28.0,9e2b06736b477b7a924f60de14a7e329d82d6f4f,2010-02-10 06:00:06.452249,2010-02-10 06:00:14.766862,2010-02-10 06:00:06.452249,2010-02-10 06:00:14.766862,,2010-02-10 06:00:06.452249,2010-02-10 06:00:06.452249,
1,6c3e58befae5a4880c62c54bdffe7243ab7b66cc,34a0061ba48d1c2810cd930dd69d42482ca92d85,CAD,Executed,v3,,,365a3d703f257f52adde00af51c7f4897a6f9c6a,vertical 2,type 8,18.0,9e2b06736b477b7a924f60de14a7e329d82d6f4f,2010-01-12 06:00:17.890703,2010-01-12 06:00:22.517782,2010-01-12 06:00:17.890703,2010-01-12 06:00:22.517782,,2010-01-12 06:00:17.890703,2010-01-12 06:00:17.890703,
2,b2ee452081f287aa12f0efbd05c278edd332def9,660054a4565377c6e43ff7709abf56d8494ba604,CAD,Executed,v3,,,365a3d703f257f52adde00af51c7f4897a6f9c6a,vertical 2,type 8,28.5,9e2b06736b477b7a924f60de14a7e329d82d6f4f,2010-02-10 07:20:57.816799,2010-02-10 07:21:04.284348,2010-02-10 07:20:57.816799,2010-02-10 07:21:04.284348,,2010-02-10 07:20:57.816799,2010-02-10 07:20:57.816799,
3,5aa68c655e0ca1e57bcf4a70c269151811e3ee77,660054a4565377c6e43ff7709abf56d8494ba604,CAD,Executed,v3,,,365a3d703f257f52adde00af51c7f4897a6f9c6a,vertical 2,type 8,16.0,9e2b06736b477b7a924f60de14a7e329d82d6f4f,2010-02-10 06:00:20.172570,2010-02-10 06:00:25.549863,2010-02-10 06:00:20.172570,2010-02-10 06:00:25.549863,,2010-02-10 06:00:20.172570,2010-02-10 06:00:20.172570,
4,1d72b60adefab1066d7421d72a2c3abb3d558544,0090e57640ed78b0c16ac4606a6773769545bb17,CAD,Executed,v3,,,64c370a753b1db282770cdbf2aba5434a3185b3d,vertical 2,type 8,5.0,9e2b06736b477b7a924f60de14a7e329d82d6f4f,2009-08-14 13:42:02.939891,2009-08-14 13:42:54.515245,2009-08-14 13:42:09.978416,2009-08-14 13:42:54.515245,,2009-08-14 13:42:03.699860,2009-08-14 13:42:52.858713,2009-08-14 13:42:54.515245


In [7]:
# Print column names, non-null counts and dtypes before any type conversion.
truelayer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 20 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  500000 non-null  object 
 1   bank_id             500000 non-null  object 
 2   currency            500000 non-null  object 
 3   status              500000 non-null  object 
 4   api_version         500000 non-null  object 
 5   failure_reason      44628 non-null   object 
 6   failure_stage       44628 non-null   object 
 7   customer_id         500000 non-null  object 
 8   vertical            499988 non-null  object 
 9   connectivity_type   483695 non-null  object 
 10  amount_in_currency  500000 non-null  float64
 11  country_id          500000 non-null  object 
 12  createdat_ts        500000 non-null  object 
 13  lastupdatedat_ts    500000 non-null  object 
 14  initiated_at        237284 non-null  object 
 15  executed_at         377876 non-nul

In [8]:
def create_time_features(dataframe, time_columns=('initiated_at', 'failed_at')):
    """
    Convert timestamp columns of ``dataframe`` to pandas datetime dtype, in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify. The converted columns overwrite the originals on
        this same object; no copy is made.
    time_columns : str or iterable of str, optional
        Names of the columns to convert. Defaults to
        ``('initiated_at', 'failed_at')``. Names that are not present in
        ``dataframe`` are skipped silently.

    Returns
    -------
    None
        The result is the mutated ``dataframe``; a ``dataframe.info()``
        summary is printed so the dtype change is visible in the cell output.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(time_columns, str):
        time_columns = [time_columns]

    for time_col in time_columns:
        if time_col in dataframe.columns:
            dataframe[time_col] = pd.to_datetime(dataframe[time_col])

    dataframe.info()

# Convert the timestamp columns of the loaded payments frame in place.
create_time_features(dataframe=truelayer_data)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 20 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   id                  500000 non-null  object        
 1   bank_id             500000 non-null  object        
 2   currency            500000 non-null  object        
 3   status              500000 non-null  object        
 4   api_version         500000 non-null  object        
 5   failure_reason      44628 non-null   object        
 6   failure_stage       44628 non-null   object        
 7   customer_id         500000 non-null  object        
 8   vertical            499988 non-null  object        
 9   connectivity_type   483695 non-null  object        
 10  amount_in_currency  500000 non-null  float64       
 11  country_id          500000 non-null  object        
 12  createdat_ts        500000 non-null  object        
 13  lastupdatedat_ts    500000 no

## 2. Exploratory Data Analysis

In [9]:
def calc_failed_transac_per(dataframe=None):
    """
    Report the share of transactions that failed, as a formatted string.

    A transaction counts as failed when its ``failed_at`` value is non-null.
    Both the numerator and the denominator count distinct ``id`` values, so
    repeated rows for the same transaction are counted once.

    Parameters
    ----------
    dataframe : pandas.DataFrame, optional
        Frame with ``id`` and ``failed_at`` columns. When omitted, the
        notebook-level ``truelayer_data`` frame is used.

    Returns
    -------
    str
        Text of the form ``"8.93% failed transactions"``. An empty frame
        yields ``"0.00% failed transactions"`` instead of dividing by zero.
    """
    if dataframe is None:
        dataframe = truelayer_data

    # dropna=False keeps the count identical to len(series.unique()).
    count_total_transaction = dataframe['id'].nunique(dropna=False)
    if count_total_transaction == 0:
        return "0.00% failed transactions"

    failed_ids = dataframe.loc[dataframe['failed_at'].notna(), 'id']
    count_failed = failed_ids.nunique(dropna=False)

    perc = (count_failed / count_total_transaction) * 100
    return f"{perc:.2f}% failed transactions"

# Show the percentage of distinct transaction ids that have a non-null failed_at.
calc_failed_transac_per()

'8.93% failed transactions'

In [10]:
# Keep only the transactions that carry a failure timestamp.
has_failed = truelayer_data['failed_at'].notna()
failed_df = truelayer_data[has_failed]

# Total failed amount per currency, with currency kept as a regular column.
failed_df.groupby('currency', as_index=False)['amount_in_currency'].sum()

Unnamed: 0,currency,amount_in_currency
0,CAD,7646246.34
1,USD,1444682.24


## 3. Feature Engineering

## 4. Model Development (AutoML)


## 5. Model Evaluation