In [1]:
import pandas as pd
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

%load_ext autoreload
%autoreload

from lib.mutation_util import get_tx_datetime, is_night, is_weekend, get_card_spending_behaviour_features, \
    get_count_risk_rolling_window, get_diff_tx_time, is_diff_previous

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [3]:
df_ori = pd.read_csv('../adyen-dataset.csv')
df_baseline_features = df_ori.copy()[
    ["psp_reference", "eur_amount", "card_number", "email_address", "ip_address", "ip_country","zip_code","has_fraudulent_dispute"]]
df_baseline_features["tx_datetime"] = df_ori.parallel_apply(get_tx_datetime, axis=1)  # for each row of df
df_baseline_features["tx_datetime"] = pd.to_datetime(df_baseline_features["tx_datetime"])
df_baseline_features.head()

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=23117), Label(value='0 / 23117')))…

Unnamed: 0,psp_reference,eur_amount,card_number,email_address,ip_address,ip_country,zip_code,has_fraudulent_dispute,tx_datetime
0,79622457320,18.75,XldVNN9TQgIA2RE9FTC2TA,3XBRYto3QgX7g4pX-BvNyA,i1oY1kTOXMlsFfoyhFJxyA,GR,XDED,False,2021-07-25 13:51:00
1,41293916020,43.65,zgRoL8pxX5LJI-SEPUTMvg,klwgzdfrnz_6eGsHSi2prQ,HZkxQvtpScNLgi0fsoSkjA,BR,AAD,False,2021-12-11 15:20:00
2,31382555255,187.72,T3BF1uOOqgcwaVRb_ujPQg,XkMWeLLmXALPcV3ndwzKJw,HGW5EwBnHVgOrD5xYplBdw,NL,1104,False,2021-10-20 01:56:00
3,13944051314,109.16,BMf1zFkJ1xvPMlOxC7RWTw,26Lw20NQtPKb5cyQZBuh1w,8AuFEjK8m-0lxGAOJzOPmg,NL,2039,False,2021-08-30 12:50:00
4,62343575411,46.27,jfxS2TIe0gR9vpOUdMKrBw,Y7tv2i85iUadkEIAPIufgg,,BR,XOL,False,2021-11-14 19:26:00
...,...,...,...,...,...,...,...,...,...
138696,58957356926,35.13,DJHwui3GH60rBpx_tAOZZw,ReVzz-e9w8mNO63YA1cjFA,947PJ1zh6yFwZxGOYG8Lnw,BR,BZD,True,2021-11-08 10:06:00
138697,44780141989,411.11,VnFgfzDi69Hsw2mpWvJuIA,Acshv7Bba4JnO0gwpHu5bw,,NL,2039,False,2021-07-17 10:42:00
138698,34619316012,29.03,iDlX7WaccSDt1GpQNj5JBQ,RSYkLbETJb2V9f705zmCkQ,-mT_MQEgvNfb5RkbW0oejw,BR,AAD,False,2021-10-03 23:48:00
138699,68700535373,273.14,kFgBFY0u8l72rwwHj7EBtg,Hr4m7amS_osGvEy8O6qVQQ,13Jg3lRGPEoM3c_vLM-SVQ,NL,1104,True,2021-11-03 21:23:00


# Date and time transformations

In [3]:
df_baseline_features["is_night"] = df_baseline_features.tx_datetime.parallel_apply(
    is_night)  # series.apply for each cell
df_baseline_features["is_weekend"] = df_baseline_features.tx_datetime.parallel_apply(is_weekend)
df_baseline_features.head()

Unnamed: 0,psp_reference,eur_amount,card_number,email_address,ip_address,has_fraudulent_dispute,tx_datetime,is_night,is_weekend
0,79622457320,18.75,XldVNN9TQgIA2RE9FTC2TA,3XBRYto3QgX7g4pX-BvNyA,i1oY1kTOXMlsFfoyhFJxyA,False,2021-07-25 13:51:00,0,1
1,41293916020,43.65,zgRoL8pxX5LJI-SEPUTMvg,klwgzdfrnz_6eGsHSi2prQ,HZkxQvtpScNLgi0fsoSkjA,False,2021-12-11 15:20:00,0,1
2,31382555255,187.72,T3BF1uOOqgcwaVRb_ujPQg,XkMWeLLmXALPcV3ndwzKJw,HGW5EwBnHVgOrD5xYplBdw,False,2021-10-20 01:56:00,1,0
3,13944051314,109.16,BMf1zFkJ1xvPMlOxC7RWTw,26Lw20NQtPKb5cyQZBuh1w,8AuFEjK8m-0lxGAOJzOPmg,False,2021-08-30 12:50:00,0,0
4,62343575411,46.27,jfxS2TIe0gR9vpOUdMKrBw,Y7tv2i85iUadkEIAPIufgg,,False,2021-11-14 19:26:00,0,1
...,...,...,...,...,...,...,...,...,...
138696,58957356926,35.13,DJHwui3GH60rBpx_tAOZZw,ReVzz-e9w8mNO63YA1cjFA,947PJ1zh6yFwZxGOYG8Lnw,True,2021-11-08 10:06:00,0,0
138697,44780141989,411.11,VnFgfzDi69Hsw2mpWvJuIA,Acshv7Bba4JnO0gwpHu5bw,,False,2021-07-17 10:42:00,0,1
138698,34619316012,29.03,iDlX7WaccSDt1GpQNj5JBQ,RSYkLbETJb2V9f705zmCkQ,-mT_MQEgvNfb5RkbW0oejw,False,2021-10-03 23:48:00,0,1
138699,68700535373,273.14,kFgBFY0u8l72rwwHj7EBtg,Hr4m7amS_osGvEy8O6qVQQ,13Jg3lRGPEoM3c_vLM-SVQ,True,2021-11-03 21:23:00,0,0


# Card Spending Behavior transformations

In [4]:
df_baseline_features = df_baseline_features.groupby('card_number').parallel_apply(
    lambda x: get_card_spending_behaviour_features(x, windows_size_in_days=[1, 7, 30]))
df_baseline_features = df_baseline_features.sort_values('tx_datetime').reset_index(drop=True)
df_baseline_features.head()

Unnamed: 0,psp_reference,eur_amount,card_number,email_address,ip_address,has_fraudulent_dispute,tx_datetime,is_night,is_weekend,card_nb_tx_1day_window,card_avg_amount_1day_window,card_nb_tx_7day_window,card_avg_amount_7day_window,card_nb_tx_30day_window,card_avg_amount_30day_window
0,74588836273,61.84,qO276RrcCHXnFdJYhsrKog,-O3ZNM2PhNWiMwxhfnn3Ew,,True,2021-01-01 00:17:00,1,0,1.0,61.84,1.0,61.840,1.0,61.840000
1,84312413437,20.80,GReZubXFTrofD4bIfDwOEA,,,False,2021-01-01 00:19:00,1,0,1.0,20.80,1.0,20.800,1.0,20.800000
2,15700686694,95.55,qarDDPsTDtdnKDFIX0lJ5A,w-YhoWIptK37VLEmfjXkUg,f3ocVUgTPGxVrlhpOLYGaQ,False,2021-01-01 00:31:00,1,0,1.0,95.55,1.0,95.550,1.0,95.550000
3,78860525616,16.38,_4qQSOFLt55qtiLpZnNbBA,j_Bl945hKPB8YvX0hbCQRA,,False,2021-01-01 00:42:00,1,0,1.0,16.38,1.0,16.380,1.0,16.380000
4,76831611562,17.83,W146roVdfL8V_d0W9J7jjA,K5-QrrBlpajXvDsYlMwiLw,pCkS1NlcwmwqcTg06EU6Xg,False,2021-01-01 00:42:00,1,0,1.0,17.83,1.0,17.830,1.0,17.830000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138696,19327282133,70.49,ZfF19IrW1HsZEActM9JowA,K54reyciwAHTmRarwrFXwQ,OYMkykI87UOII-ehpIe0sQ,False,2021-12-31 23:35:00,0,0,1.0,70.49,1.0,70.490,3.0,53.793333
138697,79652690861,9.20,GDI4Pqa-JNIiuDBOF94YOg,CEM3iBnsT-LAV-hdSjMSOg,UAbPUgtXb4DSXHPyoS5ZtQ,False,2021-12-31 23:43:00,0,0,1.0,9.20,1.0,9.200,1.0,9.200000
138698,68928267563,60.09,Td4kouJVr80IaerEZnNfxQ,,mUnMby0agnsKOEkVJ2RKuA,False,2021-12-31 23:46:00,0,0,1.0,60.09,1.0,60.090,2.0,55.445000
138699,21678354521,184.19,oy04-k1-hx9252bHD0Rc5A,MbYS654CuaWyjUJEmIvoXg,sKW6MhXSD4kq1TdOgHWmGA,True,2021-12-31 23:48:00,0,0,1.0,184.19,5.0,145.798,18.0,94.767222


# Email & IP transformations

In [5]:
df_baseline_features = df_baseline_features.groupby('email_address', dropna=False).parallel_apply(
    lambda x: get_count_risk_rolling_window(x, feature="email_address", delay_period=7,
                                            windows_size_in_days=[1, 7, 30]))
df_baseline_features = df_baseline_features.sort_values('tx_datetime').reset_index(drop=True)
df_baseline_features.head()

Unnamed: 0,psp_reference,eur_amount,card_number,email_address,ip_address,has_fraudulent_dispute,tx_datetime,is_night,is_weekend,card_nb_tx_1day_window,...,card_nb_tx_7day_window,card_avg_amount_7day_window,card_nb_tx_30day_window,card_avg_amount_30day_window,email_address_nb_tx_1day_window,email_address_risk_1day_window,email_address_nb_tx_7day_window,email_address_risk_7day_window,email_address_nb_tx_30day_window,email_address_risk_30day_window
0,74588836273,61.84,qO276RrcCHXnFdJYhsrKog,-O3ZNM2PhNWiMwxhfnn3Ew,,True,2021-01-01 00:17:00,1,0,1.0,...,1.0,61.840,1.0,61.840000,0.0,0.0,0.0,0.000000,0.0,0.000000
1,84312413437,20.80,GReZubXFTrofD4bIfDwOEA,,,False,2021-01-01 00:19:00,1,0,1.0,...,1.0,20.800,1.0,20.800000,0.0,0.0,0.0,0.000000,0.0,0.000000
2,15700686694,95.55,qarDDPsTDtdnKDFIX0lJ5A,w-YhoWIptK37VLEmfjXkUg,f3ocVUgTPGxVrlhpOLYGaQ,False,2021-01-01 00:31:00,1,0,1.0,...,1.0,95.550,1.0,95.550000,0.0,0.0,0.0,0.000000,0.0,0.000000
3,78860525616,16.38,_4qQSOFLt55qtiLpZnNbBA,j_Bl945hKPB8YvX0hbCQRA,,False,2021-01-01 00:42:00,1,0,1.0,...,1.0,16.380,1.0,16.380000,0.0,0.0,0.0,0.000000,0.0,0.000000
4,76831611562,17.83,W146roVdfL8V_d0W9J7jjA,K5-QrrBlpajXvDsYlMwiLw,pCkS1NlcwmwqcTg06EU6Xg,False,2021-01-01 00:42:00,1,0,1.0,...,1.0,17.830,1.0,17.830000,0.0,0.0,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138696,19327282133,70.49,ZfF19IrW1HsZEActM9JowA,K54reyciwAHTmRarwrFXwQ,OYMkykI87UOII-ehpIe0sQ,False,2021-12-31 23:35:00,0,0,1.0,...,1.0,70.490,3.0,53.793333,0.0,0.0,1.0,0.000000,1.0,0.000000
138697,79652690861,9.20,GDI4Pqa-JNIiuDBOF94YOg,CEM3iBnsT-LAV-hdSjMSOg,UAbPUgtXb4DSXHPyoS5ZtQ,False,2021-12-31 23:43:00,0,0,1.0,...,1.0,9.200,1.0,9.200000,0.0,0.0,0.0,0.000000,0.0,0.000000
138698,68928267563,60.09,Td4kouJVr80IaerEZnNfxQ,,mUnMby0agnsKOEkVJ2RKuA,False,2021-12-31 23:46:00,0,0,1.0,...,1.0,60.090,2.0,55.445000,0.0,0.0,0.0,0.000000,0.0,0.000000
138699,21678354521,184.19,oy04-k1-hx9252bHD0Rc5A,MbYS654CuaWyjUJEmIvoXg,sKW6MhXSD4kq1TdOgHWmGA,True,2021-12-31 23:48:00,0,0,1.0,...,5.0,145.798,18.0,94.767222,0.0,0.0,3.0,0.333333,13.0,0.461538


In [6]:
df_baseline_features = df_baseline_features.groupby('ip_address', dropna=False).parallel_apply(
    lambda x: get_count_risk_rolling_window(x, feature="ip_address", delay_period=7, windows_size_in_days=[1, 7, 30]))
df_baseline_features = df_baseline_features.sort_values('tx_datetime').reset_index(drop=True)
df_baseline_features.head()

Unnamed: 0,psp_reference,eur_amount,card_number,email_address,ip_address,has_fraudulent_dispute,tx_datetime,is_night,is_weekend,card_nb_tx_1day_window,...,email_address_nb_tx_7day_window,email_address_risk_7day_window,email_address_nb_tx_30day_window,email_address_risk_30day_window,ip_address_nb_tx_1day_window,ip_address_risk_1day_window,ip_address_nb_tx_7day_window,ip_address_risk_7day_window,ip_address_nb_tx_30day_window,ip_address_risk_30day_window
0,74588836273,61.84,qO276RrcCHXnFdJYhsrKog,-O3ZNM2PhNWiMwxhfnn3Ew,,True,2021-01-01 00:17:00,1,0,1.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
1,84312413437,20.80,GReZubXFTrofD4bIfDwOEA,,,False,2021-01-01 00:19:00,1,0,1.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
2,15700686694,95.55,qarDDPsTDtdnKDFIX0lJ5A,w-YhoWIptK37VLEmfjXkUg,f3ocVUgTPGxVrlhpOLYGaQ,False,2021-01-01 00:31:00,1,0,1.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
3,76831611562,17.83,W146roVdfL8V_d0W9J7jjA,K5-QrrBlpajXvDsYlMwiLw,pCkS1NlcwmwqcTg06EU6Xg,False,2021-01-01 00:42:00,1,0,1.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
4,78860525616,16.38,_4qQSOFLt55qtiLpZnNbBA,j_Bl945hKPB8YvX0hbCQRA,,False,2021-01-01 00:42:00,1,0,1.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138696,19327282133,70.49,ZfF19IrW1HsZEActM9JowA,K54reyciwAHTmRarwrFXwQ,OYMkykI87UOII-ehpIe0sQ,False,2021-12-31 23:35:00,0,0,1.0,...,1.0,0.000000,1.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
138697,79652690861,9.20,GDI4Pqa-JNIiuDBOF94YOg,CEM3iBnsT-LAV-hdSjMSOg,UAbPUgtXb4DSXHPyoS5ZtQ,False,2021-12-31 23:43:00,0,0,1.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000
138698,68928267563,60.09,Td4kouJVr80IaerEZnNfxQ,,mUnMby0agnsKOEkVJ2RKuA,False,2021-12-31 23:46:00,0,0,1.0,...,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,2.0,0.000000
138699,21678354521,184.19,oy04-k1-hx9252bHD0Rc5A,MbYS654CuaWyjUJEmIvoXg,sKW6MhXSD4kq1TdOgHWmGA,True,2021-12-31 23:48:00,0,0,1.0,...,3.0,0.333333,13.0,0.461538,0.0,0.0,2.0,0.0,11.0,0.454545


# Card Transaction Time Difference

In [15]:
df_baseline_features = df_baseline_features.groupby('card_number').parallel_apply(lambda x: get_diff_tx_time(x))
df_baseline_features = df_baseline_features.sort_values('tx_datetime').reset_index(drop=True)
df_baseline_features.head()

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=8830), Label(value='0 / 8830'))), …

KeyboardInterrupt: 

# Card Transaction Country Changed

In [4]:
df_baseline_features = df_baseline_features.groupby('card_number').parallel_apply(lambda x: is_diff_previous(x, feature="ip_country"))
df_baseline_features = df_baseline_features.sort_values('tx_datetime').reset_index(drop=True)
df_baseline_features.head()

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=8830), Label(value='0 / 8830'))), …

Unnamed: 0,psp_reference,eur_amount,card_number,email_address,ip_address,ip_country,zip_code,has_fraudulent_dispute,tx_datetime,is_diff_previous_ip_country
0,74588836273,61.84,qO276RrcCHXnFdJYhsrKog,-O3ZNM2PhNWiMwxhfnn3Ew,,GR,DFFF,True,2021-01-01 00:17:00,True
1,84312413437,20.80,GReZubXFTrofD4bIfDwOEA,,,ZW,DB,False,2021-01-01 00:19:00,True
2,15700686694,95.55,qarDDPsTDtdnKDFIX0lJ5A,w-YhoWIptK37VLEmfjXkUg,f3ocVUgTPGxVrlhpOLYGaQ,BR,AAD,False,2021-01-01 00:31:00,True
3,78860525616,16.38,_4qQSOFLt55qtiLpZnNbBA,j_Bl945hKPB8YvX0hbCQRA,,NL,1104,False,2021-01-01 00:42:00,True
4,76831611562,17.83,W146roVdfL8V_d0W9J7jjA,K5-QrrBlpajXvDsYlMwiLw,pCkS1NlcwmwqcTg06EU6Xg,NL,3941,False,2021-01-01 00:42:00,True
...,...,...,...,...,...,...,...,...,...,...
138696,19327282133,70.49,ZfF19IrW1HsZEActM9JowA,K54reyciwAHTmRarwrFXwQ,OYMkykI87UOII-ehpIe0sQ,BR,FFR,False,2021-12-31 23:35:00,True
138697,79652690861,9.20,GDI4Pqa-JNIiuDBOF94YOg,CEM3iBnsT-LAV-hdSjMSOg,UAbPUgtXb4DSXHPyoS5ZtQ,ZW,ZB,False,2021-12-31 23:43:00,True
138698,68928267563,60.09,Td4kouJVr80IaerEZnNfxQ,,mUnMby0agnsKOEkVJ2RKuA,GR,XDED,False,2021-12-31 23:46:00,True
138699,21678354521,184.19,oy04-k1-hx9252bHD0Rc5A,MbYS654CuaWyjUJEmIvoXg,sKW6MhXSD4kq1TdOgHWmGA,BR,BZD,True,2021-12-31 23:48:00,True


# Card Transaction ZIP CODE Changed

In [73]:
df_baseline_features = df_baseline_features.groupby('card_number').parallel_apply(lambda x: is_diff_previous(x, feature="zip_code"))
df_baseline_features = df_baseline_features.sort_values('tx_datetime').reset_index(drop=True)
df_baseline_features.head()

Unnamed: 0_level_0,psp_reference,eur_amount,card_number,email_address,ip_address,ip_country,zip_code,has_fraudulent_dispute,tx_datetime,is_diff_previous_zip_code
psp_reference,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
82736246201,82736246201,330.31,Td4kouJVr80IaerEZnNfxQ,grR51Cx6seNGJtis7Dol0Q,mUnMby0agnsKOEkVJ2RKuA,BR,AAD,False,2021-11-23 12:38:00,False
68388747635,68388747635,50.8,Td4kouJVr80IaerEZnNfxQ,grR51Cx6seNGJtis7Dol0Q,mUnMby0agnsKOEkVJ2RKuA,ZW,ZB,False,2021-12-06 21:16:00,True
68928267563,68928267563,60.09,Td4kouJVr80IaerEZnNfxQ,,mUnMby0agnsKOEkVJ2RKuA,GR,XDED,False,2021-12-31 23:46:00,True


# Output Features

In [8]:
df_baseline_features.to_csv("baseline_features.csv", index=False)