In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [2]:
df = pd.read_csv("dataset-mini.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 545804 entries, 0 to 545803
Data columns (total 28 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Unnamed: 0               545804 non-null  int64  
 1   transaction_id           545804 non-null  uint64 
 2   is_fraud                 545804 non-null  int64  
 3   created_at               545804 non-null  object 
 4   is_subscription          545804 non-null  bool   
 5   transaction_type         545804 non-null  object 
 6   currency_amount          545804 non-null  float64
 7   currency_id              545804 non-null  int64  
 8   amount_scaled            545804 non-null  int64  
 9   merchant_customer_id     545804 non-null  object 
 10  merchant_customer_email  545804 non-null  object 
 11  merchant_country         545804 non-null  object 
 12  merchant_language        545804 non-null  object 
 13  ip_address               545804 non-null  object 
 14  plat

## Dropping unrelated data and duplicates

In [4]:
df.drop(columns=["Unnamed: 0","is_fraud", "currency_amount", "transaction_id", "merchant_id", "merchant_shop_id", "currency_id", "merchant_language"], inplace=True)

In [30]:
df.drop_duplicates(inplace=True)

In [31]:
df_cleaned = df.dropna(axis=1, thresh=0.28*len(df))

In [32]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 545789 entries, 0 to 545803
Data columns (total 20 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   created_at               545789 non-null  object 
 1   is_subscription          545789 non-null  bool   
 2   transaction_type         545789 non-null  object 
 3   amount_scaled            545789 non-null  int64  
 4   merchant_customer_id     545789 non-null  object 
 5   merchant_customer_email  545789 non-null  object 
 6   merchant_country         545789 non-null  object 
 7   ip_address               545789 non-null  object 
 8   platform                 545789 non-null  object 
 9   merchant_shop_name       545789 non-null  object 
 10  is_secured               545789 non-null  bool   
 11  ip_country               545789 non-null  object 
 12  payment_type             545789 non-null  object 
 13  card_id                  545789 non-null  object 
 14  bank     

In [33]:
def filter_outliers_iqr(df, factor):

    conditions = []
    total_outliers = 0
    for col in df.select_dtypes(include=['number']).columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - factor * IQR
        upper_bound = Q3 + factor * IQR

        num_outliers = ((df[col] < lower_bound) | (df[col] > upper_bound)).sum()
        total_outliers += num_outliers

        print(f"{col}: {num_outliers} outliers removed")

        conditions.append((df[col] >= lower_bound) & (df[col] <= upper_bound))

    combined_condition = conditions[0]
    for cond in conditions[1:]:
        combined_condition &= cond

    return df.loc[combined_condition]  

df_filtered = filter_outliers_iqr(df_cleaned, factor=1.5)

amount_scaled: 20244 outliers removed
card_exp_relative: 4402 outliers removed


In [34]:
scaler = MinMaxScaler()

numerical_features = df_filtered.select_dtypes(include=["int64", "uint64", "float64"]).columns
df_filtered.loc[:, numerical_features] = scaler.fit_transform(
    df_filtered.loc[:, numerical_features].astype("float64")
).astype("float64")
df_filtered

  df_filtered.loc[:, numerical_features] = scaler.fit_transform(


Unnamed: 0,created_at,is_subscription,transaction_type,amount_scaled,merchant_customer_id,merchant_customer_email,merchant_country,ip_address,platform,merchant_shop_name,is_secured,ip_country,payment_type,card_id,bank,cardbrand,cardcountry,cardtype,bin,card_exp_relative
0,2024-04-28 10:35:25.253682787,False,first,0.075958,2a3813121eafd2e0c5d9769a850bbfc98f29296fb66bf7...,7b6ad7577459f69b090aa872f9c814d657f42fd07fcede...,UKR,5d39cb01a25ce88e9f10677e5f603617808f6a85a71aa0...,APP,6f26b1c68d2f76674935b98733a0c69c248d022867ec97...,False,UKR,#,07dd73f0839f408b08311772147e244c967ed4f11c90cf...,JSC CB Privatbank,VISA,UKR,DEBIT,96da19e3b3b916b909a2b8d8b038e5f5a734e5df5992bd...,0.755725
1,2024-02-21 08:52:56.253682787,True,first,0.124410,30ac4234600c1f059693dcb809f3caff4495b0dcce7cc9...,435193ad8757604861da61717fe84f2ebc12f0482be533...,ESP,266bed0105e64a373198d5b214cb25cc9ddcb9ffb4d2f4...,WEB,4e1db50aa146b97ab66658890039889b2d58299059c484...,False,ESP,#,784eaaaa47767e84f994e2123e901b5b80bf7665ee9538...,CAIXABANK S.A.,VISA,ESP,CREDIT,b3a99f3cc13e6549ff13140928861e8f86538e270f80f1...,0.641221
2,2024-03-01 06:25:53.253682787,True,token,0.154563,c842c2e79f15c944127a68307bbe244a0fa2cd4ca0a543...,c403e9eaf852e0b155a4b9f7ffdad1c242e6f702aa3a3e...,TUR,13e6ba29ecb2940e7eb9319b4b6210a08e94eb4167ac15...,WEB,97481ea31bd91296426dac31fe424c80c6e5c35700a447...,False,TUR,recurring,b1e5ef9845cc028b56ec58216dcc25eaf69212fa313f0f...,TURKIYE IS BANKASI A.S.,MASTERCARD,TUR,CREDIT,8f6ef449aba940045851a10a251d5d0fb9d1e1dac4d361...,0.229008
3,2024-05-18 15:05:49.253682787,False,first,0.058350,#,4572961c818ea0a3a03a0783cc980d807c960200ca7646...,UKR,5d39cb01a25ce88e9f10677e5f603617808f6a85a71aa0...,MOB,8714fef2ffe50d6c08d9ee9e5bfdf5d5ffe903235c3a05...,False,UKR,#,cdc4fcc8d11f303cc658b8f715d741391775e77b567e23...,JSC CB PRIVATBANK,VISA,UKR,DEBIT,734d0b205672bb054b4e622fb94eee60fce6d86295847c...,0.496183
4,2024-04-10 20:47:45.253682787,True,first,0.011509,5a31e62e36b245f4e6a707e4a314b9f09fb40626a47910...,b26f83c48d8cb3d6a34db7cf98d304d35cdeac9342fe00...,PRY,27c97aea8d6cbeff2dd82b3c14c6e8b99dc5b6a19eb9d3...,WEB,f66e97ae099f6600ee55b8a556691d4535bfe181f5366c...,False,PRY,#,d5d4a363cc8b6daa20b7fe12589702a56df4bd10cd7595...,"BANCARD, S.A.",MASTERCARD,PRY,CREDIT,9ed6069def7ad1ac1b3fdb4875f3bcea1b4a84bbc7e2a9...,0.534351
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
545799,2024-03-02 16:11:48.253682787,False,token,0.059501,#,6594b40cb7ab34881b80e7a677cab8a5af27ebb83f04d7...,HRV,65788ea0cf3f29d8e9c2083059cca0a2ebda5ce6b8aa92...,App,1ccd803cf6a9c6421e0ab7fa45fa1bcf46e8315f9d8a0e...,False,HRV,1-click,f5a59fdb02529d8b19ceaa07adafd004f3a2b0639248fa...,RAIFFEISEN BANK JSC,MASTERCARD,UKR,CREDIT,59b99f5082a1b91e391dfa0583ad2fe3b5b3e63fac159a...,0.282443
545800,2024-06-07 21:22:01.253682787,True,token,0.356658,d9b228fa3c76e7e5e6aa11cac3cff975c7ffa2634afff3...,7711bc5a89aabb6a8669a9f0dbf96e63725e07f03c5387...,TUR,e7cb8afc79e66301e7112406c9c9c7c6a4c2f3ecf18500...,WEB,ecdfb69957d149fbec966b39fee66c12899dc0d135401d...,False,TUR,recurring,63770f581734afe13e2f172229833adb85ca65a4cc5f44...,FINANSBANK A.S.,MASTERCARD,TUR,DEBIT,43421d6f3ef285c6c308224f24339ba39fae302a4ad3ae...,0.503817
545801,2024-05-28 13:42:49.253682787,True,first,0.022903,cb8305a3f8f9b9d956b4c38c476a92a941878f522c38f2...,4b855840fdbcd390cdb6da18b4f303bea8536219a49758...,USA,6656e488108b858c0ec4d9a494620e9749980cdd7623a3...,WEB,7667ced607445233024323e2aa6dc756bd47dc4ac16002...,False,USA,#,0015ec59f6649b59f7714d0eed9eb145c3fe5132f94580...,CELTIC BANK,VISA,USA,CREDIT,d09b4d3b93b47212d4e651be9b69eef55f8b04a116b7ba...,0.503817
545802,2024-05-22 13:31:16.253682787,True,token,0.575325,ac62ae00407853dbefa35124d1d88f79165444def51397...,6fb711f5fa02943044b5005f016356f5d85a22d1c8e8fe...,SWE,9a017d9fc2d82aee290cb444a4cbc4004a35910d5c1bf1...,WEB,ec4fa0dcd21b4c6ce469c04a807c328ccc9a1bb1ba894f...,False,SWE,recurring,3ffa7ee465f23197276ed8bb0cdf69c19c3d779e36ce54...,SVENSKA HANDELSBANKEN AB,MASTERCARD,SWE,DEBIT,1f17de1a721a8d9f939fb6885229b2c849776b989a5cb3...,0.412214


### Transforming boolean features into categorical

In [41]:
df_filtered.loc[:,["is_subscription", "is_secured"]] = df_filtered.loc[:,["is_subscription", "is_secured"]].astype("category")