In [None]:
# General Data Dictionary
    # Transaction table
        # TransactionID - join key for identity table, unique for each row, always filled
        # isFraud - binary label, always filled
            # Propagated 1 over user account/email address/billing address on encounter of reported chargeback within 120 days, otherwise 0
        # TransactionDT - number of seconds from an arbitrary point in time (min 86_400 (one day), max 15_811_131 (~183 days)), always filled
        # TransactionAmt - payment amount in USD, three decimal places for foreign transactions (unfilled addr1, addr2)
        # ProductCD - product code, values W, C, R, H, S, always filled
        # card1-card6 - payment card information (card type, card category, issuing bank, country etc.)
            # card1 + addr1 + D1n (see below) identify user accounts
        # addr1 - billing region ZIP
        # addr2 - billing country
        # dist1, dist2 - distance of two locations (may be addresses, IPs etc.)
        # P_emaildomain - purchaser email domain
        # R_emaildomain - recipient email domain (certain transactions do not require a recipient)
        # C1-C14 - counting features (e.g., number of addresses associated with the card, for both purchaser and recipient), always filled
        # D1-D15 - timedelta, e.g., days since previous transaction
            # D1 - days since client (credit card) began -> D1n := (D1 - TransactionDT / 86_400) corresponds to account age at start day
        # M1-M9 - binary, whether names on cards, addresses etc. matched
        # Vxxx - enriching continuous-like features 

    # Identity table
        # TransactionID - join key for tx table, unique for each row, always filled (but identity table contains only 144_233 rows)
        # id_01-id_11 - numerical identity features such as device/IP rating, login times etc.
        # id_12-id_38 - categorical identity features
        # DeviceType - ...
        # DeviceInfo - ...

In [79]:
# NOTE(review): raw_train_identities is only created by the pl.scan_csv cell further down,
# so in file order this cell comes before the name exists; it must be executed after that cell.
# Summary statistics for every column of the identity table.
raw_train_identities.describe()

statistic,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20,id_21,id_22,id_23,id_24,id_25,id_26,id_27,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,str,str,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,str,str,str,str,str,f64,str,str,str,str,str,str,str,str
"""count""",144233.0,144233.0,140872.0,66324.0,66324.0,136865.0,136865.0,5155.0,5155.0,74926.0,74926.0,140978.0,"""144233""",127320.0,80044.0,"""140985""","""129340""",139369.0,45113.0,139318.0,139261.0,5159.0,5169.0,"""5169""",4747.0,5132.0,5163.0,"""5169""","""140978""","""140978""","""77565""","""140282""",77586.0,"""73289""","""77805""","""140985""","""140985""","""140985""","""140985""","""140810""","""118666"""
"""null_count""",0.0,0.0,3361.0,77909.0,77909.0,7368.0,7368.0,139078.0,139078.0,69307.0,69307.0,3255.0,"""0""",16913.0,64189.0,"""3248""","""14893""",4864.0,99120.0,4915.0,4972.0,139074.0,139064.0,"""139064""",139486.0,139101.0,139070.0,"""139064""","""3255""","""3255""","""66668""","""3951""",66647.0,"""70944""","""66428""","""3248""","""3248""","""3248""","""3248""","""3423""","""25567"""
"""mean""",3236300.0,-10.170502,174716.584708,0.060189,-0.058938,1.615585,-6.69871,13.285354,-38.600388,0.091023,-0.301124,99.745325,,48.053071,-344.507146,,,189.451377,14.237337,353.128174,403.882666,368.26982,16.002708,,12.800927,329.608924,149.070308,,,,,,26.508597,,,,,,,,
"""std""",178849.571186,14.347949,159651.816856,0.598231,0.701015,5.249856,16.491104,11.384207,26.084899,0.983842,2.789446,1.127602,,11.774858,93.695502,,,30.37536,1.561302,141.095343,152.160327,198.847038,6.897665,,2.372447,97.461089,32.101995,,,,,,3.737502,,,,,,,,
"""min""",2987004.0,-100.0,1.0,-13.0,-28.0,-72.0,-100.0,-46.0,-100.0,-36.0,-100.0,90.0,"""Found""",10.0,-660.0,"""Found""","""Found""",100.0,10.0,100.0,100.0,100.0,10.0,"""IP_PROXY:ANONYMOUS""",11.0,100.0,100.0,"""Found""","""Found""","""Found""","""Android""","""BLU/Dash""",0.0,"""0x0""","""match_status:-1""","""F""","""F""","""F""","""F""","""desktop""","""0PAJ5"""
"""25%""",3077142.0,-10.0,67992.0,0.0,0.0,0.0,-6.0,5.0,-48.0,0.0,0.0,100.0,,49.0,-360.0,,,166.0,13.0,266.0,256.0,252.0,14.0,,11.0,321.0,119.0,,,,,,24.0,,,,,,,,
"""50%""",3198818.0,-5.0,125803.0,0.0,0.0,0.0,0.0,14.0,-34.0,0.0,0.0,100.0,,52.0,-300.0,,,166.0,15.0,341.0,472.0,252.0,14.0,,11.0,321.0,149.0,,,,,,24.0,,,,,,,,
"""75%""",3392923.0,-5.0,228748.0,0.0,0.0,1.0,0.0,22.0,-23.0,0.0,0.0,100.0,,52.0,-300.0,,,225.0,15.0,427.0,533.0,487.0,14.0,,15.0,371.0,169.0,,,,,,32.0,,,,,,,,
"""max""",3577534.0,0.0,999595.0,10.0,0.0,52.0,0.0,61.0,0.0,25.0,0.0,100.0,"""NotFound""",64.0,720.0,"""Unknown""","""NotFound""",229.0,29.0,671.0,661.0,854.0,44.0,"""IP_PROXY:TRANSPARENT""",26.0,548.0,216.0,"""NotFound""","""New""","""NotFound""","""other""","""waterfox""",32.0,"""976x600""","""match_status:2""","""T""","""T""","""T""","""T""","""mobile""","""xs-Z47b7VqTMxs"""


In [10]:
import os
import json

In [2]:
import numpy as np

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

In [33]:
import polars as pl
from polars._typing import IntoExpr

In [28]:
# Let polars render up to 100 rows and 100 columns when displaying tables.
# Each setter returns the Config class, so the calls can be chained.
pl.Config.set_tbl_rows(100).set_tbl_cols(100)

polars.config.Config

In [20]:
# Load the project-wide settings (e.g. the 'raw_data_folder' entry used when scanning the CSVs).
# The encoding is passed explicitly so the read does not depend on the platform's
# default locale encoding; JSON interchange files are UTF-8.
with open('../config/general_config.json', encoding='utf-8') as f:
    config = json.load(f)

In [25]:
# Build lazy scans over the raw training CSVs; the folder comes from the loaded config
# and is expected to end with a path separator, as the file names are appended directly.
raw_train_transactions = pl.scan_csv(f"../{config['raw_data_folder']}train/train_transaction.csv")
raw_train_identities = pl.scan_csv(f"../{config['raw_data_folder']}train/train_identity.csv")

In [26]:
# Preview the first five transaction rows; head() is applied before collect().
raw_train_transactions.head(5).collect()

TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,…,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
i64,i64,i64,f64,str,i64,f64,f64,str,f64,str,f64,f64,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2987000,0,86400,68.5,"""W""",13926,,150.0,"""discover""",142.0,"""credit""",315.0,87.0,19.0,,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,…,0.0,0.0,1.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,
2987001,0,86401,29.0,"""W""",2755,404.0,150.0,"""mastercard""",102.0,"""credit""",325.0,87.0,,,"""gmail.com""",,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,…,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,
2987002,0,86469,59.0,"""W""",4663,490.0,150.0,"""visa""",166.0,"""debit""",330.0,87.0,287.0,,"""outlook.com""",,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,…,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,
2987003,0,86499,50.0,"""W""",18132,567.0,150.0,"""mastercard""",117.0,"""debit""",476.0,87.0,,,"""yahoo.com""",,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,,…,0.0,0.0,1.0,50.0,1758.0,925.0,0.0,354.0,0.0,135.0,0.0,0.0,0.0,50.0,1404.0,790.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,
2987004,0,86506,50.0,"""H""",4497,514.0,150.0,"""mastercard""",102.0,"""credit""",420.0,87.0,,,"""gmail.com""",,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,…,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Preview the first five identity rows; head() is applied before collect().
raw_train_identities.head(5).collect()

TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20,id_21,id_22,id_23,id_24,id_25,id_26,id_27,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,str,str,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,str,str,str,str,str,f64,str,str,str,str,str,str,str,str
2987004,0.0,70787.0,,,,,,,,,100.0,"""NotFound""",,-480.0,"""New""","""NotFound""",166.0,,542.0,144.0,,,,,,,,"""New""","""NotFound""","""Android 7.0""","""samsung browser 6.2""",32.0,"""2220x1080""","""match_status:2""","""T""","""F""","""T""","""T""","""mobile""","""SAMSUNG SM-G892A Build/NRD90M"""
2987008,-5.0,98945.0,,,0.0,-5.0,,,,,100.0,"""NotFound""",49.0,-300.0,"""New""","""NotFound""",166.0,,621.0,500.0,,,,,,,,"""New""","""NotFound""","""iOS 11.1.2""","""mobile safari 11.0""",32.0,"""1334x750""","""match_status:1""","""T""","""F""","""F""","""T""","""mobile""","""iOS Device"""
2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,0.0,100.0,"""NotFound""",52.0,,"""Found""","""Found""",121.0,,410.0,142.0,,,,,,,,"""Found""","""Found""",,"""chrome 62.0""",,,,"""F""","""F""","""T""","""T""","""desktop""","""Windows"""
2987011,-5.0,221832.0,,,0.0,-6.0,,,,,100.0,"""NotFound""",52.0,,"""New""","""NotFound""",225.0,,176.0,507.0,,,,,,,,"""New""","""NotFound""",,"""chrome 62.0""",,,,"""F""","""F""","""T""","""T""","""desktop""",
2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,0.0,100.0,"""NotFound""",,-300.0,"""Found""","""Found""",166.0,15.0,529.0,575.0,,,,,,,,"""Found""","""Found""","""Mac OS X 10_11_6""","""chrome 62.0""",24.0,"""1280x800""","""match_status:2""","""T""","""F""","""T""","""T""","""desktop""","""MacOS"""


<h1>Utility Functions</h1>

<h2>Attribute Breakdown</h2>

In [61]:
def get_breakdown_for_categorical_feature(df: pl.DataFrame | pl.LazyFrame, feature: IntoExpr) -> None:
    """
    For a given categorical feature, print basic statistics such as value counts.

    Only the ten most frequent values are printed, most frequent first. Nulls
    are counted like any other value and are printed as ``None``.

    Parameters
    ----------
    df : pl.DataFrame | pl.LazyFrame
        Frame storing the data including the given feature as a column.
        Eager and lazy frames are both accepted.
    feature : IntoExpr
        String name of the column to describe, or a polars expression that
        resolves to a single output column.
    """
    # An expression carries its own output name; anything else is handed to
    # pl.col() unchanged, exactly as before.
    if isinstance(feature, pl.Expr):
        feature_name = feature.meta.output_name()
        feature_expr = feature
    else:
        feature_name = feature
        feature_expr = pl.col(feature)

    print(f'Breakdown of the {feature_name} column:')

    print('  Value counts:')
    print('    Feature value \t Count')
    # .lazy() converts an eager DataFrame and returns a LazyFrame unchanged, so the
    # trailing collect() is valid for both accepted frame types.
    # value_counts() yields a single struct column named after the feature; unnest()
    # splits it into the value column and its count column.
    # head(10) is applied before collect() so the limit is part of the lazy plan.
    top_value_counts = (
        df.lazy()
        .select(feature_expr.value_counts(sort=True))
        .unnest(feature_name)
        .head(10)
        .collect()
    )
    for value, count in top_value_counts.iter_rows():
        print('\t', value, count)
    print('\n' * 2)

<h1>Transaction Table</h1>

In [68]:
# Count the rows through the lazy query instead of materialising the whole table.
n_train_transactions = raw_train_transactions.select(pl.len()).collect().item()
print(f'Total number of train transactions: {n_train_transactions:_}')

Total number of train transactions: 590_540


In [72]:
# The schema alone is enough to count the columns.
transaction_column_names = raw_train_transactions.collect_schema().names()
print(f'Total number of columns for transactions: {len(transaction_column_names):_}')

Total number of columns for transactions: 394


In [63]:
# Summary statistics (count, null_count, mean, std, min, quartiles, max) for every transaction column.
raw_train_transactions.describe()

statistic,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,M1,M2,M3,…,V290,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300,V301,V302,V303,V304,V305,V306,V307,V308,V309,V310,V311,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321,V322,V323,V324,V325,V326,V327,V328,V329,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
str,f64,f64,f64,f64,str,f64,f64,f64,str,f64,str,f64,f64,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,str,str,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",590540.0,590540.0,590540.0,590540.0,"""590540""",590540.0,581607.0,588975.0,"""588963""",586281.0,"""588969""",524834.0,524834.0,238269.0,37627.0,"""496084""","""137291""",590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,589271.0,309743.0,327662.0,421618.0,280699.0,73187.0,38917.0,74926.0,74926.0,514518.0,311253.0,64717.0,61952.0,62187.0,501427.0,"""319440""","""319440""","""319440""",…,590528.0,590528.0,590528.0,590528.0,590528.0,590528.0,589271.0,590528.0,590528.0,590528.0,589271.0,589271.0,590528.0,590528.0,590528.0,590528.0,590528.0,590528.0,590528.0,590528.0,590528.0,590528.0,590528.0,589271.0,589271.0,589271.0,590528.0,590528.0,590528.0,590528.0,590528.0,590528.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0,82351.0
"""null_count""",0.0,0.0,0.0,0.0,"""0""",0.0,8933.0,1565.0,"""1577""",4259.0,"""1571""",65706.0,65706.0,352271.0,552913.0,"""94456""","""453249""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1269.0,280797.0,262878.0,168922.0,309841.0,517353.0,551623.0,515614.0,515614.0,76022.0,279287.0,525823.0,528588.0,528353.0,89113.0,"""271100""","""271100""","""271100""",…,12.0,12.0,12.0,12.0,12.0,12.0,1269.0,12.0,12.0,12.0,1269.0,1269.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,12.0,1269.0,1269.0,1269.0,12.0,12.0,12.0,12.0,12.0,12.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0,508189.0
"""mean""",3282269.5,0.03499,7372300.0,135.027176,,9898.734658,362.555488,153.194925,,199.278897,,290.733794,86.80063,118.50218,231.855423,,,14.092458,15.269734,0.005644,4.092185,5.571526,9.071082,2.848478,5.144574,4.48024,5.240343,10.241521,4.076227,32.539918,8.295215,94.347568,169.563231,28.343348,140.002441,42.335965,69.805717,41.63895,146.058108,0.561057,123.982137,146.621465,54.037533,17.901295,57.724444,163.744579,,,,…,1.103011,1.659811,1.239916,0.942599,2.313863,1.433424,0.328917,0.089034,0.298829,0.171655,0.045507,0.052002,0.251761,0.28314,0.264208,1.000007,139.748713,408.682375,230.41318,10.995986,118.195658,4.202175,39.17391,21.351473,43.319174,26.806977,109.818544,247.606741,162.153398,18.372476,42.073133,28.326584,6.220289,13.103775,9.184612,0.058494,0.85104,0.296633,0.33679,1.312844,0.775874,721.741883,1375.783644,1014.622782,9.807015,59.16455,28.530903,55.352422,151.160542,100.700882
"""std""",170474.358321,0.183755,4617200.0,239.162522,,4901.170153,157.793246,11.336444,,41.244453,,101.741072,2.690623,371.872026,529.053494,,,133.569018,154.668899,0.150536,68.848459,25.786976,71.508467,61.727304,95.378574,16.674897,95.581443,94.336292,86.666218,129.364844,49.544262,157.660387,177.315865,62.384721,191.096774,89.000144,143.669253,99.743264,231.66384,0.31688,182.615225,186.042622,124.274558,67.614425,136.31245,202.72666,,,,…,0.768897,16.252538,3.77505,20.588816,39.526468,25.962948,3.264745,0.628352,3.175062,1.724218,0.289573,0.31831,0.481889,0.623608,0.528238,0.002603,2348.849634,4391.992977,3021.924247,116.254277,352.983093,102.374938,172.128339,95.90297,173.619028,116.853222,2270.033202,3980.042828,2793.343636,332.304848,473.499307,382.053171,56.022561,106.739813,73.627893,0.304415,3.950295,1.364356,1.580144,8.769083,4.727971,6217.223583,11169.275702,7955.735482,243.861391,387.62948,274.57692,668.486833,1095.034387,814.946722
"""min""",2987000.0,0.0,86400.0,0.251,"""C""",1000.0,100.0,100.0,"""american express""",100.0,"""charge card""",100.0,10.0,0.0,0.0,"""aim.com""","""aim.com""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-122.0,0.0,-83.0,0.0,0.0,0.0,0.0,-53.0,-83.0,0.0,-193.0,-83.0,"""F""","""F""","""F""",…,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""25%""",3134635.0,0.0,3027065.0,43.321,,6019.0,214.0,150.0,,166.0,,204.0,87.0,3.0,7.0,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,26.0,1.0,0.0,1.0,0.0,0.0,0.958333,0.208333,0.0,0.0,0.0,0.0,0.0,0.0,,,,…,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""50%""",3282270.0,0.0,7306535.0,68.78,,9678.0,361.0,150.0,,226.0,,299.0,87.0,8.0,37.0,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,1.0,3.0,97.0,8.0,26.0,10.0,0.0,0.0,37.875,0.666666,15.0,43.0,0.0,0.0,0.0,52.0,,,,…,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""75%""",3429904.0,0.0,11246605.0,125.0,,14184.0,512.0,150.0,,226.0,,330.0,87.0,24.0,206.0,,,3.0,3.0,0.0,0.0,1.0,2.0,0.0,0.0,2.0,0.0,2.0,0.0,12.0,2.0,122.0,276.0,27.0,253.0,32.0,40.0,17.0,187.958328,0.833333,197.0,274.0,13.0,0.0,2.0,314.0,,,,…,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,151.380005,35.970001,0.0,107.949997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""max""",3577539.0,1.0,15811131.0,31937.391,"""W""",18396.0,600.0,231.0,"""visa""",237.0,"""debit or credit""",540.0,102.0,10286.0,11623.0,"""ymail.com""","""ymail.com""",4685.0,5691.0,26.0,2253.0,349.0,2253.0,2255.0,3331.0,210.0,3257.0,3188.0,3188.0,2918.0,1429.0,640.0,640.0,819.0,869.0,819.0,873.0,843.0,1707.791626,0.958333,876.0,670.0,648.0,847.0,878.0,879.0,"""T""","""T""","""T""",…,67.0,1055.0,323.0,869.0,1286.0,928.0,93.0,12.0,93.0,49.0,11.0,13.0,16.0,20.0,16.0,2.0,108800.0,145765.0,108800.0,55125.0,55125.0,55125.0,55125.0,4817.470215,7519.870117,4817.470215,93736.0,134021.0,98476.0,104060.0,104060.0,104060.0,880.0,1411.0,976.0,12.0,44.0,18.0,15.0,99.0,55.0,160000.0,160000.0,160000.0,55125.0,55125.0,55125.0,104060.0,104060.0,104060.0


<h2>Continuous Features</h2>

<h2>Categorical Features</h2>

In [76]:
# Transaction columns treated as categorical: label, product code, card brand/type,
# billing address codes, e-mail domains and the M1-M9 match flags.
transaction_categorical_cols = [
    'isFraud', 'ProductCD',
    'card4', 'card6',
    'addr1', 'addr2',
    'P_emaildomain', 'R_emaildomain',
    *(f'M{i}' for i in range(1, 10)),
]
for categorical_col in transaction_categorical_cols:
    get_breakdown_for_categorical_feature(raw_train_transactions, categorical_col)

Breakdown of the isFraud column:
  Value counts:
    Feature value 	 Count
	 0 569877
	 1 20663



Breakdown of the ProductCD column:
  Value counts:
    Feature value 	 Count
	 W 439670
	 C 68519
	 R 37699
	 H 33024
	 S 11628



Breakdown of the card4 column:
  Value counts:
    Feature value 	 Count
	 visa 384767
	 mastercard 189217
	 american express 8328
	 discover 6651
	 None 1577



Breakdown of the card6 column:
  Value counts:
    Feature value 	 Count
	 debit 439938
	 credit 148986
	 None 1571
	 debit or credit 30
	 charge card 15



Breakdown of the addr1 column:
  Value counts:
    Feature value 	 Count
	 None 65706
	 299.0 46335
	 325.0 42751
	 204.0 42020
	 264.0 39870
	 330.0 26287
	 315.0 23078
	 441.0 20827
	 272.0 20141
	 123.0 16105



Breakdown of the addr2 column:
  Value counts:
    Feature value 	 Count
	 87.0 520481
	 None 65706
	 60.0 3084
	 96.0 638
	 32.0 91
	 65.0 82
	 16.0 55
	 31.0 47
	 19.0 33
	 26.0 25



Breakdown of the P_emaildomain column:
  Value cou

<h1>Identity Table</h1>

In [73]:
# Count the rows through the lazy query instead of materialising the whole table.
n_train_identities = raw_train_identities.select(pl.len()).collect().item()
print(f'Total number of train identity rows: {n_train_identities:_}')

Total number of train identity rows: 144_233


In [74]:
# The schema alone is enough to count the columns.
identity_column_names = raw_train_identities.collect_schema().names()
print(f'Total number of columns for identity features: {len(identity_column_names):_}')

Total number of columns for identity features: 41


<h2>Continuous Features</h2>

<h2>Categorical Features</h2>

In [62]:
# id_12 .. id_38 are the identity columns treated as categorical.
identity_categorical_cols = [f'id_{col_idx}' for col_idx in range(12, 39)]
for identity_col in identity_categorical_cols:
    get_breakdown_for_categorical_feature(raw_train_identities, identity_col)

Breakdown of the id_12 column:
  Value counts:
    Feature value 	 Count
	 NotFound 123025
	 Found 21208



Breakdown of the id_13 column:
  Value counts:
    Feature value 	 Count
	 52.0 58099
	 49.0 26365
	 None 16913
	 64.0 14429
	 33.0 10048
	 27.0 3666
	 20.0 2878
	 14.0 2499
	 63.0 1468
	 19.0 1147



Breakdown of the id_14 column:
  Value counts:
    Feature value 	 Count
	 None 64189
	 -300.0 44121
	 -360.0 16661
	 -480.0 12891
	 -420.0 4542
	 -600.0 498
	 60.0 369
	 0.0 192
	 -240.0 159
	 -180.0 126



Breakdown of the id_15 column:
  Value counts:
    Feature value 	 Count
	 Found 67728
	 New 61612
	 Unknown 11645
	 None 3248



Breakdown of the id_16 column:
  Value counts:
    Feature value 	 Count
	 Found 66324
	 NotFound 63016
	 None 14893



Breakdown of the id_17 column:
  Value counts:
    Feature value 	 Count
	 166.0 78631
	 225.0 56968
	 None 4864
	 102.0 689
	 159.0 352
	 100.0 336
	 121.0 279
	 148.0 229
	 150.0 126
	 191.0 123



Breakdown of the id_18 column:
  