## 2- Baseline feature engineering and splitting to train/validation/test splits
* No additional features will be added in this stage, only one-hot encoding the categorical values and standard scaling the numerical values.
* Splitting into __80/10/10 train/validation/test__ sets will be done at the __customer_id__ level: all of a customer's records go into exactly one set, to avoid data leakage across sets.
    * A __seed__ value will be used to ensure reproducible splits.
    * The encoding and scaling algorithms will be fitted on the __train set__ only, then used to transform the train, validation and test sets.
* Once split, the following columns will be removed (assumed irrelevant for the model scope):
    * __customer__ (id information assumed not relevant for the model)
    * __zipcodeOri__ (only one value)
    * __merchant__ (id information assumed not relevant for the model)
    * __zipMerchant__ (id information assumed not relevant for the model)
* No duplicate records will be removed, to maintain consistency of the sets between baseline experiment and future experiments. 

#### Stages
1. __Reading data__.
2. __Specifying column types__.
3. __Train/validation/test splits based on customer id__.
4. __One-hot-encoding categorical columns and standard scaling numerical columns__.
5. __Dropping irrelevant columns__.
6. __Saving data and preprocessor__.

In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import StratifiedGroupKFold  # for splitting into train/val/test based on customer id while keeping fraud ratio the same for the three sets
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Notebook display settings: do not truncate cell contents or hide columns,
# and cap printed frames at 100 rows.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

In [2]:
# Seed handed to the customer-level split so the train/validation/test
# partition is reproducible across runs.
RANDOM_SEED = 420

### 1- Reading data

In [3]:
# Load the cleaned transactions table (path is relative to the working directory).
df = pd.read_csv("data/fraud_cleaned.csv")

In [4]:
df.shape

(594643, 10)

In [5]:
df.head()

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,0,C1093826151,4,M,28007,M348934600,28007,es_transportation,4.55,0
1,0,C352968107,2,M,28007,M348934600,28007,es_transportation,39.68,0
2,0,C2054744914,4,F,28007,M1823072687,28007,es_transportation,26.89,0
3,0,C1760612790,3,M,28007,M348934600,28007,es_transportation,17.25,0
4,0,C757503768,5,M,28007,M348934600,28007,es_transportation,35.72,0


### 2- Specifying column types

In [6]:
# Name of the label column; the data preview shows it holding 0/1 values.
target_column = "fraud"

In [7]:
# Columns that are one-hot encoded. `age` holds bucket codes rather than years
# (the encoded output contains age_0 ... age_6 and age_U), so it is treated as
# a category instead of being scaled.
categorical_columns_to_encode = ["age", "gender", "category"]

In [8]:
# Columns that are standard-scaled by the preprocessor.
numerical_columns = ["step", "amount"]

### 3- Train/validation/test splits based on customer id

In [9]:
def stratified_group_split_80_10_10(df, y_col="fraud", group_col="customer", seed=420):
    """Split ``df`` into ~80/10/10 train/validation/test row positions by group.

    All rows sharing a ``group_col`` value land in exactly one split.
    ``StratifiedGroupKFold`` is used for both cuts, so the ``y_col`` class
    ratio is balanced across splits only as far as keeping groups whole allows.

    Args:
        df: DataFrame containing ``y_col`` and ``group_col``.
        y_col: Name of the label column used for stratification.
        group_col: Name of the column whose values must not span splits.
        seed: Random state shared by both shuffled splitters.

    Returns:
        Tuple ``(train_idx, val_idx, test_idx)`` of positional index arrays
        into ``df`` (suitable for ``df.iloc``).

    Raises:
        AssertionError: If any ``group_col`` value appears in more than one
            split.
    """
    y = df[y_col].to_numpy()
    g = df[group_col].to_numpy()

    # 1) Hold out ~10% of groups for test: first fold of a 10-fold split.
    sgkf_test = StratifiedGroupKFold(n_splits=10, shuffle=True, random_state=seed)
    trainval_idx, test_idx = next(sgkf_test.split(df, y=y, groups=g))

    # 2) From the remaining ~90%, hold out ~11.1% of groups for validation
    #    (0.1 / 0.9 ≈ 1/9 of the remainder ≈ 10% of the total): 1 fold out of 9.
    sgkf_val = StratifiedGroupKFold(n_splits=9, shuffle=True, random_state=seed)
    inner_train_rel, val_rel = next(
        sgkf_val.split(
            df.iloc[trainval_idx],
            y=y[trainval_idx],
            groups=g[trainval_idx],
        )
    )

    # The inner split indexes into the train+val subset; map it back to
    # positions in the full frame.
    train_idx = trainval_idx[inner_train_rel]
    val_idx = trainval_idx[val_rel]

    # Group exclusivity checks (groups disjoint across splits). Explicit raises
    # are used instead of `assert` statements so the leakage guard still runs
    # under `python -O`, and the group array is reused rather than slicing the
    # DataFrame again for each comparison.
    groups_by_split = {
        "train": set(g[train_idx]),
        "validation": set(g[val_idx]),
        "test": set(g[test_idx]),
    }
    split_pairs = (("train", "validation"), ("train", "test"), ("validation", "test"))
    for left, right in split_pairs:
        shared = groups_by_split[left] & groups_by_split[right]
        if shared:
            raise AssertionError(
                f"{len(shared)} '{group_col}' value(s) appear in both the "
                f"{left} and {right} splits"
            )

    return train_idx, val_idx, test_idx

In [10]:
# Customer-level ~80/10/10 split. The label name comes from `target_column`
# (defined in the column-types section) instead of a repeated string literal.
train_idx, val_idx, test_idx = stratified_group_split_80_10_10(
    df=df,
    y_col=target_column,
    group_col="customer",
    seed=RANDOM_SEED,
)

In [11]:
train_idx.shape

(476608,)

In [12]:
val_idx.shape

(60291,)

In [13]:
test_idx.shape

(57744,)

In [14]:
# Materialize each split from its positional row indices.
df_train = df.iloc[train_idx]
df_val = df.iloc[val_idx]
df_test = df.iloc[test_idx]

In [15]:
df_train.shape

(476608, 10)

In [16]:
df_train.shape[0] / df.shape[0]

0.8015027503897296

In [17]:
df_val.shape

(60291, 10)

In [18]:
df_val.shape[0] / df.shape[0]

0.10139024591225324

In [19]:
df_test.shape

(57744, 10)

In [20]:
df_test.shape[0] / df.shape[0]

0.09710700369801713

In [21]:
df_train.head()

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,0,C1093826151,4,M,28007,M348934600,28007,es_transportation,4.55,0
1,0,C352968107,2,M,28007,M348934600,28007,es_transportation,39.68,0
2,0,C2054744914,4,F,28007,M1823072687,28007,es_transportation,26.89,0
3,0,C1760612790,3,M,28007,M348934600,28007,es_transportation,17.25,0
4,0,C757503768,5,M,28007,M348934600,28007,es_transportation,35.72,0


In [22]:
df_val.head()

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
9,0,C39858251,5,F,28007,M348934600,28007,es_transportation,35.4,0
20,0,C1687101094,2,F,28007,M348934600,28007,es_transportation,19.31,0
24,0,C1622124632,2,M,28007,M348934600,28007,es_transportation,29.84,0
25,0,C187514477,3,M,28007,M348934600,28007,es_transportation,12.1,0
42,0,C1635613216,4,F,28007,M1053599405,28007,es_health,105.59,0


In [23]:
df_test.head()

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
8,0,C105845174,3,M,28007,M348934600,28007,es_transportation,32.4,0
15,0,C194016923,3,F,28007,M348934600,28007,es_transportation,30.19,0
16,0,C1207205377,4,M,28007,M1823072687,28007,es_transportation,17.54,0
17,0,C834963773,5,F,28007,M348934600,28007,es_transportation,40.69,0
40,0,C1425441042,2,M,28007,M1888755466,28007,es_otherservices,87.67,0


In [24]:
df_train.shape[0] + df_val.shape[0] + df_test.shape[0] 

594643

In [25]:
df.shape[0]

594643

#### Validating that the distribution of labels is similar across sets

In [26]:
df_train["fraud"].value_counts().to_frame()

Unnamed: 0_level_0,count
fraud,Unnamed: 1_level_1
0,470978
1,5630


In [27]:
df_train["fraud"].value_counts(normalize=True).to_frame()

Unnamed: 0_level_0,proportion
fraud,Unnamed: 1_level_1
0,0.988187
1,0.011813


In [28]:
df_val["fraud"].value_counts().to_frame()

Unnamed: 0_level_0,count
fraud,Unnamed: 1_level_1
0,59787
1,504


In [29]:
df_val["fraud"].value_counts(normalize=True).to_frame()

Unnamed: 0_level_0,proportion
fraud,Unnamed: 1_level_1
0,0.991641
1,0.008359


In [30]:
df_test["fraud"].value_counts().to_frame()

Unnamed: 0_level_0,count
fraud,Unnamed: 1_level_1
0,56678
1,1066


In [31]:
df_test["fraud"].value_counts(normalize=True).to_frame()

Unnamed: 0_level_0,proportion
fraud,Unnamed: 1_level_1
0,0.981539
1,0.018461


### Saving data before processing

In [32]:
# Snapshot of the train split before any encoding/scaling; index=False leaves
# the row index out of the file.
df_train.to_csv("data/train_baseline_before_processing.csv", index=False)

In [33]:
# Snapshot of the validation split before any encoding/scaling (row index omitted).
df_val.to_csv("data/validation_baseline_before_processing.csv", index=False)

In [34]:
# Snapshot of the test split before any encoding/scaling (row index omitted).
df_test.to_csv("data/test_baseline_before_processing.csv", index=False)

### 4- One-hot-encoding categorical columns and standard scaling numerical columns

__Notes__:
* I am prioritizing the ability to handle unknown values.
    * Setting __handle_unknown__ to "ignore" -> keep all columns as zeros when an unknown value is encountered
    * __drop__ must be set to None in this case which does not drop any value during one-hot-encoding.
    * There is concern for __multicollinearity issue__ for some models, but I am prioritizing handling unknown values in this case.

In [35]:
# Baseline preprocessing: standard-scale the numeric columns and one-hot encode
# the categorical ones; every other column is passed through unchanged.
# NOTE(review): with remainder="passthrough" the transformer is fitted on
# frames that still carry the id columns and the `fraud` label -- confirm it
# can transform inputs lacking them before reusing it for inference.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_columns),
        (
            "cat",
            # handle_unknown="ignore": a value not seen during fit is encoded
            # as all zeros for that feature. drop=None keeps every category;
            # the multicollinearity concern is accepted in favour of
            # unknown-value handling.
            OneHotEncoder(drop=None, sparse_output=False, handle_unknown="ignore"),
            categorical_columns_to_encode,
        ),
    ],
    remainder="passthrough",  # or 'drop' to remove the other columns
    verbose_feature_names_out=False,  # no "num__", "cat__", "remainder__" prefixes
)
# Return pandas DataFrames from transform(); rebinding (rather than a bare
# call) keeps this cell free of displayed output.
preprocessor = preprocessor.set_output(transform="pandas")

In [36]:
# Fit the scaler statistics and one-hot categories on the train split only, so
# the validation and test splits never influence the preprocessing.
preprocessor.fit(df_train)

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [37]:
# Apply the train-fitted preprocessor to all three splits, in the same order.
df_train, df_val, df_test = (
    preprocessor.transform(split) for split in (df_train, df_val, df_test)
)

In [38]:
df_train.shape

(476608, 34)

In [39]:
df_train.head()

Unnamed: 0,step,amount,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_U,gender_E,gender_F,gender_M,gender_U,category_es_barsandrestaurants,category_es_contents,category_es_fashion,category_es_food,category_es_health,category_es_home,category_es_hotelservices,category_es_hyper,category_es_leisure,category_es_otherservices,category_es_sportsandtoys,category_es_tech,category_es_transportation,category_es_travel,category_es_wellnessandbeauty,customer,zipcodeOri,merchant,zipMerchant,fraud
0,-1.858796,-0.303248,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,C1093826151,28007,M348934600,28007,0
1,-1.858796,0.018468,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,C352968107,28007,M348934600,28007,0
2,-1.858796,-0.098661,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,C2054744914,28007,M1823072687,28007,0
3,-1.858796,-0.186943,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,C1760612790,28007,M348934600,28007,0
4,-1.858796,-0.017797,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,C757503768,28007,M348934600,28007,0


In [40]:
df_val.shape

(60291, 34)

In [41]:
df_val.head()

Unnamed: 0,step,amount,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_U,gender_E,gender_F,gender_M,gender_U,category_es_barsandrestaurants,category_es_contents,category_es_fashion,category_es_food,category_es_health,category_es_home,category_es_hotelservices,category_es_hyper,category_es_leisure,category_es_otherservices,category_es_sportsandtoys,category_es_tech,category_es_transportation,category_es_travel,category_es_wellnessandbeauty,customer,zipcodeOri,merchant,zipMerchant,fraud
9,-1.858796,-0.020728,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,C39858251,28007,M348934600,28007,0
20,-1.858796,-0.168078,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,C1687101094,28007,M348934600,28007,0
24,-1.858796,-0.071645,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,C1622124632,28007,M348934600,28007,0
25,-1.858796,-0.234106,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,C187514477,28007,M348934600,28007,0
42,-1.858796,0.622063,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,C1635613216,28007,M1053599405,28007,0


In [42]:
df_test.shape

(57744, 34)

In [43]:
df_test.head()

Unnamed: 0,step,amount,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_U,gender_E,gender_F,gender_M,gender_U,category_es_barsandrestaurants,category_es_contents,category_es_fashion,category_es_food,category_es_health,category_es_home,category_es_hotelservices,category_es_hyper,category_es_leisure,category_es_otherservices,category_es_sportsandtoys,category_es_tech,category_es_transportation,category_es_travel,category_es_wellnessandbeauty,customer,zipcodeOri,merchant,zipMerchant,fraud
8,-1.858796,-0.048201,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,C105845174,28007,M348934600,28007,0
15,-1.858796,-0.06844,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,C194016923,28007,M348934600,28007,0
16,-1.858796,-0.184287,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,C1207205377,28007,M1823072687,28007,0
17,-1.858796,0.027718,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,C834963773,28007,M348934600,28007,0
40,-1.858796,0.457954,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,C1425441042,28007,M1888755466,28007,0


### 5- Dropping irrelevant columns

In [44]:
# Columns removed after preprocessing: customer and merchant ids plus the two
# zip-code columns, all assumed irrelevant for the baseline model.
irrelevant_columns = ["customer", "zipcodeOri", "merchant", "zipMerchant"]

In [45]:
# Remove the id/zip columns from the processed train split.
df_train = df_train.drop(irrelevant_columns, axis="columns")

In [46]:
df_train.shape

(476608, 30)

In [47]:
df_train.head()

Unnamed: 0,step,amount,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_U,gender_E,gender_F,gender_M,gender_U,category_es_barsandrestaurants,category_es_contents,category_es_fashion,category_es_food,category_es_health,category_es_home,category_es_hotelservices,category_es_hyper,category_es_leisure,category_es_otherservices,category_es_sportsandtoys,category_es_tech,category_es_transportation,category_es_travel,category_es_wellnessandbeauty,fraud
0,-1.858796,-0.303248,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
1,-1.858796,0.018468,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
2,-1.858796,-0.098661,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
3,-1.858796,-0.186943,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
4,-1.858796,-0.017797,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0


In [48]:
df_train.duplicated().sum()

np.int64(8429)

In [49]:
# Remove the id/zip columns from the processed validation split.
df_val = df_val.drop(irrelevant_columns, axis="columns")

In [50]:
df_val.shape

(60291, 30)

In [51]:
df_val.head()

Unnamed: 0,step,amount,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_U,gender_E,gender_F,gender_M,gender_U,category_es_barsandrestaurants,category_es_contents,category_es_fashion,category_es_food,category_es_health,category_es_home,category_es_hotelservices,category_es_hyper,category_es_leisure,category_es_otherservices,category_es_sportsandtoys,category_es_tech,category_es_transportation,category_es_travel,category_es_wellnessandbeauty,fraud
9,-1.858796,-0.020728,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
20,-1.858796,-0.168078,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
24,-1.858796,-0.071645,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
25,-1.858796,-0.234106,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
42,-1.858796,0.622063,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [52]:
df_val.duplicated().sum()

np.int64(166)

In [53]:
# Remove the id/zip columns from the processed test split.
df_test = df_test.drop(irrelevant_columns, axis="columns")

In [54]:
df_test.shape

(57744, 30)

In [55]:
df_test.head()

Unnamed: 0,step,amount,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_U,gender_E,gender_F,gender_M,gender_U,category_es_barsandrestaurants,category_es_contents,category_es_fashion,category_es_food,category_es_health,category_es_home,category_es_hotelservices,category_es_hyper,category_es_leisure,category_es_otherservices,category_es_sportsandtoys,category_es_tech,category_es_transportation,category_es_travel,category_es_wellnessandbeauty,fraud
8,-1.858796,-0.048201,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
15,-1.858796,-0.06844,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
16,-1.858796,-0.184287,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
17,-1.858796,0.027718,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
40,-1.858796,0.457954,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0


In [56]:
df_test.duplicated().sum()

np.int64(115)

### 6- Saving data and preprocessor

In [57]:
# Persist the fitted preprocessor for later reuse.
# NOTE(review): it was fitted with remainder="passthrough" on frames that
# included the id columns and `fraud`; confirm the input schema it expects
# before applying it to unlabeled data.
joblib.dump(preprocessor, "models/baseline_preprocessor.joblib")

['models/baseline_preprocessor.joblib']

In [58]:
# Save the processed train split (features plus `fraud`); row index omitted.
df_train.to_csv("data/train_baseline.csv", index=False)

In [59]:
# Save the processed validation split (features plus `fraud`); row index omitted.
df_val.to_csv("data/validation_baseline.csv", index=False)

In [60]:
# Save the processed test split (features plus `fraud`); row index omitted.
df_test.to_csv("data/test_baseline.csv", index=False)