## Feature Engineering & Model Testing

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from util.data_access import load_data
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Fixed seed so the train/validation split is identical on every Restart & Run All.
RANDOM_STATE = 42

df_raw = load_data()

# Hold out a validation set (train_test_split's default 75/25 split).
# Stratifying on the target keeps the fraud rate the same in both partitions.
X_train_raw, X_valid_raw, y_train_raw, y_valid_raw = train_test_split(
    df_raw.drop(columns='fraud'),
    df_raw['fraud'],
    stratify=df_raw['fraud'],
    random_state=RANDOM_STATE,
)

In [24]:
# Columns that get their surrounding single quotes stripped during preprocessing.
cat_columns = ['customer', 'age', 'gender', 'merchant', 'category']

# Columns removed from the feature frame during preprocessing.
drop_columns = ['zipMerchant', 'zipcodeOri']

def preprocess(
    X: pd.DataFrame,
    categorical_columns: list[str],
    columns_to_drop: "list[str] | None" = None,
) -> pd.DataFrame:
    """Return a cleaned copy of ``X``; the input frame is left untouched.

    Parameters
    ----------
    X
        Raw feature frame.
    categorical_columns
        String columns whose values have leading/trailing single quotes
        stripped.
    columns_to_drop
        Columns to remove from the result. When omitted, the notebook-level
        ``drop_columns`` list is used.

    Returns
    -------
    pd.DataFrame
        A new frame with the quotes stripped and the unwanted columns removed.
    """
    if columns_to_drop is None:
        columns_to_drop = drop_columns

    X = X.copy()
    for column in categorical_columns:
        # The vectorised .str accessor replaces the deprecated
        # DataFrame.applymap and lets missing values pass through unchanged.
        X[column] = X[column].str.strip("'")
    return X.drop(columns=columns_to_drop)

# Run the same cleaning step on both partitions of the split.
X_train, X_valid = (
    preprocess(raw_frame, cat_columns)
    for raw_frame in (X_train_raw, X_valid_raw)
)

In [25]:
# Inspect the preprocessed training features.
X_train

Unnamed: 0,step,customer,age,gender,merchant,category,amount
148111,52,C1413412440,2,F,M348934600,es_transportation,39.90
191992,66,C1113166893,5,F,M85975013,es_food,54.39
179728,62,C153258882,4,F,M1823072687,es_transportation,11.46
126894,45,C2086664398,2,F,M348934600,es_transportation,47.67
494070,152,C817389751,5,F,M1823072687,es_transportation,35.47
...,...,...,...,...,...,...,...
221145,75,C23582994,2,F,M348934600,es_transportation,43.41
427314,134,C1642006830,3,F,M1823072687,es_transportation,47.37
210581,72,C269960003,1,M,M348934600,es_transportation,18.71
466469,145,C1095821033,4,F,M1823072687,es_transportation,22.90


In [None]:
# Columns expanded into indicator (dummy) columns; every other column is passed through as-is.
one_hot_columns = ['age', 'gender', 'category']

transformer = ColumnTransformer(
    [
        (
            'categorical_transformation',
            # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old
            # keyword was removed in 1.4; a dense array is needed to build the DataFrame below.
            # drop='first' omits one level per feature to avoid redundant indicator columns.
            OneHotEncoder(drop='first', sparse_output=False),
            one_hot_columns,
        )
    ],
    remainder='passthrough',
)

one_hot_encoded = pd.DataFrame(
    transformer.fit_transform(X_train),
    columns=transformer.get_feature_names_out(),
    # Keep the original row labels so the features stay aligned with y_train_raw.
    index=X_train.index,
)
one_hot_encoded.head()




: 

: 