## Feature Engineering & Model Testing

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from util.data_access import load_data
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [2]:
df_raw = load_data()
X_train_raw, X_valid_raw, y_train_raw, y_valid_raw = train_test_split(df_raw.drop('fraud',axis=1),df_raw.loc[:,'fraud'])

In [4]:
cat_columns = [
    'customer',
    'age',
    'gender',
    'merchant',
    'category'
]

drop_columns = ['zipMerchant','zipcodeOri']

def preprocess(X: pd.DataFrame, categorical_columns: list[str]) -> pd.DataFrame:
    X = X.copy()
    X[cat_columns] = X.loc[:,cat_columns].applymap(lambda x: x.strip("'"))
    X = X.drop(drop_columns,axis=1)
    return X

X_train = preprocess(X_train_raw, cat_columns)
X_valid = preprocess(X_valid_raw,cat_columns)

In [5]:
X_train

Unnamed: 0,step,customer,age,gender,merchant,category,amount
143685,50,C432013067,1,M,M1823072687,es_transportation,19.82
309760,101,C1865204568,5,M,M348934600,es_transportation,36.16
96570,35,C297351696,1,M,M348934600,es_transportation,28.37
253408,84,C1152356495,3,F,M1823072687,es_transportation,6.57
322937,105,C477892356,4,M,M1823072687,es_transportation,1.61
...,...,...,...,...,...,...,...
131535,47,C1631162170,4,F,M348934600,es_transportation,1.85
151984,53,C1239061472,2,M,M348934600,es_transportation,35.90
126571,45,C1769470125,2,F,M85975013,es_food,56.64
178985,62,C1862384239,4,M,M1823072687,es_transportation,36.08


In [6]:
transformer = ColumnTransformer([
    ('categorical_transformation', OneHotEncoder(drop='first', sparse=False), ['age','gender','category'])
], remainder='passthrough')

one_hot_encoded = pd.DataFrame(
    transformer.fit_transform(X_train),
    columns=transformer.get_feature_names_out()
)
one_hot_encoded.head()




Unnamed: 0,categorical_transformation__age_1,categorical_transformation__age_2,categorical_transformation__age_3,categorical_transformation__age_4,categorical_transformation__age_5,categorical_transformation__age_6,categorical_transformation__age_U,categorical_transformation__gender_F,categorical_transformation__gender_M,categorical_transformation__gender_U,...,categorical_transformation__category_es_otherservices,categorical_transformation__category_es_sportsandtoys,categorical_transformation__category_es_tech,categorical_transformation__category_es_transportation,categorical_transformation__category_es_travel,categorical_transformation__category_es_wellnessandbeauty,remainder__step,remainder__customer,remainder__merchant,remainder__amount
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,50,C432013067,M1823072687,19.82
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,101,C1865204568,M348934600,36.16
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,35,C297351696,M348934600,28.37
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,84,C1152356495,M1823072687,6.57
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,105,C477892356,M1823072687,1.61
