## Feature Engineering & Model Testing

In [4]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from util.data_access import load_data
from util.preprocess import preprocess, cat_columns
from util.tracking import get_metrics
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTENC
import lightgbm as lgb
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment before
# anything below reads configuration from it.
load_dotenv()

import os
import mlflow
# Point the MLflow client at the tracking server (URI is redacted in this copy).
mlflow.set_tracking_uri('***REMOVED***')


EXPERIMENT_NAME = 'Fraud Model Feature Engineering Loop'

# Resolve the MLflow experiment id for EXPERIMENT_NAME, creating the experiment
# the first time this notebook runs against a tracking server.
#
# `mlflow.get_experiment_by_name` returns an Experiment object, or None when no
# experiment has that name; it does not raise for a missing experiment. The
# lookup result is therefore checked explicitly, and EXPERIMENT_ID always ends
# up holding the id string rather than an Experiment object or None.
_experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if _experiment is None:
    EXPERIMENT_ID = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    EXPERIMENT_ID = _experiment.experiment_id

In [5]:
# Load the raw transactions, derive per-customer history features, then split
# into training and validation sets.
df_raw = load_data()

# Mean of each customer's amounts over a trailing window of up to 10 rows
# (min_periods=1, so early rows average whatever history is available).
# NOTE(review): the history features follow the frame's row order -- assumes
# load_data() returns each customer's rows in chronological order; confirm.
df_raw['Customer Moving Amount Average'] = (
    df_raw.groupby('customer')['amount']
    .transform(lambda amounts: amounts.rolling(10, 1).mean())
)
df_raw['Transactions Completed'] = 1
MAX_CUSTOMER_TRANSACTIONS = df_raw.groupby('customer')['fraud'].count().max()

# Running 1-based count of each customer's transactions. This is the same value
# the original rolling sum of ones over a max-sized window produced, computed
# directly; the float cast keeps the column dtype that the rolling sum yielded.
df_raw['Customer Transaction Number'] = (
    df_raw.groupby('customer').cumcount().add(1).astype(float)
)

# Fixed seed so the split (and everything downstream) is repeatable after a
# kernel restart; stratify so both splits keep the overall fraud rate.
SPLIT_SEED = 42
X_train_raw, X_valid_raw, y_train, y_valid = train_test_split(
    df_raw.drop('fraud', axis=1),
    df_raw.loc[:, 'fraud'],
    random_state=SPLIT_SEED,
    stratify=df_raw.loc[:, 'fraud'],
)

In [6]:
# Column names handed to preprocess() as the set to drop from each split.
drop_columns = ['zipMerchant', 'zipcodeOri']

# Apply the shared preprocessing to the training split first and then the
# validation split, with identical categorical and drop lists for both.
X_train, X_valid = [
    preprocess(split_frame, cat_columns, drop_columns)
    for split_frame in (X_train_raw, X_valid_raw)
]

In [8]:
# Features that the modelling cells treat as categorical.
_cat_columns = ['age', 'gender', 'category', 'Customer Transaction Number']

# Full model feature list: the non-categorical features first, followed by the
# categorical ones in the order declared above.
columns = ['amount', 'step', 'Customer Moving Amount Average'] + _cat_columns



In [12]:
# Preview the first rows of the preprocessed training frame.
X_train.head()

Unnamed: 0,step,customer,age,gender,merchant,category,amount,Customer Moving Amount Average,Transactions Completed,Customer Transaction Number
579939,176,C1589138432,4,M,M1823072687,es_transportation,6.53,17.703,1,166.0
546403,167,C719332132,2,F,M1946091778,es_wellnessandbeauty,70.71,32.508,1,165.0
157689,55,C1377339988,2,M,M348934600,es_transportation,22.3,23.344,1,51.0
211311,72,C1228152586,1,F,M348934600,es_transportation,24.51,20.186,1,68.0
555868,169,C1734487337,4,F,M1823072687,es_transportation,33.4,42.054,1,165.0


Unnamed: 0,step,customer,age,gender,merchant,category,amount,Customer Moving Amount Average,Transactions Completed,Customer Transaction Number
579939,176,C1589138432,4,M,M1823072687,es_transportation,6.53,17.703000,1,166.0
546403,167,C719332132,2,F,M1946091778,es_wellnessandbeauty,70.71,32.508000,1,165.0
157689,55,C1377339988,2,M,M348934600,es_transportation,22.30,23.344000,1,51.0
211311,72,C1228152586,1,F,M348934600,es_transportation,24.51,20.186000,1,68.0
555868,169,C1734487337,4,F,M1823072687,es_transportation,33.40,42.054000,1,165.0
...,...,...,...,...,...,...,...,...,...,...
183720,63,C1194290880,2,M,M1198415165,es_wellnessandbeauty,39.96,71.608889,1,9.0
557382,170,C744066701,2,F,M1823072687,es_transportation,44.80,32.991000,1,91.0
564608,171,C1214929092,3,M,M348934600,es_transportation,50.71,33.277000,1,92.0
118011,42,C918659260,3,F,M348934600,es_transportation,37.19,39.226000,1,38.0


In [20]:
# Fit a LightGBM classifier on SMOTENC-oversampled training data.

# One fixed seed for both the resampler and the booster so that metric changes
# between runs come from feature changes rather than from random resampling.
MODEL_SEED = 42

model = lgb.LGBMClassifier(random_state=MODEL_SEED)
_cat_columns = [
    'age',
    'gender',
    'category',
    'Customer Transaction Number'
]

columns = [
    'amount',
    'step',
    'Customer Moving Amount Average',
    *_cat_columns
]

# Restrict both splits to the selected feature columns.
train_data = X_train.loc[:, columns]
valid_data = X_valid.loc[:, columns]

# SMOTENC is given the positional indices of the categorical columns, so the
# names are translated against the selected training frame.
_cat_columns_idx = [train_data.columns.get_loc(name) for name in _cat_columns]
# NOTE(review): `n_jobs` is kept from the original call; newer imbalanced-learn
# releases may deprecate or reject it -- confirm against the installed version.
resampler = SMOTENC(
    categorical_features=_cat_columns_idx,
    random_state=MODEL_SEED,
    n_jobs=-1,
)
# Only the training split is oversampled; validation data keeps its natural
# class balance.
train_data_res, y_train_res = resampler.fit_resample(train_data, y_train)

model.fit(train_data_res, y_train_res, categorical_feature=_cat_columns)



: 

: 

In [7]:
# Record this feature-engineering iteration in MLflow: the feature list, split
# sizes, target rates and model type as params, then train / validation metrics.
#
# The run is attached to the experiment named EXPERIMENT_NAME rather than to a
# hardcoded numeric id, which is only valid on one particular tracking server.
# `mlflow.set_experiment` activates the experiment and creates it if missing.
mlflow.set_experiment(EXPERIMENT_NAME)
with mlflow.start_run() as run:
    mlflow.log_param('Columns', columns)
    mlflow.log_param('Train Data Dimension', train_data.shape)
    mlflow.log_param('Train Target Bad Rate', y_train.mean())
    mlflow.log_param('Valid Data Dimension', valid_data.shape)
    # This value is the mean of the validation target, i.e. its bad rate, so it
    # is labelled to match the training-side param instead of "Dimension".
    mlflow.log_param('Valid Target Bad Rate', y_valid.mean())
    mlflow.log_param('Model Type', model.__class__.__name__)

    # Training metrics are computed on the original (non-resampled) training
    # data so they are comparable with the validation metrics.
    y_pred_train = model.predict(train_data)
    # Probabilities come from predict_proba; predict() only returns hard labels.
    # NOTE(review): assumes get_metrics expects the positive-class probability
    # as a 1-D array (column 1 of predict_proba) -- confirm in util.tracking.
    y_pred_proba_train = model.predict_proba(train_data)[:, 1]
    train_metrics = get_metrics(y_train, y_pred_train, y_pred_proba_train)

    for key, val in train_metrics.items():
        mlflow.log_metric(f'Train {key}', val)

    y_pred_valid = model.predict(valid_data)
    y_pred_proba_valid = model.predict_proba(valid_data)[:, 1]
    # Kept in its own variable so the training metrics are not overwritten.
    valid_metrics = get_metrics(y_valid, y_pred_valid, y_pred_proba_valid)

    for key, val in valid_metrics.items():
        mlflow.log_metric(f'Validation {key}', val)


