In [None]:
import warnings
warnings.filterwarnings('ignore')
import pickle
from pathlib import Path

import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier

# 1. Load dataset
# Only these columns are used downstream. The other CSV columns
# (step, nameOrig, nameDest, isFlaggedFraud) are skipped at parse time via
# `usecols`, so the unused string account-id columns are never loaded.
cols = [
    'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
    'oldbalanceDest', 'newbalanceDest', 'isFraud'
]

# Import using pandas to read the CSV file.
# Smaller dtypes (e.g. float32 / int8 instead of the 64-bit defaults) reduce
# memory usage in cases where the extra width is not needed.
df = pd.read_csv(
    'PS_20174392719_1491204439457_log.csv',
    usecols=cols,
    dtype={
        'type': 'str',
        'amount': 'float32',
        'oldbalanceOrg': 'float32',
        'newbalanceOrig': 'float32',
        'oldbalanceDest': 'float32',
        'newbalanceDest': 'float32',
        'isFraud': 'int8',
    },
)

# 2. Keep relevant columns
# `usecols` already restricts what is read; indexing with `cols` additionally
# pins the column order to the one listed above.
df = df[cols]

# 3. Encode and clean the transaction type
# One-hot encode `type` into indicator columns. The first category is dropped
# (drop='first') and categories unseen during fit are ignored at transform
# time (handle_unknown='ignore').
ohe = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)
df_ohe = ohe.fit_transform(df[['type']])

# Reuse df's index so the concat below lines up row-for-row even when df does
# not carry a default RangeIndex; otherwise misaligned rows would turn into
# NaNs that the later fillna(0) silently hides.
train_ohe_df = pd.DataFrame(
    df_ohe,
    columns=ohe.get_feature_names_out(),
    index=df.index,
)
# Drop the original type column and append its encoded replacement columns
df = pd.concat([df.drop('type', axis=1), train_ohe_df], axis=1)

# Impute invalid entries: first map +/- infinity to 0, then fill anything
# still missing (NaN) with 0 as well.
df = df.replace(to_replace=[float('inf'), float('-inf')], value=0)
df = df.fillna(value=0)

# 4. Dataset splitting
X = df.drop('isFraud', axis=1)
y = df['isFraud']

# Perform the randomized, stratified train/test split BEFORE scaling, so the
# held-out rows play no part in fitting the scaler (avoids test-set leakage).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply that same fitted
# transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Scaled view of the full feature matrix, kept in case later cells still
# reference `X_scaled` (TODO confirm; remove if nothing uses it).
X_scaled = scaler.transform(X)

In [22]:
# 5. Train model
# Weight the positive class (isFraud == 1) by the negative/positive ratio of
# the training labels to compensate for the class imbalance.
class_counts = y_train.value_counts()
model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.7,
    scale_pos_weight=(class_counts[0] / class_counts[1]),
    eval_metric='logloss',
    random_state=123,
    # The parameter is spelled `n_jobs`; the previous `njobs` was not a
    # recognised XGBClassifier argument, so the thread count was never applied.
    n_jobs=4,
)
model.fit(X_train, y_train)

# 6. Evaluate the model's predictions on the held-out test set
y_pred = model.predict(X_test)
print("\n📊 Classification Report:\n")
print(classification_report(y_test, y_pred))


📊 Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270881
           1       0.55      0.98      0.70      1643

    accuracy                           1.00   1272524
   macro avg       0.77      0.99      0.85   1272524
weighted avg       1.00      1.00      1.00   1272524



In [23]:
# 7. Export model and other objects for use later
# Create the output directory first: open(..., "wb") does not create missing
# parent directories and would raise FileNotFoundError on a fresh checkout.
PICKLE_DIR = Path("./pickles")
PICKLE_DIR.mkdir(parents=True, exist_ok=True)

# File name -> fitted object; the encoder and scaler are saved alongside the
# model so the same preprocessing can be re-applied when the model is loaded.
artifacts = {
    "fraud_model.pkl": model,
    "scaler.pkl": scaler,
    "onehot_encoder.pkl": ohe,
}
for filename, artifact in artifacts.items():
    with open(PICKLE_DIR / filename, "wb") as f:
        pickle.dump(artifact, f)