# Model for HackMX NDS Cognitive Labs Challenge
## Fraud Detection
#### Dataset obtained from the IEEE-CIS Fraud Detection competition on Kaggle: https://www.kaggle.com/c/ieee-fraud-detection

In [None]:
import os
from os import listdir
from os.path import isfile, join
import matplotlib.pyplot as plt
import pickle
import time

import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit, KFold

import lightgbm as lgb

In [None]:
# Locations of the four raw CSV files, relative to the working directory.
train_identity, train_transaction = (
    "data/train_identity.csv",
    "data/train_transaction.csv",
)
test_identity, test_transaction = (
    "data/test_identity.csv",
    "data/test_transaction.csv",
)

In [None]:
#%%time # visualize time to load data
# Load the four raw tables; the files are read in the order listed.
_raw_csv_paths = (
    train_identity,
    train_transaction,
    test_identity,
    test_transaction,
)
train_id, train_tr, test_id, test_tr = (
    pd.read_csv(csv_path) for csv_path in _raw_csv_paths
)

In [None]:
def downcast_dtypes(df):
    """Shrink the numeric columns of ``df`` in place and print the memory saved.

    ``float64`` columns are cast to ``float32`` (this reduces precision).
    ``int64``/``int32`` columns are cast to ``int16`` when every value fits,
    otherwise to ``int32`` when every value fits, and are left unchanged when
    neither is wide enough.  The range check matters: an unconditional cast to
    ``int16`` silently wraps any value outside [-32768, 32767].

    Args:
        df: DataFrame to compact.  It is modified in place.

    Returns:
        The same DataFrame object, for convenient re-assignment.
    """
    _start = df.memory_usage(deep=True).sum() / 1024 ** 2

    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype in ["int64", "int32"]]

    for col in float_cols:
        df[col] = df[col].astype(np.float32)

    for col in int_cols:
        values = df[col]
        if values.empty:
            # Nothing can overflow; use the narrowest target.
            df[col] = values.astype(np.int16)
            continue
        lowest, highest = values.min(), values.max()
        # Try the narrowest type first; stop at the first one that holds the
        # full value range.  If none does, the column keeps its dtype.
        for candidate in (np.int16, np.int32):
            limits = np.iinfo(candidate)
            if limits.min <= lowest and highest <= limits.max:
                df[col] = values.astype(candidate)
                break

    _end = df.memory_usage(deep=True).sum() / 1024 ** 2
    # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
    saved = (_start - _end) / _start * 100 if _start else 0.0
    print(f"Saved {saved:.2f}%")
    return df

In [None]:
# Compact every raw table; downcast_dtypes prints the saving for each one.
train_id, train_tr, test_id, test_tr = map(
    downcast_dtypes, (train_id, train_tr, test_id, test_tr)
)

In [None]:
# Attach the identity columns to every transaction row (left join, so
# transactions without an identity record are kept with NaN identity fields).
# Only `on` is passed: pandas raises MergeError when `on` is combined with
# `left_index`/`right_index`, and both tables were loaded with read_csv's
# default index, so index alignment would not join by TransactionID anyway.
train = pd.merge(train_tr, train_id, how="left", on="TransactionID")


In [None]:
# Snapshot of the merged training table (index column and header included).
# The same path is written again later in the notebook, after preprocessing.
train.to_csv(path_or_buf="train_set.csv")

In [None]:
# Attach the identity columns to every test transaction row (left join, so
# transactions without an identity record are kept with NaN identity fields).
# Only `on` is passed: pandas raises MergeError when `on` is combined with
# `left_index`/`right_index`, and both tables were loaded with read_csv's
# default index, so index alignment would not join by TransactionID anyway.
test = pd.merge(test_tr, test_id, how="left", on="TransactionID")

In [None]:
# Sanity check of the merged table sizes.
shape_report = f"Train shape: {train.shape}, Test shape: {test.shape}"
print(shape_report)

In [None]:
# Columns kept for modelling; the "isFraud" target is listed last.
imp_features = [
    # Transaction, card, address, distance and e-mail fields
    "TransactionAmt", "ProductCD",
    "card1", "card2", "card3", "card5", "card6",
    "addr1", "addr2", "dist1", "dist2",
    "P_emaildomain", "R_emaildomain",
    # C columns
    "C1", "C2", "C4", "C5", "C6", "C7", "C8", "C9", "C10",
    "C11", "C12", "C13", "C14",
    # D columns
    "D1", "D2", "D3", "D4", "D5", "D10", "D11", "D15",
    # M columns
    "M1", "M2", "M3", "M4", "M6", "M7", "M8", "M9",
    # V columns
    "V1", "V3", "V4", "V6", "V8", "V11", "V13", "V14", "V17", "V20",
    "V23", "V26", "V27", "V30", "V36", "V37", "V40", "V41", "V44", "V47",
    "V48", "V54", "V56", "V59", "V62", "V65", "V67", "V68", "V70", "V76",
    "V78", "V80", "V82", "V86", "V88", "V89", "V91", "V107", "V108", "V111",
    "V115", "V117", "V120", "V121", "V123", "V124", "V127", "V129", "V130", "V136",
    "V138", "V139", "V142", "V147", "V156", "V160", "V162", "V165", "V166", "V169",
    "V171", "V173", "V175", "V176", "V178", "V180", "V182", "V185", "V187", "V188",
    "V198", "V203", "V205", "V207", "V209", "V210", "V215", "V218", "V220", "V221",
    "V223", "V224", "V226", "V228", "V229", "V234", "V235", "V238", "V240", "V250",
    "V252", "V253", "V257", "V258", "V260", "V261", "V264", "V266", "V267", "V271",
    "V274", "V277", "V281", "V283", "V284", "V285", "V286", "V289", "V291", "V294",
    "V296", "V297", "V301", "V303", "V305", "V307", "V309", "V310", "V314", "V320",
    # Device fields
    "DeviceType", "DeviceInfo",
    # Target
    "isFraud",
]

In [None]:
# Keep only the selected features: any column not named in imp_features goes.
_wanted = set(imp_features)
cols_to_drop_train = [name for name in train.columns if name not in _wanted]
cols_to_drop_test = [name for name in test.columns if name not in _wanted]

train = train.drop(columns=cols_to_drop_train)
test = test.drop(columns=cols_to_drop_test)

In [None]:
# Turn +/-inf into NaN first so that one fillna(0) clears every missing or
# non-finite entry in both tables.
_infinities = [np.inf, -np.inf]
train = train.replace(_infinities, np.nan).fillna(0)
test = test.replace(_infinities, np.nan).fillna(0)

In [None]:
# Integer-encode every object-dtype column.  Each column gets its own encoder,
# fitted on the train and test values together so both tables share one
# value-to-code mapping.
_object_cols = [name for name in train.columns if train[name].dtype == "object"]
for col in _object_cols:
    train_labels = list(train[col].astype(str).values)
    test_labels = list(test[col].astype(str).values)

    le = LabelEncoder()
    le.fit(train_labels + test_labels)
    train[col] = le.transform(train_labels)
    test[col] = le.transform(test_labels)

In [None]:
# Report the sizes after preprocessing, then overwrite train_set.csv with the
# processed table as bare values (no index column, no header row).
print(train.shape, test.shape, sep="\n")
train.to_csv("train_set.csv", header=False, index=False)

In [None]:
# Separate the fraud label from the model inputs; the test table is copied
# unchanged.  Copies keep later edits from touching `train` / `test`.
y_train = train["isFraud"].copy()
X_train = train.drop(columns=["isFraud"]).copy()
X_test = test.copy()

In [None]:
# Quick consistency check of the three shapes.
print(*(part.shape for part in (X_train, X_test, y_train)))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for evaluation; the fixed seed keeps the
# partition repeatable.
_holdout_parts = train_test_split(X_train, y_train, test_size=0.3, random_state=7)
X_train_split, X_test_split, y_train_split, y_test_split = _holdout_parts

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest baseline.
# NOTE(review): no random_state is passed, so repeated fits may differ.
_rf_settings = {
    "n_estimators": 500,
    "max_depth": 45,
    "max_features": 30,
    "min_samples_leaf": 200,
    "n_jobs": -1,
}
rf = RandomForestClassifier(**_rf_settings)

In [None]:
#%%time
# Fit the forest on the training part of the hold-out split.
rf.fit(X=X_train_split, y=y_train_split)

In [None]:
# ROC AUC ranks rows by a continuous score; hard 0/1 labels from predict()
# collapse that ranking to a single threshold.  Score with the predicted
# probability of the second class instead (isFraud == 1 for a 0/1 target).
rf_holdout_scores = rf.predict_proba(X_test_split)[:, 1]
print("Roc Auc Score:", roc_auc_score(y_test_split, rf_holdout_scores))

In [None]:
import pickle


In [None]:
# Persist the fitted random forest to disk.
Pkl_Filename = "modelo.pkl"

with open(Pkl_Filename, mode="wb") as model_file:
    pickle.dump(rf, model_file)

In [None]:
import lightgbm as lgb


In [None]:
# Pair each input column with the importance reported by the fitted forest.
feats = dict(zip(X_train.columns, rf.feature_importances_))

# One row per feature, a single column holding its importance.
importances = pd.DataFrame.from_dict(feats, orient="index")
importances = importances.rename(columns={0: "Gini-importance"})

# The 20 most important features, strongest first.
imp = importances.sort_values(by="Gini-importance", ascending=False).head(20)

In [None]:
# Bar chart of the 20 strongest random-forest features, then their names.
top_rf_features = imp.index
plt.figure(1, figsize=(16, 7))
plt.bar(top_rf_features, imp["Gini-importance"])
plt.xticks(top_rf_features, rotation=90)
print(top_rf_features)

In [None]:
from sklearn.model_selection import TimeSeriesSplit, KFold

# Cross-validation scheme for the LightGBM runs: plain, unshuffled K-fold.
# A TimeSeriesSplit was previously assigned to `folds` and then overwritten on
# the very next line, so it never took effect; the dead assignment is removed
# and KFold now takes its size from n_folds instead of a second literal 5.
n_folds = 5
folds = KFold(n_splits=n_folds)

In [None]:
# State shared with the cross-validation loop.
columns = X_train.columns
splits = folds.split(X_train, y_train)  # (train_index, valid_index) pairs

# Accumulators: out-of-fold predictions, fold-averaged test predictions and
# the running mean of the per-fold AUC.
y_oof = np.zeros(len(X_train))
y_preds = np.zeros(len(X_test))
score_auc = 0

# One row per feature; the loop adds one "fold_<k>" importance column per fold.
feature_importances = pd.DataFrame({"feature": columns})

In [None]:
# LightGBM settings used for every cross-validation fold.
# NOTE(review): no "bagging_freq" is set; per the LightGBM parameter docs
# "bagging_fraction" presumably has no effect without it — confirm.
params = {
    # Task and evaluation
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    # Tree shape
    "num_leaves": 64,
    "max_depth": -1,
    "min_data_in_leaf": 80,
    "min_child_weight": 0.03,
    # Step size and regularisation
    "learning_rate": 0.006,
    "reg_alpha": 0.3,
    "reg_lambda": 0.6,
    # Column / row subsampling
    "feature_fraction": 0.04,
    "bagging_fraction": 0.33,
    "bagging_seed": 7,
    # Seeding and logging
    "random_state": 0,
    "verbosity": -1,
}

In [None]:
# Train one booster per fold.  Each fold contributes:
#   * its out-of-fold predictions (written into y_oof),
#   * 1/n_folds of its AUC (accumulated into score_auc),
#   * 1/n_folds of its test-set predictions (accumulated into y_preds),
#   * one "fold_<k>" column of feature importances.
for fold_n, (train_index, valid_index) in enumerate(splits):
    X_tr, X_val = X_train[columns].iloc[train_index], X_train[columns].iloc[valid_index]
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]

    dtrain = lgb.Dataset(X_tr, label=y_tr)
    dvalid = lgb.Dataset(X_val, label=y_val)

    # Early stopping (100 rounds) and periodic logging (every 200 rounds) are
    # passed as callbacks: current LightGBM releases no longer accept the
    # `early_stopping_rounds` / `verbose_eval` keyword arguments of train().
    # `lgb.log_evaluation` requires LightGBM >= 3.3.
    clf = lgb.train(
        params,
        dtrain,
        10000,
        valid_sets=[dtrain, dvalid],
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(200)],
    )

    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()

    y_pred_val = clf.predict(X_val)
    y_oof[valid_index] = y_pred_val

    # Compute the fold AUC once and reuse it for the log line and the mean.
    fold_auc = roc_auc_score(y_val, y_pred_val)
    print(f"Fold {fold_n + 1} | AUC: {fold_auc}")
    score_auc += fold_auc / n_folds

    y_preds += clf.predict(X_test) / n_folds

    # Release the fold slices before the next iteration allocates new ones.
    del X_tr, X_val, y_tr, y_val

In [None]:
# Summary: mean of the per-fold AUCs, then the AUC over all out-of-fold
# predictions taken together.
print(f"\nMean AUC = {score_auc}")
oof_auc = roc_auc_score(y_train, y_oof)
print(f"Out of folds AUC = {oof_auc}")

In [None]:
# Average the per-fold importance columns into one score per feature.
_fold_columns = ["fold_{}".format(fold + 1) for fold in range(folds.n_splits)]
feature_importances["average"] = feature_importances[_fold_columns].mean(axis=1)

In [None]:
# The 20 features with the highest mean importance, strongest first.
_ranked = feature_importances[["feature", "average"]].sort_values(
    by="average", ascending=False
)
f = _ranked.head(20)

In [None]:
# Bar chart of the 20 strongest LightGBM features (mean importance over folds).
plt.figure(1, figsize=(16, 7))
plt.bar(f["feature"], f["average"])
plt.xticks(f["feature"], rotation=90)
# Print the features shown in this chart.  The original printed `imp.index`,
# which is the random-forest ranking and does not match the plotted data.
print(f["feature"].tolist())

In [None]:
# Settings for the final LGBMClassifier.
# NOTE(review): several keys look like LightGBM aliases of one another with
# different values ("colsample_bytree"/"feature_fraction",
# "subsample"/"bagging_fraction", "min_child_samples"/"min_data_in_leaf",
# "n_estimators"/"num_boost_round") — confirm against the LightGBM parameter
# docs which value of each pair takes effect.
clf_params = {
    "boosting_type": "gbdt",
    "class_weight": None,
    "colsample_bytree": 1.0,
    "importance_type": "split",
    "learning_rate": 0.006,
    "max_depth": -1,
    "min_child_samples": 20,
    "min_child_weight": 0.03,
    "min_split_gain": 0.0,
    "n_estimators": 100,
    "n_jobs": -1,
    "num_leaves": 64,
    "objective": "binary",
    "random_state": 0,
    "reg_alpha": 0.3,
    "reg_lambda": 0.6,
    "silent": True,
    "subsample": 1.0,
    "subsample_for_bin": 200000,
    "subsample_freq": 0,
    # Keys below mirror the `params` dict used for cross-validation.
    "feature_fraction": 0.04,
    "bagging_fraction": 0.33,
    "min_data_in_leaf": 80,
    "bagging_seed": 7,
    "metric": "auc",
    "verbosity": -1,
    # Number of boosting rounds for the final fit.
    "num_boost_round": 5575,
}

In [None]:
# Build the final classifier from the settings collected in clf_params.
final_clf = lgb.LGBMClassifier(**clf_params)

In [None]:
# Fit the final classifier on the complete labelled training set.
final_clf.fit(X=X_train, y=y_train)

In [None]:
# Persist the fitted LightGBM classifier to disk.
Pkl_Filename = "lgbm_model.pkl"

with open(Pkl_Filename, mode="wb") as model_file:
    pickle.dump(final_clf, model_file)

In [None]:
# ROC AUC needs a continuous score, so use the predicted probability of the
# second class (isFraud == 1 for a 0/1 target) rather than hard predict() labels.
# NOTE: final_clf was fitted on all of X_train, and X_test_split was carved
# out of X_train, so these rows were seen during fitting — treat this number
# as optimistic, not as a validation score.
lgbm_holdout_scores = final_clf.predict_proba(X_test_split)[:, 1]
print("Roc Auc Score:", roc_auc_score(y_test_split, lgbm_holdout_scores))