<a href="https://colab.research.google.com/github/vetheshwaran/.github-workflows/blob/main/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import warnings, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import (roc_auc_score, average_precision_score, RocCurveDisplay,
                             PrecisionRecallDisplay, confusion_matrix, classification_report)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import precision_recall_curve
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

In [20]:
warnings.filterwarnings("ignore")

In [21]:
# Load the transaction table; the path is resolved relative to the kernel's
# working directory.
df = pd.read_csv(filepath_or_buffer="Fraud.csv")

In [22]:
# Fail fast if the loaded frame lacks any column the later cells read, and say
# which ones are absent so the failure is actionable.
required_columns = {
    "step", "type", "amount", "nameOrig", "oldbalanceOrg", "newbalanceOrig",
    "nameDest", "oldbalanceDest", "newbalanceDest", "isFraud", "isFlaggedFraud",
}
missing_columns = required_columns - set(df.columns)
assert not missing_columns, f"Missing required columns: {sorted(missing_columns)}"

In [23]:
# 1 when the destination account name starts with "M", else 0.
# na=False maps a missing nameDest to 0; without it the NaN produced by
# .str.startswith would make the int8 cast raise.
df["isMerchantDest"] = df["nameDest"].str.startswith("M", na=False).astype("int8")

In [24]:
# Destination balances: record where the value was missing, then impute 0.
# Merchant and non-merchant rows receive the same fill value, so one
# unconditional fillna covers both cases.
for col in ["oldbalanceDest", "newbalanceDest"]:
    # The indicator must be computed before the NaNs are overwritten.
    df[col + "_missing"] = df[col].isna().astype("int8")
    df[col] = df[col].fillna(0)

In [25]:
# Origin balances: keep a 0/1 missingness indicator, then replace NaN with 0.
for col in ("oldbalanceOrg", "newbalanceOrig"):
    was_missing = df[col].isna()  # captured before the fill overwrites the NaNs
    df[f"{col}_missing"] = was_missing.astype("int8")
    df[col] = df[col].fillna(0)

In [26]:
# Calendar-style features derived from `step`.
# NOTE(review): this treats `step` as an hour counter; confirm against the data source.
step = df["step"]
df["hour"] = step % 24
# NOTE(review): `% 30` folds whole-day index 30 back onto 0; confirm that is
# intended if `step` can reach 720 or more.
df["day"] = (step // 24) % 30

# Encode hour-of-day on the unit circle so hour 23 sits next to hour 0.
hour_angle = 2*np.pi*df["hour"]/24
df["hour_sin"] = np.sin(hour_angle)
df["hour_cos"] = np.cos(hour_angle)

In [27]:
# Balance-consistency residuals: each is the relevant balance plus the
# transaction amount, minus the balance on the other side of the transaction.
amount = df["amount"]
orig_after_plus_amount = df["newbalanceOrig"] + amount
dest_before_plus_amount = df["oldbalanceDest"] + amount
df["deltaOrg"] = orig_after_plus_amount - df["oldbalanceOrg"]
df["deltaDest"] = dest_before_plus_amount - df["newbalanceDest"]


In [28]:
# Amount relative to the pre-transaction balance on each side.
# `eps` keeps the denominator non-zero for zero balances; the ratio is then
# clipped into [0, 10].
eps = 1e-6
for ratio_col, balance_col in (
    ("amt_over_oldOrg", "oldbalanceOrg"),
    ("amt_over_oldDest", "oldbalanceDest"),
):
    raw_ratio = df["amount"]/(df[balance_col]+eps)
    df[ratio_col] = np.clip(raw_ratio, 0, 10)

In [29]:
# log(1 + x) transforms of the amount and the four balance columns.
# The dict order fixes the order in which the new columns are appended.
log_sources = {
    "log_amount": "amount",
    "log_oldOrg": "oldbalanceOrg",
    "log_newOrg": "newbalanceOrig",
    "log_oldDest": "oldbalanceDest",
    "log_newDest": "newbalanceDest",
}
for log_col, raw_col in log_sources.items():
    df[log_col] = np.log1p(df[raw_col])

In [30]:
# One-hot encode `type`, dropping the first level as the reference category.
# The rendered df.head() output further down shows these dummies as True/False.
df = pd.get_dummies(
    data=df,
    columns=["type"],
    drop_first=True,
)

In [31]:
# Remove the account-name columns now that isMerchantDest has been derived
# from nameDest.
account_name_columns = ["nameOrig", "nameDest"]
df = df.drop(account_name_columns, axis=1)

In [32]:
# Candidate columns for the multicollinearity (VIF) check.
base_features_for_vif = [
    # log-scaled amount and balances
    "log_amount", "log_oldOrg", "log_newOrg", "log_oldDest", "log_newDest",
    # balance residuals and clipped amount/balance ratios
    "deltaOrg", "deltaDest", "amt_over_oldOrg", "amt_over_oldDest",
    # cyclical hour encoding and binary flags
    "hour_sin", "hour_cos", "isMerchantDest", "isFlaggedFraud",
    # missingness indicators
    "oldbalanceDest_missing", "newbalanceDest_missing",
    "oldbalanceOrg_missing", "newbalanceOrig_missing",
    # whichever one-hot `type_` columns exist in the frame
    *(name for name in df.columns if name.startswith("type_")),
]

In [33]:
# Keep only the candidates that are actually present in the frame.
base_features_for_vif = [c for c in base_features_for_vif if c in df.columns]

# Build the design matrix for the VIF computation.
# np.number does not match boolean columns, so the boolean `type_` dummies
# would be filtered out here; select them explicitly and cast everything to
# float64 so the matrix is a single numeric dtype.
X_vif = (
    df[base_features_for_vif]
    .select_dtypes(include=[np.number, "bool"])
    .astype("float64")
    .fillna(0)
)
# Prepend an intercept column named "const".
X_vif = sm.add_constant(X_vif)

In [34]:
def compute_vif(frame):
    """Compute the variance inflation factor for each column of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Design matrix. A column named ``"const"`` stays in the matrix passed to
        ``variance_inflation_factor`` but is left out of the report.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``feature`` and ``VIF``, sorted by ``VIF`` descending.
    """
    # Convert once, outside the loop: the matrix is the same for every column,
    # and forcing float avoids an object-dtype array when the frame mixes
    # boolean and numeric columns.
    design = frame.to_numpy(dtype=float)
    vifs = [
        # `i` indexes the full matrix, so skipped columns still keep their slot.
        (col, variance_inflation_factor(design, i))
        for i, col in enumerate(frame.columns)
        if col != "const"
    ]
    return pd.DataFrame(vifs, columns=["feature","VIF"]).sort_values("VIF", ascending=False)

In [35]:
# VIF table for the full candidate set, before any feature is removed.
vif_before = compute_vif(frame=X_vif)

In [36]:
# Remove log-balance features whose VIF is above the cut-off. Only these four
# columns are eligible; every other feature is kept whatever its VIF.
vif_threshold = 20
droppable_log_features = ["log_oldOrg", "log_newOrg", "log_oldDest", "log_newDest"]

is_collinear_log = (
    (vif_before["VIF"] > vif_threshold)
    & vif_before["feature"].isin(droppable_log_features)
)
to_drop = vif_before.loc[is_collinear_log, "feature"].tolist()

# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df = df.drop(columns=to_drop, errors="ignore")

In [37]:
# Preview the first five rows of the engineered frame.
df.head(n=5)

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,isMerchantDest,oldbalanceDest_missing,...,amt_over_oldOrg,amt_over_oldDest,log_amount,log_oldOrg,log_newOrg,log_newDest,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,1,9839.64,170136.0,160296.36,0.0,0.0,0,0,1,0,...,0.057834,10.0,9.194276,12.044359,11.984786,0.0,False,False,True,False
1,1,1864.28,21249.0,19384.72,0.0,0.0,0,0,1,0,...,0.087735,10.0,7.531166,9.964112,9.872292,0.0,False,False,True,False
2,1,181.0,181.0,0.0,0.0,0.0,1,0,0,0,...,1.0,10.0,5.204007,5.204007,0.0,0.0,False,False,False,True
3,1,181.0,181.0,0.0,21182.0,0.0,1,0,0,0,...,1.0,0.008545,5.204007,5.204007,0.0,0.0,True,False,False,False
4,1,11668.14,41554.0,29885.86,0.0,0.0,0,0,1,0,...,0.280795,10.0,9.364703,10.634773,10.305174,0.0,False,False,True,False


