In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN
from skopt import BayesSearchCV
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# Load both CSVs into one frame so every feature below is engineered the same
# way for all rows. Later cells split the frame again on whether `is_fraud`
# is null.
df = pd.concat([pd.read_csv("train.csv"),
               pd.read_csv("test.csv")], ignore_index=True)

# Boolean gender flag. A direct comparison avoids the previous
# map -> NaN -> astype(bool) path, where any value other than 'M'/'F'
# (including a missing value) silently became True.
df['is_male'] = df['gender'].eq('M')
df = df.drop(columns=["gender"])

# Hour of the transaction: the integer before the first ':' of `trans_time`.
df["trans_hour"] = df["trans_time"].str.split(":").str[0].astype(int)
df = df.drop(columns=["trans_time"])

df["trans_date"] = pd.to_datetime(df['trans_date'], format='%Y-%m-%d')
# Day of month and day of week are two separate features. Previously both were
# written to `trans_day`, so the day-of-month value was overwritten and lost.
df["trans_day"] = df["trans_date"].dt.day
df["trans_day_of_week"] = df["trans_date"].dt.day_of_week
# Age is approximated as the difference in calendar years (month/day ignored).
df['age'] = df['trans_date'].dt.year - \
    pd.to_datetime(df['dob'], format='%Y-%m-%d').dt.year

# Per-card daily activity. Group on the full calendar date (not day of week or
# day of month) so that only transactions made on the same actual day are
# counted together. This must run before `trans_date` is dropped.
# cumcount numbers rows in their current frame order within each group.
day_keys = ['cc_num', 'trans_date']
df['nth_trans_of_day'] = df.groupby(day_keys).cumcount() + 1
df['total_trans_of_day'] = df.groupby(
    day_keys)['nth_trans_of_day'].transform('max')

df = df.drop(columns=["trans_date", "dob"])


def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Uses the haversine formula on a sphere of radius 6371 km.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.
            All four may be scalars, NumPy arrays or pandas Series; they are
            combined element-wise.

    Returns:
        The distance in kilometres, with the same shape as the inputs.
        NaN inputs yield NaN outputs.
    """
    R = 6371  # sphere radius in km used for the distance
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * \
        np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) NaN. Clamp first.
    a = np.minimum(1.0, np.maximum(0.0, a))
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c


# Distance between the (lat, long) and (merch_lat, merch_long) coordinates of
# each row; the raw coordinates are removed once the distance is computed.
coordinate_columns = ['lat', 'long', 'merch_lat', 'merch_long']
df['distance_to_merch'] = haversine(
    *(df[name] for name in coordinate_columns))
df = df.drop(columns=coordinate_columns)

# One-hot encode `category` into boolean indicator columns, aligned on the
# frame's index, and swap them in for the original column.
category_encoder = OneHotEncoder(handle_unknown='ignore')
encoded_columns = category_encoder.fit_transform(df[['category']])
encoded_df = pd.DataFrame(
    encoded_columns.toarray(),
    columns=category_encoder.get_feature_names_out(['category']),
    index=df.index,
).astype(bool)
df = df.join(encoded_df).drop(columns=['category'])

# Standardise each continuous column independently with its own scaler.
for column in ('amt', 'city_pop', 'unix_time', 'distance_to_merch'):
    scaler = StandardScaler()
    df[column] = scaler.fit_transform(df[[column]])

# Remove the remaining identifier-like and free-text columns.
df = df.drop(columns=[
    "trans_num", "first", "last", "merchant", "job", "street", "cc_num",
    "city", "state", "zip",
])

In [None]:
# Inspect the preprocessed frame (rows from train.csv and test.csv combined).
df

In [None]:
# Rows with a label form the training set; rows without one are the set that
# is predicted for the submission file.
df_tt = df[df['is_fraud'].notnull()]
df_sub = df[df['is_fraud'].isnull()]

# `X` keeps the `id` column because later cells drop it explicitly.
X = df_tt.drop(columns=["is_fraud"])
# Store the class label as an integer. It is presumably float only because
# the unlabelled rows hold NaN in the combined frame -- TODO confirm.
y = df_tt["is_fraud"].astype(int)

# `id` is a row identifier (it keys the submission file), not a feature, so it
# is excluded from the matrix SMOTE interpolates between.
# NOTE(review): X_resampled / y_resampled are not consumed by any later cell
# visible in this notebook; the search and the final fit both use X / y.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X.drop(columns="id"), y)

In [None]:
# Inspect the feature matrix returned by SMOTE's fit_resample.
X_resampled

In [None]:
# Search ranges, each written as (low, high). skopt requires low < high; the
# previous (500, 400) range for n_estimators was reversed and is rejected by
# skopt when it builds the search space.
# Float ranges use float literals on both ends so they are clearly continuous.
param_space = {
    'n_estimators': (400, 500),
    'max_depth': (5, 40),
    'learning_rate': (0.05, 0.5),
    'subsample': (0.8, 1.0),
    'colsample_bytree': (0.8, 1.0),
}

xgb = XGBClassifier(eval_metric='logloss', objective="binary:logistic")

# Bayesian optimisation over the ranges above, scored by F1 with 5-fold CV.
opt = BayesSearchCV(
    estimator=xgb,
    search_spaces=param_space,
    scoring='f1',
    n_iter=256,
    cv=5,
    n_jobs=-1,
    random_state=42,
)

# Tune on the same features the final model is fitted on: `id` is a row
# identifier and is dropped there, so it must not be a feature here either.
opt.fit(X.drop(columns='id'), y)

print("Best parameters found: ", opt.best_params_)
print("Best cross-validation score: ", opt.best_score_)


In [None]:
# Feature importance scores of the best estimator found by the search.
opt.best_estimator_.feature_importances_

In [None]:
# Fit a fresh classifier with the tuned hyper-parameters on all labelled rows.
# The `id` column is excluded from the features.
best_params = opt.best_params_
xgb_best = XGBClassifier(
    eval_metric='logloss', objective="binary:logistic", **best_params)

train_features = X.drop(columns='id')
xgb_best.fit(train_features, y)

# Predict the unlabelled rows using the same feature columns.
submission_features = df_sub.drop(columns=['is_fraud', 'id'])
y_sub_pred = xgb_best.predict(submission_features)

# Write one (id, is_fraud) row per unlabelled transaction.
submission = pd.DataFrame({'id': df_sub['id'], 'is_fraud': y_sub_pred})
submission.to_csv('submission.csv', index=False)