In [3]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import f1_score, make_scorer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

In [None]:
train_data_path = './data/train.csv'
test_data_path = './data/test.csv'
output_dir = './out'
train = pd.read_csv(train_data_path)
test = pd.read_csv(test_data_path)

In [None]:
print(train.isnull().sum())
print(test.isnull().sum())

In [5]:
train['trans_datetime'] = pd.to_datetime(train['trans_date'] + ' ' + train['trans_time'])
test['trans_datetime'] = pd.to_datetime(test['trans_date'] + ' ' + test['trans_time'])

for dataset in [train, test]:
    dataset['year'] = dataset['trans_datetime'].dt.year
    dataset['month'] = dataset['trans_datetime'].dt.month
    dataset['day'] = dataset['trans_datetime'].dt.day
    dataset['hour'] = dataset['trans_datetime'].dt.hour
    dataset['minute'] = dataset['trans_datetime'].dt.minute
    dataset['dayofweek'] = dataset['trans_datetime'].dt.dayofweek

train.drop(['trans_date', 'trans_time', 'trans_datetime'], axis=1, inplace=True)
test.drop(['trans_date', 'trans_time', 'trans_datetime'], axis=1, inplace=True)

In [7]:
from sklearn.preprocessing import OrdinalEncoder
categorical_features = ['category', 'gender', 'street', 'city', 'state', 'job', 'merchant']
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
train[categorical_features] = encoder.fit_transform(train[categorical_features].astype(str))
test[categorical_features] = encoder.transform(test[categorical_features].astype(str))

In [8]:
features_to_drop = ['trans_num', 'cc_num', 'first', 'last', 'street', 'dob']
train.drop(features_to_drop, axis=1, inplace=True)
test.drop(features_to_drop, axis=1, inplace=True)

In [9]:
X = train.drop(['is_fraud', 'id'], axis=1)
y = train['is_fraud']

X_test = test.drop(['id'], axis=1)

In [None]:
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

# print("Original dataset shape:\n", y.value_counts())
# print("Resampled dataset shape:\n", y_res.value_counts())

In [None]:
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1 = make_scorer(f1_score)
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss', n_jobs=-1)
xgb_scores = cross_val_score(xgb, X_res, y_res, cv=cv, scoring=f1)
print("XGBoost F1-Score:", xgb_scores.mean())

In [13]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
}
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    verbose=2,
    n_jobs=-1
)
grid_search.fit(X_res, y_res)
print("Best parameters:", grid_search.best_params_)
print("Best F1-Score:", grid_search.best_score_)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


Parameters: { "use_label_encoder" } are not used.



Best parameters: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
Best F1-Score: 0.9919763540070902


In [14]:
best_xgb = grid_search.best_estimator_
best_xgb.fit(X_res, y_res)

Parameters: { "use_label_encoder" } are not used.



In [17]:
test_pred = best_xgb.predict(X_test)

submission = pd.DataFrame({
    'id': test['id'],
    'is_fraud': test_pred
})

submission.to_csv(os.path.join(output_dir, 'submission.csv'), index=False)
print("submission file created")


submission file created
