In [3]:
# import all modules needed
import models.eda as eda

import pandas as pd
import numpy as np
from IPython.display import display

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, Normalizer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import xgboost as xgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.utils import resample

from sklearn.ensemble import RandomForestClassifier
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args

In [2]:
# df_client and df_invoice are being loaded from the data folder
client = eda.load_data("data/train/client_train.csv")
invoice = eda.load_data("data/train/invoice_train.csv")
df = eda.feature_change(client, invoice)
df = eda.get_mean_consumption(df)
df = eda.get_historical_mean(df)

  invoice = eda.load_data("data/train/invoice_train.csv")


In [4]:
df.head(20)

Unnamed: 0,district,client_id,client_catg,region,creation_date,target,region_group,coop_time,invoice_date,tarif_type,...,reading_remarque,counter_coefficient,months_number,counter_type,invoice_month,invoice_year,is_weekday,total_consumption,mean_consumption_per_year,historical_mean_consumption
0,60,0,11,101,1994-12-31,0.0,200,288,2005-10-17,11,...,6,1,4,1,10,2005,0.0,124,124.0,0.0
1,60,0,11,101,1994-12-31,0.0,200,288,2006-10-18,11,...,6,1,4,1,10,2006,0.0,159,154.0,124.0
2,60,0,11,101,1994-12-31,0.0,200,288,2006-06-23,11,...,6,1,4,1,6,2006,0.0,162,154.0,141.5
3,60,0,11,101,1994-12-31,0.0,200,288,2006-02-24,11,...,6,1,4,1,2,2006,0.0,141,154.0,148.333333
4,60,0,11,101,1994-12-31,0.0,200,288,2007-02-26,11,...,6,1,4,1,2,2007,0.0,182,232.666667,146.5
5,60,0,11,101,1994-12-31,0.0,200,288,2007-10-25,11,...,6,1,4,1,10,2007,0.0,276,232.666667,153.6
6,60,0,11,101,1994-12-31,0.0,200,288,2007-06-27,11,...,6,1,4,1,6,2007,0.0,240,232.666667,174.0
7,60,0,11,101,1994-12-31,0.0,200,288,2008-01-04,11,...,6,1,4,1,1,2008,0.0,277,207.333333,183.428571
8,60,0,11,101,1994-12-31,0.0,200,288,2008-07-28,11,...,6,1,4,1,7,2008,0.0,171,207.333333,195.125
9,60,0,11,101,1994-12-31,0.0,200,288,2008-11-25,11,...,6,1,4,1,11,2008,0.0,174,207.333333,192.444444


## Baseline model
- our very simple baseline model assumes every client from district 51 is fraudulent. 

In [5]:
def baseline_model(X):
    y_pred = [0 if cat != 51 else 1 for cat in X.client_catg]
    return y_pred

# in district 51 the most fraudulents appear:
pd.crosstab(df['target'], df['client_catg'], normalize='columns')*100

client_catg,11,12,51
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,92.391486,94.413284,79.044952
1.0,7.608514,5.586716,20.955048


In [6]:
# Select the features to use for modeling
feature_cols = ['district','client_catg', 'region_group',
       'tarif_type', 'counter_number',
       'counter_statue', 'counter_code', 'reading_remarque',
       'counter_coefficient', 'months_number', 'counter_type',
       'total_consumption', 'invoice_year', 'mean_consumption_per_year', 'historical_mean_consumption']
X_train, X_test, y_train, y_test = train_test_split(df[feature_cols], df['target'], test_size=0.3, random_state=42)
y_pred = baseline_model(X_test)

print('F1-score:', precision_score(y_test, y_pred).round(2))
print('ROC AUC:', roc_auc_score(y_test, y_pred).round(2))

F1-score: 0.21
ROC AUC: 0.52


# Preprocessing

In [7]:
feature_cols = ['district','client_catg', 'region_group',
       'tarif_type', 'counter_number',
       'counter_statue', 'counter_code', 'reading_remarque',
       'counter_coefficient', 'months_number', 'counter_type',
       'total_consumption', 'invoice_year', 'mean_consumption_per_year', 'historical_mean_consumption']

## Sampling

In [8]:
# sample the data to balance the classes

# Separate majority and minority classes
df_majority = df[df.target==0]
df_minority = df[df.target==1]

# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                   replace=False,     # Sample without replacement
                                   n_samples=len(df_minority),    # Match number in minority class
                                   random_state=42)  # Reproducible results

# Combine majority class with minority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Split into X and y
X_down = df_downsampled[feature_cols]
y_down = df_downsampled['target']

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,    # Sample with replacement
                                 n_samples=len(df_majority),    # Match number in majority class
                                 random_state=42)  # Reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Split into X and y
X_up = df_upsampled[feature_cols]
y_up = df_upsampled['target']

## Test Split

In [9]:
X_train_down, X_test_down, y_train_down, y_test_down = train_test_split(X_down, y_down, test_size=0.3, random_state=42)
X_train_up, X_test_up, y_train_up, y_test_up = train_test_split(X_up, y_up, test_size=0.3, random_state=42)

## Scaling

In [10]:
# scale data

# Instantiate the scaler
scaler_down = StandardScaler()

# Fit and transform the training data
X_train_scaled_down = scaler_down.fit_transform(X_train_down)

# Transform the test data using the same scaler
X_test_scaled_down = scaler_down.transform(X_test_down)

# scale data

# Instantiate the scaler
scaler_up = StandardScaler()

# Fit and transform the training data
X_train_scaled_up = scaler_up.fit_transform(X_train_up)

# Transform the test data using the same scaler
X_test_scaled_up = scaler_up.transform(X_test_up)

## A model with better performance?

## XGBoost with GridSearch / Downsampling / 

In [11]:
# Define XGBoost model with enable_categorical=True and tree_method='hist'
xgb_model = xgb.XGBClassifier(objective='binary:logistic', seed=42, enable_categorical=True, tree_method='hist')

# Define parameter grid for grid search
param_grid = {
    'objective': ['binary:logistic'],
    'seed': [42],
    'enable_categorical': [True],
    'tree_method': ['hist'],
    'n_estimators': [300],
    'max_depth': [9],
    'learning_rate': [0.1]
}

# Create GridSearchCV object
clf = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit GridSearchCV object to training data
clf.fit(X_train_scaled_down, y_train_down)

# Predict on test data using best model
y_pred = clf.predict(X_test_scaled_down)

eda.print_metrics(y_test_down, y_pred)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Accuracy: 0.69
Precision: 0.69
Recall: 0.69
F1 Score: 0.69
ROC AUC Score: 0.69


## XGBoost with GridSearch / Upsampling / 

In [12]:
# Define XGBoost model with enable_categorical=True and tree_method='hist'
xgb_model = xgb.XGBClassifier(objective='binary:logistic', seed=42, enable_categorical=True, tree_method='hist')

# Define parameter grid for grid search
param_grid = {
    'objective': ['binary:logistic'],
    'seed': [42],
    'enable_categorical': [True],
    'tree_method': ['hist'],
    'n_estimators': [300],
    'max_depth': [9],
    'learning_rate': [0.1]
}

# Create GridSearchCV object
clf = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit GridSearchCV object to training data
clf.fit(X_train_scaled_up, y_train_up)

# Predict on test data using best model
y_pred = clf.predict(X_test_scaled_up)

eda.print_metrics(y_test_up, y_pred)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Accuracy: 0.71
Precision: 0.71
Recall: 0.72
F1 Score: 0.72
ROC AUC Score: 0.71


## XGBoost with RandomSearchCV / Downscaling

In [37]:
# Define best hyperparameters
param_grid = {
    'objective': ['binary:logistic'],
    'tree_method': ['hist'],
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20],
    'learning_rate': [1, 0.1, 0.01]
}

# Create XGBoost model with best hyperparameters
model = xgb.XGBClassifier()

random_model = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=10, cv=5, n_jobs=-1, verbose=1)

# Fit the model to the training data
random_model.fit(X_train_scaled_down, y_train_down)

print(random_model.best_params_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


KeyboardInterrupt: 

In [28]:
# Get the best hyperparameters
best_params = random_model.best_params_

# Create a new random forest classifier with the best hyperparameters
rf_best = xgb.XGBClassifier(**best_params, enable_categorical=True)

# Fit the classifier to the training data
rf_best.fit(X_train_scaled_down, y_train_down)

# Make predictions on the test data using the best classifier
y_pred_proba = rf_best.predict_proba(X_test_down)

# Convert predicted probabilities to predicted class labels
y_pred = np.argmax(y_pred_proba, axis=1)

eda.print_metrics(y_test_down, y_pred)

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:district: category, client_catg: category

# Logistic Regression

In [29]:
# Fit logistic regression model on the training data
lr = LogisticRegression(random_state=42)
lr.fit(X_train_scaled_down, y_train_down)

# Predict on test data using logistic regression model
y_pred_lr = lr.predict(X_test_scaled_down)

eda.print_metrics(y_test_down, y_pred)


Accuracy: 0.71
Precision: 0.71
Recall: 0.72
F1 Score: 0.72
ROC AUC Score: 0.71


## Random Forest

### RF with GridSearchCV

In [35]:
rf_model = RandomForestClassifier(n_estimators=100, max_depth=9, random_state=42)

rf_model.fit(X_train_scaled_down, y_train_down)

y_pred = rf_model.predict(X_test_scaled_down)

# Define parameter grid for grid search
param_grid = {
    'random_state': [42],
    'n_estimators': [300, 400, 500],
    'max_depth': [10, 15, 20],
    'verbose': [1],
    'n_jobs': [-1],
}

# Create GridSearchCV object
clf = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit GridSearchCV object to training data
clf.fit(X_train_scaled_down, y_train_down)

print("Best hyperparameters: ", clf.best_params_)

# Predict on test data using best model
y_pred = clf.predict(X_test_scaled_down)

eda.print_metrics(y_test_down, y_pred)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   25.8s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   26.2s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   26.2s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   27.1s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   27.5s
[Parallel(n_jobs=-1)]: Done  34 tasks     

Best hyperparameters:  {'max_depth': 20, 'n_estimators': 400, 'n_jobs': -1, 'random_state': 42, 'verbose': 1}


[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    3.0s
[Parallel(n_jobs=8)]: Done 400 out of 400 | elapsed:    6.0s finished


Accuracy: 0.66
Precision: 0.66
Recall: 0.65
F1 Score: 0.65
ROC AUC Score: 0.66


In [36]:
print("Best hyperparameters: ", clf.best_params_)

Best hyperparameters:  {'max_depth': 20, 'n_estimators': 400, 'n_jobs': -1, 'random_state': 42, 'verbose': 1}


### Random Forest with Scikit-Optimize

In [43]:
# Define hyperparameter search space
space = [Integer(100, 400, name='n_estimators'),
         Integer(3, 11, name='max_depth'),
         Integer(2, 15, name='min_samples_split')]

# Define objective function to minimize (in this case, negative F1 score)
@use_named_args(space)
def objective(n_estimators, max_depth, min_samples_split):
    # Create model with specified hyperparameters
    rf_model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, random_state=42)
    rf_model.fit(X_train_scaled_down, y_train_down)
    
    # Make predictions on test data and compute evaluation metrics
    y_pred = rf_model.predict(X_test_scaled_down)
    test_f1 = f1_score(y_test_down, y_pred, average='macro')
    
    # Return negative F1 score (to be minimized)
    return -test_f1

# Run Bayesian optimization using the objective function and search space
res = gp_minimize(objective, space, n_calls=50, random_state=42)

# Print best hyperparameters and evaluation metrics on test set
best_params = {'n_estimators': res.x[0],
                'max_depth': res.x[1],
                'min_samples_split': res.x[2],
                'random_state': 42,
                'verbose': 1,
                'n_jobs': -1}

eda.print_metrics(y_test_down, y_pred)

KeyboardInterrupt: 