In [None]:
# import all modules needed
import models.eda as eda

import pandas as pd
import numpy as np
from IPython.display import display

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, Normalizer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import xgboost as xgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.utils import resample

from sklearn.ensemble import RandomForestClassifier
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args

In [13]:
# df_client and df_invoice are being loaded from the data folder
client = eda.load_data("data/train/client_train.csv")
invoice = eda.load_data("data/train/invoice_train.csv")
df = eda.feature_change(client, invoice)
df = eda.get_mean_consumption(df)
df = eda.get_historical_mean(df)

  invoice = eda.load_data("data/train/invoice_train.csv")


In [5]:
df.head(20)

Unnamed: 0,district,client_id,client_catg,region,creation_date,target,region_group,coop_time,invoice_date,tarif_type,...,reading_remarque,counter_coefficient,months_number,counter_type,invoice_month,invoice_year,is_weekday,total_consumption,mean_consumption_per_year,historical_mean_consumption
0,60,0,11,101,1994-12-31,0.0,200,288,2005-10-17,11,...,6,1,4,1,10,2005,0.0,124,124.0,0.0
1,60,0,11,101,1994-12-31,0.0,200,288,2006-10-18,11,...,6,1,4,1,10,2006,0.0,159,154.0,124.0
2,60,0,11,101,1994-12-31,0.0,200,288,2006-06-23,11,...,6,1,4,1,6,2006,0.0,162,154.0,141.5
3,60,0,11,101,1994-12-31,0.0,200,288,2006-02-24,11,...,6,1,4,1,2,2006,0.0,141,154.0,148.333333
4,60,0,11,101,1994-12-31,0.0,200,288,2007-02-26,11,...,6,1,4,1,2,2007,0.0,182,232.666667,146.5
5,60,0,11,101,1994-12-31,0.0,200,288,2007-10-25,11,...,6,1,4,1,10,2007,0.0,276,232.666667,153.6
6,60,0,11,101,1994-12-31,0.0,200,288,2007-06-27,11,...,6,1,4,1,6,2007,0.0,240,232.666667,174.0
7,60,0,11,101,1994-12-31,0.0,200,288,2008-01-04,11,...,6,1,4,1,1,2008,0.0,277,207.333333,183.428571
8,60,0,11,101,1994-12-31,0.0,200,288,2008-07-28,11,...,6,1,4,1,7,2008,0.0,171,207.333333,195.125
9,60,0,11,101,1994-12-31,0.0,200,288,2008-11-25,11,...,6,1,4,1,11,2008,0.0,174,207.333333,192.444444


## Baseline model
- our very simple baseline model assumes every client from district 51 is fraudulent. 

In [6]:
def baseline_model(X):
    y_pred = [0 if cat != 51 else 1 for cat in X.client_catg]
    return y_pred

# in district 51 the most fraudulents appear:
pd.crosstab(df['target'], df['client_catg'], normalize='columns')*100

client_catg,11,12,51
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,92.391486,94.413284,79.044952
1.0,7.608514,5.586716,20.955048


In [22]:
# Select the features to use for modeling
feature_cols = ['district','client_catg', 'region_group',
       'tarif_type', 'counter_number',
       'counter_statue', 'counter_code', 'reading_remarque',
       'counter_coefficient', 'months_number', 'counter_type',
       'total_consumption', 'invoice_year', 'mean_consumption_per_year', 'historical_mean_consumption']
X_train, X_test, y_train, y_test = train_test_split(df[feature_cols], df['target'], test_size=0.3, random_state=42)
y_pred = baseline_model(X_test)

print('F1-score:', precision_score(y_test, y_pred).round(2))
print('ROC AUC:', roc_auc_score(y_test, y_pred).round(2))

F1-score: 0.21
ROC AUC: 0.52


# Preprocessing

## Scaling

In [31]:
# scale data

# Instantiate the scaler
scaler = StandardScaler()

# Fit and transform the training data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test data using the same scaler
X_test_scaled = scaler.transform(X_test)

## Sampling

In [32]:
# sample the data to balance the classes

# Separate majority and minority classes
df_majority = df[df.target==0]
df_minority = df[df.target==1]

# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                   replace=False,     # Sample without replacement
                                   n_samples=len(df_minority),    # Match number in minority class
                                   random_state=42)  # Reproducible results

# Combine majority class with minority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Split into X and y
X_train_down = df_downsampled[feature_cols]
y_train_down = df_downsampled['target']

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,    # Sample with replacement
                                 n_samples=len(df_majority),    # Match number in majority class
                                 random_state=42)  # Reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Split into X and y
X_train_up = df_upsampled[feature_cols]
y_train_up = df_upsampled['target']

## A model with better performance?

## XGBoost with GridSearch / Downsampling / 

In [36]:
# Select the features to use for modeling
feature_cols = ['district','client_catg', 'region_group',
       'tarif_type', 'counter_number',
       'counter_statue', 'counter_code', 'reading_remarque',
       'counter_coefficient', 'months_number', 'counter_type',
       'total_consumption', 'invoice_year', 'mean_consumption_per_year', 'historical_mean_consumption']

X_train_down, X_test, y_train_down, y_test = train_test_split(df[feature_cols], df['target'], test_size=0.3, random_state=42)

# Define XGBoost model with enable_categorical=True and tree_method='hist'
xgb_model = xgb.XGBClassifier(objective='binary:logistic', seed=42, enable_categorical=True, tree_method='hist')

# Define parameter grid for grid search
param_grid = {
    'objective': ['binary:logistic'],
    'seed': [42],
    'enable_categorical': [True],
    'tree_method': ['hist'],
    'n_estimators': [300],
    'max_depth': [9],
    'learning_rate': [0.1]
}

# Split data into training and test sets
#X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

# Create GridSearchCV object
clf = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit GridSearchCV object to training data
clf.fit(X_train_scaled, y_train_down)

# Predict on test data using best model
y_pred = clf.predict(X_test_scaled)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Print best hyperparameters and evaluation metrics
print('Best Hyperparameters: ', clf.best_params_)
print('Accuracy: {:.2f}'.format(clf.best_score_))
print('Precision: {:.2f}'.format(precision_score(y_test, y_pred)))
print('Recall: {:.2f}'.format(recall_score(y_test, y_pred)))
print('F1 Score: {:.2f}'.format(f1_score(y_test, y_pred)))
print('ROC AUC Score: {:.2f}'.format(roc_auc_score(y_test, y_pred)))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Hyperparameters:  {'enable_categorical': True, 'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 300, 'objective': 'binary:logistic', 'seed': 42, 'tree_method': 'hist'}
Accuracy: 0.93
Precision: 0.89
Recall: 0.08
F1 Score: 0.14
ROC AUC Score: 0.54


In [37]:
# Print best hyperparameters and evaluation metrics
print('Best Hyperparameters: ', clf.best_params_)
print('Accuracy: {:.2f}'.format(clf.best_score_))
print('Precision: {:.2f}'.format(precision_score(y_test, y_pred)))
print('Recall: {:.2f}'.format(recall_score(y_test, y_pred)))
print('F1 Score: {:.2f}'.format(f1_score(y_test, y_pred)))
print('ROC AUC Score: {:.2f}'.format(roc_auc_score(y_test, y_pred)))

Best Hyperparameters:  {'enable_categorical': True, 'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 300, 'objective': 'binary:logistic', 'seed': 42, 'tree_method': 'hist'}
Accuracy: 0.93
Precision: 0.89
Recall: 0.08
F1 Score: 0.14
ROC AUC Score: 0.54


## XGBoost with GridSearch / Upsampling / 

In [18]:
# Select the features to use for modeling
feature_cols = ['district','client_catg', 'region_group',
       'tarif_type', 'counter_number',
       'counter_statue', 'counter_code', 'reading_remarque',
       'counter_coefficient', 'months_number', 'counter_type',
       'total_consumption', 'invoice_year', 'mean_consumption_per_year', 'historical_mean_consumption']

X_train, X_test, y_train, y_test = train_test_split(df[feature_cols], df['target'], test_size=0.3, random_state=42)

# Scale data
#from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, Normalizer

# Instantiate the scaler
#scaler = StandardScaler()

# Fit and transform the training data
#X_train_scaled = scaler.fit_transform(X_train)

# Transform the test data using the same scaler
#X_test_scaled = scaler.transform(X_test)

# Separate majority and minority classes
#df_majority = df[df.target==0]
#df_minority = df[df.target==1]

# Upsample minority class
#df_minority_upsampled = resample(df_minority, 
                                 replace=True,    # Sample with replacement
                                 n_samples=len(df_majority),    # Match number in majority class
                                 random_state=42)  # Reproducible results

# Combine majority class with upsampled minority class
#df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Split into X and y
#X_train = df_upsampled[feature_cols]
#y_train = df_upsampled['target']

# Scale the data
#scaler = Normalizer()
#X_train_scaled = scaler.fit_transform(X_train)

# Define XGBoost model with enable_categorical=True and tree_method='hist'
xgb_model = xgb.XGBClassifier(objective='binary:logistic', seed=42, enable_categorical=True, tree_method='hist')

# Define parameter grid for grid search
param_grid = {
    'objective': ['binary:logistic'],
    'seed': [42],
    'enable_categorical': [True],
    'tree_method': ['hist'],
    'n_estimators': [300],
    'max_depth': [9],
    'learning_rate': [0.1]
}

# Split data into training and test sets
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

# Create GridSearchCV object
clf = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit GridSearchCV object to training data
clf.fit(X_train_scaled, y_train)

# Predict on test data using best model
y_pred = clf.predict(X_test_scaled)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Print best hyperparameters and evaluation metrics
print('Best Hyperparameters: ', clf.best_params_)
print('Accuracy: {:.2f}'.format(clf.best_score_))
print('Precision: {:.2f}'.format(precision_score(y_test, y_pred)))
print('Recall: {:.2f}'.format(recall_score(y_test, y_pred)))
print('F1 Score: {:.2f}'.format(f1_score(y_test, y_pred)))
print('ROC AUC Score: {:.2f}'.format(roc_auc_score(y_test, y_pred)))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Hyperparameters:  {'enable_categorical': True, 'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 300, 'objective': 'binary:logistic', 'seed': 42, 'tree_method': 'hist'}
Accuracy: 0.75
Precision: 0.76
Recall: 0.75
F1 Score: 0.75
ROC AUC Score: 0.75


In [34]:
# Select the features to use for modeling
feature_cols = ['district','client_catg', 'region_group',
       'tarif_type', 'counter_number',
       'counter_statue', 'counter_code', 'reading_remarque',
       'counter_coefficient', 'months_number', 'counter_type',
       'total_consumption', 'invoice_year', 'mean_consumption_per_year', 'historical_mean_consumption']

X_train_up, X_test, y_train_up, y_test = train_test_split(df[feature_cols], df['target'], test_size=0.3, random_state=42)

# Define XGBoost model with enable_categorical=True and tree_method='hist'
xgb_model = xgb.XGBClassifier(objective='binary:logistic', seed=42, enable_categorical=True, tree_method='hist')

# Define parameter grid for grid search
param_grid = {
    'objective': ['binary:logistic'],
    'seed': [42],
    'enable_categorical': [True],
    'tree_method': ['hist'],
    'n_estimators': [300],
    'max_depth': [9],
    'learning_rate': [0.1]
}

# Split data into training and test sets
#X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

# Create GridSearchCV object
clf = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit GridSearchCV object to training data
clf.fit(X_train_scaled, y_train_up)

# Predict on test data using best model
y_pred = clf.predict(X_test_scaled)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Print best hyperparameters and evaluation metrics
print('Best Hyperparameters: ', clf.best_params_)
print('Accuracy: {:.2f}'.format(clf.best_score_))
print('Precision: {:.2f}'.format(precision_score(y_test, y_pred)))
print('Recall: {:.2f}'.format(recall_score(y_test, y_pred)))
print('F1 Score: {:.2f}'.format(f1_score(y_test, y_pred)))
print('ROC AUC Score: {:.2f}'.format(roc_auc_score(y_test, y_pred)))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Hyperparameters:  {'enable_categorical': True, 'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 300, 'objective': 'binary:logistic', 'seed': 42, 'tree_method': 'hist'}
Accuracy: 0.93
Precision: 0.89
Recall: 0.08
F1 Score: 0.14
ROC AUC Score: 0.54


## XGBoost with RandomSearchCV / Upscaling

In [38]:
# Select the features to use for modeling
feature_cols = ['district', 'client_catg', 'region_group', 'tarif_type', 'counter_number',
                'reading_remarque', 'months_number', 'total_consumption', 'invoice_year', 
                'counter_coefficient', 'counter_code', 'counter_statue', 'counter_type']

X_train_up, X_test, y_train_up, y_test = train_test_split(df[feature_cols], df['target'], test_size=0.3, random_state=42)

# Define best hyperparameters
best_params = {
    'objective': 'binary:logistic',
    'tree_method': 'hist',
    'n_estimators': 400,
    'max_depth': 11,
    'learning_rate': 0.1,
    'gamma': 0,
    'subsample': 0.6,
    'colsample_bytree': 1.0,
    'enable_categorical': True,
    'seed': 42
}

# Create XGBoost model with best hyperparameters
model = xgb.XGBClassifier(**best_params)

# Fit the model to the training data
model.fit(X_train_scaled, y_train_up)

# Predict on test data
y_pred = model.predict(X_test_scaled)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Print evaluation metrics
print('Best Hyperparameters: ', clf.best_params_)
print('Accuracy: {:.2f}'.format(accuracy))
print('Precision: {:.2f}'.format(precision))
print('Recall: {:.2f}'.format(recall))
print('F1 Score: {:.2f}'.format(f1))
print('ROC AUC Score: {:.2f}'.format(roc_auc))

Best Hyperparameters:  {'enable_categorical': True, 'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 300, 'objective': 'binary:logistic', 'seed': 42, 'tree_method': 'hist'}
Accuracy: 0.93
Precision: 0.89
Recall: 0.11
F1 Score: 0.20
ROC AUC Score: 0.56


# Logistic Regression

In [42]:
# Select the features to use for modeling
feature_cols = ['district','client_catg', 'region_group',
       'tarif_type', 'counter_number',
       'counter_statue', 'counter_code', 'reading_remarque',
       'counter_coefficient', 'months_number', 'counter_type',
       'total_consumption', 'invoice_year']
X_train_up, X_test, y_train_up, y_test = train_test_split(df[feature_cols], df['target'], test_size=0.3, random_state=42)

# Fit logistic regression model on the training data
lr = LogisticRegression(random_state=42)
lr.fit(X_train_scaled, y_train_up)

# Predict on test data using logistic regression model
y_pred_lr = lr.predict(X_test_scaled)

# Calculate evaluation metrics for logistic regression model
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr)
recall_lr = recall_score(y_test, y_pred_lr)
f1_lr = f1_score(y_test, y_pred_lr)
roc_auc_lr = roc_auc_score(y_test, y_pred_lr)

# Print best hyperparameters and evaluation metrics
print('Accuracy: {:.2f}'.format(accuracy_lr))
print('Precision: {:.2f}'.format(precision_lr))
print('Recall: {:.2f}'.format(recall_lr))
print('F1 Score: {:.2f}'.format(f1_lr))
print('ROC AUC Score: {:.2f}'.format(roc_auc_lr))

Accuracy: 0.92
Precision: 0.25
Recall: 0.00
F1 Score: 0.00
ROC AUC Score: 0.50


## Random Forest

### RF with GridSearchCV

In [44]:
rf_model = RandomForestClassifier(n_estimators=100, max_depth=9, random_state=42)

rf_model.fit(X_train_scaled, y_train_down)

y_pred = rf_model.predict(X_test_scaled)

# Define parameter grid for grid search
param_grid = {
    'random_state': [42],
    'n_estimators': [100],
    'max_depth': [9],
    'verbose': [1],
    'n_jobs': [-1],
}

# Create GridSearchCV object
clf = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit GridSearchCV object to training data
clf.fit(X_train_scaled, y_train_down)

# Predict on test data using best model
y_pred = clf.predict(X_test_scaled)

# Evaluate performance on test set
test_accuracy = accuracy_score(y_test, y_pred)
test_precision = precision_score(y_test, y_pred)
test_recall = recall_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)
test_roc_auc = roc_auc_score(y_test, y_pred)

# Print best hyperparameters and evaluation metrics on test set
print('Best Hyperparameters: ', clf.best_params_)
print('Train Accuracy: {:.2f}'.format(clf.best_score_))
print('Test Accuracy: {:.2f}'.format(test_accuracy))
print('Train Precision: {:.2f}'.format(clf.cv_results_['mean_test_score'][clf.best_index_]))
print('Test Precision: {:.2f}'.format(test_precision))
print('Train Recall: {:.2f}'.format(clf.cv_results_['mean_test_score'][clf.best_index_]))
print('Test Recall: {:.2f}'.format(test_recall))
print('Train F1 Score: {:.2f}'.format(clf.cv_results_['mean_test_score'][clf.best_index_]))
print('Test F1 Score: {:.2f}'.format(test_f1))
print('Train ROC AUC Score: {:.2f}'.format(clf.cv_results_['mean_test_score'][clf.best_index_]))
print('Test ROC AUC Score: {:.2f}'.format(test_roc_auc))

Fitting 5 folds for each of 1 candidates, totalling 5 fits


### Random Forest with Scikit-Optimize

In [43]:
# Select the features to use for modeling
feature_cols = ['district','client_catg', 'region_group',
       'tarif_type', 'counter_number',
       'counter_statue', 'counter_code',
        'reading_remarque', 'counter_coefficient', 
       'months_number', 'counter_type',
       'total_consumption', 'invoice_year', 
       'mean_consumption_per_year', 'historical_mean_consumption']

X_train_down, X_test, y_train_down, y_test = train_test_split(df[feature_cols], df['target'], test_size=0.3, random_state=42)

# Define hyperparameter search space
space = [Integer(100, 400, name='n_estimators'),
         Integer(3, 11, name='max_depth'),
         Integer(2, 15, name='min_samples_split')]

# Define objective function to minimize (in this case, negative F1 score)
@use_named_args(space)
def objective(n_estimators, max_depth, min_samples_split):
    # Create model with specified hyperparameters
    rf_model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, random_state=42)
    rf_model.fit(X_train_scaled, y_train_down)
    
    # Make predictions on test data and compute evaluation metrics
    y_pred = rf_model.predict(X_test_scaled)
    test_f1 = f1_score(y_test, y_pred, average='macro')
    
    # Return negative F1 score (to be minimized)
    return -test_f1

# Run Bayesian optimization using the objective function and search space
res = gp_minimize(objective, space, n_calls=50, random_state=42)

# Print best hyperparameters and evaluation metrics on test set
best_params = {'n_estimators': res.x[0],
                'max_depth': res.x[1],
                'min_samples_split': res.x[2],
                'random_state': 42,
                'verbose': 1,
                'n_jobs': -1}

rf_model = RandomForestClassifier(**best_params)
rf_model.fit(X_train_scaled, y_train_down)
y_pred = rf_model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
test_precision = precision_score(y_test, y_pred)
test_recall = recall_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)
test_roc_auc = roc_auc_score(y_test, y_pred)
print('Best Hyperparameters: ', best_params)
print('Test Accuracy: {:.2f}'.format(test_accuracy))
print('Test Precision: {:.2f}'.format(test_precision))
print('Test Recall: {:.2f}'.format(test_recall))
print('Test F1 Score: {:.2f}'.format(test_f1))
print('Test ROC AUC Score: {:.2f}'.format(test_roc_auc))

KeyboardInterrupt: 