In [100]:
# import all modules needed
import models.eda as eda

import pandas as pd
import numpy as np
from IPython.display import display

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, Normalizer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

import xgboost as xgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


In [101]:
# df_client and df_invoice are being loaded from the data folder
client = eda.load_data("data/train/client_train.csv")
invoice = eda.load_data("data/train/invoice_train.csv")
df = eda.feature_change(client, invoice)

  invoice = eda.load_data("data/train/invoice_train.csv")


In [102]:
df.head(20)

Unnamed: 0,district,client_id,client_catg,region,creation_date,target,region_group,coop_time,invoice_date,tarif_type,...,counter_statue,counter_code,reading_remarque,counter_coefficient,months_number,counter_type,invoice_month,invoice_year,is_weekday,total_consumption
0,60,0,11,101,1994-12-31,0.0,200,288,2014-03-24,11,...,0,203,8,1,4,1,3,2014,0.0,82
1,60,0,11,101,1994-12-31,0.0,200,288,2013-03-29,11,...,0,203,6,1,4,1,3,2013,0.0,1384
2,60,0,11,101,1994-12-31,0.0,200,288,2015-03-23,11,...,0,203,8,1,4,1,3,2015,0.0,123
3,60,0,11,101,1994-12-31,0.0,200,288,2015-07-13,11,...,0,207,8,1,4,1,7,2015,0.0,102
4,60,0,11,101,1994-12-31,0.0,200,288,2016-11-17,11,...,0,207,9,1,12,1,11,2016,0.0,572
5,60,0,11,101,1994-12-31,0.0,200,288,2017-07-17,11,...,0,207,9,1,8,1,7,2017,0.0,314
6,60,0,11,101,1994-12-31,0.0,200,288,2018-12-07,11,...,0,207,9,1,12,1,12,2018,0.0,541
7,60,0,11,101,1994-12-31,0.0,200,288,2019-03-19,11,...,0,207,9,1,8,1,3,2019,0.0,585
8,60,0,11,101,1994-12-31,0.0,200,288,2011-07-22,11,...,0,203,9,1,4,1,7,2011,0.0,1386
9,60,0,11,101,1994-12-31,0.0,200,288,2011-11-22,11,...,0,203,6,1,4,1,11,2011,0.0,1082


## Baseline model
- our very simple baseline model assumes every client from district 51 is fraudulent. 

In [103]:
def baseline_model(X):
    y_pred = [0 if cat != 51 else 1 for cat in X.client_catg]
    return y_pred

# in district 51 the most fraudulents appear:
pd.crosstab(df['target'], df['client_catg'], normalize='columns')*100

client_catg,11,12,51
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,92.391486,94.413284,79.044952
1.0,7.608514,5.586716,20.955048


In [104]:
# Select the features to use for modeling
feature_cols = ['district','client_catg', 'region_group',
       'tarif_type', 'counter_number',
       'counter_statue', 'counter_code', 'reading_remarque',
       'counter_coefficient', 'months_number', 'counter_type',
       'total_consumption', 'invoice_year']
X_train, X_test, y_train, y_test = train_test_split(df[feature_cols], df['target'], test_size=0.3, random_state=42)
y_pred = baseline_model(X_test)

print('F1-score:', precision_score(y_test, y_pred).round(2))
print('ROC AUC:', roc_auc_score(y_test, y_pred).round(2))


# Preprocessing

## Scaling

In [120]:
# scale data 

# Instantiate the scaler
scaler = Normalizer()

# Fit and transform the training data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test data using the same scaler
X_test_scaled = scaler.transform(X_test)

## Sampling

In [121]:
# sample the data to balance the classes

# Separate majority and minority classes
df_majority = df[df.target==0]
df_minority = df[df.target==1]

# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                   replace=False,     # Sample without replacement
                                   n_samples=len(df_minority),    # Match number in minority class
                                   random_state=42)  # Reproducible results

# Combine majority class with minority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Split into X and y
X_train = df_downsampled[feature_cols]
y_train = df_downsampled['target']

## A model with better performance?

### every model has his own preprocessing and scaling

## XGBoost with GridSearch / Downsampling / 

In [124]:
# Separate majority and minority classes
df_majority = df[df.target==0]
df_minority = df[df.target==1]

# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                   replace=False,     # Sample without replacement
                                   n_samples=len(df_minority),    # Match number in minority class
                                   random_state=42)  # Reproducible results

# Combine majority class with minority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Split into X and y
X = df_downsampled[feature_cols]
y = df_downsampled['target']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define XGBoost model with enable_categorical=True and tree_method='hist'
xgb_model = xgb.XGBClassifier(objective='binary:logistic', seed=42, enable_categorical=True, tree_method='hist')

# Define parameter grid for grid search
param_grid = {
    'objective': ['binary:logistic'],
    'seed': [42],
    'enable_categorical': [True],
    'tree_method': ['hist'],
    'n_estimators': [300],
    'max_depth': [9],
    'learning_rate': [0.1]
}

# Create GridSearchCV object
clf = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit GridSearchCV object to training data
clf.fit(X_train_scaled, y_train)

# Predict on test data using best model
y_pred = clf.predict(X_test_scaled)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Print best hyperparameters and evaluation metrics
print('Best Hyperparameters: ', clf.best_params_)
print('Accuracy: {:.2f}'.format(clf.best_score_))
print('Precision: {:.2f}'.format(precision_score(y_test, y_pred)))
print('Recall: {:.2f}'.format(recall_score(y_test, y_pred)))
print('F1 Score: {:.2f}'.format(f1_score(y_test, y_pred)))
print('ROC AUC Score: {:.2f}'.format(roc_auc_score(y_test, y_pred)))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Hyperparameters:  {'enable_categorical': True, 'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 300, 'objective': 'binary:logistic', 'seed': 42, 'tree_method': 'hist'}
Accuracy: 0.65
Precision: 0.66
Recall: 0.66
F1 Score: 0.66
ROC AUC Score: 0.66


## XGBoost with GridSearch / Upsampling / 

In [123]:
# Scale data
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, Normalizer

# Instantiate the scaler
scaler = StandardScaler()

# Fit and transform the training data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test data using the same scaler
X_test_scaled = scaler.transform(X_test)

from sklearn.utils import resample
from sklearn.model_selection import train_test_split

# Separate majority and minority classes
df_majority = df[df.target==0]
df_minority = df[df.target==1]

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,    # Sample with replacement
                                 n_samples=len(df_majority),    # Match number in majority class
                                 random_state=42)  # Reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Split into X and y
X_train = df_upsampled[feature_cols]
y_train = df_upsampled['target']

# Scale the data
scaler = Normalizer()
X_train_scaled = scaler.fit_transform(X_train)

# Define XGBoost model with enable_categorical=True and tree_method='hist'
xgb_model = xgb.XGBClassifier(objective='binary:logistic', seed=42, enable_categorical=True, tree_method='hist')

# Define parameter grid for grid search
param_grid = {
    'objective': ['binary:logistic'],
    'seed': [42],
    'enable_categorical': [True],
    'tree_method': ['hist'],
    'n_estimators': [300],
    'max_depth': [9],
    'learning_rate': [0.1]
}

# Split data into training and test sets
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

# Create GridSearchCV object
clf = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit GridSearchCV object to training data
clf.fit(X_train_scaled, y_train)

# Predict on test data using best model
y_pred = clf.predict(X_test_scaled)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Print best hyperparameters and evaluation metrics
print('Best Hyperparameters: ', clf.best_params_)
print('Accuracy: {:.2f}'.format(clf.best_score_))
print('Precision: {:.2f}'.format(precision_score(y_test, y_pred)))
print('Recall: {:.2f}'.format(recall_score(y_test, y_pred)))
print('F1 Score: {:.2f}'.format(f1_score(y_test, y_pred)))
print('ROC AUC Score: {:.2f}'.format(roc_auc_score(y_test, y_pred)))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Hyperparameters:  {'enable_categorical': True, 'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 300, 'objective': 'binary:logistic', 'seed': 42, 'tree_method': 'hist'}
Accuracy: 0.72
Precision: 0.72
Recall: 0.71
F1 Score: 0.72
ROC AUC Score: 0.72


## XGBoost with RandomSearchCV

In [116]:
# Select the features to use for modeling
feature_cols = ['district', 'client_catg', 'region_group', 'tarif_type', 'counter_number',
                'reading_remarque', 'months_number', 'total_consumption', 'invoice_year', 
                'counter_coefficient', 'counter_code', 'counter_statue', 'counter_type']

X_train, X_test, y_train, y_test = train_test_split(df[feature_cols], df['target'], test_size=0.3, random_state=42)

# Separate majority and minority classes
df_majority = df[df.target==0]
df_minority = df[df.target==1]

# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                   replace=False,     # Sample without replacement
                                   n_samples=len(df_minority),    # Match number in minority class
                                   random_state=42)  # Reproducible results

# Combine majority class with minority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Split into X and y
X_train = df_downsampled[feature_cols]
y_train = df_downsampled['target']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

############################################
# SCALER data using a chosen scaler
scaler = MinMaxScaler() # RobustScaler() # StandardScaler() # Normalizer()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

############################################
# Define best hyperparameters
best_params = {
    'objective': 'binary:logistic',
    'tree_method': 'hist',
    'n_estimators': 400,
    'max_depth': 11,
    'learning_rate': 0.1,
    'gamma': 0,
    'subsample': 0.6,
    'colsample_bytree': 1.0,
    'enable_categorical': True,
    'seed': 42
}

# Create XGBoost model with best hyperparameters
model = xgb.XGBClassifier(**best_params)

# Fit the model to the training data
model.fit(X_train_scaled, y_train)

# Predict on test data
y_pred = model.predict(X_test_scaled)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Print evaluation metrics
print('Best Hyperparameters: ', clf.best_params_)
print('Accuracy: {:.2f}'.format(accuracy))
print('Precision: {:.2f}'.format(precision))
print('Recall: {:.2f}'.format(recall))
print('F1 Score: {:.2f}'.format(f1))
print('ROC AUC Score: {:.2f}'.format(roc_auc))



Best Hyperparameters:  {'enable_categorical': True, 'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 300, 'objective': 'binary:logistic', 'seed': 42, 'tree_method': 'hist'}
Accuracy: 0.69
Precision: 0.69
Recall: 0.70
F1 Score: 0.69
ROC AUC Score: 0.69


# Logistic Regression

In [None]:
# Import necessary libraries
from sklearn.linear_model import LogisticRegression

# Select the features to use for modeling
feature_cols = ['district','client_catg', 'region_group',
       'tarif_type', 'counter_number',
       'counter_statue', 'counter_code', 'reading_remarque',
       'counter_coefficient', 'months_number', 'counter_type',
       'total_consumption', 'invoice_year']
X_train, X_test, y_train, y_test = train_test_split(df[feature_cols], df['target'], test_size=0.3, random_state=42)

# Separate majority and minority classes
df_majority = df[df.target==0]
df_minority = df[df.target==1]

# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                   replace=False,     # Sample without replacement
                                   n_samples=len(df_minority),    # Match number in minority class
                                   random_state=42)  # Reproducible results

# Combine majority class with minority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Split into X and y
X_train = df_downsampled[feature_cols]
y_train = df_downsampled['target']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.3, random_state=42)


############################################
# SCALER data using a chosen scaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


############################################
# Fit logistic regression model on the training data
lr = LogisticRegression(random_state=42)
lr.fit(X_train_scaled, y_train)

# Predict on test data using logistic regression model
y_pred_lr = lr.predict(X_test_scaled)

# Calculate evaluation metrics for logistic regression model
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr)
recall_lr = recall_score(y_test, y_pred_lr)
f1_lr = f1_score(y_test, y_pred_lr)
roc_auc_lr = roc_auc_score(y_test, y_pred_lr)

# Print best hyperparameters and evaluation metrics
print('Accuracy: {:.2f}'.format(accuracy_lr))
print('Precision: {:.2f}'.format(precision_lr))
print('Recall: {:.2f}'.format(recall_lr))
print('F1 Score: {:.2f}'.format(f1_lr))
print('ROC AUC Score: {:.2f}'.format(roc_auc_lr))

Accuracy: 0.57
Precision: 0.56
Recall: 0.65
F1 Score: 0.60
ROC AUC Score: 0.57


## Random Forest

### RF with GridSearchCV

In [127]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

rf_model = RandomForestClassifier(n_estimators=100, max_depth=9, random_state=42)

rf_model.fit(X_train_scaled, y_train)

y_pred = rf_model.predict(X_test_scaled)

# Define parameter grid for grid search
param_grid = {
    'random_state': [42],
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [3, 6, 9, 11],
    'min_samples_split': [2, 5, 10, 15],
    'verbose': [1],
    'n_jobs': [-1],
}

# Create GridSearchCV object
clf = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)

# Fit GridSearchCV object to training data
clf.fit(X_train_scaled, y_train)

# Predict on test data using best model
y_pred = clf.predict(X_test_scaled)

# Evaluate performance on test set
test_accuracy = accuracy_score(y_test, y_pred)
test_precision = precision_score(y_test, y_pred)
test_recall = recall_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)
test_roc_auc = roc_auc_score(y_test, y_pred)

# Print best hyperparameters and evaluation metrics on test set
print('Best Hyperparameters: ', clf.best_params_)
print('Train Accuracy: {:.2f}'.format(clf.best_score_))
print('Test Accuracy: {:.2f}'.format(test_accuracy))
print('Train Precision: {:.2f}'.format(clf.cv_results_['mean_test_score'][clf.best_index_]))
print('Test Precision: {:.2f}'.format(test_precision))
print('Train Recall: {:.2f}'.format(clf.cv_results_['mean_test_score'][clf.best_index_]))
print('Test Recall: {:.2f}'.format(test_recall))
print('Train F1 Score: {:.2f}'.format(clf.cv_results_['mean_test_score'][clf.best_index_]))
print('Test F1 Score: {:.2f}'.format(test_f1))
print('Train ROC AUC Score: {:.2f}'.format(clf.cv_results_['mean_test_score'][clf.best_index_]))
print('Test ROC AUC Score: {:.2f}'.format(test_roc_auc))

Fitting 5 folds for each of 64 candidates, totalling 320 fits


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.4s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done  34 tasks     

KeyboardInterrupt: 

In [130]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args

# Define hyperparameter search space
space = [Integer(100, 400, name='n_estimators'),
         Integer(3, 11, name='max_depth'),
         Integer(2, 15, name='min_samples_split')]

# Define objective function to minimize (in this case, negative F1 score)
@use_named_args(space)
def objective(n_estimators, max_depth, min_samples_split):
    # Create model with specified hyperparameters
    rf_model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, random_state=42)
    rf_model.fit(X_train_scaled, y_train)
    
    # Make predictions on test data and compute evaluation metrics
    y_pred = rf_model.predict(X_test_scaled)
    test_f1 = f1_score(y_test, y_pred, average='macro')
    
    # Return negative F1 score (to be minimized)
    return -test_f1

# Run Bayesian optimization using the objective function and search space
res = gp_minimize(objective, space, n_calls=50, random_state=42)

# Print best hyperparameters and evaluation metrics on test set
best_params = {'n_estimators': res.x[0], 'max_depth': res.x[1], 'min_samples_split': res.x[2], 'random_state': 42, 'verbose': 1, 'n_jobs': -1}

rf_model = RandomForestClassifier(**best_params)
rf_model.fit(X_train_scaled, y_train)
y_pred = rf_model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
test_precision = precision_score(y_test, y_pred)
test_recall = recall_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)
test_roc_auc = roc_auc_score(y_test, y_pred)
print('Best Hyperparameters: ', best_params)
print('Test Accuracy: {:.2f}'.format(test_accuracy))
print('Test Precision: {:.2f}'.format(test_precision))
print('Test Recall: {:.2f}'.format(test_recall))
print('Test F1 Score: {:.2f}'.format(test_f1))
print('Test ROC AUC Score: {:.2f}'.format(test_roc_auc))


KeyboardInterrupt: 