In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

!pip install xgboost
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from utils import train_log, eval

import warnings
warnings.filterwarnings("ignore")

# Single seed passed as random_state to every split, sampler, model and search below.
RSEED = 42


In [None]:
# Load data/yearly_cons.csv. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk (avoids mixed-type columns).
df_year = pd.read_csv(
    "data/yearly_cons.csv",
    low_memory=False,
)

In [None]:
# Remove the client identifier so it is not picked up as a feature when
# X is built from "every column except target" further down.
# errors='ignore' makes the cell idempotent: re-running it after the column
# has already been dropped no longer raises KeyError.
df_year = df_year.drop(columns='client_id', errors='ignore')

In [None]:
# Print the column names, dtypes, non-null counts and memory usage of df_year.
df_year.info()

In [None]:
import logging
import os

# Logger for this notebook; records at INFO level and above are kept.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Notebook cells get re-run, and getLogger() returns the same logger object
# every time. Detach (and close) any train.log handler left by an earlier run
# of this cell, otherwise each record would be written once per execution.
_log_path = os.path.abspath('train.log')
for _old in list(logger.handlers):
    if isinstance(_old, logging.FileHandler) and _old.baseFilename == _log_path:
        logger.removeHandler(_old)
        _old.close()

# Records go to the file train.log (not to the console).
handler = logging.FileHandler('train.log')

# Timestamp - logger name - level - message.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)

logger.addHandler(handler)


#### XGBoost training - unbalanced data ####

In [None]:
# Split the data
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import recall_score, confusion_matrix, classification_report


# Features are every column except the label column 'target'.
X = df_year.drop(columns=['target'])
y = df_year['target']

# Hold out 20% of the rows for testing. stratify=y keeps the class ratio of the
# (unbalanced) target the same in both parts, so the test set is not left with
# an unrepresentative share of positives by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RSEED, stratify=y
)

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Candidate values per hyperparameter. The search below is exhaustive:
# 3**8 = 6561 combinations, each fitted on 3 CV folds (19,683 fits plus the
# final refit), so expect this cell to run for a long time.
param_grid = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2]
}

# n_jobs=1 on the estimator: parallelism is left to GridSearchCV (n_jobs=-1).
# Requesting every core at both levels makes the workers fight over threads
# and slows the whole search down.
model = XGBClassifier(random_state=RSEED, n_jobs=1)

# 3-fold cross-validated search, ranked by recall.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    scoring='recall',
    verbose=1,
    n_jobs=-1,
)

# Fit on the training split only; the test split stays untouched.
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated recall.
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Score: {grid_search.best_score_}')

#### Model using balanced data ####

In [None]:
# Best model

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import recall_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
import numpy as np

# Features are every column except the label column 'target'.
X = df_year.drop(columns=['target'])
y = df_year['target']

# 80/20 split; stratify=y keeps the class ratio identical in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RSEED, stratify=y
)

# SMOTE is a pipeline step instead of a one-off resampling of X_train.
# Oversampling before cross-validation puts synthetic points (interpolated
# from rows of the validation fold) into the training folds, which inflates
# the CV recall and biases the parameter choice. Inside the pipeline SMOTE is
# re-fitted on the training part of every fold and skipped when predicting.
smote = SMOTE(random_state=RSEED)
# n_jobs=1: the search below already uses every core (n_jobs=-1).
model = XGBClassifier(random_state=RSEED, n_jobs=1)
pipeline = Pipeline(steps=[('smote', smote), ('model', model)])

# Candidate values; the 'model__' prefix routes each one to the XGBClassifier step.
param_dist = {
    'model__n_estimators': [100, 500, 1000],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7],
    'model__subsample': [0.6, 0.8, 1.0],
    'model__colsample_bytree': [0.6, 0.8, 1.0],
    'model__gamma': [0, 0.1, 0.2],
    'model__reg_alpha': [0, 0.1, 0.5],
    'model__reg_lambda': [1, 1.5, 2]
}

# 50 random candidates, 3-fold CV, ranked by recall.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=50,
    cv=3,
    scoring='recall',
    verbose=1,
    n_jobs=-1,
    random_state=RSEED,
)

# Fit on the original training split; resampling happens inside the pipeline.
random_search.fit(X_train, y_train)

# Pipeline (SMOTE + XGBClassifier) refitted on the whole training split with
# the best parameters; .predict()/.predict_proba() work as on a plain model.
best_model_0 = random_search.best_estimator_

# Make predictions on the untouched test data
y_pred = best_model_0.predict(X_test)

# Evaluate the model
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall:.2f}')

conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

# Print the best parameters and the mean cross-validated recall
print(f'Best Parameters: {random_search.best_params_}')
print(f'Best Score: {random_search.best_score_}')

In [None]:
from sklearn.metrics import make_scorer, recall_score, roc_auc_score, f1_score

# Recall is a threshold metric, so it is computed from hard class predictions.
# Predictions are recomputed from best_model_0 rather than taken from a
# leftover y_pred, which may belong to whichever model cell ran last.
print("Test Recall: ", recall_score(y_test, best_model_0.predict(X_test)))

# ROC AUC measures how well the scores rank positives above negatives, so it
# needs the predicted probability of the positive class (column 1), not 0/1
# labels; labels collapse the ROC curve to a single operating point.
print("Test AUC: ", roc_auc_score(y_test, best_model_0.predict_proba(X_test)[:, 1]))

In [None]:
# `eval` is the project helper from utils; importing it rebinds the name and
# hides Python's builtin eval() for the rest of the notebook.
from utils import eval
# Predictions here are made on X_train (the training split, not X_test).
# NOTE(review): the argument order (predictions first, labels second) and what
# the helper reports are not visible in this notebook - confirm against utils.eval.
eval(best_model_0.predict(X_train), y_train)

In [None]:
# Model using best parameter selection


from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import recall_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
import numpy as np

# Features are every column except the label column 'target'.
X = df_year.drop(columns=['target'])
y = df_year['target']

# 80/20 split; stratify=y keeps the class ratio identical in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RSEED, stratify=y
)

# SMOTE is a pipeline step instead of a one-off resampling of X_train.
# Oversampling before cross-validation puts synthetic points (interpolated
# from rows of the validation fold) into the training folds, which inflates
# the CV recall and biases the parameter choice. Inside the pipeline SMOTE is
# re-fitted on the training part of every fold and skipped when predicting.
smote = SMOTE(random_state=RSEED)
# n_jobs=1: the search below already uses every core (n_jobs=-1).
model = XGBClassifier(random_state=RSEED, n_jobs=1)
pipeline = Pipeline(steps=[('smote', smote), ('model', model)])

# Narrowed candidate values; the 'model__' prefix routes each one to the
# XGBClassifier step.
param_dist = {
    'model__n_estimators': [900, 1000, 1100],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [6, 7, 8],
    'model__subsample': [0.7, 0.8, 0.9],
    # colsample_bytree is a fraction of columns and must lie in (0, 1].
    # The former candidate 1.1 made XGBoost reject every fit that drew it;
    # those fits scored NaN and the warning was hidden by the global filter.
    'model__colsample_bytree': [0.8, 0.9, 1.0],
    'model__gamma': [0, 0.1, 0.2],
    'model__reg_alpha': [0.05, 0.1, 0.15],
    'model__reg_lambda': [1, 2, 3]
}

# 50 random candidates, 3-fold CV, ranked by recall.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=50,
    cv=3,
    scoring='recall',
    verbose=1,
    n_jobs=-1,
    random_state=RSEED,
)

# Fit on the original training split; resampling happens inside the pipeline.
random_search.fit(X_train, y_train)

# Pipeline (SMOTE + XGBClassifier) refitted on the whole training split with
# the best parameters.
best_model_1 = random_search.best_estimator_

# Make predictions on the untouched test data
y_pred = best_model_1.predict(X_test)

# Evaluate the model
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall:.2f}')

conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

# Print the best parameters and the mean cross-validated recall
print(f'Best Parameters: {random_search.best_params_}')
print(f'Best Score: {random_search.best_score_}')

In [None]:
# Model using best parameter selection

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import recall_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
import numpy as np

# Features are every column except the label column 'target'.
X = df_year.drop(columns=['target'])
y = df_year['target']

# 80/20 split; stratify=y keeps the class ratio identical in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RSEED, stratify=y
)

# SMOTE is a pipeline step instead of a one-off resampling of X_train.
# Oversampling before cross-validation puts synthetic points (interpolated
# from rows of the validation fold) into the training folds, which inflates
# the CV recall and biases the parameter choice. Inside the pipeline SMOTE is
# re-fitted on the training part of every fold and skipped when predicting.
smote = SMOTE(random_state=RSEED)
# n_jobs=1: the search below already uses every core (n_jobs=-1).
model = XGBClassifier(random_state=RSEED, n_jobs=1)
pipeline = Pipeline(steps=[('smote', smote), ('model', model)])

# Candidate values; the 'model__' prefix routes each one to the XGBClassifier step.
param_dist = {
    'model__n_estimators': [100, 1000, 1100],
    'model__learning_rate': [0.1, 0.2, 0.25],
    'model__max_depth': [6, 7, 8],
    'model__subsample': [0.6, 0.7, 0.8],
    'model__colsample_bytree': [0.8, 0.9, 1.0],
    'model__gamma': [0, 0.1, 0.15],
    'model__reg_alpha': [0, 0.1, 0.5],
    'model__reg_lambda': [1, 2, 3]
}

# 50 random candidates, 3-fold CV, ranked by recall.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=50,
    cv=3,
    scoring='recall',
    verbose=1,
    n_jobs=-1,
    random_state=RSEED,
)

# Fit on the original training split; resampling happens inside the pipeline.
random_search.fit(X_train, y_train)

# Pipeline (SMOTE + XGBClassifier) refitted on the whole training split with
# the best parameters.
# NOTE(review): this rebinds best_model_1, discarding the model produced by the
# preceding search cell that uses the same name - confirm that is intended.
best_model_1 = random_search.best_estimator_

# Make predictions on the untouched test data
y_pred = best_model_1.predict(X_test)

# Evaluate the model
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall:.2f}')

conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

# Print the best parameters and the mean cross-validated recall
print(f'Best Parameters: {random_search.best_params_}')
print(f'Best Score: {random_search.best_score_}')

In [None]:
# Load data/agg_lvl.csv; low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk.
agg_lvl = pd.read_csv(
    "data/agg_lvl.csv",
    low_memory=False,
)
# Last expression of the cell: render the frame.
agg_lvl

In [None]:
# List the column labels of agg_lvl.
agg_lvl.columns

In [None]:
# Count missing values per column.
agg_lvl.isna().sum()

In [None]:
# Keep only the rows that have no missing value in any column.
agg_lvl = agg_lvl.dropna()

In [None]:
# Re-count missing values per column; every count should be 0 once the rows
# containing NaN have been dropped.
agg_lvl.isna().sum()

In [None]:
# NOTE(review): same per-column missing-value count as the cell directly above;
# this cell looks redundant.
agg_lvl.isna().sum()

In [None]:
# Bar chart of the number of rows per target class.
# The column is passed by name with data=/x=: since seaborn 0.12 the first
# positional parameter of countplot is `data`, so a positional Series is no
# longer interpreted as the variable to count.
# The trailing ';' suppresses the Axes repr in the cell output.
sns.countplot(data=agg_lvl, x='target');

In [None]:
# Number of rows per distinct value of the target column.
agg_lvl['target'].value_counts()

In [None]:
# Best model

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import recall_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
import numpy as np

# Features are every column except the label column 'target'.
# NOTE(review): this cell trains on df_year with the same seed and search space
# as the first "Best model" cell, although agg_lvl was loaded and cleaned just
# above and is never used for training. Confirm whether agg_lvl was intended here.
X = df_year.drop(columns=['target'])
y = df_year['target']

# 80/20 split; stratify=y keeps the class ratio identical in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RSEED, stratify=y
)

# SMOTE is a pipeline step instead of a one-off resampling of X_train.
# Oversampling before cross-validation puts synthetic points (interpolated
# from rows of the validation fold) into the training folds, which inflates
# the CV recall and biases the parameter choice. Inside the pipeline SMOTE is
# re-fitted on the training part of every fold and skipped when predicting.
smote = SMOTE(random_state=RSEED)
# n_jobs=1: the search below already uses every core (n_jobs=-1).
model = XGBClassifier(random_state=RSEED, n_jobs=1)
pipeline = Pipeline(steps=[('smote', smote), ('model', model)])

# Candidate values; the 'model__' prefix routes each one to the XGBClassifier step.
param_dist = {
    'model__n_estimators': [100, 500, 1000],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7],
    'model__subsample': [0.6, 0.8, 1.0],
    'model__colsample_bytree': [0.6, 0.8, 1.0],
    'model__gamma': [0, 0.1, 0.2],
    'model__reg_alpha': [0, 0.1, 0.5],
    'model__reg_lambda': [1, 1.5, 2]
}

# 50 random candidates, 3-fold CV, ranked by recall.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=50,
    cv=3,
    scoring='recall',
    verbose=1,
    n_jobs=-1,
    random_state=RSEED,
)

# Fit on the original training split; resampling happens inside the pipeline.
random_search.fit(X_train, y_train)

# Pipeline (SMOTE + XGBClassifier) refitted on the whole training split with
# the best parameters. This rebinds best_model_0 from the earlier cell.
best_model_0 = random_search.best_estimator_

# Make predictions on the untouched test data
y_pred = best_model_0.predict(X_test)

# Evaluate the model
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall:.2f}')

conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

class_report = classification_report(y_test, y_pred)
print('Classification Report:')
print(class_report)

# Print the best parameters and the mean cross-validated recall
print(f'Best Parameters: {random_search.best_params_}')
print(f'Best Score: {random_search.best_score_}')