In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import LabelEncoder
#LabelEncoder is used to convert categorical data to numerical data
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
import kagglehub
# We may use Decision Tree or XGBoost for better results
# from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeRegressor

In [16]:
# What are tree ensemble methods?
# Ensemble methods use multiple learning algorithms to obtain better predictive performance than could be obtained from any of the constituent learning algorithms alone.
# Examples of ensemble methods include bagging, boosting, and stacking.
# Decision trees are a type of supervised learning algorithm that can be used for both classification and regression tasks.
# They work by recursively splitting the data into subsets based on the values of the input features, with the goal of creating homogeneous subsets that are easier to predict.
# Tree ensemble methods combine multiple decision trees to improve the overall performance of the model.
# Random Forest is an example of a tree ensemble method that uses bagging to create multiple decision trees and combines their predictions to make a final prediction.
# Gradient Boosting is another tree ensemble method that builds decision trees sequentially, with each tree trying to correct the errors of the previous tree.
# The final prediction is made by combining the predictions of all the trees in the ensemble.
# Tree ensemble methods can often achieve better performance than individual decision trees, as they can reduce overfitting and improve generalization to new data.
# However, they can also be more complex and computationally expensive to train and deploy.
# Overall, tree ensemble methods are a powerful tool for improving the performance of decision tree models and are widely used in machine learning applications.
# Examples of tree ensemble methods include Random Forest, Gradient Boosting, and XGBoost.
# These methods can be used for both classification and regression tasks and are particularly effective for handling complex datasets with many features and interactions.
# Tree ensemble methods are commonly used in various applications, such as fraud detection, customer churn prediction, and recommendation systems.
# They are also popular in data science competitions, such as those hosted on Kaggle, where they often achieve top performance.
# However, tree ensemble methods can also be more difficult to interpret than individual decision trees, as they involve multiple trees and complex interactions between features.
# Therefore, it is important to carefully evaluate the performance and interpretability of tree ensemble models before deploying them in real-world applications.
# Overall, tree ensemble methods are a powerful and widely used tool in machine learning that can help improve the accuracy and robustness of predictive models.
# With that background, let's implement a Gradient Boosting classifier from scratch.


In [None]:
class GradientBoostingClassifierScratch:
    """Binary classifier built by least-squares boosting of regression trees.

    The model starts from the mean of the 0/1 labels and then repeatedly fits
    a ``DecisionTreeRegressor`` to the current residuals, adding a shrunken
    copy of each tree's output to a running score. ``predict`` thresholds that
    score at 0.5.

    Note that the trees regress on raw residuals (squared-error boosting on
    the labels), not on log-loss gradients, so the score is neither clipped to
    [0, 1] nor a calibrated probability.

    Parameters
    ----------
    n_estimators : int, default=100
        Number of boosting rounds (one regression tree per round).
    learning_rate : float, default=0.1
        Shrinkage factor applied to every tree's contribution.
    max_depth : int, default=3
        ``max_depth`` passed to each ``DecisionTreeRegressor``.

    Attributes
    ----------
    trees : list
        Fitted regression trees, in the order they were trained.
    initial_prediction : float or None
        Mean of the training labels; ``None`` until ``fit`` has been called.
    """

    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.trees = []
        self.initial_prediction = None

    def fit(self, X, y):
        """Train the ensemble on features ``X`` and binary labels ``y``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features; passed unchanged to each tree.
        y : array-like of shape (n_samples,)
            Labels, expected to be 0/1 (``predict`` thresholds at 0.5).

        Returns
        -------
        self
            The fitted model, so calls can be chained.

        Raises
        ------
        ValueError
            If ``y`` is empty.
        """
        # Work on a private float copy: accepts lists / Series / int arrays and
        # guarantees the caller's labels are never modified.
        targets = np.asarray(y, dtype=float).ravel()
        if targets.size == 0:
            raise ValueError("y must contain at least one sample")

        # Start from a clean ensemble; without this a second fit() call would
        # stack new trees on top of the ones from the previous fit.
        self.trees = []
        self.initial_prediction = float(np.mean(targets))

        # Residual = what the current ensemble still gets wrong (truth - score).
        residuals = targets - self.initial_prediction
        for _ in range(self.n_estimators):
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, residuals)
            # The ensemble score grows by learning_rate * tree output, so the
            # remaining residual shrinks by exactly that amount.
            residuals = residuals - self.learning_rate * tree.predict(X)
            self.trees.append(tree)
        return self

    def decision_function(self, X):
        """Return the raw boosted score for each row of ``X``.

        The score is ``initial_prediction`` plus ``learning_rate`` times the
        sum of all tree outputs.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """
        if self.initial_prediction is None:
            raise RuntimeError(
                "GradientBoostingClassifierScratch must be fitted before predicting"
            )
        n_samples = np.shape(X)[0]
        scores = np.full(n_samples, self.initial_prediction, dtype=float)
        for tree in self.trees:
            scores += self.learning_rate * tree.predict(X)
        return scores

    def predict(self, X):
        """Predict 0/1 labels for ``X`` by thresholding the score at 0.5.

        Returns
        -------
        numpy.ndarray of int, shape (n_samples,)

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """
        return (self.decision_function(X) >= 0.5).astype(int)
# The above class implements a simple version of Gradient Boosting Classifier from scratch using decision trees as base learners.
# It includes methods for fitting the model to training data and making predictions on new data.
# Note that this is a basic implementation and may not include all the optimizations and features of more advanced libraries like XGBoost or LightGBM.
# Let's use this class to train a model on the credit card default dataset.

In [18]:
# Fetch the UCI credit-card default dataset via kagglehub; `path` is the
# local directory the files were downloaded to.
path = kagglehub.dataset_download("uciml/default-of-credit-card-clients-dataset")
# Read the single CSV shipped with the dataset into a DataFrame.
df = pd.read_csv(f"{path}/UCI_Credit_Card.csv")

In [19]:
# The dataset describes credit card clients and records, in the
# 'default.payment.next.month' column, whether each client defaulted.
# That column is the target; every other column except 'ID' is a feature.
# ('ID' is presumably just a row identifier, so it is excluded.)
X = df.drop(columns=['ID', 'default.payment.next.month'])
y = df['default.payment.next.month']

# Preview the raw frame. This must be the last expression in the cell:
# Jupyter only renders the value of a cell's final expression, so a bare
# `df.head()` placed before other statements is silently discarded.
df.head()

In [20]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits so no test information leaks in.
scale = StandardScaler().fit(X_train)
X_train, X_test = scale.transform(X_train), scale.transform(X_test)

# Train the from-scratch booster and score it on the held-out rows.
model = GradientBoostingClassifierScratch(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy of Gradient Boosting Classifier from scratch:", accuracy)

Accuracy of Gradient Boosting Classifier from scratch: 0.821


In [21]:
# Reference point: scikit-learn's random forest on the same scaled split.
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    random_state=42,
).fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy of Random Forest Classifier:", accuracy)


Accuracy of Random Forest Classifier: 0.82


In [22]:
# Reference point: XGBoost with the same number of rounds, learning rate and
# tree depth as the from-scratch booster, trained on the same scaled split.
# `use_label_encoder` is intentionally not passed: the installed XGBoost does
# not use it and only emits a "Parameters ... are not used" warning.
model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    eval_metric='logloss',
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of XGBoost Classifier:", accuracy)


Accuracy of XGBoost Classifier: 0.8208333333333333


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
