In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

Loading the dataset

In [2]:
data = pd.read_csv('train.csv')

In [3]:
print(data.head())
print(data.info())

   Id  Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \
0   1       2596      51      3                               258   
1   2       2590      56      2                               212   
2   3       2804     139      9                               268   
3   4       2785     155     18                               242   
4   5       2595      45      2                               153   

   Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
0                               0                              510   
1                              -6                              390   
2                              65                             3180   
3                             118                             3090   
4                              -1                              391   

   Hillshade_9am  Hillshade_Noon  Hillshade_3pm  ...  Soil_Type32  \
0            221             232            148  ...            0   
1            220          

Feature Engineering and Preprocessing

In [4]:
X = data.drop(columns=['Cover_Type'])
y = data['Cover_Type']

Normalising numerical features

In [5]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

Splitting the data into training and testing sets

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

Training the Gradient Boosting Classifier

In [7]:
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)

Making Prediction

In [8]:
y_pred = gb_model.predict(X_test)

Model Evaluation

In [9]:
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           1       0.73      0.72      0.72       432
           2       0.75      0.56      0.64       432
           3       0.74      0.75      0.74       432
           4       0.93      0.95      0.94       432
           5       0.81      0.93      0.86       432
           6       0.76      0.79      0.78       432
           7       0.91      0.96      0.93       432

    accuracy                           0.81      3024
   macro avg       0.80      0.81      0.80      3024
weighted avg       0.80      0.81      0.80      3024

Confusion Matrix:
 [[310  69   1   0  11   2  39]
 [ 95 243  13   0  54  23   4]
 [  0   1 322  24  15  70   0]
 [  0   0  18 410   0   4   0]
 [  1  11  12   0 400   8   0]
 [  1   0  70   6  14 341   0]
 [ 17   0   0   0   0   0 415]]


Hyperparameter Tuning

In [10]:
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2, 4]
}

In [11]:
grid_search = GridSearchCV(estimator=GradientBoostingClassifier(random_state=42), param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


Best Parameter from grid search

In [12]:
print("Best parameters found by GridSearchCV:", grid_search.best_params_)

Best parameters found by GridSearchCV: {'learning_rate': 0.2, 'max_depth': 7, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


Re-train with best parameters

In [13]:
best_gb_model = grid_search.best_estimator_
y_pred_best = best_gb_model.predict(X_test)

Evaluation of tuned model

In [14]:
print("Tuned Model Classification Report:\n", classification_report(y_test, y_pred_best))

Tuned Model Classification Report:
               precision    recall  f1-score   support

           1       0.80      0.77      0.78       432
           2       0.76      0.69      0.72       432
           3       0.87      0.87      0.87       432
           4       0.96      0.98      0.97       432
           5       0.90      0.96      0.93       432
           6       0.87      0.90      0.89       432
           7       0.95      0.97      0.96       432

    accuracy                           0.88      3024
   macro avg       0.87      0.88      0.87      3024
weighted avg       0.87      0.88      0.87      3024

