#### Importing the necessary libraries and loading the dataset

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read the training table from the current working directory.
df = pd.read_csv("train.csv")

# Target: the Cover_Type column shifted down by one.
# NOTE(review): presumably the raw labels start at 1 and the shift exists because
# XGBClassifier expects zero-based class ids — confirm against the data.
y = df['Cover_Type'] - 1

# Features: every column except the target.
# NOTE(review): if train.csv carries a row-identifier column it is kept as a
# feature here — confirm that is intended.
X = df.drop(columns='Cover_Type')

In [12]:
# Standardise every feature column of X.
# NOTE(review): the scaling statistics are learned from all rows of X, i.e.
# before any train/test split — held-out rows contribute to them.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

#### Train-test split

In [13]:
# Split the raw (unscaled) features first so the held-out rows never influence
# preprocessing. 80/20 split, stratified on the label so every class keeps the
# same share in both parts; the seed makes the partition repeatable.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn the scaling statistics from the training rows only, then apply that same
# transform to the test rows. Fitting the scaler on the full data set would leak
# test-set statistics into the features the models are trained on.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


In [16]:
# Baseline 1: random forest with 100 trees, seeded for repeatability.
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Baseline 2: XGBoost with 100 boosting rounds, multiclass log-loss as eval metric.
xgb = XGBClassifier(random_state=42, eval_metric="mlogloss", n_estimators=100).fit(X_train, y_train)
# Predictions stay in y_train's label space, so they compare directly with y_test.
y_pred_xgb = xgb.predict(X_test)

Parameters: { "use_label_encoder" } are not used.



#### Evaluation

In [17]:

# Headline accuracy of each baseline on the held-out split.
for heading, predictions in (
    ("Random Forest Accuracy:", y_pred_rf),
    ("XGBoost Accuracy:", y_pred_xgb),
):
    print(heading, accuracy_score(y_test, predictions))

# Per-class precision / recall / F1 for the same predictions.
for heading, predictions in (
    ("\nRandom Forest Classification Report:\n", y_pred_rf),
    ("\nXGBoost Classification Report:\n", y_pred_xgb),
):
    print(heading, classification_report(y_test, predictions))


Random Forest Accuracy: 0.8601190476190477
XGBoost Accuracy: 0.8743386243386243

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.78      0.78       432
           1       0.80      0.66      0.72       432
           2       0.83      0.81      0.82       432
           3       0.94      0.98      0.96       432
           4       0.89      0.95      0.92       432
           5       0.84      0.88      0.86       432
           6       0.93      0.97      0.95       432

    accuracy                           0.86      3024
   macro avg       0.86      0.86      0.86      3024
weighted avg       0.86      0.86      0.86      3024


XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.79      0.79       432
           1       0.80      0.68      0.73       432
           2       0.84      0.85      0.85       432
           3       0.96      0.97 

#### Hyperparameter Tuning - XGBClassifier

In [20]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Search space for the XGBoost grid search: 3 * 3 * 3 * 2 * 2 = 108 combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

# n_jobs=1 keeps each individual fit single-threaded. The grid search below
# already spreads the fits over every core (n_jobs=-1); letting XGBoost open its
# own thread pool inside each of those workers oversubscribes the CPU.
xgb_clf = xgb.XGBClassifier(eval_metric="mlogloss", random_state=42, n_jobs=1)

# Exhaustive search scored by accuracy with 3-fold cross-validation. Only the
# training split is used here, so the test split stays untouched for the final check.
grid_search = GridSearchCV(xgb_clf, param_grid, cv=3, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)

# best_estimator_ is the winning configuration refit on the full training split.
best_xgb = grid_search.best_estimator_
# Overwrites the baseline predictions held in y_pred_xgb with the tuned model's.
y_pred_xgb = best_xgb.predict(X_test)

print("Optimized XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_xgb))


Fitting 3 folds for each of 108 candidates, totalling 324 fits


1 fits failed out of a total of 324.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\YASHU\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\YASHU\AppData\Local\Programs\Python\Python311\Lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\YASHU\AppData\Local\Programs\Python\Python311\Lib\site-packages\xgboost\sklearn.py", line 1512, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
                           ^^^^^^^^^^^^^^

Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 9, 'n_estimators': 300, 'subsample': 1}
Optimized XGBoost Accuracy: 0.8826058201058201

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.81      0.81       432
           1       0.81      0.70      0.75       432
           2       0.86      0.85      0.85       432
           3       0.95      0.97      0.96       432
           4       0.90      0.95      0.93       432
           5       0.88      0.91      0.89       432
           6       0.95      0.97      0.96       432

    accuracy                           0.88      3024
   macro avg       0.88      0.88      0.88      3024
weighted avg       0.88      0.88      0.88      3024



#### Hyperparameter Tuning - Random Forest

In [39]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate values for the random-forest search; configurations are sampled
# from these lists rather than enumerated exhaustively.
param_grid = dict(
    n_estimators=[100, 300, 500, 1000],
    max_depth=[10, 20, 30, 50, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 5],
    max_features=['sqrt', 'log2', None],
    bootstrap=[True, False],
)

# Unfitted template estimator handed to the search.
rf = RandomForestClassifier(random_state=42)

# Randomized search: 20 sampled configurations, each scored with 5-fold
# cross-validation, run on all cores, seeded so the sample is repeatable.
rf_random = RandomizedSearchCV(
    rf,
    param_grid,
    random_state=42,
    n_jobs=-1,
    verbose=2,
    cv=5,
    n_iter=20,
)
rf_random.fit(X_train, y_train)

# Report the winning configuration.
print("Best Parameters:", rf_random.best_params_)

# Score the refit best model on the held-out split.
tuned_rf = rf_random.best_estimator_
y_pred = tuned_rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized Random Forest Accuracy: {accuracy:.4f}")

print("Classification Report:\n", classification_report(y_test, y_pred))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'n_estimators': 1000, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
Optimized Random Forest Accuracy: 0.8697
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.79      0.79       432
           1       0.80      0.67      0.73       432
           2       0.86      0.84      0.85       432
           3       0.94      0.98      0.96       432
           4       0.89      0.94      0.92       432
           5       0.85      0.90      0.87       432
           6       0.94      0.97      0.95       432

    accuracy                           0.87      3024
   macro avg       0.87      0.87      0.87      3024
weighted avg       0.87      0.87      0.87      3024

