# Forest Cover Type Prediction (UCI Covertype Dataset)

This notebook demonstrates data cleaning, preprocessing, training, and evaluation of classification models to predict the type of forest cover based on cartographic and environmental features.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

## Load Dataset
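The UCI Covertype dataset can be downloaded from: https://archive.ics.uci.edu/ml/datasets/covertype. The code below expects a CSV file with a header row that includes a `Cover_Type` column (the prediction target).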

In [None]:
# Location of the dataset CSV; change this to wherever your copy lives.
csv_path = 'covtype.csv'

# Read the whole file into a DataFrame and preview the first rows.
data = pd.read_csv(csv_path)
data.head()

## Data Preprocessing

In [None]:
# Separate the predictors from the target column.
X = data.drop('Cover_Type', axis=1)
y = data['Cover_Type']

# Train-test split.
# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# held-out rows influence the mean/variance used to transform the training
# data (data leakage) and make the test metrics optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features using statistics learned from the training rows only,
# then apply that same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept so any later cell that refers to X_scaled keeps working; note that it
# is now scaled with the training-set statistics.
X_scaled = scaler.transform(X)

X_train.shape, X_test.shape

## Model Training & Evaluation

In [None]:
# Candidate classifiers, keyed by the display name used in reports and plots.
# - LogisticRegression: `multi_class` is omitted because it is deprecated in
#   recent scikit-learn; the default lbfgs solver already fits a multinomial
#   model for multiclass targets.
# - XGBClassifier: `use_label_encoder` is omitted because it is deprecated /
#   removed in current XGBoost releases.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(eval_metric='mlogloss')
}

# XGBoost requires class ids 0..n_classes-1, so build a zero-based encoding of
# the training labels. `class_labels[k]` is the original label for encoded id k.
class_labels, y_train_encoded = np.unique(y_train, return_inverse=True)

for name, model in models.items():
    print(f"\n{name}")

    # Only XGBoost is trained on the encoded labels; the scikit-learn models
    # keep the original labels so their predict() output is unchanged.
    uses_encoded_labels = isinstance(model, XGBClassifier)
    model.fit(X_train, y_train_encoded if uses_encoded_labels else y_train)

    y_pred = model.predict(X_test)
    if uses_encoded_labels:
        # Map encoded predictions back to the original class labels so every
        # model is scored against y_test in the same label space.
        y_pred = class_labels[np.asarray(y_pred, dtype=int)]

    print(classification_report(y_test, y_pred))

    # Fix the label order explicitly and show the real class labels on the
    # axes instead of positional indices.
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
    disp.plot(cmap='Blues')
    plt.title(name)
    plt.show()

## Feature Importance (Random Forest & XGBoost)

In [None]:
def _plot_top_features(scores, feature_names, title, top_n=10):
    """Draw a horizontal bar chart of the ``top_n`` largest entries of ``scores``.

    ``scores`` is an array of per-feature importances and ``feature_names`` is
    indexable by the same positions. Returns the selected positions, ordered
    from smallest to largest score.
    """
    # argsort is ascending, so the tail holds the highest-scoring features.
    top_idx = np.argsort(scores)[-top_n:]
    slots = range(len(top_idx))

    plt.figure(figsize=(8,6))
    plt.barh(slots, scores[top_idx], align='center')
    plt.yticks(slots, [feature_names[i] for i in top_idx])
    plt.title(title)
    plt.show()
    return top_idx


# Random Forest importances.
importances = models['Random Forest'].feature_importances_
indices = _plot_top_features(
    importances, X.columns, 'Top 10 Important Features - Random Forest'
)

# XGBoost importances, plotted the same way.
xgb_importances = models['XGBoost'].feature_importances_
indices = _plot_top_features(
    xgb_importances, X.columns, 'Top 10 Important Features - XGBoost'
)