In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.datasets import make_regression
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# Dataset:  covtype.csv

Source: Remote Sensing and GIS Program, Department of Forest Sciences, College of Natural Resources, Colorado State University

Description: Predicting forest cover type from cartographic variables only (no remotely sensed data). The actual forest cover type for a given observation (30 x 30 meter cell) was determined from US Forest Service (USFS) Region 2 Resource Information System (RIS) data. Independent variables were derived from data originally obtained from US Geological Survey (USGS) and USFS data. Data is in raw form (not scaled) and contains binary (0 or 1) columns of data for qualitative independent variables (wilderness areas and soil types).

This study area includes four wilderness areas located in the Roosevelt National Forest of northern Colorado. These areas represent forests with minimal human-caused disturbances, so that existing forest cover types are more a result of ecological processes than of forest management practices.

Variables/Columns

- Elevation: Elevation in meters
- Aspect: Aspect in degrees azimuth
- Slope: Slope in degrees
- Horizontal_Distance_To_Hydrology: Horizontal distance to nearest surface water features
- Vertical_Distance_To_Hydrology: Vertical distance to nearest surface water features
- Horizontal_Distance_To_Roadways: Horizontal distance to nearest roadway
- Hillshade_9am: Hillshade index at 9am, summer solstice
- Hillshade_Noon: Hillshade index at noon, summer solstice
- Hillshade_3pm: Hillshade index at 3pm, summer solstice
- Horizontal_Distance_To_Fire_Points: Horizontal distance to nearest wildfire ignition points
- Wilderness_Area: 0 (absence) or 1 (presence)
- Cover_Type: (2 types) Forest Cover Type designation
    - 0: Spruce/Fir
    - 1: Lodgepole Pine

In [2]:
# Load the forest cover table and separate the label column from the features.
csv_path = '../Resources/covtype.csv'
df = pd.read_csv(csv_path)
y = df['cover']
X = df.drop(columns=['cover'])

# Display names for the two encoded classes, listed in label order (0, then 1).
target_names = [
    "Spruce/Fir",
    "Lodgepole Pine",
]

In [3]:
# Hold out a test split, then standardize both splits using statistics
# learned from the training split only.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [4]:
# Import a Random Forests classifier
from sklearn.ensemble import RandomForestClassifier

In [5]:
# Train a random forest on the scaled training split, then report per-class
# metrics on the held-out split along with train/test accuracy.
clf = RandomForestClassifier(random_state=1)
clf.fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
report = classification_report(y_test, y_pred, target_names=target_names)
print(report)
train_score = clf.score(X_train_scaled, y_train)
test_score = clf.score(X_test_scaled, y_test)
print(f'Training Score: {train_score}')
print(f'Testing Score: {test_score}')

                precision    recall  f1-score   support

    Spruce/Fir       0.88      0.85      0.86      5346
Lodgepole Pine       0.89      0.91      0.90      7033

      accuracy                           0.88     12379
     macro avg       0.88      0.88      0.88     12379
  weighted avg       0.88      0.88      0.88     12379

Training Score: 1.0
Testing Score: 0.8830277082155263


In [6]:
# Import an Extremely Randomized Trees classifier
from sklearn.ensemble import ExtraTreesClassifier

In [7]:
# Train an extra-trees ensemble on the scaled training split, then report
# per-class metrics on the held-out split along with train/test accuracy.
clf = ExtraTreesClassifier(random_state=1)
clf.fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
report = classification_report(y_test, y_pred, target_names=target_names)
print(report)
train_score = clf.score(X_train_scaled, y_train)
test_score = clf.score(X_test_scaled, y_test)
print(f'Training Score: {train_score}')
print(f'Testing Score: {test_score}')

                precision    recall  f1-score   support

    Spruce/Fir       0.89      0.85      0.87      5346
Lodgepole Pine       0.89      0.92      0.90      7033

      accuracy                           0.89     12379
     macro avg       0.89      0.88      0.88     12379
  weighted avg       0.89      0.89      0.89     12379

Training Score: 1.0
Testing Score: 0.8872283706276759


In [8]:
# Import an Adaptive Boosting classifier
from sklearn.ensemble import AdaBoostClassifier

In [9]:
# Train an AdaBoost ensemble with default settings on the scaled training
# split, then report per-class metrics and train/test accuracy.
clf = AdaBoostClassifier(random_state=1)
clf.fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)
report = classification_report(y_test, y_pred, target_names=target_names)
print(report)
train_score = clf.score(X_train_scaled, y_train)
test_score = clf.score(X_test_scaled, y_test)
print(f'Training Score: {train_score}')
print(f'Testing Score: {test_score}')

                precision    recall  f1-score   support

    Spruce/Fir       0.74      0.72      0.73      5346
Lodgepole Pine       0.79      0.81      0.80      7033

      accuracy                           0.77     12379
     macro avg       0.77      0.77      0.77     12379
  weighted avg       0.77      0.77      0.77     12379

Training Score: 0.7708423093494183
Testing Score: 0.7711446805073108


In [10]:
# BONUS
def model_tester(model, X, y):
    """Split, scale, fit, and print an evaluation summary for one model.

    The data is split with ``train_test_split(X, y, random_state=1)``, a
    ``StandardScaler`` is fitted on the training split only and applied to
    both splits, and ``model`` is fitted on the scaled training data. The
    function then prints a classification report for the held-out split
    followed by the training and testing scores.

    Parameters
    ----------
    model : estimator
        Object whose ``fit(X, y)`` returns a fitted estimator exposing
        ``predict(X)`` and ``score(X, y)``.
    X : features accepted by ``train_test_split``
    y : labels accepted by ``train_test_split``

    Returns
    -------
    None
        Results are printed, not returned.

    Notes
    -----
    Class display names are read from the notebook-level ``target_names``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    clf = model.fit(X_train_scaled, y_train)
    # Predict with the model fitted above. Without this line the report would
    # silently use whatever `y_pred` an earlier notebook cell left in the
    # global namespace (or raise NameError on a fresh kernel).
    y_pred = clf.predict(X_test_scaled)
    print(classification_report(y_test, y_pred, target_names=target_names))
    print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
    print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')
    
# Sweep AdaBoost ensemble sizes: first at the default learning rate,
# then with learning_rate=0.1 and progressively more estimators.
adaboost_settings = [
    {'n_estimators': 100},
    {'n_estimators': 200},
    {'n_estimators': 200, 'learning_rate': 0.1},
    {'n_estimators': 500, 'learning_rate': 0.1},
    {'n_estimators': 1000, 'learning_rate': 0.1},
    {'n_estimators': 2000, 'learning_rate': 0.1},
]
for settings in adaboost_settings:
    model_tester(AdaBoostClassifier(random_state=1, **settings), X, y)

                precision    recall  f1-score   support

    Spruce/Fir       0.74      0.72      0.73      5346
Lodgepole Pine       0.79      0.81      0.80      7033

      accuracy                           0.77     12379
     macro avg       0.77      0.77      0.77     12379
  weighted avg       0.77      0.77      0.77     12379

Training Score: 0.7736697544161999
Testing Score: 0.7711446805073108
                precision    recall  f1-score   support

    Spruce/Fir       0.74      0.72      0.73      5346
Lodgepole Pine       0.79      0.81      0.80      7033

      accuracy                           0.77     12379
     macro avg       0.77      0.77      0.77     12379
  weighted avg       0.77      0.77      0.77     12379

Training Score: 0.7770088323998277
Testing Score: 0.772356410049277
                precision    recall  f1-score   support

    Spruce/Fir       0.74      0.72      0.73      5346
Lodgepole Pine       0.79      0.81      0.80      7033

      accuracy 