In [13]:
# jupyter notebook for training extremely randomized trees model
# Date: 02/10/2020

# import libraries
import pandas as pd
from sklearn import ensemble, metrics
import matplotlib.pyplot as plt
import numpy as np

In [14]:
# Read the imputed / feature-engineered train and test tables, plus the
# labeled copy of the test set that later cells score predictions against.
train, test, test_labeled = map(
    pd.read_csv,
    (
        'data/output/train_processed.csv',  # alternative training input: train_resampled.csv
        'data/output/test_processed.csv',
        "data/test/test.csv",
    ),
)

In [15]:
# Assemble the modelling inputs: every column except the target ('Cover_Type')
# and the row identifier ('Id') is used as a feature.
feature_cols = [name for name in train.columns if name not in ('Cover_Type', 'Id')]
y, test_ids = train['Cover_Type'], test['Id']
X_train, X_test = train[feature_cols], test[feature_cols]

In [16]:
# Sanity-check the feature matrices: print both shapes, then preview the test features.
# Only a cell's final bare expression is rendered by Jupyter, so a single preview
# is kept as the last line instead of several expressions that would display nothing.
print(X_train.shape)
print(X_test.shape)
X_test.head()

(15120, 68)
(565892, 68)


Unnamed: 0,Elevation,Aspect,Slope,HDTH,VDTH,HDTR,H9am,Hnoon,H3pm,HDTFP,...,tan,HF1,HF2,HF3,HR1,HR2,HR3,FR1,FR2,FR3
0,2680,354,14,0,0,2684,196,214,156,6645,...,1.0,6645,6645,1.504891e-16,2684,2684,3.725782e-16,9329,3961,2.475782
1,2683,0,13,0,0,2654,201,216,152,6675,...,1.0,6675,6675,1.498127e-16,2654,2654,3.767898e-16,9329,4021,2.515072
2,2713,16,15,0,0,2980,206,208,137,6344,...,1.0,6344,6344,1.576293e-16,2980,2980,3.355705e-16,9324,3364,2.128859
3,2709,24,17,0,0,2950,208,201,125,6374,...,1.0,6374,6374,1.568874e-16,2950,2950,3.389831e-16,9324,3424,2.160678
4,2706,29,19,0,0,2920,210,195,115,6404,...,1.0,6404,6404,1.561524e-16,2920,2920,3.424658e-16,9324,3484,2.193151


In [17]:
# Configure the extremely randomized trees classifier (it is fitted in a later cell).
# - max_features='sqrt' replaces 'auto': for classifiers 'auto' meant sqrt(n_features),
#   and the 'auto' spelling was deprecated and later removed from scikit-learn.
# - random_state is pinned so repeated runs of the notebook build the same forest.
forest = ensemble.ExtraTreesClassifier(
    n_estimators=1000,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=False,
    oob_score=False,
    n_jobs=-1,  # use all available cores
    random_state=42,
    verbose=0,
)

In [18]:
# Fit on the full training set, then label every test row.
# `fit` returns the estimator itself, so fitting and predicting are chained.
y_pred = pd.Series(forest.fit(X_train, y).predict(X_test), name="Cover_Type")

In [19]:
# Show how many test rows were assigned to each cover type (labels 1-7), one count per line.
for cover_type in range(1, 8):
    print((y_pred == cover_type).sum())

209955
242194
36523
1963
24864
22905
27488


In [88]:
# Tabulate the fitted forest's importance score for every training feature.
feature_imp = pd.Series(
    forest.feature_importances_, index=X_train.columns, name="importance"
).to_frame()
# Lift pandas' row/column display limits for this print only, so no feature is elided,
# and list the features from most to least important.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(feature_imp.sort_values(by="importance", ascending=False))

                  importance
EHDtH               0.089182
EVDtH               0.082058
Elevation           0.077097
FR1                 0.032729
HDTR                0.030953
HR1                 0.030839
HR2                 0.030255
HF1                 0.028760
FR2                 0.028520
HDTFP               0.028475
HF2                 0.027937
FR3                 0.027639
Aspect              0.027384
Hnoon               0.026542
Wilderness_Area4    0.025681
HR3                 0.024910
H3pm                0.024649
H9am                0.024604
DTH                 0.024394
HDTH                0.024045
tan                 0.023465
Slope               0.023368
VDTH                0.023326
HF3                 0.023255
Soil_Type22         0.014355
Wilderness_Area1    0.013611
Wilderness_Area3    0.012044
Soil_Type23         0.011668
Soil_Type10         0.011623
Soil_Type38         0.011616
Soil_Type4          0.011502
Soil_Type12         0.010306
Soil_Type39         0.008643
Soil_Type29   

In [89]:
# Overall accuracy of the current predictions against the labeled test set.
metrics.accuracy_score(y_true=test_labeled["Cover_Type"], y_pred=y_pred)

0.8311992394308455

In [6]:
# compute accuracy for each class label on test set
def get_accuracy_for_label(y_pred, test_labeled, labels=range(1, 8)):
    """Print, for each class label, the share of its true rows that were predicted correctly.

    For a label L the printed value is
    ``#(prediction == L and truth == L) / #(truth == L)``, i.e. the per-class
    recall; the printed message calls it "accuracy".

    Parameters
    ----------
    y_pred : array-like
        Predicted labels, one per test row, in the same row order as
        ``test_labeled``.
    test_labeled : mapping / DataFrame
        Object whose ``"Cover_Type"`` entry holds the true labels.
    labels : iterable of int, optional
        Labels to report; defaults to 1 through 7.

    Raises
    ------
    ValueError
        If predictions and true labels do not have the same shape.
    """
    # Compare positionally on plain arrays: this is vectorized (no Python-level
    # sum over every row) and does not depend on the two inputs sharing an index.
    predicted = np.asarray(y_pred)
    actual = np.asarray(test_labeled["Cover_Type"])
    if predicted.shape != actual.shape:
        raise ValueError(
            "y_pred and test_labeled['Cover_Type'] must have the same shape, got %s and %s"
            % (predicted.shape, actual.shape)
        )

    correct = predicted == actual
    for label in labels:
        is_label = actual == label
        total = int(is_label.sum())
        # A correct prediction on a row whose truth is `label` is necessarily a
        # prediction of `label`, so this matches (pred == label) & (pred == truth).
        hits = int((correct & is_label).sum())
        # A label with no true rows has an undefined ratio; report nan explicitly
        # rather than dividing by zero.
        ratio = hits / total if total else float("nan")
        print("The accuracy for class label = %d is %.4f" % (label, ratio))

In [None]:
# Per-label breakdown for the current predictions.
get_accuracy_for_label(y_pred=y_pred, test_labeled=test_labeled)

In [92]:
# Pair each test Id with its predicted cover type and write the result
# to a local CSV (without the index column) for submission.
submission = pd.DataFrame(dict(Id=test_ids, Cover_Type=y_pred))
submission.to_csv(path_or_buf="data/output/extratree1000sample.csv", index=False)

In [None]:
# Train on each stratum's training file and predict that stratum's test file,
# collecting one (Id, Cover_Type) frame per stratum in `results`.
#
# All per-stratum objects use `stratum_*` names so this cell does not overwrite the
# notebook-level `train`, `test`, `feature_cols`, `X_train`, `X_test`, `y`, `test_ids`
# or `y_pred`; cells that use those names keep seeing the full-dataset values.
strata_names = ["3", "4", "7", "8", "9", "11", "12", "13", "14", "15", "17", "18", "19", "23", "25", "26", "27", "out"]

results = []
for stratum in strata_names:
    stratum_train = pd.read_csv("data/output/strata/train" + stratum + ".csv")
    stratum_test = pd.read_csv("data/output/strata/test" + stratum + ".csv")
    stratum_features = [col for col in stratum_train.columns if col not in ('Cover_Type', 'Id')]

    # NOTE: the shared `forest` estimator is refitted here, so after this cell it
    # holds the model of the last stratum, not the full-dataset model.
    forest.fit(stratum_train[stratum_features], stratum_train['Cover_Type'])
    stratum_pred = forest.predict(stratum_test[stratum_features])

    results.append(pd.DataFrame({"Id": stratum_test['Id'], "Cover_Type": stratum_pred}))

In [20]:
# Stack the per-stratum predictions, order them by Id and renumber the rows.
# The re-indexed frame is assigned back (not just displayed) so the variable itself
# carries a clean 0..n-1 index instead of the repeated per-stratum indexes.
extratree1000strata = (
    pd.concat(results)
    .sort_values(by=["Id"])
    .reset_index(drop=True)
)
extratree1000strata

Unnamed: 0,Id,Cover_Type
0,15121,5
1,15122,1
2,15123,1
3,15124,1
4,15125,1
...,...,...
565887,581008,3
565888,581009,3
565889,581010,3
565890,581011,3


In [25]:
# Persist the combined strata predictions to CSV, leaving out the index column.
extratree1000strata.to_csv(path_or_buf="data/output/extratree1000strata.csv", index=False)

In [8]:
# Reload the saved strata predictions from disk.
extratree1000strata = pd.read_csv(filepath_or_buffer="data/output/extratree1000strata.csv")

In [9]:
# Score the strata-based predictions: overall accuracy first, then the per-label breakdown.
strata_pred = extratree1000strata["Cover_Type"]
print(metrics.accuracy_score(test_labeled["Cover_Type"], strata_pred))
get_accuracy_for_label(strata_pred, test_labeled)

0.8025382935259732
The accuracy for class label = 1 is 0.8296
The accuracy for class label = 2 is 0.7496
The accuracy for class label = 3 is 0.8855
The accuracy for class label = 4 is 0.9710
The accuracy for class label = 5 is 0.9718
The accuracy for class label = 6 is 0.9248
The accuracy for class label = 7 is 0.9784


In [11]:
# Confusion matrix of the strata-based predictions (extratree1000strata) on the labeled test set.
# scikit-learn convention: rows are the true labels, columns are the predicted labels.
metrics.confusion_matrix(
    y_true=test_labeled["Cover_Type"],
    y_pred=extratree1000strata["Cover_Type"],
)

array([[173943,  25395,    174,      0,   2203,    247,   7718],
       [ 42886, 210747,   6178,     70,  14223,   5925,   1112],
       [     0,    106,  29747,   1090,    138,   2513,      0],
       [     0,      0,     11,    570,      0,      6,      0],
       [    11,    116,     59,      0,   7126,     21,      0],
       [    10,     45,    774,    270,     45,  14063,      0],
       [   350,     40,      0,      0,      6,      0,  17954]],
      dtype=int64)

In [12]:
# Preview the predicted labels of the strata-based model.
extratree1000strata.loc[:, "Cover_Type"]

0         5
1         1
2         1
3         1
4         1
         ..
565887    3
565888    3
565889    3
565890    3
565891    3
Name: Cover_Type, Length: 565892, dtype: int64

In [7]:
# Load the final predictions from disk and score them against the labeled
# test set: overall accuracy first, then the per-label breakdown.
final_prediction = pd.read_csv("data/output/final_prediction.csv")
final_pred_labels = final_prediction["Cover_Type"]
print(metrics.accuracy_score(test_labeled["Cover_Type"], final_pred_labels))
get_accuracy_for_label(final_pred_labels, test_labeled)

0.838262424632262
The accuracy for class label = 1 is 0.7951
The accuracy for class label = 2 is 0.8639
The accuracy for class label = 3 is 0.8563
The accuracy for class label = 4 is 0.9693
The accuracy for class label = 5 is 0.6919
The accuracy for class label = 6 is 0.8217
The accuracy for class label = 7 is 0.9728
