In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.decomposition import PCA, SparsePCA, TruncatedSVD
from sklearn.ensemble import *
from sklearn.preprocessing import normalize
import math
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
import time
from sklearn import model_selection
from sklearn.metrics import accuracy_score, plot_confusion_matrix, confusion_matrix as cm
import seaborn as sns
import keras
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils.np_utils import to_categorical
from itertools import combinations
from catboost import CatBoostClassifier, cv, Pool 
from tqdm.notebook import tqdm

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
from sklearn.base import clone

In [3]:
# Print NumPy floats with 5 digits of precision and let pandas display up to 100 rows.
np.set_printoptions(precision=5)
pd.set_option('display.max_rows', 100)

In [4]:
# Load the training and test tables from CSV files in the working directory.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# EDA

In [None]:
# Peek at five random training rows.
df_train.sample(5)

In [None]:
# List the columns of the training table.
df_train.columns

In [None]:
# Normalise a subset of the numeric features with sklearn's `normalize` (default
# arguments), then draw one figure per feature with one box per cover type.
col_normalize = ['Aspect','Slope','Horizontal_Distance_To_Hydrology','Vertical_Distance_To_Hydrology', 'Hillshade_9am','Hillshade_Noon','Hillshade_3pm','Horizontal_Distance_To_Fire_Points']
df_train_norm = df_train.copy()
df_train_norm[col_normalize] = normalize(df_train[col_normalize])
for col in col_normalize:
    print(col)
    l = []
    for cover in sorted(df_train_norm.Cover_Type.unique()):
        # Values of this feature for the rows belonging to the current cover type.
        l.append(df_train_norm.loc[df_train_norm['Cover_Type'] == cover, col].tolist())
    plt.boxplot(l)
    plt.show()

In [None]:
# Box plot of log(1 + distance to roadways), one box per cover type.
l = []
for cover in sorted(df_train_norm.Cover_Type.unique()):
    l.append([
        math.log(v+1)
        for v in df_train_norm.loc[df_train_norm['Cover_Type'] == cover, 'Horizontal_Distance_To_Roadways']
    ])
plt.boxplot(l)
plt.show()

In [None]:
# Same box plot as above, on the un-logged distance to roadways.
l = []
for cover in sorted(df_train_norm.Cover_Type.unique()):
    l.append(df_train_norm.loc[df_train_norm['Cover_Type'] == cover, 'Horizontal_Distance_To_Roadways'].tolist())
plt.boxplot(l)
plt.show()

# Total explained-variance ratio of a PCA with k components, for a range of k.
# NOTE(review): the PCA input is the whole frame, which still contains `Id` and `Cover_Type` -- confirm this is intended.
results = []
for k in [1, 2, 3, 4, 5, 10, 20, 30, 40, 50]:
    pca = PCA(n_components=k)
    pca.fit(df_train_norm.to_numpy(), df_train_norm['Cover_Type'].to_numpy())
    explained_variance = pca.explained_variance_ratio_.sum()
    print("k={}:\t{}".format(k,explained_variance))
    results.append([k, explained_variance])
# Rows of the transposed array are the k values and the matching variance ratios.
plt.plot(*np.array(results).T)

In [None]:
# Names of all soil indicator columns.
soils = list(filter(lambda name: "Soil" in name, df_train_norm.columns))

# Show what a single 3-column combination looks like.
for x in combinations(soils, 3):
    print(x)
    break

# How many soil indicators are set per row.
df_train_norm[soils].sum(axis=1).value_counts()

from tqdm.notebook import tqdm

# For every combination of 2 to 5 soil indicator columns, fit a one-component PCA
# and record the share of variance that component explains.
# NOTE(review): one PCA fit per combination of `soils` -- expect a very long run.
classes = []
for n in tqdm(range(2,6)):
    for x in tqdm(combinations(soils, n)):
        pca = PCA(n_components=1).fit(df_train_norm[list(x)].to_numpy(), df_train_norm['Cover_Type'].to_numpy())
        explained_variance = pca.explained_variance_ratio_.sum()
        # Pad with blanks so every row has five soil slots followed by the score.
        classes.append(list(x) + [" "]*(5-n) + [explained_variance])

r = pd.DataFrame(classes, columns=['S1','S2','S3','S4','S5','var'])
r['var'] = r['var'] - 1
r[r['var']>=0].sort_values('var',ascending=False)

r.sort_values('var',ascending=False).to_csv('PCA results.csv', index=False)

# Same sweep as the PCA cell above, using TruncatedSVD instead.
results = []
for k in [1, 2, 3, 4, 5, 10, 20, 30, 40, 50]:
    pca = TruncatedSVD(n_components=k)
    pca.fit(df_train_norm.to_numpy(), df_train_norm['Cover_Type'].to_numpy())
    explained_variance = pca.explained_variance_ratio_.sum()
    print("k={}:\t{}".format(k,explained_variance))
    results.append([k, explained_variance])
# Rows of the transposed array are the k values and the matching variance ratios.
plt.plot(*np.array(results).T)

# Helper Functions

In [5]:
# Shared 10-fold, shuffled, seeded splitter; score_model passes it to cross_val_score.
kfold = model_selection.KFold(n_splits=10, random_state=0, shuffle=True)

In [6]:
def pipeline(df, version=0):
    """Build the model-ready feature frame for a given feature-set version.

    Versions are cumulative: version ``k`` applies every step of versions
    ``0..k``. The number next to each version is the score the author
    recorded for it.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. Must contain an ``Id`` column plus whichever columns the
        requested version reads. It is not modified.
    version : int or int-like str, default 0
        Highest feature version to apply. Versions 0-3 are defined; larger
        values behave like 3.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``Id`` and with the engineered columns added.

    Raises
    ------
    KeyError
        If a column needed by the requested version is missing.
    ValueError
        If ``version`` cannot be converted to an integer.
    """
    # Accept '1' as well as 1: comparing a str with an int below would raise TypeError.
    version = int(version)

    # Version 0 -- 0.8658068783068783
    df_n = df.drop(columns=['Id'])
    indicator_cols = [c for c in df_n.columns if "Soil_Type" in c or "Wilderness_Area" in c]
    df_n = df_n.astype({c: 'bool' for c in indicator_cols})

    if version >= 1: # 0.893320105820106
        col_normalize = ['Aspect','Slope','Horizontal_Distance_To_Hydrology','Vertical_Distance_To_Hydrology',
                         'Hillshade_9am','Hillshade_Noon','Hillshade_3pm','Horizontal_Distance_To_Fire_Points',
                         'Horizontal_Distance_To_Roadways']
        # The log features must be taken before `normalize` overwrites the raw distances.
        df_n['log_Horizontal_Distance_To_Roadways'] = np.log(df_n['Horizontal_Distance_To_Roadways']+1)
        df_n['log_Horizontal_Distance_To_Fire_Points'] = np.log(df_n['Horizontal_Distance_To_Fire_Points']+1)
        df_n[col_normalize] = normalize(df_n[col_normalize])
        df_n = df_n.drop(columns=['Soil_Type7'])

    if version >= 2: # 0.8964947089947091
        # Despite the column name, the exponent is 1.5, not 2.
        df_n['sq_Elevation'] = np.power(df['Elevation'],1.5)
        df_n = df_n.drop(columns='Aspect')
        # Shift the raw aspect by 180: values above 180 go down, all others go up.
        df_n['norm_aspect'] = np.where(df.Aspect > 180, df.Aspect - 180, df.Aspect + 180)
        df_n['atan_aspect'] = np.arctan(df_n.norm_aspect)

    if version >= 3: # 0.9104497354497356
        # This one uses the already-normalised column; the features below use the raw frame `df`.
        df_n['Vertical_Distance_To_Hydrology'] = np.abs(df_n.Vertical_Distance_To_Hydrology)
        df_n['E-VH'] = df.Elevation - df.Vertical_Distance_To_Hydrology * .9
        df_n['E-HH'] = df.Elevation - df.Horizontal_Distance_To_Hydrology * .5

        df_n['F+R'] = (df.Horizontal_Distance_To_Fire_Points + df.Horizontal_Distance_To_Roadways) ** 2
        df_n['F+H'] = (df.Horizontal_Distance_To_Fire_Points + df.Horizontal_Distance_To_Hydrology) ** 0.3
        df_n['H+R'] = (df.Horizontal_Distance_To_Hydrology + df.Horizontal_Distance_To_Roadways)

        df_n['abs_H-R'] = np.abs(df.Horizontal_Distance_To_Hydrology - df.Horizontal_Distance_To_Roadways)
        df_n['abs_H-F'] = np.abs(df.Horizontal_Distance_To_Hydrology - df.Horizontal_Distance_To_Fire_Points)
        df_n['abs_F-R'] = np.abs(df.Horizontal_Distance_To_Fire_Points - df.Horizontal_Distance_To_Roadways)
    return df_n

def submit(model,version):
    """Fit `model` on the full training table and predict the test table.

    Both the module-level `df_train` and `df_test` are run through
    `pipeline` with the given `version`. The model is fitted in place.

    Returns a copy of `df_test` reduced to the `Id` and predicted
    `Cover_Type` columns.
    """
    train_features = pipeline(df_train.copy(), version)
    test_features = pipeline(df_test.copy(), version)

    targets = train_features.Cover_Type.to_numpy()
    inputs = train_features.drop(columns=['Cover_Type']).to_numpy()
    model.fit(inputs, targets)

    submission = df_test.copy()
    submission['Cover_Type'] = model.predict(test_features.to_numpy())
    return submission[['Id','Cover_Type']]

def score_model(model,df, return_val=False, return_train=False, display=True, return_acc=False, return_time=False, show_weights=False, return_cv_acc=False, verbose=True, single_split=False, return_class_acc=False):
    """Cross-validate `model` on `df` and optionally evaluate one hold-out split.

    The target is the `Cover_Type` column; every other column is a feature.
    Cross-validation uses the module-level `kfold` splitter. A single 67/33
    hold-out split (seed 0) is fitted as well whenever an option needs it:
    `display`, `show_weights`, `single_split`, `return_train`, `return_val`,
    `return_acc` or `return_class_acc`. In that case `model` itself is
    fitted on the training part of the split.

    Parameters
    ----------
    model : estimator with fit/predict
    df : pandas.DataFrame containing a `Cover_Type` column
    display : print per-class accuracy and plot the normalised confusion matrix
    show_weights : print `model.feature_importances_` in ascending order
    verbose : print the CV accuracy, the split accuracy (only if
        `single_split`) and the elapsed time
    single_split : fit the hold-out split and report its accuracy
    return_* : select which extra values are appended to the result

    Returns
    -------
    tuple
        `(model, ...)` followed by the requested extras, always in this
        order: `X_train, y_train`, `X_val, y_val`, hold-out accuracy,
        mean CV accuracy, elapsed seconds, per-class hold-out accuracy.
    """
    features = df.drop(columns=['Cover_Type'])
    X , Y = features.to_numpy(), df.Cover_Type.to_numpy()

    # All of these options read the hold-out split or the fitted model.
    # Previously only `single_split`/`return_class_acc` created them, so the
    # default `display=True` failed with NameError.
    needs_split = (single_split or return_class_acc or display or show_weights
                   or return_train or return_val or return_acc)

    start = time.time()
    results = model_selection.cross_val_score(model, X, Y, cv=kfold)
    if needs_split:
        X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=.33, random_state=0)
        model.fit(X_train, y_train)
        pred = model.predict(X_val)
        acc = accuracy_score(y_val, pred)
    end = time.time()
    cv_acc = results.mean()

    if needs_split:
        # Per-class accuracy: correct predictions divided by true count per class.
        matrix = cm(y_val, pred)
        class_acc = matrix.diagonal() / matrix.sum(axis=1)

    if verbose:
        print('cv acc:', cv_acc)
        if single_split:
            print('split acc:', acc)
        print('time taken:', end-start, end='\n\n')
    if display:
        print(class_acc)

        # Sorted so the tick labels follow a deterministic order; a bare set has none.
        plot_confusion_matrix(model, X_val, y_val, display_labels=sorted(set(y_train)), cmap=plt.cm.Blues, normalize='true')
        plt.show()

    if show_weights:
        for w,k in sorted(zip(model.feature_importances_, features.columns), key=lambda x: x[0]):
            print(k,w)

    # return all data
    return_data = [model]
    if return_train:
        return_data += [X_train, y_train]
    if return_val:
        return_data += [X_val, y_val]
    if return_acc:
        return_data += [acc]
    if return_cv_acc:
        return_data += [cv_acc]
    if return_time:
        return_data += [end-start]
    if return_class_acc:
        return_data += [class_acc]
    return tuple(return_data)

# Baseline

### Best Model = ExtraTreesClassifier()

In [None]:
# Score the three baseline classifiers on the raw training table.
models = [
    score_model(clf, df_train)
    for clf in (
        ExtraTreesClassifier(n_jobs=-1, random_state=0),
        HistGradientBoostingClassifier(random_state=0),
        RandomForestClassifier(n_jobs=-1, random_state=0),
    )
]

### Best Ensemble = Mix 3

In [None]:
# Voting
models = [('et',ExtraTreesClassifier(n_jobs=-1, random_state=0)),
           ('hg',HistGradientBoostingClassifier(random_state=0)),
           ('rf',RandomForestClassifier(n_jobs=-1, random_state=0))]
model = VotingClassifier(models, n_jobs=-1)
score_model(model,df_train);

In [None]:
# Voting
models = [('hg1',HistGradientBoostingClassifier(random_state=0)),
           ('hg2',HistGradientBoostingClassifier(random_state=0)),
           ('hg3',HistGradientBoostingClassifier(random_state=0))]
model = VotingClassifier(models, n_jobs=-1)
score_model(model,df_train);

In [None]:
# Voting
models = [('et',ExtraTreesClassifier(n_jobs=-1, random_state=0)),
           ('et2',ExtraTreesClassifier(n_jobs=-1, random_state=0)),
           ('et3',ExtraTreesClassifier(n_jobs=-1, random_state=0))]
model = VotingClassifier(models, n_jobs=-1)
score_model(model,df_train);

In [None]:
# Voting
models = [('rf1',RandomForestClassifier(n_jobs=-1, random_state=0)),
           ('rf2',RandomForestClassifier(n_jobs=-1, random_state=0)),
           ('rf3',RandomForestClassifier(n_jobs=-1, random_state=0))]
model = VotingClassifier(models, n_jobs=-1)
score_model(model,df_train);

# Feature Selection

## Alt2

In [None]:
# Score ExtraTrees on each feature version; the outputs of the last version are kept.
for v in range(4):
    print(v)
    df_train_norm = pipeline(df_train, v)
    best_m, best_X_train, best_y_train, best_X_val, best_y_val = score_model(
        ExtraTreesClassifier(n_jobs=-1, random_state=1189),
        df_train_norm,
        return_train=True,
        return_val=True,
        display=True,
    )

In [None]:
# Display a remote reference image (presumably an aspect summary chart, judging by the file name).
from IPython.display import Image
from IPython.core.display import HTML 
Image(url= r"http://stream1.cmatc.cn/pub/comet/FireWeather/S290Unit10FuelMoisture/comet/fire/s290/unit10/media/graphics/aspsumm.jpg")

In [None]:
# List the raw training columns again for reference while engineering features.
df_train.columns

In [None]:
# Score a single ExtraTrees model (seed 1189) on feature version 3.
df_train_norm = pipeline(df_train, 3)
model = ExtraTreesClassifier(n_jobs=-1, random_state=1189)
_, cv_acc = score_model(model, df_train_norm, display=True, return_cv_acc=True, verbose=True)

In [None]:
# Best [cv accuracy, seed] pair first.
# NOTE(review): `good_rs` is assigned in a later cell; on a fresh kernel this cell raises NameError.
good_rs = sorted(good_rs, reverse=True)
good_rs[0]

In [None]:
# Get good Random_state for Voting Model
# NOTE(review): `good_rs` must already exist (it is assigned in a later cell).
# The feature frame does not depend on the seed, so build it once instead of 1000 times.
df_train_norm = pipeline(df_train, 3)
results = []
for x in range(6001,7001):
    print(x, end='           \r', flush=True)
    model = ExtraTreesClassifier(n_jobs=-1, random_state=x)
    _, cv_acc = score_model(model,df_train_norm, display=False, return_cv_acc=True, verbose=False);
    results.append([cv_acc,x])
# Keep only the seeds whose CV accuracy exceeds 0.91 and merge them into the ranking.
grs = [r for r in results if r[0] > .91]
print(len(grs),' '*50)
good_rs = sorted(good_rs+grs, reverse=True)
print(len(good_rs),' '*50)
print(good_rs[0])

In [None]:
# Recorded results: entry k is [score, first k seeds of the list below].
# Every seed list in the recorded data is a prefix of this one list, so the
# table is rebuilt from the seeds and the 26 recorded scores.
results_1 = [
    [score, seeds[:count]]
    for seeds in [[1189, 883, 2153, 5568, 2077, 769, 2223, 2675, 995, 0, 6890, 1612, 1548,
                   2109, 1529, 389, 3073, 1040, 5915, 5910, 5680, 4069, 3010, 1678, 1347, 729]]
    for count, score in enumerate([
        0.9111111111111111, 0.9117724867724867, 0.9124338624338624, 0.9116402116402116,
        0.9123015873015874, 0.9130291005291007, 0.9122354497354499, 0.9130952380952382,
        0.9121693121693122, 0.9119708994708994, 0.9121693121693122, 0.9123677248677249,
        0.912037037037037, 0.9123015873015874, 0.9119047619047619, 0.9121031746031745,
        0.9117724867724867, 0.9121031746031747, 0.9118386243386244, 0.9121031746031747,
        0.9119708994708994, 0.9117063492063492, 0.9118386243386242, 0.9115740740740741,
        0.9116402116402117, 0.9113756613756614,
    ], start=1)
]

# Recorded [score, seed] pairs, best first.
good_rs = [list(pair) for pair in (
    (0.9111111111111111, 1189), (0.9109788359788361, 883),
    (0.9108465608465609, 2153), (0.9107804232804233, 5568),
    (0.9106481481481481, 2077), (0.9106481481481481, 769),
    (0.9105820105820104, 2223), (0.9105158730158729, 2675),
    (0.9105158730158729, 995), (0.9104497354497356, 0),
    (0.9104497354497354, 6890), (0.9104497354497354, 1612),
    (0.9104497354497354, 1548), (0.9103835978835979, 2109),
    (0.9103835978835979, 1529), (0.9103835978835979, 389),
    (0.9103835978835978, 3073), (0.9103174603174604, 1040),
    (0.9103174603174603, 5915), (0.9102513227513228, 5910),
    (0.9102513227513228, 5680), (0.9102513227513228, 4069),
    (0.9102513227513228, 3010), (0.9102513227513228, 1678),
    (0.9102513227513228, 1347), (0.9102513227513228, 729),
    (0.9102513227513226, 6044), (0.9102513227513226, 1081),
    (0.9101851851851853, 6893), (0.9101851851851853, 6314),
    (0.9101851851851853, 6013), (0.9101851851851853, 4022),
    (0.9101851851851853, 3897), (0.9101851851851853, 3122),
    (0.9101851851851853, 1633), (0.9101851851851851, 6277),
    (0.9101851851851851, 4470), (0.9101851851851851, 130),
    (0.9101190476190476, 5954), (0.9101190476190476, 5207),
    (0.9101190476190476, 3710), (0.9101190476190476, 2124),
    (0.9101190476190476, 813), (0.9101190476190476, 50),
    (0.9100529100529101, 6994), (0.9100529100529101, 6358),
    (0.9100529100529101, 4229), (0.9100529100529101, 2752),
    (0.91005291005291, 3281),
)]

# Seed -> n_estimators lookup (used as `n_estimators=et_hp[seed]` elsewhere in the notebook).
# Written grouped by tree count, then flattened into a dict ordered by ascending seed.
et_hp = dict(sorted(
    (seed, n_trees)
    for n_trees, seeds in {
        100: (389, 729, 769, 883, 1040, 1189, 1347, 1612, 1678, 2077, 2109, 2153, 2223, 2675,
              3073, 3281, 3710, 3897, 4022, 4470, 5568, 5680, 5910, 6044, 6314, 6358, 6890),
        200: (813, 2124, 2752, 5915, 5954),
        300: (995, 1529, 3122, 6277),
        400: (6893,),
        500: (1633,),
        700: (50,),
        800: (1548,),
        900: (130, 6994),
        1000: (6013,),
        1100: (1081, 4229),
        1200: (0, 3010, 4069, 5207),
    }.items()
    for seed in seeds
))

In [None]:
# Tabulate results_1 with the highest score (column 0) first.
pd.DataFrame(results_1).sort_values(0, ascending=False)

In [None]:
# Preview the (name, estimator) pairs for the first eight seeds in `good`.
# NOTE(review): `good` is only assigned in a later cell ("Find best voting model"); on a fresh kernel this cell raises NameError.
[('et'+str(y),ExtraTreesClassifier(n_jobs=-1, random_state=y, n_estimators=et_hp[y])) for y in good[:8]]

In [None]:
# The eight ExtraTrees voting members: one 100-tree model per seed.
best_ET_voting = [
    {'name': 'et' + str(seed), 'rs': seed, 'n_tree': 100}
    for seed in (1189, 883, 2153, 5568, 2077, 769, 2223, 2675)
]

In [None]:
# Find best voting model
# 0.9110449735449736
# results_1 = []
good = [seed for _, seed in good_rs]   # seeds in ranking order
x = 8                                  # number of top seeds to ensemble
models = [
    ('et'+str(seed), ExtraTreesClassifier(n_jobs=-1, random_state=seed, n_estimators=et_hp[seed]))
    for seed in good[:x]
]
model = VotingClassifier(estimators=models, n_jobs=-1)
df_train_norm = pipeline(df_train, 3)
_, cv_acc = score_model(model, df_train_norm, display=True, return_cv_acc=True, verbose=True)

In [None]:
# Scatter of (fire-point distance minus hydrology distance) against hydrology distance, coloured by cover type.
plt.figure(figsize=(28,13), dpi= 80)
plt.scatter(
    x=df_train.Horizontal_Distance_To_Fire_Points-df_train.Horizontal_Distance_To_Hydrology,
    y=df_train.Horizontal_Distance_To_Hydrology,
    c=df_train.Cover_Type,
)
plt.show()

In [None]:
# for st in soil_types:
# Violin plot of Cover_Type for each value of `bin_st`.
# NOTE(review): `df_soil` and its `bin_st` column are not created in the visible part of the notebook -- confirm where they come from.
plt.figure(figsize=(28,13), dpi= 80)
sns.violinplot(x='bin_st', y='Cover_Type', data=df_soil, scale='width', inner='box', palette=sns.color_palette("pastel"))
plt.show()

# Alternative Models 

## Catboost

Reasons to use CatBoost:<br>
* handles missing values in numeric variables
* accepts categorical variables without prior encoding
* interpretable (exposes feature importances)

In [None]:
import ipywidgets as widgets
from IPython.core.display import display, clear_output
from ipywidgets import interact

In [None]:
# df = df_train
# NOTE(review): `df_train_dir` is not created in the visible part of the notebook -- confirm where it comes from.
df = df_train_dir.drop(columns=['Id','Cover_Type'])
X , Y = df.to_numpy(), df_train.Cover_Type.to_numpy()
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=.33, random_state=0)

# Positions of the indicator columns, which are passed to CatBoost as categorical features.
cat_features = [pos for pos, c in enumerate(df.columns) if "Soil_Type" in c or "Wilderness_Area" in c]

cv_dataset = Pool(data=X, label=Y, cat_features=cat_features)

In [None]:
# 10-fold CatBoost CV with the one-vs-all multiclass loss.
params = dict(iterations=1000,
              loss_function="MultiClassOneVsAll", # MultiClass
              verbose=False)

scores = cv(cv_dataset, params=params, nfold=10)

# One minus the minimum of every column whose name contains 'mean'.
1 - scores.filter(like='mean').apply(min)

In [None]:
# 10-fold CatBoost CV with the MultiClass loss.
params = dict(iterations=1000,
              loss_function="MultiClass", # MultiClass
              verbose=False)

scores = cv(cv_dataset, params=params, nfold=10)

# One minus the minimum of every column whose name contains 'mean'.
1 - scores.filter(like='mean').apply(min)

In [None]:
# Fit CatBoost on the hold-out split and report train/validation accuracy.
# Other knobs tried by the author: n_estimators=100, max_depth=10, learning_rate=0.1, iterations=100, OneVsAll objective.
booster = CatBoostClassifier(random_state=0, objective='MultiClass')

# NOTE(review): no eval_set is passed, so early_stopping_rounds may have no effect -- confirm.
booster.fit(
    X_train, y_train,
    cat_features=cat_features,
    plot=False,
    verbose=200,
    early_stopping_rounds=100,
)

test_preds = booster.predict(X_val)
train_preds = booster.predict(X_train)

print("Train Accuracy : %.2f"%booster.score(X_train, y_train))
print("\nTest  Accuracy : %.2f"%booster.score(X_val, y_val))

# Feature importances, least important first.
for w,k in sorted(zip(booster.get_feature_importance(), df.columns), key=lambda x: x[0]):
    print(k,w)

### V1

In [None]:
# Feature version 1. The version is passed as an integer (pipeline compares it
# with ints), and only the target is dropped because pipeline already removes `Id`.
df = pipeline(df_train, 1)
df = df.drop(columns=['Cover_Type'])
X , Y = df.to_numpy(), df_train.Cover_Type.to_numpy()
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=.33, random_state=0)

# Positions of the indicator columns, which are passed to CatBoost as categorical features.
cat_features = [df.columns.get_loc(c) for c in df.columns if "Soil_Type" in c or "Wilderness_Area" in c]

cv_dataset = Pool(data=X,
                  label=Y,
                  cat_features=cat_features,
                 )

In [None]:
# 10-fold CatBoost CV with the one-vs-all multiclass loss (feature version 1).
params = dict(iterations=1000,
              loss_function="MultiClassOneVsAll", # MultiClass
              verbose=False)

scores = cv(cv_dataset, params=params, nfold=10)

# One minus the minimum of every column whose name contains 'mean'.
1 - scores.filter(like='mean').apply(min)

In [None]:
# 10-fold CatBoost CV with the MultiClass loss (feature version 1).
params = dict(iterations=1000,
              loss_function="MultiClass", # MultiClass
              verbose=False)

scores = cv(cv_dataset, params=params, nfold=10)

# One minus the minimum of every column whose name contains 'mean'.
1 - scores.filter(like='mean').apply(min)

In [None]:
# Fit CatBoost on the hold-out split and report train/validation accuracy.
# Other knobs tried by the author: n_estimators=100, max_depth=10, learning_rate=0.1, iterations=100, OneVsAll objective.
booster = CatBoostClassifier(random_state=0, objective='MultiClass')

# NOTE(review): no eval_set is passed, so early_stopping_rounds may have no effect -- confirm.
booster.fit(
    X_train, y_train,
    cat_features=cat_features,
    plot=False,
    verbose=200,
    early_stopping_rounds=100,
)

test_preds = booster.predict(X_val)
train_preds = booster.predict(X_train)

print("Train Accuracy : %.2f"%booster.score(X_train, y_train))
print("\nTest  Accuracy : %.2f"%booster.score(X_val, y_val))

# Feature importances, least important first.
for w,k in sorted(zip(booster.get_feature_importance(), df.columns), key=lambda x: x[0]):
    print(k,w)

In [None]:
# Same CatBoost fit as the previous cell, repeated.
# Other knobs tried by the author: n_estimators=100, max_depth=10, learning_rate=0.1, iterations=100, OneVsAll objective.
booster = CatBoostClassifier(random_state=0, objective='MultiClass')

# NOTE(review): no eval_set is passed, so early_stopping_rounds may have no effect -- confirm.
booster.fit(
    X_train, y_train,
    cat_features=cat_features,
    plot=False,
    verbose=200,
    early_stopping_rounds=100,
)

test_preds = booster.predict(X_val)
train_preds = booster.predict(X_train)

print("Train Accuracy : %.2f"%booster.score(X_train, y_train))
print("\nTest  Accuracy : %.2f"%booster.score(X_val, y_val))

# Feature importances, least important first.
for w,k in sorted(zip(booster.get_feature_importance(), df.columns), key=lambda x: x[0]):
    print(k,w)

In [None]:
# Per-class accuracy of the fitted booster on the validation split.
pred = booster.predict(X_val)
acc = accuracy_score(y_val, pred)
matrix = cm(y_val, pred)
# Diagonal = correct predictions per class; row sums = true count per class.
print(np.diag(matrix) / matrix.sum(axis=1))

# Tuning Hyperparameters

In [None]:
def tune_estimator(v, rs=1189, model=''):
    """Find the tree count with the best CV accuracy for one feature version.

    Tries `n_estimators` = 100, 200, ..., 1500 and scores each candidate
    with `score_model` on `pipeline(df_train, v)`.

    Parameters
    ----------
    v : feature version passed to `pipeline`
    rs : random_state given to every candidate forest
    model : 'ET' (ExtraTreesClassifier) or 'RF' (RandomForestClassifier)

    Returns
    -------
    int
        The `n_estimators` value with the highest mean CV accuracy
        (the smallest such value on ties).

    Raises
    ------
    ValueError
        If `model` is neither 'ET' nor 'RF'. Such values used to run the
        whole loop without scoring anything and then fail while sorting
        `None` accuracies.
    """
    if model not in ('ET', 'RF'):
        raise ValueError("model must be 'ET' or 'RF', got {!r}".format(model))
    estimator_cls = ExtraTreesClassifier if model == 'ET' else RandomForestClassifier

    df_train_norm = pipeline(df_train, v)
    tree_size_acc = []
    for n in range(100,1501,100):
        # Single-line progress indicator, overwritten on each iteration.
        print(n, flush=True, end=' '*20+'\r')
        _, acc = score_model(estimator_cls(n_estimators=n, n_jobs=-1, random_state=rs), df_train_norm,
                             display=False, return_cv_acc=True, return_time=False, verbose=False)
        tree_size_acc.append((n, acc))

    # max() returns the first maximum, matching a stable descending sort.
    best_n, _ = max(tree_size_acc, key=lambda pair: pair[1])
    return best_n

In [None]:
# Sweep n_estimators for ExtraTrees on feature version 3 and keep (n, cv accuracy) pairs.
# `rs` was only assigned in later cells; set it here (same value, 1189) so this
# cell also runs on a fresh kernel.
rs = 1189
df_train_norm = pipeline(df_train, 3)
tree_size_acc = []
for n in range(100,1501,100):
    print(n, flush=True, end=' '*20+'\r')
    m, acc = score_model(ExtraTreesClassifier(n_estimators=n, n_jobs=-1, random_state=rs), df_train_norm, 
                                         display=False, return_cv_acc=True, return_time=False, verbose=False)
    tree_size_acc.append((n, acc))

In [None]:
# CV accuracy as a function of the number of trees.
fig, ax = plt.subplots(figsize=(10,5))
# Rows of the transposed array are the n_estimators values and the accuracies.
l2 = ax.plot(*np.array(tree_size_acc).T, label='n_estimators')
# general plot
ax.legend()
ax.set_xlabel('N_Estimators', color='w')
ax.set_ylabel('Accuracy', color='w')
plt.show()

In [None]:
# Show the (n_estimators, accuracy) pair with the highest accuracy.
sorted(tree_size_acc, key=lambda x: x[1], reverse=True)[0]

# Result

## ET

In [9]:
# List the columns of the current engineered training frame.
df_train_norm.columns

Index(['Elevation', 'Slope', 'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways',
       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1',
       'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4',
       'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5',
       'Soil_Type6', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10', 'Soil_Type11',
       'Soil_Type12', 'Soil_Type13', 'Soil_Type14', 'Soil_Type15',
       'Soil_Type16', 'Soil_Type17', 'Soil_Type18', 'Soil_Type19',
       'Soil_Type20', 'Soil_Type21', 'Soil_Type22', 'Soil_Type23',
       'Soil_Type24', 'Soil_Type25', 'Soil_Type26', 'Soil_Type27',
       'Soil_Type28', 'Soil_Type29', 'Soil_Type30', 'Soil_Type31',
       'Soil_Type32', 'Soil_Type33', 'Soil_Type34', 'Soil_Type35',
       'Soil_Type36', 'Soil_Type37', 'Soil_Type38', 'Soil_Type39',
       'Soil_Type40', 'Cover_Type', 'log_Ho

In [None]:
# CV accuracy of a default ExtraTrees model on every feature version.
rs = 1189
for v in range(4):
    print(v)
    df_train_norm = pipeline(df_train, v)
    model = ExtraTreesClassifier(random_state=rs, n_jobs=-1)
    _, cv_acc = score_model(model, df_train_norm, display=False, return_cv_acc=True, verbose=True)

In [None]:
# For every feature version: tune the tree count, refit on all data, write a submission file.
model_name = 'ET'
rs = 1189
for v in range(4):
    print(v)
    best_n = tune_estimator(v, rs, model_name)
    model = ExtraTreesClassifier(n_estimators=best_n, random_state=rs, n_jobs=-1)
    out_path = '{}-{}-{}.csv'.format(model_name,v,best_n)
    submit(model, v).to_csv(out_path, index=False)

In [None]:
# Nine point colours for the aspect diagram; the first and last are the same.
c = [
    '#d62728',
    '#268C70',
    '#2ca02c',
    '#1f77b4',
    '#9467bd',
    '#608475',
    '#e377c2',
    '#BC6FC0',
    '#d62728',
]

In [None]:
# Diagram of the aspect transformation: original compass bearings on the left
# axis, shifted bearings on the right axis, arrows connecting the two.
fig, ax = plt.subplots(figsize=(20,10))
angles = np.arange(0,361,45)   # 0, 45, ..., 360 degrees

# ax1 for original
ax.scatter([1]*len(angles), angles, c=c)
ax.set_ylim((-20,380))
ax.set_xticks([])
ax.set_yticks(angles)
ax.set_ylabel('Original Aspect', c='w', size='x-large')
ax.set_yticklabels(['N','NE','E','SE','S','SW','W','NW','N'], fontsize='x-large')

# ax2 for transformed
ax2 = ax.twinx()
ax2.scatter([2]*len(angles), np.where(angles > 180, angles-180, angles+180), c=c)
ax2.set_ylim((-20,380))
ax2.set_xticks([])
ax2.set_ylabel('Transformed Aspect', c='w', size='x-large')
ax2.set_yticks(np.arange(45,361,45))
ax2.set_yticklabels(['SW','W','NW','N','NE','E','SE','S'], fontsize='x-large')

v = .02
# Downward arrows start at 360, 315, 270 and 225; only the first carries the legend label.
for i in range(4):
    legend_kw = {'label': 'Aspect - 180'} if i == 0 else {}
    ax.arrow(1+v, 360-i*45,
             1-v*4.5, -180+10, head_width=0.03, head_length=10, fc='blue', ec='darkblue', ls=':',
             **legend_kw)
# Upward arrows start at 0, 45, 90, 135 and 180; only the first carries the legend label.
for i in range(5):
    legend_kw = {'label': 'Aspect + 180'} if i == 0 else {}
    ax.arrow(1+v, 0+i*45,
             1-v*4.5, 180-10, head_width=0.03, head_length=10, fc='green', ec='darkgreen', ls=(0,(5,5)),
             **legend_kw)
ax.grid(False)
ax2.grid(False)
ax.legend(loc='upper center', fontsize='x-large')
plt.show()

## ET Voting

In [None]:
# CV accuracy of the eight-member ExtraTrees voting ensemble on every feature version.
ets_model = [
    {'name': 'et' + str(seed), 'rs': seed, 'n_tree': 100}
    for seed in (1189, 883, 2153, 5568, 2077, 769, 2223, 2675)
]
models = [
    (spec['name'], ExtraTreesClassifier(n_jobs=-1, random_state=spec['rs'], n_estimators=spec['n_tree']))
    for spec in ets_model
]

for v in range(4):
    print(v)
    df_train_norm = pipeline(df_train, v)
    model = VotingClassifier(estimators=models, n_jobs=-1)
    _, cv_acc = score_model(model, df_train_norm, display=False, return_cv_acc=True, verbose=True)

In [None]:
model_name = 'ET_Voting'
ets_model = [{'name': 'et1189', 'rs': 1189, 'n_tree': 100},
             {'name': 'et883', 'rs': 883, 'n_tree': 100},
             {'name': 'et2153', 'rs': 2153, 'n_tree': 100},
             {'name': 'et5568', 'rs': 5568, 'n_tree': 100},
             {'name': 'et2077', 'rs': 2077, 'n_tree': 100},
             {'name': 'et769', 'rs': 769, 'n_tree': 100},
             {'name': 'et2223', 'rs': 2223, 'n_tree': 100},
             {'name': 'et2675', 'rs': 2675, 'n_tree': 100}]
models = [(y['name'],ExtraTreesClassifier(n_jobs=-1, random_state=y['rs'], n_estimators=y['n_tree'])) for y in ets_model]

for v in range(0,4):
    print(v)
    model = VotingClassifier(models, n_jobs=-1)
    submit(model, v).to_csv('{}-{}.csv'.format(model_name,v), index=False)

## RF

In [None]:
# CV accuracy of a default RandomForest model on every feature version.
rs = 1189
for v in range(4):
    print(v)
    df_train_norm = pipeline(df_train, v)
    model = RandomForestClassifier(random_state=rs, n_jobs=-1)
    _, cv_acc = score_model(model, df_train_norm, display=False, return_cv_acc=True, verbose=True)

In [None]:
# For every feature version: tune the tree count, refit on all data, write a submission file.
model_name = 'RF'
rs = 1189
for v in range(4):
    print(v)
    best_n = tune_estimator(v, rs, model_name)
    model = RandomForestClassifier(n_estimators=best_n, random_state=rs, n_jobs=-1)
    out_path = '{}-{}-{}.csv'.format(model_name,v,best_n)
    submit(model, v).to_csv(out_path, index=False)

## CatBoost

In [None]:
# CV accuracy of CatBoost on every feature version.
for v in range(4):
    print(v)
    df_train_norm = pipeline(df_train, v)
    # Column positions of the indicator features, taken from the frame that still holds Cover_Type.
    cat_features = [
        pos for pos, c in enumerate(df_train_norm.columns)
        if "Soil_Type" in c or "Wilderness_Area" in c
    ]
    model = CatBoostClassifier(cat_features=cat_features, random_state=None, objective='MultiClass', verbose=False)
    _, cv_acc = score_model(model, df_train_norm, display=False, return_cv_acc=True, verbose=True)

In [None]:
# Refit CatBoost on all data and write one submission per feature version.
model_name = 'CatBoost'
for v in range(4):
    print(v)
    df_train_norm = pipeline(df_train, v)
    # Column positions of the indicator features, taken from the frame that still holds Cover_Type.
    cat_features = [
        pos for pos, c in enumerate(df_train_norm.columns)
        if "Soil_Type" in c or "Wilderness_Area" in c
    ]
    model = CatBoostClassifier(cat_features=cat_features, objective='MultiClass', verbose=False)
    out_path = '{}-{}.csv'.format(model_name,v)
    submit(model, v).to_csv(out_path, index=False)

# Plot results

In [None]:
# Global matplotlib theme for the result plots: grey axes, white grid and ticks, custom colour cycle.
from matplotlib import cycler
cs = ['#EE6666', '#7abf0a', '#9988DD',
      '#EECC55', '#88BB44', '#3a32d1',
      '#391306', '#3388BB', '#1DC690']
colors = cycler('color', cs)
plt.rcParams.update({
    'axes.facecolor': '#E6E6E6',
    'axes.edgecolor': 'none',
    'axes.axisbelow': True,
    'axes.grid': True,
    'axes.prop_cycle': colors,
    'grid.color': 'w',
    'grid.linestyle': 'solid',
    'xtick.direction': 'out',
    'xtick.color': 'w',
    'ytick.direction': 'out',
    'ytick.color': 'w',
    'patch.edgecolor': '#E6E6E6',
    'lines.linewidth': 2,
})

## pipeline version acc change

In [None]:
# Per-class hold-out accuracy of ExtraTrees for each feature version.
pv_acc = []
rs = 1189
for v in range(4):
    print(v)
    df_train_norm = pipeline(df_train, v)
    model = ExtraTreesClassifier(random_state=rs, n_jobs=-1)
    _, class_acc = score_model(model, df_train_norm, display=False, return_cv_acc=False,
                               verbose=True, return_class_acc=True)
    # Row layout: (version, one accuracy per class)
    pv_acc.append((v, *class_acc))

In [None]:
# One line per class: accuracy for each feature version, shifted so the class's worst version sits at zero.
pv_df = pd.DataFrame(pv_acc)
fig, ax = plt.subplots(figsize=(20,10))

# Column 0 holds the version; columns 1.. hold the per-class accuracies.
for x in range(1,pv_df.shape[1]):
    shifted = pv_df[x] - pv_df[x].min()
    ax.plot(pv_df[0], shifted, label='Cover Type {}'.format(x))

ax.set_xticks([0,1,2,3])
ax.set_xlabel('Feature Version', color='limegreen', size='large')
ax.set_ylabel('Accuracy', color='limegreen', size='large')
ax.legend()
plt.show()

## Plot Acc

In [None]:
# Load the result table from an Excel file in the working directory and display it.
result_df = pd.read_excel('model_results.xlsx')
result_df

In [None]:
# Accuracy per feature version: solid line = `acc` column, dashed line = `k_acc` column, one colour per model.
fig, ax = plt.subplots(1,1, figsize=(20,10))

# cmap = plt.cm.get_cmap('copper',len(result_df.model.unique()))
for i, model_name in enumerate(result_df.model.unique()):
    df_r = result_df[result_df.model == model_name]
    line_color = cs[i]
    ax.plot(df_r.version, df_r.acc, c=line_color, label='{} training CV accuracy'.format(model_name))
    ax.plot(df_r.version, df_r.k_acc, c=line_color, linestyle='--', dashes=(5,5), label='{} test accuracy'.format(model_name))

ax.set_xticks([0,1,2,3])
ax.set_xlabel('Feature Version', color='limegreen', size='large')
ax.set_ylabel('Accuracy', color='limegreen', size='large')
ax.legend()
plt.show()

## Plot Time

In [None]:
# Time taken per feature version, one line per model.
fig, ax = plt.subplots(1,1, figsize=(20,10))

for i, model_name in enumerate(result_df.model.unique()):
    df_r = result_df[result_df.model == model_name]
    ax.plot(df_r.version, df_r.time, label='{} - time taken'.format(model_name), c=cs[i])
    
ax.set_xticks([0,1,2,3])
ax.set_xlabel('Feature Version')
# This figure plots the `time` column, so the axis must not be labelled 'Accuracy'.
ax.set_ylabel('Time taken')
ax.xaxis.label.set_color('limegreen')
ax.xaxis.label.set_size('large')
ax.yaxis.label.set_color('limegreen')
ax.yaxis.label.set_size('large')
ax.legend()
plt.show()