In [None]:
%matplotlib inline
import math
import time
import pandas as pd
import numpy as np
import lime
import lime.lime_tabular
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.decomposition import PCA, SparsePCA, TruncatedSVD
from sklearn.ensemble import *
from sklearn.preprocessing import normalize
import math
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.metrics import accuracy_score, plot_confusion_matrix, confusion_matrix as cm
from sklearn.preprocessing import normalize
import seaborn as sn
import keras
from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils.np_utils import to_categorical
from itertools import combinations 

In [None]:
# Load the raw training and test tables from the current working directory.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# normalization
# Columns that are overwritten with the output of `normalize` below.
col_normalize = ['Aspect','Slope','Horizontal_Distance_To_Hydrology','Vertical_Distance_To_Hydrology', 'Hillshade_9am','Hillshade_Noon','Hillshade_3pm','Horizontal_Distance_To_Fire_Points']
df_train_norm = df_train.copy()
# Euclidean distance to hydrology. This must run before the next line, which
# replaces the two raw distance columns in df_train_norm with normalized values.
df_train_norm['Distance_To_Hydrology'] = (df_train_norm['Horizontal_Distance_To_Hydrology']**2 + df_train_norm['Vertical_Distance_To_Hydrology']**2)**(1/2)
# NOTE(review): sklearn's `normalize` presumably rescales each ROW to unit norm
# with its defaults (not each column) -- confirm that this is the intended scaling.
df_train_norm[col_normalize] = normalize(df_train[col_normalize])
# log(x + 1) of the raw roadway distance.
df_train_norm['log_Horizontal_Distance_To_Roadways'] = (df_train['Horizontal_Distance_To_Roadways']+1).apply(np.log)
# log(Elevation**1.5 + 1); Elevation is not in col_normalize, so it is still raw here.
df_train_norm['log_Elevation'] = ((df_train_norm['Elevation']**1.5)+1).apply(np.log)
# NOTE(review): the result of drop() is not assigned, so df_train_norm keeps both
# columns; this line only changes what the cell displays. Later cells still read
# 'Elevation' from copies of df_train_norm, so confirm before making the drop stick.
df_train_norm.drop(columns=['Elevation','Horizontal_Distance_To_Roadways'])


In [None]:
# Shared 10-fold splitter (shuffled, fixed seed) used by score_model for cross-validation.
kfold = model_selection.KFold(n_splits=10, random_state=0, shuffle=True)
np.set_printoptions(precision=5)


def score_model(model,df, return_val=False, return_train=False, display=True, return_acc=False, return_time=False, show_weights=False):
    """Cross-validate `model` on `df`, refit it on a 67/33 split and report accuracy.

    Parameters
    ----------
    model : estimator with fit/predict (and `feature_importances_` if
        `show_weights` is set).
    df : DataFrame containing a 'Cover_Type' target column. An 'Id' column is
        removed when present and tolerated when absent.
    return_val, return_train, return_acc, return_time : bool
        Select which extra items are appended to the returned tuple.
    display : bool
        Print per-class recall and draw the normalized confusion matrix.
    show_weights : bool
        Print feature importances in ascending order.

    Returns
    -------
    tuple
        `(model, [X_train, y_train], [X_val, y_val], [acc], [elapsed])`, where
        the bracketed items appear only when the matching flag is set.
    """
    # pipeline() already strips 'Id', so frames produced by it must not fail here.
    features = df.drop(columns=['Id','Cover_Type'], errors='ignore')
    X , Y = features.to_numpy(), df.Cover_Type.to_numpy()
    X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=.33, random_state=0)

    # The timer deliberately covers cross-validation, the refit and the prediction.
    start = time.time()
    results = model_selection.cross_val_score(model, X, Y, cv=kfold)
    model.fit(X_train, y_train)
    pred = model.predict(X_val)
    acc = accuracy_score(y_val, pred)
    end = time.time()

    print('\nModel:',type(model).__name__)
    print('\tcv acc:', round(results.mean(),4))
    print('\tsplit acc:', round(acc,4))
    print('\ttime taken:', round(end-start, 4))
    if display:
        matrix = cm(y_val, pred)
        # Diagonal over row sums = recall of each class.
        print('\t', matrix.diagonal() / matrix.sum(axis=1))

        # np.unique gives a sorted, ordered label sequence; a set has no guaranteed order.
        disp = plot_confusion_matrix(model, X_val, y_val, display_labels=np.unique(y_train), cmap=plt.cm.Blues, normalize='true')
        plt.show()

    if show_weights:
        for w,k in sorted(zip(model.feature_importances_, features.columns), key=lambda pair: pair[0]):
            print(k,w)

    # return all data
    return_data = [model]
    if return_train:
        return_data += [X_train, y_train]
    if return_val:
        return_data += [X_val, y_val]
    if return_acc:
        return_data += [acc]
    if return_time:
        return_data += [end-start]
    return tuple(return_data)

In [None]:
# This creates a dataset for EDA manipulation
df_eda = df_train_norm.copy()

# This creates 2 new columns that summarize the Wilderness Area and Soil Type columns for ease of visualization.
# The indicator columns are selected by name rather than by position: the previous
# positional slice 16:55 covered only 39 columns and therefore skipped Soil_Type1.
wilderness_cols = [col for col in df_eda.columns
                   if col.startswith('Wilderness_Area') and col[len('Wilderness_Area'):].isdigit()]
soil_cols = [col for col in df_eda.columns
             if col.startswith('Soil_Type') and col[len('Soil_Type'):].isdigit()]
df_eda['Wilderness_Area'] = df_eda[wilderness_cols].idxmax(axis=1).str.replace('Wilderness_Area', '', regex=False)
df_eda['Soil_Type'] = df_eda[soil_cols].idxmax(axis=1).str.replace('Soil_Type', '', regex=False)

In [None]:
# Row-wise total of the four wilderness indicator columns, added left to right.
wilderness_flags = ['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4']
flag_total = df_eda[wilderness_flags[0]]
for flag in wilderness_flags[1:]:
    flag_total = flag_total + df_eda[flag]
df_eda['Wilderness_Areas'] = flag_total

# Elevation histogram for cover type 1, on its own figure.
fig_one, ax_one = plt.subplots()
df_one = df_eda.loc[df_eda['Cover_Type'] == 1]
sn.histplot(data=df_one, x='Elevation', ax=ax_one)

# Elevation histogram for cover type 2, on its own figure.
fig_two, ax_two = plt.subplots()
df_two = df_eda.loc[df_eda['Cover_Type'] == 2]
sn.histplot(data=df_two, x='Elevation', ax=ax_two)

In [None]:
from sklearn.feature_selection import SelectKBest

# Rank features by their univariate SelectKBest score for separating cover types 1 and 2.
df_train_sp = df_train_norm[df_train_norm['Cover_Type'].isin([1, 2])]
features_sp = df_train_sp.drop(columns=['Id','Cover_Type'], errors='ignore')
# Names come from the exact frame that produced X. Indexing df_train_sp.columns
# with a fixed +1 offset only compensates for 'Id' and mislabels every column
# located after 'Cover_Type' (i.e. the engineered features).
feature_names = features_sp.columns
X , Y = features_sp.to_numpy(), df_train_sp.Cover_Type.to_numpy()
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=.33, random_state=0)
X_new = SelectKBest(k='all').fit(X_train,y_train)
scores = X_new.scores_
ind_sc = np.argsort(scores)[::-1]  # highest score first; NaN scores sort to the front
j = 0
for feature_idx in ind_sc:
    # Features without a defined score are skipped and do not consume a rank.
    if np.isnan(scores[feature_idx]):
        continue
    print(str(j) + ". " + feature_names[feature_idx] + ": " + str(scores[feature_idx]))
    j+=1

In [None]:
# Display the column index of the cover-type 1/2 subset built in the previous cell.
df_train_sp.columns

### Kevin's code: 

In [None]:
def pipeline(df, version=0):
    """Build the model feature frame from a raw table; `df` itself is left untouched.

    Each `version` level adds to the previous ones:
      0  -- remove 'Id' and cast Soil_Type* / Wilderness_Area* columns to bool.
      1+ -- add log(x + 1) of the roadway and fire-point distances, overwrite a
            fixed set of columns with `normalize(...)` output, remove 'Soil_Type7'.
      2+ -- add Elevation ** 1.5 as 'sq_Elevation' and replace 'Aspect' with
            'norm_aspect' (aspect shifted by 180 in either direction).
      3+ -- take |Vertical_Distance_To_Hydrology| and add elevation/distance
            combination features computed from the raw input columns.

    Returns the new DataFrame.
    """
    features = df.drop(columns=['Id'])
    indicator_cols = [name for name in features.columns
                      if "Soil_Type" in name or "Wilderness_Area" in name]
    features = features.astype(dict.fromkeys(indicator_cols, 'bool'))

    if version >= 1:
        scaled_cols = ['Slope','Horizontal_Distance_To_Hydrology','Vertical_Distance_To_Hydrology',
                       'Hillshade_9am','Hillshade_Noon','Hillshade_3pm','Horizontal_Distance_To_Fire_Points',
                       'Horizontal_Distance_To_Roadways']
        # The log features are taken first, while the distance columns are still unscaled.
        for raw_col in ('Horizontal_Distance_To_Roadways', 'Horizontal_Distance_To_Fire_Points'):
            features['log_' + raw_col] = np.log(features[raw_col] + 1)
        features[scaled_cols] = normalize(features[scaled_cols])
        features = features.drop(columns=['Soil_Type7'])

    if version >= 2: # 0.8964947089947091
        def shift_half_turn(angle):
            # Values above 180 move down by 180, all others move up by 180.
            if angle > 180:
                return angle - 180
            return angle + 180

        features['sq_Elevation'] = np.power(df['Elevation'], 1.5)
        features = features.drop(columns='Aspect')
        features['norm_aspect'] = df.Aspect.map(shift_half_turn) # np.abs(df.Aspect - 180)

    if version >= 3: # 0.9104497354497356
        # Everything below except the abs() reads the raw input frame, not the scaled one.
        elevation = df.Elevation
        to_water_h = df.Horizontal_Distance_To_Hydrology
        to_water_v = df.Vertical_Distance_To_Hydrology
        to_fire = df.Horizontal_Distance_To_Fire_Points
        to_road = df.Horizontal_Distance_To_Roadways

        features['Vertical_Distance_To_Hydrology'] = np.abs(features.Vertical_Distance_To_Hydrology)
        features['E-VH'] = elevation - to_water_v * .9
        features['E-HH'] = elevation - to_water_h * .5

        features['F+R'] = (to_fire + to_road) ** 2
        features['F+H'] = (to_fire + to_water_h) ** 0.3
        features['H+R'] = (to_water_h + to_road)

        features['abs_H-R'] = (np.abs(to_water_h - to_road))
        features['abs_H-F'] = (np.abs(to_water_h - to_fire))
        features['abs_F-R'] = (np.abs(to_fire - to_road))
    return features

def _collapse_one_hot(frame, prefix):
    """Return a copy of `frame` where the `<prefix><number>` indicators become one float `<prefix>` column."""
    indicator_cols = [col for col in frame.columns
                      if col.startswith(prefix) and col[len(prefix):].isdigit()]
    if not indicator_cols:
        raise ValueError("no '" + prefix + "<number>' indicator columns found")
    collapsed = frame.copy()
    # idxmax returns the first indicator column that is set in each row; a row with
    # no indicator set falls back to the first indicator column.
    collapsed[prefix] = (
        collapsed[indicator_cols].eq(1)
        .idxmax(axis=1)
        .str.replace(prefix, '', regex=False)
        .astype(float)
    )
    return collapsed.drop(columns=indicator_cols)


def soils(model):
    """Collapse the Soil_Type<N> indicator columns into a single float 'Soil_Type' column.

    Despite its name, `model` is the DataFrame to transform, which is how every
    call in this notebook uses it. Any other argument falls back to the global
    `df_train_norm`, the only behaviour the previous implementation had.

    Fixes over the previous version: the last indicator (Soil_Type40) is now part
    of the selection (the old end-exclusive iloc slice dropped it, so those rows
    were labelled 1), and every indicator column that is present is removed,
    including Soil_Type7 when pipeline() has not already dropped it.

    Returns a new DataFrame; the input is not modified.
    """
    source = model if isinstance(model, pd.DataFrame) else df_train_norm
    return _collapse_one_hot(source, 'Soil_Type')


def wa(model):
    """Collapse the Wilderness_Area<N> indicator columns into a single float 'Wilderness_Area' column.

    Despite its name, `model` is the DataFrame to transform; any other argument
    falls back to the global `df_train_norm`. Wilderness_Area4 is now included in
    the selection (the old end-exclusive iloc slice dropped it, so those rows
    were labelled 1).

    Returns a new DataFrame; the input is not modified.
    """
    source = model if isinstance(model, pd.DataFrame) else df_train_norm
    return _collapse_one_hot(source, 'Wilderness_Area')

def submit(model,version):
    """Fit `model` on the full training table and predict the test table.

    Both the global `df_train` and `df_test` are passed through
    `pipeline(..., version)`. Returns a two-column frame ('Id', 'Cover_Type')
    built from a copy of `df_test` with the predictions attached.
    """
    train_features = pipeline(df_train.copy(), version)
    test_features = pipeline(df_test.copy(), version)

    inputs = train_features.drop(columns=['Cover_Type']).to_numpy()
    targets = train_features.Cover_Type.to_numpy()
    model.fit(inputs, targets)

    submission = df_test.copy()
    submission['Cover_Type'] = model.predict(test_features.to_numpy())
    return submission[['Id','Cover_Type']]

In [None]:
# Train an ExtraTrees model on cover types 1 vs 2 and collect the validation rows
# where the model predicts the other class.
df_train_norm = pipeline(df_train, version=3)
df_train_norm = soils(df_train_norm)
df_train_norm = wa(df_train_norm)
df_train_norm_1_2 = df_train_norm[df_train_norm['Cover_Type'].isin([1, 2])]

etc = ExtraTreesClassifier(n_jobs=-1, random_state=0)
# pipeline() has already removed 'Id', so its absence must not raise here.
# dtype=float keeps the matrix numeric even if a bool indicator column remains.
X , Y = df_train_norm_1_2.drop(columns=['Id','Cover_Type'], errors='ignore').to_numpy(dtype=float), df_train_norm_1_2.Cover_Type.to_numpy()
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=.33, random_state=0)
etc.fit(X_train,y_train)
predict_fn = lambda x: etc.predict_proba(x).astype(float)
df_train_exp=df_train_norm.drop(columns=['Id','Cover_Type'], errors='ignore')
# Create Lime Explainer
explainer =  lime.lime_tabular.LimeTabularExplainer(X_train, feature_names = list(df_train_exp.columns), class_names = ['1','2'])

# Positions of the validation rows whose true label is 1 / 2.
ones = np.flatnonzero(y_val == 1)
twos = np.flatnonzero(y_val == 2)

# Up to this many examples are kept per class.
MAX_EXAMPLES = 75

# One batched prediction replaces the per-row predict_proba calls. The per-row LIME
# explanations that were computed and immediately discarded are no longer generated;
# call explainer.explain_instance(row, predict_fn, num_features=10) on a selected
# row when an explanation is actually needed.
val_pred = etc.predict(X_val)

# examples_one: true class 1 predicted as 2; examples_two: true class 2 predicted as 1
# (same selection as the former `argmax() != 1` / `argmax() != 0` filters).
examples_one = [X_val[ind] for ind in ones if val_pred[ind] == 2][:MAX_EXAMPLES]
examples_two = [X_val[ind] for ind in twos if val_pred[ind] == 1][:MAX_EXAMPLES]

# Counters kept under their original names.
k = len(examples_one)
j = len(examples_two)


In [None]:
# Stack the collected validation rows into one labelled frame per class.
# np.vstack replaces np.row_stack, an alias that NumPy 2.0 deprecates.
feature_columns = list(df_train_exp.columns)
df_one = pd.DataFrame(np.vstack(examples_one), columns=feature_columns)
df_two = pd.DataFrame(np.vstack(examples_two), columns=feature_columns)

df_one

In [None]:
# One figure per feature with the two example groups side by side. Each figure is
# labelled and closed after display, so the loop no longer leaves two anonymous,
# indistinguishable figures open per feature.
for feature in df_one.columns:
    fig, (ax_one, ax_two) = plt.subplots(1, 2, figsize=(12, 4))
    fig.suptitle(feature)

    df_one[feature].astype('float').hist(ax=ax_one)
    ax_one.set(title='df_one (true Cover_Type 1)', xlabel=feature)

    df_two[feature].astype('float').hist(ax=ax_two)
    ax_two.set(title='df_two (true Cover_Type 2)', xlabel=feature)

    plt.show()
    plt.close(fig)

In [None]:
# Histogram of Elevation for the df_one examples.
elevation_one = df_one['Elevation'].astype('float')
elevation_one.hist()

In [None]:
# Print the per-column mean, then the standard deviation, of df_one with pandas'
# row/column truncation switched off.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    for summarize in (df_one.mean, df_one.std):
        print(summarize())

In [None]:
# Print the per-column mean, then the standard deviation, of df_two with pandas'
# row/column truncation switched off.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    for summarize in (df_two.mean, df_two.std):
        print(summarize())

In [None]:
# Score an ExtraTrees classifier on the version-3 feature set with the one-hot
# soil / wilderness columns left as they are (the collapsing steps are disabled).
df_train_norm = pipeline(df_train, version=3)
# df_train_norm = soils(df_train_norm)
# df_train_norm = wa(df_train_norm)
etc = ExtraTreesClassifier(n_jobs=-1, random_state=0)
# NOTE(review): pipeline() removes 'Id' while score_model() drops
# ['Id', 'Cover_Type'] -- confirm score_model tolerates the missing column.
score_model(etc, df_train_norm)

In [None]:
# Compare engineered columns with the raw Aspect column, one figure each.
df_train_norm = pipeline(df_train, version=3)

fig_vertical, ax_vertical = plt.subplots()
sn.histplot(data=df_train_norm, x='Vertical_Distance_To_Hydrology', ax=ax_vertical)

fig_shifted, ax_shifted = plt.subplots()
sn.histplot(data=df_train_norm, x='norm_aspect', ax=ax_shifted)

fig_raw, ax_raw = plt.subplots()
sn.histplot(data=df_train, x='Aspect', ax=ax_raw)


In [None]:
# Train an ExtraTrees model on cover types 3 vs 6 and show LIME explanations for
# validation rows of type 3 that the model assigns to type 6.
df_train_norm = pipeline(df_train, version=0)
df_train_norm = soils(df_train_norm)
df_train_norm = wa(df_train_norm)
# The variable name is historical: this frame holds cover types 3 and 6.
df_train_norm_1_2 = df_train_norm[df_train_norm['Cover_Type'].isin([3, 6])]

etc = ExtraTreesClassifier(n_jobs=-1, random_state=0)
# pipeline() has already removed 'Id', so its absence must not raise here.
df_train_exp=df_train_norm_1_2.drop(columns=['Id','Cover_Type'], errors='ignore')
# dtype=float keeps the matrix numeric even if a bool indicator column remains.
X , Y = df_train_exp.to_numpy(dtype=float), df_train_norm_1_2.Cover_Type.to_numpy()
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=.33, random_state=0)
etc.fit(X_train,y_train)
predict_fn = lambda x: etc.predict_proba(x).astype(float)
# Create Lime Explainer
explainer =  lime.lime_tabular.LimeTabularExplainer(X_train, feature_names = list(df_train_exp.columns), class_names = ['3','6'])

# Validation rows whose true label is 3.
ones = [ind for ind in range(y_val.shape[0]) if y_val[ind] == 3]
k=0
for ind in ones:
    probs = np.array(etc.predict_proba([X_val[ind]])[0])
    # Probability column 1 belongs to the larger label (6): keep only the rows
    # the model assigns to 6.
    if probs.argmax() != 1:
        continue
    # The expensive explanation is computed only for rows that are actually shown.
    exp = explainer.explain_instance(X_val[ind], predict_fn, num_features=10)
    print(probs)
    k+=1
    exp.show_in_notebook(show_all=False)
    if k == 10:
        break

In [None]:
# Histograms of the 'sl_asp' column for cover types 3 and 6, one figure each.
# NOTE(review): no visible cell or pipeline() version creates an 'sl_asp' column,
# so this cell presumably relies on a feature from a removed cell -- confirm how
# 'sl_asp' is derived and add it before these plots.
plt.figure()
df_one = df_train_norm[df_train_norm['Cover_Type'] == 3]

sn.histplot(data=df_one, x='sl_asp')

plt.figure()
df_two = df_train_norm[df_train_norm['Cover_Type'] == 6]
sn.histplot(data=df_two, x='sl_asp')

### Light GBM

In [None]:
!pip install lightgbm
# if not installed ^

In [None]:
import lightgbm as lgb

In [None]:
import lightgbm as lgb
from itertools import product

# Exhaustive grid over four LightGBM hyperparameters.
n_estimator = [100, 200, 500]
boosting_types = ['gbdt', 'dart', 'goss']
max_depth = [4, 6, 8]
num_leaves = [70, 90, 100]
output = {}

# Build the split from the version-3 features computed here. Previously
# df_train_c was never used and the models were fitted on whatever
# X_train / y_train an earlier cell had left behind.
df_train_c = pipeline(df_train, version=3)
X, Y = df_train_c.drop(columns=['Cover_Type']).to_numpy(dtype=float), df_train_c.Cover_Type.to_numpy()
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=.33, random_state=0)

# Same iteration order as the former nested loops.
for n_trees, boosting, depth, leaves in product(n_estimator, boosting_types, max_depth, num_leaves):
    # Fixed seed so that repeated runs of the grid are comparable.
    model = lgb.LGBMClassifier(n_estimators=n_trees, boosting_type=boosting, max_depth=depth,
                               num_leaves=leaves, random_state=0)
    start = time.time()
    model.fit(X_train, y_train)
    pred = model.predict(X_val)
    acc = accuracy_score(y_val, pred)
    end = time.time()
    # Key layout: <boosting_type>_<n_estimators>_<max_depth>_<num_leaves> -> [accuracy, seconds]
    output[str(boosting) + '_' + str(n_trees) + '_' + str(depth) + '_' + str(leaves)] = [acc, end-start]
output

In [None]:
# Recorded outcome of the LightGBM grid search. The figures are copied from a past
# run and are not recomputed here. The message previously named the wrong model
# ("extra tree") and misspelled the `boosting_type` parameter.
print("Hyperparameters: boosting_type: 'gbdt', n_estimators = 100, max_depth = 8, num_leaves = 90")
print("With the best hyperparameters, the LightGBM model achieves: " + " 0.8806 accuracy in 2.85 seconds")

In [None]:
# Version-3 feature frame for the test table (pipeline() removes 'Id').
df_submit = pipeline(df_test.copy(),version=3)

In [None]:
# Display the engineered test features for a visual check.
df_submit