In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

from mlxtend.classifier import StackingCVClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgb
import random

In [2]:
# Global settings shared by the cells below.
RANDOM_SEED = 41  # seeds both global RNGs here and is reused as random_state by the estimators
cpus = 3          # worker count handed to the estimators as n_jobs

# Seed NumPy's and Python's global generators up front.
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# Short aliases for the long distance column names.
hdth, vdth, hdtr = (
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
)

In [3]:
def _numbered(prefix, numbers):
    """Return ``prefix + str(n)`` for every ``n`` in ``numbers``, keeping their order."""
    return [prefix + suffix for suffix in map(str, numbers)]


# Soil-type columns grouped by the stoniness class each variable is named after.
extremely_stony = _numbered('Soil_Type', [1, 24, 25, 27, 28, 29, 30, 31, 34, 36, 37, 38, 39])
rubbly = _numbered('Soil_Type', [3, 4, 5, 10, 11, 13])
very_stony = _numbered('Soil_Type', [2, 9, 18, 26])
stony = _numbered('Soil_Type', [6, 12])

# Soil-type columns that are dropped when the model inputs are built.
lack_info_cols = _numbered('Soil_Type', [7, 15, 14, 17, 33, 32])

# Wilderness-area columns.
wla = _numbered('Wilderness_Area', [1, 2, 3, 4])

# The three hillshade columns, in time-of-day order.
SHADES = ['Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm']

In [4]:
def features(X):
    """Add the engineered distance, hillshade and soil-group columns to ``X`` in place.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame holding the hydrology / roadway / fire-point distance columns,
        the three hillshade columns listed in ``SHADES`` and the soil-type
        columns listed in ``extremely_stony``, ``rubbly``, ``very_stony`` and
        ``stony``.

    Returns
    -------
    pandas.DataFrame
        The same ``X`` object that was passed in (it is mutated, not copied),
        returned so the call can also be used in an expression. Existing
        callers that ignore the return value keep working.

    Notes
    -----
    * ``Hillshade_3pm`` is overwritten: wherever it is not greater than 0 it is
      replaced with ``Hillshade_Noon - Hillshade_9am``. Every hillshade
      feature below is computed from the corrected column.
    * New columns are appended in a fixed order; later cells select columns by
      position, so the order must not be changed casually.
    """
    roads = X[hdtr]
    fire_points = X['Horizontal_Distance_To_Fire_Points']

    # Distance features: straight-line distance to hydrology, then pairwise
    # differences between the three horizontal distances.
    X['HHV'] = np.sqrt(X[vdth]**2 + X[hdth]**2)
    X['HR'] = X[hdth] - roads
    X['HF'] = X[hdth] - fire_points
    X['RP'] = roads - fire_points

    # Repair non-positive 3pm readings before deriving anything from them.
    X['Hillshade_3pm'] = np.where(X['Hillshade_3pm'] > 0,
                                  X['Hillshade_3pm'],
                                  X['Hillshade_Noon'] - X['Hillshade_9am'])

    # Hillshade features.
    X['shade_noon_diff'] = X['Hillshade_9am'] - X['Hillshade_Noon']
    X['shade_3pm_diff'] = X['Hillshade_Noon'] - X['Hillshade_3pm']
    X['shade_all_diff'] = X['Hillshade_3pm'] - X['Hillshade_9am']
    X['shade_sum'] = X[SHADES].sum(axis=1)
    X['shade_mean'] = X[SHADES].mean(axis=1)

    # Row-wise sum of the soil-type columns in each stoniness group.
    soil_groups = (
        ('extremely_stony', extremely_stony),
        ('rubbly', rubbly),
        ('very_stony', very_stony),
        ('stony', stony),
    )
    for group_name, group_cols in soil_groups:
        X[group_name] = X[group_cols].sum(axis=1)

    return X

In [5]:
# Load the training and test tables from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# features() adds its columns to each frame in place.
features(train)
features(test)

# Model inputs: everything except the identifier, the target and the
# soil-type columns listed in lack_info_cols. drop() returns a new frame,
# so `train` / `test` themselves stay intact for later cells.
X = train.drop(columns=['Cover_Type', 'Id'] + lack_info_cols)
X_test = test.drop(columns=['Id'] + lack_info_cols)

# Target vector, detached from `train`.
y = train['Cover_Type'].copy()

In [6]:
# ---- Base estimators --------------------------------------------------------
# Every estimator shares RANDOM_SEED; the parallel ones use `cpus` workers.
xgb_cl = xgb.XGBClassifier(learning_rate=0.775, n_estimators=268,
                           n_jobs=cpus, random_state=RANDOM_SEED)
gb_cl = GradientBoostingClassifier(learning_rate=0.1, max_depth=10,
                                   n_estimators=193,
                                   random_state=RANDOM_SEED)
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator`; switch the keyword if the installed version rejects this one.
ab_cl = AdaBoostClassifier(n_estimators=200,
                           base_estimator=DecisionTreeClassifier(
                               min_samples_leaf=2,
                               random_state=RANDOM_SEED),
                           random_state=RANDOM_SEED)
rf_cl = RandomForestClassifier(n_estimators=719,
                               n_jobs=cpus,
                               max_features=0.3,
                               max_depth=464,
                               min_samples_split=2,
                               min_samples_leaf=1,
                               bootstrap=False,
                               random_state=RANDOM_SEED)


def _all_columns_pipeline(estimator):
    """Wrap ``estimator`` in a pipeline whose ColumnSelector keeps every column of ``X``."""
    return make_pipeline(ColumnSelector(cols=range(0, X.shape[1])), estimator)


# ---- Pipelines --------------------------------------------------------------
pipe_gb = _all_columns_pipeline(gb_cl)
pipe_xgb = _all_columns_pipeline(xgb_cl)
pipe_ab = _all_columns_pipeline(ab_cl)
# pipe_rf is not part of the stack (rf_cl is used as the meta-classifier);
# the name is kept so any cell that refers to it still works.
pipe_rf = _all_columns_pipeline(rf_cl)

# ---- Stacked model ----------------------------------------------------------
# First-level models: gradient boosting, XGBoost and AdaBoost. The list used to
# contain pipe_xgb twice while pipe_gb was never used; two clones of the same
# seeded pipeline only produce duplicate meta-features.
# `use_probas` is not set, so mlxtend's default applies.
clf = StackingCVClassifier(classifiers=[pipe_gb, pipe_xgb, pipe_ab],
                           meta_classifier=rf_cl,
                           cv=5,
                           use_features_in_secondary=True,
                           n_jobs=cpus,
                           random_state=RANDOM_SEED)

In [7]:
# Encode the targets as 0..n_classes-1 before fitting: the stack contains an
# XGBClassifier, and recent XGBoost releases refuse class labels that are not
# consecutive integers starting at 0. `classes` maps each code back to the
# original label, so the submission still carries the original values.
classes, y_encoded = np.unique(y, return_inverse=True)
clf.fit(X, y_encoded)

# Predict codes for the test rows and translate them back to original labels.
pred_codes = np.asarray(clf.predict(X_test)).astype(int)
pred = classes[pred_codes]

# Build the submission positionally (no reliance on index alignment between
# the prediction frame and `test`) with the identifier as the first column.
sub_pd = pd.DataFrame({'Id': test['Id'].values, 'Cover_Type': pred},
                      columns=['Id', 'Cover_Type'])
sub_pd.to_csv('submission.csv', index=False)