In [1]:
# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb

%matplotlib inline

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train['Hillshade_mean'] = train['Hillshade_9am']+train['Hillshade_Noon']+train['Hillshade_3pm']
train['slope_hyd'] = (train['Horizontal_Distance_To_Hydrology']**2+train['Vertical_Distance_To_Hydrology']**2)**0.5
train.slope_hyd=train.slope_hyd.map(lambda x: 0 if np.isinf(x) else x) # remove infinite value if any
train['log_elevation']= np.log(train.Elevation)
train['Hillshade_9am_sq'] = train['Hillshade_9am']**2
train['Hillshade_noon_sq'] = train['Hillshade_Noon']**2
train['Hillshade_3pm_sq'] = train['Hillshade_3pm']**2

train['interaction_9amnoon'] = train['Hillshade_9am']*train['Hillshade_Noon']
train['interaction_noon3pm'] = train['Hillshade_3pm']*train['Hillshade_Noon']
train['interaction_9am3pm'] = train['Hillshade_3pm']*train['Hillshade_9am']

In [4]:
test['Hillshade_mean'] = test['Hillshade_9am']+test['Hillshade_Noon']+test['Hillshade_3pm']
test['slope_hyd'] = (test['Horizontal_Distance_To_Hydrology']**2+test['Vertical_Distance_To_Hydrology']**2)**0.5
test.slope_hyd=test.slope_hyd.map(lambda x: 0 if np.isinf(x) else x) # remove infinite value if any
test['log_elevation']= np.log(test.Elevation)
test['Hillshade_9am_sq'] = test['Hillshade_9am']**2
test['Hillshade_noon_sq'] = test['Hillshade_Noon']**2
test['Hillshade_3pm_sq'] = test['Hillshade_3pm']**2

test['interaction_9amnoon'] = test['Hillshade_9am']*test['Hillshade_Noon']
test['interaction_noon3pm'] = test['Hillshade_3pm']*test['Hillshade_Noon']
test['interaction_9am3pm'] = test['Hillshade_3pm']*test['Hillshade_9am']

In [5]:
X_train = train.drop(['Id','Cover_Type'],1)
Y_train = train['Cover_Type']
X_test = test.drop(['Id'],1)

In [6]:
scaler = StandardScaler()
X_train_tf = pd.DataFrame(scaler.fit_transform(X_train),index=X_train.index, columns=X_train.columns)
X_test_tf = pd.DataFrame(scaler.transform(X_test),index=X_test.index, columns=X_test.columns)

In [7]:
X_train_tf.head(2)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type40,Hillshade_mean,slope_hyd,log_elevation,Hillshade_9am_sq,Hillshade_noon_sq,Hillshade_3pm_sq,interaction_9amnoon,interaction_noon3pm,interaction_9am3pm
0,-0.367095,-0.95998,-1.597132,0.146639,-0.834074,-0.908681,0.271454,0.571653,0.281259,4.334805,...,-0.176939,0.666456,0.102333,-0.297134,0.221026,0.56273,0.128799,0.590554,0.35316,0.690441
1,-0.381461,-0.914559,-1.715424,-0.072337,-0.932054,-0.999246,0.238732,0.703225,0.346627,4.28571,...,-0.176939,0.763782,-0.110745,-0.312188,0.184436,0.709867,0.203436,0.644256,0.451832,0.760204


In [8]:
preds = pd.DataFrame()

In [44]:
m1 = AdaBoostClassifier(ExtraTreesClassifier(n_estimators=500),
                        n_estimators=250, 
                        learning_rate=0.01, 
                        algorithm='SAMME')  
m1.fit(X_train_tf, Y_train) 


m2 = RandomForestClassifier(n_estimators=150,
                            max_depth=50,
                            max_features=20,
                            criterion='entropy',
                            n_jobs=3)
m2.fit(X_train_tf, Y_train)


m3 = GradientBoostingClassifier(loss='deviance',
                                n_estimators=500,
                                learning_rate=0.01,
                                random_state=100)
m3.fit(X_train_tf, Y_train)


# m4=xgb.XGBClassifier(objective = "multi:softmax",
#                        eval_metric = "merror",
#                        max_depth = 12,
#                        eta = 0.0399,
#                        gamma = 1.2393,
#                        subsample = 0.7052,
#                        colsample_bytree = 0.6296,
#                        min_child_weight = 7,
#                        colsample_bylevel = 1,
#                        reg_lambda = 1, 
#                        reg_alpha = 0, 
#                        num_class = 7,
#                        booster = "gbtree",
#                        silent = 0)
# m4.fit(X_train_tf, Y_train)



GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=500,
              presort='auto', random_state=100, subsample=1.0, verbose=0,
              warm_start=False)

In [45]:
preds["Model1"] = m1.predict(X_test_tf)
preds["Model2"] = m2.predict(X_test_tf)
preds["Model3"] = m3.predict(X_test_tf)
# preds["Model4"] = m4.predict(X_test_tf)

In [46]:
preds.head(100)

Unnamed: 0,Model1,Model2,Model3
0,2,5,2
1,1,2,2
2,2,1,2
3,2,2,2
4,5,2,2
5,2,2,2
6,2,1,2
7,2,1,2
8,1,1,2
9,2,1,2


In [50]:
preds.iloc[1:87,]

Unnamed: 0,Model1,Model2,Model3
1,1,2,2
2,2,1,2
3,2,2,2
4,5,2,2
5,2,2,2
6,2,1,2
7,2,1,2
8,1,1,2
9,2,1,2
10,2,2,2


In [55]:
pred=preds.iloc[1:88,].mode(axis=1)
pred[0]

1     2
2     2
3     2
4     2
5     2
6     2
7     2
8     1
9     2
10    2
11    2
12    2
13    2
14    2
15    2
16    1
17    2
18    2
19    2
20    2
21    2
22    2
23    2
24    2
25    2
26    2
27    2
28    2
29    2
30    2
     ..
58    2
59    2
60    2
61    2
62    2
63    2
64    2
65    2
66    2
67    2
68    1
69    2
70    1
71    2
72    2
73    2
74    2
75    2
76    2
77    2
78    2
79    2
80    2
81    2
82    2
83    5
84    5
85    5
86    5
87    5
Name: 0, dtype: int64

In [26]:
sub = pd.DataFrame({"Id": test['Id'],"Cover_Type": pred[0].astype('int').values})
sub.to_csv("submission_voting.csv", index=False)

ValueError: Cannot convert NA to integer