In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [6]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import normalize
import math

In [23]:
# Input and output file locations. All three are relative paths, so they are
# resolved against the directory the notebook kernel is running in.
loc_train = "data/forest-cover/train.csv"  # training table; provides the Cover_Type target
loc_test = "data/forest-cover/test.csv"  # test table; provides the Id column used in the submission
loc_submission = "submissions/2models.submission.csv"  # CSV written by the final cell

In [8]:
# Read the training and test tables into DataFrames (train first, then test).
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (loc_train, loc_test))

In [9]:
# Continuous columns that are rescaled before modelling.
cols_to_normalize = [
    'Aspect',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]

# NOTE(review): sklearn's normalize() defaults to norm='l2', axis=1, i.e. it
# rescales each ROW to unit L2 norm across these eight columns rather than
# scaling each column independently. Confirm that row-wise scaling is intended.
# The transform is stateless, so train and test are handled independently.
df_train[cols_to_normalize], df_test[cols_to_normalize] = (
    normalize(df_train[cols_to_normalize]),
    normalize(df_test[cols_to_normalize]),
)

In [10]:
# Columns that the feature-engineering cell adds to both frames.
engineered_cols = [
    'binned_elevation',
    'Horizontal_Distance_To_Roadways_Log',
    'Soil_Type12_32',
    'Soil_Type23_22_32_33',
]

# Model inputs = every raw column except the target and the row id, followed by
# the engineered columns. The engineered names are excluded from the raw scan so
# that re-running this cell after they have been added to df_train does not
# list them twice (which would duplicate columns in X_train / X_test).
non_raw_cols = set(['Cover_Type', 'Id'] + engineered_cols)
feature_cols = [col for col in df_train.columns if col not in non_raw_cols]
feature_cols.extend(engineered_cols)

In [11]:
def add_engineered_features(df):
    """Add the derived feature columns to ``df`` in place.

    Columns added:
      * ``binned_elevation`` - ``Elevation`` floored into 50-unit-wide bins
        (integer bin number).
      * ``Horizontal_Distance_To_Roadways_Log`` - ``log(x + 1)`` of
        ``Horizontal_Distance_To_Roadways``.
      * ``Soil_Type12_32`` - sum of the ``Soil_Type32`` and ``Soil_Type12``
        columns.
      * ``Soil_Type23_22_32_33`` - sum of the ``Soil_Type23``, ``Soil_Type22``,
        ``Soil_Type32`` and ``Soil_Type33`` columns.

    A log transform of ``Horizontal_Distance_To_Hydrology`` was tried at some
    point and is intentionally not applied here.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw columns named above; modified in place.
    """
    # Vectorised equivalents of the per-element math.floor / math.log calls.
    df['binned_elevation'] = np.floor(df['Elevation'] / 50.0).astype(int)
    df['Horizontal_Distance_To_Roadways_Log'] = np.log1p(df['Horizontal_Distance_To_Roadways'])

    df['Soil_Type12_32'] = df['Soil_Type32'] + df['Soil_Type12']
    df['Soil_Type23_22_32_33'] = (
        df['Soil_Type23'] + df['Soil_Type22'] + df['Soil_Type32'] + df['Soil_Type33']
    )


# Apply the same transformations to both tables so their columns stay aligned.
add_engineered_features(df_train)
add_engineered_features(df_test)

# Training subsets for the two specialist models: cover types {1, 2} and {3, 4, 6}.
df_train_1_2 = df_train[df_train['Cover_Type'] <= 2]
df_train_3_4_6 = df_train[df_train['Cover_Type'].isin([3, 4, 6])]

In [12]:
# Feature matrix and target for the model trained on every cover type.
X_train, y = df_train[feature_cols], df_train['Cover_Type']

# Test features use the same column list so they line up with the training data.
X_test = df_test[feature_cols]

# Feature matrix / target pairs for the two specialist subsets.
X_train_1_2, y_1_2 = df_train_1_2[feature_cols], df_train_1_2['Cover_Type']
X_train_3_4_6, y_3_4_6 = df_train_3_4_6[feature_cols], df_train_3_4_6['Cover_Type']

In [13]:
# Keep the test ids for the submission file before the full frames are dropped.
test_ids = df_test['Id']

# Remove the names of the full frames (presumably to release memory; the
# X_* / y* objects selected earlier are what the models use from here on).
del df_train, df_test

In [14]:
# Settings shared by all three ensembles: use every core, fixed seed.
common_params = dict(n_jobs=-1, random_state=0)

# General model over all cover types, plus two specialists that are trained
# only on the {1, 2} and {3, 4, 6} subsets respectively.
clf = ensemble.ExtraTreesClassifier(n_estimators=100, **common_params)
clf_1_2 = ensemble.RandomForestClassifier(n_estimators=200, **common_params)
clf_3_4_6 = ensemble.RandomForestClassifier(n_estimators=200, **common_params)

clf.fit(X_train, y)
clf_1_2.fit(X_train_1_2, y_1_2)
# Left as the final expression so the cell still displays the fitted estimator.
clf_3_4_6.fit(X_train_3_4_6, y_3_4_6)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [17]:
# Cache each specialist's class probabilities for the test set as
# {row position: probability row}. classes_ is printed so the label behind
# each probability column is visible in the cell output.
vals_1_2 = dict(enumerate(clf_1_2.predict_proba(X_test)))
print(clf_1_2.classes_)

vals_3_4_6 = dict(enumerate(clf_3_4_6.predict_proba(X_test)))
print(clf_3_4_6.classes_)

# Hard label predictions of the general model, keyed by row position as well.
vals = dict(enumerate(clf.predict(X_test)))

[1 2]
[3 4 6]


In [20]:
def largest_index(inlist):
  """Return the position of the largest element of ``inlist``.

  The first element is the initial candidate, so the result is correct for any
  comparable values, including sequences whose elements are all <= -1 (the
  previous ``-1`` sentinel returned 0 for those regardless of content).

  Args:
    inlist: Indexable sequence (list, tuple, 1-D array) of comparable values.

  Returns:
    Index of the maximum value. Ties resolve to the earliest index. An empty
    sequence yields 0, as before.
  """
  best = 0
  for i in range(1, len(inlist)):
    # Strict comparison keeps the earliest index on ties.
    if inlist[i] > inlist[best]:
      best = i
  return best

In [25]:
# Blend the specialist probabilities into the general model's probabilities,
# pick the most likely cover type per test row, and write the submission CSV.

# Each specialist probability is divided by the divisor for its class before
# being added to the general model's probability for that class (a larger
# divisor means a smaller boost). Types 5 and 7 get no boost; a vals_5_7
# variant with divisors 2.4 / 3.4 was tried and left disabled.
boost_divisors = {1: 1.3, 2: 1.1, 3: 3.4, 4: 4.0, 6: 3.6}

blended = clf.predict_proba(X_test)
n_rows = blended.shape[0]

# Look columns up by class label rather than assuming that clf.classes_ is
# exactly 1..7 and that each specialist's columns sit at fixed positions.
column_of = {label: j for j, label in enumerate(clf.classes_)}

for sub_clf, sub_vals in ((clf_1_2, vals_1_2), (clf_3_4_6, vals_3_4_6)):
    # sub_vals maps row position -> probability row; stack back into a matrix.
    sub_proba = np.asarray([sub_vals[row] for row in range(n_rows)])
    for j, label in enumerate(sub_clf.classes_):
        blended[:, column_of[label]] += sub_proba[:, j] / boost_divisors[label]

# argmax returns the first maximum, so ties go to the lowest class column.
predicted = clf.classes_[blended.argmax(axis=1)]

# Create the output directory if needed instead of failing on open().
out_dir = os.path.dirname(loc_submission)
if out_dir and not os.path.isdir(out_dir):
    os.makedirs(out_dir)

# Ids are paired with predictions by position, independent of test_ids' index.
submission = pd.DataFrame(
    {"Id": np.asarray(test_ids), "Cover_Type": predicted},
    columns=["Id", "Cover_Type"],
)
submission.to_csv(loc_submission, index=False)