In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Remove the column cap so wide DataFrames are displayed with every column.
pd.set_option('display.max_columns', None)

# Read Data

In [5]:
# Load the training CSV and remove its 'Id' column. Features and target are
# then separated by position: the last remaining column becomes the label `y`
# (presumably Cover_Type -- confirm against the CSV header) and everything
# before it becomes the feature frame.
train_frame = pd.read_csv('../input/forest-cover-type-prediction/train.csv').drop(['Id'], axis=1)
y = train_frame.iloc[:, -1]
df_train = train_frame.iloc[:, :-1]

# Load the test CSV, keep its first column (the ids) for the submission file,
# then remove 'Id' from the test features.
df_test = pd.read_csv('../input/forest-cover-type-prediction/test.csv')
test_Id = df_test.iloc[:, 0]
df_test = df_test.drop(['Id'], axis=1)

# Take a look at the data

In [6]:
# Stack the train and test feature frames row-wise (train rows first) so the
# feature-engineering cells can operate on both at once. The original row
# labels are kept, i.e. the index is not reset by the concat.
ntrain, ntest = len(df_train), len(df_test)
frames_to_stack = [df_train, df_test]
all_data = pd.concat(frames_to_stack, axis=0, sort=False)

# Row counts of each part, then the shape of the stacked frame.
print(ntrain, ntest)
print(all_data.shape)
all_data.head()

In [7]:
# Preview the first rows of the training features (target and 'Id' already removed).
df_train.head()

In [8]:
# Summary statistics per feature; transposed so each feature is one row.
df_train.describe().T

## Drop useless columns

In [9]:
# Columns considered uninformative (presumably constant in the training set --
# TODO confirm with df_train[useless_columns].nunique()).
useless_columns = ['Soil_Type7', 'Soil_Type15']

# `all_data` was built from df_train/df_test before this cell and is what all
# later feature-engineering, scaling and modelling cells read, so the columns
# must be removed there as well; dropping them from df_train alone has no
# effect on the model.
# Reassignment (instead of inplace=True) avoids mutating an .iloc slice, and
# errors='ignore' lets the cell be re-run after the columns are already gone.
df_train = df_train.drop(columns=useless_columns, errors='ignore')
all_data = all_data.drop(columns=useless_columns, errors='ignore')

In [10]:
# Pairwise correlations between the first ten columns of all_data,
# drawn as a heatmap on an explicitly created 12x9 figure.
corrmat = all_data.iloc[:, :10].corr()
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, ax=ax)

In [11]:
# Euclidean norm of the horizontal and vertical hydrology distances.
horizontal_hydro = all_data['Horizontal_Distance_To_Hydrology']
vertical_hydro = all_data['Vertical_Distance_To_Hydrology']
squared_distance = horizontal_hydro ** 2 + vertical_hydro ** 2
all_data['Distance_To_Hydrology'] = np.sqrt(squared_distance)

In [12]:
# Arithmetic mean of the three horizontal-distance columns
# (hydrology, roadways, fire points), summed in that order.
horizontal_columns = (
    'Horizontal_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Horizontal_Distance_To_Fire_Points',
)
all_data['Horizontal_distance'] = sum(all_data[column] for column in horizontal_columns) / 3

In [13]:
# Arithmetic mean of the three hillshade columns (3pm, noon, 9am).
all_data['average_hillshade'] = (
    all_data['Hillshade_3pm']
    .add(all_data['Hillshade_Noon'])
    .add(all_data['Hillshade_9am'])
    .div(3)
)

In [14]:
# Interaction features: the author observed a high negative correlation
# between these column pairs, so each pair is multiplied and divided by 255.
all_data['Aspect_hillshade'] = (all_data['Aspect'] * all_data['Hillshade_9am']) / 255
all_data['slope_hillshade'] = (all_data['Slope'] * all_data['Hillshade_Noon']) / 255

# Bin Elevation into steps of 50: floor(Elevation / 50) as an integer column.
# Vectorised replacement for the former row-by-row math.floor loop; the
# resulting values are the same.
# NOTE: this overwrites 'Elevation' in place, so re-running this cell bins the
# already-binned values again; later cells see only the binned Elevation.
all_data['Elevation'] = np.floor(all_data['Elevation'] / 50.0).astype('int64')

In [15]:
# Elevation minus 0.2 times the horizontal distance to hydrology.
# NOTE(review): in run-all order an earlier cell has already replaced
# 'Elevation' with floor(Elevation / 50), so this uses the binned value,
# not the raw elevation -- confirm that this is intended.
scaled_hydro_distance = all_data['Horizontal_Distance_To_Hydrology'].mul(0.2)
all_data['EHDtH'] = all_data['Elevation'].sub(scaled_hydro_distance)

In [16]:
# Square root of (horizontal^2 + vertical^2) hydrology distance.
# NOTE(review): mathematically the same quantity as the
# 'Distance_To_Hydrology' column created in an earlier cell; the misspelled
# column name is kept as-is.
squared_hydro_distance = (
    all_data['Horizontal_Distance_To_Hydrology'].pow(2)
    + all_data['Vertical_Distance_To_Hydrology'].pow(2)
)
all_data['Distanse_to_Hydrolody'] = squared_hydro_distance ** 0.5

In [17]:
# Sum of the horizontal distances to hydrology and to fire points.
# (Unlike the other *_1 features, no absolute value is applied here.)
hydro_distance = all_data['Horizontal_Distance_To_Hydrology']
fire_distance = all_data['Horizontal_Distance_To_Fire_Points']
all_data['Hydro_Fire_1'] = hydro_distance.add(fire_distance)

In [18]:
# Absolute difference between the horizontal distances to hydrology and to fire points.
hydro_distance = all_data['Horizontal_Distance_To_Hydrology']
fire_distance = all_data['Horizontal_Distance_To_Fire_Points']
all_data['Hydro_Fire_2'] = hydro_distance.sub(fire_distance).abs()

In [19]:
# Absolute value of the sum of the horizontal distances to hydrology and to roadways.
hydro_distance = all_data['Horizontal_Distance_To_Hydrology']
road_distance = all_data['Horizontal_Distance_To_Roadways']
all_data['Hydro_Road_1'] = hydro_distance.add(road_distance).abs()

In [20]:
# Absolute difference between the horizontal distances to hydrology and to roadways.
hydro_distance = all_data['Horizontal_Distance_To_Hydrology']
road_distance = all_data['Horizontal_Distance_To_Roadways']
all_data['Hydro_Road_2'] = hydro_distance.sub(road_distance).abs()

In [21]:
# Absolute value of the sum of the horizontal distances to fire points and to roadways.
fire_distance = all_data['Horizontal_Distance_To_Fire_Points']
road_distance = all_data['Horizontal_Distance_To_Roadways']
all_data['Fire_Road_1'] = fire_distance.add(road_distance).abs()

In [22]:
# Absolute difference between the horizontal distances to fire points and to roadways.
fire_distance = all_data['Horizontal_Distance_To_Fire_Points']
road_distance = all_data['Horizontal_Distance_To_Roadways']
all_data['Fire_Road_2'] = fire_distance.sub(road_distance).abs()

In [23]:
# Preview the stacked frame after the engineered feature columns were added.
all_data.head()

In [24]:
# Engineered columns added to all_data by the feature cells above. Names must
# match the created columns exactly: the earlier list used
# 'distance_to_hydrology' (wrong case) and 'EVDtH' (never created), neither of
# which exists in all_data. 'Distanse_to_Hydrolody' is spelled as created.
engineered_labels = [
    'Distance_To_Hydrology', 'Horizontal_distance',
    'average_hillshade', 'Aspect_hillshade', 'slope_hillshade',
    'EHDtH', 'Distanse_to_Hydrolody',
    'Hydro_Fire_1', 'Hydro_Fire_2', 'Hydro_Road_1',
    'Hydro_Road_2', 'Fire_Road_1', 'Fire_Road_2',
]

# num_labels = the first ten columns of all_data followed by the engineered columns.
num_labels = list(all_data.columns[:10]) + engineered_labels

# Fail early if any listed name is not an actual column of all_data.
missing_labels = [label for label in num_labels if label not in all_data.columns]
assert not missing_labels, f"num_labels references unknown columns: {missing_labels}"

In [25]:
# Undo the row-wise stacking: the first len(df_train) rows of all_data are the
# training rows, the remainder are the test rows.
n_train_rows = len(df_train)
train_data = all_data.iloc[:n_train_rows]
test_data = all_data.iloc[n_train_rows:]

In [26]:
# Take the unscaled train/test rows directly from all_data rather than from
# train_data/test_data: this cell overwrites those two names with the scaled
# frames, so reading them back on a re-run would scale already-scaled values.
n_train_rows = df_train.shape[0]
unscaled_train = all_data.iloc[:n_train_rows, :]
unscaled_test = all_data.iloc[n_train_rows:, :]

# Fit the scaler on the training rows only, then apply the same fitted
# transform to both parts. transform() returns plain arrays, so the column
# names are restored; the resulting frames get a fresh default index.
rs = RobustScaler().fit(unscaled_train)
train_data = pd.DataFrame(rs.transform(unscaled_train), columns=all_data.columns)
test_data = pd.DataFrame(rs.transform(unscaled_test), columns=all_data.columns)

In [27]:
# Preview the scaled training features.
train_data.head()

In [28]:
# Preview the scaled test features.
test_data.head()

In [29]:
# Hold out 10% of the labelled rows for validation. A fixed random_state makes
# the shuffle, and therefore the reported hold-out score, reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    train_data, y, test_size=0.1, shuffle=True, random_state=42
)

In [30]:
# Extra-trees ensemble with 400 trees. random_state fixes the randomised tree
# construction so repeated runs give the same model; n_jobs=-1 builds the
# trees on all available cores and does not change the fitted result.
etc = ExtraTreesClassifier(n_estimators=400, random_state=42, n_jobs=-1)
etc.fit(X_train, y_train)

In [31]:
# Accuracy of the already-fitted model on the 10% hold-out split.
# accuracy_score takes (y_true, y_pred) in that order.
holdout_predictions = etc.predict(X_test)
print(accuracy_score(y_test, holdout_predictions))

# 5-fold cross-validated accuracy on the training split. cross_val_score
# refits clones of `etc` on each fold, so it must be given the training data;
# running it on the small hold-out set would train on only a fraction of that
# 10% and say little about the model fitted above.
cv_scores = cross_val_score(etc, X_train, y_train, cv=5)
print(cv_scores.mean())

In [32]:
# Refit the same estimator object on all labelled rows (this replaces the fit
# from the hold-out experiment), then predict labels for the test rows.
final_model = etc.fit(train_data, y)
test_predictions = final_model.predict(test_data)

In [33]:
# Build the submission table (test ids alongside the predicted labels) and
# write it to the working directory without the DataFrame index.
solutions = pd.DataFrame({'Id': test_Id}).assign(Cover_Type=test_predictions)
solutions.to_csv('finsubmission.csv', index=False)

In [34]:
# Display the predicted labels for the test rows.
test_predictions