In [2]:
import csv
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics as skl
from matplotlib.ticker import MultipleLocator
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_mldata
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [4]:
# Import data
df=pd.read_csv('train.csv', sep=',')
df.shape

(15120, 56)

In [5]:
# Specify predictor cols and target col
predictors = df.columns[0:-1]
target = df.columns[-1]

In [53]:
predictors

Index(['Id', 'Elevation', 'Aspect', 'Slope',
       'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
       'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
       'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points',
       'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3',
       'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3',
       'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8',
       'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12',
       'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16',
       'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20',
       'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24',
       'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28',
       'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32',
       'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36',
       'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_

In [6]:
# Create Train / Test split
from sklearn.model_selection import train_test_split

X_train, X_dev, y_train, y_dev = train_test_split(df[predictors], df[target], random_state=2)

### Random Forest

In [36]:
# Instantiate a Random Forest model
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True, random_state=121)

In [17]:
# start timer
start = time.time()

# Run cross validation on the scores generated from the model
scores = cross_val_score(rf, X_train, y_train, cv = 10)

# stop timer
end = time.time()

# Print the mean R^2 score
print("Mean R^2 = {:.3}".format(scores.mean()))
print("This process took {:.3} seconds to compute".format(end - start))

Mean R^2 = 0.766
This process took 65.1 seconds to compute


In [37]:
# fit the rf model to the train data
rf.fit(X_train, y_train)

# Print R^2 for train data and OOB samples
print("R^2 on training data = {:.3}".format(rf.score(X_train, y_train)))
print("R^2 on OOB samples only = {:.3}".format(rf.oob_score_))

R^2 on training data = 0.61
R^2 on OOB samples only = 0.568


In [38]:
# Calculate prediction accuracy
preds = rf.predict(X_dev)
accuracy = np.mean(preds == y_dev)

print("Prediction accuracy is {:.3}".format(accuracy))

Prediction accuracy is 0.0


In [48]:
# build a param grid to pass into gridsearchcv
tuned_params = [{"max_features" : ['auto', 'sqrt', 0.25, 0.5]}]

# Instantiate a GridSearchCV object
grid = GridSearchCV(rf, tuned_params, n_jobs=-1, cv = 5)

# fit grid object w/train data
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=50, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=True, random_state=121, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'max_features': ['auto', 'sqrt', 0.25, 0.5]}],
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [56]:
rf.feature_importances_.round(3)

array([0.089, 0.645, 0.008, 0.003, 0.029, 0.007, 0.031, 0.009, 0.008,
       0.007, 0.025, 0.   , 0.   , 0.015, 0.001, 0.   , 0.   , 0.001,
       0.002, 0.   , 0.   , 0.   , 0.   , 0.   , 0.011, 0.   , 0.   ,
       0.002, 0.   , 0.   , 0.   , 0.002, 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.015, 0.   , 0.   , 0.   , 0.   , 0.   , 0.036, 0.054,
       0.   ])

In [39]:
X_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,11340.0,7541.371693,4340.738478,2.0,3805.75,7558.5,11261.25,15120.0
Elevation,11340.0,2748.35873,417.293447,1879.0,2376.0,2752.0,3101.0,3849.0
Aspect,11340.0,156.082716,109.598235,0.0,65.0,125.0,258.0,359.0
Slope,11340.0,16.554586,8.451911,0.0,10.0,15.0,22.0,49.0
Horizontal_Distance_To_Hydrology,11340.0,227.997178,211.776952,0.0,67.0,180.0,324.0,1343.0
Vertical_Distance_To_Hydrology,11340.0,51.273016,61.33603,-123.0,5.0,32.0,80.0,547.0
Horizontal_Distance_To_Roadways,11340.0,1714.768783,1325.034484,0.0,765.0,1315.0,2268.0,6890.0
Hillshade_9am,11340.0,212.865432,30.652599,58.0,196.0,220.0,235.0,254.0
Hillshade_Noon,11340.0,218.901146,22.780645,99.0,207.0,222.0,235.0,254.0
Hillshade_3pm,11340.0,134.784656,46.029575,0.0,106.0,138.0,166.0,248.0


In [43]:
df_test = df

In [57]:
for i in range(0, df_test.shape[0]):
    if df_test.Wilderness_Area1:
        df_test[Wilderness_Area] = "Wilderness_Area1"
    elif df_test.Wilderness_Area2:
        df_test[Wilderness_Area] = "Wilderness_Area2"
    elif df_test.Wilderness_Area3:
        df_test[Wilderness_Area] = "Wilderness_Area3"
    elif df_test.Wilderness_Area4:
        df_test[Wilderness_Area] = "Wilderness_Area4"

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().