# Challenge 7 - Fight Fire with Data
## Random Forest Model to Predict Fire Spread

In this Jupyter Notebook, the user will run Python code (the workflow was originally developed in R). First, the user will check whether the wind speed and brightness are correlated with the fire's speed of spread, which is derived from the satellite data. The input data has already been prepared. Next, the user will run the code that creates a random forest model using the selected features (such as wind speed and brightness) as the inputs and the speed of spread as the target variable. The user will then train the model, record its error (the notebook below reports the Mean Absolute Error rather than the Root Mean Squared Error), and save the model in a deployable format known as Predictive Model Markup Language (PMML).

## Install and Load Packages

In [49]:
import pandas as pd
import numpy as np
# Using scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
import types
from botocore.client import Config
import ibm_boto3

## Get and View Data

In [50]:
# Load the prepared single-fire dataset from the notebook's working directory.
# low_memory=False asks pandas to parse the file in one pass rather than in
# chunks, which avoids chunk-wise mixed dtype inference.
df = pd.read_csv("Challenge_7_Merged_Data_single_fire.csv" , low_memory=False)

In [51]:
# Show (rows, columns) of the loaded frame as a quick sanity check.
df.shape

(13818, 35)

In [52]:
# Preview the first rows to inspect the weather and satellite columns.
df.head()

Unnamed: 0,SiteId,latitude,longitude,DateHrGmt,DateHrLwt,WindSpeedMph,WindDirectionDegrees,SurfaceWindGustsMph,ZeroToTenLiquidSoilMoisturePercent,TenToFortyLiquidSoilMoisturePercent,...,bright_t31,frp,daynight,type,datetime_start,lat_start,long_start,distance,duration,speed
0,2161142584,36.46616,-121.89671,7/23/2016 3:00,7/22/2016 20:00,6.4,318,36.8,14.0,24.4,...,309.3,77.2,N,3,7/22/2016 20:21,36.46616,-121.89671,0.0,0.0,0.0
1,2161142584,36.46486,-121.90179,7/23/2016 3:00,7/22/2016 20:00,6.4,318,36.8,14.0,24.4,...,306.9,77.2,N,3,7/22/2016 20:21,36.46616,-121.89671,0.296552,0.0,0.0
2,2161142584,36.46379,-121.89375,7/23/2016 3:00,7/22/2016 20:00,6.4,318,36.8,14.0,24.4,...,306.1,77.7,N,3,7/22/2016 20:21,36.46616,-121.89671,0.232352,0.0,0.0
3,2161142584,36.46245,-121.8989,7/23/2016 3:00,7/22/2016 20:00,6.4,318,36.8,14.0,24.4,...,342.5,77.7,N,3,7/22/2016 20:21,36.46616,-121.89671,0.284073,0.0,0.0
4,2161142584,36.46112,-121.90392,7/23/2016 3:00,7/22/2016 20:00,6.4,318,36.8,14.0,24.4,...,301.6,55.2,N,3,7/22/2016 20:21,36.46616,-121.89671,0.53142,0.0,0.0


In [53]:
# List every column name so candidate model features can be picked below.
df.columns

Index(['SiteId', 'latitude', 'longitude', 'DateHrGmt', 'DateHrLwt',
       'WindSpeedMph', 'WindDirectionDegrees', 'SurfaceWindGustsMph',
       'ZeroToTenLiquidSoilMoisturePercent',
       'TenToFortyLiquidSoilMoisturePercent',
       'FortyToOneHundredLiquidSoilMoisturePercent',
       'SurfaceTemperatureFahrenheit', 'SurfaceDewpointTemperatureFahrenheit',
       'SurfaceWetBulbTemperatureFahrenheit', 'RelativeHumidityPercent',
       'time_stamp', 'brightness', 'scan', 'track', 'acq_date', 'acq_time',
       'satellite', 'instrument', 'confidence', 'version', 'bright_t31', 'frp',
       'daynight', 'type', 'datetime_start', 'lat_start', 'long_start',
       'distance', 'duration', 'speed'],
      dtype='object')

In [54]:
# Shortlist of numeric columns to explore, grouped by theme.
# The target column 'speed' is kept last so it can be sliced off with [:-1].
features_short = [
    # wind
    'WindSpeedMph', 'SurfaceWindGustsMph',
    # soil moisture by depth band
    'ZeroToTenLiquidSoilMoisturePercent',
    'TenToFortyLiquidSoilMoisturePercent',
    'FortyToOneHundredLiquidSoilMoisturePercent',
    # temperature and humidity
    'SurfaceTemperatureFahrenheit',
    'SurfaceDewpointTemperatureFahrenheit',
    'SurfaceWetBulbTemperatureFahrenheit',
    'RelativeHumidityPercent',
    # satellite fire readings
    'brightness', 'bright_t31', 'frp',
    # target
    'speed',
]

In [55]:
# Preview only the shortlisted columns (predictors plus the 'speed' target).
df[features_short].head()

Unnamed: 0,WindSpeedMph,SurfaceWindGustsMph,ZeroToTenLiquidSoilMoisturePercent,TenToFortyLiquidSoilMoisturePercent,FortyToOneHundredLiquidSoilMoisturePercent,SurfaceTemperatureFahrenheit,SurfaceDewpointTemperatureFahrenheit,SurfaceWetBulbTemperatureFahrenheit,RelativeHumidityPercent,brightness,bright_t31,frp,speed
0,6.4,36.8,14.0,24.4,25.4,60.4,50.3,54.6,70,367.0,309.3,77.2,0.0
1,6.4,36.8,14.0,24.4,25.4,60.4,50.3,54.6,70,267.7,306.9,77.2,0.0
2,6.4,36.8,14.0,24.4,25.4,60.4,50.3,54.6,70,367.0,306.1,77.7,0.0
3,6.4,36.8,14.0,24.4,25.4,60.4,50.3,54.6,70,367.0,342.5,77.7,0.0
4,6.4,36.8,14.0,24.4,25.4,60.4,50.3,54.6,70,356.6,301.6,55.2,0.0


In [56]:
# Summary statistics for the shortlisted columns, to check ranges and spot outliers.
df[features_short].describe()

Unnamed: 0,WindSpeedMph,SurfaceWindGustsMph,ZeroToTenLiquidSoilMoisturePercent,TenToFortyLiquidSoilMoisturePercent,FortyToOneHundredLiquidSoilMoisturePercent,SurfaceTemperatureFahrenheit,SurfaceDewpointTemperatureFahrenheit,SurfaceWetBulbTemperatureFahrenheit,RelativeHumidityPercent,brightness,bright_t31,frp,speed
count,13818.0,13818.0,13818.0,13818.0,13818.0,13818.0,13818.0,13818.0,13818.0,13818.0,13818.0,13818.0,13818.0
mean,4.545781,23.536286,13.847829,22.251563,23.355384,62.288197,51.163511,55.729831,70.660226,333.765769,299.103843,47.553807,0.049042
std,2.032529,7.382075,0.971136,1.828861,1.642913,7.603074,4.100671,3.157922,20.616516,23.644904,11.751736,162.295969,0.3526
min,0.1,3.6,12.3,17.9,19.1,39.2,16.8,38.1,11.0,208.0,260.2,0.2,0.0
25%,3.1,18.5,12.6,21.1,22.5,56.7,49.6,54.1,55.0,314.1,291.7,3.7,0.017262
50%,4.2,23.9,14.2,22.5,23.5,61.3,51.5,56.1,71.0,333.6,296.7,11.7,0.02727
75%,5.8,27.5,14.7,23.8,24.7,68.0,53.7,58.0,91.0,349.5,304.7,36.6,0.042211
max,17.9,55.1,15.6,24.7,25.6,91.6,60.0,64.3,100.0,502.1,400.1,5452.3,33.725228


In [57]:
# Every shortlisted column except the trailing 'speed' target.
# NOTE: the next code cell reassigns input_features to a hand-picked subset,
# so this value is only in effect if that cell is skipped.
input_features = features_short[:-1]

## These are the features that we will put in the model

In [103]:
# Columns fed to the model. The wind-gust, soil-moisture, and
# temperature/dewpoint/wet-bulb columns from features_short are deliberately
# left out; add any of them back to this list to experiment.
input_features = [
    'WindSpeedMph',
    'RelativeHumidityPercent',
    'brightness',
    'bright_t31',
    'frp',
]

In [120]:
# Convert the selected predictor columns and the 'speed' target into the
# NumPy arrays used by the train/test split below.
X = np.array(df[input_features])
y = np.array(df['speed'])
# Print the target shape first, then the feature-matrix shape, one per line.
print(y.shape, X.shape, sep="\n")

(13818,)
(13818, 5)


## Make a train/test split for the model

In [121]:
# Hold out 25% of the rows for testing. random_state pins the shuffle so the
# same split is produced on every run.
train_features, test_features, train_labels, test_labels = train_test_split(
    X, y, test_size=0.25, random_state=137
)

# Report the shape of each partition. Explicit prints are used because a cell
# only auto-displays its final expression.
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (10363, 5)
Training Labels Shape: (10363,)
Testing Features Shape: (3455, 5)
Testing Labels Shape: (3455,)


## Train a random forest model 

In [122]:
import time

# perf_counter() is a monotonic, high-resolution clock, so the elapsed time
# cannot be skewed by system clock adjustments during training.
start_time = time.perf_counter()

# Instantiate the model with 1000 shallow decision trees.
rf = RandomForestRegressor(
    n_estimators=1000,   # number of trees in the forest
    max_depth=4,         # cap on the depth of each tree
    n_jobs=-1,           # use all available CPU cores
    random_state=137,    # fixed seed so the fitted forest is reproducible
    verbose=1,           # emit joblib progress messages while fitting
)
# Train the model on the training data.
rf.fit(train_features, train_labels)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    2.1s


--- 3.17508864402771 seconds ---


[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    2.7s finished


Time to train on 5,000 trees using my slow computer is:
--- 69.05580568313599 seconds ---

## Display accuracy of the model

In [123]:
# Score the held-out test rows with the trained forest.
predictions = rf.predict(test_features)
# Element-wise magnitude of (prediction - actual).
errors = np.abs(predictions - test_labels)
# Report the mean absolute error (MAE), rounded to 7 decimal places.
print('Mean Absolute Error:', round(errors.mean(), 7), 'mph.')

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.1s


Mean Absolute Error: 0.0383191 mph.


[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    0.1s finished


MAE: is it the same as yours?  
Mean Absolute Error: 0.0358604 mph.


Reference for creating PMML from Python, R, and Pega: https://collaborate.pega.com/discussion/creating-pmml-python-r-and-pega

In [124]:
import pandas as pd
import numpy as np
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn_pandas import DataFrameMapper
from sklearn.impute import SimpleImputer

# Read the same fire dataset that the cells above explore, so this cell does
# not depend on the churn sample file from the reference example.
fire_detection = pd.read_csv("Challenge_7_Merged_Data_single_fire.csv", low_memory=False)

# Only use the selected model inputs. input_features is already a list of
# column names, so it is passed directly as the column selector.
devset = fire_detection[input_features]

# Regression target: the numeric speed of spread, used as-is (no label mapping).
y = fire_detection["speed"]

# Create a preprocessor that replaces missing values with each column's median.
pp = DataFrameMapper(
    [(input_features,
      [SimpleImputer(missing_values=np.nan, strategy='median')])])

# Create a random forest regressor; random_state makes the fit reproducible.
fire_regressor = RandomForestRegressor(n_estimators=20, random_state=137)

# Create a PMML pipeline: preprocessing followed by the regressor.
pipeline = PMMLPipeline([
    ("preprocessing", pp),
    ("fire_regressor", fire_regressor)])

# Fit the model
pipeline.fit(devset, y)




In [None]:
# Export whichever PMMLPipeline is currently bound to `pipeline` to a PMML file
# named "modFit.pmml" in the working directory.
from sklearn2pmml import sklearn2pmml
sklearn2pmml(pipeline, "modFit.pmml", with_repr = True)

In [None]:
#####
# Reference example (churn classification) kept for comparison with the fire
# model above. NOTE: it rebinds `devset`, `y`, `pp` and `pipeline`, and reads a
# CSV from a sibling "cdh-datascientist-tools" checkout.

import pandas
import numpy
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn_pandas import DataFrameMapper
from sklearn.impute import SimpleImputer

churndata = pandas.read_csv("../cdh-datascientist-tools/dmsample/data/ChurnDMSample2.csv")

# Only use a subset of the data for modeling
devset = churndata[["Age", "AvgCallsOut"]]

# Map the multiple values of the Churn field to two labels: values starting
# with "N" index the tuple at True (1) -> "Loyal"; all others -> "Churned".
y = churndata["Churn"].map(lambda x: ("Churned", "Loyal")[x.startswith("N")])

# Create a preprocessor to replace missing values with median
pp = DataFrameMapper(
    [(["Age", "AvgCallsOut"], 
      [SimpleImputer(missing_values=numpy.nan, strategy='median')])])

# Create a random forest classifier
churn_classifier = RandomForestClassifier(n_estimators=20)

# Create a PMML pipeline including some preprocessing
pipeline = PMMLPipeline([
    ("preprocessing", pp),
    ('churn_classifier', churn_classifier)])

# Fit the model
pipeline.fit(devset, y)
######

# Export the fitted churn pipeline to PMML.
from sklearn2pmml import sklearn2pmml
sklearn2pmml(pipeline, "churn_sklearn.pmml", with_repr = True)


