In [3]:
import sqlite3
import numpy as np
import pandas as pd
import keras
import math
import datetime
import tensorflowjs as tfjs
import pickle
import datetime
import calendar
import sklearn.metrics as metrics
import joblib
from keras.models import Sequential, model_from_yaml, load_model
from keras.layers import Dense, LSTM, Dropout, Embedding, Masking
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.over_sampling import SMOTE

Using TensorFlow backend.


In [4]:
def encodeOutputVariable(y):
    labelencoder_Y_Origin = LabelEncoder()
    y = labelencoder_Y_Origin.fit_transform(y.astype(str))
    return y

def encodeCategoricalData(X, index):
    # encode categorical data
    labelencoder_X_Origin = LabelEncoder()
    X[:, index] = labelencoder_X_Origin.fit_transform(X[:, index].astype(str))
    return X    

def manualEncodeLongStrings(X, column):
    index = 0
    test = 0
    keys = {}
    for row in X:
        key = row[column].replace(", ", "").replace(" ", "")
        if (keys.get(key) == None):
            keys[key] = index
            index += 1
        X[test][column] = keys.get(key)
        test += 1
    return X
    
def defaultMinimumValues(values): 
    output = []
    for index, x in enumerate(values):
        x = float(x) if x != None and float(x) > 0.0 else 0
        output.append(x)
    return output

def encodeHotEncoder(X, categoryIndex):
    # meant to create dummy variables for each category data
    # you only use it for one column at a time, output will be the number of columns
    # needed to represent all discrete values of column
    onehotencoder = OneHotEncoder(categorical_features = [categoryIndex])
    X = defaultMinimumValues(X)
    X = onehotencoder.fit_transform(X.astype(str)).toarray()    
    X = X[:, 1:]
    return X

def determineTotalTime(startDay, startTime, endDay, endTime):
    minsPerDay = 24 * 60
    totalStart = (int(startDay) * minsPerDay) + int(startTime)
    totalEnd = (int(endDay) * minsPerDay) + int(endTime)
    return totalEnd - totalStart

def saveModelThroughPickle(model, filename):
    pickle.dump(model, open(filename, "wb"))

def outputPredictorResults(y_test, y_pred, title):
    # output results for model predictions
    print("For", title, "Classification")
    print(metrics.accuracy_score(y_test, y_pred) * 100)
    print(metrics.confusion_matrix(y_test, y_pred))
    print(pd.crosstab(y_test.ravel(), y_pred.ravel(), rownames=["True"], colnames=["Predicted"], margins=True))
    print(metrics.classification_report(y_test, y_pred))
    print(metrics.zero_one_loss(y_test, y_pred))
#     print(metrics.log_loss(y_test, y_pred))
#    print(metrics.roc_auc_score(y_test, y_pred))    

In [5]:
def createKerasANN():
    # initialize the ann
    classifier = Sequential()

    # adding the input layer and the first hidden layer
    classifier.add(Dense(100, kernel_initializer = "uniform", activation = "relu", input_dim = 57))
    
    classifier.add(Dense(90, kernel_initializer = "uniform", activation="relu"))
    
    classifier.add(Dense(75, kernel_initializer = "uniform", activation="relu"))
    
    # adding the second hidden layer
    classifier.add(Dense(50, kernel_initializer = "uniform", activation = "relu"))

    # adding the third hidden layer
    classifier.add(Dense(25, kernel_initializer = "uniform", activation = "relu"))

    classifier.add(Dense(20, kernel_initializer = "uniform", activation="relu"))
    
    # adding the fourth hidden layer
    classifier.add(Dense(10, kernel_initializer = "uniform", activation = "relu"))
    
    classifier.add(Dense(5, kernel_initializer = "uniform", activation="relu"))

    # adding the output layer 
    classifier.add(Dense(1, kernel_initializer = "uniform", activation = "sigmoid"))

    # compiling the ANN
    classifier.compile(optimizer = "adam", loss = "mean_squared_error", metrics = ["accuracy"])
    
    return classifier

def createArtificialNeuralNetwork(X_train, y_train, X_test, y_test, preprocess):
    # create ANN
    print("\nArtificial Neural Network Classifier Section")
    print("---------------------------------")
    
    # wrap classifier around Pipeline classifier
    pipeline_classifier = KerasClassifier(build_fn=createKerasANN, verbose=0, batch_size=100, epochs=200)
    
    # create pipeline model with preprocess steps and classifier 
    model = Pipeline([
        ("preprocess", preprocess),
        ("estimator", pipeline_classifier)
    ])
    
    # train the model
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
#     y_pred = (y_pred > 0.5)
    
    print("Training set Score: ", model.score(X_train, y_train))
    print("Testing set Score: ", model.score(X_test, y_test)) 
    
    # output results
    outputPredictorResults(y_test, y_pred, "Artificial Neural Network")
    
#     model.save("ann_cause_model.h5")
#     model.save("ann_cause_model_weights.h5")

    # save keras model first 
    model_step = model.steps.pop(-1)[1]
    model_step.model.save("ann_cause_model.h5")
    
    # save the pipeline
    joblib.dump(model, "ann_pipeline.pkl")
    
    
def creatingRandomForestPredictor(X_train, y_train, X_test, y_test, preprocess):
    print("\nRandom Forest Classifier Section")
    print("---------------------------------")
    
    # initialize the Random Forest Classifier
    random_forest_classifier = RandomForestClassifier(**{'n_jobs': -1,
        'n_estimators': 1500,
        'warm_start': True, 
        'max_features': 0.5,
        'max_depth': 15,
        'min_samples_leaf': 2,
        'max_features' : 'sqrt',
        'random_state' : 0,
        'verbose': 0
                                                      })
    # run through oversampler simplifier
#     oversampler = SMOTE(random_state=0)
#     smote_X_train, smote_y_train = oversampler.fit_sample(X_train, y_train)
    
    model = make_pipeline(preprocess, random_forest_classifier)
    
    # fitting Random Forest to the training set
    model.fit(X_train, y_train)
    
    # Predicting the Test set results
    rf_y_pred = model.predict(X_test)
    
    # use the threshold of error to determine whether a prediction is valid
#     rf_y_pred = (rf_y_pred > 0.5)
    
    # making the confusion matrix
    cm = metrics.confusion_matrix(y_test.ravel(), rf_y_pred.ravel())
    
    print("Training set Score: ", model.score(X_train, y_train))
    print("Testing set Score: ", model.score(X_test, y_test)) 
    
    outputPredictorResults(y_test, rf_y_pred, "Random Forest")
    saveModelThroughPickle(model, "random_forest_cause_model.h5")

    # save the pipeline
    joblib.dump(model, "random_forest_pipeline.pkl")    
    
def createNaiveBayesModel(X_train, y_train, X_test, y_test, preprocess):
    # initialize the Naive Bayes Classifier
    classifier = GaussianNB()
    
    # hook up the preprocess step with the classifier params and create the pipeline
    model = make_pipeline(preprocess, classifier)
    
    # fit the Naive Bayes Classifier to the training set
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    y_pred = (y_pred > 0.5)
    
    print("Training set Score: ", model.score(X_train, y_train))
    print("Testing set Score: ", model.score(X_test, y_test))
    
    outputPredictorResults(y_test, y_pred, "Naive Bayes")
    saveModelThroughPickle(model, "naive_bayes_cause_model.h5")
    
    # save the pipeline
    joblib.dump(model, "naive_bayes_pipeline.pkl") 

In [6]:
sqlite_file = "./wildfires.sqlite"

# connecting to the database file and saving the select
conn = sqlite3.connect(sqlite_file)
dataset = pd.read_sql_query("select * from Fires limit 50000;", conn)

In [7]:
dataset.iloc[:,:]

Unnamed: 0,OBJECTID,FOD_ID,FPA_ID,SOURCE_SYSTEM_TYPE,SOURCE_SYSTEM,NWCG_REPORTING_AGENCY,NWCG_REPORTING_UNIT_ID,NWCG_REPORTING_UNIT_NAME,SOURCE_REPORTING_UNIT,SOURCE_REPORTING_UNIT_NAME,...,FIRE_SIZE_CLASS,LATITUDE,LONGITUDE,OWNER_CODE,OWNER_DESCR,STATE,COUNTY,FIPS_CODE,FIPS_NAME,Shape
0,1,1,FS-1418826,FED,FS-FIRESTAT,FS,USCAPNF,Plumas National Forest,0511,Plumas National Forest,...,A,40.036944,-121.005833,5.0,USFS,CA,63,063,Plumas,b'\x00\x01\xad\x10\x00\x00\xe8d\xc2\x92_@^\xc0...
1,2,2,FS-1418827,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,0503,Eldorado National Forest,...,A,38.933056,-120.404444,5.0,USFS,CA,61,061,Placer,b'\x00\x01\xad\x10\x00\x00T\xb6\xeej\xe2\x19^\...
2,3,3,FS-1418835,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,0503,Eldorado National Forest,...,A,38.984167,-120.735556,13.0,STATE OR PRIVATE,CA,17,017,El Dorado,b'\x00\x01\xad\x10\x00\x00\xd0\xa5\xa0W\x13/^\...
3,4,4,FS-1418845,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,0503,Eldorado National Forest,...,A,38.559167,-119.913333,5.0,USFS,CA,3,003,Alpine,b'\x00\x01\xad\x10\x00\x00\x94\xac\xa3\rt\xfa]...
4,5,5,FS-1418847,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,0503,Eldorado National Forest,...,A,38.559167,-119.933056,5.0,USFS,CA,3,003,Alpine,b'\x00\x01\xad\x10\x00\x00@\xe3\xaa.\xb7\xfb]\...
5,6,6,FS-1418849,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,0503,Eldorado National Forest,...,A,38.635278,-120.103611,5.0,USFS,CA,5,005,Amador,b'\x00\x01\xad\x10\x00\x00\xf0<~\x90\xa1\x06^\...
6,7,7,FS-1418851,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,0503,Eldorado National Forest,...,A,38.688333,-120.153333,5.0,USFS,CA,17,017,El Dorado,b'\x00\x01\xad\x10\x00\x00$o\x996\xd0\t^\xc0h\...
7,8,8,FS-1418854,FED,FS-FIRESTAT,FS,USCASHF,Shasta-Trinity National Forest,0514,Shasta-Trinity National Forest,...,B,40.968056,-122.433889,13.0,STATE OR PRIVATE,CA,,,,b'\x00\x01\xad\x10\x00\x00t)\xe8\xd5\xc4\x9b^\...
8,9,9,FS-1418856,FED,FS-FIRESTAT,FS,USCASHF,Shasta-Trinity National Forest,0514,Shasta-Trinity National Forest,...,B,41.233611,-122.283333,13.0,STATE OR PRIVATE,CA,,,,"b'\x00\x01\xad\x10\x00\x00\xdc\x8d\x1e""""\x92^\..."
9,10,10,FS-1418859,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,0503,Eldorado National Forest,...,A,38.548333,-120.149167,5.0,USFS,CA,5,005,Amador,b'\x00\x01\xad\x10\x00\x00dS\\\xf2\x8b\t^\xc0\...


In [8]:
# dataset = dataset.dropna()
dataset.head()

Unnamed: 0,OBJECTID,FOD_ID,FPA_ID,SOURCE_SYSTEM_TYPE,SOURCE_SYSTEM,NWCG_REPORTING_AGENCY,NWCG_REPORTING_UNIT_ID,NWCG_REPORTING_UNIT_NAME,SOURCE_REPORTING_UNIT,SOURCE_REPORTING_UNIT_NAME,...,FIRE_SIZE_CLASS,LATITUDE,LONGITUDE,OWNER_CODE,OWNER_DESCR,STATE,COUNTY,FIPS_CODE,FIPS_NAME,Shape
0,1,1,FS-1418826,FED,FS-FIRESTAT,FS,USCAPNF,Plumas National Forest,511,Plumas National Forest,...,A,40.036944,-121.005833,5.0,USFS,CA,63,63,Plumas,b'\x00\x01\xad\x10\x00\x00\xe8d\xc2\x92_@^\xc0...
1,2,2,FS-1418827,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,Eldorado National Forest,...,A,38.933056,-120.404444,5.0,USFS,CA,61,61,Placer,b'\x00\x01\xad\x10\x00\x00T\xb6\xeej\xe2\x19^\...
2,3,3,FS-1418835,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,Eldorado National Forest,...,A,38.984167,-120.735556,13.0,STATE OR PRIVATE,CA,17,17,El Dorado,b'\x00\x01\xad\x10\x00\x00\xd0\xa5\xa0W\x13/^\...
3,4,4,FS-1418845,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,Eldorado National Forest,...,A,38.559167,-119.913333,5.0,USFS,CA,3,3,Alpine,b'\x00\x01\xad\x10\x00\x00\x94\xac\xa3\rt\xfa]...
4,5,5,FS-1418847,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,Eldorado National Forest,...,A,38.559167,-119.933056,5.0,USFS,CA,3,3,Alpine,b'\x00\x01\xad\x10\x00\x00@\xe3\xaa.\xb7\xfb]\...


In [9]:
dataset.FIPS_CODE = defaultMinimumValues(dataset.FIPS_CODE)
dataset.CONT_TIME = defaultMinimumValues(dataset.CONT_TIME)
dataset.DISCOVERY_TIME = defaultMinimumValues(dataset.DISCOVERY_TIME)

# split dataset into train and test lists
filtered_independent = dataset[[
    "STATE", 
    "FIPS_CODE", 
    "LATITUDE", 
    "LONGITUDE", 
    "FIRE_SIZE_CLASS", 
    "FIRE_SIZE", 
    "FIRE_YEAR", 
    "DISCOVERY_DATE",
    "DISCOVERY_TIME",
    "CONT_DATE",
    "CONT_TIME"
]]
filtered_dependent = dataset["STAT_CAUSE_CODE"]
X = filtered_independent.values
y = filtered_dependent.values

In [10]:
# using the ColumnTransformer only approach
numerical_features = filtered_independent.dtypes != object
categorical_features = ~numerical_features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

preprocess = make_column_transformer (
    (OneHotEncoder(sparse=False), categorical_features),
    (make_pipeline(SimpleImputer(), StandardScaler()), numerical_features)
)
numerical_features
categorical_features

STATE               True
FIPS_CODE          False
LATITUDE           False
LONGITUDE          False
FIRE_SIZE_CLASS     True
FIRE_SIZE          False
FIRE_YEAR          False
DISCOVERY_DATE     False
DISCOVERY_TIME     False
CONT_DATE          False
CONT_TIME          False
dtype: bool

In [11]:
createNaiveBayesModel(X_train, y_train, X_test, y_test, preprocess)

Training set Score:  0.03014285714285714
Testing set Score:  0.0292
For Naive Bayes Classification
54.75333333333333
[[8213    0    0    0    0    0    0    0    0]
 [ 514    0    0    0    0    0    0    0    0]
 [ 261    0    0    0    0    0    0    0    0]
 [2019    0    0    0    0    0    0    0    0]
 [ 703    0    0    0    0    0    0    0    0]
 [  73    0    0    0    0    0    0    0    0]
 [1186    0    0    0    0    0    0    0    0]
 [ 117    0    0    0    0    0    0    0    0]
 [1914    0    0    0    0    0    0    0    0]]
Predicted   True    All
True                   
1.0         8213   8213
2.0          514    514
3.0          261    261
4.0         2019   2019
5.0          703    703
6.0           73     73
7.0         1186   1186
8.0          117    117
9.0         1914   1914
All        15000  15000
              precision    recall  f1-score   support

         1.0       0.55      1.00      0.71      8213
         2.0       0.00      0.00      0.00       514

  'precision', 'predicted', average, warn_for)


In [12]:
createArtificialNeuralNetwork(X_train, y_train, X_test, y_test, preprocess)


Artificial Neural Network Classifier Section
---------------------------------
Training set Score:  0.19171428565468107
Testing set Score:  0.17439999957879385
For Artificial Neural Network Classification
17.44
[[2121 6092    0    0    0    0    0    0    0]
 [  19  495    0    0    0    0    0    0    0]
 [  10  251    0    0    0    0    0    0    0]
 [  93 1926    0    0    0    0    0    0    0]
 [  19  684    0    0    0    0    0    0    0]
 [   4   69    0    0    0    0    0    0    0]
 [  25 1161    0    0    0    0    0    0    0]
 [   4  113    0    0    0    0    0    0    0]
 [ 102 1812    0    0    0    0    0    0    0]]
Predicted   1.0    2.0    All
True                         
1.0        2121   6092   8213
2.0          19    495    514
3.0          10    251    261
4.0          93   1926   2019
5.0          19    684    703
6.0           4     69     73
7.0          25   1161   1186
8.0           4    113    117
9.0         102   1812   1914
All        2397  12603  1

  'precision', 'predicted', average, warn_for)


In [13]:
creatingRandomForestPredictor(X_train, y_train, X_test, y_test, preprocess)


Random Forest Classifier Section
---------------------------------
Training set Score:  0.7090571428571428
Testing set Score:  0.6494
For Random Forest Classification
64.94
[[8010    0    0   30    9    0  100    0   64]
 [ 228   35    0   17   18    0   37    0  179]
 [ 192    0    0   20    5    0    8    0   36]
 [1418    2    0  403   22    0   61    0  113]
 [ 265    4    0   38  131    0  170    0   95]
 [  47    0    0    2    5    0   15    0    4]
 [ 281    3    0   19   30    0  719    0  134]
 [  75    0    0    3    4    0    2    0   33]
 [1073   20    0   83   68    0  227    0  443]]
Predicted    1.0  2.0  4.0  5.0   7.0   9.0    All
True                                              
1.0         8010    0   30    9   100    64   8213
2.0          228   35   17   18    37   179    514
3.0          192    0   20    5     8    36    261
4.0         1418    2  403   22    61   113   2019
5.0          265    4   38  131   170    95    703
6.0           47    0    2    5    1

  'precision', 'predicted', average, warn_for)
