#1] Imports

In [1]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 2] Pre-Processing

In [4]:
# Reading in relevant data
data = pd.read_csv("WildfireData.csv", na_values="NaN")
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,fire_name,fire_size,fire_size_class,stat_cause_descr,latitude,longitude,state,disc_clean_date,...,Wind_cont,Hum_pre_30,Hum_pre_15,Hum_pre_7,Hum_cont,Prec_pre_30,Prec_pre_15,Prec_pre_7,Prec_cont,remoteness
0,0,0,,10.0,C,Missing/Undefined,18.105072,-66.753044,PR,2/11/2007,...,3.250413,78.21659,76.79375,76.381579,78.72437,0.0,0.0,0.0,0.0,0.017923
1,1,1,,3.0,B,Arson,35.03833,-87.61,TN,12/11/2006,...,2.12232,70.84,65.858911,55.505882,81.682678,59.8,8.4,0.0,86.8,0.184355
2,2,2,,60.0,C,Arson,34.9478,-88.7225,MS,2/29/2004,...,3.36905,75.531629,75.868613,76.812834,65.0638,168.8,42.2,18.1,124.5,0.194544
3,3,3,WNA 1,1.0,B,Debris Burning,39.6414,-119.3083,NV,6/6/2005,...,0.0,44.778429,37.140811,35.353846,0.0,10.4,7.2,0.0,0.0,0.487447
4,4,4,,2.0,B,Miscellaneous,30.7006,-90.5914,LA,9/22/1999,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.214633


In [6]:
data = data[['fire_size', 'latitude', 'longitude', 'discovery_month', 'Vegetation', 'Temp_pre_7', 'Hum_pre_7', 'Prec_pre_7', 'Wind_pre_7']]

In [7]:
# Drop NA
data = data.dropna()

In [56]:
# One Hot Encodings
non_dummy_cols = ['fire_size', 'latitude','longitude','Vegetation', 'Temp_pre_7', 'Hum_pre_7', 'Prec_pre_7', 'Wind_pre_7', ] 
dummy_cols = list(set(data.columns) - set(non_dummy_cols))
data = pd.get_dummies(data, columns=dummy_cols)

# 3] Create Individual Models 

In [48]:
#Split train/test
from sklearn.model_selection import train_test_split
X = data.drop('fire_size', axis=1) 
y = data.fire_size  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [49]:
#Imports for various models
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from xgboost import XGBRegressor

seed = 43

In [50]:
#Initialize Models
elnt = ElasticNet(random_state = seed)
svm = SVR()
xgb = XGBRegressor(random_state = seed, n_jobs = -1)

elnt.fit(X_train, y_train)
svm.fit(X_train, y_train)
xgb.fit(X_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=-1, nthread=None, objective='reg:linear', random_state=43,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [None]:
#Hyperparameter grid search
from sklearn.model_selection import GridSearchCV
def grid_search_cv(model, params):
    global best_params, best_score
    grid_search = GridSearchCV(estimator = model, param_grid = params, cv = 10, verbose = 1,
                            scoring = 'neg_mean_squared_error', n_jobs = -1)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_ 
    best_score = np.sqrt(-1*(np.round(grid_search.best_score_, 5)))
    return best_params, best_score

In [None]:
#Optimize Elastic Net
elastic_params = {'alpha': [0.0005, 0.001, 0.002, 0.005], 
                 'l1_ratio': [0.60, 0.65, 0.70, 0.80],
                 'random_state':[seed]}
grid_search_cv(elnt, elastic_params)
elastic_best_params, elastic_best_score = best_params, best_score
print('Elastic Net best params:{} & best_score:{:0.5f}' .format(elastic_best_params, elastic_best_score))

Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   33.8s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:  1.7min finished


Elastic Net best params:{'alpha': 0.005, 'l1_ratio': 0.6, 'random_state': 43} & best_score:13879.59241


In [None]:
#Optimize Support Vector Regressor
svm_params = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 
    'C': [4, 5], 
    'gamma':[0.0001, 0.001]}

grid_search_cv(svm, svm_params)
svm_best_params, svm_best_score = best_params, best_score
print('SVM best params:{} & best_score:{:0.5f}' .format(svm_best_params, svm_best_score))

In [None]:
#Optimize XGB
xgb_opt = XGBRegressor(colsample_bytree = 0.4603, gamma = 0.0468, 
                             learning_rate = 0.04, max_depth = 3, 
                             min_child_weight = 1.7817, n_estimators = 2500,
                             reg_alpha = 0.4640, reg_lambda = 0.8571,
                             subsample = 0.5213, silent = 1,
                             nthread = -1, random_state = 7)

In [None]:
#Retrain with optimized hyperparameters
elastic_net_opt = ElasticNet(**elastic_best_params)
# svm_opt = SVR(**svm_best_params)
xgb_opt = xgb_opt

elastic_net_opt.fit(X_train, y_train)
# svm_opt.fit(X_train, y_train)
xgb_opt.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.4603, gamma=0.0468,
             importance_type='gain', learning_rate=0.04, max_delta_step=0,
             max_depth=3, min_child_weight=1.7817, missing=None,
             n_estimators=2500, n_jobs=1, nthread=-1, objective='reg:linear',
             random_state=7, reg_alpha=0.464, reg_lambda=0.8571,
             scale_pos_weight=1, seed=None, silent=1, subsample=0.5213,
             verbosity=1)

In [None]:
#Define learning curve function
from sklearn.model_selection import learning_curve
def plot_learning_curve(model):
    train_sizes, train_scores, test_scores = learning_curve(model, X_train, y_train, 
                                            train_sizes = np.linspace(0.01, 1.0, 20), cv = 10, scoring = 'neg_mean_squared_error', 
                                            n_jobs = -1, random_state = seed)
    train_mean = np.mean(train_scores, axis = 1)
    train_std = np.std(train_scores, axis = 1)

    test_mean = np.mean(test_scores, axis = 1)
    test_std = np.std(test_scores, axis = 1)  
    
    plt.plot(train_sizes, train_mean, 'o-', color = 'red',  label = 'Training score')
    plt.plot(train_sizes, test_mean, 'o-', color = 'green', label = 'Cross-validation score')
    
    plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha = 0.1, color = 'r') # Alpha controls band transparency.
    plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha = 0.1, color = 'g')
    
    font_size = 12
    plt.xlabel('Training Set Size', fontsize = font_size)
    plt.ylabel('Accuracy Score', fontsize = font_size)
    plt.xticks(fontsize = font_size)
    plt.yticks(fontsize = font_size)
    plt.legend(loc = 'best')
    plt.grid()


#4] Combine into Ensemble Model

In [None]:
#Dataframe containing all 3 ensemble models
base_model_prediction = pd.DataFrame({
       'Elastic Net':elnt.predict(X_test), 
       'SVM':svm.predict(X_test),
       'XGB':xgb.predict(X_test), 
  })

from IPython.display import Markdown
def bold(string):
    display(Markdown(string))
bold('**All the Base Model Predictions:**')
display(base_model_prediction.head())

**All the Base Model Predictions:**

Unnamed: 0,Elastic Net,SVM,XGB
0,5376.199466,10.471358,6053.119141
1,1548.15129,3.253075,145.265396
2,1125.510421,3.248031,232.562973
3,6157.072399,9.606693,-13.402963
4,4800.066478,13.911445,5452.743164


In [51]:
#Averaging Models
y_elastic = elnt.predict(X_test)
y_svm = svm.predict(X_test)
y_xgb = xgb.predict(X_test)

avg_ensemble = (y_svm + y_xgb + y_elastic)/3
final = pd.DataFrame({'Actual': y_test, 'Predicted': avg_ensemble})
print(final)

        Actual     Predicted
54607  61416.0   3256.808049
39066      2.5    155.583506
52438  29229.5  15241.124802
269      168.3   2641.811681
37095      2.5    833.535488
...        ...           ...
15793   1201.8    624.305088
39713      2.0   -571.971091
42309      2.0   1936.209311
1990       1.0   -722.056072
3059       2.0    846.890622

[16611 rows x 2 columns]


In [52]:
# from joblib import dump, load
# dump(elnt, 'elnt.joblib')
# dump(svm, 'svm.joblib')
# dump(xgb, 'xgb.joblib')
xgb.save_model('xgb.json')