In [1]:
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

`clean-weather.csv` contains the weather/wildfire data after having being thoroughly processed and cleaned in `data-cleaning-preparation.ipnyb`.

In [3]:
df = pd.read_csv('clean-weather.csv')

In [4]:
df.head()

Unnamed: 0,LATITUDE,LONGITUDE,ELEVATION,STATION_NAME,PROVINCE_CODE,ID,LOCAL_DATE,LOCAL_DAY,LOCAL_MONTH,LOCAL_YEAR,...,MEAN_TEMPERATURE,SNOW_ON_GROUND,TOTAL_SNOW,TOTAL_RAIN,TOTAL_PRECIPITATION,MIN_REL_HUMIDITY,MAX_REL_HUMIDITY,SPEED_MAX_GUST,hectares,CLASS
0,50.905,-126.9292,14.0,EGG ISLAND,BC,1062646.2000.3.10,2000-03-10 00:00:00,10.0,3.0,2000.0,...,6.7,0.0,0.0,3.5,3.5,,,,6.0,1
1,59.9617,-121.3608,498.0,SAMBAA K'E,NT,220CQHR.2000.6.27,2000-06-27 00:00:00,27.0,6.0,2000.0,...,17.5,0.0,0.0,0.0,0.0,,,,1000.0,1
2,59.9752,-121.0342,498.0,SAMBAA K'E,NT,220CQHR.2000.7.11,2000-07-11 00:00:00,11.0,7.0,2000.0,...,15.4,0.0,0.0,0.0,0.0,,,,12.0,1
3,59.1767,-122.019,378.3,FORT NELSON UA,BC,1192950.2001.6.13,2001-06-13 00:00:00,13.0,6.0,2001.0,...,17.7,0.0,0.0,0.0,0.0,,,,5.0,1
4,59.4008,-120.6438,777.2,PETITOT LO,AB,3075171.2000.6.26,2000-06-26 00:00:00,26.0,6.0,2000.0,...,16.0,0.0,0.0,1.2,1.2,,,,0.1,1


In [5]:
df.describe()

Unnamed: 0,LATITUDE,LONGITUDE,ELEVATION,LOCAL_DAY,LOCAL_MONTH,LOCAL_YEAR,MIN_TEMPERATURE,MAX_TEMPERATURE,MEAN_TEMPERATURE,SNOW_ON_GROUND,TOTAL_SNOW,TOTAL_RAIN,TOTAL_PRECIPITATION,MIN_REL_HUMIDITY,MAX_REL_HUMIDITY,SPEED_MAX_GUST,hectares,CLASS
count,59732.0,59732.0,59572.0,59732.0,59732.0,59732.0,59732.0,59732.0,59732.0,59732.0,59732.0,59732.0,59732.0,14295.0,14302.0,19311.0,59732.0,59732.0
mean,52.430932,-101.86118,498.738882,15.50437,5.835147,2009.038204,4.713177,18.190861,11.467476,2.705826,0.106384,1.12537,1.673659,41.89213,87.851559,26.892341,248.934998,0.526401
std,4.68011,20.590721,389.618273,8.80355,1.794696,6.677045,8.798398,9.858715,8.986341,12.224479,1.051774,4.046445,4.796884,17.45177,11.283643,22.494157,5267.998994,0.499307
min,40.0653,-162.48,1.2,1.0,1.0,1999.0,-46.9,-42.7,-44.8,0.0,0.0,0.0,0.0,8.0,22.0,0.0,0.0,0.0
25%,49.23,-117.542833,226.125,8.0,5.0,2003.0,-0.1,12.6,6.5,0.0,0.0,0.0,0.0,29.0,82.25,0.0,0.0,0.0
50%,51.9588,-110.581933,390.8,15.0,6.0,2007.0,6.1,20.0,13.4,0.0,0.0,0.0,0.0,39.0,91.0,33.0,0.009,1.0
75%,55.41195,-85.678991,683.3,23.0,7.0,2014.0,11.0,25.4,18.0,0.0,0.0,0.0,0.9,52.0,96.0,43.0,0.2,1.0
max,82.3104,116.188,2926.1,31.0,12.0,2024.0,26.5,47.9,34.4,322.0,38.6,168.0,168.0,100.0,100.0,149.0,577646.8,1.0


# Classification
We turn this into a classification problem by creating a class column, where 0 means no fire and 1 means fire

In [6]:
df['CLASS'] = (df.loc[:, 'hectares'] > 0).astype(int)

# Modeling
It is not evident which model would best be used for our dataset, so we perform simple tests to see if basic models might be viable using a single test-train split

In [7]:
df.dropna(subset=['ELEVATION'], inplace=True)
x = df[['LATITUDE', 'LONGITUDE', 'ELEVATION', 'MEAN_TEMPERATURE', 'MAX_TEMPERATURE', 'MIN_TEMPERATURE', 'SNOW_ON_GROUND', 'TOTAL_SNOW', 'TOTAL_PRECIPITATION', 'TOTAL_RAIN']] # Only select weather data
y = df['CLASS']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

## Logistic Regression model

In [8]:
from sklearn.linear_model import LogisticRegression

In [9]:
model = LogisticRegression()
model.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# Predicting on the train set
y_pred_train = model.predict(x_train)

# Evaluating the model on the train set
accuracy_train = accuracy_score(y_train, y_pred_train)
precision_train = precision_score(y_train, y_pred_train)
recall_train = recall_score(y_train, y_pred_train)
f1_train = f1_score(y_train, y_pred_train)

print(f"Train set Accuracy: {accuracy_train}")
print(f"Train set Precision: {precision_train}")
print(f"Train set Recall: {recall_train}")
print(f"Train set f1: {f1_train}")

# Predicting on the test set
y_pred_test = model.predict(x_test)
accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)
print(f"Test set Accuracy: {accuracy_test}")
print(f"Test set Precision: {precision_test}")
print(f"Test set Recall: {recall_test}")
print(f"Test set f1: {f1_test}")

Train set Accuracy: 0.6907274901903183
Train set Precision: 0.6704916411118805
Train set Recall: 0.8107655502392345
Train set f1: 0.733986680383345
Test set Accuracy: 0.6791439362148552
Test set Precision: 0.6574294008973344
Test set Recall: 0.8023836366564664
Test set f1: 0.7227097990860956


## Decision Tree Classifier

In [11]:
from sklearn.tree import DecisionTreeClassifier

In [12]:
model = DecisionTreeClassifier(random_state=11)
model.fit(x_train, y_train)

In [13]:
# Predicting on the train set
y_pred_train = model.predict(x_train)

# Evaluating the model on the train set
accuracy_train = accuracy_score(y_train, y_pred_train)
precision_train = precision_score(y_train, y_pred_train)
recall_train = recall_score(y_train, y_pred_train)
f1_train = f1_score(y_train, y_pred_train)

print(f"Train set Accuracy: {accuracy_train}")
print(f"Train set Precision: {precision_train}")
print(f"Train set Recall: {recall_train}")
print(f"Train set f1: {f1_train}")

# Predicting on the test set
y_pred_test = model.predict(x_test)
accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)
print(f"Test set Accuracy: {accuracy_test}")
print(f"Test set Precision: {precision_test}")
print(f"Test set Recall: {recall_test}")
print(f"Test set f1: {f1_test}")

Train set Accuracy: 0.9999580334473425
Train set Precision: 1.0
Train set Recall: 0.9999202551834131
Train set f1: 0.9999601260018341
Test set Accuracy: 0.8375996642887117
Test set Precision: 0.8489549314173742
Test set Recall: 0.8373329038492511
Test set f1: 0.8431038676720992


Decision Tree Regressor has overfit

## Random Forests 

In [14]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
model = RandomForestClassifier()
model.fit(x_train, y_train)

In [16]:
# Predicting on the train set
y_pred_train = model.predict(x_train)

# Evaluating the model on the train set
accuracy_train = accuracy_score(y_train, y_pred_train)
precision_train = precision_score(y_train, y_pred_train)
recall_train = recall_score(y_train, y_pred_train)
f1_train = f1_score(y_train, y_pred_train)

print(f"Train set Accuracy: {accuracy_train}")
print(f"Train set Precision: {precision_train}")
print(f"Train set Recall: {recall_train}")
print(f"Train set f1: {f1_train}")

# Predicting on the test set
y_pred_test = model.predict(x_test)
accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)
print(f"Test set Accuracy: {accuracy_test}")
print(f"Test set Precision: {precision_test}")
print(f"Test set Recall: {recall_test}")
print(f"Test set f1: {f1_test}")

Train set Accuracy: 0.9999580334473425
Train set Precision: 0.9999601275917065
Train set Recall: 0.9999601275917065
Train set f1: 0.9999601275917065
Test set Accuracy: 0.838774653797734
Test set Precision: 0.8440308087291399
Test set Recall: 0.8471573522306329
Test set f1: 0.8455911904187766


## Gradient Boosting Model

In [17]:
from sklearn.ensemble import GradientBoostingClassifier

In [18]:
model = GradientBoostingClassifier()
model.fit(x_train, y_train)

In [19]:
# Predicting on the train set
y_pred_train = model.predict(x_train)

# Evaluating the model on the train set
accuracy_train = accuracy_score(y_train, y_pred_train)
precision_train = precision_score(y_train, y_pred_train)
recall_train = recall_score(y_train, y_pred_train)
f1_train = f1_score(y_train, y_pred_train)

print(f"Train set Accuracy: {accuracy_train}")
print(f"Train set Precision: {precision_train}")
print(f"Train set Recall: {recall_train}")
print(f"Train set f1: {f1_train}")

# Predicting on the test set
y_pred_test = model.predict(x_test)
accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)
print(f"Test set Accuracy: {accuracy_test}")
print(f"Test set Precision: {precision_test}")
print(f"Test set Recall: {recall_test}")
print(f"Test set f1: {f1_test}")

Train set Accuracy: 0.7783326688629163
Train set Precision: 0.7691138301816833
Train set Recall: 0.82707336523126
Train set f1: 0.7970413064361191
Test set Accuracy: 0.7669324381032312
Test set Precision: 0.75696316262354
Test set Recall: 0.8141407634079562
Test set f1: 0.7845115232404749


## XGB Classifier

In [8]:
from xgboost import XGBClassifier

In [9]:
model = XGBClassifier(random_state=0)
model.fit(x_train, y_train)

In [10]:
# Predicting on the train set
y_pred_train = model.predict(x_train)

# Evaluating the model on the train set
accuracy_train = accuracy_score(y_train, y_pred_train)
precision_train = precision_score(y_train, y_pred_train)
recall_train = recall_score(y_train, y_pred_train)
f1_train = f1_score(y_train, y_pred_train)

print(f"Train set Accuracy: {accuracy_train}")
print(f"Train set Precision: {precision_train}")
print(f"Train set Recall: {recall_train}")
print(f"Train set f1: {f1_train}")

# Predicting on the test set
y_pred_test = model.predict(x_test)

accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)
print(f"Test set Accuracy: {accuracy_test}")
print(f"Test set Precision: {precision_test}")
print(f"Test set Recall: {recall_test}")
print(f"Test set f1: {f1_test}")

Train set Accuracy: 0.9169062257380868
Train set Precision: 0.9259094942324756
Train set Recall: 0.9153508771929825
Train set f1: 0.9205999117776797
Test set Accuracy: 0.887452790600084
Test set Precision: 0.8975824893825547
Test set Recall: 0.8850056369785795
Test set f1: 0.8912496958884113


## K-Nearest Neighbors

In [23]:
from sklearn.neighbors import KNeighborsClassifier

In [24]:
model = KNeighborsClassifier()
model.fit(x_train, y_train)

In [25]:
# Predicting on the train set
y_pred_train = model.predict(x_train)

# Evaluating the model on the train set
accuracy_train = accuracy_score(y_train, y_pred_train)
precision_train = precision_score(y_train, y_pred_train)
recall_train = recall_score(y_train, y_pred_train)
f1_train = f1_score(y_train, y_pred_train)

print(f"Train set Accuracy: {accuracy_train}")
print(f"Train set Precision: {precision_train}")
print(f"Train set Recall: {recall_train}")
print(f"Train set f1: {f1_train}")

# Predicting on the test set
y_pred_test = model.predict(x_test)

accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)
print(f"Test set Accuracy: {accuracy_test}")
print(f"Test set Precision: {precision_test}")
print(f"Test set Recall: {recall_test}")
print(f"Test set f1: {f1_test}")

Train set Accuracy: 0.8087164529869694
Train set Precision: 0.8012985052091197
Train set Recall: 0.8464114832535885
Train set f1: 0.8232374156519041
Test set Accuracy: 0.7184221569450273
Test set Precision: 0.7151040096472716
Test set Recall: 0.7640521823159929
Test set f1: 0.7387682005761894


## Support Vector Machines

In [26]:
from sklearn.svm import SVC

In [27]:
model = SVC()
model.fit(x_train, y_train)

In [28]:
# Predicting on the train set
y_pred_train = model.predict(x_train)

# Evaluating the model on the train set
accuracy_train = accuracy_score(y_train, y_pred_train)
precision_train = precision_score(y_train, y_pred_train)
recall_train = recall_score(y_train, y_pred_train)
f1_train = f1_score(y_train, y_pred_train)

print(f"Train set Accuracy: {accuracy_train}")
print(f"Train set Precision: {precision_train}")
print(f"Train set Recall: {recall_train}")
print(f"Train set f1: {f1_train}")

# Predicting on the test set
y_pred_test = model.predict(x_test)

accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)
print(f"Test set Accuracy: {accuracy_test}")
print(f"Test set Precision: {precision_test}")
print(f"Test set Recall: {recall_test}")
print(f"Test set f1: {f1_test}")

Train set Accuracy: 0.6901399584531128
Train set Precision: 0.6611859585508424
Train set Recall: 0.8433811802232855
Train set f1: 0.7412521245466174
Test set Accuracy: 0.6783046579941251
Test set Precision: 0.64746772591857
Test set Recall: 0.8400708648735706
Test set f1: 0.7313003855590606


# Features
Removing duplicate `TOTAL_RAIN` and `SNOW_ON_GROUND`

In [46]:
from xgboost import XGBClassifier

In [47]:
input_col = ['LATITUDE', 'LONGITUDE', 'ELEVATION', 'MEAN_TEMPERATURE', 'MAX_TEMPERATURE', 'MIN_TEMPERATURE', 'TOTAL_RAIN', 'LOCAL_MONTH']
target_col = 'CLASS'

In [48]:
df.dropna(subset=['ELEVATION'], inplace=True)
x = df[input_col] # Only select weather data
y = df[target_col]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

In [49]:
model = XGBClassifier(random_state=0)
model.fit(x_train, y_train)

# Predicting on the train set
y_pred_train = model.predict(x_train)

# Evaluating the model on the train set
accuracy_train = accuracy_score(y_train, y_pred_train)
precision_train = precision_score(y_train, y_pred_train)
recall_train = recall_score(y_train, y_pred_train)
f1_train = f1_score(y_train, y_pred_train)
print(f"Train set Accuracy: {accuracy_train}")
print(f"Train set Precision: {precision_train}")
print(f"Train set Recall: {recall_train}")
print(f"Train set f1: {f1_train}")

# Predicting on the test set
y_pred_test = model.predict(x_test)

accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)
print(f"Test set Accuracy: {accuracy_test}")
print(f"Test set Precision: {precision_test}")
print(f"Test set Recall: {recall_test}")
print(f"Test set f1: {f1_test}")

Train set Accuracy: 0.9207681461089102
Train set Precision: 0.9256261714090986
Train set Recall: 0.9237374596157116
Train set f1: 0.9246808510638298
Test set Accuracy: 0.8926341234136843
Test set Precision: 0.8953577840471916
Test set Recall: 0.8991629104958145
Test set f1: 0.8972563130501832


## Feature Scaling

In [50]:
from sklearn.preprocessing import MinMaxScaler

In [51]:
scaler = MinMaxScaler().fit(df[input_col])

In [52]:
df[input_col] = scaler.transform(df[input_col])

In [53]:
df

Unnamed: 0,LATITUDE,LONGITUDE,ELEVATION,STATION_NAME,PROVINCE_CODE,ID,LOCAL_DATE,LOCAL_DAY,LOCAL_MONTH,LOCAL_YEAR,...,MEAN_TEMPERATURE,SNOW_ON_GROUND,TOTAL_SNOW,TOTAL_RAIN,TOTAL_PRECIPITATION,MIN_REL_HUMIDITY,MAX_REL_HUMIDITY,SPEED_MAX_GUST,hectares,CLASS
0,0.256591,0.127574,0.004376,EGG ISLAND,BC,1062646.2000.3.10,2000-03-10 00:00:00,10.0,0.181818,2000.0,...,0.650253,0.0,0.0,0.020833,3.5,,,,6.0,1
1,0.470975,0.147556,0.169852,SAMBAA K'E,NT,220CQHR.2000.6.27,2000-06-27 00:00:00,27.0,0.454545,2000.0,...,0.786616,0.0,0.0,0.000000,0.0,,,,1000.0,1
2,0.471295,0.148728,0.169852,SAMBAA K'E,NT,220CQHR.2000.7.11,2000-07-11 00:00:00,11.0,0.545455,2000.0,...,0.760101,0.0,0.0,0.000000,0.0,,,,12.0,1
3,0.452393,0.145194,0.128927,FORT NELSON UA,BC,1192950.2001.6.13,2001-06-13 00:00:00,13.0,0.454545,2001.0,...,0.789141,0.0,0.0,0.000000,0.0,,,,5.0,1
4,0.457698,0.150129,0.265308,PETITOT LO,AB,3075171.2000.6.26,2000-06-26 00:00:00,26.0,0.454545,2000.0,...,0.767677,0.0,0.0,0.007143,1.2,,,,0.1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59727,0.149374,0.294276,0.118568,YELLOW GRASS NORTH,SK,4019041.2017.1.25,2017-01-25 00:00:00,25.0,0.000000,2017.0,...,0.424242,10.0,0.0,0.000000,0.0,,,32.0,0.0,0
59728,0.149374,0.294276,0.118568,YOHO PARK,BC,11790J1.2000.8.2,2000-08-02 00:00:00,2.0,0.636364,2000.0,...,0.680993,0.0,0.0,0.000000,0.2,,,,0.0,0
59729,0.149374,0.294276,0.118568,YORKTON A,SK,4019080.2000.2.19,2000-02-19 00:00:00,19.0,0.090909,2000.0,...,0.444444,6.0,0.0,0.000000,0.0,,,0.0,0.0,0
59730,0.175784,0.284479,0.152792,YORKTON RCMP SNOW,SK,4019095.2010.1.31,2010-01-31 00:00:00,31.0,0.000000,2010.0,...,0.611780,25.0,0.0,0.000000,0.0,,,,0.0,0


In [54]:
df.dropna(subset=['ELEVATION'], inplace=True)
x = df[input_col] # Only select weather data
y = df[target_col]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [36]:
model = XGBClassifier(random_state=0)
model.fit(x_train, y_train)

# Predicting on the train set
y_pred_train = model.predict(x_train)

# Evaluating the model on the train set
accuracy_train = accuracy_score(y_train, y_pred_train)
precision_train = precision_score(y_train, y_pred_train)
recall_train = recall_score(y_train, y_pred_train)
f1_train = f1_score(y_train, y_pred_train)
print(f"Train set Accuracy: {accuracy_train}")
print(f"Train set Precision: {precision_train}")
print(f"Train set Recall: {recall_train}")
print(f"Train set f1: {f1_train}")

# Predicting on the test set
y_pred_test = model.predict(x_test)

accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)
print(f"Test set Accuracy: {accuracy_test}")
print(f"Test set Precision: {precision_test}")
print(f"Test set Recall: {recall_test}")
print(f"Test set f1: {f1_test}")

Train set Accuracy: 0.9229074427681138
Train set Precision: 0.9259052924791087
Train set Recall: 0.9277511961722488
Train set f1: 0.9268273252340171
Test set Accuracy: 0.8978598405371381
Test set Precision: 0.9002565747273894
Test set Recall: 0.9041713641488163
Test set f1: 0.902209722780233


## Feature importance

In [37]:
importance_df = pd.DataFrame({
    'feature': x.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

In [38]:
importance_df

Unnamed: 0,feature,importance
4,MAX_TEMPERATURE,0.339863
7,LOCAL_MONTH,0.231742
0,LATITUDE,0.126645
1,LONGITUDE,0.115174
2,ELEVATION,0.061189
3,MEAN_TEMPERATURE,0.05757
6,TOTAL_RAIN,0.038759
5,MIN_TEMPERATURE,0.029059


## Hyperparameter tuning

In [26]:
# search_space = {
#     'n_estimators': [300, 400, 500, 600],
#     'max_depth': [6, 7, 8],
#     'gamma': [0.01, 0.1, 0.5],
#     'learning_rate': [0.05, 0.1, 0.15],
# }
search_space = {
    'n_estimators': [450, 500, 550],
    'max_depth': [8, 9, 10],
    'gamma': [0.005, 0.01, 0.015],
    'learning_rate': [0.15, 0.2, 0.25],
}

In [20]:
from sklearn.model_selection import GridSearchCV

In [29]:
GS = GridSearchCV(estimator=model, 
                  param_grid=search_space, 
                  cv=10, # KFold Cross validation
                  scoring='accuracy',
                  verbose=4
)

In [30]:
GS.fit(x_train, y_train)

Fitting 10 folds for each of 81 candidates, totalling 810 fits
[CV 1/10] END gamma=0.005, learning_rate=0.15, max_depth=8, n_estimators=450;, score=0.912 total time=   0.8s
[CV 2/10] END gamma=0.005, learning_rate=0.15, max_depth=8, n_estimators=450;, score=0.908 total time=   0.8s
[CV 3/10] END gamma=0.005, learning_rate=0.15, max_depth=8, n_estimators=450;, score=0.911 total time=   0.8s
[CV 4/10] END gamma=0.005, learning_rate=0.15, max_depth=8, n_estimators=450;, score=0.915 total time=   0.8s
[CV 5/10] END gamma=0.005, learning_rate=0.15, max_depth=8, n_estimators=450;, score=0.918 total time=   0.8s
[CV 6/10] END gamma=0.005, learning_rate=0.15, max_depth=8, n_estimators=450;, score=0.908 total time=   0.8s
[CV 7/10] END gamma=0.005, learning_rate=0.15, max_depth=8, n_estimators=450;, score=0.915 total time=   0.8s
[CV 8/10] END gamma=0.005, learning_rate=0.15, max_depth=8, n_estimators=450;, score=0.907 total time=   1.2s
[CV 9/10] END gamma=0.005, learning_rate=0.15, max_depth=

In [31]:
print(f"Best parameters: {GS.best_params_}\n")

Best parameters: {'gamma': 0.01, 'learning_rate': 0.15, 'max_depth': 8, 'n_estimators': 500}



In [32]:
best_model = GS.best_estimator_
accuracy = best_model.score(x_test, y_test)
print(f"Accuracy on test set with best hyperparameters: {accuracy}")

Accuracy on test set with best hyperparameters: 0.9139739823751574


In [39]:
model = XGBClassifier(
    random_state=0,
    gamma=0.01,
    n_estimators=500,
    learning_rate=0.15,
    max_depth=8,
)
model.fit(x_train, y_train)

# Predicting on the train set
y_pred_train = model.predict(x_train)

# Evaluating the model on the train set
accuracy_train = accuracy_score(y_train, y_pred_train)
precision_train = precision_score(y_train, y_pred_train)
recall_train = recall_score(y_train, y_pred_train)
f1_train = f1_score(y_train, y_pred_train)
print(f"Train set Accuracy: {accuracy_train}")
print(f"Train set Precision: {precision_train}")
print(f"Train set Recall: {recall_train}")
print(f"Train set f1: {f1_train}")

# Predicting on the test set
y_pred_test = model.predict(x_test)

accuracy_test = accuracy_score(y_test, y_pred_test)
precision_test = precision_score(y_test, y_pred_test)
recall_test = recall_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test)
print(f"Test set Accuracy: {accuracy_test}")
print(f"Test set Precision: {precision_test}")
print(f"Test set Recall: {recall_test}")
print(f"Test set f1: {f1_test}")

Train set Accuracy: 0.9882913318085486
Train set Precision: 0.9890706023135222
Train set Recall: 0.988676236044657
Train set f1: 0.9888733798604188
Test set Accuracy: 0.9139739823751574
Test set Precision: 0.9183344092963202
Test set Recall: 0.916411660492833
Test set f1: 0.9173720274083031


# Testing on secondary dataset

In [64]:
df_no = pd.read_csv('2000-2021_nofires+weather.csv') # No wildfire
df_no['hectares'] = 0
df_yes = pd.read_csv('2000-2021_fires+weather.csv') # Wildfire dataset
df_gab = pd.concat([df_yes, df_no])
df_gab['CLASS'] = (df_gab.loc[:, 'hectares'] > 0).astype(int)

In [65]:
df_gab

Unnamed: 0,lat,lon,date,hectares,elevation,temp_c,max_temp_c,min_temp_c,wind_kph,wind_dir,precip_mm,humidity,pressure_hPa,soil_temp_c,soil_moisture,totalsnow_cm,CLASS
0,56.713600,-110.449000,2000-01-01,0.20,347.0,-29.992,-25.265,-33.665,9.470,242.195,0.0,72.753,971.810,-1.340,0.585,0.0,1
1,54.752200,-114.914700,2000-01-03,0.30,827.0,-18.440,-13.971,-22.872,9.720,179.298,0.0,58.110,921.216,-8.824,0.558,0.0,1
2,53.661500,-102.177100,2000-01-06,0.30,264.0,-21.514,-18.139,-22.889,10.163,265.019,0.0,77.443,985.953,-1.606,0.587,0.0,1
3,51.072200,-115.300200,2000-01-07,0.50,1333.0,-8.396,-3.916,-10.766,17.917,249.972,0.0,81.143,853.531,0.592,0.212,0.0,1
4,51.066800,-115.128800,2000-01-07,0.25,1283.0,-6.168,-1.646,-8.895,17.917,249.972,0.0,72.987,860.034,-3.393,0.290,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86774,49.881450,-114.011833,2021-10-20,0.00,1644.0,1.891,10.156,-2.694,12.387,171.290,0.0,79.652,831.251,-0.375,0.226,0.0,0
86775,54.039500,-109.323767,2021-10-21,0.00,551.0,0.967,4.910,-2.790,15.630,101.629,0.0,69.323,954.092,1.842,0.216,0.0,0
86776,49.513200,-124.850600,2021-10-22,0.00,35.0,11.089,13.108,9.558,25.772,147.717,10.2,91.173,1000.783,11.118,0.317,0.0,0
86777,49.513800,-124.849900,2021-10-24,0.00,30.0,9.678,10.490,8.790,35.946,132.132,15.7,93.895,986.835,9.770,0.322,0.0,0


We want to turn this dataset into the same format with the same features as the other dataset

In [66]:
input_col = ['LATITUDE', 'LONGITUDE', 'ELEVATION', 'MEAN_TEMPERATURE', 'MAX_TEMPERATURE', 'MIN_TEMPERATURE', 'TOTAL_RAIN', 'LOCAL_MONTH']
target_col = 'CLASS'

In [67]:
# Convert date column to datetime
df_gab['date'] = pd.to_datetime(df_gab['date'])
df_gab['LOCAL_MONTH'] = df_gab['date'].dt.month

In [68]:
# Rename input feature columns
df_gab = df_gab.rename(columns=
    {
        'lat': 'LATITUDE', 
        'lon': 'LONGITUDE', 
        'elevation': 'ELEVATION', 
        'temp_c': 'MEAN_TEMPERATURE', 
        'max_temp_c' : 'MAX_TEMPERATURE', 
        'min_temp_c' : 'MIN_TEMPERATURE',
        'precip_mm': 'TOTAL_RAIN'
    }
)

In [69]:
# Feature Scaling
df_gab[input_col] = scaler.transform(df_gab[input_col])

In [70]:
df_gab

Unnamed: 0,LATITUDE,LONGITUDE,date,hectares,ELEVATION,MEAN_TEMPERATURE,MAX_TEMPERATURE,MIN_TEMPERATURE,wind_kph,wind_dir,TOTAL_RAIN,humidity,pressure_hPa,soil_temp_c,soil_moisture,totalsnow_cm,CLASS,LOCAL_MONTH
0,56.713600,-110.449000,2000-01-01,0.20,347.0,-29.992,-25.265,-33.665,9.470,242.195,0.0,72.753,971.810,-1.340,0.585,0.0,1,1.0
1,54.752200,-114.914700,2000-01-03,0.30,827.0,-18.440,-13.971,-22.872,9.720,179.298,0.0,58.110,921.216,-8.824,0.558,0.0,1,1.0
2,53.661500,-102.177100,2000-01-06,0.30,264.0,-21.514,-18.139,-22.889,10.163,265.019,0.0,77.443,985.953,-1.606,0.587,0.0,1,1.0
3,51.072200,-115.300200,2000-01-07,0.50,1333.0,-8.396,-3.916,-10.766,17.917,249.972,0.0,81.143,853.531,0.592,0.212,0.0,1,1.0
4,51.066800,-115.128800,2000-01-07,0.25,1283.0,-6.168,-1.646,-8.895,17.917,249.972,0.0,72.987,860.034,-3.393,0.290,0.0,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86774,49.881450,-114.011833,2021-10-20,0.00,1644.0,1.891,10.156,-2.694,12.387,171.290,0.0,79.652,831.251,-0.375,0.226,0.0,0,10.0
86775,54.039500,-109.323767,2021-10-21,0.00,551.0,0.967,4.910,-2.790,15.630,101.629,0.0,69.323,954.092,1.842,0.216,0.0,0,10.0
86776,49.513200,-124.850600,2021-10-22,0.00,35.0,11.089,13.108,9.558,25.772,147.717,10.2,91.173,1000.783,11.118,0.317,0.0,0,10.0
86777,49.513800,-124.849900,2021-10-24,0.00,30.0,9.678,10.490,8.790,35.946,132.132,15.7,93.895,986.835,9.770,0.322,0.0,0,10.0


In [71]:
df_gab = df_gab.sample(frac=0.2)

In [72]:
# Features and targets
X = df_gab[input_col]
Y = df_gab[target_col]

In [73]:
# Predicting on the gab set
y_pred_test = model.predict(X)

accuracy_test = accuracy_score(Y, y_pred_test)
precision_test = precision_score(Y, y_pred_test)
recall_test = recall_score(Y, y_pred_test)
f1_test = f1_score(Y, y_pred_test)
print(f"Test set Accuracy: {accuracy_test}")
print(f"Test set Precision: {precision_test}")
print(f"Test set Recall: {recall_test}")
print(f"Test set f1: {f1_test}")

Test set Accuracy: 0.49452638856879466
Test set Precision: 0.49452638856879466
Test set Recall: 1.0
Test set f1: 0.6617834149350399
