In [351]:
import pandas as pd
import numpy as np

df = pd.read_csv('data-cleaned.csv')
label=df["isNaturalCaused"]
df.drop(labels=["isNaturalCaused","fire_year", "true_cause", "general_cause_desc"], axis=1, inplace=True)
#df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

#Preprocessing:

categorical = [var for var in df.columns if df[var].dtype=='O']
numerical = [var for var in df.columns if var not in categorical]

In [352]:
# clean up categorical null variable
for var in categorical:
  df[var]= df[var].fillna("N/A")
  print(df[var].value_counts())


# clean up numerical null variable using median
for var in numerical:
  df[var] = df[var].fillna(df[var].median())

df.isnull().sum()

fire_type
Surface       16718
Ground         5022
N/A            2569
Crown          1010
   Surface        1
                  1
Name: count, dtype: int64
fire_position_on_slope
Flat          18384
N/A            2809
Bottom         1388
Upper 1/3      1070
Middle 1/3      910
Lower 1/3       760
Name: count, dtype: int64
weather_conditions_over_fire
Clear          10514
Cloudy          8121
N/A             2818
CB Wet          1502
CB Dry          1414
Rainshowers      952
Name: count, dtype: int64
wind_direction
W      5103
CLM    3249
N/A    2823
SW     2714
NW     2682
SE     2470
E      1820
S      1590
N      1553
NE     1314
 S        2
 NW       1
Name: count, dtype: int64


assessment_hectares             0
current_size                    0
fire_spread_rate                0
fire_type                       0
fire_position_on_slope          0
weather_conditions_over_fire    0
temperature                     0
relative_humidity               0
wind_direction                  0
wind_speed                      0
dtype: int64

In [353]:
# Apply Labeling Encoding
from sklearn.preprocessing import LabelEncoder


le = LabelEncoder()
for var in categorical:
  df[var] = le.fit_transform(df[var])

df.head()

Unnamed: 0,assessment_hectares,current_size,fire_spread_rate,fire_type,fire_position_on_slope,weather_conditions_over_fire,temperature,relative_humidity,wind_direction,wind_speed
0,0.01,0.1,0.0,5,1,2,18.0,10.0,10,2.0
1,0.2,0.2,0.0,5,2,2,12.0,22.0,10,10.0
2,0.5,0.5,0.0,5,0,2,12.0,22.0,10,10.0
3,0.01,0.01,0.0,5,1,2,12.0,22.0,10,10.0
4,0.1,0.1,0.1,5,1,2,6.0,37.0,10,2.0


In [354]:
# handling imbalanced data
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss

x = df
y = label

smk = SMOTETomek()
x_res,y_res = smk.fit_resample(x,y)

x_res.shape, y_res.shape

((32020, 10), (32020,))

In [355]:
from sklearn.model_selection import train_test_split
import random

# train test split
x_train, x_test, y_train, y_test = train_test_split(df,label,test_size=0.20,shuffle=True)

print("X_train shape:",x_train.shape)
print("X_test shape:",x_test.shape)
print("y_train shape:",y_train.shape)
print("y_test shape:",y_test.shape)

X_train shape: (20256, 10)
X_test shape: (5065, 10)
y_train shape: (20256,)
y_test shape: (5065,)


In [356]:
import sklearn.tree
import time

start_time = time.time()
decision_tree = sklearn.tree.DecisionTreeClassifier(criterion = "log_loss")
decision_tree.fit(x_train, y_train)
end_time = time.time()
elapsed_time = end_time - start_time

In [None]:
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

grid = {
    'criterion' : ["gini","entropy","log_loss"]
}

tree  = sklearn.tree.DecisionTreeClassifier()
tree_cv = GridSearchCV(tree, grid, cv=5)
tree_cv.fit(x_train,y_train)
print("Best Parameters:",tree_cv.best_params_)
pred_y = tree_cv.predict(x_test)

print("Train Score:",tree_cv.best_score_)
print("Test Score:",tree_cv.score(x_test,y_test))

print("\nDecision Tree")
print("Accuracy:", metrics.accuracy_score(y_test, pred_y))
print("Precision:", metrics.precision_score(y_test, pred_y))
print("\n",metrics.classification_report(y_test, pred_y, digits=6))
print(f"Runtime: {elapsed_time}s")

Best Parameters: {'criterion': 'entropy'}
Train Score: 0.7901366657057366
Test Score: 0.7913129318854887
Decision Tree
Accuracy: 0.7913129318854887
Precision: 0.6889387984278496

               precision    recall  f1-score   support

           0   0.846833  0.833883  0.840308      3335
           1   0.688939  0.709249  0.698946      1730

    accuracy                       0.791313      5065
   macro avg   0.767886  0.771566  0.769627      5065
weighted avg   0.792903  0.791313  0.792025      5065

Runtime: 0.05919623374938965s
