In [12]:
import pandas as pd
import numpy as np

In [13]:
df = pd.read_csv('data-cleaned.csv')
label=df["isNaturalCaused"]
df.drop(labels=["isNaturalCaused","fire_year", "true_cause", "general_cause_desc"], axis=1, inplace=True)


#Preprocessing:

categorical = [var for var in df.columns if df[var].dtype=='O']
numerical = [var for var in df.columns if var not in categorical]

print(categorical)
print(numerical)

for var in categorical:
  df[var] = df[var].str.replace(" ", "")
  print(df[var].value_counts())

df.info()

['fire_type', 'fire_position_on_slope', 'weather_conditions_over_fire', 'wind_direction']
['assessment_hectares', 'current_size', 'fire_spread_rate', 'temperature', 'relative_humidity', 'wind_speed']
Surface    16719
Ground      5022
Crown       1010
               1
Name: fire_type, dtype: int64
Flat         18384
Bottom        1388
Upper1/3      1070
Middle1/3      910
Lower1/3       760
Name: fire_position_on_slope, dtype: int64
Clear          10514
Cloudy          8121
CBWet           1502
CBDry           1414
Rainshowers      952
Name: weather_conditions_over_fire, dtype: int64
W      5103
CLM    3249
SW     2714
NW     2683
SE     2470
E      1820
S      1592
N      1553
NE     1314
Name: wind_direction, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25321 entries, 0 to 25320
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   assessment_hectares           25321

In [14]:
# clean up categorical null variable
for var in categorical:
  df[var]= df[var].fillna("N/A")
  print(df[var].value_counts())


# clean up numerical null variable using median
for var in numerical:
  df[var] = df[var].fillna(df[var].median())

df.isnull().sum()



Surface    16719
Ground      5022
N/A         2569
Crown       1010
               1
Name: fire_type, dtype: int64
Flat         18384
N/A           2809
Bottom        1388
Upper1/3      1070
Middle1/3      910
Lower1/3       760
Name: fire_position_on_slope, dtype: int64
Clear          10514
Cloudy          8121
N/A             2818
CBWet           1502
CBDry           1414
Rainshowers      952
Name: weather_conditions_over_fire, dtype: int64
W      5103
CLM    3249
N/A    2823
SW     2714
NW     2683
SE     2470
E      1820
S      1592
N      1553
NE     1314
Name: wind_direction, dtype: int64


assessment_hectares             0
current_size                    0
fire_spread_rate                0
fire_type                       0
fire_position_on_slope          0
weather_conditions_over_fire    0
temperature                     0
relative_humidity               0
wind_direction                  0
wind_speed                      0
dtype: int64

In [15]:
# Apply Labeling Encoding
from sklearn.preprocessing import LabelEncoder


le = LabelEncoder()
for var in categorical:
  df[var] = le.fit_transform(df[var])

df.head()

Unnamed: 0,assessment_hectares,current_size,fire_spread_rate,fire_type,fire_position_on_slope,weather_conditions_over_fire,temperature,relative_humidity,wind_direction,wind_speed
0,0.01,0.1,0.0,4,1,2,18.0,10.0,8,2.0
1,0.2,0.2,0.0,4,2,2,12.0,22.0,8,10.0
2,0.5,0.5,0.0,4,0,2,12.0,22.0,8,10.0
3,0.01,0.01,0.0,4,1,2,12.0,22.0,8,10.0
4,0.1,0.1,0.1,4,1,2,6.0,37.0,8,2.0


In [16]:
# normalizing
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(df)
scaled_features = scaler.transform(df)

In [17]:
df_scaled = pd.DataFrame(scaled_features,columns=df.columns)
df_scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25321 entries, 0 to 25320
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   assessment_hectares           25321 non-null  float64
 1   current_size                  25321 non-null  float64
 2   fire_spread_rate              25321 non-null  float64
 3   fire_type                     25321 non-null  float64
 4   fire_position_on_slope        25321 non-null  float64
 5   weather_conditions_over_fire  25321 non-null  float64
 6   temperature                   25321 non-null  float64
 7   relative_humidity             25321 non-null  float64
 8   wind_direction                25321 non-null  float64
 9   wind_speed                    25321 non-null  float64
dtypes: float64(10)
memory usage: 1.9 MB


In [18]:
# handling imbalanced data
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss

x = df_scaled
y = label

smk = SMOTETomek()
x_res,y_res = smk.fit_resample(x,y)

x_res.shape, y_res.shape

((31288, 10), (31288,))

In [19]:
from sklearn.model_selection import train_test_split

# train test split
x_train, x_test, y_train, y_test = train_test_split(x_res,y_res,test_size=0.20,shuffle=True)

print("X_train shape:",x_train.shape)
print("X_test shape:",x_test.shape)
print("y_train shape:",y_train.shape)
print("y_test shape:",y_test.shape)

X_train shape: (25030, 10)
X_test shape: (6258, 10)
y_train shape: (25030,)
y_test shape: (6258,)


In [20]:
# model result
result_dict_train = {}
result_dict_test = {}

In [21]:
# model list
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

random_forrest = RandomForestClassifier(n_jobs = -1, criterion="gini", max_features="log2", min_impurity_decrease=0.0, min_samples_split=2, n_estimators= 100)

svc = SVC(C=10, degree=7, gamma= 1, kernel='poly')

gradient_boost = GradientBoostingClassifier(n_estimators=100, min_samples_split=2, max_features="log2")

logistic_regression = LogisticRegression()

decision_tree = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_impurity_decrease=0.0, max_features="log2")

In [None]:
# Voting Mechanism
from sklearn import metrics
from sklearn.ensemble import VotingClassifier
voting_classifier = VotingClassifier([("rf", random_forrest), ("svc", svc), ("gb", gradient_boost), ("lr", logistic_regression), ("dt", decision_tree)],
                                     voting="hard", 
                                     n_jobs=-1)
voting_classifier.fit(x_train, y_train)
y_pred = voting_classifier.predict(x_test)

metrics.confusion_matrix(y_test, y_pred)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
classification_rep = metrics.classification_report(y_test, y_pred, digits=6, target_names=["Human Caused", "Natural Caused"])
print(classification_rep)

Accuracy: 0.8902205177372963
Precision: 0.8856516290726817
                precision    recall  f1-score   support

  Human Caused   0.894977  0.882599  0.888745      3109
Natural Caused   0.885652  0.897745  0.891657      3149

      accuracy                       0.890221      6258
     macro avg   0.890314  0.890172  0.890201      6258
  weighted avg   0.890285  0.890221  0.890211      6258



In [23]:
# Stacking Classifier
from sklearn.ensemble import StackingClassifier
stacking_classifier = StackingClassifier([("rf", random_forrest), ("svc", svc), ("gb", gradient_boost), ("lr", logistic_regression), ("dt", decision_tree)],
                                        cv=5,
                                        n_jobs=-1,
                                     )
stacking_classifier.fit(x_train, y_train)
y_pred = stacking_classifier.predict(x_test)

metrics.confusion_matrix(y_test, y_pred)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
classification_rep = metrics.classification_report(y_test, y_pred, digits=6, target_names=["Human Caused", "Natural Caused"])
print(classification_rep)

Accuracy: 0.900287631831256
Precision: 0.9014308426073132
                precision    recall  f1-score   support

  Human Caused   0.899133  0.900289  0.899711      3109
Natural Caused   0.901431  0.900286  0.900858      3149

      accuracy                       0.900288      6258
     macro avg   0.900282  0.900288  0.900284      6258
  weighted avg   0.900289  0.900288  0.900288      6258

