In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data-cleaned-removed-empty.csv')
label=df["isNaturalCaused"]
df.drop(labels=["isNaturalCaused","fire_year", "true_cause", "general_cause_desc"], axis=1, inplace=True)


#Preprocessing:

categorical = [var for var in df.columns if df[var].dtype=='O']
numerical = [var for var in df.columns if var not in categorical]

print(categorical)
print(numerical)

for var in categorical:
  df[var] = df[var].str.replace(" ", "")
  print(df[var].value_counts())

df.info()

['fire_type', 'fire_position_on_slope', 'weather_conditions_over_fire', 'wind_direction']
['assessment_hectares', 'current_size', 'fire_spread_rate', 'temperature', 'relative_humidity', 'wind_speed']
Surface    16526
Ground      4947
Crown       1006
Name: fire_type, dtype: int64
Flat         18359
Bottom        1386
Upper1/3      1067
Middle1/3      909
Lower1/3       758
Name: fire_position_on_slope, dtype: int64
Clear          10505
Cloudy          8109
CBWet           1501
CBDry           1414
Rainshowers      950
Name: weather_conditions_over_fire, dtype: int64
W      5099
CLM    3246
SW     2711
NW     2682
SE     2467
E      1820
S      1591
N      1552
NE     1311
Name: wind_direction, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22479 entries, 0 to 22478
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   assessment_hectares           22479 non-null  float6

In [3]:
# clean up categorical null variable
for var in categorical:
  df[var]= df[var].fillna("N/A")
  print(df[var].value_counts())


# clean up numerical null variable using median
for var in numerical:
  df[var] = df[var].fillna(df[var].median())

df.isnull().sum()



Surface    16526
Ground      4947
Crown       1006
Name: fire_type, dtype: int64
Flat         18359
Bottom        1386
Upper1/3      1067
Middle1/3      909
Lower1/3       758
Name: fire_position_on_slope, dtype: int64
Clear          10505
Cloudy          8109
CBWet           1501
CBDry           1414
Rainshowers      950
Name: weather_conditions_over_fire, dtype: int64
W      5099
CLM    3246
SW     2711
NW     2682
SE     2467
E      1820
S      1591
N      1552
NE     1311
Name: wind_direction, dtype: int64


assessment_hectares             0
current_size                    0
fire_spread_rate                0
fire_type                       0
fire_position_on_slope          0
weather_conditions_over_fire    0
temperature                     0
relative_humidity               0
wind_direction                  0
wind_speed                      0
dtype: int64

In [4]:
# Apply Labeling Encoding
from sklearn.preprocessing import LabelEncoder


le = LabelEncoder()
for var in categorical:
  df[var] = le.fit_transform(df[var])

df.head()

Unnamed: 0,assessment_hectares,current_size,fire_spread_rate,fire_type,fire_position_on_slope,weather_conditions_over_fire,temperature,relative_humidity,wind_direction,wind_speed
0,0.01,0.1,0.0,2,1,2,18.0,10,7,2
1,0.2,0.2,0.0,2,2,2,12.0,22,7,10
2,0.5,0.5,0.0,2,0,2,12.0,22,7,10
3,0.01,0.01,0.0,2,1,2,12.0,22,7,10
4,0.1,0.1,0.1,2,1,2,6.0,37,7,2


In [5]:
# normalizing
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(df)
scaled_features = scaler.transform(df)

In [6]:
df_scaled = pd.DataFrame(scaled_features,columns=df.columns)
df_scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22479 entries, 0 to 22478
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   assessment_hectares           22479 non-null  float64
 1   current_size                  22479 non-null  float64
 2   fire_spread_rate              22479 non-null  float64
 3   fire_type                     22479 non-null  float64
 4   fire_position_on_slope        22479 non-null  float64
 5   weather_conditions_over_fire  22479 non-null  float64
 6   temperature                   22479 non-null  float64
 7   relative_humidity             22479 non-null  float64
 8   wind_direction                22479 non-null  float64
 9   wind_speed                    22479 non-null  float64
dtypes: float64(10)
memory usage: 1.7 MB


In [7]:
# handling imbalanced data
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import NearMiss

x = df_scaled
y = label

smk = SMOTETomek()
x_res,y_res = smk.fit_resample(x,y)

x_res.shape, y_res.shape

((25418, 10), (25418,))

In [8]:
from sklearn.model_selection import train_test_split

# train test split
x_train, x_test, y_train, y_test = train_test_split(x_res,y_res,test_size=0.20,shuffle=True)

print("X_train shape:",x_train.shape)
print("X_test shape:",x_test.shape)
print("y_train shape:",y_train.shape)
print("y_test shape:",y_test.shape)

X_train shape: (20334, 10)
X_test shape: (5084, 10)
y_train shape: (20334,)
y_test shape: (5084,)


In [9]:
# model result
result_dict_train = {}
result_dict_test = {}

In [10]:
# SVC model
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train, y_train)

In [11]:
# Measure test acc and precision
from sklearn import metrics

y_pred = svc.predict(x_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))

Accuracy: 0.8271046420141621
Precision: 0.8162650602409639


In [13]:
# parameters tunning
from sklearn.model_selection import GridSearchCV
from sklearnex import patch_sklearn, config_context
patch_sklearn()

with config_context(target_offload="gpu:0"):
    grid = {
    'C':[0.01,0.1,0.5,1,10],
    'kernel' : ["linear","poly","rbf","sigmoid"],
    'degree' : [1,3,5,7 ],
    'gamma' : [0.01,1, "scale", "auto"]
    }   

    svm  = SVC()
    svm_cv = GridSearchCV(svm, grid, cv=5)
    svm_cv.fit(x_train,y_train)
    print("Best Parameters:",svm_cv.best_params_)
    y_pred = svm_cv.predict(x_test)

    print("Train Score:",svm_cv.best_score_)
    print("Test Score:",svm_cv.score(x_test,y_test))
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    print("Precision:",metrics.precision_score(y_test, y_pred))

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Best Parameters: {'C': 10, 'degree': 7, 'gamma': 1, 'kernel': 'poly'}
Train Score: 0.8326938681172991
Test Score: 0.8375295043273013
Accuracy: 0.8375295043273013
Precision: 0.848411741053478


In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))

Accuracy: 0.8490596110934013
Precision: 0.8292831105710814
