In [2]:
!wmic cpu get NumberOfCores,NumberOfLogicalProcessors

NumberOfCores  NumberOfLogicalProcessors  

4              8                          





In [5]:
import os
os.environ['OMP_NUM_THREADS']='7'

In [6]:
# import the necessary libraries

import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn import set_config             
set_config(transform_output="pandas")      
import multiprocessing
from sklearn.ensemble import RandomForestClassifier

In [7]:
# to load dataframe

url = 'https://drive.google.com/file/d/1UTI48spmL372ZWTrntQf3KlwbK4BgMvb/view?usp=sharing'
path = 'https://drive.google.com/uc?id='+url.split('/')[-2]
labeled_fires = pd.read_csv(path)

In [8]:
labeled_fires.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3295 entries, 0 to 3294
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   district           3295 non-null   object 
 1   community          3295 non-null   object 
 2   area_w_trees_ha    3295 non-null   float64
 3   area_wo_trees_ha   3295 non-null   float64
 4   reported_datetime  3295 non-null   object 
 5   duration_h         3295 non-null   float64
 6   duration_m         3295 non-null   float64
 7   deployment_m       3295 non-null   float64
 8   year               3295 non-null   int64  
 9   month              3295 non-null   int64  
 10  MO_N               1824 non-null   float64
 11  MO_TT              3286 non-null   float64
 12  MO_TX              3286 non-null   float64
 13  MO_TN              3286 non-null   float64
 14  MO_FK              2062 non-null   float64
 15  MX_TX              3286 non-null   float64
 16  MX_FX              2062 

In [9]:
# first we'll separate the datetime column into different columns - year, month, day, hour
# but first we'll convert it to datetime

labeled_fires['reported_datetime'] = pd.to_datetime(labeled_fires['reported_datetime'])

In [10]:
# now we can create separate columns

labeled_fires['year'] = labeled_fires['reported_datetime'].dt.year
labeled_fires['month'] = labeled_fires['reported_datetime'].dt.month
labeled_fires['day'] = labeled_fires['reported_datetime'].dt.dayofweek
labeled_fires['hour'] = labeled_fires['reported_datetime'].dt.hour

In [11]:
# we need to decide the columns we'll take into account for the model

labeled_fires.columns

Index(['district', 'community', 'area_w_trees_ha', 'area_wo_trees_ha',
       'reported_datetime', 'duration_h', 'duration_m', 'deployment_m', 'year',
       'month', 'MO_N', 'MO_TT', 'MO_TX', 'MO_TN', 'MO_FK', 'MX_TX', 'MX_FX',
       'MX_TN', 'MO_SD_S', 'MO_RR', 'MX_RS', 'cluster', 'burned_area',
       'log_burned_area', 'military', 'mili_y_n', 'day', 'hour'],
      dtype='object')

In [12]:
# we need to create a new data frame with only the columns we'll use for the model

model_df = labeled_fires[['district', 'year', 'month', 'day', 'hour', 'military', 'MO_N', 'MO_TT', 'MO_TX', 'MO_TN', 'MO_FK', 'MX_TX', 'MX_FX',
       'MX_TN', 'MO_SD_S', 'MO_RR', 'MX_RS', 'cluster']]
model_df

Unnamed: 0,district,year,month,day,hour,military,MO_N,MO_TT,MO_TX,MO_TN,MO_FK,MX_TX,MX_FX,MX_TN,MO_SD_S,MO_RR,MX_RS,cluster
0,Teltow-Fläming,2012,1,1,7,0,5.79,1.81,4.04,-0.74,3.16,13.0,23.0,-11.0,49.9,75.5,9.7,1
1,Oberspreewald-Lausitz,2012,3,6,16,0,,6.77,12.11,1.41,,22.2,,-5.4,169.8,13.8,3.8,1
2,Dahme-Spreewald,2012,3,2,11,0,,6.99,12.45,1.72,,22.1,,-4.9,169.3,10.8,3.1,2
3,Teltow-Fläming,2012,3,2,12,0,4.16,6.93,12.51,1.44,2.71,21.7,20.7,-5.1,169.3,9.1,2.6,2
4,Elbe-Elster,2012,3,4,14,0,5.54,6.90,12.25,1.82,2.50,21.9,22.4,-4.7,166.1,10.0,2.8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3290,Elbe-Elster,2023,9,0,11,0,,,,,,,,,,,,0
3291,Barnim,2023,9,0,16,0,,,,,,,,,,,,2
3292,Elbe-Elster,2023,9,1,14,0,,,,,,,,,,,,1
3293,Spree-Neiße,2023,9,0,12,0,,,,,,,,,,,,1


In [13]:
# now we need to split our dataset into X and y  - X will be the features we'll use for the model and y will be the target

X = model_df
y = X.pop('cluster') # we'll use the cluster column as our target

In [14]:
# and now we can split into traing and testing sets
# 80% for training and 20% for testing  - we'll use the random_state parameter to make sure we get the same split every time

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 4)

In [15]:
# first we need to divide into numerical and categorical data

X_num_train =  X_train.select_dtypes(include = "number")
X_num_test = X_test.select_dtypes(include = "number")
X_cat_train = X_train.select_dtypes(exclude="number")
X_cat_test = X_test.select_dtypes(exclude="number")

In [16]:
# now we need to impute the missing values in the dataset  - we'll use the SimpleImputer class from the sklearn library

# we import necessary libraries
from sklearn.impute import SimpleImputer

# we start with numeric features
num_imputer = SimpleImputer(strategy = 'mean')                 # first we initialize the imputer
num_imputer.fit(X_num_train)                                   # then we fit the imputer to the training data
X_num_train_imp = num_imputer.transform(X_num_train)           # and transform the training set
X_num_test_imp = num_imputer.transform(X_num_test)             # and transform the testing set

# now we do the same for the categorical features
cat_imputer = SimpleImputer(strategy = 'constant', fill_value = 'N_A')         # first we initialize the imputer
cat_imputer.fit(X_cat_train)                                             # then we fit the imputer to the training data
X_cat_train_imp = cat_imputer.transform(X_cat_train)                       # and transform the training set
X_cat_test_imp = cat_imputer.transform(X_cat_test)                         # and transform the testing set

In [17]:
# and now we need to use one hot encoder to transform the categorical data into numerical data

# first we initialize the encoder
cat_encoder = OneHotEncoder(drop = 'first', sparse_output = False, handle_unknown = 'ignore') 
cat_encoder.fit(X_cat_train_imp)                                            # then we fit the encoder to the training data
X_cat_train_imp_hot = cat_encoder.transform(X_cat_train_imp)                # and transform the training set
X_cat_test_imp_hot = cat_encoder.transform(X_cat_test_imp)                  # and transform the testing set

In [18]:
# now we concatenate the numerical and categorical data

X_train_imp = pd.concat([X_num_train_imp, X_cat_train_imp_hot], axis = 1)
X_test_imp = pd.concat([X_num_test_imp, X_cat_test_imp_hot], axis = 1)

display(X_train_imp.head(1))
display(X_test_imp.head(1))

Unnamed: 0,year,month,day,hour,military,MO_N,MO_TT,MO_TX,MO_TN,MO_FK,...,district_Oberhavel,district_Oberspreewald-Lausitz,district_Oder-Spree,district_Ostprignitz-Ruppin,district_Potsdam,district_Potsdam-Mittelmark,district_Prignitz,district_Spree-Neiße,district_Teltow-Fläming,district_Uckermark
2265,2020.0,6.0,2.0,12.0,0.0,5.04,18.07,24.07,11.76,2.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,year,month,day,hour,military,MO_N,MO_TT,MO_TX,MO_TN,MO_FK,...,district_Oberhavel,district_Oberspreewald-Lausitz,district_Oder-Spree,district_Ostprignitz-Ruppin,district_Potsdam,district_Potsdam-Mittelmark,district_Prignitz,district_Spree-Neiße,district_Teltow-Fläming,district_Uckermark
229,2012.0,9.0,1.0,14.0,0.0,5.61,13.66,19.52,7.99,2.53,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


# Pipeline
Now that we've seen every step of the way works, we can build an automated pipeline with branches to train our model with GridSearch Cross Validation

In [19]:
# first we save the numerical and categorical columns in different variables
X_num_col = X.select_dtypes(include = "number").columns
X_cat_col = X.select_dtypes(exclude = "number").columns

# now we create the pipeline for numerical data
# we create and 'empty' pipeline because we'll use GridSearch to find the best options for the model
num_pipe = make_pipeline(SimpleImputer())

# and the categorical data
cat_pipe = make_pipeline(SimpleImputer(strategy = 'constant', fill_value = 'N_A'), 
                         OneHotEncoder(drop = 'first', sparse_output = False, handle_unknown = 'ignore'))

# and we initialize the scaler and the model
r_forest = RandomForestClassifier()
scaler = StandardScaler()

In [20]:
# we create the column transformer that will combine the numerical and categorical data
preprocessor = make_column_transformer((num_pipe, X_num_col), 
                                       (cat_pipe, X_cat_col),)

In [21]:
# and now we can build the full pipeline
full_pipe =  make_pipeline(preprocessor, scaler, r_forest)
full_pipe

In [22]:
# now we need to define the paraemters we'll use for the grid search

param_grid = {
    'columntransformer__pipeline-1__simpleimputer__strategy': ['mean', 'median'],
    'standardscaler__with_mean': [True, False],
    'standardscaler__with_std': [True, False],
    'randomforestclassifier__n_estimators': [100, 200, 300],
    'randomforestclassifier__max_depth': [None, 10, 20],
    'randomforestclassifier__min_samples_split': [2, 5, 10],
    'randomforestclassifier__min_samples_leaf': [1, 2, 4],
    'randomforestclassifier__bootstrap': [True, False]
}


In [23]:
search = GridSearchCV(full_pipe,
                      param_grid,
                      cv = 5,
                      scoring = 'accuracy',
                      verbose = 1)

In [24]:
search.fit(X_train, y_train)

Fitting 5 folds for each of 1296 candidates, totalling 6480 fits


In [25]:
search.best_params_

{'columntransformer__pipeline-1__simpleimputer__strategy': 'median',
 'randomforestclassifier__bootstrap': True,
 'randomforestclassifier__max_depth': 10,
 'randomforestclassifier__min_samples_leaf': 4,
 'randomforestclassifier__min_samples_split': 5,
 'randomforestclassifier__n_estimators': 100,
 'standardscaler__with_mean': False,
 'standardscaler__with_std': False}

In [26]:
search.best_score_

0.5356549364613881

In [27]:
y_train_pred = search.predict(X_train)
accuracy_score(y_train, y_train_pred)

0.6407435508345979

In [28]:
y_test_pred = search.predict(X_test)
accuracy_score(y_test, y_test_pred)

0.5083459787556904

# Numeric pipe

In [29]:
# we have a low accuracy and we seem to be having overfitting, so we need to try it in a diffrent way
# we'll first try only with numerical data
# we have: X_num_col for the numeric columns, the numeric pipe in num_pipe, and the model and scaler in r_forest and scaler

# and now we can build the full pipeline
full_num_pipe =  make_pipeline(num_pipe, scaler, r_forest)
full_num_pipe

In [37]:
param_grid_num = {
    'pipeline__simpleimputer__strategy': ['mean', 'median'],
    'standardscaler__with_mean': [True, False],
    'standardscaler__with_std': [True, False],
    'randomforestclassifier__n_estimators': [100, 200, 300],
    'randomforestclassifier__max_depth': [None, 10, 20],
    'randomforestclassifier__min_samples_split': [2, 5, 10],
    'randomforestclassifier__min_samples_leaf': [1, 2, 4],
    'randomforestclassifier__bootstrap': [True, False]
}

In [38]:
search_num = GridSearchCV(full_num_pipe,
                      param_grid_num,
                      cv = 5,
                      scoring = 'accuracy',
                      verbose = 1)

In [41]:
search_num.fit(X_num_train, y_train)

Fitting 5 folds for each of 1296 candidates, totalling 6480 fits


In [42]:
search_num.best_params_

{'pipeline__simpleimputer__strategy': 'mean',
 'randomforestclassifier__bootstrap': True,
 'randomforestclassifier__max_depth': 10,
 'randomforestclassifier__min_samples_leaf': 4,
 'randomforestclassifier__min_samples_split': 2,
 'randomforestclassifier__n_estimators': 300,
 'standardscaler__with_mean': True,
 'standardscaler__with_std': False}

In [43]:
search_num.best_score_

0.5303454372951527

In [47]:
accuracy_score(y_train, search_num.predict(X_num_train))

0.678679817905918

In [46]:
accuracy_score(y_test, search_num.predict(X_num_test))

0.5022761760242792

# Feature selection
We know we have some features with a lot of NaNs or that may not be relevant. We'll try to exclude those in order to increase accuracy and reduce overfitting

In [48]:
labeled_fires.isnull().sum()

district                0
community               0
area_w_trees_ha         0
area_wo_trees_ha        0
reported_datetime       0
duration_h              0
duration_m              0
deployment_m            0
year                    0
month                   0
MO_N                 1471
MO_TT                   9
MO_TX                   9
MO_TN                   9
MO_FK                1233
MX_TX                   9
MX_FX                1233
MX_TN                   9
MO_SD_S                97
MO_RR                  65
MX_RS                  65
cluster                 0
burned_area             0
log_burned_area         0
military                0
mili_y_n                0
day                     0
hour                    0
dtype: int64

In [86]:
# we need to create a new data frame with only the columns we'll use for the new model
# we'll drop MO_N, MO_FK, MX_FX, and military - and the other features we don't need for the model

model_2_df = labeled_fires[['district', 'year', 'month', 'day', 'hour', 'military', 'MO_TT', 'MO_TX', 'MO_TN', 'MX_TX',
       'MX_TN', 'MO_SD_S', 'MO_RR', 'MX_RS', 'cluster']]
model_2_df

Unnamed: 0,district,year,month,day,hour,military,MO_TT,MO_TX,MO_TN,MX_TX,MX_TN,MO_SD_S,MO_RR,MX_RS,cluster
0,Teltow-Fläming,2012,1,1,7,0,1.81,4.04,-0.74,13.0,-11.0,49.9,75.5,9.7,1
1,Oberspreewald-Lausitz,2012,3,6,16,0,6.77,12.11,1.41,22.2,-5.4,169.8,13.8,3.8,1
2,Dahme-Spreewald,2012,3,2,11,0,6.99,12.45,1.72,22.1,-4.9,169.3,10.8,3.1,2
3,Teltow-Fläming,2012,3,2,12,0,6.93,12.51,1.44,21.7,-5.1,169.3,9.1,2.6,2
4,Elbe-Elster,2012,3,4,14,0,6.90,12.25,1.82,21.9,-4.7,166.1,10.0,2.8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3290,Elbe-Elster,2023,9,0,11,0,,,,,,,,,0
3291,Barnim,2023,9,0,16,0,,,,,,,,,2
3292,Elbe-Elster,2023,9,1,14,0,,,,,,,,,1
3293,Spree-Neiße,2023,9,0,12,0,,,,,,,,,1


In [87]:
# now we need to split our dataset into X and y  - X will be the features we'll use for the model and y will be the target

X2 = model_2_df
y2 = X2.pop('cluster') # we'll use the cluster column as our target

In [88]:
# and now we can split into traing and testing sets
# 80% for training and 20% for testing  - we'll use the random_state parameter to make sure we get the same split every time

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size = 0.2, random_state = 4)

In [89]:
# first we need to divide into numerical and categorical data

X2_num_train =  X2_train.select_dtypes(include = "number")
X2_num_test = X2_test.select_dtypes(include = "number")
X2_cat_train = X2_train.select_dtypes(exclude="number")
X2_cat_test = X2_test.select_dtypes(exclude="number")

In [90]:
# now we need to impute the missing values in the dataset  - we'll use the SimpleImputer class from the sklearn library

# we start with numeric features
num_imputer2 = SimpleImputer(strategy = 'mean')                 # first we initialize the imputer
num_imputer2.fit(X2_num_train)                                   # then we fit the imputer to the training data
X_num_train_imp2 = num_imputer2.transform(X2_num_train)           # and transform the training set
X_num_test_imp2 = num_imputer2.transform(X2_num_test)             # and transform the testing set

# now we do the same for the categorical features
cat_imputer2 = SimpleImputer(strategy = 'constant', fill_value = 'N_A')         # first we initialize the imputer
cat_imputer2.fit(X2_cat_train)                                             # then we fit the imputer to the training data
X_cat_train_imp2 = cat_imputer2.transform(X2_cat_train)                       # and transform the training set
X_cat_test_imp2 = cat_imputer2.transform(X2_cat_test)                         # and transform the testing set

In [91]:
# and now we need to use one hot encoder to transform the categorical data into numerical data

# first we initialize the encoder
cat_encoder2 = OneHotEncoder(drop = 'first', sparse_output = False, handle_unknown = 'ignore') 
cat_encoder2.fit(X_cat_train_imp2)                                            # then we fit the encoder to the training data
X_cat_train_imp_hot2 = cat_encoder.transform(X_cat_train_imp2)                # and transform the training set
X_cat_test_imp_hot2 = cat_encoder.transform(X_cat_test_imp2)                  # and transform the testing set

In [92]:
# now we concatenate the numerical and categorical data

X_train_imp2 = pd.concat([X_num_train_imp2, X_cat_train_imp_hot2], axis = 1)
X_test_imp2 = pd.concat([X_num_test_imp2, X_cat_test_imp_hot2], axis = 1)

display(X_train_imp2.head(1))
display(X_test_imp2.head(1))

Unnamed: 0,year,month,day,hour,military,MO_TT,MO_TX,MO_TN,MX_TX,MX_TN,...,district_Oberhavel,district_Oberspreewald-Lausitz,district_Oder-Spree,district_Ostprignitz-Ruppin,district_Potsdam,district_Potsdam-Mittelmark,district_Prignitz,district_Spree-Neiße,district_Teltow-Fläming,district_Uckermark
2265,2020.0,6.0,2.0,12.0,0.0,18.07,24.07,11.76,30.8,6.57,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,year,month,day,hour,military,MO_TT,MO_TX,MO_TN,MX_TX,MX_TN,...,district_Oberhavel,district_Oberspreewald-Lausitz,district_Oder-Spree,district_Ostprignitz-Ruppin,district_Potsdam,district_Potsdam-Mittelmark,district_Prignitz,district_Spree-Neiße,district_Teltow-Fläming,district_Uckermark
229,2012.0,9.0,1.0,14.0,0.0,13.66,19.52,7.99,29.9,2.9,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


# Pipeline
Now that we've seen every step of the way works, we can build an automated pipeline with branches to train our model with GridSearch Cross Validation

In [93]:
# first we save the numerical and categorical columns in different variables
X_num_col2 = X2.select_dtypes(include = "number").columns
X_cat_col2 = X2.select_dtypes(exclude = "number").columns

# now we create the pipeline for numerical data
# we create and 'empty' pipeline because we'll use GridSearch to find the best options for the model
num_pipe2 = make_pipeline(SimpleImputer())

# and the categorical data
cat_pipe2 = make_pipeline(SimpleImputer(strategy = 'constant', fill_value = 'N_A'), 
                         OneHotEncoder(drop = 'first', sparse_output = False, handle_unknown = 'ignore'))

# and we initialize the scaler and the model
r_forest = RandomForestClassifier()
scaler = StandardScaler()

In [94]:
# we create the column transformer that will combine the numerical and categorical data
preprocessor2 = make_column_transformer((num_pipe2, X_num_col2), 
                                       (cat_pipe2, X_cat_col2),)

In [95]:
# and now we can build the full pipeline
full_pipe2 =  make_pipeline(preprocessor2, scaler, r_forest)
full_pipe2

In [96]:
# now we need to define the paraemters we'll use for the grid search

param_grid2 = {
    'columntransformer__pipeline-1__simpleimputer__strategy': ['mean', 'median'],
    'standardscaler__with_mean': [True, False],
    'standardscaler__with_std': [True, False],
    'randomforestclassifier__n_estimators': [50, 100, 150],
    'randomforestclassifier__max_depth': [5, 10, 15],
    'randomforestclassifier__min_samples_split': [2, 5, 10],
    'randomforestclassifier__min_samples_leaf': [1, 2, 4],
    'randomforestclassifier__bootstrap': [True, False]
}


In [97]:
search2 = GridSearchCV(full_pipe2,
                      param_grid2,
                      cv = 5,
                      scoring = 'accuracy',
                      verbose = 1)

In [98]:
search2.fit(X2_train, y2_train)

Fitting 5 folds for each of 1296 candidates, totalling 6480 fits


In [99]:
search2.best_params_

{'columntransformer__pipeline-1__simpleimputer__strategy': 'mean',
 'randomforestclassifier__bootstrap': True,
 'randomforestclassifier__max_depth': 10,
 'randomforestclassifier__min_samples_leaf': 4,
 'randomforestclassifier__min_samples_split': 10,
 'randomforestclassifier__n_estimators': 100,
 'standardscaler__with_mean': True,
 'standardscaler__with_std': False}

In [100]:
search2.best_score_

0.5371779943649015

In [101]:
accuracy_score(y2_train, search2.predict(X2_train))

0.6388467374810318

In [102]:
accuracy_score(y2_test, search2.predict(X2_test))

0.5189681335356601

# Numeric pipe

In [71]:
# we have a low accuracy and we seem to be having overfitting, so we need to try it in a diffrent way
# we'll first try only with numerical data
# we have: X_num_col for the numeric columns, the numeric pipe in num_pipe, and the model and scaler in r_forest and scaler

# and now we can build the full pipeline
full_num_pipe2 =  make_pipeline(num_pipe2, scaler, r_forest)
full_num_pipe2

In [72]:
param_grid_num2 = {
    'pipeline__simpleimputer__strategy': ['mean', 'median'],
    'standardscaler__with_mean': [True, False],
    'standardscaler__with_std': [True, False],
    'randomforestclassifier__n_estimators': [50, 100, 150],
    'randomforestclassifier__max_depth': [5, 10, 15],
    'randomforestclassifier__min_samples_split': [2, 5, 10],
    'randomforestclassifier__min_samples_leaf': [1, 2, 4],
    'randomforestclassifier__bootstrap': [True, False]
}

In [75]:
search_num2 = GridSearchCV(full_num_pipe2,
                      param_grid_num2,
                      cv = 5,
                      scoring = 'accuracy',
                      verbose = 1)

In [80]:
search_num2.fit(X2_num_train, y2_train)

Fitting 5 folds for each of 1296 candidates, totalling 6480 fits


In [81]:
search_num2.best_params_

{'pipeline__simpleimputer__strategy': 'mean',
 'randomforestclassifier__bootstrap': True,
 'randomforestclassifier__max_depth': 10,
 'randomforestclassifier__min_samples_leaf': 4,
 'randomforestclassifier__min_samples_split': 5,
 'randomforestclassifier__n_estimators': 100,
 'standardscaler__with_mean': False,
 'standardscaler__with_std': False}

In [82]:
search_num2.best_score_

0.5265525271692255

In [84]:
accuracy_score(y2_train, search_num2.predict(X2_num_train))

0.6942336874051593

In [85]:
accuracy_score(y2_test, search_num2.predict(X2_num_test))

0.49165402124430957