In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest, chi2, f_classif
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from sklearn.metrics import classification_report, balanced_accuracy_score, explained_variance_score, precision_recall_curve
from sklearn.metrics import zero_one_loss, cohen_kappa_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import pickle
%matplotlib inline

In [18]:
df = pd.read_csv('D:/Rubix/data/fires_dataset.csv')
#display all the columns of the dataframe
pd.set_option('display.max_columns',None)

In [19]:
df.head()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
0,1,6,2012,32,71,12,0.7,57.1,2.5,8.2,0.6,2.8,0.2,not fire
1,2,6,2012,30,73,13,4.0,55.7,2.7,7.8,0.6,2.9,0.2,not fire
2,3,6,2012,29,80,14,2.0,48.7,2.2,7.6,0.3,2.6,0.1,not fire
3,4,6,2012,30,64,14,0.0,79.4,5.2,15.4,2.2,5.6,1.0,not fire
4,5,6,2012,32,60,14,0.2,77.1,6.0,17.6,1.8,6.5,0.9,not fire


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   day          122 non-null    int64  
 1   month        122 non-null    int64  
 2   year         122 non-null    int64  
 3   Temperature  122 non-null    int64  
 4    RH          122 non-null    int64  
 5    Ws          122 non-null    int64  
 6   Rain         122 non-null    float64
 7   FFMC         122 non-null    float64
 8   DMC          122 non-null    float64
 9   DC           122 non-null    object 
 10  ISI          122 non-null    float64
 11  BUI          122 non-null    float64
 12  FWI          122 non-null    object 
 13  Classes      121 non-null    object 
dtypes: float64(5), int64(6), object(3)
memory usage: 13.5+ KB


In [21]:
df['day'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
      dtype=int64)

In [22]:
data = df.copy()
print(data.shape)
data = data.dropna(how = 'all')
print(data.shape)

(122, 14)
(122, 14)


In [23]:
data.drop(data[data['day'] == 'Sidi-Bel Abbes Region Dataset'].index, inplace = True)

In [24]:
data.drop(data[data['day'] == 'day'].index, inplace = True)

In [25]:
data.isnull().sum()

day            0
month          0
year           0
Temperature    0
 RH            0
 Ws            0
Rain           0
FFMC           0
DMC            0
DC             0
ISI            0
BUI            0
FWI            0
Classes        1
dtype: int64

In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   day          122 non-null    int64  
 1   month        122 non-null    int64  
 2   year         122 non-null    int64  
 3   Temperature  122 non-null    int64  
 4    RH          122 non-null    int64  
 5    Ws          122 non-null    int64  
 6   Rain         122 non-null    float64
 7   FFMC         122 non-null    float64
 8   DMC          122 non-null    float64
 9   DC           122 non-null    object 
 10  ISI          122 non-null    float64
 11  BUI          122 non-null    float64
 12  FWI          122 non-null    object 
 13  Classes      121 non-null    object 
dtypes: float64(5), int64(6), object(3)
memory usage: 13.5+ KB


In [27]:
data['FWI'].unique()

array(['0.2', '0.1', '1', '0.9', '3.1', '6', '0.8', '1.9', '10', '16.7',
       '12.9', '0.3', '0.6', '0.4', '4.9', '1.2', '5.3', '6.7', '9.5',
       '12', '18.4', '7.2', '2.2', '0.5', '6.4', '5.2', '9.9', '15',
       '6.3', '3', '1.3', '9.6', '4.7', 'fire   ', '14.1', '7.3', '7.7',
       '9.1', '13', '17.3', '30', '25.4', '16.3', '2.5', '0.7', '3.2',
       '9', '10.9', '13.2', '14.5', '13.5', '19.5', '20.9', '12.6', '5.9',
       '12.7', '10.7', '21.6', '18.8', '10.5', '15.7', '5.5', '14.8',
       '24', '26.3', '12.2', '18.1', '24.5', '26.9', '31.1', '30.3',
       '26.1', '6.1', '16', '19.4', '0', '2.7', '3.7', '3.9', '10.3',
       '5.7', '9.8', '19.3', '17.5', '15.4', '5.6', '3.8', '13.7', '15.2',
       '6.5'], dtype=object)

In [28]:
data.columns = data.columns.str.rstrip()
data.columns = data.columns.str.lstrip()

In [29]:
data.columns

Index(['day', 'month', 'year', 'Temperature', 'RH', 'Ws', 'Rain', 'FFMC',
       'DMC', 'DC', 'ISI', 'BUI', 'FWI', 'Classes'],
      dtype='object')

In [30]:
data['Classes'].unique()

array(['not fire   ', 'fire   ', 'not fire     ', nan, 'not fire    '],
      dtype=object)

In [31]:
data['Classes'] = data['Classes'].str.rstrip()
data['Classes'].unique()

array(['not fire', 'fire', nan], dtype=object)

In [32]:
data['ISI'] = np.where(data['DC'] == '14.6 9','9',data['ISI'])
data['BUI'] = np.where(data['DC'] == '14.6 9','12.5',data['BUI'])
data['FWI'] = np.where(data['DC'] == '14.6 9','10.4',data['FWI'])
data['Classes'] = np.where(data['DC'] == '14.6 9','fire',data['Classes'])
data['DC'] = np.where(data['DC'] == '14.6 9','14.6',data['DC'])

In [33]:
data['FWI'].unique()

array(['0.2', '0.1', '1', '0.9', '3.1', '6', '0.8', '1.9', '10', '16.7',
       '12.9', '0.3', '0.6', '0.4', '4.9', '1.2', '5.3', '6.7', '9.5',
       '12', '18.4', '7.2', '2.2', '0.5', '6.4', '5.2', '9.9', '15',
       '6.3', '3', '1.3', '9.6', '4.7', '10.4', '14.1', '7.3', '7.7',
       '9.1', '13', '17.3', '30', '25.4', '16.3', '2.5', '0.7', '3.2',
       '9', '10.9', '13.2', '14.5', '13.5', '19.5', '20.9', '12.6', '5.9',
       '12.7', '10.7', '21.6', '18.8', '10.5', '15.7', '5.5', '14.8',
       '24', '26.3', '12.2', '18.1', '24.5', '26.9', '31.1', '30.3',
       '26.1', '6.1', '16', '19.4', '0', '2.7', '3.7', '3.9', '10.3',
       '5.7', '9.8', '19.3', '17.5', '15.4', '5.6', '3.8', '13.7', '15.2',
       '6.5'], dtype=object)

In [34]:
int_columns = data.columns[0:6]
int_columns

Index(['day', 'month', 'year', 'Temperature', 'RH', 'Ws'], dtype='object')

In [35]:
float_columns = data.columns[6:-1]
float_columns

Index(['Rain', 'FFMC', 'DMC', 'DC', 'ISI', 'BUI', 'FWI'], dtype='object')

In [36]:
data[int_columns] = data[int_columns].astype('int')
data[float_columns] = data[float_columns].astype('float')

In [37]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   day          122 non-null    int32  
 1   month        122 non-null    int32  
 2   year         122 non-null    int32  
 3   Temperature  122 non-null    int32  
 4   RH           122 non-null    int32  
 5   Ws           122 non-null    int32  
 6   Rain         122 non-null    float64
 7   FFMC         122 non-null    float64
 8   DMC          122 non-null    float64
 9   DC           122 non-null    float64
 10  ISI          122 non-null    float64
 11  BUI          122 non-null    float64
 12  FWI          122 non-null    float64
 13  Classes      122 non-null    object 
dtypes: float64(7), int32(6), object(1)
memory usage: 10.6+ KB


In [38]:
data.describe()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI
count,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0
mean,15.754098,7.5,2012.0,33.163934,55.901639,15.008197,0.678689,81.102459,17.031967,45.415574,5.863934,17.920492,8.520492
std,8.843274,1.115259,0.0,3.675608,15.716186,2.692186,1.486759,12.244064,12.995068,42.927562,4.803667,13.870785,8.137424
min,1.0,6.0,2012.0,24.0,21.0,6.0,0.0,37.9,0.9,7.3,0.1,1.4,0.0
25%,8.0,7.0,2012.0,30.0,43.25,14.0,0.0,77.65,7.325,14.7,1.825,7.4,0.925
50%,16.0,7.5,2012.0,34.0,56.0,15.0,0.0,84.85,13.15,31.5,4.6,13.9,6.05
75%,23.0,8.0,2012.0,36.0,66.75,16.75,0.475,89.275,22.9,56.975,8.625,23.875,13.65
max,31.0,9.0,2012.0,42.0,90.0,29.0,8.7,96.0,65.9,177.3,19.0,68.0,31.1


In [39]:
num_col = data._get_numeric_data().columns
num_col

Index(['day', 'month', 'year', 'Temperature', 'RH', 'Ws', 'Rain', 'FFMC',
       'DMC', 'DC', 'ISI', 'BUI', 'FWI'],
      dtype='object')

In [42]:
describe_num_df = data.describe(include=['int64','float64'])
describe_num_df.reset_index(inplace=True)
describe_num_df = describe_num_df[describe_num_df['index'] != 'count']
for i in num_col:
  if i in ['index']:
    continue
  sns.factorplot(x="index", y=i, data=describe_num_df)
  plt.show()

NameError: name 'seaborn' is not defined