# Forest Fire

## Predicting the area burned by forest fires.

### Feature engineering

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the dataset from the CSV file into a DataFrame.
data_frame = pd.read_csv('forest_fires.csv')

In [3]:
# Preview the first rows of the freshly loaded frame.
data_frame.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [4]:
# Count the missing entries in every column.
data_frame.isna().sum()

X        0
Y        0
month    0
day      0
FFMC     0
DMC      0
DC       0
ISI      0
temp     0
RH       0
wind     0
rain     0
area     0
dtype: int64

There are no missing values

In [5]:
# Convert categorical (object-dtype) features into numerical ones.
# Each category is replaced by its position when the column's categories are
# ordered from least to most frequent, so the rarest category becomes 0.
object_columns = [name for name in data_frame.columns if data_frame[name].dtype == 'O']
for name in object_columns:
    categories_by_frequency = data_frame[name].value_counts(ascending=True).index
    rank_of = dict(zip(categories_by_frequency, range(len(categories_by_frequency))))
    data_frame[name] = data_frame[name].map(rank_of)

In [6]:
# Inspect the frame again; the object-dtype columns now hold integer codes.
data_frame.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,9,5,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,5,2,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,5,4,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,9,5,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,9,6,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [7]:
# Replace the 'RH' feature with its natural logarithm; no other column is changed.
# NOTE: re-running this cell applies the logarithm a second time.
data_frame['RH'] = data_frame['RH'].pipe(np.log)

In [8]:
# Inspect the frame again; 'RH' now holds log-transformed values.
data_frame.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,9,5,86.2,26.2,94.3,5.1,8.2,3.931826,6.7,0.0,0.0
1,7,4,5,2,90.6,35.4,669.1,6.7,18.0,3.496508,0.9,0.0,0.0
2,7,4,5,4,90.6,43.7,686.9,6.7,14.6,3.496508,1.3,0.0,0.0
3,8,6,9,5,91.7,33.3,77.5,9.0,8.3,4.574711,4.0,0.2,0.0
4,8,6,9,6,89.3,51.3,102.2,9.6,11.4,4.59512,1.8,0.0,0.0


In [10]:
# Handling the outliers: start by collecting the continuous features, i.e. the
# non-object columns that take more than 25 distinct values.
continuos_features = [
    name
    for name in data_frame.columns
    if data_frame[name].dtype != 'O' and data_frame[name].nunique(dropna=False) > 25
]

In [12]:
# Report, for each continuous feature, the values whose z-score exceeds 3,
# i.e. values more than three standard deviations above the column mean.
# The comparison is one-sided: values far below the mean are not reported.
Z_LIMIT = 3
for name in continuos_features:
    column = data_frame[name]
    z_scores = (column - column.mean()) / column.std()
    flagged = column[z_scores > Z_LIMIT].tolist()

    print('Outliers of {}'.format(name))
    print(flagged)

Outliers of FFMC
[]
Outliers of DMC
[]
Outliers of DC
[]
Outliers of ISI
[56.1]
Outliers of temp
[]
Outliers of RH
[]
Outliers of area
[212.88, 1090.84, 746.28, 278.53]


In [13]:
# Replace upper-tail outliers with the column median.
#
# For every continuous feature, a value counts as an outlier when its z-score
# (distance from the column mean in units of the column standard deviation)
# exceeds the threshold. Only values above the mean are tested, matching the
# outlier-report cells of this notebook.
#
# The replacement is restricted to the column being processed.
# DataFrame.replace(val, median) is deliberately avoided: it swaps `val`
# wherever it occurs in the whole frame, so an equal number in an unrelated
# column would be overwritten with this column's median as well.
#
# NOTE(review): 'area' is one of the columns processed here and looks like the
# prediction target -- confirm that imputing it with the median is intended.
threshold = 3
for cols in continuos_features:
    column = data_frame[cols]
    # Mean, std and median are all taken from the column before any replacement.
    z_scores = (column - column.mean()) / column.std()
    is_outlier = z_scores > threshold
    # mask() swaps only the flagged rows and leaves every other value untouched.
    data_frame[cols] = column.mask(is_outlier, column.median())

In [14]:
# Run the outlier report again on the current contents of the frame: for each
# continuous feature, list the values whose z-score exceeds 3. The mean and
# standard deviation are recomputed from the data as it is now. Only values
# above the mean are tested.
Z_LIMIT = 3
for name in continuos_features:
    column = data_frame[name]
    z_scores = (column - column.mean()) / column.std()
    flagged = column[z_scores > Z_LIMIT].tolist()

    print('Outliers of {}'.format(name))
    print(flagged)

Outliers of FFMC
[]
Outliers of DMC
[]
Outliers of DC
[]
Outliers of ISI
[22.6, 22.7, 21.3]
Outliers of temp
[]
Outliers of RH
[]
Outliers of area
[88.49, 95.18, 103.39, 105.66, 154.88, 196.48, 200.94, 86.45, 174.63, 185.76, 82.75]


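The number of flagged outliers has increased. Replacing the most extreme values with the median lowers each column's mean and standard deviation, so values that were previously within three standard deviations of the mean now exceed the threshold. However, the differences between the newly flagged outlier values are similar to one another.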

In [15]:
# Persist the modified frame to CSV, leaving out the row index.
data_frame.to_csv('modified_data.csv', index=False)