In [61]:
import pandas as pd
pd.set_option('display.max_columns',None)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,KFold,GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
import warnings
warnings.filterwarnings('ignore')
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import RFE
from pickle import dump

## 1.Loading data

In [2]:
data = pd.read_csv('Datasets/forestfires.csv')
data.head()

Unnamed: 0,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,...,monthfeb,monthjan,monthjul,monthjun,monthmar,monthmay,monthnov,monthoct,monthsep,size_category
0,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,...,0,0,0,0,1,0,0,0,0,small
1,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,...,0,0,0,0,0,0,0,1,0,small
2,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,...,0,0,0,0,0,0,0,1,0,small
3,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,...,0,0,0,0,1,0,0,0,0,small
4,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,...,0,0,0,0,1,0,0,0,0,small


## 2.Data Analysis

In [3]:
data.shape

(517, 31)

In [4]:
data.dtypes

month             object
day               object
FFMC             float64
DMC              float64
DC               float64
ISI              float64
temp             float64
RH                 int64
wind             float64
rain             float64
area             float64
dayfri             int64
daymon             int64
daysat             int64
daysun             int64
daythu             int64
daytue             int64
daywed             int64
monthapr           int64
monthaug           int64
monthdec           int64
monthfeb           int64
monthjan           int64
monthjul           int64
monthjun           int64
monthmar           int64
monthmay           int64
monthnov           int64
monthoct           int64
monthsep           int64
size_category     object
dtype: object

In [5]:
data.isna().sum()

month            0
day              0
FFMC             0
DMC              0
DC               0
ISI              0
temp             0
RH               0
wind             0
rain             0
area             0
dayfri           0
daymon           0
daysat           0
daysun           0
daythu           0
daytue           0
daywed           0
monthapr         0
monthaug         0
monthdec         0
monthfeb         0
monthjan         0
monthjul         0
monthjun         0
monthmar         0
monthmay         0
monthnov         0
monthoct         0
monthsep         0
size_category    0
dtype: int64

## 3.Data preprocessing

In [6]:
[x for x in data.columns if data[x].dtype == 'O']

['month', 'day', 'size_category']

In [7]:
data['month'].unique()

array(['mar', 'oct', 'aug', 'sep', 'apr', 'jun', 'jul', 'feb', 'jan',
       'dec', 'may', 'nov'], dtype=object)

In [8]:
data['day'].unique()

array(['fri', 'tue', 'sat', 'sun', 'mon', 'wed', 'thu'], dtype=object)

In [9]:
data['size_category'].unique()

array(['small', 'large'], dtype=object)

In [12]:
data.head(15)

Unnamed: 0,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,dayfri,daymon,daysat,daysun,daythu,daytue,daywed,monthapr,monthaug,monthdec,monthfeb,monthjan,monthjul,monthjun,monthmar,monthmay,monthnov,monthoct,monthsep,size_category
0,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,small
1,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,small
2,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,small
3,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,small
4,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,small
5,aug,sun,92.3,85.3,488.0,14.7,22.2,29,5.4,0.0,0.0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,small
6,aug,mon,92.3,88.9,495.6,8.5,24.1,27,3.1,0.0,0.0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,small
7,aug,mon,91.5,145.4,608.2,10.7,8.0,86,2.2,0.0,0.0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,small
8,sep,tue,91.0,129.5,692.6,7.0,13.1,63,5.4,0.0,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,small
9,sep,sat,92.5,88.0,698.6,7.1,22.8,40,4.0,0.0,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,small


In [17]:

data.groupby(['month','size_category'])[['size_category','month']].count()

Unnamed: 0_level_0,Unnamed: 1_level_0,size_category,month
month,size_category,Unnamed: 2_level_1,Unnamed: 3_level_1
apr,large,2,2
apr,small,7,7
aug,large,43,43
aug,small,141,141
dec,large,8,8
dec,small,1,1
feb,large,6,6
feb,small,14,14
jan,small,2,2
jul,large,9,9


In [20]:
forest_data = data.drop(labels=(['month','day']),axis=1)
forest_data.head()

Unnamed: 0,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,dayfri,daymon,daysat,daysun,daythu,daytue,daywed,monthapr,monthaug,monthdec,monthfeb,monthjan,monthjul,monthjun,monthmar,monthmay,monthnov,monthoct,monthsep,size_category
0,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,small
1,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,small
2,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,small
3,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,small
4,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,small


In [22]:
forest_data['size_category'] = forest_data['size_category'].apply(lambda x: 0 if x == 'small' else 1)
forest_data.head()

Unnamed: 0,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,dayfri,daymon,daysat,daysun,daythu,daytue,daywed,monthapr,monthaug,monthdec,monthfeb,monthjan,monthjul,monthjun,monthmar,monthmay,monthnov,monthoct,monthsep,size_category
0,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [23]:
forest_data['size_category'],data['size_category']

(0      0
 1      0
 2      0
 3      0
 4      0
       ..
 512    1
 513    1
 514    1
 515    0
 516    0
 Name: size_category, Length: 517, dtype: int64,
 0      small
 1      small
 2      small
 3      small
 4      small
        ...  
 512    large
 513    large
 514    large
 515    small
 516    small
 Name: size_category, Length: 517, dtype: object)

### Spliting X,y

In [24]:
X = forest_data.drop('size_category',axis=1)
y = forest_data[['size_category']]

In [27]:
std_sclr = StandardScaler()
col = X.columns
X = pd.DataFrame(std_sclr.fit_transform(X),columns=col)
X.head()

Unnamed: 0,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,dayfri,daymon,daysat,daysun,daythu,daytue,daywed,monthapr,monthaug,monthdec,monthfeb,monthjan,monthjul,monthjun,monthmar,monthmay,monthnov,monthoct,monthsep
0,-0.805959,-1.323326,-1.830477,-0.860946,-1.84264,0.411724,1.498614,-0.073268,-0.20202,2.254407,-0.408709,-0.440449,-0.474467,-0.365748,-0.375873,-0.341512,-0.133103,-0.743339,-0.133103,-0.200603,-0.062318,-0.256865,-0.184391,2.928152,-0.062318,-0.044023,-0.17286,-0.706081
1,-0.008102,-1.179541,0.488891,-0.509688,-0.153278,-0.692456,-1.741756,-0.073268,-0.20202,-0.443576,-0.408709,-0.440449,-0.474467,-0.365748,2.660475,-0.341512,-0.133103,-0.743339,-0.133103,-0.200603,-0.062318,-0.256865,-0.184391,-0.341512,-0.062318,-0.044023,5.785038,-0.706081
2,-0.008102,-1.049822,0.560715,-0.509688,-0.739383,-0.692456,-1.518282,-0.073268,-0.20202,-0.443576,-0.408709,2.27041,-0.474467,-0.365748,-0.375873,-0.341512,-0.133103,-0.743339,-0.133103,-0.200603,-0.062318,-0.256865,-0.184391,-0.341512,-0.062318,-0.044023,5.785038,-0.706081
3,0.191362,-1.212361,-1.898266,-0.004756,-1.825402,3.233519,-0.009834,0.603155,-0.20202,2.254407,-0.408709,-0.440449,-0.474467,-0.365748,-0.375873,-0.341512,-0.133103,-0.743339,-0.133103,-0.200603,-0.062318,-0.256865,-0.184391,2.928152,-0.062318,-0.044023,-0.17286,-0.706081
4,-0.243833,-0.931043,-1.7986,0.126966,-1.291012,3.356206,-1.23894,-0.073268,-0.20202,-0.443576,-0.408709,-0.440449,2.10763,-0.365748,-0.375873,-0.341512,-0.133103,-0.743339,-0.133103,-0.200603,-0.062318,-0.256865,-0.184391,2.928152,-0.062318,-0.044023,-0.17286,-0.706081


In [28]:
y.head()

Unnamed: 0,size_category
0,0
1,0
2,0
3,0
4,0


### Train test split

In [30]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.15,shuffle=True,random_state=23,stratify=y)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((439, 28), (78, 28), (439, 1), (78, 1))

## 4.Model building :base models
### svm rbf

In [37]:
base_model1 = SVC()
base_model1.fit(X_train,y_train)
pred1 = base_model1.predict(X_test)

In [38]:
accuracy_score(y_test,pred1)

0.7435897435897436

### svm linear

In [39]:
base_model2 = SVC(kernel='linear')
base_model2.fit(X_train,y_train)
pred2 = base_model2.predict(X_test)

In [40]:
accuracy_score(y_test,pred2)

0.8974358974358975

In [43]:
confusion_matrix(y_test,pred2)

array([[57,  0],
       [ 8, 13]])

In [44]:
print('\n',classification_report(y_test,pred2))


               precision    recall  f1-score   support

           0       0.88      1.00      0.93        57
           1       1.00      0.62      0.76        21

    accuracy                           0.90        78
   macro avg       0.94      0.81      0.85        78
weighted avg       0.91      0.90      0.89        78



### svm poly

In [41]:
base_model3 = SVC(kernel='poly')
base_model3.fit(X_train,y_train)
pred3 = base_model3.predict(X_test)

In [42]:
accuracy_score(y_test,pred3)

0.7435897435897436

## 5.Hyper parameter tuninng of best kernel i.e rbf

In [46]:
cv = KFold(n_splits=5,shuffle=True,random_state=23)

param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              }
 

grid = GridSearchCV(estimator=base_model2,param_grid=param_grid,scoring='accuracy',cv=cv)
grid.fit(X_train,y_train)

print(grid.best_params_)
print(grid.best_score_)

{'C': 10, 'gamma': 1}
0.9817920585161964


## 6.Feature selection

In [53]:
## usinf mlxtend SequantialFetureSelector for  getting two best features # forward feature selection

feature_selection = SequentialFeatureSelector(estimator=SVC(C=10,gamma=1),k_features=2,forward=True,n_jobs=-1)
feature_selection.fit(X,y)
print(feature_selection.k_feature_names_)
print(feature_selection.k_score_)

('area', 'monthjan')
0.9961351755041076


In [54]:
## getting best features

feature_selection = SequentialFeatureSelector(estimator=SVC(C=10,gamma=1),k_features='best',forward=True,n_jobs=-1)
feature_selection.fit(X,y)
print(feature_selection.k_feature_names_)
print(feature_selection.k_score_)

('area',)
0.9961351755041076


## 7. Final model : model with two best features and C=10,gamma=1

In [55]:
X = data[['area','monthjan']]
y = data[['size_category']]

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.15,shuffle=True,random_state=23,stratify=y)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((439, 2), (78, 2), (439, 1), (78, 1))

In [56]:
final_model = SVC(C=10,gamma=1)
final_model.fit(X_train,y_train)

SVC(C=10, gamma=1)

In [57]:
y_pred = final_model.predict(X_test)

In [58]:
accuracy_score(y_test,y_pred)

1.0

In [59]:
confusion_matrix(y_test,y_pred)

array([[21,  0],
       [ 0, 57]])

In [60]:
print('\n',classification_report(y_test,y_pred))


               precision    recall  f1-score   support

       large       1.00      1.00      1.00        21
       small       1.00      1.00      1.00        57

    accuracy                           1.00        78
   macro avg       1.00      1.00      1.00        78
weighted avg       1.00      1.00      1.00        78



## Model Deployment

In [62]:
dump(final_model,open('FOrest_fire_pred.pkl','wb'))