In [331]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [332]:
# Load the forest-fires CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv('../input/forest-fires-data-set/forestfires.csv')


In [333]:
# Show the column names of the dataset
data.columns

In [334]:
# Preview the first rows of the raw dataset.
data.head()

In [335]:
# Report the (rows, columns) shape of the raw dataset.
# The printed label is Vietnamese for "Original data".
print("Dữ liệu gốc:", data.shape)

In [336]:
# Summary statistics of the dataset columns.
data.describe()

In [337]:
# Pairwise scatter plots of the weather/fire-index measurements and the burned area.
sns.set()
sns.pairplot(
    data.loc[:, ['FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain', 'area']]
)

In [338]:
# Histogram of the burned area, restricted to the 0-50 range so the bulk of
# the distribution is visible.
plt.hist(data['area'], bins=50, range=[0, 50])
# hist() is called without density=True, so the bars are raw counts.
plt.ylabel('Count')
plt.xlabel('Burn area (ha)')
plt.title('Histogram of burn area (restricted domain)')

In [339]:
# Regression target: log10(area + 1); the +1 keeps zero-area rows finite.
data['Log-area'] = np.log10(data['area'].add(1))


In [340]:
# Histogram of the log10-transformed burned area.
plt.hist(data['Log-area'], bins=30, range=[0, 3])
# hist() is called without density=True, so the bars are raw counts.
plt.ylabel('Count')
plt.xlabel('Log(area+1) (ha)')
plt.title('Histogram of log burn area')

In [341]:
def month_season(df):
    """Map a row's three-letter month code to a season label.

    Parameters
    ----------
    df : mapping-like (e.g. a DataFrame row)
        Must support ``df['month']``.

    Returns
    -------
    str or None
        'winter' for jan-mar, 'spring' for apr-jun, 'summer' for jul-sep,
        'fall' for oct-dec; ``None`` when the month matches none of these.
    """
    season_months = (
        ('winter', ('jan', 'feb', 'mar')),
        ('spring', ('apr', 'may', 'jun')),
        ('summer', ('jul', 'aug', 'sep')),
        ('fall', ('oct', 'nov', 'dec')),
    )
    month = df['month']
    for season, months in season_months:
        if month in months:
            return season
    return None
# Derive a season label for every row from its month.
data['season'] = data.apply(month_season, axis=1)

In [342]:
# Preview the first five rows, which now include the derived `Log-area` and `season` columns.
data.head(5)

In [370]:
# Integer-encode the categorical `season` and `day` columns.
# The same encoder object is reused, so after this cell `enc` holds the
# classes of `day` only.
enc = LabelEncoder()
data['season_enc'] = enc.fit_transform(data['season'])
data['day_enc'] = enc.fit_transform(data['day'])
data.head(25)

In [344]:
# Target is the log-transformed area; features are everything except the
# raw/log area and the un-encoded categorical columns.
y_data = data['Log-area']
X_data = data.drop(columns=['area', 'Log-area', 'month', 'day', 'season'])

In [345]:
test_size = 0.4 # training set 60%; test set 40%
# Fix the shuffle seed so the split (and every metric computed from it) is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=test_size, random_state=42
)

In [346]:
# Keep a reference to the original 1-D training target (not a copy; the name
# `y_train` is rebound below, so the Series itself is left untouched).
y_train_orig = y_train
# Rebuild the target as a single-column DataFrame named 'Log-area'.
b = y_train_orig.values.reshape(-1, 1)
y_train = pd.DataFrame(b, columns=['Log-area'])

In [347]:
# Keep a reference to the original 1-D test target (not a copy; the name
# `y_test` is rebound below, so the Series itself is left untouched).
y_test_orig = y_test
# Rebuild the target as a single-column DataFrame named 'Log-area'.
b2 = y_test_orig.values.reshape(-1, 1)
y_test = pd.DataFrame(b2, columns=['Log-area'])

In [348]:
def rec(m, n, tol):
    """Return the percentage of points whose back-transformed error is within `tol`.

    Both inputs hold log10-scale values. They are mapped back with ``10**x``
    and compared element-wise; a point counts as correct when
    ``|10**m[i] - 10**n[i]| <= tol``.

    Parameters
    ----------
    m, n : array-like
        Values on the log10 scale (lists, arrays, Series or single-column
        DataFrames). Both are flattened to 1-D, so a column-shaped input
        lines up element-wise with a flat one.
    tol : float
        Absolute tolerance on the back-transformed scale.

    Returns
    -------
    float
        Percentage (0-100) of points within tolerance; 0.0 for empty input.

    Raises
    ------
    ValueError
        If `m` and `n` do not contain the same number of elements.
    """
    # np.asarray accepts any array-like; the old `type(x) != 'numpy.ndarray'`
    # check compared a type to a string and was therefore always true.
    m = np.asarray(m, dtype=float).ravel()
    n = np.asarray(n, dtype=float).ravel()
    if m.size != n.size:
        raise ValueError(
            "m and n must have the same number of elements, got %d and %d"
            % (m.size, n.size)
        )
    l = m.size
    if l == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    within_tol = np.abs(10.0 ** m - 10.0 ** n) <= tol
    return 100 * (float(np.count_nonzero(within_tol)) / l)
# NumPy array built from the untouched 1-D test targets.
y_test_array = np.array(y_test_orig.values) # make np.array
# max tolerance limit for REC curve x-axis; abs value of error in the prediction
tol_max = 20

In [349]:
# One scatter plot per numeric column against the log-transformed target.
# NOTE(review): `[:-2]` drops whichever two numeric columns come last, which
# depends on the columns added before this cell runs - confirm the intended set.
for column in data.describe().columns[:-2]:
    data.plot.scatter(x=column, y='Log-area', grid=True)

In [350]:
# Visualize the pairwise correlation between the attributes.
plt.figure(figsize=(14,12))
# Restrict to numeric columns first: `data` still holds string columns
# (month, day, season) and DataFrame.corr() raises on those in pandas >= 2.0.
sns.heatmap(data.select_dtypes(include='number').corr(), linewidths=.1, cmap="YlGnBu",
            annot=True, annot_kws={"size": 8})
plt.yticks(rotation=0);
plt.show()

In [351]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [352]:
 scaler = StandardScaler()
    
    # Parameter grid for the Grid Search
param_grid = {'C': [0.01,0.1,1, 10], 'epsilon': [10,1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']}

In [353]:
# 5-fold grid search over the SVR hyper-parameters in `param_grid`.
grid_SVR = GridSearchCV(SVR(), param_grid, refit=True, verbose=0, cv=5)

# Standardize features and target with SEPARATE scalers. Re-fitting the one
# `scaler` on y (as before) overwrote the feature statistics, so the feature
# scaling could no longer be reproduced for new data.
y_scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# ravel(): the estimator expects a 1-D target, not an (n, 1) column.
y_train_scaled = y_scaler.fit_transform(np.asarray(y_train).reshape(-1, 1)).ravel()

grid_SVR.fit(X_train_scaled, y_train_scaled)

In [354]:
# Show the hyper-parameter combination selected by the SVR grid search.
print("Best parameters obtained by GridSearch:", grid_SVR.best_params_)

In [355]:
# grid_SVR was fitted on standardized features and a standardized target, so
# the test features must be standardized the same way and the predictions
# mapped back to the log10(area+1) scale before comparing with y_test_orig.
# Scalers are re-fitted on the training data here so this cell does not rely
# on the state left in any shared scaler object.
svr_x_scaler = StandardScaler().fit(X_train)
svr_y_scaler = StandardScaler().fit(np.asarray(y_train).reshape(-1, 1))

a_svr_scaled = grid_SVR.predict(svr_x_scaler.transform(X_test))
# inverse_transform works on 2-D input; flatten back to a 1-D np.ndarray.
a_svr = svr_y_scaler.inverse_transform(a_svr_scaled.reshape(-1, 1)).ravel()

print("RMSE for Support Vector Regression:", np.sqrt(np.mean((y_test_orig-a_svr)**2)))

In [356]:
# SVR prediction error versus actual value, both mapped back from the log10 scale.
svr_actual = 10 ** y_test_orig
svr_error = 10 ** a_svr - svr_actual
plt.grid(True)
plt.ylabel("Error")
plt.xlabel("Actual area burned ($ha$)")
plt.scatter(svr_actual, svr_error)

In [357]:
# REC curve for the SVR: for each tolerance 0..tol_max-1, the percentage of
# test predictions whose absolute back-transformed error is within it.
rec_SVR = [rec(a_svr, y_test, tol) for tol in range(tol_max)]

plt.figure(figsize=(10,5))
plt.title("REC curve for the Support Vector Regressor\n",fontsize=15)
# The error is measured on the burned-area scale (hectares), not in currency.
plt.xlabel("Absolute error (tolerance) in prediction ($ha$)")
plt.ylabel("Percentage of correct prediction")
# Ticks every 5 units across the plotted tolerance range (previously 0..100
# in steps of 5, far beyond the 0..tol_max-1 data).
plt.xticks(list(range(0, tol_max + 1, 5)))
plt.ylim(-10,100)
plt.yticks([i*20 for i in range(6)])
plt.grid(True)
plt.plot(range(tol_max),rec_SVR)

In [358]:
# Hyper-parameter grid for the random forest (rebinds `param_grid`, replacing
# the SVR grid defined earlier).
param_grid = {'max_depth': [5,10,15,20,50], 'max_leaf_nodes': [2,5,10], 'min_samples_leaf': [2,5,10],
             'min_samples_split':[2,5,10]}
# Seed the forest so the search and the refitted model are reproducible.
grid_RF = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, refit=True, verbose=0, cv=5)
# np.ravel: pass a 1-D target; an (n, 1) column triggers a DataConversionWarning.
grid_RF.fit(X_train, np.ravel(y_train))

In [359]:
# Show the hyper-parameter combination selected by the random-forest grid search.
print("Best parameters obtained by Grid Search:", grid_RF.best_params_)

In [360]:
# Predict the log-scale target on the test set and report the RMSE on that scale.
a_rf = np.array(grid_RF.predict(X_test)) # np.ndarray type
rf_squared_error = (y_test_orig - a_rf) ** 2
rmse_rf = np.sqrt(np.mean(rf_squared_error))
print("RMSE for Random Forest:", rmse_rf)

In [361]:
# Random-forest prediction error versus actual value, both mapped back from the log10 scale.
rf_actual = 10 ** y_test_orig
rf_error = 10 ** a_rf - rf_actual
plt.grid(True)
plt.ylabel("Error")
plt.xlabel("Actual area burned")
plt.scatter(rf_actual, rf_error)


In [362]:
# Histogram of SVR prediction errors after undoing the log10 transform.
svr_error_ha = 10 ** a_svr.ravel() - 10 ** y_test_orig
plt.figure(figsize=(10, 5))
plt.title("Prediction Errors for All ML Methods\n", fontsize=18)
plt.xlabel("Prediction error ($ha$)", fontsize=14)
plt.grid(True)
plt.hist(svr_error_ha, bins=50, range=[-250, 50], alpha=1)
plt.legend(['SVR'], fontsize=14)

In [363]:
# Histogram of random-forest prediction errors after undoing the log10 transform.
rf_error_ha = 10 ** a_rf.ravel() - 10 ** y_test_orig
plt.figure(figsize=(10, 5))
plt.title("Prediction Errors for All ML Methods\n", fontsize=18)
plt.xlabel("Prediction error ($ha$)", fontsize=14)
plt.grid(True)
plt.hist(rf_error_ha, bins=50, range=[-250, 50], alpha=0.6)
plt.legend(['Random Forest'], fontsize=14)

In [364]:
# Overlay the SVR and random-forest error histograms on a shared axis.
plt.figure(figsize=(10, 5))
plt.title("Prediction Errors for All ML Methods\n", fontsize=18)
plt.xlabel("Prediction error ($ha$)", fontsize=14)
plt.grid(True)
# Draw order (SVR first, then the semi-transparent forest) matches the legend order.
for predictions, opacity in ((a_svr, 1), (a_rf, 0.6)):
    plt.hist(10 ** predictions.ravel() - 10 ** y_test_orig,
             bins=50, range=[-250, 50], alpha=opacity)
plt.legend(['SVR', 'Random Forest'], fontsize=14)


In [365]:
# REC curve for the random forest: for each tolerance 0..tol_max-1, the
# percentage of test predictions whose absolute back-transformed error is within it.
rec_RF = [rec(a_rf, y_test, tol) for tol in range(tol_max)]

plt.figure(figsize=(10,5))
plt.title("REC curve for the Random Forest\n",fontsize=15)
# The error is measured on the burned-area scale (hectares), not in currency.
plt.xlabel("Absolute error (tolerance) in prediction ($ha$)")
plt.ylabel("Percentage of correct prediction")
plt.xticks(list(range(0, tol_max + 1, 5)))
plt.ylim(-10,100)
plt.yticks([i*20 for i in range(6)])
plt.grid(True)
plt.plot(range(tol_max),rec_RF)

In [366]:
# Recompute the SVR REC values against the plain NumPy test targets.
rec_SVR = [rec(a_svr, y_test_array, tolerance) for tolerance in range(tol_max)]

In [367]:
# Recompute the random-forest REC values against the plain NumPy test targets.
rec_RF = [rec(a_rf, y_test_array, tolerance) for tolerance in range(tol_max)]

In [368]:
# Compare the REC curves of both models on one figure.
tolerances = range(tol_max)

plt.figure(figsize=(10, 8))
plt.title("Regression Error Characteristic Curves for All ML Methods\n", fontsize=20)
plt.xlabel("Absolute prediction error (tolerance) ($ha$)", fontsize=14)
plt.ylabel("Percentage of predictions correct", fontsize=14)
plt.xticks(list(range(tol_max + 1)), fontsize=11)
plt.yticks(list(range(0, 101, 20)), fontsize=18)
plt.xlim(-1, tol_max)
plt.ylim(-5, 100)
plt.grid(True)
# Plot order must match the legend labels below.
for curve, line_style in ((rec_SVR, '--'), (rec_RF, 'o-')):
    plt.plot(tolerances, curve, line_style, lw=2)
plt.legend(['SVR', 'Random Forest'], fontsize=14)