In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [19]:
df = pd.read_csv('forestfires.csv', index_col=0)
df

Unnamed: 0_level_0,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
X,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00
7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00
7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00
8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00
8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...
4,3,aug,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44
2,4,aug,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29
7,4,aug,sun,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16
1,4,aug,sat,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.00


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 517 entries, 7 to 6
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Y       517 non-null    int64  
 1   month   517 non-null    object 
 2   day     517 non-null    object 
 3   FFMC    517 non-null    float64
 4   DMC     517 non-null    float64
 5   DC      517 non-null    float64
 6   ISI     517 non-null    float64
 7   temp    517 non-null    float64
 8   RH      517 non-null    int64  
 9   wind    517 non-null    float64
 10  rain    517 non-null    float64
 11  area    517 non-null    float64
dtypes: float64(8), int64(2), object(2)
memory usage: 52.5+ KB


## Preprocessing

In [21]:
for col_val in df.columns:
    print(f'Number of null value: {df[col_val].isnull().sum()}')
    print(f'Number of n/a value: {df[col_val].isna().sum()}')
    print(f'Range: {df[col_val].min()} - {df[col_val].max()}')
    print('*' * 20)

Number of null value: 0
Number of n/a value: 0
Range: 2 - 9
********************
Number of null value: 0
Number of n/a value: 0
Range: apr - sep
********************
Number of null value: 0
Number of n/a value: 0
Range: fri - wed
********************
Number of null value: 0
Number of n/a value: 0
Range: 18.7 - 96.2
********************
Number of null value: 0
Number of n/a value: 0
Range: 1.1 - 291.3
********************
Number of null value: 0
Number of n/a value: 0
Range: 7.9 - 860.6
********************
Number of null value: 0
Number of n/a value: 0
Range: 0.0 - 56.1
********************
Number of null value: 0
Number of n/a value: 0
Range: 2.2 - 33.3
********************
Number of null value: 0
Number of n/a value: 0
Range: 15 - 100
********************
Number of null value: 0
Number of n/a value: 0
Range: 0.4 - 9.4
********************
Number of null value: 0
Number of n/a value: 0
Range: 0.0 - 6.4
********************
Number of null value: 0
Number of n/a value: 0
Range: 0.0 - 10

In [22]:
df['month'].unique()

array(['mar', 'oct', 'aug', 'sep', 'apr', 'jun', 'jul', 'feb', 'jan',
       'dec', 'may', 'nov'], dtype=object)

In [23]:
df['day'].unique()

array(['fri', 'tue', 'sat', 'sun', 'mon', 'wed', 'thu'], dtype=object)

In [24]:
def convert_day(day: str) -> int:
    convert_dict = {
        'sun': 8,
        'mon': 2,
        'tue': 3,
        'wed': 4,
        'thu': 5,
        'fri': 6,
        'sat': 7,
    }
    return convert_dict[day]

def convert_month(month: str) -> int:
    convert_dict = {
        'mar': 3, 
        'oct': 10, 
        'aug': 8, 
        'sep': 9, 
        'apr': 4, 
        'jun': 6, 
        'jul': 7, 
        'feb': 2, 
        'jan': 1,
        'dec': 12, 
        'may': 5, 
        'nov': 11
    }
    return convert_dict[month]

df['day'] = df['day'].apply(convert_day)
df['month'] = df['month'].apply(convert_month)

In [25]:
df.head(20)

Unnamed: 0_level_0,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
X,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
7,5,3,6,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
7,4,10,3,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
7,4,10,7,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
8,6,3,6,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
8,6,3,8,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0
8,6,8,8,92.3,85.3,488.0,14.7,22.2,29,5.4,0.0,0.0
8,6,8,2,92.3,88.9,495.6,8.5,24.1,27,3.1,0.0,0.0
8,6,8,2,91.5,145.4,608.2,10.7,8.0,86,2.2,0.0,0.0
8,6,9,3,91.0,129.5,692.6,7.0,13.1,63,5.4,0.0,0.0
7,5,9,7,92.5,88.0,698.6,7.1,22.8,40,4.0,0.0,0.0


In [26]:
df.columns

Index(['Y', 'month', 'day', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind',
       'rain', 'area'],
      dtype='object')

In [27]:
#df.to_csv('Cleaned.csv')

## Model

In [28]:
class LinearRegression:
    def __init__(self):
        self.weight = None
    
    def fit(self, X, y):
        first = np.dot(X.T, X)
        invert_mat = np.linalg.inv(first)
        self.weight = (invert_mat @ X.T) @ y 

    def delta(self, y, y_hat):
        return y - y_hat
    
    def rmse(self, y, y_hat):
        return np.sqrt(np.mean(self.delta(y, y_hat) ** 2))
    
    def predict(self, X):
        X = np.array(X)
        return np.dot(X, self.weight)

In [29]:
X_train, X_test  = np.split(df, [int(0.8 * len(df))])

  return bound(*args, **kwds)


In [30]:
y_train = X_train['area']
y_test = X_test['area']
X_train = X_train.drop(columns='area')
X_test = X_test.drop(columns='area')

## Training

In [31]:
lr = LinearRegression()
lr.fit(X_train, y_train)
predictions = lr.predict(X_test)

In [32]:
for w in lr.weight:
    print(f"{w:.20f}")

0.58573417652677362000
2.76928502750960925383
1.79257493485416108392
-0.07003486698103321417
0.10725041493743839305
-0.02439744132552740946
-0.86583299689672321708
0.31045806710065998457
-0.24678376825041919984
0.05364875367112293542
-9.16741205663717906305


In [33]:
print(f"{lr.rmse(y_test, predictions):.20f}")

80.10009874865808399136
