In [67]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [68]:
df = pd.read_csv('forestfires.csv')
df

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,aug,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44
513,2,4,aug,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29
514,7,4,aug,sun,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16
515,1,4,aug,sat,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.00


In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       517 non-null    int64  
 1   Y       517 non-null    int64  
 2   month   517 non-null    object 
 3   day     517 non-null    object 
 4   FFMC    517 non-null    float64
 5   DMC     517 non-null    float64
 6   DC      517 non-null    float64
 7   ISI     517 non-null    float64
 8   temp    517 non-null    float64
 9   RH      517 non-null    int64  
 10  wind    517 non-null    float64
 11  rain    517 non-null    float64
 12  area    517 non-null    float64
dtypes: float64(8), int64(3), object(2)
memory usage: 52.6+ KB


## Preprocessing

In [70]:
for col_val in df.columns:
    print(f'Number of null value: {df[col_val].isnull().sum()}')
    print(f'Number of n/a value: {df[col_val].isna().sum()}')
    print(f'Range: {df[col_val].min()} - {df[col_val].max()}')
    print('*' * 20)

Number of null value: 0
Number of n/a value: 0
Range: 1 - 9
********************
Number of null value: 0
Number of n/a value: 0
Range: 2 - 9
********************
Number of null value: 0
Number of n/a value: 0
Range: apr - sep
********************
Number of null value: 0
Number of n/a value: 0
Range: fri - wed
********************
Number of null value: 0
Number of n/a value: 0
Range: 18.7 - 96.2
********************
Number of null value: 0
Number of n/a value: 0
Range: 1.1 - 291.3
********************
Number of null value: 0
Number of n/a value: 0
Range: 7.9 - 860.6
********************
Number of null value: 0
Number of n/a value: 0
Range: 0.0 - 56.1
********************
Number of null value: 0
Number of n/a value: 0
Range: 2.2 - 33.3
********************
Number of null value: 0
Number of n/a value: 0
Range: 15 - 100
********************
Number of null value: 0
Number of n/a value: 0
Range: 0.4 - 9.4
********************
Number of null value: 0
Number of n/a value: 0
Range: 0.0 - 6.4
**

In [71]:
df['month'].unique()

array(['mar', 'oct', 'aug', 'sep', 'apr', 'jun', 'jul', 'feb', 'jan',
       'dec', 'may', 'nov'], dtype=object)

In [72]:
df['day'].unique()

array(['fri', 'tue', 'sat', 'sun', 'mon', 'wed', 'thu'], dtype=object)

In [73]:
def convert_day(day: str) -> int:
    convert_dict = {
        'sun': 8,
        'mon': 2,
        'tue': 3,
        'wed': 4,
        'thu': 5,
        'fri': 6,
        'sat': 7,
    }
    return convert_dict[day]

def convert_month(month: str) -> int:
    convert_dict = {
        'mar': 3, 
        'oct': 10, 
        'aug': 8, 
        'sep': 9, 
        'apr': 4, 
        'jun': 6, 
        'jul': 7, 
        'feb': 2, 
        'jan': 1,
        'dec': 12, 
        'may': 5, 
        'nov': 11
    }
    return convert_dict[month]

df['day'] = df['day'].apply(convert_day)
df['month'] = df['month'].apply(convert_month)

In [74]:
df.head(20)

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,3,6,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,10,3,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,10,7,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,3,6,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,3,8,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0
5,8,6,8,8,92.3,85.3,488.0,14.7,22.2,29,5.4,0.0,0.0
6,8,6,8,2,92.3,88.9,495.6,8.5,24.1,27,3.1,0.0,0.0
7,8,6,8,2,91.5,145.4,608.2,10.7,8.0,86,2.2,0.0,0.0
8,8,6,9,3,91.0,129.5,692.6,7.0,13.1,63,5.4,0.0,0.0
9,7,5,9,7,92.5,88.0,698.6,7.1,22.8,40,4.0,0.0,0.0


In [75]:
df.columns

Index(['X', 'Y', 'month', 'day', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH',
       'wind', 'rain', 'area'],
      dtype='object')

In [76]:
#df.to_csv('Cleaned.csv')

## Model

In [77]:
class LinearRegression:
    def __init__(self):
        self.weight = None
    
    def fit(self, X, y):
        first = np.dot(X.T, X)
        invert_mat = np.linalg.inv(first)
        self.weight = (invert_mat @ X.T) @ y 

    def delta(self, y, y_hat):
        return y - y_hat
    
    def rmse(self, y, y_hat):
        return np.sqrt(np.mean(self.delta(y, y_hat) ** 2))
    
    def predict(self, X):
        return np.dot(self.weight.T, X.T)

In [78]:
X_train, X_test  = np.array_split(df, [int(0.8 * len(df))])

  return bound(*args, **kwds)


In [79]:
y_train = X_train['area']
y_test = X_test['area']
X_train = X_train.drop(columns='area')
X_test = X_test.drop(columns='area')

## Training

In [80]:
lr = LinearRegression()
lr.fit(X_train, y_train)
predictions = lr.predict(X_test)

In [81]:
for w in lr.weight:
    print(f"{w:.20f}")

1.10454888056186595335
-0.47486406771119554771
2.89281252175288816630
1.81228620524183559048
-0.07200598989418489915
0.11144469608228720070
-0.02582129412588733686
-0.90917718207031084088
0.29998702655284353868
-0.25530802928072454661
0.00124727396085018124
-11.65817039947527256061


In [82]:
print(f"{lr.rmse(y_test, predictions):.20f}")

79.97091407114629646458
