# Dataset

In [16]:
import pandas as pd
import numpy as np

In [17]:
# Load the fire records, then pull out the numeric location features and the
# burned-area target as separate objects.
df = pd.read_csv('1950-2021_fires_with_elevation.csv')
x = df.loc[:, ['lat', 'lon', 'elevation']]
y = df.loc[:, 'hectares']

In [18]:
# Preview the feature frame (lat, lon, elevation columns of df).
x

Unnamed: 0,lat,lon,elevation
0,59.963,-128.172,690.0
1,59.318,-132.172,834.0
2,59.876,-131.922,852.0
3,59.760,-132.808,957.0
4,59.434,-126.172,429.0
...,...,...,...
15025,58.818,-122.812,403.0
15026,58.289,-121.729,392.0
15027,58.818,-122.729,443.0
15028,58.760,-122.729,301.0


In [19]:
# Preview the target series (the 'hectares' column of df).
y

0            8.0
1            8.0
2        12949.9
3          241.1
4            1.2
          ...   
15025        0.1
15026        0.1
15027        0.1
15028        1.2
15029        0.1
Name: hectares, Length: 15030, dtype: float64

# Correlation between data

In [20]:
# Print the correlation (pandas default: Pearson) between the burned area and
# each numeric feature, in the order lat, lon, elevation.
for feature in ('lat', 'lon', 'elevation'):
    print(df['hectares'].corr(df[feature]))

0.11243475800501282
-0.04525868154006124
0.006771238508371042


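The correlations between hectares and lat/lon/elevation are all very weak (|r| < 0.12), so none of these features has a strong linear relationship with fire size on its own.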

# One-Hot Encoding

Biome is a categorical feature, so we one-hot encode it: each biome category becomes its own 0/1 indicator column.

In [21]:
from sklearn import preprocessing

In [22]:
# Fit the encoder on the biome column. The double brackets keep the selection
# two-dimensional (a one-column DataFrame rather than a Series).
enc = preprocessing.OneHotEncoder().fit(df[['biome']])
# Show the category labels the encoder learned.
enc.categories_

[array(['Boreal Cordillera', 'Boreal PLain', 'Montane Cordillera',
        'Pacific Maritime', 'Prairie', 'Taiga Plain'], dtype=object)]

In [23]:
# Encode every row's biome, then convert the encoder's output to a dense
# array with one column per learned category.
encoded_biomes = enc.transform(df[['biome']])
one_hot = encoded_biomes.toarray()
one_hot

array([[1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.]])

In [24]:
# Take the indicator column names straight from the fitted encoder instead of
# hard-coding them: the columns of `one_hot` follow the order of
# enc.categories_[0], so this keeps names and data aligned even if the set of
# biomes in the CSV changes.
biomes = list(enc.categories_[0])
# Attach one 0/1 indicator column per biome to the main frame.
df[biomes] = one_hot
df

Unnamed: 0,agency,lat,lon,date,hectares,cause,response_type,biome,elevation,Boreal Cordillera,Boreal PLain,Montane Cordillera,Pacific Maritime,Prairie,Taiga Plain
0,BC,59.963,-128.172,1953-05-26,8.0,H,,Boreal Cordillera,690.0,1.0,0.0,0.0,0.0,0.0,0.0
1,BC,59.318,-132.172,1950-06-22,8.0,L,,Boreal Cordillera,834.0,1.0,0.0,0.0,0.0,0.0,0.0
2,BC,59.876,-131.922,1950-06-04,12949.9,H,,Boreal Cordillera,852.0,1.0,0.0,0.0,0.0,0.0,0.0
3,BC,59.760,-132.808,1951-07-15,241.1,H,,Boreal Cordillera,957.0,1.0,0.0,0.0,0.0,0.0,0.0
4,BC,59.434,-126.172,1952-06-12,1.2,H,,Boreal Cordillera,429.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15025,BC,58.818,-122.812,1961-08-01,0.1,H,,Taiga Plain,403.0,0.0,0.0,0.0,0.0,0.0,1.0
15026,BC,58.289,-121.729,1961-08-11,0.1,H,,Taiga Plain,392.0,0.0,0.0,0.0,0.0,0.0,1.0
15027,BC,58.818,-122.729,1961-08-11,0.1,H,,Taiga Plain,443.0,0.0,0.0,0.0,0.0,0.0,1.0
15028,BC,58.760,-122.729,1961-08-13,1.2,H,,Taiga Plain,301.0,0.0,0.0,0.0,0.0,0.0,1.0


# Train Test Split

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# Features: the three numeric location columns followed by the biome indicator
# columns; target: burned area in the 'hectares' column.
x = df[['lat', 'lon', 'elevation', *biomes]]
y = df['hectares']

# Hold out 20% of the rows for testing; the fixed random_state makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# Model

In [27]:
from sklearn.linear_model import LinearRegression

In [28]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [33]:
# Fit an ordinary least-squares regression on the training split.
# NOTE(review): x contains an indicator column for every biome category and the
# model also fits an intercept, so the indicator columns are collinear with the
# intercept; the individual biome coefficients are therefore not uniquely
# determined. Consider dropping one category if the coefficients are to be
# interpreted.
model = LinearRegression()
model.fit(x_train, y_train)

predictions_test = model.predict(x_test)

# Evaluate on the held-out split. scikit-learn metrics take (y_true, y_pred) in
# that order; MAE happens to be symmetric, but keeping the documented order
# avoids silent errors if an asymmetric metric is substituted later.
# RMSE alternative: np.sqrt(mean_squared_error(y_test, predictions_test))
loss = mean_absolute_error(y_test, predictions_test)
loss

np.float64(188.05793415008466)

In [34]:
# Fitted coefficients, in the column order of x_train: lat, lon, elevation,
# followed by the biome indicator columns in the order of the `biomes` list.
model.coef_

array([ 5.86886345e+01,  1.99142314e+01,  4.84483532e-02,  2.63893232e+03,
        2.20069435e+02, -1.40309510e+03, -1.23328214e+03, -1.59601249e+03,
        1.37338798e+03])

In [35]:
# Fitted intercept term of the linear model.
model.intercept_

np.float64(800.9756435573934)