**Exploratory Data Analysis**

In [64]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.preprocessing import StandardScaler



In [65]:
# Load the two sample sets; both files are read from the current working directory.
positives = pd.read_csv("positives.csv")
negatives = pd.read_csv("negatives.csv")

In [66]:
# Stack the positive rows on top of the negative rows and renumber the
# index 0..n-1 so row labels from the two files cannot collide.
df = pd.concat(
    [positives, negatives],
    axis=0,
    ignore_index=True,
)

In [67]:
# Preview the first rows of the combined frame (positives come first).
df.head()

Unnamed: 0,time,aspect,burned_areas,curvature,d2m,dem,ignition_points,lai,lst_day,lst_night,...,lc_grassland,lc_settlement,lc_shrubland,lc_sparse_vegetation,lc_water_bodies,lc_wetland,population,burned_area_has,time_idx,sample
0,2006-07-29,182.7782,0.0,-1469.6869,289.78036,142.89676,0.0,0.5,321.16,295.52,...,0.0,0.0,0.0,0.0,0.0,0.0,0.20321,40.0,0,0
1,2006-07-30,182.7782,0.0,-1469.6869,290.09662,142.89676,0.0,0.5,317.6,293.63998,...,0.0,0.0,0.0,0.0,0.0,0.0,0.20321,40.0,1,0
2,2006-07-31,182.7782,0.0,-1469.6869,290.19055,142.89676,0.0,0.5,316.9,293.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.20321,40.0,2,0
3,2006-08-01,182.7782,0.0,-1469.6869,290.22266,142.89676,0.0,0.5,316.8,294.66,...,0.0,0.0,0.0,0.0,0.0,0.0,0.20321,40.0,3,0
4,2006-08-02,182.7782,0.0,-1469.6869,290.3668,142.89676,0.0,0.5,315.41998,295.91998,...,0.0,0.0,0.0,0.0,0.0,0.0,0.20321,40.0,4,0


**Handling Nulls**

In [68]:
# Share of missing values per column, in percent; keep only columns that
# actually contain nulls so the chart is not padded with empty bars.
null_percentage = df.isna().mean() * 100
null_percentage_non_zero = null_percentage[null_percentage > 0]

# Sort once (largest share first) and reuse the result for both axes.
null_percentage_sorted = null_percentage_non_zero.sort_values(ascending=False)

fig = px.bar(
    x=null_percentage_sorted.index,
    y=null_percentage_sorted.values,
    labels={'x': 'Columns', 'y': 'Percentage (%)'},
    title='Percentage of Null Values by Column',
    color_discrete_sequence=['salmon'],
)

# Tilt the column names so long labels stay readable.
fig.update_layout(
    xaxis_tickangle=-45,
    xaxis_title='Columns',
    yaxis_title='Percentage (%)',
    bargap=0.2,
    template='plotly_white',
)

fig.show()

In [69]:
# Show the per-column null percentages computed in the previous cell.
print(null_percentage_non_zero)

# Remove four of the columns that the summary reports as containing nulls,
# then discard every remaining row that still has a missing value.
# The result is reassigned instead of mutated in place, and errors="ignore"
# lets the cell be re-run after the columns are already gone (the in-place
# version raised KeyError on a second run).
df = (
    df
    .drop(columns=['lst_night', 'lst_day', 'smi', 'lai'], errors='ignore')
    .dropna()
)

lai            1.692648
lst_day       31.204404
lst_night     36.097263
ndvi           0.212353
smi           12.177162
population     0.007717
dtype: float64


In [70]:
# Preview the frame after the null handling above.
df.head()

Unnamed: 0,time,aspect,burned_areas,curvature,d2m,dem,ignition_points,ndvi,rh,roads_distance,...,lc_grassland,lc_settlement,lc_shrubland,lc_sparse_vegetation,lc_water_bodies,lc_wetland,population,burned_area_has,time_idx,sample
0,2006-07-29,182.7782,0.0,-1469.6869,289.78036,142.89676,0.0,0.2461,0.251317,1.773,...,0.0,0.0,0.0,0.0,0.0,0.0,0.20321,40.0,0,0
1,2006-07-30,182.7782,0.0,-1469.6869,290.09662,142.89676,0.0,0.2461,0.2515,1.773,...,0.0,0.0,0.0,0.0,0.0,0.0,0.20321,40.0,1,0
2,2006-07-31,182.7782,0.0,-1469.6869,290.19055,142.89676,0.0,0.2461,0.325032,1.773,...,0.0,0.0,0.0,0.0,0.0,0.0,0.20321,40.0,2,0
3,2006-08-01,182.7782,0.0,-1469.6869,290.22266,142.89676,0.0,0.2461,0.282802,1.773,...,0.0,0.0,0.0,0.0,0.0,0.0,0.20321,40.0,3,0
4,2006-08-02,182.7782,0.0,-1469.6869,290.3668,142.89676,0.0,0.2461,0.347537,1.773,...,0.0,0.0,0.0,0.0,0.0,0.0,0.20321,40.0,4,0


**Converting dates and deriving calendar features**

In [71]:
# Parse the raw date strings so the .dt accessor is available.
# NOTE: 'time' is dropped at the end of this cell, so running the cell a
# second time without re-running the cells above raises a KeyError.
df['time'] = pd.to_datetime(df['time'])

# Plain calendar components.
df['year'] = df['time'].dt.year
df['month'] = df['time'].dt.month
df['day'] = df['time'].dt.day
df['day_of_year'] = df['time'].dt.dayofyear
df['week_of_year'] = df['time'].dt.isocalendar().week  # ISO-8601 week number
df['quarter'] = df['time'].dt.quarter

# Cyclical encodings: place each periodic value on the unit circle so the end
# of a cycle sits next to its start (December next to January).
df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)

# Use the real length of each row's year as the period. With a fixed 365,
# day 366 of a leap year wraps around and gets the same encoding as day 1.
days_in_year = np.where(df['time'].dt.is_leap_year, 366, 365)
df['day_of_year_sin'] = np.sin(2 * np.pi * df['day_of_year'] / days_in_year)
df['day_of_year_cos'] = np.cos(2 * np.pi * df['day_of_year'] / days_in_year)

# The raw timestamp is no longer needed once the features exist.
df.drop(columns=['time'], inplace=True)
df.head()

Unnamed: 0,aspect,burned_areas,curvature,d2m,dem,ignition_points,ndvi,rh,roads_distance,slope,...,year,month,day,day_of_year,week_of_year,quarter,month_sin,month_cos,day_of_year_sin,day_of_year_cos
0,182.7782,0.0,-1469.6869,289.78036,142.89676,0.0,0.2461,0.251317,1.773,1.477166,...,2006,7,29,210,30,3,-0.5,-0.866025,-0.455907,-0.890028
1,182.7782,0.0,-1469.6869,290.09662,142.89676,0.0,0.2461,0.2515,1.773,1.477166,...,2006,7,30,211,30,3,-0.5,-0.866025,-0.47116,-0.882048
2,182.7782,0.0,-1469.6869,290.19055,142.89676,0.0,0.2461,0.325032,1.773,1.477166,...,2006,7,31,212,31,3,-0.5,-0.866025,-0.486273,-0.873807
3,182.7782,0.0,-1469.6869,290.22266,142.89676,0.0,0.2461,0.282802,1.773,1.477166,...,2006,8,1,213,31,3,-0.866025,-0.5,-0.501242,-0.865307
4,182.7782,0.0,-1469.6869,290.3668,142.89676,0.0,0.2461,0.347537,1.773,1.477166,...,2006,8,2,214,31,3,-0.866025,-0.5,-0.516062,-0.856551


In [72]:
# Number of columns after adding the date features.
df.shape[1]

40

In [73]:

# Binary label: 1 when the row reports a positive burned area, 0 otherwise.
# NOTE(review): 'burned_area_has' stays in the frame and fully determines
# 'target' -- confirm it is excluded from the features before modelling.
df['target'] = df['burned_area_has'].gt(0).astype(int)

# Rows per class.
target_counts = df['target'].value_counts()


In [74]:
# Rows per target class, plotted as a bar chart to show the class balance.
class_counts = df['target'].value_counts()

fig = px.bar(
    x=class_counts.index,
    y=class_counts.values,
    title='Class Imbalance in Target Variable',
    labels={'x': 'Class', 'y': 'Count'},
    opacity=0.8,
    color_discrete_sequence=['darkblue'],
)

fig.update_layout(
    template='plotly_white',
    xaxis_title='Class',
    yaxis_title='Count',
)

fig.show()

In [75]:
# Preview the frame with the new 'target' column appended.
df.head()

Unnamed: 0,aspect,burned_areas,curvature,d2m,dem,ignition_points,ndvi,rh,roads_distance,slope,...,month,day,day_of_year,week_of_year,quarter,month_sin,month_cos,day_of_year_sin,day_of_year_cos,target
0,182.7782,0.0,-1469.6869,289.78036,142.89676,0.0,0.2461,0.251317,1.773,1.477166,...,7,29,210,30,3,-0.5,-0.866025,-0.455907,-0.890028,1
1,182.7782,0.0,-1469.6869,290.09662,142.89676,0.0,0.2461,0.2515,1.773,1.477166,...,7,30,211,30,3,-0.5,-0.866025,-0.47116,-0.882048,1
2,182.7782,0.0,-1469.6869,290.19055,142.89676,0.0,0.2461,0.325032,1.773,1.477166,...,7,31,212,31,3,-0.5,-0.866025,-0.486273,-0.873807,1
3,182.7782,0.0,-1469.6869,290.22266,142.89676,0.0,0.2461,0.282802,1.773,1.477166,...,8,1,213,31,3,-0.866025,-0.5,-0.501242,-0.865307,1
4,182.7782,0.0,-1469.6869,290.3668,142.89676,0.0,0.2461,0.347537,1.773,1.477166,...,8,2,214,31,3,-0.866025,-0.5,-0.516062,-0.856551,1


**Data Scaling**

In [76]:
# Summary statistics of the numeric columns before any scaling is applied.
df.describe()

Unnamed: 0,aspect,burned_areas,curvature,d2m,dem,ignition_points,ndvi,rh,roads_distance,slope,...,month,day,day_of_year,week_of_year,quarter,month_sin,month_cos,day_of_year_sin,day_of_year_cos,target
count,775769.0,775769.0,775769.0,775769.0,775769.0,775769.0,775769.0,775769.0,775769.0,775769.0,...,775769.0,775769.0,775769.0,775769.0,775769.0,775769.0,775769.0,775769.0,775769.0,775769.0
mean,185.242203,0.00762,874.9417,285.561284,655.10127,1.610313,0.471344,0.390839,3.322183,1.516245,...,6.4086,15.721716,179.411824,26.002713,2.477832,-0.141672,-0.4070424,-0.028451,-0.431026,0.331135
std,105.406786,0.086986,15331.988854,5.626499,423.083277,144.196002,0.172371,0.166458,3.387954,0.121316,...,2.415129,8.79007,73.434114,10.497464,0.853675,0.709255,0.5578566,0.687838,0.583341,0.470622
min,0.016977,0.0,-76752.445,252.07605,-227.1745,0.0,-0.122,0.027781,0.0,0.01127,...,1.0,1.0,1.0,1.0,1.0,-1.0,-1.0,-0.999991,-0.999963,0.0
25%,99.52819,0.0,-6102.6274,281.97177,315.96747,0.0,0.3407,0.263795,1.142,1.523536,...,5.0,8.0,127.0,18.0,2.0,-0.866025,-0.8660254,-0.655156,-0.925925,0.0
50%,185.92528,0.0,-148.27194,286.3177,602.892,0.0,0.4716,0.373498,2.428,1.549848,...,7.0,16.0,190.0,28.0,3.0,-0.5,-0.5,-0.111659,-0.658402,0.0
75%,276.2374,0.0,6861.145,289.79315,923.6741,0.0,0.5991,0.496941,4.491,1.559715,...,8.0,23.0,228.0,33.0,3.0,0.5,-1.83697e-16,0.680773,-0.03012,1.0
max,359.9882,2.0,89969.23,299.61014,3015.61,107602.0,0.9331,0.993205,86.764,1.568866,...,12.0,31.0,366.0,53.0,4.0,1.0,1.0,0.999991,1.0,1.0


In [77]:
# Snapshot of the frame before scaling. Despite the file name, the rows have
# already been through null handling, date feature creation and labelling.
df.to_csv("unprocessed.csv", index=False)

: 

In [59]:
# Columns left untouched by the scaler: the label, the calendar features and
# their cyclical encodings, plus 'x' and 'y'.
columns_to_skip = [
    "target",
    "year",
    "month",
    "day",
    "day_of_year",
    "week_of_year",
    "quarter",
    "month_sin",
    "month_cos",
    "day_of_year_sin",
    "day_of_year_cos",
    'x',
    'y',
]

# Everything else gets standardised, in the frame's own column order.
# NOTE(review): this includes 'burned_area_has' (the source of 'target') and
# the 'time_idx' / 'sample' columns -- confirm that scaling them is intended.
columns_to_scale = [name for name in df.columns if name not in columns_to_skip]

In [60]:

# Standardise the selected columns to zero mean and unit variance.
# NOTE(review): the statistics are fitted on the whole frame; if a train/test
# split happens later, fit on the training part only -- confirm downstream.
scaler = StandardScaler()
scaler.fit(df[columns_to_scale])
df[columns_to_scale] = scaler.transform(df[columns_to_scale])


In [61]:
# Preview the frame after scaling; the skipped calendar columns keep their raw values.
df.head()

Unnamed: 0,aspect,burned_areas,curvature,d2m,dem,ignition_points,ndvi,rh,roads_distance,slope,...,month,day,day_of_year,week_of_year,quarter,month_sin,month_cos,day_of_year_sin,day_of_year_cos,target
0,-0.023376,-0.087595,-0.152924,0.749859,-1.210648,-0.011168,-1.306742,-0.838179,-0.457263,-0.322125,...,7,29,210,30,3,-0.5,-0.866025,-0.455907,-0.890028,1
1,-0.023376,-0.087595,-0.152924,0.806068,-1.210648,-0.011168,-1.306742,-0.837083,-0.457263,-0.322125,...,7,30,211,30,3,-0.5,-0.866025,-0.47116,-0.882048,1
2,-0.023376,-0.087595,-0.152924,0.822762,-1.210648,-0.011168,-1.306742,-0.395334,-0.457263,-0.322125,...,7,31,212,31,3,-0.5,-0.866025,-0.486273,-0.873807,1
3,-0.023376,-0.087595,-0.152924,0.828469,-1.210648,-0.011168,-1.306742,-0.649035,-0.457263,-0.322125,...,8,1,213,31,3,-0.866025,-0.5,-0.501242,-0.865307,1
4,-0.023376,-0.087595,-0.152924,0.854087,-1.210648,-0.011168,-1.306742,-0.260139,-0.457263,-0.322125,...,8,2,214,31,3,-0.866025,-0.5,-0.516062,-0.856551,1


In [63]:
# Persist the scaled frame; the row index is not written to the file.
df.to_csv("preprocessed.csv", index=False)