In [None]:
import pandas as pd

# Load datasets
wildfire_data = pd.read_csv('/content/Historical_Wildfires.csv')
weather_data = pd.read_csv('/content/HistoricalWeather.csv')

# Inspect the datasets.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" line.
wildfire_data.info()
weather_data.info()

# Preview the datasets
print(wildfire_data.head())
print(weather_data.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26663 entries, 0 to 26662
Data columns (total 10 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Region                               26663 non-null  object 
 1   Date                                 26663 non-null  object 
 2   Estimated_fire_area                  26663 non-null  float64
 3   Mean_estimated_fire_brightness       26663 non-null  float64
 4   Mean_estimated_fire_radiative_power  26663 non-null  float64
 5   Mean_confidence                      26663 non-null  float64
 6   Std_confidence                       24429 non-null  float64
 7   Var_confidence                       24429 non-null  float64
 8   Count                                26663 non-null  int64  
 9   Replaced                             26663 non-null  object 
dtypes: float64(6), int64(1), object(3)
memory usage: 2.0+ MB
None
<class 'pandas.core.frame.DataFr

In [None]:
# Define a function to parse dates with multiple formats
def parse_dates(date_series, formats=('%d-%m-%Y', '%d/%m/%Y', '%Y-%m-%d', '%m/%d/%Y')):
    """Parse a column of date strings that may use one of several layouts.

    Parameters
    ----------
    date_series : pandas.Series (or anything ``pd.to_datetime`` accepts)
        The raw date values.
    formats : sequence of str, optional
        ``strftime`` patterns to try, in priority order.

    Returns
    -------
    The parsed datetimes. Values matching none of ``formats`` become ``NaT``.

    Notes
    -----
    The whole column is first tried against each format in turn, so a column
    written in a single layout is parsed consistently with that layout. Only
    when no single format fits every value are values parsed individually,
    each taking the first format it matches. This avoids relying on pandas'
    format inference, which can read day-first strings as month-first or
    coerce them to ``NaT``.
    """
    # Fast path: every value shares one layout.
    for fmt in formats:
        try:
            return pd.to_datetime(date_series, format=fmt)
        except (ValueError, TypeError):
            continue

    # Mixed layouts: fill the still-missing entries with each format in turn.
    values = pd.Series(date_series)
    parsed = None
    for fmt in formats:
        attempt = pd.to_datetime(values, format=fmt, errors='coerce')
        parsed = attempt if parsed is None else parsed.fillna(attempt)

    if parsed is None:
        # No formats supplied: fall back to pandas' own inference.
        return pd.to_datetime(date_series, errors='coerce')
    return parsed

# Normalise the 'Date' column of both frames in place: parse it, discard rows
# whose date could not be parsed (NaT), then render each date as dd/mm/yyyy text.
for frame in (wildfire_data, weather_data):
    frame['Date'] = parse_dates(frame['Date'])
    frame.dropna(subset=['Date'], inplace=True)
    frame['Date'] = frame['Date'].dt.strftime('%d/%m/%Y')

# Persist the cleaned frames as CSV (row index omitted)
for frame, csv_name in (
    (wildfire_data, 'Cleaned_Historical_Wildfires.csv'),
    (weather_data, 'Cleaned_HistoricalWeather.csv'),
):
    frame.to_csv(csv_name, index=False)

# Show the first rows of each frame to verify the changes
wildfire_data.head(), weather_data.head()

(  Region        Date  Estimated_fire_area  Mean_estimated_fire_brightness  \
 0    NSW  04/01/2005              8.68000                      312.266667   
 1    NSW  05/01/2005             16.61125                      322.475000   
 2    NSW  06/01/2005              5.52000                      325.266667   
 3    NSW  07/01/2005              6.26400                      313.870000   
 4    NSW  08/01/2005              5.40000                      337.383333   
 
    Mean_estimated_fire_radiative_power  Mean_confidence  Std_confidence  \
 0                            42.400000        78.666667        2.886751   
 1                            62.362500        85.500000        8.088793   
 2                            38.400000        78.333333        3.214550   
 3                            33.800000        92.200000        7.529940   
 4                           122.533333        91.000000        7.937254   
 
    Var_confidence  Count Replaced  
 0        8.333333      3        R 

In [None]:
# Inner-join the fire records with the weather records that share the same
# 'Date' and 'Region'; rows without a partner on the other side are dropped.
merged_df = wildfire_data.merge(weather_data, how='inner', on=['Date', 'Region'])

# Show the first rows of the merged frame to verify the join
merged_df.head()

Unnamed: 0,Region,Date,Estimated_fire_area,Mean_estimated_fire_brightness,Mean_estimated_fire_radiative_power,Mean_confidence,Std_confidence,Var_confidence,Count,Replaced,Parameter,count()[unit: km^2],min(),max(),mean(),variance()
0,NSW,04/01/2005,8.68,312.266667,42.4,78.666667,2.886751,8.333333,3,R,Precipitation,800234.348986,0.0,22.842566,2.80862,17.383363
1,NSW,04/01/2005,8.68,312.266667,42.4,78.666667,2.886751,8.333333,3,R,RelativeHumidity,800234.348986,31.27993,90.332771,57.095628,267.158378
2,NSW,04/01/2005,8.68,312.266667,42.4,78.666667,2.886751,8.333333,3,R,SoilWaterContent,800234.348986,0.022578,0.444927,0.214293,0.009965
3,NSW,04/01/2005,8.68,312.266667,42.4,78.666667,2.886751,8.333333,3,R,SolarRadiation,800234.348986,7.576938,33.214062,22.617291,40.25546
4,NSW,04/01/2005,8.68,312.266667,42.4,78.666667,2.886751,8.333333,3,R,Temperature,800234.348986,12.495799,28.945488,23.055527,9.455474


In [None]:
# Reshape the merged frame from one row per weather Parameter to one row per
# (Date, Region, Estimated_fire_area), with each statistic/Parameter pair in
# its own column.
weather_pivoted = pd.pivot_table(
    merged_df,
    values=["min()", "max()", "mean()", "variance()"],  # statistics to spread out
    index=["Date", "Region", "Estimated_fire_area"],    # row keys
    columns="Parameter",                                # becomes the second column level
).reset_index()

# Collapse the two-level column labels into single strings. The key columns
# have an empty second level after reset_index(), so they come out with a
# trailing underscore: 'Date_', 'Region_', 'Estimated_fire_area_'.
flat_names = []
for stat, param in weather_pivoted.columns:
    flat_names.append(f"{stat}_{param}" if stat else param)
weather_pivoted.columns = flat_names


In [None]:
# Preview the first rows of the pivoted table
weather_pivoted.head()

Unnamed: 0,Date_,Region_,Estimated_fire_area_,max()_Precipitation,max()_RelativeHumidity,max()_SoilWaterContent,max()_SolarRadiation,max()_Temperature,max()_WindSpeed,mean()_Precipitation,...,min()_SoilWaterContent,min()_SolarRadiation,min()_Temperature,min()_WindSpeed,variance()_Precipitation,variance()_RelativeHumidity,variance()_SoilWaterContent,variance()_SolarRadiation,variance()_Temperature,variance()_WindSpeed
0,01/01/2005,NT,4.945,315.266815,95.683342,0.49614,31.634459,38.136787,9.704402,9.884958,...,0.0,2.51812,24.17996,1.840394,546.059262,584.201131,0.026743,58.942658,12.920252,1.930014
1,01/01/2005,QL,38.831579,74.452164,95.89827,0.472416,31.98283,37.047943,7.675632,1.453053,...,0.0,6.033827,20.95162,1.106028,35.641257,403.134377,0.012679,29.500832,13.792599,0.883048
2,01/01/2005,WA,36.3825,127.795181,90.399254,0.405102,32.819298,36.411083,10.650237,1.872415,...,0.0,4.060164,18.3855,0.931396,56.771765,260.721015,0.003713,19.931596,17.695984,1.409677
3,01/01/2006,NSW,78.354348,10.028352,76.897392,0.392838,33.000092,37.727879,10.174102,0.459693,...,3.285665e-07,22.624712,17.602983,1.744877,1.037845,185.19354,0.004662,6.819037,18.373641,2.301057
4,01/01/2006,QL,185.344022,26.380117,86.861053,0.405388,32.958508,38.074261,6.535478,0.168673,...,0.0,11.886605,22.642426,1.487591,0.606795,315.240521,0.004318,19.954609,11.607143,0.533263


In [None]:
# Save the pivoted table to CSV (row index omitted from the file)
weather_pivoted.to_csv(path_or_buf='weather_first_pivoted.csv', index=False)


In [None]:
# Calculate Drought Factor as the cumulative mean precipitation over the last
# 3 observations of the SAME region, taken in chronological order.
#
# The pivoted rows are ordered by the dd/mm/yyyy *text* in 'Date_' (01/01/2005
# is followed by 01/01/2006) and the regions are interleaved, so a plain
# .rolling() over the frame would sum rows from unrelated regions and years.
# Each region is therefore sorted by its real date before rolling; the result
# is aligned back to weather_pivoted by row index, leaving its row order as is.
_obs_date = pd.to_datetime(weather_pivoted['Date_'], format='%d/%m/%Y', errors='coerce')
_chronological = (
    weather_pivoted
    .assign(_obs_date=_obs_date)
    .sort_values(['Region_', '_obs_date'], kind='stable')
)
weather_pivoted['Drought_Factor'] = (
    _chronological
    .groupby('Region_')['mean()_Precipitation']
    .transform(lambda s: s.rolling(window=3).sum())
)
# NOTE(review): the window counts rows, not calendar days; dates absent from
# the data are not accounted for — confirm this matches the intended definition.


In [None]:
# List the column labels of the pivoted table
weather_pivoted.columns

Index(['Date_', 'Region_', 'Estimated_fire_area_', 'max()_Precipitation',
       'max()_RelativeHumidity', 'max()_SoilWaterContent',
       'max()_SolarRadiation', 'max()_Temperature', 'max()_WindSpeed',
       'mean()_Precipitation', 'mean()_RelativeHumidity',
       'mean()_SoilWaterContent', 'mean()_SolarRadiation',
       'mean()_Temperature', 'mean()_WindSpeed', 'min()_Precipitation',
       'min()_RelativeHumidity', 'min()_SoilWaterContent',
       'min()_SolarRadiation', 'min()_Temperature', 'min()_WindSpeed',
       'variance()_Precipitation', 'variance()_RelativeHumidity',
       'variance()_SoilWaterContent', 'variance()_SolarRadiation',
       'variance()_Temperature', 'variance()_WindSpeed', 'Drought_Factor'],
      dtype='object')

In [None]:
# Calculate 3- and 7-observation moving averages of the daily mean of each
# weather parameter, separately for every region and in chronological order.
#
# The pivoted rows are ordered by the dd/mm/yyyy *text* in 'Date_' with the
# regions interleaved, so rolling over the frame as-is would average rows from
# unrelated regions and years. Each region is sorted by its real date first;
# results are aligned back to weather_pivoted by row index.
_obs_date = pd.to_datetime(weather_pivoted['Date_'], format='%d/%m/%Y', errors='coerce')
_by_region = (
    weather_pivoted
    .assign(_obs_date=_obs_date)
    .sort_values(['Region_', '_obs_date'], kind='stable')
    .groupby('Region_')
)

# (parameter, window) pairs, listed in the order the columns are created
_moving_average_specs = [
    ('Temperature', 3),
    ('Precipitation', 3),
    ('Temperature', 7),
    ('Precipitation', 7),
    ('RelativeHumidity', 3),
    ('RelativeHumidity', 7),
    ('SoilWaterContent', 3),
    ('SoilWaterContent', 7),
    ('SolarRadiation', 3),
    ('SolarRadiation', 7),
]

for _parameter, _window in _moving_average_specs:
    weather_pivoted[f'{_parameter}_MA_{_window}'] = (
        _by_region[f'mean()_{_parameter}']
        .transform(lambda s, w=_window: s.rolling(window=w).mean())
    )
# NOTE(review): windows count rows, not calendar days; dates absent from the
# data are not accounted for — confirm this matches the intended definition.


In [None]:
# Preview the table with the engineered columns
print(weather_pivoted.head())

# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appends a stray "None" line.
weather_pivoted.info()


        Date_ Region_  Estimated_fire_area_  max()_Precipitation  \
0  01/01/2005      NT              4.945000           315.266815   
1  01/01/2005      QL             38.831579            74.452164   
2  01/01/2005      WA             36.382500           127.795181   
3  01/01/2006     NSW             78.354348            10.028352   
4  01/01/2006      QL            185.344022            26.380117   

   max()_RelativeHumidity  max()_SoilWaterContent  max()_SolarRadiation  \
0               95.683342                0.496140             31.634459   
1               95.898270                0.472416             31.982830   
2               90.399254                0.405102             32.819298   
3               76.897392                0.392838             33.000092   
4               86.861053                0.405388             32.958508   

   max()_Temperature  max()_WindSpeed  mean()_Precipitation  ...  \
0          38.136787         9.704402              9.884958  ...   
1   

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np

# Work on a copy. `data = weather_pivoted` would only alias the pivoted frame,
# so the date conversion below would rewrite it for every other cell.
data = weather_pivoted.copy()

# Convert 'Date_' to datetime; values that do not match dd/mm/yyyy become NaT
data['Date_'] = pd.to_datetime(data['Date_'], format='%d/%m/%Y', errors='coerce')

# Keep the numeric columns only (this leaves out 'Region_' and 'Date_')
numerical_features = data.select_dtypes(include=['number']).columns
numerical_data = data[numerical_features]

# Decide which rows are train and which are test BEFORE computing any
# statistics, so that nothing learned from the test rows (imputation means,
# min/max used for scaling) leaks into training.
row_positions = np.arange(len(data))
train_rows, test_rows = train_test_split(row_positions, test_size=0.2, random_state=42)

# Replace NaNs with the column means of the training rows
train_means = numerical_data.iloc[train_rows].mean()
numerical_data = numerical_data.fillna(train_means)

# Fit the scaler on the training rows only, then transform every row with it.
# Test values may therefore fall slightly outside [0, 1].
scaler = MinMaxScaler()
scaler.fit(numerical_data.iloc[train_rows])
scaled_numerical_data = pd.DataFrame(
    scaler.transform(numerical_data), columns=numerical_features, index=data.index
)

# Concatenate scaled numerical data with the date column
scaled_data = pd.concat([data['Date_'], scaled_numerical_data], axis=1)

# Separate features and (scaled) target
target_column = 'Estimated_fire_area_'
X = scaled_data.drop(columns=['Date_', target_column])
y = scaled_data[target_column]

X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

# Reshape to (samples, timesteps=1, features) as expected by the LSTM layer
X_train = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1]))
y_train = y_train.to_numpy()

# Seed Python, NumPy and TensorFlow so weight initialisation and dropout are
# repeatable across runs
tf.keras.utils.set_random_seed(42)

# Define LSTM model with 'tanh' activation and gradient clipping.
# The input shape is declared with an explicit Input object rather than the
# `input_shape` layer argument, which Keras warns against.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1], X_train.shape[2])),
    LSTM(50, activation='tanh', recurrent_dropout=0.2),
    Dropout(0.2),
    Dense(1),
])
model.compile(optimizer=tf.keras.optimizers.Adam(clipvalue=1.0), loss='mse')  # Clip gradients

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1, verbose=1)

# Evaluate the model (loss is the MSE on the scaled target)
loss = model.evaluate(X_test, y_test.values, verbose=0)  # Convert y_test to NumPy array
print(f'Test Loss: {loss}')

# Make predictions (in scaled target units)
y_pred = model.predict(X_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Date_'] = pd.to_datetime(data['Date_'], format='%d/%m/%Y', errors='coerce')  # Handle potential errors in date format
  super().__init__(**kwargs)


Epoch 1/10
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - loss: 0.0237 - val_loss: 0.0177
Epoch 2/10
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0200 - val_loss: 0.0174
Epoch 3/10
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - loss: 0.0188 - val_loss: 0.0171
Epoch 4/10
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0183 - val_loss: 0.0168
Epoch 5/10
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0189 - val_loss: 0.0164
Epoch 6/10
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 0.0184 - val_loss: 0.0163
Epoch 7/10
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 0.0186 - val_loss: 0.0163
Epoch 8/10
[1m586/586[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 0.0182 - val_loss: 0.0172
Epoch 9/10
[1m586/586[0m [32m━━━━━━━━

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Error metrics on the held-out rows. The target was MinMax-scaled in the
# modelling cell, so these values are in scaled units, not fire-area units.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE computed above

# Report in the order RMSE, MAE, MSE
for label, value in (('RMSE', rmse), ('MAE', mae), ('MSE', mse)):
    print(f'{label}: {value}')


RMSE: 0.1333348048251322
MAE: 0.08575435346712042
MSE: 0.017778170177756097


In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions on the held-out rows
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f'R-squared: {r2}')


R-squared: 0.3852716743082607
