In [66]:
# to know number of cores and logical processors
#!wmic cpu get NumberOfCores,NumberOfLogicalProcessors

In [67]:
import os
os.environ['OMP_NUM_THREADS']='7'

In [68]:
# import the necessary libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn import set_config             # We can optionally set transform output globally,
set_config(transform_output="pandas")      # or choose it for particular instances
import multiprocessing
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import make_column_transformer

In [69]:
# to load dataframe

url = 'https://drive.google.com/file/d/1egIe9oo7_GHU61AzqiHoWC-SsMrhE9oG/view?usp=sharing'
path = 'https://drive.google.com/uc?id='+url.split('/')[-2]
df_states = pd.read_csv(path)

In [70]:
# explore df

display(df_states.info())
display(df_states.columns)
display(df_states.head(1))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2652 entries, 0 to 2651
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   year           2652 non-null   int64  
 1   month          2652 non-null   int64  
 2   state          2652 non-null   object 
 3   fire qty       2652 non-null   int64  
 4   hectares       2652 non-null   float64
 5   temperature    2652 non-null   float64
 6   precipitation  2635 non-null   float64
dtypes: float64(3), int64(3), object(1)
memory usage: 145.2+ KB


None

Index(['year', 'month', 'state', 'fire qty', 'hectares', 'temperature',
       'precipitation'],
      dtype='object')

Unnamed: 0,year,month,state,fire qty,hectares,temperature,precipitation
0,2010,1,Baden Wurttemberg,0,0.0,-2.76,


In [71]:
# change names and rearrange columns

df_states.rename(columns={'fire qty':'fires_qty', 'hectares': 'burned_ha'}, inplace=True)
df_states = df_states[['state', 'year', 'month', 'fires_qty', 'burned_ha', 'temperature', 'precipitation']]

In [72]:
# we'll remove 'Germany' rows since this is a 'state' approach
df_states = df_states.query('state != "Germany"')
df_states.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2496 entries, 0 to 2495
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   state          2496 non-null   object 
 1   year           2496 non-null   int64  
 2   month          2496 non-null   int64  
 3   fires_qty      2496 non-null   int64  
 4   burned_ha      2496 non-null   float64
 5   temperature    2496 non-null   float64
 6   precipitation  2480 non-null   float64
dtypes: float64(3), int64(3), object(1)
memory usage: 156.0+ KB


# Now we build the Regression Machine Learning Model
Our target will be the burned_ha

In [73]:
df_states.columns

Index(['state', 'year', 'month', 'fires_qty', 'burned_ha', 'temperature',
       'precipitation'],
      dtype='object')

In [74]:
# we need to create a new data frame with only the columns we'll use for the model

df_states = df_states[['state', 'year', 'month', 'fires_qty', 'temperature', 'precipitation', 'burned_ha']]
df_states

Unnamed: 0,state,year,month,fires_qty,temperature,precipitation,burned_ha
0,Baden Wurttemberg,2010,1,0,-2.76,,0.0
1,Baden Wurttemberg,2010,2,0,0.13,60.2,0.0
2,Baden Wurttemberg,2010,3,3,3.81,51.5,0.8
3,Baden Wurttemberg,2010,4,7,8.79,23.8,4.3
4,Baden Wurttemberg,2010,5,0,10.65,129.9,0.0
...,...,...,...,...,...,...,...
2491,Thuringia,2022,8,19,19.81,37.3,13.0
2492,Thuringia,2022,9,1,12.80,99.4,0.0
2493,Thuringia,2022,10,0,12.05,44.2,0.0
2494,Thuringia,2022,11,1,5.67,34.9,0.0


In [75]:
# now we need to split our dataset into X and y  - X will be the features we'll use for the model and y will be the target

X = df_states
y = X.pop('burned_ha') # we'll use the cluster column as our target

In [76]:
# and now we can split into traing and testing sets
# 80% for training and 20% for testing  - we'll use the random_state parameter to make sure we get the same split every time

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100)

In [77]:
# first we need to divide into numerical and categorical data

X_num_train =  X_train.select_dtypes(include = "number")
X_num_test = X_test.select_dtypes(include = "number")
X_cat_train = X_train.select_dtypes(exclude="number")
X_cat_test = X_test.select_dtypes(exclude="number")

In [78]:
# now we need to impute the missing values in the dataset  - we'll use the SimpleImputer class from the sklearn library

# we import necessary libraries
from sklearn.impute import SimpleImputer

# we start with numeric features
num_imputer = SimpleImputer(strategy = 'mean')                 # first we initialize the imputer
num_imputer.fit(X_num_train)                                   # then we fit the imputer to the training data
X_num_train_imp = num_imputer.transform(X_num_train)           # and transform the training set
X_num_test_imp = num_imputer.transform(X_num_test)             # and transform the testing set

# now we do the same for the categorical features
cat_imputer = SimpleImputer(strategy = 'constant', fill_value = 'N_A')         # first we initialize the imputer
cat_imputer.fit(X_cat_train)                                             # then we fit the imputer to the training data
X_cat_train_imp = cat_imputer.transform(X_cat_train)                       # and transform the training set
X_cat_test_imp = cat_imputer.transform(X_cat_test)                         # and transform the testing set

In [79]:
# and now we need to use one hot encoder to transform the categorical data into numerical data

# first we initialize the encoder
cat_encoder = OneHotEncoder(drop = 'first', sparse_output = False, handle_unknown = 'ignore') 
cat_encoder.fit(X_cat_train_imp)                                            # then we fit the encoder to the training data
X_cat_train_imp_hot = cat_encoder.transform(X_cat_train_imp)                # and transform the training set
X_cat_test_imp_hot = cat_encoder.transform(X_cat_test_imp)                  # and transform the testing set

In [80]:
# now we concatenate the numerical and categorical data

X_train_imp = pd.concat([X_num_train_imp, X_cat_train_imp_hot], axis = 1)
X_test_imp = pd.concat([X_num_test_imp, X_cat_test_imp_hot], axis = 1)

display(X_train_imp.head(1))
display(X_test_imp.head(1))

Unnamed: 0,year,month,fires_qty,temperature,precipitation,state_Bavaria,state_Berlin,state_Brandenburg,state_Bremen,state_Hamburg,state_Hessen,state_Lower Saxony,state_Mecklenburg-Vorpommern,state_Nordrhein-Westfalen,state_Rhineland-Palatinate,state_Saarland,state_Saxony,state_Saxony Anhalt,state_Schleswig-Holstein,state_Thuringia
250,2017.0,11.0,0.0,3.76,86.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,year,month,fires_qty,temperature,precipitation,state_Bavaria,state_Berlin,state_Brandenburg,state_Bremen,state_Hamburg,state_Hessen,state_Lower Saxony,state_Mecklenburg-Vorpommern,state_Nordrhein-Westfalen,state_Rhineland-Palatinate,state_Saarland,state_Saxony,state_Saxony Anhalt,state_Schleswig-Holstein,state_Thuringia
1044,2019.0,1.0,0.0,0.5,74.8,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [81]:
display(X_train_imp.shape)
display(X_test_imp.shape)

(1996, 20)

(500, 20)

# Pipeline
Now that we've seen every step of the way works, we can build an automated pipeline with branches to train our model with GridSearch Cross Validation

In [82]:
from sklearn.ensemble import RandomForestRegressor

In [83]:
# first we save the numerical and categorical columns in different variables
X_num_col = X.select_dtypes(include = "number").columns
X_cat_col = X.select_dtypes(exclude = "number").columns

# now we create the pipeline for numerical data
# we create and 'empty' pipeline because we'll use GridSearch to find the best options for the model
num_pipe = make_pipeline(SimpleImputer())

# and the categorical data
cat_pipe = make_pipeline(SimpleImputer(strategy = 'constant', fill_value = 'N_A'), 
                         OneHotEncoder(drop = 'first', sparse_output = False, handle_unknown = 'ignore'))

# and we initialize the scaler and the model
r_forest = RandomForestRegressor()
scaler = StandardScaler()

In [84]:
# we create the column transformer that will combine the numerical and categorical data
preprocessor = make_column_transformer((num_pipe, X_num_col), 
                                       (cat_pipe, X_cat_col),)

In [85]:
# and now we can build the full pipeline
full_pipe =  make_pipeline(preprocessor, scaler, r_forest)
full_pipe

In [86]:
# now we need to define the paraemters we'll use for the grid search

param_grid = {
    'columntransformer__pipeline-1__simpleimputer__strategy': ['mean', 'median'],
    'standardscaler__with_mean': [True, False],
    'standardscaler__with_std': [True, False],
    'randomforestregressor__n_estimators': [50, 100],
    'randomforestregressor__max_depth': [None, 5],
    'randomforestregressor__min_samples_split': [2, 5],
}

In [87]:
search = GridSearchCV(full_pipe,
                      param_grid,
                      cv = 5,
                      scoring = 'neg_mean_squared_error',
                      verbose = 1)

In [88]:
search.fit(X_train, y_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


In [89]:
search.best_params_

{'columntransformer__pipeline-1__simpleimputer__strategy': 'mean',
 'randomforestregressor__max_depth': 5,
 'randomforestregressor__min_samples_split': 2,
 'randomforestregressor__n_estimators': 50,
 'standardscaler__with_mean': False,
 'standardscaler__with_std': False}

In [90]:
from sklearn.metrics import mean_absolute_error

print('Train Mean Absolute Error (MAE):', mean_absolute_error(y_train, search.predict(X_train)))

Train Mean Absolute Error (MAE): 1.85444930298402


# Using SVM

In [91]:
# we start by importing the necessary libraries
from sklearn import svm
from sklearn.svm import SVR

In [92]:
# and now we can build the full pipeline
SVM = svm.SVR()
SVM_pipe = make_pipeline(preprocessor, scaler, SVM)
SVM_pipe

In [93]:
# now we need to define the paraemters we'll use for the grid search

param_grid = {
    'columntransformer__pipeline-1__simpleimputer__strategy': ['mean', 'median'],
    'standardscaler__with_mean': [True, False],
    'standardscaler__with_std': [True, False],
    'svr__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'svr__C': [0.1, 1, 10],
    'svr__epsilon': [0.1, 0.2, 0.5],
}

In [94]:
search_svm = GridSearchCV(SVM_pipe,
                      param_grid,
                      cv = 5,
                      scoring = 'neg_mean_squared_error',
                      verbose = 1)

In [95]:
search_svm.fit(X_train, y_train)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


In [96]:
search_svm.best_params_

{'columntransformer__pipeline-1__simpleimputer__strategy': 'median',
 'standardscaler__with_mean': True,
 'standardscaler__with_std': False,
 'svr__C': 1,
 'svr__epsilon': 0.1,
 'svr__kernel': 'poly'}

In [97]:
from sklearn.metrics import mean_absolute_error

print('Train Mean Absolute Error (MAE):', mean_absolute_error(y_train, search_svm.predict(X_train)))

Train Mean Absolute Error (MAE): 2.913155790377512


# xgboost

In [99]:
# import xgboost library - the library was previously installed with pip install xgboost
from sklearn.ensemble import GradientBoostingRegressor

In [100]:
# and now we can build the full pipeline

# Create an XGBoost classifier
xg_boost = GradientBoostingRegressor()

xgb_pipe = make_pipeline(preprocessor, scaler, xg_boost)
xgb_pipe

In [101]:
# now we need to define the paraemters we'll use for the grid search

param_grid = {
    'columntransformer__pipeline-1__simpleimputer__strategy': ['mean', 'median'],
    'standardscaler__with_mean': [True, False],
    'standardscaler__with_std': [True, False],
    'gradientboostingregressor__n_estimators': [100, 200],
    'gradientboostingregressor__learning_rate': [0.01, 0.1],
    'gradientboostingregressor__max_depth': [3, 4],
}

In [102]:
search_xgb = GridSearchCV(xgb_pipe,
                      param_grid,
                      cv = 5,
                      scoring = 'neg_mean_squared_error',
                      verbose = 1,
                      n_jobs = -1)

In [103]:
search_xgb.fit(X_train, y_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


In [104]:
search_xgb.best_params_

{'columntransformer__pipeline-1__simpleimputer__strategy': 'median',
 'gradientboostingregressor__learning_rate': 0.1,
 'gradientboostingregressor__max_depth': 4,
 'gradientboostingregressor__n_estimators': 100,
 'standardscaler__with_mean': True,
 'standardscaler__with_std': False}

In [105]:
print('Train Mean Absolute Error (MAE):', mean_absolute_error(y_train, search_xgb.predict(X_train)))

Train Mean Absolute Error (MAE): 0.9266357508312194


# SDG Regressor

In [106]:
from sklearn.linear_model import LinearRegression, SGDRegressor

In [107]:
sgd_pipeline = make_pipeline(preprocessor,
                            scaler,
                            SGDRegressor())
sgd_pipeline

In [108]:
# now we need to define the paraemters we'll use for the grid search

param_grid_sgd = {
    'columntransformer__pipeline-1__simpleimputer__strategy': ['mean', 'median'],
    'standardscaler__with_mean': [True, False],
    'standardscaler__with_std': [True, False],
    'sgdregressor__loss': ['squared_loss', 'huber'],
    'sgdregressor__penalty': ['l2', 'l1'],
    'sgdregressor__alpha': [0.0001, 0.001],
    'sgdregressor__eta0': [0.01, 0.1],
    'sgdregressor__max_iter': [100],
    'sgdregressor__tol': [1e-3, 1e-4]
}

In [109]:
search_sgd = GridSearchCV(sgd_pipeline,
                      param_grid_sgd,
                      cv = 5,
                      scoring = 'neg_mean_squared_error',
                      verbose = 1,
                      n_jobs = -1)

In [110]:
search_sgd.fit(X_train, y_train)

Fitting 5 folds for each of 256 candidates, totalling 1280 fits


In [111]:
search_sgd.best_params_

{'columntransformer__pipeline-1__simpleimputer__strategy': 'median',
 'sgdregressor__alpha': 0.001,
 'sgdregressor__eta0': 0.01,
 'sgdregressor__loss': 'huber',
 'sgdregressor__max_iter': 100,
 'sgdregressor__penalty': 'l1',
 'sgdregressor__tol': 0.0001,
 'standardscaler__with_mean': False,
 'standardscaler__with_std': False}

In [112]:
print('Train Mean Absolute Error (MAE):', mean_absolute_error(y_train, search_sgd.predict(X_train)))

Train Mean Absolute Error (MAE): 205.73665208585504


# LSTM - time series analysis

In [121]:
# we'll start by importing necessary libraries for the deep-learing model

import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import EarlyStopping
import itertools
import numpy as np

In [128]:
# to load dataframe

url = 'https://drive.google.com/file/d/1egIe9oo7_GHU61AzqiHoWC-SsMrhE9oG/view?usp=sharing'
path = 'https://drive.google.com/uc?id='+url.split('/')[-2]
df_states_reg = pd.read_csv(path)

In [130]:
df_states_reg.head(1)

Unnamed: 0,year,month,state,fire qty,hectares,temperature,precipitation
0,2010,1,Baden Wurttemberg,0,0.0,-2.76,


In [131]:
# first we need to put the year and month together and make them the index

df_states_reg['date'] = pd.to_datetime(df_states_reg['year'].astype(str) + df_states_reg['month'].astype(str).str.zfill(2), format = '%Y%m')
df_states_reg['date'] = df_states_reg['date'].dt.strftime('%Y-%m')
df_states_reg['date'] = pd.to_datetime(df_states_reg['date'], format='%Y-%m')
display(df_states_reg.info())

# then we drop them
df_states_reg.drop(['year', 'month'], axis = 1, inplace = True)

# and we make the date the index
df_states_reg.set_index('date', inplace = True)

df_states_reg.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2652 entries, 0 to 2651
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   year           2652 non-null   int64         
 1   month          2652 non-null   int64         
 2   state          2652 non-null   object        
 3   fire qty       2652 non-null   int64         
 4   hectares       2652 non-null   float64       
 5   temperature    2652 non-null   float64       
 6   precipitation  2635 non-null   float64       
 7   date           2652 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(3), int64(3), object(1)
memory usage: 165.9+ KB


None

Unnamed: 0_level_0,state,fire qty,hectares,temperature,precipitation
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-01,Baden Wurttemberg,0,0.0,-2.76,
2010-02-01,Baden Wurttemberg,0,0.0,0.13,60.2
2010-03-01,Baden Wurttemberg,3,0.8,3.81,51.5
2010-04-01,Baden Wurttemberg,7,4.3,8.79,23.8
2010-05-01,Baden Wurttemberg,0,0.0,10.65,129.9


In [132]:
df_states_reg.columns

Index(['state', 'fire qty', 'hectares', 'temperature', 'precipitation'], dtype='object')

In [135]:
# we rearrange the columns to make it easier to split them into train and test sets
# we leave cluster at the end because it will be the target

df_states_reg = df_states_reg.rename(columns = {'fire qty': 'fires_qty', 'hectares': 'burned_ha'})
df_states_reg = df_states_reg[['fires_qty', 'state', 'temperature', 'precipitation', 'burned_ha']]
df_states_reg.head(1)

Unnamed: 0_level_0,fires_qty,state,temperature,precipitation,burned_ha
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-01,0,Baden Wurttemberg,-2.76,,0.0


In [140]:
df_states_reg.iloc[:, 0:4].head(1)
df_states_reg.iloc[:, -1:].head(1)

Unnamed: 0_level_0,burned_ha
date,Unnamed: 1_level_1
2010-01-01,0.0


In [143]:
features = df_states_reg.iloc[:, 0:4]
display(features.head(1))
target = df_states_reg.iloc[:, -1:]
display(target.head(1))
display(target.info())

Unnamed: 0_level_0,fires_qty,state,temperature,precipitation
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01,0,Baden Wurttemberg,-2.76,


Unnamed: 0_level_0,burned_ha
date,Unnamed: 1_level_1
2010-01-01,0.0


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2652 entries, 2010-01-01 to 2022-12-01
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   burned_ha  2652 non-null   float64
dtypes: float64(1)
memory usage: 41.4 KB


None

In [144]:
# we split the data into training, validation and test

# Split the data into training, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=444)

In [145]:
# we need to divide into numerical and categorical data

X_num_train =  X_train.select_dtypes(include = "number")
X_num_test = X_test.select_dtypes(include = "number")
X_cat_train = X_train.select_dtypes(exclude="number")
X_cat_test = X_test.select_dtypes(exclude="number")

In [146]:
# now we need to impute the missing values in the dataset  - we'll use the SimpleImputer class from the sklearn library

# we import necessary libraries
# from sklearn.impute import SimpleImputer

# we start with numeric features
num_imputer = SimpleImputer(strategy = 'mean')                 # first we initialize the imputer
num_imputer.fit(X_num_train)                                   # then we fit the imputer to the training data
X_num_train_imp = num_imputer.transform(X_num_train)           # and transform the training set
X_num_test_imp = num_imputer.transform(X_num_test)             # and transform the testing set

# now we do the same for the categorical features
cat_imputer = SimpleImputer(strategy = 'constant', fill_value = 'N_A')         # first we initialize the imputer
cat_imputer.fit(X_cat_train)                                             # then we fit the imputer to the training data
X_cat_train_imp = cat_imputer.transform(X_cat_train)                       # and transform the training set
X_cat_test_imp = cat_imputer.transform(X_cat_test)                         # and transform the testing set

In [147]:
# and now we need to use one hot encoder to transform the categorical data into numerical data

# first we initialize the encoder
cat_encoder = OneHotEncoder(drop = 'first', sparse_output = False, handle_unknown = 'ignore') 
cat_encoder.fit(X_cat_train_imp)                                            # then we fit the encoder to the training data
X_cat_train_imp_hot = cat_encoder.transform(X_cat_train_imp)                # and transform the training set
X_cat_test_imp_hot = cat_encoder.transform(X_cat_test_imp)                  # and transform the testing set

In [148]:
# now we concatenate the numerical and categorical data

X_train_imp = pd.concat([X_num_train_imp, X_cat_train_imp_hot], axis = 1)
X_test_imp = pd.concat([X_num_test_imp, X_cat_test_imp_hot], axis = 1)

display(X_train_imp.head(1))
display(X_test_imp.head(1))

Unnamed: 0_level_0,fires_qty,temperature,precipitation,state_Bavaria,state_Berlin,state_Brandenburg,state_Bremen,state_Germany,state_Hamburg,state_Hessen,state_Lower Saxony,state_Mecklenburg-Vorpommern,state_Nordrhein-Westfalen,state_Rhineland-Palatinate,state_Saarland,state_Saxony,state_Saxony Anhalt,state_Schleswig-Holstein,state_Thuringia
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2015-02-01,0.0,2.15,29.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0_level_0,fires_qty,temperature,precipitation,state_Bavaria,state_Berlin,state_Brandenburg,state_Bremen,state_Germany,state_Hamburg,state_Hessen,state_Lower Saxony,state_Mecklenburg-Vorpommern,state_Nordrhein-Westfalen,state_Rhineland-Palatinate,state_Saarland,state_Saxony,state_Saxony Anhalt,state_Schleswig-Holstein,state_Thuringia
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2013-06-01,0.0,15.62,60.3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [149]:
# now we need to scale

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_imp)
X_test = scaler.transform(X_test_imp)

In [150]:
sequence_length = 12
num_features = 18

# Convert X_train to a NumPy array
X_train_array = X_train.values

# Calculate the number of samples (batch_size)
num_samples = X_train_array.shape[0]

# Calculate the number of sequences you can create
num_sequences = num_samples - sequence_length + 1

# Initialize an empty array to store the sequences
sequences = []

# Create sequences
for i in range(num_sequences):
    sequence = X_train_array[i:i+sequence_length, :]  # Extract a sequence of length sequence_length
    sequences.append(sequence)

# Convert the list of sequences to a NumPy array
X_train_reshaped = np.array(sequences)

In [151]:
# Convert X_train to a NumPy array
X_test_array = X_test.values

# Calculate the number of samples (batch_size)
num_samples = X_test_array.shape[0]

# Calculate the number of sequences you can create
num_sequences = num_samples - sequence_length + 1

# Initialize an empty array to store the sequences
sequences = []

# Create sequences
for i in range(num_sequences):
    sequence = X_test_array[i:i+sequence_length, :]  # Extract a sequence of length sequence_length
    sequences.append(sequence)

# Convert the list of sequences to a NumPy array
X_test_reshaped = np.array(sequences)

In [152]:
X_test_reshaped

array([[[-0.2711804 ,  0.90192807,  0.00571933, ..., -0.25025038,
         -0.25552656, -0.25552656],
        [-0.18723387,  0.78719853, -0.34875985, ..., -0.25025038,
         -0.25552656, -0.25552656],
        [-0.2711804 ,  0.51897948, -0.08214303, ..., -0.25025038,
         -0.25552656, -0.25552656],
        ...,
        [-0.2711804 , -0.0484666 ,  0.26930641, ..., -0.25025038,
         -0.25552656, -0.25552656],
        [-0.2711804 , -0.7678518 ,  0.06631406, ..., -0.25025038,
         -0.25552656, -0.25552656],
        [-0.2711804 , -1.40816664, -0.23060012, ..., -0.25025038,
         -0.25552656, -0.25552656]],

       [[-0.18723387,  0.78719853, -0.34875985, ..., -0.25025038,
         -0.25552656, -0.25552656],
        [-0.2711804 ,  0.51897948, -0.08214303, ..., -0.25025038,
         -0.25552656, -0.25552656],
        [-0.2711804 , -0.62366468, -1.5636842 , ..., -0.25025038,
         -0.25552656, -0.25552656],
        ...,
        [-0.2711804 , -0.7678518 ,  0.06631406, ..., -

In [153]:
from tensorflow.keras.optimizers import Adam

In [154]:
# we reshape the data to make it compatible with the LSTM model
y_train = y_train[:X_train_reshaped.shape[0]]
y_test = y_test[:X_test_reshaped.shape[0]]

# Convert to NumPy arrays with integer data type
y_train = np.array(y_train, dtype=int)
y_test = np.array(y_test, dtype=int)

In [155]:
# we need to make sure that X and y have the same shape - they appear to be mismatched

print(X_train_reshaped.shape)
print(X_test_reshaped.shape)
print(y_train.shape)
print(y_test.shape)

(2110, 12, 19)
(520, 12, 19)
(2110, 1)
(520, 1)


In [160]:
# Create the LSTM model
lstm_model = Sequential()
lstm_model.add(LSTM(units=128, input_shape=(12, 19), return_sequences=True))
lstm_model.add(Dense(units=1, activation = 'linear'))  

# Compile the model
lstm_model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

batch_size = 150
# Train the model
lstm_model.fit(X_train_reshaped, y_train, epochs=15, batch_size=batch_size, validation_data = (X_test_reshaped, y_test), verbose = 1, shuffle = False)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x15dda14a1d0>