In [1]:
#Importing libraries
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import pandas as pd
from numpy import asarray
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import datetime
from statsmodels.tsa.tsatools import lagmat
from statsmodels.tsa.arima.model import ARIMA

# Data Import and Cleaning

In [2]:
df = pd.read_excel("/Users/surajkwork/Documents/Thesis/thesis/df.xlsx")

The date variable is not in datetime format so it has to be converted.

In [3]:
df['date']= pd.to_datetime(df['date'])
df.index = df['date']
df.sort_index(inplace=True)

Extracting one cluster from the dataset for analysis.

In [4]:
TS1 = df[(df['Plz'] == "['25', '24']")]

And from this cluster, extracting one category for analysis.

In [5]:
TS1_A2 = TS1[TS1['full'] == 'A2 -  geschreddert']

In [6]:
TS1_A1A2 = TS1[TS1['full'] == 'A1 & A2 -  geschreddert']
TS1_A2A3 = TS1[TS1['full'] == 'A2 & A3 -  geschreddert']

In [7]:
datasets = [TS1_A2, TS1_A1A2, TS1_A2A3]

# Modelling - Train, test split

In [8]:
nobs = 6
df_train, df_test = TS1_A1A2[0:-nobs], TS1_A1A2[-nobs:]

## Decision Tree

In [9]:
from sklearn.tree import DecisionTreeRegressor

In [10]:
n_lags = 4

In [11]:
y_train = df_train['wPreis'].values


In [12]:
y_train

array([-37.51931788, -28.69009585, -29.68807862, -25.6304334 ,
       -29.07379398, -24.93642602, -35.26209517, -28.06563229,
       -27.35874871, -28.21685813, -29.18520815, -32.36064526,
       -20.27214677, -18.80845203, -35.43666563, -23.58340467,
       -23.06625847, -22.99784234, -31.41962306, -25.46027349,
       -23.68194297, -21.47771259, -27.02904785, -24.82276444,
       -23.25520954, -25.32197924, -22.30413021, -23.38024324,
       -25.17015002, -20.85821571, -23.60348981, -24.86044273,
       -23.64540855, -25.72826243, -20.17778383, -26.36851112,
       -21.90534062, -23.64226891, -25.04513087, -24.65590187,
       -21.77319638, -24.9461431 , -23.90837982, -25.23754357,
       -19.73324755, -24.54094226, -24.73818078, -22.97021231,
       -19.21154856, -16.66741517, -26.54786465, -22.45099313,
       -22.93310518, -23.38212367, -23.28028169, -26.65816299,
       -22.07703349, -17.78619097, -18.98931101,  -7.41127285,
        -6.21131882,  -1.98035287,   1.61215148,   0.48

In [13]:
df_y_train = pd.DataFrame(y_train)


In [14]:
shifted_dfs = [df_y_train.shift(t) for t in range(1, n_lags)]
shifted_dfs

[             0
 0          NaN
 1   -37.519318
 2   -28.690096
 3   -29.688079
 4   -25.630433
 ..         ...
 143  25.673098
 144  22.501691
 145  18.514178
 146  28.985554
 147  16.847234
 
 [148 rows x 1 columns],
              0
 0          NaN
 1          NaN
 2   -37.519318
 3   -28.690096
 4   -29.688079
 ..         ...
 143  27.248645
 144  25.673098
 145  22.501691
 146  18.514178
 147  28.985554
 
 [148 rows x 1 columns],
              0
 0          NaN
 1          NaN
 2          NaN
 3   -37.519318
 4   -28.690096
 ..         ...
 143  34.003188
 144  27.248645
 145  25.673098
 146  22.501691
 147  18.514178
 
 [148 rows x 1 columns]]

In [15]:
for df in shifted_dfs:
    print(df.shape)

(148, 1)
(148, 1)
(148, 1)


In [16]:
concatenated_df = pd.concat(shifted_dfs, axis=1)


In [17]:
diff_df = concatenated_df.diff()
diff_df

Unnamed: 0,0,0.1,0.2
0,,,
1,,,
2,8.829222,,
3,-0.997983,8.829222,
4,4.057645,-0.997983,8.829222
...,...,...,...
143,-1.575547,-6.754543,-0.469350
144,-3.171408,-1.575547,-6.754543
145,-3.987512,-3.171408,-1.575547
146,10.471376,-3.987512,-3.171408


In [38]:
diff_df.head(10)

Unnamed: 0,0,0.1,0.2
0,,,
1,,,
2,8.829222,,
3,-0.997983,8.829222,
4,4.057645,-0.997983,8.829222
5,-3.443361,4.057645,-0.997983
6,4.137368,-3.443361,4.057645
7,-10.325669,4.137368,-3.443361
8,7.196463,-10.325669,4.137368
9,0.706884,7.196463,-10.325669


In [18]:
diff_values = diff_df.values

# Step 6: Slice the numpy array to exclude the first n_lags rows
sliced_values = diff_values[n_lags:, :]
sliced_values

array([[ 4.05764522e+00, -9.97982775e-01,  8.82922203e+00],
       [-3.44336058e+00,  4.05764522e+00, -9.97982775e-01],
       [ 4.13736796e+00, -3.44336058e+00,  4.05764522e+00],
       [-1.03256692e+01,  4.13736796e+00, -3.44336058e+00],
       [ 7.19646288e+00, -1.03256692e+01,  4.13736796e+00],
       [ 7.06883583e-01,  7.19646288e+00, -1.03256692e+01],
       [-8.58109420e-01,  7.06883583e-01,  7.19646288e+00],
       [-9.68350023e-01, -8.58109420e-01,  7.06883583e-01],
       [-3.17543711e+00, -9.68350023e-01, -8.58109420e-01],
       [ 1.20884985e+01, -3.17543711e+00, -9.68350023e-01],
       [ 1.46369474e+00,  1.20884985e+01, -3.17543711e+00],
       [-1.66282136e+01,  1.46369474e+00,  1.20884985e+01],
       [ 1.18532610e+01, -1.66282136e+01,  1.46369474e+00],
       [ 5.17146191e-01,  1.18532610e+01, -1.66282136e+01],
       [ 6.84161372e-02,  5.17146191e-01,  1.18532610e+01],
       [-8.42178072e+00,  6.84161372e-02,  5.17146191e-01],
       [ 5.95934957e+00, -8.42178072e+00

In [19]:
for t in range(1,n_lags):
    test = pd.DataFrame(y_train).shift(t)

In [20]:
X_train_shift = pd.concat([pd.DataFrame(y_train).shift(t) for t in range(1,n_lags)],1).diff().values[n_lags:,:]
y_train_shift = np.diff(y_train)[n_lags-1:]

In [41]:
X_train_shift[0]

array([ 4.05764522, -0.99798277,  8.82922203])

In [44]:
y_train_shift[0]

-3.4433605787153105

In [21]:
y_test = df_test['wPreis'].values


In [30]:
tree = DecisionTreeRegressor(random_state=42, max_depth = 2)
tree.fit(X_train_shift, y_train_shift)


In [31]:
y_pred_train = tree.predict(X_train_shift).reshape(-1)

In [45]:
y_pred_train

array([-3.09416306e+01, -2.82433449e+01, -2.94968969e+01, -2.67986111e+01,
       -2.80521631e+01, -2.93057151e+01, -2.66074294e+01, -2.39091436e+01,
       -2.12108578e+01, -2.24644098e+01, -3.90926234e+01, -3.63943377e+01,
       -3.76478897e+01, -3.89014417e+01, -4.01549937e+01, -3.74567079e+01,
       -3.87102599e+01, -3.99638120e+01, -4.12173640e+01, -3.85190782e+01,
       -3.97726302e+01, -4.10261822e+01, -3.83278965e+01, -3.95814485e+01,
       -3.68831627e+01, -3.41848769e+01, -3.54384289e+01, -3.27401432e+01,
       -3.00418574e+01, -3.12954094e+01, -2.85971236e+01, -2.98506757e+01,
       -2.71523899e+01, -2.84059419e+01, -2.57076561e+01, -2.30093704e+01,
       -2.42629224e+01, -2.55164744e+01, -2.28181886e+01, -2.40717406e+01,
       -2.13734548e+01, -2.26270069e+01, -1.99287211e+01, -2.11822731e+01,
       -2.24358251e+01, -2.36893771e+01, -2.49429291e+01, -2.22446434e+01,
       -2.34981954e+01, -2.47517474e+01, -2.60052994e+01, -2.72588514e+01,
       -2.45605657e+01, -

In [32]:
Xt = np.concatenate([X_train_shift[-1,1:].reshape(1,-1),np.array(y_train_shift[-1]).reshape(1,1)],1)


In [46]:
Xt

array([[  2.69828577,  -3.98751243, -12.78392978]])

In [33]:
predictions_test = []
for t in range(len(y_test)):
    pred = tree.predict(Xt)
    predictions_test.append(pred[0])
    Xt = np.concatenate([np.array(pred).reshape(1,1),Xt[-1,1:].reshape(1,-1)],1)

In [34]:
y_pred_test = np.array(predictions_test)
y_pred_train = y_train[n_lags-2]+np.cumsum(y_pred_train)
y_pred_test = y_train[-1]+np.cumsum(y_pred_test)

In [35]:
def forecast_accuracy(forecast, actual):
      me = np.mean(forecast - actual)             # ME
      rmse = np.mean((forecast - actual)**2)**.5  # RMSE
      # Direction accuracy
      forecast_diff = np.diff(forecast)
      actual_diff = np.diff(actual)
      direction_accuracy = np.mean(np.sign(forecast_diff) == np.sign(actual_diff))
    
      return {'me': me, 'rmse': rmse, 'direction_accuracy': direction_accuracy}


In [36]:
def adjust(val, length= 6): return str(val).ljust(length) # length
print('Decision Tree Forecast Accuracy of: wPreis')
accuracy_prod = forecast_accuracy(y_pred_test, df_test['wPreis'])
for k, v in accuracy_prod.items():
    print(adjust(k), ': ', round(v,4))

Decision Tree Forecast Accuracy of: wPreis
me     :  0.3503
rmse   :  3.6113
direction_accuracy :  0.6
