In [None]:
#Import data and libraries

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,  LSTM
from tensorflow.keras import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split 

In [None]:
# Have a look at our data

In [2]:
# Read the cleaned daily claims workbook and drop its 'Unnamed: 0' column
# (presumably an index saved with the file -- TODO confirm).
data = (
    pd.read_excel('../raw_data/daily_data_clean.xlsx', engine='openpyxl')
    .drop(columns='Unnamed: 0')
)

In [3]:
# Preview the first five rows to check the column names and values.
data.head()

Unnamed: 0,date_issue,total_amount_claims
0,2018-09-01,168
1,2018-09-02,346
2,2018-09-03,16169
3,2018-09-04,28529
4,2018-09-05,64135


In [None]:
# length of data frame

In [4]:
# Row count of the data frame; later used as the upper bound when slicing
# the lagged feature columns.
len_data = data.shape[0]
len_data

920

In [None]:
def create_sequences(number, length=10):
    """Generate a batch of sequences by repeatedly calling ``create_individual_sequence``.

    Parameters
    ----------
    number : int
        How many sequences to generate.
    length : int, default 10
        Number of time steps per sequence, forwarded unchanged to
        ``create_individual_sequence``. The default reproduces the value that
        used to be hard-coded here.

    Returns
    -------
    tuple of np.ndarray
        ``(X, y)``: the per-sequence feature arrays and the per-sequence
        targets, each stacked in generation order.
    """
    # Generate sequentially so the order of random draws matches a plain loop.
    samples = [create_individual_sequence(length) for _ in range(number)]
    X = [features for features, _ in samples]
    y = [target for _, target in samples]
    return np.array(X), np.array(y)
            
def create_individual_sequence(length):
    """Generate one random sequence of (salary, nb_persons, company_size) rows plus a target.

    Returns a tuple ``(features, y)``: ``features`` is
    ``np.array([salaries, nb_persons, company_sizes]).T`` and ``y`` is the last
    generated salary divided by 1000.

    NOTE(review): as written this function cannot complete.
      * ``salaries`` is assigned near the end of the body, so Python treats it
        as a local everywhere in the function; the earlier ``salaries.append``
        therefore raises ``UnboundLocalError``.
      * ``nb_persons`` and ``company_sizes`` are never created here and are not
        defined in any visible notebook cell, so they resolve as (missing)
        globals.
      * ``amount_claims`` is created but never used.
    This looks like a partially adapted paste from another exercise -- confirm
    whether it is still needed before repairing or removing it.
    """
    amount_claims = []  # NOTE(review): never read or written again

    # Amount Claims
    # Starting level: a Beta(0.15, 4) draw scaled by 300.
    nb_iter = np.random.beta(.15, 4)*300
    for i in range(length):
        # randint(2) yields 0 or 1, so the update below runs about half the time.
        if not np.random.randint(2): # Change 1 out of 2 possibilities
            R_1 = np.random.beta(0.5, 8)*3
            # Step is at least -2; the added randint(-2, 2) noise is in {-2, -1, 0, 1}.
            nb_iter = nb_iter + max(-2, R_1*company_sizes[i] + np.random.randint(-2, 2))
            # Clamp to the range [0, company_sizes[i] - 1] and truncate to int.
            nb_iter = max(0, nb_iter)
            nb_iter = int(min(company_sizes[i]-1, nb_iter))
        nb_persons.append(nb_iter)  # NOTE(review): nb_persons is not defined in this function


    # Salary
    # Initial salary: sum of two normal draws plus 5% of the first company size, floored at 800.
    salary_iter = max(800, int(np.random.normal(1200, 300)+ 0.05*company_sizes[0] +  np.random.normal(40, 400)))
    salaries.append(salary_iter)  # NOTE(review): raises UnboundLocalError (see docstring)
    # length further steps, so salaries ends up with length + 1 entries.
    for i in range(1, length + 1):
        R_1 = np.random.normal(100, 50)
        # NOTE(review): for i == 1 the index i-2 is -1, i.e. the LAST element
        # of each list rather than a previous step -- confirm this is intended.
        change_person = nb_persons[i-1] - nb_persons[i-2]
        change_company = max(0, company_sizes[i-1] - company_sizes[i-2])
        salary_iter = salary_iter + 0.05*change_company + change_person*R_1 + np.random.normal(100, 50)
        # Truncate to int and floor at 500.
        salary_iter = max(int(salary_iter), 500)

        salaries.append(salary_iter)

    # The final salary is the target; the earlier ones are the feature column.
    # Both are divided by 1000.
    y = salaries[-1]/1000
    salaries = [_/1000 for _ in salaries[:-1]]

    # Transpose so that each row is one time step and each column one variable.
    return np.array([salaries, nb_persons, company_sizes]).T, y

In [None]:
# Select feature to predict

In [None]:
# Target series: the 'total_amount_claims' column of the loaded data frame.
y = data['total_amount_claims']
y.head()

In [None]:
# Convert feature to array

In [None]:
# Convert the pandas Series into a NumPy array (still one-dimensional here).
y = np.array(y)

In [None]:
# Shape before reshaping: one entry per row of the data frame.
y.shape

In [None]:
# Turn y into a single-column 2-D array, the layout the scaler and the
# column-wise slicing below operate on.
y = np.reshape(y, (-1, 1))

In [None]:
# Shape after reshaping: (number of rows, 1).
y.shape

In [None]:
# Plot y

In [None]:
# Raw target series in file order, before any lagging or scaling.
# Title and axis labels let the figure be read on its own.
plt.plot(y)
plt.title('Total amount of claims over time')
plt.xlabel('Row index')
plt.ylabel('total_amount_claims');

In [None]:
# We use lagged copies of y as the independent variables: the three previous values of y
# are the inputs used to predict the next value.

In [None]:
# Build three lagged feature columns: X1 holds the value three steps before
# the target, X2 two steps before, X3 one step before. The generator is fully
# consumed by the unpacking before y is overwritten on the next line.
X1, X2, X3 = (y[lag:len_data - 3 + lag, :] for lag in range(3))
# The target starts at the fourth observation so every row has three predecessors.
y = y[3:len_data, :]


In [None]:
# Element count of the lag-3 column; should equal len_data - 3.
X1.size

In [None]:
# Element count of the lag-2 column; should match X1.size.
X2.size

In [None]:
# Element count of the lag-1 column; should match X1.size.
X3.size

In [None]:
# Element count of the target; must match the three lag columns so rows line up.
y.size

In [None]:
# Place the three lag columns side by side: one row per sample, one column per lag.
X = np.hstack((X1, X2, X3))

In [None]:
# Inspect the feature matrix (columns: lag 3, lag 2, lag 1).
X

In [None]:
# Report the feature and target shapes, one per line.
print(f'X shape is {X.shape}', f'Y shape is {y.shape}', sep='\n')

In [None]:
# We rescale both the inputs X and the target y to the range [0, 1] (min-max scaling)

In [None]:
# LSTM needs 3 dimensional input so we have to reshape the X input into 3 dimensions.

In [None]:
# Scale the features with their own scaler. Previously a single scaler was
# fitted on X and then refitted on y, which discarded the X fit and made it
# impossible to transform new inputs later.
x_scaler = MinMaxScaler()
X = x_scaler.fit_transform(X)

# `scaler` stays bound to the target scaler (as it effectively was before),
# so scaler.inverse_transform maps targets and predictions back to the
# original scale.
scaler = MinMaxScaler()
y = scaler.fit_transform(y)

# NOTE(review): both scalers are fitted on the full series before the
# train/test split, so test-set minima/maxima influence the scaling.

# LSTM layers expect 3-D input: (samples, time steps, features) = (n, 1, 3).
X = np.reshape(X, (X.shape[0], 1, X.shape[1]))

In [None]:
# Shape after reshaping: (samples, 1, number of lag columns).
X.shape

In [None]:
# Inspect the scaled inputs in their 3-D layout.
X

In [None]:
# Define training and test sets for our model

In [None]:
# Chronological hold-out: the first 80% of the samples train the model and the
# last 20% test it. shuffle=False keeps later observations out of the training
# set and keeps y_test in time order for the plots below.
# (train_test_split is already imported in the imports cell at the top.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, shuffle=False)

In [None]:
# Report the shapes of the four split arrays, one per line.
print(
    f'X train size:{X_train.shape}',
    f'X test size:{X_test.shape}',
    f'y train size:{y_train.shape}',
    f'y test size:{y_test.shape}',
    sep='\n',
)

In [None]:
# model building

In [None]:
# Sequential model with a single 10-unit LSTM layer; each sample is one time
# step with three features (input_shape=(1, 3)).
# QUESTION (translated from the original note): should return_sequences be applied or not?
model = Sequential([
    LSTM(
        units=10,
        activation='tanh',
        recurrent_activation='hard_sigmoid',
        input_shape=(1, 3),
    ),
])

In [None]:
# output layer

In [None]:
# Single-unit dense output layer: one predicted (scaled) value per sample.
model.add(Dense(1))

In [None]:
# We train with mean squared error as the loss and track mean absolute error as the evaluation metric; here we compile the model and fit it to the training set

In [None]:
# Compile with MSE as the optimisation target and MAE as the reported metric,
# then train for 100 epochs on the training split.
model.compile(
    optimizer='rmsprop',
    loss='mean_squared_error',
    metrics=['mae'],
)
model.fit(X_train, y_train, epochs=100, verbose=1)


In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# evaluate() returns the loss followed by the compiled metrics, so with
# loss='mean_squared_error' and metrics=['mae'] index 0 is the MSE and
# index 1 is the MAE. Both are on the scaled targets.
model_mae = model.evaluate(X_test, y_test, verbose=0)
print(f'Model Mean Absolute Error {model_mae[1]:.4f}')

In [None]:
# Predictions for the test inputs, on the same scaled range as y_test.
Predict = model.predict(X_test)

In [None]:
# Let’s now have a visualization comparing the prediction versus the test set to see how our model performed

In [None]:
# Overlay the held-out targets and the model predictions (both still on the
# scaled range). Title and axis labels let the figure be read on its own.
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(y_test, label='Test')
ax.plot(Predict, label='Prediction')
ax.set(
    title='Test set vs. LSTM prediction',
    xlabel='Test sample index',
    ylabel='Scaled total_amount_claims',
)
ax.legend(loc='best')
plt.show()

In [None]:
# Let’s now add time indexes and scale back to the original scale

In [None]:
# y_train = scaler.inverse_transform(y_train)
# y_train = pd.DataFrame(y_train)
# y_train.index = pd.to_datetime(data.iloc[3:736,0])
# y_train

In [None]:
# y_test = scaler.inverse_transform(y_test)
# y_test = pd.DataFrame(y_test)
# y_test.index = pd.to_datetime(data.iloc[736:,0])
# y_test

In [None]:
# Predict = model.predict(X_test)
# Predict = scaler.inverse_transform(Predict)
# Predict = pd.DataFrame(Predict)
# Predict.index=pd.to_datetime(data.iloc[736:,0])
# Predict

In [None]:
# plt.figure(figsize=(15,10))
# plt.plot(y_test)
# plt.plot(Predict)
# plt.show()

In [None]:
# Create a data frame comparing y in the test set with the model's prediction.
# The previous version referenced `test`, `forecast_test` and `merged_df`,
# none of which exist in this notebook, so the cell raised NameError.
# `scaler` was last fitted on y, so inverse_transform brings both columns back
# to the original claim-amount scale. No date column ('ds') is attached here.
test_vs_pred_df = pd.DataFrame({
    'y': scaler.inverse_transform(y_test).ravel(),
    'yhat': scaler.inverse_transform(Predict).ravel(),
})
test_vs_pred_df['absolute_error'] = (test_vs_pred_df['y'] - test_vs_pred_df['yhat']).abs()
test_vs_pred_df