In [11]:
# LIBRERIAS

import os
import tensorflow as tf
import tensorflow.keras as keras
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from sklearn.model_selection import TimeSeriesSplit
import math
from sklearn.metrics import mean_squared_error
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import GridSearchCV
import kerastuner as kt



tf.config.list_physical_devices('GPU') 



[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [55]:
# Lectura de ficheros con los datasets de las empresas a analizar

companies = {}
ficheros = os.listdir('data')

for file in ficheros:
    company = file[:-4]
    dfcompany = pd.read_csv(f"data/{file}")
    companies[company] = dfcompany




In [56]:
# Tomamos datos de amazon para pruebas con el modelo.
dfcompany = companies['MSFT']
dfcompany

Unnamed: 0,symbol,date,close,high,low,open,volume,adjClose,adjHigh,adjLow,adjOpen,adjVolume,divCash,splitFactor
0,MSFT,2017-05-22 00:00:00+00:00,68.45,68.50,67.50,67.890,15484530,64.097262,64.144083,63.207673,63.572873,15484530,0.00,1.0
1,MSFT,2017-05-23 00:00:00+00:00,68.68,68.75,68.38,68.720,15347877,64.312636,64.378185,64.031713,64.350093,15347877,0.00,1.0
2,MSFT,2017-05-24 00:00:00+00:00,68.77,68.88,68.45,68.870,14422965,64.396913,64.499918,64.097262,64.490554,14422965,0.00,1.0
3,MSFT,2017-05-25 00:00:00+00:00,69.62,69.88,68.91,68.970,21702912,65.192862,65.436328,64.528011,64.584195,21702912,0.00,1.0
4,MSFT,2017-05-26 00:00:00+00:00,69.96,70.22,69.52,69.800,19644260,65.511241,65.754708,65.099221,65.361416,19644260,0.00,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,MSFT,2022-05-13 00:00:00+00:00,261.12,263.04,255.35,257.350,34925093,260.484372,262.399698,254.728418,256.723549,34925093,0.00,1.0
1255,MSFT,2022-05-16 00:00:00+00:00,261.50,265.82,255.78,259.955,32550933,260.863447,265.172931,255.157371,259.322208,32550933,0.00,1.0
1256,MSFT,2022-05-17 00:00:00+00:00,266.82,268.33,262.46,266.110,28828799,266.170497,267.676821,261.821110,265.462225,28828799,0.00,1.0
1257,MSFT,2022-05-18 00:00:00+00:00,254.08,263.60,252.77,263.000,31355985,254.080000,263.600000,252.770000,263.000000,31355985,0.62,1.0


In [14]:
# Crreación de una matriz de datos de para la red neuronal
def create_dataset(dataset, look_back):
	x_data, y_data = [], []
	for i in range(len(dataset)-look_back-1):
		x = dataset[i:(i+look_back), 0]
		x_data.append(x)
		y = dataset[i + look_back, 0]
		y_data.append(y)
        
	return np.array(x_data), np.array(y_data)

def graficar(dfcompany, company):
	plt.plot(dfcompany)
	plt.xlabel('Days')
	plt.ylabel('Price')
	plt.title(label=company)

# Graficar el resultado
def graficarResultado(dfcompany, look_back, train_predict, test_predict):
    # Ajustamos los datos para mostrar la predicción
    trainPredictPlot = np.empty_like(dfcompany)
    trainPredictPlot[:, :] = np.nan
    trainPredictPlot[look_back:len(train_predict)+look_back,:] = train_predict
    
    testPredictPlot = np.empty_like(dfcompany)
    testPredictPlot[:, :] = np.nan
    testPredictPlot[len(train_predict)+(look_back*2)+1:len(dfcompany)-1, :] = test_predict

    # Mostrar las los datos originales y la predicción
    plt.plot(scaler.inverse_transform(dfcompany), label='Original data')
    plt.plot(trainPredictPlot, label='Train prediction')
    plt.plot(testPredictPlot, label='Test prediction')
    plt.legend()
    
    plt.xlabel('Days')
    plt.ylabel('Price')
    plt.show()
	
#Separar datos de entrenamiento y test
def split(dfcompany, look_back, percent=70):
    training_size = int(len(dfcompany)*percent) 
    test_size = len(dfcompany)-training_size 

    train_data, test_data = dfcompany[0:training_size,:], dfcompany[training_size:len(dfcompany),:1]


    #Creamos la matrix para los datos de entrenamiento y test    
    X_train, y_train = create_dataset(train_data, look_back) #(,) (,)
    X_test, y_test = create_dataset(test_data, look_back) #(,) (,)
    #print(f"La matriz X_train tiene la forma: {X_train.shape}")
    #print(f"La matriz y_train tiene la forma: {y_train.shape}")
    #print(f"La matriz X_test tiene la forma: {X_test.shape}")
    #print(f"La matriz y_test tiene la forma: {y_test.shape}")
    

    # Hay que hacer reshape a las entradas de la red LSTM añadiendo 1 dimension [samples, look_back, features]
    X_train = X_train.reshape(X_train.shape[0],X_train.shape[1] , 1)
    X_test = X_test.reshape(X_test.shape[0],X_test.shape[1] , 1)


    return X_train, X_test, y_train, y_test



In [15]:
def model_builder(hp):
  '''
  Args:
    hp - Keras tuner object
  '''
  # Initialize the Sequential API and start stacking the layers
  model = keras.Sequential()

  # Tune the number of units in the first LSTM layer
  # Choose an optimal value between 10-100
  hp_units0 = hp.Int('units0', min_value=10, max_value=100, step=20)
  model.add(LSTM(units=hp_units0, input_shape=(look_back,1), return_sequences=True, name='LSTM1'))

  # Tune the number of units in the second LSTM layer
  # Choose an optimal value between 10-100
  hp_units1 = hp.Int('units1', min_value=10, max_value=100, step=20)
  model.add(LSTM(units=hp_units1, name='LSTM2'))
  
  # Add next layers
  model.add(keras.layers.Dropout(0.2))
  model.add(Dense(1, activation='linear'))

  # Tune the learning rate for the optimizer
  # Choose an optimal value from 0.01, 0.001, or 0.0001
  hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

  model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                loss='mean_squared_error',
                metrics=['mse'])

  return model

In [16]:
def prediccion (model, X_train, X_test, y_train, y_test, look_back, scaler):
    # Realizar predicción sobre datos de entranamiento y test
    train_predict = model.predict(X_train)
    test_predict = model.predict(X_test)

    # Transformamos los datos a su escala original
    train_predict = scaler.inverse_transform(train_predict)
    test_predict = scaler.inverse_transform(test_predict)

    return  train_predict, test_predict

In [17]:
# Definimos cuantos días usará el modelo para predecir el día siguiente
look_back = 50

# Instanciamos el tuner
tuner = kt.Hyperband(model_builder, # the hypermodel
                    objective='val_loss', # objective to optimize
                    max_epochs=10,
                    factor=3, # the denominator of the number of models trained in every iteration of the training loop.
                    directory='keras_tuner_dir', # directory to save logs 
                    project_name='Stock_prediction')

# hypertuning settings
tuner.search_space_summary() 

stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss',mode='min', patience=5, min_delta=0.2)

INFO:tensorflow:Reloading Oracle from existing project keras_tuner_dir\Stock_prediction\oracle.json
INFO:tensorflow:Reloading Tuner from keras_tuner_dir\Stock_prediction\tuner0.json
Search space summary
Default search space size: 3
units0 (Int)
{'default': None, 'conditions': [], 'min_value': 10, 'max_value': 100, 'step': 20, 'sampling': None}
units1 (Int)
{'default': None, 'conditions': [], 'min_value': 10, 'max_value': 100, 'step': 20, 'sampling': None}
learning_rate (Choice)
{'default': 0.01, 'conditions': [], 'values': [0.01, 0.001, 0.0001], 'ordered': True}


In [18]:
def generarHipermodelo(X_train, y_train, X_test, y_test): 


    # Perform hypertuning
    tuner.search(X_train, y_train, epochs=10, validation_data=(X_test,y_test), callbacks=[stop_early]) 

    best_hp = tuner.get_best_hyperparameters()[0]

    # Build the model with the optimal hyperparameters
    h_model = tuner.hypermodel.build(best_hp)
    h_model.summary()
    print(tuner.get_best_hyperparameters()[0].get('learning_rate'))

    return h_model




In [19]:
def generarResultados(dfcompany):
    #dfcompany = companies[company]
    # Por cada empresa, preprocesar, entrenar y guardar resultados
    company = dfcompany.iloc[0,0]

    # Descartamos todas las columnas y nos quedamos unicamente con el precio de cierre de la acción
    dfcompany = dfcompany.reset_index()['close']
    scaler=MinMaxScaler(feature_range=(0,1))
    dfcompany = scaler.fit_transform(np.array(dfcompany).reshape(-1,1))
    X_train, X_test, y_train, y_test = split(dfcompany, look_back, percent=0.90)

    h_model = generarHipermodelo(X_train, y_train, X_test, y_test)
    #Entrenamiento del modelo mejorado
    history = h_model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2, callbacks=[stop_early])

    train_predict, test_predict = prediccion(h_model, X_train, X_test, y_train, y_test, look_back, scaler)

    # Calculamos el error en entrenamiento y test
    train_error = math.sqrt(mean_squared_error(y_train, train_predict))
    test_error = math.sqrt(mean_squared_error(y_test, test_predict))

    print(f"Para la empresa {company}")
    print(f"Error entrenamiento: {train_error}")
    print(f"Error test: {test_error}")

    #graficarResultado(dfcompany, look_back, train_predict, test_predict)
    return (train_error, test_error)

In [20]:
dictErrores = {}
for company in companies:
    #if company == 'AMZN':
    dfcompany = companies[company]
    dictErrores[company] = generarResultados(dfcompany)
    


INFO:tensorflow:Oracle triggered exit
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
LSTM1 (LSTM)                 (None, 50, 90)            33120     
_________________________________________________________________
LSTM2 (LSTM)                 (None, 30)                14520     
_________________________________________________________________
dropout_3 (Dropout)          (None, 30)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 31        
Total params: 47,671
Trainable params: 47,671
Non-trainable params: 0
_________________________________________________________________
0.01
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Para la empresa AAL
Error entrenamiento: 30.911626933506206
Error test: 16.366827848965627
INFO:tensorflow:Oracle triggered exit
Model: "sequential_4"
____

In [22]:
for company in dictErrores:
    print (company+ " " + str(dictErrores[company]))

AAL (30.911626933506206, 16.366827848965627)
AAPL (198.29751825467324, 160.4224155134145)
ADBE (376.10239259414783, 508.578556096042)
ADI (117.1349554593199, 164.310177831009)
ADP (152.10304053176407, 208.99649823436908)
ADSK (200.68725433174737, 234.57341373841163)
AKAM (86.56531602202334, 111.51790806682281)
ALXN (120.45144300243784, 173.6457067651963)
AMAT (71.28548147753048, 135.93596449118598)
AMGN (211.26616313535487, 233.46382798304145)
AMZN (2224.327610946917, 3023.24878001725)
ATVI (68.37005503810654, 75.70774049019978)
AVGO (334.8565459650972, 629.9005142123067)
BBBY (18.96616731298254, 17.571189938569024)
BIDU (187.9069866797932, 151.02102707213803)
BIIB (287.7262220881447, 221.0666917287631)
BMRN (86.45365521032963, 83.03877065641163)
CERN (68.73573770251163, 88.30555999571565)
CHKP (112.96380217099643, 128.7119182028408)
CHTR (477.09079857611596, 575.0555313480414)
CMCSA (43.097475066038186, 47.54928545522254)
COST (293.05044616491205, 530.0474921402789)
CSCO (46.093979964

In [50]:
errors = pd.DataFrame(list(dictErrores.values()))
#print(errors)
train_errors = errors[0]
test_errors = errors[1]


print (f"La media del RMSE obtenido en entrenamiento entrenando cada empresa por separado es {np.mean(train_errors)}")
print (f"La media del RMSE obtenido en test entrenando cada empresa por separado es {np.mean(test_errors)}")

La media del RMSE obtenido en entrenamiento entrenando cada empresa por separado es 191.1306885602477
La media del RMSE obtenido en test entrenando cada empresa por separado es 246.94510698790893


In [51]:
print(len(train_errors))

77
