In [258]:
# CARGAMOS LIBRERIAS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
import datetime

import boto3
import s3fs
import fastparquet
import awswrangler as wr
import os

import pyarrow.parquet as pq
import pyarrow as pa

import requests
import json
from io import BytesIO
from pandas.tseries.offsets import BDay

import math

In [259]:
# INICIALIZAMOS UNA SESION EN AWS
session = boto3.Session(
    aws_access_key_id=os.environ['S3_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_SECRET_KEY'],
    region_name=os.environ['S3_REGION'])

s3_client = boto3.client('s3')
ibex_historical_data = {}
bucket_name = 'stock-market-historical-data'
raw_folder = 'raw/marketstack'
pre_folder = 'preprocessed/marketstack/'
pro_folder = 'processed/marketstack/'

OBJETIVO DE ESTE NOTEBOOK:
- DESCARGAR DE DISTINTOS ORIGENES LOS DATOS DE LOS ACTIVOS DEL IBEX35
- COMPARAR LOS DATOS OBTENIDOS 

## CARGA DE COMPOSICIÓN HISTÓRICA DE IBEX35

In [260]:
# LEEMOS EL FICHERO CON LA COMPOSICIÓN HISTORICA
ibex_constituents = pd.read_excel('data_processing/Historical Composition IBEX35.xlsx')
ibex_constituents.head()

Unnamed: 0,Activo,Inclusion,Exclusion
0,ACE,1991-01-02,2003-06-02
1,ACX,1991-01-02,1991-07-01
2,ALB,1991-01-02,1991-07-01
3,ASL,1991-01-02,1994-01-03
4,BBV,1991-01-02,2000-01-31


In [261]:
# ibex_constituents['Exclusion'] -= BDay(1)

# ibex_constituents['Inclusion'] = ibex_constituents['Inclusion'].dt.date
# ibex_constituents['Exclusion'] = ibex_constituents['Exclusion'].dt.date

# OBTENEMOS EL DIA ACTUAL
today = datetime.date.today().strftime('%Y-%m-%d')

# COMPLETAMOS LOS DATOS NULOS EN EL FICHERO DE COMPOSICIÓN HISTORICA CON LA FECHA ACTUAL
ibex_constituents.fillna(today,inplace=True)

# DEFINIMOS UNA VARIABLE PARA TODOS LOS ACTIVOS
activos = ibex_constituents.Activo
ibex_constituents['Ticker'] = activos.apply(lambda x: x.split('_')[0])

unique_activos = list(set(activos))

In [262]:
ibex_constituents.Ticker.replace('RAD','EZE',inplace=True)
ibex_constituents.Ticker.replace('PUL','EBRO',inplace=True)
ibex_constituents.Ticker.replace('EVA','EBRO',inplace=True)
ibex_constituents.Ticker.replace('EBA','EBRO',inplace=True)
ibex_constituents.Ticker.replace('BBV','BBVA',inplace=True)
ibex_constituents.Ticker.replace('BBVAA','BBVA',inplace=True)

In [263]:
ibex_constituents[ibex_constituents['Ticker'] == 'EZE']

Unnamed: 0,Activo,Inclusion,Exclusion,Ticker
78,RAD,1999-04-19,1999-07-01,EZE


In [264]:
current_constituents = ibex_constituents.loc[ibex_constituents['Exclusion'] == today]
current_constituents

Unnamed: 0,Activo,Inclusion,Exclusion,Ticker
5,BKT,1991-01-02,2023-08-27,BKT
18,IBE,1991-01-02,2023-08-27,IBE
21,REP,1991-01-02,2023-08-27,REP
25,TEF,1991-01-02,2023-08-27,TEF
71,ACS,1998-04-02,2023-08-27,ACS
79,FER,1999-07-01,2023-08-27,FER
80,IDR,1999-07-01,2023-08-27,IDR
85,BBVA,2000-01-31,2023-08-27,BBVA
95,ITX,2001-07-02,2023-08-27,ITX
96,SAN_1,2001-11-01,2023-08-27,SAN


## DESCARGA DE DATOS CON API DE MARKETSTACK 

In [265]:
def get_exchanges_data(api_token):
    # OBTENEMOS TODOS LOS EXCHANGES
    url = f'http://api.marketstack.com/v1/'
    exchange_list = 'exchanges'

    r = requests.get(
                url+exchange_list,
                params={
                    'access_key':api_token
                }
            )

    exchanges = pd.DataFrame(json.load(BytesIO(r.content))['data'])
    return exchanges

In [96]:
spain_ex_code = exchanges.loc[exchanges['country']=='Spain','mic'].values[0]
spain_ex_code

NameError: name 'exchanges' is not defined

In [266]:
def get_stock_list(api_token, exchange_code):
    
    url = f'http://api.marketstack.com/v1/'
    symbol_list = 'tickers'

    r = requests.get(url+symbol_list,
                    params={
                        'access_key': api_token,
                        'exchange': exchange_code,
                        'limit':1000
                        }
                    )

    symbols = pd.DataFrame(json.load(BytesIO(r.content))['data'])
    return symbols

In [267]:
def get_eod_data(api_token, exchange_code, ticker, start_date, end_date, limit=1000):
    url = f'http://api.marketstack.com/v1/'
    eod_data = 'eod'
    mstack_ticker = ticker+'.'+exchange_code
    r = requests.get(url+eod_data,
                            params={
                                'access_key': api_token,
                                'exchange': exchange_code,
                                'symbols': mstack_ticker,
                                'date_from': start_date,
                                'date_to': end_date,
                                'limit': limit
                                }
                            )
    ticker_data = pd.DataFrame(json.load(BytesIO(r.content))['data'])
    ticker_data.set_index('date', inplace=True)

    return ticker_data

In [268]:
def upload_parquet_to_s3(bucket_name, file_name, folder_name, dataframe):
    full_path = folder_name + file_name + '.parquet'
    s3 = boto3.resource('s3')
    table = pa.Table.from_pandas(dataframe)
    buf = BytesIO()
    pq.write_table(table, buf)
    s3.Object(bucket_name, full_path).put(Body=buf.getvalue())

In [None]:
failed_loads = []

indexes = current_constituents.index

for i in indexes:

    try:
        ticker = ibex_constituents.Ticker[i]
        start_date = ibex_constituents.Inclusion[i]
        end_date = ibex_constituents.Exclusion[i]
        
        file_name = ibex_constituents.Activo[i]

        total_days = (end_date.to_period('D') - start_date.to_period('D')).n/1000

        stock_data = pd.DataFrame()

        for _ in range(math.ceil(total_days)):
            try:
                prov_end_date = start_date + datetime.timedelta(days=1000)
                if prov_end_date > end_date:
                    ticker_data = get_eod_data(
                                        API_TOKEN,
                                        'BMEX',
                                        ticker,
                                        start_date,
                                        end_date
                                        )
                else:
                    ticker_data = get_eod_data(
                                        API_TOKEN,
                                        'BMEX',
                                        ticker,
                                        start_date,
                                        prov_end_date
                                        )

                stock_data = pd.concat([stock_data, ticker_data], axis=0)
                start_date = prov_end_date + datetime.timedelta(days=1)
            except:
                start_date = prov_end_date + datetime.timedelta(days=1)
                continue
        print(stock_data)
        # upload_parquet_to_s3(bucket_name, file_name, pre_folder, stock_data)
        print(ibex_constituents.Activo[i])
    except:
        print('failed')
        failed_loads.append(ticker)
        continue

In [269]:
def get_file_list(bucket_name, folder_name):
        s3 = boto3.resource('s3')
        folder_path = bucket_name + folder_name
        objects_dict = s3_client.list_objects_v2(
                Bucket=bucket_name,
                Prefix=folder_name)
        filepaths = [item['Key'] for item in objects_dict['Contents'] if item['Key'].endswith('.parquet')]
        act_list = [i.split('/')[-1].replace('.parquet','') for i in filepaths]
        return filepaths, act_list

In [270]:
current_constituents_filepath = [pre_folder + act + '.parquet' for act in current_constituents.Activo]

In [271]:
def download_s3_parquet_file(s3, bucket, key):
    buffer = BytesIO()
    s3.Object(bucket, key).download_fileobj(buffer)
    return buffer

In [272]:
# REALIZAMOS LA DESCARGA DE LOS DATOS DE MARKETSTACK Y LOS CARGAMOS EN UN BUCKET DE S3
failed_loads = []
act_list = current_constituents.Activo.values
for i in range(len(current_constituents_filepath)):
    act_data = pq.read_table(download_s3_parquet_file(s3, bucket_name, current_constituents_filepath[i])).to_pandas()
    act_data.index = pd.to_datetime(act_data.index)
    act_data.sort_index(inplace=True)
    date = act_data.index[-1] + datetime.timedelta(days=1)
    if (pd.to_datetime(today, utc=True) - date).days > 1:
        new_data = get_eod_data(
                            API_TOKEN,
                            'BMEX',
                            current_constituents.Ticker.iloc[i],
                            date.strftime('%Y-%m-%d'),
                            today
                            )
        new_data.index = pd.to_datetime(new_data.index)
        act_data = pd.DataFrame(act_data)
        act_data = pd.concat([act_data,new_data], axis=0)
        act_data.sort_index(inplace=True)
    
    else:    
        continue
    print(current_constituents_filepath[i])
    upload_parquet_to_s3(bucket_name, act_list[i], pre_folder, act_data)

preprocessed/marketstack/BKT.parquet
preprocessed/marketstack/IBE.parquet
preprocessed/marketstack/REP.parquet
preprocessed/marketstack/TEF.parquet
preprocessed/marketstack/ACS.parquet
preprocessed/marketstack/FER.parquet
preprocessed/marketstack/IDR.parquet
preprocessed/marketstack/BBVA.parquet
preprocessed/marketstack/ITX.parquet
preprocessed/marketstack/SAN_1.parquet
preprocessed/marketstack/ENG.parquet
preprocessed/marketstack/SAB.parquet
preprocessed/marketstack/MAP_2.parquet
preprocessed/marketstack/GRF.parquet
preprocessed/marketstack/MTS.parquet
preprocessed/marketstack/AMS_1.parquet
preprocessed/marketstack/IAG.parquet
preprocessed/marketstack/CABK.parquet
preprocessed/marketstack/SCYR_2.parquet
preprocessed/marketstack/ELE_1.parquet
preprocessed/marketstack/ACX_2.parquet
preprocessed/marketstack/AENA.parquet
preprocessed/marketstack/ANA_2.parquet
preprocessed/marketstack/MRL.parquet
preprocessed/marketstack/CLNX.parquet
preprocessed/marketstack/MEL.parquet
preprocessed/market

In [273]:
pre_filepaths, act_list = get_file_list(bucket_name, pre_folder)

In [274]:
# CARGAMOS EL DATAFRAME CON TODOS LOS DATOS DE CIERRE DE TODOS LOS ACTIVOS DESDE 02/01/1991
ibex_historical_data = {}
pro_folder = 'processed/marketstack/'
file_name = 'ibex_historical_data'

for i in range(len(pre_filepaths)):

    try:
        close_price = pq.read_table(download_s3_parquet_file(s3, bucket_name, pre_filepaths[i])).to_pandas()['adj_close']
        close_price.index = pd.to_datetime(close_price.index).strftime('%Y-%m-%d')
        ibex_historical_data[act_list[i]] = close_price
    except:
        ibex_historical_data[act_list[i]] = np.nan

ibex_historical_data = pd.DataFrame(ibex_historical_data)
upload_parquet_to_s3(bucket_name, file_name, pro_folder, ibex_historical_data)

In [275]:
ibex_historical_data

Unnamed: 0_level_0,A3TV,ABE,ABG.P,ABG.P_1,ABG,ACE,ACR,ACS,ACX,ACX_1,...,URB,VAL,VAL_1,VDR,VIS,VIS_1,VIS_2,VIS_3,ZEL,ZOT
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1993-08-16,,,,,,,,,,1.2025,...,,,,,2.9683,,,,,0.4146
1993-08-17,,,,,,,,,,1.2035,...,,,,,3.0440,,,,,0.4307
1993-08-18,,,,,,,,,,1.2010,...,,,,,3.1276,,,,,0.4421
1993-08-19,,,,,,,,,,1.1961,...,,,,,3.0608,,,,,0.4421
1993-08-20,,,,,,,,,,1.1877,...,,,,,3.0440,,,,,0.4452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-21,,,,,,,,31.31,,,...,,,,,,,,,,
2023-08-22,,,,,,,,31.74,,,...,,,,,,,,,,
2023-08-23,,,,,,,,31.75,,,...,,,,,,,,,,
2023-08-24,,,,,,,,31.92,,,...,,,,,,,,,,


In [276]:
failed_loads = ibex_historical_data.columns[ibex_historical_data.isnull().values.all(axis=0)]
failed_loads, len(failed_loads)

(Index(['A3TV', 'ABE', 'ABG.P', 'ABG.P_1', 'ACE', 'ACR', 'ACX', 'AGR', 'AGS',
        'AGS_1', 'AGS_2', 'AGS_3', 'ALB', 'ALB_1', 'ALT', 'AMS', 'ARA', 'ARG',
        'ASL', 'AUM', 'AZC', 'AZC_1', 'BCH', 'BTO', 'BTO_1', 'BTO_2', 'CAN',
        'CAR', 'CEN', 'CEP', 'CEP_1', 'CIN', 'CRF', 'CRI', 'CRI_1', 'CRI_2',
        'CTE', 'CTF', 'CTG', 'CUB', 'DRC', 'EBRO_1', 'ECR', 'ELE', 'EXT', 'FAD',
        'FEC', 'FOC', 'GAM', 'GAM_1', 'GAS', 'GES', 'GES_1', 'GPP', 'HHU',
        'HHU_1', 'HID', 'HIS', 'IBLA', 'IBR', 'JAZ', 'LOR', 'MVC', 'MVC_1',
        'OHLA', 'PMD', 'POP', 'PRY', 'SAR', 'SCH', 'SEV', 'SGC', 'SOL', 'SYV',
        'SYV_1', 'TAB', 'TEM', 'TPI', 'TPZ', 'TRR', 'TRR_1', 'UNF', 'UNI',
        'URA', 'URB', 'VAL', 'VAL_1', 'VDR', 'ZEL'],
       dtype='object'),
 89)