In [4]:
import os
import datetime
import dateutil
import tarfile
import pandas as pd
import pandas.io.sql as sqlio
import requests
import re
import glob
import numpy as np

%matplotlib inline
from matplotlib import pylab as plt

In [53]:
!mkdir -p bourso/
!rm -rf bourso/20*
stream = requests.get('https://www.lrde.epita.fr/~ricou/pybd/projet/bourso.tgz', stream=True)
tarfile.open(fileobj=stream.raw, mode='r|gz').extractall('bourso/') # try 'r:gz' if there is an error

In [7]:
def clean_last(df):
    """Convert the raw ``last`` column into a list of floats.

    Each value is turned into a string, a trailing one-character marker in
    parentheses such as ``(c)`` or ``(s)`` is cut off, spaces are removed and
    the decimal comma is replaced by a dot before the float conversion.

    Returns a plain list with one float per entry of ``df["last"]``.
    """
    cleaned = []
    for raw in df["last"]:
        # keep only what precedes the trailing "(x)" marker, if there is one
        text = re.split('\\(.\\)$', str(raw))[0]
        text = text.replace(' ', '')
        text = text.replace(',', '.')
        cleaned.append(float(text))
    return cleaned

def read_bourso_year(year):
    """Load every Boursorama snapshot of one year into a single frame.

    Reads the pickled snapshots matching ``bourso/<year>/compA*`` and
    ``bourso/<year>/compB*``. The timestamp of a snapshot is parsed from its
    file name (the text between ``'compA '`` / ``'compB '`` and ``'.bz2'``).

    Parameters
    ----------
    year : str or int
        Name of the sub-directory of ``bourso/`` to read, e.g. ``'2024'``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(symbol, date)`` and sorted on that index. ``last`` is
        converted to float by ``clean_last`` and a ``delta`` column is added:
        the increase of ``volume`` since the previous row of the same symbol.
        Negative or missing differences become 0, and the first row of each
        symbol gets the first non-null ``volume`` of that symbol.

    Raises
    ------
    ValueError
        If neither a compA nor a compB snapshot exists for ``year``.
    """
    year = str(year)

    groups = []
    for prefix in ('compA', 'compB'):
        # NOTE(security): read_pickle can execute arbitrary code while loading;
        # only read snapshots that come from a trusted source.
        snapshots = {
            dateutil.parser.parse(path.split(prefix + ' ')[1].split('.bz2')[0]): pd.read_pickle(path)
            for path in glob.glob('bourso/' + year + '/' + prefix + '*')
        }
        # A component without any file is skipped instead of aborting the load.
        if snapshots:
            groups.append(pd.concat(snapshots))
    if not groups:
        raise ValueError("no compA/compB snapshot found under 'bourso/" + year + "/'")

    merge = pd.concat(groups)
    merge['last'] = clean_last(merge)
    # Level 0 is the snapshot timestamp (the concat key); level 1 is the
    # snapshots' own index, which is discarded.
    merge = merge.reset_index(level=1, drop=True).rename_axis('date', axis=0)
    # drop_duplicates only compares columns, so the date is moved into the
    # columns before de-duplicating and back into the index afterwards.
    merge = (
        merge.reset_index()
        .drop_duplicates()
        .set_index(['symbol', 'date'])
        .sort_index()
    )

    # delta is the volume (number of stocks sold) per entry, whereas volume is
    # cumulative per day. One grouped diff replaces a Python loop per symbol.
    per_symbol = merge.groupby(level='symbol')
    delta = per_symbol['volume'].diff()
    # Holes from start-of-day data (negative diff) and missing data count as 0.
    delta = delta.clip(lower=0).fillna(0)
    # The first entry of each symbol has no predecessor: use its volume instead of 0.
    is_first_row = (per_symbol.cumcount() == 0).to_numpy()
    first_volume = per_symbol['volume'].transform('first').to_numpy()
    merge['delta'] = np.where(is_first_row, first_volume, delta.to_numpy())

    return merge

# Load the snapshots stored under bourso/2024/ and preview the first rows.
test = read_bourso_year('2024')
test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,last,volume,name,delta
symbol,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1rAAF,2024-01-15 09:02:01.771435,1.417,0,AIR FRANCE - KLM,0.0
1rAAF,2024-01-15 09:12:01.699818,1.417,0,AIR FRANCE - KLM,0.0
1rAAF,2024-01-15 09:22:01.752884,1.417,0,AIR FRANCE - KLM,0.0
1rAAF,2024-01-15 09:32:01.376396,1.417,0,AIR FRANCE - KLM,0.0
1rAAF,2024-01-15 09:42:01.728177,1.417,0,AIR FRANCE - KLM,0.0


In [30]:
def convert_bourso_daily(df):
    """Aggregate an intraday Boursorama frame into one row per symbol and day.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``symbol`` and ``date`` index levels and the columns
        ``last``, ``volume``, ``name`` and ``delta``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(symbol, calendar day)`` (the day level is unnamed) with
        the columns ``last``, ``volume``, ``name`` and ``turnover``.
    """
    # assign() returns a new frame, so the caller's frame does not silently
    # gain a 'turnover' column.
    with_turnover = df.assign(turnover=df['last'] * df['delta'])
    day = with_turnover.index.get_level_values('date').date
    df_daily = with_turnover.groupby(['symbol', day]).agg({
        'last': 'last',      # Last entry of the day
        'volume': 'max',     # Maximum volume of the day
        'name': 'first',     # First name entry of the day
        'turnover': 'sum'    # Sum of all turnovers in that day
    })
    return df_daily

# Aggregate the intraday frame `test` per symbol and calendar day and display the result.
tmp = convert_bourso_daily(test)
tmp

Unnamed: 0_level_0,Unnamed: 1_level_0,last,volume,name,turnover
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1rAAF,2022-01-03,4.356,0,AIR FRANCE - KLM,0.000
1rAAF,2022-01-04,4.356,0,AIR FRANCE - KLM,0.000
1rAAF,2022-01-05,4.356,0,AIR FRANCE - KLM,0.000
1rAAF,2022-01-06,4.356,0,AIR FRANCE - KLM,0.000
1rAAF,2022-01-07,4.356,0,AIR FRANCE - KLM,0.000
...,...,...,...,...,...
INEDS,2022-06-09,0.177,0,FONCIERE INEA DS,0.000
INEDS,2022-06-10,0.177,0,FONCIERE INEA DS,0.000
INEDS,2022-06-13,0.177,0,FONCIERE INEA DS,0.000
INEDS,2022-06-14,0.000,476037,FONCIERE INEA DS,41.680


# Euronext

In [4]:
!mkdir -p euronext/
!rm -rf euronext/
stream = requests.get('https://www.lrde.epita.fr/~ricou/pybd/projet/euronext.tgz', stream=True)
tarfile.open(fileobj=stream.raw, mode='r|gz').extractall('euronext/') # try 'r:gz' if there is an error

In [31]:
# Starting at some point in 2022 the column names were changed.
# This mapping is passed to DataFrame.rename: each key is a header to look
# for and each value is the header it is renamed to, so that all files end up
# with the same column names.
# NOTE(review): the lowercase "low Price" / "last Price" / "last Trade MIC Time"
# keys presumably mirror the exact header spelling in the files — confirm
# before "fixing" the capitalisation.
rename_dict = {
    "Open Price": "Open",
    "High Price": "High",
    "low Price": "Low",
    "last Price": "Last",
    "last Trade MIC Time":"Last Date/Time",
    "Currency":"Trading Currency"
}

# Note: some entries in Open, High, Low and Last are just set to '-'.
# The currency can apparently also be set to 0.

def regularize_euronext_columns(df):
    """Bring one Euronext frame to the common column layout.

    The headers listed in ``rename_dict`` are renamed. When a
    ``Closing Price`` / ``Closing Price DateTime`` column is present, it fills
    the missing values of ``Last`` / ``Last Date/Time`` (anything still
    missing becomes 0) and is then dropped.

    The frame is modified in place and also returned.
    """
    df.rename(columns=rename_dict, inplace=True)

    # (fallback column, column whose gaps it fills), handled in this order
    fallbacks = (
        ('Closing Price', 'Last'),
        ('Closing Price DateTime', 'Last Date/Time'),
    )
    for fallback, target in fallbacks:
        if fallback not in df.columns:
            continue
        filled = df[target].fillna(df[fallback])
        df[target] = filled.fillna(0)
        df.drop(columns=[fallback], inplace=True)

    return df

def read_euronext_df(path):
    """Read one Euronext export into a DataFrame.

    A path ending in ``.csv`` is parsed as tab-separated text; any other path
    is handed to ``pd.read_excel``.
    """
    is_csv = path.endswith(".csv")
    return pd.read_csv(path, delimiter='\t') if is_csv else pd.read_excel(path)

def dash_to_zero(df):
    """Replace the '-' placeholder by 0 in the Volume and Turnover columns.

    Other columns are left untouched. The frame is modified in place and also
    returned.
    """
    numeric_columns = ['Volume', 'Turnover']
    without_dashes = df[numeric_columns].replace('-', 0)
    df[numeric_columns] = without_dashes
    return df

def columns_to_numbers(df):
    """Convert the Last, Volume and Turnover columns to numbers.

    ``Last`` and ``Turnover`` become floats rounded to two decimals and
    ``Volume`` becomes int. The frame is modified in place and also returned.
    """
    def _two_decimals(value):
        return round(float(value), 2)

    df['Last'] = list(map(_two_decimals, df["Last"]))
    df['Volume'] = list(map(int, df["Volume"]))
    df['Turnover'] = list(map(_two_decimals, df["Turnover"]))
    return df

def read_euronext_year(year):
    """Load and clean every Euronext file whose name contains ``year``.

    Parameters
    ----------
    year : str or int
        Text that has to appear in the file name, e.g. ``'2022'``; the files
        are looked up with the pattern ``euronext/*<year>*``.

    Returns
    -------
    pandas.DataFrame
        All matching files concatenated (in sorted file-name order) with the
        common column layout, rows without a Symbol and rows without any
        Last/Volume/Turnover data removed, remaining '-' and missing values
        set to 0, exact duplicates dropped, and Last/Volume/Turnover converted
        to numbers. The index is a fresh 0..n-1 range.

    Raises
    ------
    ValueError
        If no file matches ``year``.
    """
    year = str(year)
    # sorted() makes the row order (and therefore which duplicate survives)
    # independent of the order in which the file system lists the files
    paths = sorted(glob.glob('euronext/*' + year + '*'))
    if not paths:
        raise ValueError("no Euronext file matching 'euronext/*" + year + "*'")

    eur = pd.concat(
        [regularize_euronext_columns(read_euronext_df(path)) for path in paths],
        ignore_index=True,
    )

    # The files start with a short preamble that doesn't give us anything.
    # Cutting the first three rows of the *concatenated* frame only reaches the
    # preamble of the first file (and removes real rows if that file has none),
    # so the preamble is dropped through the missing-Symbol filter instead.
    # NOTE(review): assumes preamble rows have no Symbol value — confirm.
    no_data = (eur['Last'] == '-') & (eur['Volume'] == '-') & (eur['Turnover'] == '-')
    # copy() so that the in-place clean-up below works on an independent frame
    eur = eur[~no_data & eur['Symbol'].notna()].copy()

    # any remaining '-' in the data we assume to be null or 0
    eur = dash_to_zero(eur).fillna(0)
    eur = eur.drop_duplicates().reset_index(drop=True)
    return columns_to_numbers(eur)

# Load the Euronext files whose name contains '2022' and preview the first rows.
df = read_euronext_year('2022')
df.head()

Unnamed: 0,Name,ISIN,Symbol,Market,Trading Currency,Open,High,Low,Last,Last Date/Time,Time Zone,Volume,Turnover
0,1000MERCIS,FR0010285965,ALMIL,Euronext Growth Paris,EUR,30.0,30.0,30.0,30.0,27/05/2022 13:10,CET,950,28500.0
1,2CRSI,FR0013341781,2CRSI,Euronext Paris,EUR,4.24,4.35,4.24,4.34,27/05/2022 17:35,CET,8000,34444.63
2,2MX ORGANIC,FR0014000T90,2MX,Euronext Paris,EUR,9.86,9.98,9.86,9.98,27/05/2022 17:35,CET,537,5341.26
3,2MX ORGANIC BS,FR0014000TB2,2MXBS,Euronext Paris,EUR,0.23,0.23,0.23,0.23,08/04/2022 14:47,CET,5000,1150.0
4,A TOUTE VITESSE,FR0010050773,MLATV,Euronext Access Paris,EUR,1.48,1.48,1.35,1.35,13/11/2019 16:53,CET,378,521.87


In [32]:
#note that volume in euronext is equivalent to delta in bourso
def get_bourso_matching_df(euronext_df):
    """Reshape a Euronext frame to the column layout used for Boursorama data.

    Keeps Symbol, Last Date/Time, Last, Volume, Name and Turnover, renames
    them to ``symbol``, ``date``, ``last``, ``volume``, ``name`` and
    ``turnover``, and indexes the result by ``(symbol, date)``. The input
    frame is left unchanged.
    """
    # Euronext header -> Boursorama-style name, in the order the columns are kept
    column_map = {
        'Symbol': 'symbol',
        'Last Date/Time': 'date',
        'Last': 'last',
        'Volume': 'volume',
        'Name': 'name',
        'Turnover': 'turnover',
    }
    selected = euronext_df[list(column_map)]
    return selected.rename(columns=column_map).set_index(['symbol', 'date'])
# Reshape the Euronext frame `df` and display the result.
tmp = get_bourso_matching_df(df)
tmp

Unnamed: 0_level_0,Unnamed: 1_level_0,last,volume,name,turnover
symbol,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ALMIL,27/05/2022 13:10,30.00,950,1000MERCIS,28500.00
2CRSI,27/05/2022 17:35,4.34,8000,2CRSI,34444.63
2MX,27/05/2022 17:35,9.98,537,2MX ORGANIC,5341.26
2MXBS,08/04/2022 14:47,0.23,5000,2MX ORGANIC BS,1150.00
MLATV,13/11/2019 16:53,1.35,378,A TOUTE VITESSE,521.87
...,...,...,...,...,...
ALWIT,05/09/2022 17:02,7.75,1969,WITBE,15225.42
MLWIZ,05/09/2022 16:30,8.20,160,WIZIBOAT,1312.00
WLN,05/09/2022 17:35,42.58,529652,WORLDLINE,22433106.50
XFAB,05/09/2022 17:35,6.01,192172,X-FAB,1155694.84
