In [487]:
import os
import datetime
import dateutil
import tarfile
import pandas as pd
import pandas.io.sql as sqlio
import requests
import re
import glob
import numpy as np

%matplotlib inline
from matplotlib import pylab as plt

In [None]:
!mkdir -p bourso/
!rm -rf bourso/20*
stream = requests.get('https://www.lrde.epita.fr/~ricou/pybd/projet/bourso.tgz', stream=True)
tarfile.open(fileobj=stream.raw, mode='r|gz').extractall('bourso/') # try 'r:gz' if there is an error

In [491]:
def clean_last(df):
    """Turn the raw ``last`` column of *df* into a list of floats.

    Every value is stringified, a trailing one-character parenthesised marker
    such as ``(c)`` or ``(s)`` is cut off, spaces are removed and the decimal
    comma is replaced by a dot before the conversion to ``float``.
    """
    trailing_marker = re.compile('\\(.\\)$')
    prices = []
    for raw in df["last"]:
        text = trailing_marker.split(str(raw))[0]
        text = text.replace(' ', '').replace(',', '.')
        prices.append(float(text))
    return prices

def read_bourso_year(year):
    """Load every Boursorama snapshot of *year* into one frame indexed by (symbol, date).

    Files matching ``bourso/<year>/compA*`` and ``bourso/<year>/compB*`` are unpickled; the
    timestamp of each snapshot is parsed from the part of the file name between
    ``'compA '`` / ``'compB '`` and ``'.bz2'``.

    The returned frame has ``last`` converted to float by ``clean_last`` and an extra
    ``delta`` column: the per-symbol difference between consecutive ``volume`` values,
    with negative and missing differences set to 0.

    *year* is concatenated into the glob pattern, so it must be a string.
    """
    # NOTE(review): pd.read_pickle can execute arbitrary code while loading; these files
    # come from the downloaded archive, so only use this with a trusted source.
    compA = pd.concat({dateutil.parser.parse(f.split('compA ')[1].split('.bz2')[0]):pd.read_pickle(f) for f in glob.glob('bourso/' + year + '/compA*')})
    compB = pd.concat({dateutil.parser.parse(f.split('compB ')[1].split('.bz2')[0]):pd.read_pickle(f) for f in glob.glob('bourso/' + year + '/compB*')})
    merge = pd.concat([compA, compB])
    merge['last'] = clean_last(merge)
    # Drop the inner index level so only the snapshot timestamp remains, and name it 'date'.
    merge.reset_index(level=1, drop=True, inplace=True)
    merge.rename_axis('date', axis=0, inplace=True)
    # drop_duplicates only looks at columns, so the date is moved into a column for the check
    merge = merge.reset_index().drop_duplicates().set_index('date')
    # Final index is (symbol, date), sorted so each symbol's rows are in chronological order.
    merge.set_index('symbol', append=True, inplace=True)
    merge = merge.swaplevel(0,1).sort_index()
    
    # delta is the volume (number of stocks sold) per entry, whereas volume is cumulative per day
    merge['delta'] = np.zeros(len(merge))
    # NOTE(review): this per-symbol loop looks replaceable by a single groupby(...).diff() — confirm
    # the result is identical before changing it.
    for stock in merge.index.levels[0]:
        merge.loc[(stock, slice(None)), 'delta'] =  merge.loc[(stock, slice(None)) ,'volume'].diff()
    
    # Negative differences (presumably where the cumulative volume restarts on a new day)
    # and the NaN that diff() leaves on each symbol's first row are both set to 0.
    merge.loc[merge.delta < 0, 'delta'] = 0
    merge.delta = merge.delta.fillna(0)

    # Sets the delta of the first entry to its volume instead of 0.
    # We should do this, but it fails on 2020 and 2023, so it is disabled for now.
    #merge.loc[merge.groupby('symbol').head(1).index, 'delta'] = merge.groupby('symbol')['volume'].transform('first')

    return merge

#test = read_bourso_year('2020')
#test.head()

In [492]:
def convert_bourso_daily(df):
    """Aggregate intraday Boursorama rows into one row per (symbol, day).

    Parameters
    ----------
    df : DataFrame indexed by (symbol, date) with columns ``last``, ``volume``,
        ``name`` and ``delta``, as produced by ``read_bourso_year``.

    Returns
    -------
    DataFrame indexed by (symbol, date), where date is truncated to midnight, with:
    ``last`` (last value of the day), ``volume`` (daily maximum), ``name`` (first
    value of the day) and ``turnover`` (sum of ``last * delta`` over the day).

    The input frame is left untouched; the original version added a ``turnover``
    column to the caller's frame as a side effect.
    """
    # Work on a narrow copy so the caller's frame is not mutated.
    intraday = df[['last', 'volume', 'name']].assign(turnover=df['last'] * df['delta'])

    # Group by calendar day: strip the time part from the 'date' index level.
    days = intraday.index.get_level_values('date').normalize().rename('date')

    return intraday.groupby(['symbol', days]).agg({
        'last': 'last',      # Last entry of the day
        'volume': 'max',     # Maximum volume of the day
        'name': 'first',     # First name entry of the day
        'turnover': 'sum'    # Sum of all turnovers in that day
    })

#tmp = convert_bourso_daily(test)
#tmp

# Euronext

In [None]:
!mkdir -p euronext/
!rm -rf euronext/
stream = requests.get('https://www.lrde.epita.fr/~ricou/pybd/projet/euronext.tgz', stream=True)
tarfile.open(fileobj=stream.raw, mode='r|gz').extractall('euronext/') # try 'r:gz' if there is an error

In [493]:
# starting at some point in 2022 the column names were changed
import dateutil.parser


# Maps alternative Euronext column headers (keys) onto the column names the rest of
# this notebook works with (values). Applied with DataFrame.rename, so headers that
# are absent from a file are simply ignored.
# NOTE(review): "low Price" and "last Price" start with a lower-case letter unlike the
# other keys — presumably this mirrors the source files; confirm.
rename_dict = {
    "Open Price": "Open",
    "High Price": "High",
    "low Price": "Low",
    "last Price": "Last",
    "last Trade MIC Time":"Last Date/Time",
    "Currency":"Trading Currency"
}

# Note: some entries in Open, High, Low and Last are just set to '-'.
# Note: the currency can apparently also be set to 0.

def read_euronext_file(path):
    """Load one Euronext export file into a DataFrame.

    Paths ending in ``.csv`` are parsed as tab-separated text; every other path
    is handed to ``pd.read_excel``.
    """
    is_tab_separated = path.endswith(".csv")
    if not is_tab_separated:
        return pd.read_excel(path)
    return pd.read_csv(path, delimiter='\t')

def regularize_data_to_numbers(df):
    """Coerce the ``Last``, ``Volume`` and ``Turnover`` columns of *df* to numbers.

    The frame is modified in place and also returned.

    * ``Volume`` and ``Turnover``: ``'-'`` placeholders and missing values become 0;
      ``Volume`` is converted with ``int``, ``Turnover`` with ``float`` rounded to 2 decimals.
    * ``Last``: ``'-'`` placeholders become 0 (they used to make ``float('-')`` raise
      ``ValueError``); other values are converted with ``float`` and rounded to 2 decimals.
      Missing values stay NaN.
    """
    df[['Volume','Turnover']] = df[['Volume','Turnover']].replace('-',0).fillna(0)
    # Treat a lone '-' in Last like the other numeric columns instead of crashing on it.
    df['Last'] = [round(float(x),2) for x in df["Last"].replace('-',0)]
    df['Volume'] = [int(x) for x in df["Volume"]]
    df['Turnover'] = [round(float(x),2) for x in df["Turnover"]]
    return df

def regularize_euronext_empty_columns_fill(df):
    """Harmonise the column names of one Euronext frame and fold the closing columns in.

    The frame is modified in place and also returned. Columns are first renamed with
    ``rename_dict``. Then, when present, ``Closing Price`` fills the gaps of ``Last`` and
    ``Closing Price DateTime`` fills the gaps of ``Last Date/Time``; any remaining gap
    becomes 0 and the closing column is dropped.
    """
    df.rename(columns=rename_dict, inplace=True)
    # (column to complete, fallback column that is consumed)
    fallback_pairs = (
        ('Last', 'Closing Price'),
        ('Last Date/Time', 'Closing Price DateTime'),
    )
    for target, fallback in fallback_pairs:
        if fallback not in df.columns:
            continue
        df[target] = df[target].fillna(df[fallback]).fillna(0)
        df.drop(columns=[fallback], inplace=True)
    return df

def regularize_data_to_string(df, columns):
    """Replace each of *columns* in *df* by its stripped, lower-cased string form.

    The frame is modified in place and also returned.
    """
    for column in columns:
        as_text = df[column].astype(str)
        df[column] = as_text.str.strip().str.lower()
    return df

def convert_foreign_currencies_to_eur(df: pd.DataFrame, rates=None) -> pd.DataFrame:
    """Return a copy of *df* whose ``Last`` prices in foreign currencies are converted to EUR.

    Parameters
    ----------
    df : DataFrame with a ``Trading Currency`` and a ``Last`` column.
    rates : optional mapping ``currency code -> multiplier to EUR``. Defaults to the
        fixed rates previously hard-coded here: ``{'USD': 0.91, 'GBP': 1.15}``.

    Returns
    -------
    A new DataFrame; ``Trading Currency`` is normalised (stripped, upper-cased string) and
    ``Last`` is multiplied by the matching rate. The input frame is not modified.

    Only ``Last`` is converted. NOTE(review): ``Turnover`` is left as is — confirm whether
    it is also expressed in the trading currency.
    """
    if rates is None:
        rates = {'USD': 0.91, 'GBP': 1.15}

    df = df.copy()

    # Normalize currency column
    df['Trading Currency'] = df['Trading Currency'].astype(str).str.strip().str.upper()

    for currency, rate in rates.items():
        # Codes are compared against the normalised column, so normalise the key the same way.
        currency = str(currency).strip().upper()
        mask = df['Trading Currency'] == currency
        if mask.any():
            print(f"🔁 Converting {mask.sum()} rows from {currency} to EUR at rate {rate}")
            df.loc[mask, 'Last'] = df.loc[mask, 'Last'] * rate

    return df

def read_euronext_year(year):
    """Load and clean every Euronext file whose name contains *year*.

    Files matching ``euronext/*<year>*`` are read, their columns harmonised, and the
    frames concatenated. Rows without any price/volume/turnover information or without
    a symbol are discarded, numeric columns are coerced to numbers, foreign-currency
    prices are converted to EUR and duplicate rows are dropped.

    *year* is concatenated into the glob pattern, so it must be a string.
    """
    frames = []
    for path in glob.glob('euronext/*' + year + '*'):
        frames.append(regularize_euronext_empty_columns_fill(read_euronext_file(path)))
    eur = pd.concat(frames)

    # The first three rows are a preamble that carries no data.
    # NOTE(review): this slices the concatenated frame, so only the first file's first
    # three rows are dropped here — confirm the symbol filter below covers the others.
    eur = eur.iloc[3:].reset_index(drop=True)

    # Discard rows where Last, Volume and Turnover are all the '-' placeholder.
    all_placeholders = (eur['Last'] == '-') & (eur['Volume'] == '-') & (eur['Turnover'] == '-')
    eur = eur[~all_placeholders]
    eur = eur[eur['Symbol'].notna()]

    # Any remaining '-' in the data is assumed to be null or 0.
    eur = regularize_data_to_numbers(eur)
    eur = convert_foreign_currencies_to_eur(eur)
    return eur.drop_duplicates()

In [None]:
# Note: Hammerson plc is quoted in GBP and T Stamp Inc in USD.

In [None]:
#note that volume in euronext is equivalent to delta in bourso
def get_bourso_matching_df(euronext_df):
    """Project a Euronext frame onto the Boursorama-style column layout.

    Keeps the symbol, timestamp, ISIN, last price, volume, name and turnover columns,
    renames them to their lower-case Boursorama equivalents and returns a new frame
    indexed by (symbol, date), sorted on that index.
    """
    column_map = {
        'Symbol': 'symbol',
        'Last Date/Time': 'date',
        'ISIN': 'isin',
        'Last': 'last',
        'Volume': 'volume',
        'Name': 'name',
        'Turnover': 'turnover',
    }
    selected = euronext_df[list(column_map)]
    renamed = selected.rename(columns=column_map)
    return renamed.set_index(['symbol', 'date']).sort_index()

# Getting Both

In [None]:
# In Boursorama the name is 'AIR FRANCE -KLM' while in Euronext it is 'AIR FRANCE - KLM', so all spaces are removed to make the names match.

In [494]:
def get_unique_isin_name_combinations(df):
    """Return the distinct (isin, name) pairs found in *df*.

    ISINs are stringified and stripped; names are stringified, stripped and
    lower-cased. The result has exactly the columns ``isin`` and ``name`` and a
    fresh 0..n-1 index. The input frame is not modified.
    """
    normalized = df.assign(
        isin=df['isin'].astype(str).str.strip(),
        name=df['name'].astype(str).str.strip().str.lower(),
    )
    pairs = normalized[['isin', 'name']].drop_duplicates()
    return pairs.reset_index(drop=True)

def remove_substrings(series: pd.Series, substrings: list[str]) -> pd.Series:
    """Delete every literal occurrence of each of *substrings*, then lower-case.

    Values are stringified first. The substrings are removed one after the other,
    in the given order, as plain text (no regular expressions).
    """
    text = series.astype(str)
    remaining = list(substrings)
    while remaining:
        needle = remaining.pop(0)
        text = text.str.replace(needle, '', regex=False)
    lowered = text.str.lower()
    return lowered

def remove_ending_substrings(series: pd.Series, endings: list[str]) -> pd.Series:
    """Strip each of *endings* from the end of every value, one ending after the other.

    Values are stringified first. Before each ending is tried, trailing whitespace is
    removed; an ending is cut at most once per pass, and the endings are applied in the
    given order, so a later ending can match what an earlier removal exposed. The
    result is stripped of surrounding whitespace.

    An empty ending is a no-op. The original slice ``x[:-len(ending)]`` evaluated to
    ``x[:0]`` for an empty ending and therefore blanked every value.
    """
    cleaned = series.astype(str)
    for ending in endings:
        cleaned = cleaned.str.rstrip()  # In case of trailing spaces before matching
        # str.removesuffix leaves the value untouched when the ending is absent or empty.
        cleaned = cleaned.apply(lambda x: x.removesuffix(ending))
    return cleaned.str.strip()

def remove_starting_substrings(series: pd.Series, prefixes: list[str]) -> pd.Series:
    """Strip each of *prefixes* from the start of every value, one prefix after the other.

    Values are stringified first. Leading whitespace is removed before each prefix is
    tried, a prefix is cut at most once per pass, and the result is stripped of
    surrounding whitespace.
    """
    result = series.astype(str)
    for head in prefixes:
        without_leading_space = result.str.lstrip()
        result = without_leading_space.apply(lambda text: text.removeprefix(head))
    return result.str.strip()

def remove_ending_substrings_regex(series: pd.Series, endings: list[str]) -> pd.Series:
    """Remove whichever of *endings* terminates each value, in a single regex pass.

    The endings are escaped so they match literally, joined into one alternation
    anchored at the end of the string, and the match is replaced by nothing. Values
    are stringified first and the result is stripped of surrounding whitespace.
    """
    # Literal alternatives, e.g. ['i.', 'sa'] -> 'i\\.|sa'
    alternatives = '|'.join(re.escape(ending) for ending in endings)
    pattern = f"({alternatives})$"

    as_text = series.astype(str)
    trimmed = as_text.str.replace(pattern, '', regex=True)
    return trimmed.str.strip()

def apply_custom_name_mapping(df: pd.DataFrame, mapping: dict) -> pd.DataFrame:
    """Return a copy of *df* whose ``name`` values are translated through *mapping*.

    Names that have no entry in *mapping* are kept unchanged. The input frame is
    not modified.
    """
    result = df.copy()
    original_names = result['name']
    translated = original_names.map(mapping)
    # Unmapped names come back as NaN from map(); fall back to the original value.
    result['name'] = translated.fillna(original_names)
    return result

def attach_isin_to_boursorama(b_df, e_df):
    """Add an ``isin`` column to the Boursorama frame by matching names with Euronext.

    Parameters
    ----------
    b_df : Boursorama frame indexed by (symbol, date) with a pre-cleaned ``name`` column.
    e_df : Euronext frame with ``isin`` and ``name`` columns.

    Returns
    -------
    A new frame indexed by (symbol, date) with the same rows as *b_df* plus ``isin``
    (NaN where no Euronext name matches). A warning is printed with the number of
    unmatched rows.

    If several ISINs share the same cleaned name, only the first one is used. The
    original left-merged against the non-unique name and therefore duplicated every
    matching Boursorama row once per extra ISIN.
    """
    # Get unique (isin, name) mapping from Euronext
    isin_name_map = get_unique_isin_name_combinations(e_df)

    # Make the mapping unique per name so the left merge cannot multiply rows.
    ambiguous = isin_name_map['name'].duplicated(keep='first')
    if ambiguous.any():
        print(f"⚠️ {ambiguous.sum()} extra ISINs share a name with another ISIN; keeping the first ISIN per name.")
        isin_name_map = isin_name_map[~ambiguous]

    # Reset index in Boursorama to access 'name' as column
    b_reset = b_df.reset_index()

    # Merge on the pre-cleaned 'name'; validate guards the one-ISIN-per-name invariant.
    merged = pd.merge(
        b_reset,
        isin_name_map,
        on='name',
        how='left',
        validate='many_to_one'
    )

    unmatched_count = merged['isin'].isna().sum()

    if unmatched_count > 0:
        print(f"⚠️ {unmatched_count} rows had no matching ISIN.")

    return merged.set_index(['symbol', 'date'])

def remove_inactive_stocks(df: pd.DataFrame) -> pd.DataFrame:
    """Drop every symbol whose ``turnover`` is 0 on all of its rows.

    Parameters
    ----------
    df : DataFrame with a ``symbol`` index level and a ``turnover`` column.

    Returns
    -------
    The rows of *df* belonging to symbols that have at least one turnover value
    different from 0. A missing turnover is not equal to 0, so it counts as activity.
    """
    # Vectorised replacement for groupby.apply with a Python lambda per symbol.
    always_zero = df['turnover'].eq(0).groupby(level='symbol').all()

    active_symbols = always_zero[~always_zero.astype(bool)].index

    return df[df.index.get_level_values('symbol').isin(active_symbols)]

def get_both(year):
    """Build the cleaned daily Boursorama and Euronext frames for *year*.

    Company names of both sources are normalised the same way (spaces, dashes and dots
    removed, lower-cased, known suffixes and prefixes stripped) so that Boursorama rows
    can be given an ISIN by matching names against Euronext. Boursorama names
    additionally go through a hand-written correction table.

    Parameters
    ----------
    year : str, e.g. ``"2020"``.

    Returns
    -------
    (bourso, euro) : two DataFrames indexed by (symbol, date), with symbols whose
    turnover is always 0 removed. ``bourso`` carries an extra ``isin`` column.
    """
    # Suffixes stripped from the end of the normalised names, in this order.
    # NOTE(review): '.' is removed from the names before suffixes are stripped, so the
    # entries containing a dot ('mo.', 'om.', 'act.a') look like they can never match.
    suffix = ['westfield', 'mo.', 'ds', 'we', 'co', 'rpfd', 'sa', 'i', 'inc', 'tion', 'i18',
            '(exassystembrime)', 'om.', 'i14', '(societe)','act.a','inv', 'htls', 'ds06',
            'mo.', 'opr', 'opa', 'i13', 'cc', 'vilcc', 'i16', 'nv', 'se', '(ex:eurotunnel)',
            'corp', 'ltd']
    prefix = ['srd']
    # Manual corrections: normalised Boursorama name -> normalised Euronext name.
    bourso_name_convert = {
        "bainsdemermona": "bainsmermona",
        "baccaratn":"baccarat",
        "bigbeninteractiv":"bigbeninteractive",
        "bastideleconfor":"bastideleconfort",
        "cambodgecien":"cambodgen",
        "robertetcie87":"robertetc",
        "robertetcdv":"robertet",
        "casinoguichardperrachon":"casinoguichard",
        "casinoguicper":"casinoguichard",
        "sartoriussted":"sartoriusstedbio",
        "sartoriusbiotech":"sartoriusstedbio",
        "eurofinsscientif":'eurofinsscient',
        "exelindustrie":"exelindustries",
        "deltaplusgrp":"deltaplusgroup",
        "dassaultsys":"dassaultsystemes",
        "euroress":"euroressources",
        "foncierelyonnais":"foncierelyonnaise",
        "pernodricardnv11":"pernodricard",
        "ramsaygenerale":"ramsaygensante",
        "gecinanominatif":"gecinanom",
        "gtt(gaztransportettec)":"gtt",
        "idlogistics":"idlogisticsgroup",
        "igeplusxao":"ige+xao",
        "kering(ex:ppr)":"kering",
        "lebonn":"lebon",
        "linedata":"linedataservices",
        "arcelormittal":"arcelor",
        "malterfrancobel":"malteriesfcobel",
        "maurel&prom":"maureletprom",
        "maurelpr":"maureletprom",
        "maurel":"maureletprom",
        "michelinn":"michelin",
        "michelin(mlnv)":"michelinnv20",
        "metropoletele":"metropoletv",
        "m6metropoletele":"metropoletv",
        "nrjgrp":"nrjgroup",
        "fiducialrealestate":"fiducialrealest",
        "partouche":"groupepartouche",
        "grpepartouche":"groupepartouche",
        "patrimoinecom":"patrimoineetcomm",
        "pharmagestinteract":"pharmagestinter",
        "publicisgrp":"publicisgroupe",
        "plastvaldeloire":"plastvalloire",
        "plastivaloire":"plastvalloire",
        "plastvdeloir":"plastvalloire",
        "eurazeodaanf":"eurazeo",
        "groupesteria":"soprasteriagroup",
        "secheenviron":"secheenvironnem",
        "silic":"silc",
        "soitecpsr16":"soitec",
        "soprasteria":"soprasteriagroup",
        "sqlinr":"sql",
        "stmicroelectr":"stmicroelectronics",
        "schneiderel":"schneiderelectric",
        "schneiderelec":"schneiderelectric",
        "technip":"technipfmc",
        "thermador":"thermadorgroupe",
        "tikehaurt170717":"tikehaucapital",
        "technicolornr":"technicolor",
        "pierreetvacances":"pierrevacances",
        "veoliaenvironnem":"veoliaenviron",
        "veolia":"veoliaenviron",
        "viel":"vieletcompagnie",
        "voltaliart080719":"voltalia",
        "vrankenpommerymonopole":"vrankenpommery",
        "xfabsilicon":"xfab",
        "vrankenpommerymo":"vrankenpommery"
    }
    # A regex-based suffix pass (remove_ending_substrings_regex with 'i\\d*') was tried
    # for both sources and is currently disabled.

    def _normalize_names(names):
        """Apply the name clean-up shared by both sources."""
        names = remove_substrings(names, [' ', '-', '.'])
        names = remove_ending_substrings(names, suffix)
        return remove_starting_substrings(names, prefix)

    bourso = convert_bourso_daily(read_bourso_year(year))
    bourso['name'] = _normalize_names(bourso['name'])
    bourso = apply_custom_name_mapping(bourso, bourso_name_convert)

    euro = get_bourso_matching_df(read_euronext_year(year))
    euro['name'] = _normalize_names(euro['name'])

    # The Euronext timestamps are still strings: parse them, then rebuild the
    # (symbol, date) index. The original called euro.sort_index() and discarded the
    # result, leaving the frame in the order of the unparsed date strings.
    euro = euro.reset_index()
    euro['date'] = euro['date'].apply(dateutil.parser.parse)
    euro = euro.set_index(['symbol', 'date']).sort_index()

    bourso = attach_isin_to_boursorama(bourso, euro)

    bourso = remove_inactive_stocks(bourso)
    euro = remove_inactive_stocks(euro)
    return bourso, euro

# Build both cleaned frames for 2020: b is the Boursorama frame, e the Euronext frame.
b, e = get_both("2020")

⚠️ 55485 rows had no matching ISIN.
