In [3]:
import os
import datetime
import dateutil
import tarfile
import pandas as pd
import pandas.io.sql as sqlio
import requests
import re
import glob
import numpy as np

%matplotlib inline
from matplotlib import pylab as plt

In [None]:
!mkdir -p bourso/
!rm -rf bourso/20*
stream = requests.get('https://www.lrde.epita.fr/~ricou/pybd/projet/bourso.tgz', stream=True)
tarfile.open(fileobj=stream.raw, mode='r|gz').extractall('bourso/') # try 'r:gz' if there is an error

In [335]:
# NOTE(review): `tmp` is not defined by any cell above this one; the only visible
# assignment is the commented-out `#tmp = convert_bourso_daily(test)` further down,
# so this cell depends on leftover kernel state and fails on a fresh kernel.
# Lowercases the names in place, then shows every row whose name contains 'metropole'.
tmp['name'] = tmp['name'].str.lower()
tmp[tmp['name'].str.contains('metropole')]

Unnamed: 0_level_0,Unnamed: 1_level_0,last,volume,symbol,name
Unnamed: 0_level_1,symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-09-15 11:11:02.091659,1rPMMT,11.600,9523,1rPMMT,m6 metropole tele.
2020-09-15 11:11:02.091659,1rPMMTNV,17.650,0,1rPMMTNV,metropole i15
2020-11-06 12:51:02.049578,1rPMMT,10.120,35577,1rPMMT,m6 metropole tele.
2020-11-06 12:51:02.049578,1rPMMTNV,17.650,0,1rPMMTNV,metropole i15
2020-10-05 17:01:02.132038,1rPMMT,10.300,109171,1rPMMT,m6 metropole tele.
...,...,...,...,...,...
2020-06-23 09:11:02.115775,1rPMMT,9.780,1245,1rPMMT,metropole tele
2020-04-09 10:42:02.366142,1rPMMTNV,17.650,0,1rPMMTNV,metropole i15
2020-04-09 10:42:02.366142,1rPMMT,9.710,22596,1rPMMT,metropole tele
2020-09-25 15:31:01.930607,1rPMMT,10.140,24491,1rPMMT,m6 metropole tele.


In [5]:
def clean_last(df):
    """Return the ``last`` column of ``df`` as a list of floats.

    Each value is converted to text, a trailing one-character marker in
    parentheses such as ``(c)`` or ``(s)`` is cut off, spaces are removed and a
    decimal comma is turned into a decimal point before the float conversion.
    """
    trailing_marker = re.compile('\\(.\\)$')
    prices = []
    for raw in df["last"]:
        text = trailing_marker.split(str(raw))[0]
        text = text.replace(' ', '').replace(',', '.')
        prices.append(float(text))
    return prices

def _read_bourso_component(year, component):
    """Load every pickle of one component ('compA' or 'compB') for ``year``.

    Returns the concatenation of all matching files, keyed by the timestamp
    that is embedded in each file name between ``'<component> '`` and ``'.bz2'``.
    Raises ValueError when no file matches.
    """
    pattern = 'bourso/' + year + '/' + component + '*'
    frames = {}
    for path in glob.glob(pattern):
        stamp = path.split(component + ' ')[1].split('.bz2')[0]
        # NOTE(review): read_pickle can execute arbitrary code from the file;
        # only run this on an archive you trust.
        frames[dateutil.parser.parse(stamp)] = pd.read_pickle(path)
    if not frames:
        # pd.concat would raise a bare "No objects to concatenate" ValueError here.
        raise ValueError(f"no Boursorama file matches {pattern!r}")
    return pd.concat(frames)


def read_bourso_year(year):
    """Read one year of Boursorama snapshots into a single frame.

    Parameters
    ----------
    year : str
        Name of the sub-directory of ``bourso/`` to read (e.g. ``'2020'``).

    Returns
    -------
    pandas.DataFrame
        Indexed by (symbol, date) and sorted. ``last`` is converted to float by
        ``clean_last`` and a ``delta`` column holds the increase of ``volume``
        since the previous row of the same symbol.

    Raises
    ------
    ValueError
        If no compA or no compB file exists for ``year``.
    """
    merge = pd.concat([
        _read_bourso_component(year, 'compA'),
        _read_bourso_component(year, 'compB'),
    ])
    merge['last'] = clean_last(merge)

    # Keep only the snapshot timestamp (outer concat key) as the index.
    merge = merge.reset_index(level=1, drop=True).rename_axis('date', axis=0)
    # drop_duplicates only looks at columns, so the date has to be a column here.
    merge = merge.reset_index().drop_duplicates().set_index('date')
    merge = merge.set_index('symbol', append=True).swaplevel(0, 1).sort_index()

    # delta is the volume (number of stocks sold) per entry, whereas volume is
    # cumulative per day. One grouped diff replaces the former per-symbol loop.
    volume_step = merge.groupby(level=0)['volume'].diff()
    # Negative steps (volume restarting at the start of a day) and missing
    # values (first row of a symbol, missing data) both become 0.
    merge['delta'] = volume_step.clip(lower=0).fillna(0)

    # The first entry of each symbol should get its volume as delta instead of 0;
    # this is disabled because it failed on 2020 and 2023:
    #merge.loc[merge.groupby('symbol').head(1).index, 'delta'] = merge.groupby('symbol')['volume'].transform('first')

    return merge

#test = read_bourso_year('2020')
#test.head()

In [6]:
def convert_bourso_daily(df):
    """Collapse intraday rows into one row per (symbol, calendar day).

    Side effect: a ``turnover`` column (``last * delta``) is added to ``df``
    itself before grouping.

    Returns a frame indexed by (symbol, date) with the day's last ``last``,
    maximum ``volume``, first ``name`` and summed ``turnover``.
    """
    df['turnover'] = df['last'] * df['delta']

    # Truncate every timestamp to midnight so rows of one day share a key.
    day = df.index.get_level_values('date').normalize().rename('date')

    how = dict(
        last='last',       # last entry of the day
        volume='max',      # maximum volume of the day
        name='first',      # first name entry of the day
        turnover='sum',    # sum of all turnovers in that day
    )
    return df.groupby(['symbol', day]).agg(how)

#tmp = convert_bourso_daily(test)
#tmp

# Euronext

In [None]:
!mkdir -p euronext/
!rm -rf euronext/
stream = requests.get('https://www.lrde.epita.fr/~ricou/pybd/projet/euronext.tgz', stream=True)
tarfile.open(fileobj=stream.raw, mode='r|gz').extractall('euronext/') # try 'r:gz' if there is an error

In [7]:
# starting at some point in 2022 the column names were changed
import dateutil.parser


# Column headers of the newer Euronext files mapped to the names used by the
# older files, so that every file ends up with the same columns.
rename_dict = dict([
    ("Open Price", "Open"),
    ("High Price", "High"),
    ("low Price", "Low"),
    ("last Price", "Last"),
    ("last Trade MIC Time", "Last Date/Time"),
    ("Currency", "Trading Currency"),
])

# Note: some entries in Open, High, Low and Last are just set to '-'.
# The currency can apparently also be set to 0.

def read_euronext_file(path):
    """Load one Euronext export into a DataFrame.

    Paths ending in ``.csv`` are read as tab-separated text; every other path
    is handed to ``pd.read_excel``.
    """
    is_csv = path.endswith(".csv")
    reader = pd.read_csv if is_csv else pd.read_excel
    options = {'delimiter': '\t'} if is_csv else {}
    return reader(path, **options)

def regularize_data_to_numbers(df):
    """Convert the ``Last``, ``Volume`` and ``Turnover`` columns to numbers, in place.

    * ``Volume``: ``'-'`` and missing values become 0, everything else goes
      through ``int``.
    * ``Turnover``: ``'-'`` and missing values become 0, everything else is a
      float rounded to 2 decimals.
    * ``Last``: ``'-'`` becomes 0.0 (it used to make ``float('-')`` raise),
      everything else is a float rounded to 2 decimals; missing values stay NaN.

    The frame passed in is modified and returned.
    """
    def _zero_if_blank(value):
        # '-' is the placeholder the Euronext exports use for "no value".
        if pd.isna(value) or value == '-':
            return 0
        return value

    df['Last'] = [round(float(0 if x == '-' else x), 2) for x in df["Last"]]
    df['Volume'] = [int(_zero_if_blank(x)) for x in df["Volume"]]
    df['Turnover'] = [round(float(_zero_if_blank(x)), 2) for x in df["Turnover"]]
    return df

def regularize_euronext_empty_columns_fill(df):
    """Rename columns via ``rename_dict`` and fold the closing columns away, in place.

    When a ``Closing Price`` / ``Closing Price DateTime`` column exists, it
    fills the gaps of ``Last`` / ``Last Date/Time`` respectively, any gap that
    is still left becomes 0, and the closing column is dropped.

    The frame passed in is modified and returned.
    """
    df.rename(columns=rename_dict, inplace=True)
    fallbacks = (
        ('Closing Price', 'Last'),
        ('Closing Price DateTime', 'Last Date/Time'),
    )
    for fallback_col, target_col in fallbacks:
        if fallback_col not in df.columns:
            continue
        df[target_col] = df[target_col].fillna(df[fallback_col]).fillna(0)
        df.drop(columns=[fallback_col], inplace=True)
    return df

def regularize_data_to_string(df, columns):
    """Turn each listed column into stripped, lowercase text, in place.

    The frame passed in is modified and returned.
    """
    for column in columns:
        as_text = df[column].astype(str)
        df[column] = as_text.str.strip().str.lower()
    return df


def read_euronext_year(year):
    """Read and clean every Euronext file whose name contains ``year``.

    Parameters
    ----------
    year : str
        Text that must appear in the file name under ``euronext/``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all matching files with unified column names, rows
        without a symbol or without any Last/Volume/Turnover value removed,
        numeric columns converted and duplicate rows dropped.

    Raises
    ------
    ValueError
        If no file matches.
    """
    # glob returns paths in arbitrary order; sorting makes the row order (and
    # which rows the iloc below removes) reproducible between runs.
    paths = sorted(glob.glob('euronext/*' + year + '*'))
    if not paths:
        raise ValueError(f"no Euronext file matches 'euronext/*{year}*'")
    eur = pd.concat([regularize_euronext_empty_columns_fill(read_euronext_file(f)) for f in paths])
    # The first three rows are a preamble that doesn't give us anything.
    # NOTE(review): this runs after the concat, so it only removes the first
    # three rows of the first file - confirm the other files' preambles are
    # caught by the Symbol filter below.
    eur = eur.iloc[3:].reset_index(drop=True)
    all_dash = (eur['Last'] == '-') & (eur['Volume'] == '-') & (eur['Turnover'] == '-')
    # .copy() gives the in-place conversion below an independent frame to write to.
    eur = eur[~all_dash & eur['Symbol'].notna()].copy()
    # Any remaining '-' in the data is assumed to be null or 0.
    eur = regularize_data_to_numbers(eur)
    return eur.drop_duplicates()

In [462]:
# Load the cleaned Euronext data for 2023 and preview the first rows.
eur = read_euronext_year('2023')
eur.head()

  df[['Volume','Turnover']] = df[['Volume','Turnover']].replace('-',0).fillna(0)


Unnamed: 0,Name,ISIN,Symbol,Market,Trading Currency,Open,High,Low,Last,Last Date/Time,Time Zone,Volume,Turnover
0,1000MERCIS,FR0010285965,ALMIL,Euronext Growth Paris,EUR,30.0,30.2,29.0,30.0,17/05/2023 17:35,CET,343,10109.8
1,2CRSI,FR0013341781,AL2SI,Euronext Growth Paris,EUR,1.57,1.575,1.54,1.57,17/05/2023 17:25,CET,3597,5636.9
2,A.S.T. GROUPE,FR0000076887,ALAST,Euronext Growth Paris,EUR,1.54,1.54,1.5,1.5,17/05/2023 16:26,CET,10698,16227.36
3,AB SCIENCE,FR0010557264,AB,Euronext Paris,EUR,4.3,4.37,4.02,4.34,17/05/2023 17:38,CET,252916,1065494.05
4,ABC ARBITRAGE,FR0004040608,ABCA,Euronext Paris,EUR,6.05,6.08,6.03,6.05,17/05/2023 17:35,CET,17478,105742.85


In [8]:
# Note: Hammerson plc and T Stamp Inc are in GBP and USD respectively.

In [9]:
#different markets for the same stock?
# - markets are not in the boursorama data so i assume we can just ignore that

In [10]:
#note that volume in euronext is equivalent to delta in bourso
def get_bourso_matching_df(euronext_df):
    """Reshape a Euronext frame to the Boursorama column names and index.

    Keeps the seven columns listed below under their lowercase names, indexes
    the result by (symbol, date) and sorts it by that index. The input frame is
    left untouched.
    """
    column_names = {
        'Symbol': 'symbol',
        'Last Date/Time': 'date',
        'ISIN': 'isin',
        'Last': 'last',
        'Volume': 'volume',
        'Name': 'name',
        'Turnover': 'turnover',
    }
    selected = euronext_df[list(column_names)].rename(columns=column_names)
    return selected.set_index(['symbol', 'date']).sort_index()

# Fusing Both

In [11]:
# In Boursorama the name is 'AIR FRANCE -KLM' while in Euronext it is 'AIR FRANCE - KLM', so all spaces are removed from the names before matching them.

In [None]:
def get_unique_isin_name_combinations(df):
    """Return the distinct (isin, name) pairs of ``df`` with a fresh 0..n-1 index.

    Both columns are converted to text and stripped; ``name`` is also
    lowercased. The input frame is not modified.
    """
    pairs = df[['isin', 'name']].assign(
        isin=lambda frame: frame['isin'].astype(str).str.strip(),
        name=lambda frame: frame['name'].astype(str).str.strip().str.lower(),
    )
    return pairs.drop_duplicates().reset_index(drop=True)

def remove_substrings(series: pd.Series, substrings: list[str]) -> pd.Series:
    cleaned = series.astype(str)
    for sub in substrings:
        cleaned = cleaned.str.replace(sub, '', regex=False)
    return cleaned.str.lower()

def remove_ending_substrings(series: pd.Series, endings: list[str]) -> pd.Series:
    """Strip the given suffixes from each value, one ending after the other.

    Values are converted to text. For every ending, in list order, trailing
    whitespace is removed and the ending is cut off if the value ends with it,
    so a later ending can match what an earlier removal exposed. The result is
    stripped of surrounding whitespace.

    An empty ending is a no-op (the former ``x[:-len(ending)]`` slice turned
    every value into an empty string in that case).
    """
    cleaned = series.astype(str)
    for ending in endings:
        cleaned = cleaned.str.rstrip()  # In case of trailing spaces before matching
        cleaned = cleaned.apply(lambda x, ending=ending: x.removesuffix(ending))
    return cleaned.str.strip()

def remove_starting_substrings(series: pd.Series, prefixes: list[str]) -> pd.Series:
    """Strip the given prefixes from each value, one prefix after the other.

    Values are converted to text. For every prefix, in list order, leading
    whitespace is removed and the prefix is cut off if the value starts with
    it. The result is stripped of surrounding whitespace.
    """
    stripped = series.astype(str)
    for prefix in prefixes:
        # Leading spaces are dropped first so they cannot hide a prefix.
        stripped = stripped.str.lstrip().apply(lambda text: text.removeprefix(prefix))
    return stripped.str.strip()

def remove_ending_substrings_regex(series: pd.Series, endings: list[str]) -> pd.Series:
    """Remove one matching ending from the end of each value in a single pass.

    The endings are escaped, so they are matched literally even though a
    regular expression does the work. Values are converted to text and the
    result is stripped of surrounding whitespace.
    """
    # One alternation anchored at the end of the string: (e1|e2|...)$
    alternatives = '|'.join(re.escape(ending) for ending in endings)
    suffix_pattern = '(' + alternatives + ')$'

    as_text = series.astype(str)
    return as_text.str.replace(suffix_pattern, '', regex=True).str.strip()

def apply_custom_name_mapping(df: pd.DataFrame, mapping: dict) -> pd.DataFrame:
    """Return a copy of ``df`` whose ``name`` values are translated via ``mapping``.

    Names that are not keys of ``mapping`` are kept as they are. The input
    frame is not modified.
    """
    original_names = df['name']
    translated = original_names.map(mapping)
    # Unmapped names come back as NaN from map(); restore the original there.
    return df.assign(name=translated.fillna(original_names))

def _normalize_names(names, suffix, prefix):
    """Shared name clean-up: drop spaces/dashes/dots and lowercase, then strip suffixes and prefixes."""
    names = remove_substrings(names, [' ', '-', '.'])
    names = remove_ending_substrings(names, suffix)
    return remove_starting_substrings(names, prefix)


def _parse_euronext_datetime(value):
    """Parse a Euronext 'Last Date/Time' value, reading dd/mm/yyyy as day-first.

    dateutil reads ambiguous dates month-first by default, which turns
    '03/05/2023' into 5 March. Strings that start with a four-digit year are
    parsed without the day-first flag so ISO-style dates are not swapped.
    """
    year_first = isinstance(value, str) and re.match(r'\s*\d{4}\D', value) is not None
    return dateutil.parser.parse(value, dayfirst=not year_first)


def get_fusion(year):
    """Build the Boursorama and Euronext frames of ``year`` with comparable names.

    Parameters
    ----------
    year : str
        Year to load, e.g. ``"2020"``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``bourso``: daily Boursorama data indexed by (symbol, date), names
        normalised and passed through the manual alias table below.
        ``euro``: Euronext data indexed by (symbol, date) with parsed dates,
        names normalised, sorted by index.
    """
    # Suffixes are matched after remove_substrings has lowercased the names and
    # removed every ' ', '-' and '.', so entries that contain a '.' cannot match.
    # They are tried in list order.
    suffix = ['westfield', 'mo.', 'ds', 'we', 'co', 'rpfd', 'sa', 'i', 'inc', 'tion', 'i18',
            '(exassystembrime)', 'om.', 'i14', '(societe)','act.a','inv', 'htls', 'ds06',
            'mo.', 'opr', 'opa', 'i13', 'cc', 'vilcc', 'i16', 'nv', 'se', '(ex:eurotunnel)',
            'corp', 'ltd']
    prefix = ['srd']
    # Manual aliases: normalised Boursorama name -> normalised Euronext name.
    bourso_name_convert = {
        "bainsdemermona": "bainsmermona",
        "baccaratn":"baccarat",
        "bigbeninteractiv":"bigbeninteractive",
        "bastideleconfor":"bastideleconfort",
        "cambodgecien":"cambodgen",
        "robertetcie87":"robertetc",
        "robertetcdv":"robertet",
        "casinoguichardperrachon":"casinoguichard",
        "casinoguicper":"casinoguichard",
        "sartoriussted":"sartoriusstedbio",
        "sartoriusbiotech":"sartoriusstedbio",
        "eurofinsscientif":'eurofinsscient',
        "exelindustrie":"exelindustries",
        "deltaplusgrp":"deltaplusgroup",
        "dassaultsys":"dassaultsystemes",
        "euroress":"euroressources",
        "foncierelyonnais":"foncierelyonnaise",
        "pernodricardnv11":"pernodricard",
        "ramsaygenerale":"ramsaygensante",
        "gecinanominatif":"gecinanom",
        "gtt(gaztransportettec)":"gtt",
        "idlogistics":"idlogisticsgroup",
        "igeplusxao":"ige+xao",
        "kering(ex:ppr)":"kering",
        "lebonn":"lebon",
        "linedata":"linedataservices",
        "arcelormittal":"arcelor",
        "malterfrancobel":"malteriesfcobel",
        "maurel&prom":"maureletprom",
        "maurelpr":"maureletprom",
        "maurel":"maureletprom",
        "michelinn":"michelin",
        "michelin(mlnv)":"michelinnv20",
        "metropoletele":"metropoletv",
        "m6metropoletele":"metropoletv",
        "nrjgrp":"nrjgroup",
        "fiducialrealestate":"fiducialrealest",
        "partouche":"groupepartouche",
        "grpepartouche":"groupepartouche",
        "patrimoinecom":"patrimoineetcomm",
        "pharmagestinteract":"pharmagestinter",
        "publicisgrp":"publicisgroupe",
        "plastvaldeloire":"plastvalloire",
        "plastivaloire":"plastvalloire",
        "plastvdeloir":"plastvalloire",
        "eurazeodaanf":"eurazeo",
        "groupesteria":"soprasteriagroup",
        "secheenviron":"secheenvironnem",
        "silic":"silc",
        "soitecpsr16":"soitec",
        "soprasteria":"soprasteriagroup",
        "sqlinr":"sql",
        "stmicroelectr":"stmicroelectronics",
        "schneiderel":"schneiderelectric",
        "schneiderelec":"schneiderelectric",
        "technip":"technipfmc",
        "thermador":"thermadorgroupe",
        "tikehaurt170717":"tikehaucapital",
        "technicolornr":"technicolor",
        "pierreetvacances":"pierrevacances",
        "veoliaenvironnem":"veoliaenviron",
        "veolia":"veoliaenviron",
        "viel":"vieletcompagnie",
        "voltaliart080719":"voltalia",
        "vrankenpommerymonopole":"vrankenpommery",
        "xfabsilicon":"xfab",
        "vrankenpommerymo":"vrankenpommery"
    }

    bourso = convert_bourso_daily(read_bourso_year(year))
    bourso['name'] = _normalize_names(bourso['name'], suffix, prefix)
    bourso = apply_custom_name_mapping(bourso, bourso_name_convert)

    euro = get_bourso_matching_df(read_euronext_year(year))
    euro['name'] = _normalize_names(euro['name'], suffix, prefix)
    euro = euro.reset_index()
    euro['date'] = euro['date'].apply(_parse_euronext_datetime)

    # sort_index() returns a new frame; its result used to be discarded, leaving
    # the rows in the order of the unparsed date strings.
    euro = euro.set_index(['symbol', 'date']).sort_index()
    return bourso, euro

# Name-normalised frames for 2020: b is the Boursorama frame, e the Euronext frame.
b, e = get_fusion("2020")

In [450]:
def attach_isin_to_boursorama(b_df, e_df):
    """Add an ``isin`` column to the Boursorama frame by matching on ``name``.

    The distinct (isin, name) pairs of ``e_df`` are left-joined onto ``b_df``,
    so Boursorama rows without a matching name keep a missing ``isin``; their
    number is printed when it is not zero. The result is indexed by
    (symbol, date).
    """
    # Distinct (isin, name) pairs from Euronext act as the lookup table.
    lookup = get_unique_isin_name_combinations(e_df)

    # 'name' must be a regular column to join on; the index is restored below.
    merged = b_df.reset_index().merge(lookup, how='left', on='name')

    unmatched_count = merged['isin'].isna().sum()
    if unmatched_count > 0:
        print(f"⚠️ {unmatched_count} rows had no matching ISIN.")

    return merged.set_index(['symbol', 'date'])

# Boursorama frame with the Euronext ISIN attached (missing where no name matched).
copy_b = attach_isin_to_boursorama(b, e)

⚠️ 55485 rows had no matching ISIN.


In [451]:
# Number of distinct names in the merged Boursorama frame.
print(len(copy_b['name'].unique()))

523


In [None]:
# Distinct names that are still without an ISIN after the merge:
# print how many there are, then show entries 100 to 199 of that list.
no_isin = copy_b[copy_b['isin'].isna()]['name'].unique()
print(len(no_isin))
no_isin[100:200]

230


array(['sfrgroup', 'segro(reit)', 'siph', 'bailinvestissementfonciere',
       'selogercom', 'sodiceexpansion', 'sucrierepithiviers',
       'skisrossignol', 'fonc6&7emeparisn', 'salvepar', 'salveparpsr14',
       'silice12ope', 'etamdeveloppement', 'tesfran', 'terreis',
       'francoisfreres', 'transgrts270619', 'banquetarneaud', 'uffbanque',
       'valespadr', 'valespadrpfda', 'vectrane', 'vermandoisesucr',
       'provim', 'vmmateriaux', 'eurosos55%sep23ex', 'eurosicosrjun21ex',
       'salveporn01jan22ex', 'zodiacaero', 'zodiacaeroe17'], dtype=object)