Code to truncate daily stock prices to the last available value of each month.

In [1]:
import pandas as pd
from datetime import date, timedelta, datetime

import scipy.stats
from matplotlib import pyplot as plt
import numpy as np
import math
from scipy.stats import t, norm
from matplotlib.ticker import PercentFormatter
import random
import calendar as cd
cal = cd.Calendar()

First we take the daily price data and filter it down to the last trading day of each month. That day may differ between stock markets, so this is done for each country separately.

Note that all-NaN rows have already been removed from the data.

In [91]:
# Country whose daily price file is processed by the cells below.
# Change it manually so that it matches the file name (without ".csv")
# under "Data/Market price data/Daily/".
country = "Sweden"

In [92]:
# Load the raw daily prices of the selected country. The file is
# semicolon-separated and its first column (the date strings) becomes the index.
daily0 = pd.read_csv(
    f"Data/Market price data/Daily/{country}.csv",
    sep=";",
    index_col=0,
)
daily0

Unnamed: 0_level_0,NDA-SE.ST,ARION-SDB.ST,KIND-SDB.ST,ALIF-B.ST,ADDT-B.ST,ALFA.ST,ALIG.ST,AMBEA.ST,ARJO-B.ST,ASSA-B.ST,...,THULE.ST,TOBII.ST,TREL-B.ST,VITR.ST,VOLV-B.ST,WALL-B.ST,WIHL.ST,FOI-B.ST,TROAX.ST,VALUE_COUNTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3.1.2005,44.405399,,5.998276,,2.565974,17.186098,,,,25.635954,...,,,30.029280,2.321107,26.099649,3.288391,4.682930,,,78
4.1.2005,44.405399,,5.880277,,2.543466,17.105030,,,,25.860830,...,,,30.420965,2.321107,26.099649,3.328774,4.766555,,,78
5.1.2005,43.913830,,5.860610,,2.487194,17.105030,,,,25.186203,...,,,30.682083,2.239869,25.852493,3.299928,4.716381,,,78
6.1.2005,,,5.860610,,2.487194,17.105030,,,,,...,,,,2.239869,25.852493,3.299928,4.716381,,,51
7.1.2005,44.569252,,5.762277,,2.487194,17.267162,,,,25.186203,...,,,30.029280,2.228263,26.593956,3.380697,4.850179,,,78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24.5.2023,109.160004,10.06,130.850006,132.100006,216.600006,384.200012,75.199997,37.259998,42.720001,243.300003,...,264.000000,20.940001,273.100006,214.600006,201.600006,36.660000,80.500000,791.0,221.000000,121
25.5.2023,109.080002,9.91,126.550003,134.899994,219.800003,388.100006,74.599998,37.360001,41.599998,241.500000,...,260.299988,20.959999,270.000000,211.000000,201.050003,36.000000,78.949997,788.0,217.800003,121
26.5.2023,109.879997,9.95,125.750000,135.399994,224.000000,394.399994,75.900002,37.060001,42.680000,246.000000,...,267.799988,20.320000,273.600006,220.000000,204.050003,35.880001,78.650002,801.0,220.000000,121
29.5.2023,109.160004,9.84,125.300003,133.899994,222.000000,392.200012,75.699997,36.200001,42.299999,245.100006,...,270.100006,19.760000,272.799988,220.399994,204.250000,35.560001,78.000000,800.0,216.800003,121


Group by month and keep, for each month, the row of the last date available in the data

In [93]:
# Reduce the daily table to one row per (Year, Month): the row belonging to the
# latest date present in the data for that month.
daily = daily0.copy()
dates = daily.index

# Parse every "day.month.year" label once and derive year/month from the result
# instead of re-parsing the same string for each component.
datetimes = [datetime.strptime(label, "%d.%m.%Y") for label in dates]
months = [stamp.month for stamp in datetimes]
years = [stamp.year for stamp in datetimes]

daily.insert(0, "Month", months)
daily.insert(0, "Year", years)

daily.index = datetimes
daily.insert(0, "Date", datetimes)

# Only the "Date" column needs to be aggregated to find the last date of each
# month; aggregating all price columns would be wasted work.
month_end_dates = daily.groupby(["Year", "Month"])["Date"].max()

# Whole rows are taken as they are, so a stock that has NaN on the selected
# date stays NaN in the monthly table. The slice "Year": drops the helper
# "Date" column, which sits in front of "Year".
monthly = daily.loc[month_end_dates, "Year":]
monthly = monthly.set_index(["Year", "Month"])
monthly

Unnamed: 0_level_0,Unnamed: 1_level_0,NDA-SE.ST,ARION-SDB.ST,KIND-SDB.ST,ALIF-B.ST,ADDT-B.ST,ALFA.ST,ALIG.ST,AMBEA.ST,ARJO-B.ST,ASSA-B.ST,...,THULE.ST,TOBII.ST,TREL-B.ST,VITR.ST,VOLV-B.ST,WALL-B.ST,WIHL.ST,FOI-B.ST,TROAX.ST,VALUE_COUNTS
Year,Month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2005,1,42.439106,,7.178265,,2.430923,18.158895,,,,24.848885,...,,,29.376469,2.379134,28.027468,3.455694,4.716381,,,78
2005,2,47.190983,,11.504889,,2.993636,19.537024,,,,23.499626,...,,,33.685013,2.379134,31.092194,4.269138,5.184674,,,78
2005,3,46.863266,,12.507879,,3.061162,18.807426,,,,22.487677,...,,,31.987700,2.402346,30.894466,4.817204,5.468995,,,78
2005,4,44.409000,,15.549228,,2.768550,16.881653,,,,21.282051,...,,,28.539194,2.216657,29.660679,5.363563,5.820216,,,78
2005,5,44.737957,,12.526871,,2.926110,19.002464,,,,22.612181,...,,,31.543320,2.088997,31.205498,5.780208,5.853664,,,78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023,1,120.930618,11.20,104.777405,109.699997,166.399994,321.867889,80.320549,37.997841,39.724205,242.937790,...,245.226242,20.820000,254.750397,222.167908,200.134094,47.441074,82.508171,951.367737,219.193283,121
2023,2,131.714874,11.00,113.600235,112.199997,188.000000,339.585388,69.375961,35.089844,38.978027,252.540482,...,229.573502,23.920000,264.330414,210.013428,202.988312,43.294197,83.950623,909.106567,223.187683,121
2023,3,110.639999,10.64,115.594101,93.800003,192.300003,363.897766,72.210327,35.186775,39.724205,245.808701,...,250.739151,21.420000,288.280457,214.397003,206.906815,38.948902,76.305634,847.189026,215.198868,121
2023,4,113.820000,10.52,125.800003,121.199997,205.000000,375.500000,73.969170,37.803974,45.360001,243.899994,...,295.700012,21.400000,257.399994,230.399994,210.800003,40.298126,82.699997,823.000000,232.199997,121


In [95]:
# Semicolon-separated export; the (Year, Month) index of `monthly` is written
# as the first two columns of the file.
monthly.to_csv(f"Data/Market price data/Monthly/{country}.csv", sep=";") # export the monthly prices by country

#### Combine and transpose

We combine the files from the different stock markets. First we convert all prices to EUR. The monthly grouping of the exchange rates mostly reuses the code above.

In [96]:
# Daily exchange-rate table (comma-separated); the first column holds the
# "year-month-day" date strings and becomes the index.
currency0 = pd.read_csv(
    "Data/Market price data/Currency.csv",
    sep=",",
    index_col=0,
)
currency0

Unnamed: 0,EURCZK=X,EURGBP=X,EURNOK=X,EURPLN=X,EURSEK=X,EURCHF=X,EURUSD=X,EURDKK=X
2005-1-3,30.396000,0.70710,8.23550,4.07650,9.0334,1.54520,1.347001,7.43390
2005-1-4,30.434000,0.70510,8.28010,4.10910,9.0475,1.55120,1.328198,7.43500
2005-1-5,30.365999,0.70420,8.26690,4.10930,9.0285,1.54970,1.328004,7.43940
2005-1-6,30.325001,0.70240,8.26900,4.10240,9.0566,1.54700,1.318305,7.44010
2005-1-7,30.330000,0.69860,8.23400,4.09300,9.0359,1.54970,1.306097,7.44010
...,...,...,...,...,...,...,...,...
2023-5-25,23.671101,0.86943,11.78400,4.50749,11.5201,0.97262,1.075732,7.44850
2023-5-26,23.605801,0.87051,11.85495,4.50113,11.6055,0.97059,1.072444,7.44903
2023-5-29,23.692400,0.86843,11.87391,4.52672,11.5792,0.97110,1.072306,7.44811
2023-5-30,23.634899,0.86701,11.90020,4.51768,11.5911,0.96837,1.070767,7.44816


In [97]:
# Reduce the daily exchange rates to one row per month (the row of the latest
# date present for that month) and label the rows "Year-Month".
exchange_rates = currency0.copy()
dates = exchange_rates.index

# Parse every "year-month-day" label once and reuse the parsed value.
datetimes = [datetime.strptime(label, "%Y-%m-%d") for label in dates]
months = [stamp.month for stamp in datetimes]
years = [stamp.year for stamp in datetimes]

exchange_rates.insert(0, "Month", months)
exchange_rates.insert(0, "Year", years)

exchange_rates.index = datetimes
exchange_rates.insert(0, "Date", datetimes)

# Only "Date" has to be aggregated to find each month's last date.
month_end_dates = exchange_rates.groupby(["Year", "Month"])["Date"].max()
exchange_rates = exchange_rates.loc[month_end_dates, "Year":]

# Build the labels by iterating over the values. At this point the frame has a
# datetime index, so `exchange_rates["Year"][i]` with an integer `i` would rely
# on the deprecated positional fallback of Series.__getitem__.
exchange_rates.index = [
    f"{year}-{month}"
    for year, month in zip(exchange_rates["Year"], exchange_rates["Month"])
]
exchange_rates

Unnamed: 0,Year,Month,EURCZK=X,EURGBP=X,EURNOK=X,EURPLN=X,EURSEK=X,EURCHF=X,EURUSD=X,EURDKK=X
2005-1,2005,1,30.069000,0.69240,8.29560,4.06040,9.10600,1.55010,1.305193,7.44170
2005-2,2005,2,29.665001,0.68860,8.22030,3.88700,9.06010,1.53830,1.321702,7.44310
2005-3,2005,3,30.021999,0.68600,8.21790,4.08330,9.16670,1.55040,1.296697,7.45080
2005-4,2005,4,30.510000,0.67440,8.12870,4.28150,9.17880,1.53850,1.287598,7.44480
2005-5,2005,5,30.344000,0.67700,7.94160,4.16650,9.15500,1.53510,1.230800,7.44090
...,...,...,...,...,...,...,...,...,...,...
2023-1,2023,1,23.783600,0.87851,10.82482,4.70861,11.27741,1.00397,1.085069,7.43828
2023-2,2023,2,23.624399,0.87954,10.96319,4.71120,11.01650,0.99264,1.061121,7.44296
2023-3,2023,3,23.512300,0.88024,11.33408,4.67090,11.29235,0.99600,1.090465,7.44830
2023-4,2023,4,23.467699,0.88268,11.70686,4.57424,11.34748,0.98612,1.103205,7.45414


In [98]:
# Number of missing values per column. We notice that CHF has a NaN value;
# we'll replace it with the value from the previous date.
exchange_rates.isna().sum()

Year        0
Month       0
EURCZK=X    0
EURGBP=X    0
EURNOK=X    0
EURPLN=X    0
EURSEK=X    0
EURCHF=X    1
EURUSD=X    0
EURDKK=X    0
dtype: int64

In [99]:
# Show the month(s) for which the EUR/CHF rate is missing.
exchange_rates.loc[exchange_rates["EURCHF=X"].isna(), "EURCHF=X"]

2012-1   NaN
Name: EURCHF=X, dtype: float64

In [100]:
# Fill the missing EUR/CHF value of month "2012-1" with the daily rate of
# 2012-1-30 from the unaggregated table. Both labels are hard-coded for this
# data set and must be revisited if the currency file changes.
exchange_rates.loc["2012-1","EURCHF=X"] = currency0.loc["2012-1-30","EURCHF=X"]

In [102]:
# Local currency of every non-euro market. The code selects the matching
# "EUR<code>=X" column of `exchange_rates`.
conversions = {
    "Czechia":"CZK", "United Kingdom":"GBP", "Norway":"NOK", "Poland":"PLN", "Sweden":"SEK", "Switzerland":"CHF", "United States":"USD", "Denmark":"DKK"
}

for country in conversions:
    market_data_local = pd.read_csv(f"Data/Market price data/Monthly/{country}.csv", sep=";")
    # Use the same "Year-Month" labels as `exchange_rates`, so that the concat
    # below lines up prices and rates month by month.
    market_data_local.index = [
        f"{year}-{month}"
        for year, month in zip(market_data_local["Year"], market_data_local["Month"])
    ]
    # Drop the leading Year and Month columns and the last column of the file
    # (VALUE_COUNTS in the monthly export shown for Sweden).
    market_data_local = market_data_local.iloc[:, 2:-1]

    currency_code = conversions[country]
    rate_column = f"EUR{currency_code}=X"
    price_columns = market_data_local.columns
    market_data_eur = pd.concat([market_data_local, exchange_rates[rate_column]], axis=1)

    # Convert every price column. Exactly one rate column was appended, so all
    # columns of `market_data_local` are prices. The earlier `iloc[:, :-2]`
    # skipped the last stock and exported it unconverted.
    market_data_eur[price_columns] = market_data_eur[price_columns].div(
        market_data_eur[rate_column], axis=0
    )
    market_data_eur[price_columns].transpose().to_csv(f"Data/Market price data/Monthly (EUR)/{country}.csv")

Similarly we convert the index prices to EUR

In [103]:
# Convert the benchmark indexes to EUR. Indexes that are not listed in
# `currency_codes` are written out unchanged.
country = "Index"
market_data_local = pd.read_csv(f"Data/Market price data/Monthly/{country}.csv", sep=";")
market_data_local.index = [
    str(year) + "-" + str(month)
    for year, month in zip(market_data_local["Year"], market_data_local["Month"])
]
# Keep everything between the leading Year/Month columns and the last column.
market_data_local = market_data_local.iloc[:, 2:-1]

currency_codes = {"^GSPC":"USD", "^OMX":"SEK", "^FTSE":"GBP", "^SSMI":"CHF"}
rate_columns = ["EURUSD=X", "EURSEK=X", "EURGBP=X", "EURCHF=X"]
market_data_eur = pd.concat(
    [market_data_local] + [exchange_rates[rate_name] for rate_name in rate_columns],
    axis=1,
)

# The rate columns were appended last, so everything before them is index data.
n_rates = len(rate_columns)
for col in market_data_eur.iloc[:, :-n_rates]:
    code = currency_codes.get(col)
    if code is None:
        continue
    market_data_eur[col] = market_data_eur[col] / market_data_eur[f"EUR{code}=X"]
market_data_eur.iloc[:, :-n_rates].transpose().to_csv(f"Data/Market price data/Monthly (EUR)/{country}.csv")

Then we transpose all EUR data

In [104]:
# Markets whose monthly files need no currency conversion.
eur_countries = [
    "Austria",
    "Belgium",
    "Estonia",
    "Finland",
    "France",
    "Greece",
    "Germany",
    "Ireland",
    "Italy",
    "Lithuania",
    "Netherlands",
    "Portugal",
    "Spain",
]

In [105]:
# No conversion is applied here: relabel the rows as "Year-Month", drop the
# helper columns and write the table transposed (one row per ticker).
for country in eur_countries:
    market_data_local = pd.read_csv(
        f"Data/Market price data/Monthly/{country}.csv",
        sep=";",
    )
    market_data_local.index = [
        str(year) + "-" + str(month)
        for year, month in zip(market_data_local["Year"], market_data_local["Month"])
    ]
    # Strip the leading Year/Month columns and the last column of the file.
    market_data_local = market_data_local.iloc[:, 2:-1]
    market_data_local.T.to_csv(f"Data/Market price data/Monthly (EUR)/{country}.csv")

Finally, we combine all market data into one file

In [106]:
# Every market that has a file under "Monthly (EUR)"; the files are
# concatenated in this order.
all_countries = [
    "Austria",
    "Belgium",
    "Czechia",
    "Denmark",
    "Estonia",
    "Finland",
    "France",
    "Greece",
    "Germany",
    "Ireland",
    "Italy",
    "Lithuania",
    "United Kingdom",
    "Netherlands",
    "Norway",
    "Portugal",
    "Poland",
    "Spain",
    "Sweden",
    "Switzerland",
    "United States",
]

In [107]:
# Stack the per-country EUR tables (one row per ticker) and append the indexes.
# The frames are collected first and concatenated once; calling pd.concat
# inside the loop would re-copy the growing table on every iteration.
country_frames = []
for country in all_countries:
    new_df = pd.read_csv(f"Data/Market price data/Monthly (EUR)/{country}.csv", index_col=0)
    country_frames.append(new_df)
stock_data = pd.concat(country_frames, axis=0)

index_data = pd.read_csv("Data/Market price data/Monthly (EUR)/Index.csv", index_col=0)
combined_data = pd.concat((stock_data, index_data))
combined_data

Unnamed: 0,2005-1,2005-2,2005-3,2005-4,2005-5,2005-6,2005-7,2005-8,2005-9,2005-10,...,2022-8,2022-9,2022-10,2022-11,2022-12,2023-1,2023-2,2023-3,2023-4,2023-5
AGR.VI,10.413229,10.140198,9.632235,9.194118,9.238566,9.511595,9.853304,9.677350,10.541471,9.926290,...,16.100000,14.150000,13.700000,15.500000,14.950000,16.500000,17.000000,16.350000,17.049999,17.450001
AMAG.VI,,,,,,,,,,,...,30.688524,28.962296,29.250000,32.606556,33.277870,33.469673,34.908199,35.195904,34.700001,33.500000
ANDR.VI,4.314121,4.720487,4.915369,4.383574,5.168197,5.750954,5.674334,5.684648,5.893880,5.551299,...,44.514782,42.249348,45.560368,50.294540,51.843552,53.053719,56.587406,62.299999,58.849998,50.150002
ATS.VI,,,,,,,,,,,...,44.500000,33.650002,31.250000,34.599998,32.000000,31.450001,31.299999,28.350000,27.700001,28.559999
BG.VI,,,,,,,,,,,...,41.491219,40.902428,45.005550,45.410347,45.815136,52.117020,54.232979,41.086426,44.160000,40.660000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
^OMX,,,,,,,,,,,...,179.569661,167.081855,180.338827,192.233721,183.393693,194.950802,202.128617,196.925355,200.095538,191.746597
^OMXH25,,,,,,,,,,,...,4701.919922,4382.379883,4606.529785,4925.620117,4825.439941,4938.439941,5027.020020,4777.660156,4749.990234,4463.100098
^GDAXI,4254.850098,4350.490234,4348.770020,4184.839844,4460.629883,4586.279785,4886.500000,4829.689941,5044.120117,4929.069824,...,12834.959960,12114.360350,13253.740230,14397.040040,13923.589840,15128.269530,15365.139650,15628.839840,15922.379880,15664.019530
^FTSE,7007.943320,7215.364469,7134.693844,7119.959067,7332.348747,7575.111266,7633.381531,7745.138050,8038.890793,7847.254427,...,8472.561513,7833.685177,8264.505864,8764.046429,8428.857862,8846.456170,8955.021455,8670.021806,8916.708329,8612.390187


In [108]:
# Stocks of all markets plus the index rows, one row per ticker and one column
# per "Year-Month".
combined_data.to_csv("Data/Market price data/STOCK_PRICE_EUR.csv") # Export the data

### Filter and sample companies

Filter out companies with NaN values, export the data and a rounded version for easy viewing

In [112]:
# Keep only the stocks that have a price in every month. `dropna` already
# returns a new frame, so `stock_data` itself is left untouched.
filtered_stock_data = stock_data.dropna(axis=0)

# The index rows are appended without filtering, so they may still contain NaN.
filtered_data = pd.concat([filtered_stock_data, index_data])
filtered_data

Unnamed: 0,2005-1,2005-2,2005-3,2005-4,2005-5,2005-6,2005-7,2005-8,2005-9,2005-10,...,2022-8,2022-9,2022-10,2022-11,2022-12,2023-1,2023-2,2023-3,2023-4,2023-5
AGR.VI,10.413229,10.140198,9.632235,9.194118,9.238566,9.511595,9.853304,9.677350,10.541471,9.926290,...,16.100000,14.150000,13.700000,15.500000,14.950000,16.500000,17.000000,16.350000,17.049999,17.450001
ANDR.VI,4.314121,4.720487,4.915369,4.383574,5.168197,5.750954,5.674334,5.684648,5.893880,5.551299,...,44.514782,42.249348,45.560368,50.294540,51.843552,53.053719,56.587406,62.299999,58.849998,50.150002
CAI.VI,11.377453,11.439225,11.422379,11.484156,11.456975,11.485084,11.597516,11.653733,11.912328,11.885530,...,30.528845,29.230768,30.721153,30.480768,27.259615,27.500000,25.961538,23.846153,25.288460,25.549999
DOC.VI,6.848340,6.846281,7.412515,7.618420,7.515469,9.059741,9.779362,10.399153,10.191169,9.775205,...,79.000000,68.000000,79.599998,88.599998,88.599998,97.099998,107.400002,107.599998,106.199997,115.000000
EBS.VI,23.324680,25.355593,24.832478,23.072359,24.935123,25.808506,26.201536,28.041880,27.761145,27.074907,...,21.109707,21.259954,23.438536,27.795700,28.077414,32.631779,34.913654,28.640841,30.903936,30.270000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
^OMX,,,,,,,,,,,...,179.569661,167.081855,180.338827,192.233721,183.393693,194.950802,202.128617,196.925355,200.095538,191.746597
^OMXH25,,,,,,,,,,,...,4701.919922,4382.379883,4606.529785,4925.620117,4825.439941,4938.439941,5027.020020,4777.660156,4749.990234,4463.100098
^GDAXI,4254.850098,4350.490234,4348.770020,4184.839844,4460.629883,4586.279785,4886.500000,4829.689941,5044.120117,4929.069824,...,12834.959960,12114.360350,13253.740230,14397.040040,13923.589840,15128.269530,15365.139650,15628.839840,15922.379880,15664.019530
^FTSE,7007.943320,7215.364469,7134.693844,7119.959067,7332.348747,7575.111266,7633.381531,7745.138050,8038.890793,7847.254427,...,8472.561513,7833.685177,8264.505864,8764.046429,8428.857862,8846.456170,8955.021455,8670.021806,8916.708329,8612.390187


In [113]:
# Write the filtered table twice: once at full precision and once rounded to
# 4 decimals for easy viewing.
filtered_data.to_csv("Data/Market price data/STOCK_PRICE_EUR_NOTNULL.csv")
filtered_data.round(4).to_csv("Data/Market price data/STOCK_PRICE_EUR_NOTNULL_ROUNDED.csv")

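Summary of value counts by stock market, identified by the exchange suffix of the ticker symbol (the part after the first "."); tickers without a suffix are grouped under the empty label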

In [115]:
# Counts by stock market

market_data_1 = filtered_data.copy()

# The market is taken from the part of the ticker after its first ".".
# Tickers without a dot, or with the dot as first character, get "".
suffixes = []
for ticker in market_data_1.index:
    head, dot, tail = ticker.partition(".")
    suffixes.append(tail if (dot and head) else "")

market_data_1.insert(0, "Stock", suffixes)
market_data_1.groupby("Stock").count()

Unnamed: 0_level_0,2005-1,2005-2,2005-3,2005-4,2005-5,2005-6,2005-7,2005-8,2005-9,2005-10,...,2022-8,2022-9,2022-10,2022-11,2022-12,2023-1,2023-2,2023-3,2023-4,2023-5
Stock,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,745,745,745,745,745,745,745,745,745,745,...,747,747,747,747,747,747,747,747,747,747
AS,27,27,27,27,27,27,27,27,27,27,...,27,27,27,27,27,27,27,27,27,27
AT,3,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
BE,12,12,12,12,12,12,12,12,12,12,...,12,12,12,12,12,12,12,12,12,12
BR,38,38,38,38,38,38,38,38,38,38,...,38,38,38,38,38,38,38,38,38,38
CO,22,22,22,22,22,22,22,22,22,22,...,22,22,22,22,22,22,22,22,22,22
HE,26,26,26,26,26,26,26,26,26,26,...,26,26,26,26,26,26,26,26,26,26
IR,7,7,7,7,7,7,7,7,7,7,...,7,7,7,7,7,7,7,7,7,7
L,233,233,233,233,233,233,233,233,233,233,...,233,233,233,233,233,233,233,233,233,233
LS,18,18,18,18,18,18,18,18,18,18,...,18,18,18,18,18,18,18,18,18,18
