Before running this notebook: 

```bash
# Downloads the data, unless it is already there (this uses R)
make data  

# Install dependencies
pip install numpy pandas matplotlib seaborn tqdm
pip install entropy_estimators normi
```

The efficient market hypothesis (EMH) claims that "current prices" contain all the information needed to forecast future prices:
if this were true, the conditional entropy $H(\text{future}|\text{past}) = H(\text{past}, \text{future}) - H(\text{past})$ would be "high".
We need to clarify a few terms in that sentence: 
- By "current prices" (resp. "future prices"), we mean the current state of the market: we will define it as the 1-, 3-, 6- and 12-month trailing (resp. forward) log-returns of assets in the universe
- The entropy of continuous, multivariate distributions is notoriously difficult to estimate from sample data: we use the KL estimator [3], which assumes that the distribution is locally uniform [1]. Instead, it is possible to make a locally Gaussian assumption [2], but the computations are trickier (implementation bugs are more likely), slower, and noisier.
- Instead of the conditional entropy, one may consider the *normalized mutual information* [4]:
  - $H(X)$: uncertainty about $X$
  - $H(Y|X)$: remaining uncertainty about $Y$ after knowing $X$
  - $I(X;Y)$: decrease in uncertainty about $X$ brought by $Y$
  - $\text{NMI}(X;Y) \in [0,1]$ can be seen as a non-Gaussian (and multivariate) correlation
    
References: 

[1] *Sample estimate of the entropy of a random vector* (non-English -- see [2] instead) (L.F. Kozachenko and N.N. Leonenko, 1987) https://www.mathnet.ru/php/archive.phtml?wshow=paper&jrnid=ppi&paperid=797&option_lang=eng

[2] *A non-parametric k-nearest neighbour entropy estimator* (D. Lombardi and S. Pant, 2015) https://arxiv.org/abs/1506.06501

[3] https://github.com/paulbrodersen/entropy_estimators

[4] *Accurate estimation of the normalized mutual information of multidimensional data* (D. Nagel et al., 2024) https://arxiv.org/abs/2405.04980 (code: https://moldyn.github.io/NorMI/)

In [None]:
import warnings
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from entropy_estimators.continuous import get_h
from normi import NormalizedMI

# Known-noisy warnings raised while the notebook runs. Each entry is a regular
# expression matched against the start of the warning text.
ignored_warning_messages = (
    "divide by zero encountered in log",
    "invalid value encountered in sqrt",
    "The figure layout has changed to tight",
)
for message in ignored_warning_messages:
    warnings.filterwarnings(action="ignore", message=message)

In [None]:
def axis_year(ax, format="%Y"):
    """Label the x-axis of a date plot with one centred label per year.

    Major ticks are placed by a default ``YearLocator`` and carry no text.
    Minor ticks are placed on 1 July of each year, have zero-length tick
    marks and carry the year label, so each label sits in the middle of its
    year. Major ticks whose year is a multiple of 10 are lengthened.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes whose x-axis holds dates; it is modified in place.
    format : str
        ``strftime`` pattern passed to ``DateFormatter`` for the year labels.
    """
    xaxis = ax.xaxis
    mdates = matplotlib.dates

    # Major ticks mark year boundaries (no text); minor ticks carry the labels.
    xaxis.set_major_locator(mdates.YearLocator())
    xaxis.set_minor_locator(mdates.YearLocator(month=7, day=1))
    xaxis.set_major_formatter(matplotlib.ticker.NullFormatter())
    xaxis.set_minor_formatter(mdates.DateFormatter(format))

    # Hide the mid-year tick marks themselves and centre their labels.
    for minor_tick in xaxis.get_minor_ticks():
        for mark in (minor_tick.tick1line, minor_tick.tick2line):
            mark.set_markersize(0)
        minor_tick.label1.set_horizontalalignment('center')

    # Emphasise decade boundaries with a longer tick mark.
    major_ticks = xaxis.get_major_ticks()
    major_locations = ax.get_xticks()
    for location, major_tick in zip(major_locations, major_ticks):
        if mdates.num2date(location).year % 10 == 0:
            major_tick.tick1line.set_markersize(15)

# Data

In [None]:
# Load only the columns this notebook uses (identifiers, volatility, and the
# 1-, 3-, 6- and 12-month return columns).
columns = ['stock_id', 'date', 'Vol1Y_Usd', 'R1M_Usd', 'R3M_Usd', 'R6M_Usd', 'R12M_Usd', 'R1M_Rel']
# `usecols` avoids parsing unused columns; the explicit `[columns]` restores
# the column order listed above (usecols keeps the file's order).
d = pd.read_csv("raw/data_ml.csv", usecols=columns)[columns].copy()
# Parse the dates *before* sorting, so rows are ordered chronologically
# whatever the textual date format in the file is.
d['date'] = pd.to_datetime(d['date'])
# The per-stock shifts performed later rely on this (stock, date) ordering.
d = d.sort_values(["stock_id", "date"])

In [None]:
# Add forward returns.
#
# Rows are sorted by date within each stock, so the k-month return stored k
# rows *later* covers the k months immediately following the period covered by
# the current row's k-month return. Pulling a later row onto the current one
# requires a NEGATIVE shift: shift(+k) would instead fetch the value from k
# rows earlier, i.e. the preceding period.
# NOTE(review): assumes one row per stock per month with no gaps, so that
# k rows == k months -- confirm against the data.
d['F1M_Usd'] = d.groupby('stock_id')['R1M_Usd'].shift(-1)
d['F3M_Usd'] = d.groupby('stock_id')['R3M_Usd'].shift(-3)
d['F6M_Usd'] = d.groupby('stock_id')['R6M_Usd'].shift(-6)
d['F12M_Usd'] = d.groupby('stock_id')['R12M_Usd'].shift(-12)

# Drop every row with a missing value in any column; this includes the last
# rows of each stock, which have no forward return.
d = d.dropna()

d.head()

# Computations: cross-sectional

In [None]:
# Column groups describing the "past" and "future" state of the market, one
# column per horizon (in months).
past = [f'R{months}M_Usd' for months in (1, 3, 6, 12)]
future = [f'F{months}M_Usd' for months in (1, 3, 6, 12)]
# Both groups must have the same width: the NMI estimator below is built with
# n_dims=len(past) and is fed the concatenated past+future columns.
assert len(past) == len(future)

In [None]:
# Per-date (cross-sectional) measures, keyed by date while the loop runs and
# converted to Series afterwards.
volatility = {}
entropy = {}
conditional_entropy = {}
mutual_information = {}
nmi_failures = []  # dates for which the NMI estimator raised

nmi = NormalizedMI(n_dims=len(past), verbose=False)

# groupby splits the frame once (instead of building a boolean mask and
# filtering the whole frame three times per date) and yields the groups in
# ascending date order, so the resulting Series are chronological.
for date, snapshot in tqdm(d.groupby('date')):
    X = snapshot[past]
    XY = snapshot[past + future]
    entropy[date] = get_h(X)
    # Cross-sectional dispersion of the first "past" column.
    volatility[date] = X.iloc[:, 0].std()
    # H(future | past) = H(past, future) - H(past)
    conditional_entropy[date] = get_h(XY) - entropy[date]

    # Best effort: a failed NMI estimate leaves NaN for that date, but the
    # failure is recorded instead of being silently discarded. Catching
    # Exception (not a bare except) lets KeyboardInterrupt stop the loop.
    mutual_information[date] = np.nan
    try:
        nmi.fit(XY.values)
        # Presumably the NMI between the first n_dims columns (past) and the
        # remaining ones (future) -- confirm against the NorMI documentation.
        mutual_information[date] = nmi.nmi_[0, 1]
    except Exception:
        nmi_failures.append(date)

if nmi_failures:
    warnings.warn(
        f"NMI estimation failed for {len(nmi_failures)} of "
        f"{len(mutual_information)} dates; those entries are left as NaN"
    )

conditional_entropy = pd.Series(conditional_entropy)
entropy = pd.Series(entropy)
volatility = pd.Series(volatility)
mutual_information = pd.Series(mutual_information)

In [None]:
# One stacked panel per cross-sectional measure, sharing the year-labelled
# date axis produced by axis_year.
fig, axs = plt.subplots(4, 1, figsize=(9, 12), layout='constrained')
line_panels = [
    ("Conditional Entropy", conditional_entropy),
    ("Entropy", entropy),
    ("Normalized Mutual Information", mutual_information),
    ("Cross-sectional volatility", volatility),
]
for ax, (label, series) in zip(axs, line_panels):
    ax.plot(series)
    ax.set_ylabel(label)

# Volatility is shown on a log scale; only a subset of the ticks is labelled
# (empty strings leave a tick mark without text).
volatility_ax = axs[3]
volatility_ax.set_yscale('log')
volatility_ax.set_yticks(
    [.05, .06, .07, .08, .09, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1],
    ['', '', '', '', '', .1, .2, .3, .4, .5, '', '', '', '', 1],
)

for ax in axs:
    axis_year(ax, "%y")
fig.suptitle("Cross-sectional")
plt.show()

In [None]:
# Stand-alone plot of the cross-sectional NMI, also written to a PDF file.
# NOTE: savefig does not create directories; "plots/" must already exist.
fig, ax = plt.subplots(figsize=(9, 3), layout='constrained')
ax.plot(mutual_information)
ax.set(ylabel="Normalized Mutual Information", title="Cross-sectional")
axis_year(ax, "%y")
fig.savefig("plots/2_NMI_cross-sectional.pdf")
plt.show()

In [None]:
# Pairwise scatter plots of the cross-sectional measures (volatility in logs).
measures = pd.DataFrame({
    'Conditional Entropy':           conditional_entropy,
    'Entropy':                       entropy,
    'Normalized Mutual Information': mutual_information,
    'log(Volatility)':               np.log(volatility),
})
grid = sns.pairplot(measures)
# `PairGrid.fig` is deprecated in favour of `PairGrid.figure`.
grid.figure.suptitle("Cross-sectional", y=1)
# End with plt.show(), like the other plotting cells, so the cell does not
# display the repr of the Text object returned by suptitle.
plt.show()

# Computations: time series

In [None]:
# Per-stock (time-series) measures, keyed by stock id while the loop runs and
# converted to Series afterwards.
volatility = {}
entropy = {}
conditional_entropy = {}
mutual_information = {}
nmi_failures = []  # stock ids for which the NMI estimator raised

nmi = NormalizedMI(n_dims=len(past), verbose=False)

# groupby splits the frame once instead of building a boolean mask and
# filtering the whole frame three times per stock. The loop variable is named
# `stock_id` so that the builtin `id` is not shadowed.
for stock_id, history in tqdm(d.groupby('stock_id')):
    X = history[past]
    XY = history[past + future]
    entropy[stock_id] = get_h(X)
    # Dispersion over time of the first "past" column for this stock.
    volatility[stock_id] = X.iloc[:, 0].std()
    # H(future | past) = H(past, future) - H(past)
    conditional_entropy[stock_id] = get_h(XY) - entropy[stock_id]

    # Best effort: a failed NMI estimate leaves NaN for that stock, but the
    # failure is recorded instead of being silently discarded. Catching
    # Exception (not a bare except) lets KeyboardInterrupt stop the loop.
    mutual_information[stock_id] = np.nan
    try:
        nmi.fit(XY.values)
        # Presumably the NMI between the first n_dims columns (past) and the
        # remaining ones (future) -- confirm against the NorMI documentation.
        mutual_information[stock_id] = nmi.nmi_[0, 1]
    except Exception:
        nmi_failures.append(stock_id)

if nmi_failures:
    warnings.warn(
        f"NMI estimation failed for {len(nmi_failures)} of "
        f"{len(mutual_information)} stocks; those entries are left as NaN"
    )

conditional_entropy = pd.Series(conditional_entropy)
entropy = pd.Series(entropy)
volatility = pd.Series(volatility)
mutual_information = pd.Series(mutual_information)

In [None]:
# Replace non-finite values (np.inf, -np.inf) with np.nan, in place: the
# cells further down reuse these same Series.
for series in (conditional_entropy, entropy, volatility, mutual_information):
    series.loc[~np.isfinite(series)] = np.nan

# 2x2 grid of histograms, filled row by row.
fig, axs = plt.subplots(2, 2, figsize=(8, 8), layout='constrained')
bar_style = dict(edgecolor='tab:blue', facecolor='lightblue')
histograms = [
    ("Conditional Entropy", conditional_entropy),
    ("Normalized Mutual Information", mutual_information),
    ("Entropy", entropy),
    ("log(Volatility)", np.log(volatility)),
]
for ax, (title, values) in zip(axs.flat, histograms):
    ax.hist(values, **bar_style)
    ax.set_title(title)
fig.suptitle("Time series")
plt.show()

In [None]:
# Compact histogram of the per-stock NMI, also written to a PDF file.
# NOTE: savefig does not create directories; "plots/" must already exist.
fig, ax = plt.subplots(figsize=(4, 2.5), layout='constrained')
ax.hist(mutual_information, bins=20, edgecolor='tab:blue', facecolor='lightblue')
#ax.set_title( "Time series" )
# Minimal frame: only the bottom spine stays visible, and the count axis
# carries no ticks.
for spine_name in ('left', 'right', 'top'):
    ax.spines[spine_name].set_visible(False)
ax.set(xlabel="Normalized Mutual Information", yticks=[])
fig.savefig("plots/2_NMI_time_series.pdf")
plt.show()

In [None]:
# Pairwise scatter plots of the per-stock measures (volatility in logs).
measures = pd.DataFrame({
    'Conditional Entropy':           conditional_entropy,
    'Entropy':                       entropy,
    'Normalized Mutual Information': mutual_information,
    'log(Volatility)':               np.log(volatility),
})
grid = sns.pairplot(measures)
# `PairGrid.fig` is deprecated in favour of `PairGrid.figure`.
grid.figure.suptitle("Time series", y=1)
# End with plt.show(), like the other plotting cells, so the cell does not
# display the repr of the Text object returned by suptitle.
plt.show()