In [None]:
import pandas as pd
pd.set_option('display.max_rows',500)
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
plt.style.use("fivethirtyeight")
%matplotlib inline

# For reading stock data from yahoo
import pandas_datareader.data as pdr
from pandas_datareader.data import DataReader
import yfinance as yf

# For time stamps
import datetime

In [None]:

# Read the universe file and collect the values of its 'symbol' column
file = pd.read_csv('M6_Universe.csv')
stocks_list = file['symbol'].tolist()

# Analysis window: from midnight three years before today up to now.
end = datetime.datetime.now()
try:
    start = datetime.datetime(end.year - 3, end.month, end.day)
except ValueError:
    # Only reachable on Feb 29: the year three years back has no such day,
    # so fall back to Feb 28 instead of crashing.
    start = datetime.datetime(end.year - 3, end.month, 28)

# For every month of every calendar year touched by the window, take the day
# before the 1st (i.e. the last day of the preceding month) as 'YYYY-MM-DD'.
last_month_days = [
    (datetime.datetime(year, month, 1) - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
    for year in range(start.year, end.year + 1)
    for month in range(1, 13)
]
last_month_days

In [None]:
# 'Adj Close' series from the 'yahoo' data source over [start, end]:
# one request for the whole universe, one for SPY alone.
all_stocks = DataReader(name=stocks_list, data_source='yahoo', start=start, end=end)['Adj Close']
spy = DataReader(name=['SPY'], data_source='yahoo', start=start, end=end)['Adj Close']

# 'marketCap' field of the Yahoo quote data for every universe symbol
marketCaps = pdr.get_quote_yahoo(stocks_list)['marketCap']
all_stocks.describe()

In [None]:
# Monthly prices: collapse the daily price table to one row per (year, month)
trading_days = pd.DatetimeIndex(all_stocks.index)

month_stocks = all_stocks.copy()
month_stocks['Datestring'] = month_stocks.index
month_stocks['year'] = trading_days.year
month_stocks['month'] = trading_days.month

# groupby(...).last() keeps, per column, the last non-null entry of each month
month_end_rows = month_stocks.sort_values(by='Date').groupby(['year', 'month']).last()

# Leave out the most recent (year, month) group (presumably the still-running
# month — confirm) and index the rest by the date carried in 'Datestring'.
month_on_month = month_end_rows.iloc[:-1].set_index(['Datestring'])

month_on_month

In [None]:
# Row-to-row percentage change of the monthly prices, rounded to 3 decimals and
# transposed so that each price column becomes a row and each month a column.
monthly_returns = month_on_month.pct_change().round(3).transpose()
monthly_returns

In [None]:
# Rank generator helpers (pandas ranking does not handle ties on margins of classes as explained in M6 description)
def generate_rank(df):
    """Rank every column of ``df`` independently.

    Each column that is not entirely null is handed to
    ``generate_rank_from_values``; the two sequences it returns are stored in
    two frames shaped like ``df``.

    Columns that are entirely null, or that the helper rejects by returning
    ``None`` (it prints its own message in that case), are left as NaN instead
    of aborting the whole run with an unpacking error.

    Returns:
        (ranks, vector_ranks): two DataFrames with the index and columns of ``df``.
    """
    ranks = pd.DataFrame(index=df.index, columns=df.columns)
    vector_ranks = pd.DataFrame(index=df.index, columns=df.columns)

    for col in df.columns:
        if df[col].isnull().all():
            continue
        outcome = generate_rank_from_values(df[col].values)
        if outcome is None:
            # Helper could not rank this column; keep its NaN placeholders.
            continue
        ranks[col], vector_ranks[col] = outcome

    return ranks, vector_ranks

def generate_rank_from_values(value_list):
    """Turn 100 values into quintile ranks (1 = lowest fifth, 5 = highest fifth).

    The values are sorted in descending order and split by position into five
    classes of 20. Values that are tied with each other all receive the average
    of the classes their positions fall into, so a tie that straddles a class
    border yields a fractional rank (e.g. 4.5) and a membership vector that
    spreads the probability over the classes involved (e.g. 0.5 / 0.5).
    Averages and shares are rounded to 2 decimals.

    Ties are detected directly from the values (via their average rank) rather
    than by comparing against ``pd.qcut``: ``qcut`` raises "Bin edges must be
    unique" when more than a fifth of the values are identical. Membership
    shares are kept in float columns from the start so fractional shares never
    have to be written into integer/boolean indicator columns.

    Missing values sort last, keep their positional class and are never treated
    as tied with one another.

    Args:
        value_list: sequence of exactly 100 values.

    Returns:
        ``(ranks, vector_ranks)`` in the order of ``value_list``: ``ranks`` is a
        list of 100 floats, ``vector_ranks`` a list of 100 five-element lists
        holding the share of classes 1..5. If ``value_list`` does not hold
        exactly 100 values a message is printed and ``None`` is returned.
    """
    if len(value_list) != 100:
        print('Values list not 100 in number')
        return

    classes = [1, 2, 3, 4, 5]

    df = pd.DataFrame(value_list, columns=['data'])
    orig_index = df.index
    df = df.sort_values('data', ascending=False)
    # Positional quintiles: the 20 largest values are class 5, the next 20 class 4, ...
    df['my5rank'] = [5]*20 + [4]*20 + [3]*20 + [2]*20 + [1]*20

    adjusted = df['my5rank'].astype(float)
    membership = pd.DataFrame(
        {c: (df['my5rank'] == c).astype(float) for c in classes},
        index=df.index,
    )

    # Equal values share the same average rank, so the rank identifies tie groups;
    # missing values get a NaN rank and are excluded from the averaging.
    tie_key = df['data'].rank()
    comparable = tie_key.notna()
    if comparable.any():
        groups = tie_key[comparable]
        adjusted.update(
            df.loc[comparable, 'my5rank'].groupby(groups).transform('mean').round(2)
        )
        membership.update(
            membership[comparable].groupby(groups).transform('mean').round(2)
        )

    # Back to the caller's ordering
    ranks = adjusted.loc[orig_index].tolist()
    vector_ranks = membership.loc[orig_index, classes].values.tolist()
    return ranks, vector_ranks

# Rank the returns column by column (generate_rank processes each column of
# monthly_returns on its own) and display the scalar ranks.
ranks, vector_ranks = generate_rank(monthly_returns)
ranks

In [None]:
# Peek at last month's data
# The most recent month is the last index label of the monthly price table;
# after transposing, that label is the single column kept in agg_data.
last_month = month_on_month.index[-1]
agg_data = month_on_month.T.copy()[[last_month]]

agg_data['Last Month Return'] = monthly_returns[last_month]

# Scalar ranks and rank vectors for every month; only last_month is used here
monthly_ranks, monthly_vranks = generate_rank(monthly_returns)

agg_data['Last Month Return Rank'] = monthly_ranks[last_month]
agg_data['Last Month Return Rank Vector'] = monthly_vranks[last_month]
agg_data

In [None]:
# view of the monthly returns
time_span = 40  # number of trailing months to draw
fig, ax = plt.subplots(figsize=(16, 6))
ax.set_title('Monthly Return')
ax.set_xlabel('Date', fontsize=18)
ax.set_ylabel('Monthly Return', fontsize=18)

# One line per symbol, for the first four symbols of the universe
returns_by_month = monthly_returns.T
for stock in stocks_list[:4]:
    ax.plot(returns_by_month[stock][-time_span:])

ax.legend(stocks_list[:4], loc='lower right')
plt.show()

In [None]:
# monthly return ranks
fig, ax = plt.subplots(figsize=(16, 6))
ax.set_title(f"Monthly Return Rank for {','.join(stocks_list[:4])}")
ax.set_xlabel('Date', fontsize=18)
ax.set_ylabel('Monthly Return Rank', fontsize=18)

# Last 60 entries of each rank series, first four symbols only
ranks_by_month = monthly_ranks.T
for stock in stocks_list[:4]:
    ax.plot(ranks_by_month[stock][-60:])

ax.legend(stocks_list[:4], loc='lower right')
plt.show()

In [None]:
# Forecast Performance

def RPS(df, predictions):
    """Score every column of ``df`` against the same ``predictions``.

    Builds a frame shaped like ``df`` and, for each column that is not entirely
    null, fills it with ``RPS_T(column values, predictions)``. Entirely-null
    columns stay NaN.
    """
    result = pd.DataFrame(index=df.index, columns=df.columns)

    scorable = (name for name in df.columns if not df[name].isnull().all())
    for name in scorable:
        result[name] = RPS_T(df[name].values, predictions)

    return result

def RPS_T(actual, predictions):
    """Pair up ``actual`` and ``predictions`` element by element and score each pair.

    Returns a list with one ``RPS_i_T`` result per pair; pairing stops at the
    shorter of the two inputs.
    """
    return [RPS_i_T(observed, forecast) for observed, forecast in zip(actual, predictions)]

def RPS_i_T(actual, predictions):
    """Ranked Probability Score of one forecast vector against one outcome vector.

    Both arguments are probability vectors over the ordered rank classes. The
    score is the mean squared difference between their *cumulative*
    distributions, so probability placed on a class close to the realised one
    is penalised less than probability placed far away. Comparing the raw
    (non-cumulative) vectors would ignore the ordering of the classes.

    Args:
        actual: realised class-membership vector.
        predictions: forecast class probabilities, same length as ``actual``.

    Returns:
        The score as a float; 0 for a perfect forecast.
    """
    actual = np.asarray(actual, dtype=float)
    predictions = np.asarray(predictions, dtype=float)
    cumulative_gap = np.cumsum(predictions, axis=-1) - np.cumsum(actual, axis=-1)
    return np.mean(cumulative_gap ** 2)

In [None]:
# PMF ranks over the previous 36 months to today
# For every row of monthly_ranks, build the empirical distribution of its
# non-null ranks over the five classes.
pmf_vranks = []

# Transpose once instead of once per row
ranks_by_month = monthly_ranks.T
for row in monthly_ranks.index:
    observed = ranks_by_month[row].dropna().values
    # Bin edges are 0..5 with only the last bin closed on the right, so ranks
    # are nudged down by 0.001 to make an integer rank r fall into [r-1, r);
    # without the shift ranks 4 and 5 would share the last bin. Unit-width
    # bins make density=True return plain proportions.
    pmf, _ = np.histogram(observed - 0.001, bins=range(0, 6), density=True)
    pmf_vranks.append(np.round(pmf, 3).tolist())


agg_data['PMF Rank Vector'] = pmf_vranks
# Expected rank under the PMF: sum over classes of class * probability
agg_data['PMF Rank'] = np.dot(np.array(pmf_vranks), np.array(range(1,6)))
agg_data['Last Month PMF RPS'] = RPS_T(monthly_vranks[last_month].values.tolist(), pmf_vranks)
agg_data


In [None]:
# Monthly Return
import random

area = np.pi * 20

fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(agg_data['PMF Rank'], agg_data['Last Month Return Rank'], s=area)
ax.set_xlabel('PMF rank')
ax.set_ylabel('Last Month rank')
ax.set_title('Last Month rank vs PMF rank')

# y=x line
ax.plot(range(6), range(6), 'r-')

# Label every point with its index entry; the label offset is jittered at random
# (x offset drawn first, then y) to reduce overlapping text.
for label, x, y in zip(agg_data.index, agg_data['PMF Rank'], agg_data['Last Month Return Rank']):
    offset_x = 10 + random.randint(-20, 20)
    offset_y = 10 + random.randint(-20, 20)
    ax.annotate(label, xy=(x, y), xytext=(offset_x, offset_y),
                textcoords='offset points', ha='right', va='bottom')
    

### 'Efficient Market Hypothesis' Investment Decision

In [None]:
# naive, each stock gets 1% of portfolio
# Daily percentage change per price column, transposed (one row per column of
# all_stocks, one column per date) and rounded to 3 decimals.
daily_ret = all_stocks.pct_change().T.round(3)

# Apply the flat 0.01 weight to every return at once
naive_returns = daily_ret * 0.01

# Extra 'Total' row: per-date sum of the weighted returns
naive_returns.loc['Total'] = naive_returns.sum()

In [None]:
# today's market cap data, each stock gets % proportional to market cap
# Weight = share of the summed market cap, rounded to 4 decimals
cap_weights = (marketCaps / marketCaps.sum()).round(4)
agg_data['EMH Decision'] = cap_weights
emh_decision = agg_data['EMH Decision']

# Scale each date's column of returns by the weights (matched on the row labels)
emh_returns = daily_ret.copy()
for col in emh_returns.columns:
    emh_returns[col] = emh_returns[col].mul(emh_decision)

# Extra 'Total' row: per-date sum of the weighted returns
emh_returns.loc['Total'] = emh_returns.sum()

In [None]:
# Stack the two 'Total' rows (EMH first, naive second) into one two-row frame
RET = pd.concat([emh_returns.loc[['Total'], :], naive_returns.loc[['Total'], :]])
RET.index = ['EMH-RET','Naive-RET']

# Work date-by-row to append the SPY daily return (rounded to 3 decimals),
# then flip back and convert all three simple returns to log returns.
ret = RET.T
ret['SPY'] = spy['SPY'].pct_change().round(3)
ret = np.log(1 + ret.T)
ret

In [None]:
# Daily log returns of the three series, one row per date.
# NOTE: this rebinds `monthly_returns`, which earlier cells use for the per-stock
# monthly returns; those cells must not be re-run after this one without
# re-running the cell that builds the original frame first.
monthly_returns = ret.T

monthly_returns['Datestring'] = monthly_returns.index

monthly_returns['year'] = pd.DatetimeIndex(monthly_returns.index).year
monthly_returns['month'] = pd.DatetimeIndex(monthly_returns.index).month

# Group once and aggregate only the numeric return columns: summing the
# datetime 'Datestring' column is rejected by recent pandas versions.
portfolios = ['EMH-RET', 'Naive-RET', 'SPY']
by_month = monthly_returns.sort_values(by='Date').groupby(['year','month'])

# Monthly return = sum of the daily log returns within the month
month_cum_returns = by_month[portfolios].sum()

# Monthly risk = standard deviation of the daily log returns within the month
month_std = by_month[portfolios].std()
for name in portfolios:
    month_cum_returns[f'{name}-sdp'] = month_std[name]

# Information ratio = monthly return / monthly risk
for name in portfolios:
    month_cum_returns[f'{name}-IR'] = month_cum_returns[name] / month_cum_returns[f'{name}-sdp']

month_cum_returns['Datestring'] = by_month['Datestring'].last()

# Drop the most recent (year, month) group and index the rest by its last date
month_cum_returns = month_cum_returns.drop(month_cum_returns.tail(1).index).set_index(['Datestring'])
month_cum_returns

In [None]:
# Monthly return of the EMH, naive and SPY series around a zero reference line
fig, ax = plt.subplots(figsize=(16, 6))
ax.set_title("Decision Monthly Returns")
ax.set_xlabel('Month', fontsize=18)
ax.set_ylabel('Monthly return', fontsize=18)

ax.axhline(y=0.0, linestyle='-', color='black')
for series_name in ('EMH-RET', 'Naive-RET', 'SPY'):
    ax.plot(month_cum_returns[series_name])

# Legend entries follow drawing order: reference line first, then the three series
ax.legend(['Zero','EMH','Naive','SPY'])

plt.show()

In [None]:
# Within-month standard deviation of the EMH, naive and SPY series
fig, ax = plt.subplots(figsize=(16, 6))
ax.set_title("Decision Monthly Risk")
ax.set_xlabel('Month', fontsize=18)
ax.set_ylabel('Monthly risk', fontsize=18)

ax.axhline(y=0.0, color='k', linestyle='-')
for series_name in ('EMH-RET-sdp', 'Naive-RET-sdp', 'SPY-sdp'):
    ax.plot(month_cum_returns[series_name])

# Legend entries follow drawing order: reference line first, then the three series
ax.legend(['Zero','EMH','Naive','SPY'])

plt.show()

In [None]:
# Monthly information ratio (monthly return / monthly risk) of the three series
plt.figure(figsize=(16,6))
plt.title("EMH Decision Monthly Information Ratio")
plt.xlabel('Date', fontsize=18)
# The two sibling plots label their y axis; this one did not
plt.ylabel('Information ratio', fontsize=18)

plt.axhline(y=0.0, color='k', linestyle='-')
plt.plot(month_cum_returns['EMH-RET-IR'])
plt.plot(month_cum_returns['Naive-RET-IR'])
plt.plot(month_cum_returns['SPY-IR'])


# Legend entries follow drawing order: reference line first, then the three series
plt.legend(['Zero','EMH','Naive','SPY'])
plt.show()

In [None]:
# Unpack the per-row PMF vectors into a 2-D array (one row per asset, five columns)
pmf_values = np.array(agg_data['PMF Rank Vector'].tolist())

# Start from an empty frame that only carries agg_data's index.
# (DataFrame.filter expects a list of labels; handing it the bare string
# 'PMF Rank Vector' matched no column on older pandas and raises on newer ones.)
submission = pd.DataFrame(index=agg_data.index)
for i in range(5):
    submission[f'Rank {i+1}'] = pmf_values[:,i]

submission['Decision'] = agg_data['EMH Decision']
#submission.to_csv('submission.csv')
submission