In [1]:
from utils.cluster import group_kmeans, single_kmeans
from utils.visualize import (
    plot_distortions,
    plot_label_distr,
    plot_return,
)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

%load_ext autoreload
%autoreload 2

# Clustering with data from day i and day i-1

## Read and preprocess data

In [3]:
# Load the raw SPY daily price history.
READ_PATH = "data/raw/spy_max_221118.csv"
df = pd.read_csv(READ_PATH)

# Map the capitalised CSV headers to the lower-case names used below.
rename_dict = {
    "Date": "date",
    "Open": "open",
    "High": "high",
    "Low": "low",
    "Close": "close",
    "Volume": "volume",
}
df.rename(columns=rename_dict, inplace=True)

# Target variable: the next day's (i+1) open-to-close return, i.e. the payoff
# of buying at the open and selling at the close of the following session.
for price_col in ("open", "close"):
    df[f"{price_col}_i+1"] = df[price_col].shift(-1)
df.dropna(inplace=True)  # the final row has no following day
annualized_fctr = 252
df["return_i+1"] = (
    (df["close_i+1"] - df["open_i+1"]) / df["open_i+1"] * annualized_fctr
)

# Clustering attributes, step 1: previous-day (i-1) open, high, low, close
# and volume, obtained by shifting each column down one row.
ohlcv_cols = ["open", "high", "low", "close", "volume"]
for col in ohlcv_cols:
    df[f"{col}_i-1"] = df[col].shift(1)
df.dropna(inplace=True)  # the first row has no previous day

# Clustering attributes, step 2: ratio of today's value to yesterday's.
for col in ohlcv_cols:
    df[f"{col}/{col}_i-1"] = df[col] / df[f"{col}_i-1"]

# Keep only the ratio columns as the feature matrix for clustering.
use_cols = [f"{col}/{col}_i-1" for col in ohlcv_cols]
X = df[use_cols]
X.head()

Unnamed: 0,date,open,high,low,close,volume,open_i+1,close_i+1,return_i+1,open_i-1,high_i-1,low_i-1,close_i-1,volume_i-1,open/open_i-1,high/high_i-1,low/low_i-1,close/close_i-1,volume/volume_i-1
1,1993-02-01 00:00:00-05:00,25.352041,25.514208,25.352041,25.514208,480500,25.496205,25.568279,0.712367,25.352045,25.352045,25.225915,25.334026,1003200.0,1.0,1.006396,1.005,1.007112,0.478967
2,1993-02-02 00:00:00-05:00,25.496205,25.586298,25.44215,25.568279,201300,25.604302,25.838543,2.305419,25.352041,25.514208,25.352041,25.514208,480500.0,1.005686,1.002825,1.003554,1.002119,0.418939
3,1993-02-03 00:00:00-05:00,25.604302,25.856561,25.586284,25.838543,529400,25.928637,25.946655,0.175122,25.496205,25.586298,25.44215,25.568279,201300.0,1.00424,1.010563,1.005665,1.01057,2.629906
4,1993-02-04 00:00:00-05:00,25.928637,26.000711,25.640341,25.946655,531500,25.928637,25.928637,0.0,25.604302,25.856561,25.586284,25.838543,529400.0,1.012667,1.005575,1.002113,1.004184,1.003967
5,1993-02-05 00:00:00-05:00,25.928637,25.982692,25.784488,25.928637,492100,25.928637,25.928637,0.0,25.928637,26.000711,25.640341,25.946655,531500.0,1.0,0.999307,1.005622,0.999306,0.92587


## Perform clustering and plot return distribution

In [None]:
# Elbow search on the raw (unscaled) features.
# Plot settings are declared first; fig_size is reused by the next cell.
fig_size = (12, 8)
file_name = 'kmeans_elbow_day_i_i-1_no_norm.png'

# group_kmeans presumably fits k-means for a range of k bounded by k_upper and
# returns one distortion per k -- see utils.cluster to confirm.
k_upper = 20
distortions = group_kmeans(k_upper, X)

# Author's reading of the resulting elbow plot: k=4 is the optimal k.
plot_distortions(k_upper, distortions, fig_size, file_name)

In [None]:
# Cluster the raw features with the k picked from the elbow plot (k=4), then
# plot how many samples land in each cluster.
file_name = 'kmeans_day_i_i-1_no_norm_cluster_distr.png'
opt_k = 4
labels = single_kmeans(opt_k, X)
plot_label_distr(labels, fig_size, file_name)
# Note: the clusters are very imbalanced -> may need a way to improve this later.

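## Perform clustering with min-max scaled data and plot return
* the variance of volume/volume_i-1 is much larger than that of the other attributes
* k-means is distance-based, so an attribute with a much larger variance dominates the clustering; it is therefore a good idea to rescale the attributes to a common range (here min-max scaling to [0, 1]) before clustering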

In [None]:
# Per-column variance of the attributes in X, i.e. the features fed into the
# clustering algorithm. Displayed to compare the scale of the features before
# deciding whether they need rescaling.
X.var()

In [None]:
# Rescale every feature with scikit-learn's MinMaxScaler so that no single
# attribute dominates the distance computation. The fitted scaler object is
# kept in min_max_scaler and is reused by a later cell.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X)
X_minmax = min_max_scaler.transform(X)

# Elbow search on the scaled features.
fig_size = (12, 8)
file_name = 'kmeans_day_i_i-1_elbow.png'
k_upper = 20
distortions = group_kmeans(k_upper, X_minmax)

# Author's reading of the resulting elbow plot: k=4 is the optimal k.
plot_distortions(k_upper, distortions, fig_size, file_name)

In [None]:
# Cluster the min-max scaled features with the k picked from the elbow plot
# (k=4), then plot how many samples land in each cluster.
file_name = 'kmeans_day_i_i-1_clusters.png'
opt_k = 4
labels = single_kmeans(opt_k, X_minmax)
plot_label_distr(labels, fig_size, file_name)
# Note: the clusters are still somewhat imbalanced, but much better than before.

In [None]:
# Plot the next-day return (return_i+1) against the cluster labels on a
# 2x2 grid of subplots; opt_k is the cluster count set in the clustering cell.
nrows, ncols = 2, 2
fig_size = (12, 12)
file_name = 'kmeans_day_i_i-1_returns.png'

# labels carries no index, so it is paired with the returns purely by position.
# A later cell drops rows from df in place; if that cell has already run, df no
# longer lines up with these labels. Fail loudly instead of plotting mis-paired
# data.
assert len(labels) == len(df), (
    f"labels has {len(labels)} entries but df has {len(df)} rows; "
    "re-run the preprocessing and clustering cells in order"
)
plot_return(labels, np.array(df['return_i+1']), opt_k, nrows, ncols, fig_size, file_name)

# Clustering with data from day i, day i-1, and day i-2

## Add attributes related to day i-2

In [4]:
# Day i-2 attributes: open, high, low, close and volume from two days before.
#
# This step is guarded so the cell is idempotent. df is trimmed in place by
# dropna() below; without the guard, re-running the cell would recompute
# shift(2) on the already-trimmed frame, put NaN into its first two rows again,
# and silently drop two more days of data on every re-run.
lag2_cols = ['open_i-2', 'high_i-2', 'low_i-2', 'close_i-2', 'volume_i-2']
if not set(lag2_cols).issubset(df.columns):
    df['open_i-2'] = df['open'].shift(2)
    df['high_i-2'] = df['high'].shift(2)
    df['low_i-2'] = df['low'].shift(2)
    df['close_i-2'] = df['close'].shift(2)
    df['volume_i-2'] = df['volume'].shift(2)
    # the first two rows have no day i-2
    df.dropna(inplace=True)

# Ratio between day i-1 and day i-2. These are derived from columns that
# already exist, so recomputing them on a re-run is harmless.
df['open_i-1/open_i-2'] = df['open_i-1']/df['open_i-2']
df['high_i-1/high_i-2'] = df['high_i-1']/df['high_i-2']
df['low_i-1/low_i-2'] = df['low_i-1']/df['low_i-2']
df['close_i-1/close_i-2'] = df['close_i-1']/df['close_i-2']
df['volume_i-1/volume_i-2'] = df['volume_i-1']/df['volume_i-2']


# Feature matrix for clustering: the day i / day i-1 ratios plus the new
# day i-1 / day i-2 ratios.
use_cols = ['open/open_i-1', 'high/high_i-1', 'low/low_i-1', 'close/close_i-1',
            'volume/volume_i-1', 'open_i-1/open_i-2', 'high_i-1/high_i-2', 'low_i-1/low_i-2',
            'close_i-1/close_i-2', 'volume_i-1/volume_i-2']
X = df[use_cols]
df.head()

Unnamed: 0,date,open,high,low,close,volume,open_i+1,close_i+1,return_i+1,open_i-1,...,open_i-2,high_i-2,low_i-2,close_i-2,volume_i-2,open_i-1/open_i-2,high_i-1/high_i-2,low_i-1/low_i-2,close_i-1/close_i-2,volume_i-1/volume_i-2
3,1993-02-03 00:00:00-05:00,25.604302,25.856561,25.586284,25.838543,529400,25.928637,25.946655,0.175122,25.496205,...,25.352041,25.514208,25.352041,25.514208,480500.0,1.005686,1.002825,1.003554,1.002119,0.418939
4,1993-02-04 00:00:00-05:00,25.928637,26.000711,25.640341,25.946655,531500,25.928637,25.928637,0.0,25.604302,...,25.496205,25.586298,25.44215,25.568279,201300.0,1.00424,1.010563,1.005665,1.01057,2.629906
5,1993-02-05 00:00:00-05:00,25.928637,25.982692,25.784488,25.928637,492100,25.928637,25.928637,0.0,25.928637,...,25.604302,25.856561,25.586284,25.838543,529400.0,1.012667,1.005575,1.002113,1.004184,1.003967
6,1993-02-08 00:00:00-05:00,25.928637,26.018729,25.8926,25.928637,596100,25.838538,25.748446,-0.878661,25.928637,...,25.928637,26.000711,25.640341,25.946655,531500.0,1.0,0.999307,1.005622,0.999306,0.92587
7,1993-02-09 00:00:00-05:00,25.838538,25.838538,25.69439,25.748446,122100,25.748461,25.784498,0.352694,25.928637,...,25.928637,25.982692,25.784488,25.928637,492100.0,1.0,1.001387,1.004193,1.0,1.211339


In [None]:
# Per-column variance of the attributes in X, displayed to compare the scale
# of the features before clustering.
X.var()

## Perform clustering with min-max scaling and plot return

In [None]:
# Re-fit the existing min_max_scaler on the current feature matrix X and
# rescale it.
X_minmax = min_max_scaler.fit(X).transform(X)

# Elbow search on the scaled features.
fig_size = (12, 8)
file_name = 'kmeans_day_i_i-2_elbow.png'
k_upper = 20
distortions = group_kmeans(k_upper, X_minmax)

# NOTE(review): the original note here read "k=4 is the optimal k with elbow
# method", but the next cell clusters with opt_k = 5 -- confirm which value the
# elbow plot actually supports.
plot_distortions(k_upper, distortions, fig_size, file_name)

In [None]:
# Cluster the min-max scaled features with k=5, then plot how many samples
# land in each cluster.
# NOTE(review): the previous comment said "optimal k=4" while the code used 5;
# the comment now follows the code -- confirm 5 is the intended value.
file_name = 'kmeans_day_i_i-2_clusters.png'
opt_k = 5
labels = single_kmeans(opt_k, X_minmax)
plot_label_distr(labels, fig_size, file_name)
# Note: the clusters are still somewhat imbalanced.

In [None]:
# Plot the next-day return (return_i+1) against the cluster labels on a
# 2x3 grid of subplots; opt_k is the cluster count set in the clustering cell.
nrows, ncols = 2, 3
fig_size = (18, 12)
file_name = 'kmeans_day_i_i-2_returns.png'

# labels carries no index, so it is paired with the returns purely by position.
# df is trimmed in place when the day i-2 attributes are added, so labels left
# over from an earlier clustering run (or from before that cell was re-run) no
# longer line up with df. Fail loudly instead of plotting mis-paired data.
assert len(labels) == len(df), (
    f"labels has {len(labels)} entries but df has {len(df)} rows; "
    "re-run the preprocessing and clustering cells in order"
)
plot_return(labels, np.array(df['return_i+1']), opt_k, nrows, ncols, fig_size, file_name)