In [None]:
from utils.cluster import group_kmeans, single_kmeans
from utils.visualize import (
    plot_distortions,
    plot_label_distr,
    plot_return,
    plot_return_box,
    plot_qq,
    report_return_mean_std,
)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

%load_ext autoreload
%autoreload 2

## Read and extract clustering attributes

In [None]:
# TODO: change arguments if necessary
# Configuration for this notebook: input files and the number of days used
# to build the clustering attributes.
# path of the training data CSV
TRAIN_PATH = 'data/processed/xlv_max_221118_train.csv'
# path of the testing data CSV
TEST_PATH = 'data/processed/xlv_max_221118_test.csv'
# documented values for timespan:
#   timespan of 2 uses data from day i and day i-1
#   timespan of 3 uses data from day i, day i-1, and day i-2
timespan = 3

In [None]:
# Read the train/test CSVs and select the clustering attributes for `timespan`.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

# attributes relating day i to day i-1 (used for every supported timespan)
day_i_cols = ['open/open_i-1', 'high/high_i-1', 'low/low_i-1', 'close/close_i-1',
              'volume/volume_i-1']
# attributes relating day i-1 to day i-2 (added when timespan is 3)
day_i_1_cols = ['open_i-1/open_i-2', 'high_i-1/high_i-2', 'low_i-1/low_i-2',
                'close_i-1/close_i-2', 'volume_i-1/volume_i-2']

# Fail loudly on an unsupported timespan instead of silently falling back to
# the 3-day attribute set.
if timespan == 2:
    use_cols = day_i_cols
elif timespan == 3:
    use_cols = day_i_cols + day_i_1_cols
else:
    raise ValueError(f'timespan must be 2 or 3, got {timespan!r}')

# save clustering attributes to new dataframes (same columns for train and test)
X = train[use_cols]
test_X = test[use_cols]

# preview the selected training attributes
X.head()

## Perform clustering and plot return distribution

In [None]:
# Elbow analysis on the raw (unscaled) attributes.
# settings: upper bound passed to group_kmeans, figure size, output file name
k_upper = 20
fig_size = (12, 8)
file_name = 'kmeans_elbow_day_i_i-2_no_norm.png'

# presumably fits k-means for a range of k up to k_upper and returns the
# distortion per k -- see utils.cluster.group_kmeans to confirm
distortions = group_kmeans(k_upper, X)

# NOTE(review): the original note on this cell said "k=4 is the optimal k with
# elbow method", but the following cell uses opt_k = 2 -- confirm against the plot
plot_distortions(k_upper, distortions, fig_size, file_name)

In [None]:
# Fit one k-means model on the raw (unscaled) attributes and plot label counts.
# NOTE(review): the original comment on this cell said "optimal k=4", but the
# code uses 2 -- confirm which value is intended
opt_k = 2
fig_size = (12, 8)
file_name = 'kmeans_day_i_i-2_no_norm_cluster_distr.png'

model = single_kmeans(opt_k, X)

# plot distribution of labels
plot_label_distr(model.labels_, fig_size, file_name)
# note: the clusters are very imbalanced -> maybe need a way to improve later

## Perform clustering with min-max scaled data and plot return
* variance of volume/volume_i-1 is much larger than the rest
* it could be a good idea to rescale the data (min-max scaling is used below) before clustering

In [None]:
# show the per-attribute variance of the training attributes
print(X.var())

# min-max scale the attributes: fit the scaler on the training attributes,
# then transform them with it (the fitted scaler is reused for the test data)
min_max_scaler = MinMaxScaler().fit(X)
X_minmax = min_max_scaler.transform(X)

In [None]:
# Elbow analysis on the min-max scaled attributes.
# settings: upper bound passed to group_kmeans, figure size, output file name
k_upper = 20
fig_size = (12, 8)
file_name = 'kmeans_day_i_i-2_elbow.png'

# presumably fits k-means for a range of k up to k_upper and returns the
# distortion per k -- see utils.cluster.group_kmeans to confirm
distortions = group_kmeans(k_upper, X_minmax)

# NOTE(review): the original note on this cell said "k=4 is the optimal k with
# elbow method", but the following cell uses opt_k = 5 -- confirm against the plot
plot_distortions(k_upper, distortions, fig_size, file_name)

In [None]:
# Fit one k-means model on the min-max scaled attributes and plot label counts.
# NOTE(review): the original comment on this cell said "optimal k=4", but the
# code uses 5 -- confirm which value is intended
opt_k = 5
fig_size = (12, 8)
file_name = 'kmeans_day_i_i-2_clusters.png'

model = single_kmeans(opt_k, X_minmax)

# plot distribution of labels
plot_label_distr(model.labels_, fig_size, file_name)
# note: the clusters are still somehow imbalanced but much better
# note:the clusters are still somehow imbalanced but much better

In [None]:
# Plot the training returns per cluster.
# values of the 'return_i+1' column for the training rows
train_returns = np.array(train['return_i+1'])

# subplot grid (presumably one panel per cluster -- confirm in plot_return),
# figure size and output file name
nrows, ncols = 2, 3
fig_size = (18, 12)
file_name = 'kmeans_day_i_i-2_returns.png'

plot_return(model.labels_, train_returns, opt_k, nrows, ncols, fig_size, file_name)

## Results analysis on train and test data

In [None]:
# report per-cluster return statistics (mean and std, going by the helper's
# name) for the training data
train_returns = np.array(train['return_i+1'])
report_return_mean_std(opt_k, model.labels_, train_returns)

In [None]:
# Box plot of the training returns, grouped by cluster label.
train_returns = np.array(train['return_i+1'])

fig_size = (12, 8)
file_name = 'kmeans_day_i_i-2_returns_boxplot.png'

plot_return_box(model.labels_, train_returns, opt_k, fig_size, file_name)

In [None]:
# Predict the cluster of each test row and draw the label distribution.
fig_size = (12, 8)
file_name = 'kmeans_day_i_i-2_clusters_test.png'

# scale test_X with the scaler that was fitted on the training attributes
# (transform only, no refit)
test_X_minmax = min_max_scaler.transform(test_X)
# predict clusters of test data with the model fitted on the training data
test_pred = model.predict(test_X_minmax)

# draw distribution
plot_label_distr(test_pred, fig_size, file_name)
# author's observation from the plot:
# very similar distribution of clusters compared to train data
# test data distr mirrors train data distr based on clusters

In [None]:
# QQ plot of train vs. test returns (presumably per cluster -- confirm in plot_qq).
# values of the 'return_i+1' column for the train and test rows
train_returns = np.array(train['return_i+1'])
test_returns = np.array(test['return_i+1'])

# subplot grid, figure size and output file name
nrows, ncols = 2, 3
fig_size = (18, 12)
file_name = 'kmeans_day_i_i-2_qq.png'

plot_qq(model.labels_, test_pred, train_returns, test_returns,
        opt_k, nrows, ncols, fig_size, file_name)

In [None]:
# report mean and std of test returns for each cluster
# test_pred holds one label per row of `test`, so the returns must be taken
# from `test` as well; pairing test_pred with train['return_i+1'] would match
# labels to the wrong rows.
report_return_mean_std(opt_k, test_pred, np.array(test['return_i+1']))