# DEMO 3

K-means applied to Stock Market Data

In [1]:
import sys
import os
# Resolve the directory two levels above the current working directory
# (note: relative to the working directory, not to this file's location).
two_levels_up = os.path.join(os.getcwd(), '../../')
func_lib_path = os.path.abspath(two_levels_up)

# Make that directory importable so func_lib can be found
sys.path.append(func_lib_path)

# Now you can import func_lib
import func_lib
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [None]:
# Build the historical price data through the project helper.
# NOTE(review): the recorded cell output shows an "N of 501 completed" progress
# bar, so this presumably downloads prices for ~501 tickers - confirm in func_lib.
historical_prices = func_lib.createHistPrices()
# An empty list is passed as the second argument of computingReturns.
# NOTE(review): presumably this means "no momentum features requested" - confirm.
list_of_momentums = []
total_returns     = func_lib.computingReturns(historical_prices, list_of_momentums)
# Drop rows containing missing values, modifying total_returns in place.
total_returns.dropna(inplace=True)

[**                     4%%                      ]  19 of 501 completed

In [None]:
# Preview the first rows of the returns table as a quick sanity check.
total_returns.head()

In [None]:
# Per-ticker summary of the 'F_1_d_returns' column: its mean and its
# standard deviation, one row per ticker.
returns_by_ticker = total_returns.groupby('Ticker')['F_1_d_returns']
ticker_stats = returns_by_ticker.agg(['mean', 'std'])
ticker_stats.head()

In [None]:
# Standardize both features (zero mean, unit variance) so that neither one
# dominates the distance computation used by K-Means.
features = ticker_stats[['mean', 'std']]
scaler = StandardScaler()
ticker_stats_scaled = scaler.fit(features).transform(features)

In [None]:
# Candidate cluster counts and the inertia recorded for each of them
k_range = range(1, 11)
sse = []

# Fit one K-Means model per candidate k and keep its inertia
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(ticker_stats_scaled)
    sse.append(kmeans.inertia_)

# Draw the elbow curve: inertia as a function of the number of clusters
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_range, sse, marker='o')
ax.set_title('Elbow Method')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Inertia')
ax.set_xticks(k_range)
ax.grid(True)
plt.show()

In [None]:
# Determine the optimal number of clusters using the elbow method.
# The project helper receives the inertia list and the matching range of k values.
# NOTE(review): presumably it marks the elbow point on a plot - confirm in func_lib.
func_lib.plot_optimal_cluster_point(sse, k_range)

In [None]:
# Number of clusters
k = 3

# Recompute the standardized features from the already-fitted scaler so this
# cell can be re-run safely even after ticker_stats_scaled has been rebound
# to a DataFrame below.
scaled_features = scaler.transform(ticker_stats[['mean', 'std']])

# Perform K-Means clustering. n_init is pinned to 10, matching the elbow
# search, so the result does not depend on the scikit-learn default.
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
clusters = kmeans.fit_predict(scaled_features)

# StandardScaler returns a plain NumPy array, which cannot take a named
# 'Cluster' column. Wrap the scaled values in a DataFrame that shares the
# ticker index of ticker_stats, then attach the cluster labels.
ticker_stats_scaled = pd.DataFrame(
    scaled_features,
    index=ticker_stats.index,
    columns=['mean', 'std'],
)
ticker_stats_scaled['Cluster'] = clusters
ticker_stats_scaled.head()

In [None]:
# Visualize the clusters in the original (unscaled) feature space.
# (You may need to adjust this based on your visualization preferences)
plt.figure(figsize=(10, 6))
for cluster in range(k):
    # Mask rows with the label array returned by fit_predict: it is ordered
    # like ticker_stats, and unlike ticker_stats_scaled (a NumPy array when
    # it comes out of StandardScaler) it always exists in this form.
    in_cluster = clusters == cluster
    members = ticker_stats.loc[in_cluster]
    plt.scatter(members['std'],
                members['mean'],
                label=f'Cluster {cluster}', s=1)
plt.title('K-Means Clustering of Tickers based on Average Return and Standard Deviation')
plt.xlabel('Standard Deviation')
plt.ylabel('Average Return')
plt.legend()
plt.grid(True)
plt.show()