Use K-means clustering to predict returns of futures traded on global markets.

Import Packages

In [1]:
import sys
!{sys.executable} -m pip install scikit-learn
import yfinance as yf
import pandas as pd
import numpy as np
import plotnine

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score




[notice] A new release of pip available: 22.3 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Download Futures Data from yfinance:
USD, Gold, Silver, Copper, Brent Crude Oil, Natural Gas, Corn, Soybean, S&P 500, Nasdaq 100, Russell 2000

Store only closing data in the df we will use

In [2]:
tickers = ['DX=F', 'GC=F', 'SI=F', 'HG=F', 'BZ=F', 'NG=F', 'ZC=F', 'ZS=F', 'ES=F', 'NQ=F']
ticker_to_name = {
    "BZ=F": "Crude",
    "GC=F": "Gold",
    "NG=F": "Nat_Gas",
    "SI=F": "Silver",
    "ZC=F": "Corn",
    "ZS=F": "Soybeans",
    "ES=F": "S&P_500",
    "NQ=F": "Nasdaq_100",
    "DX=F": "USD",
    "HG=F": "Copper"
}

full_data = yf.download(tickers, start = '2000-01-01', end='2024-12-31')
full_data.columns = full_data.columns.set_levels([ticker_to_name.get(t, t) for t in full_data.columns.levels[1]], level=1)

close_data = full_data["Close"]

[*********************100%***********************]  10 of 10 completed


Describe Futures Close Data Gathered

In [3]:
close_data.describe()

Ticker,Crude,USD,S&P_500,Gold,Copper,Nat_Gas,Nasdaq_100,Silver,Corn,Soybeans
count,4332.0,6193.0,6133.0,6105.0,6110.0,6111.0,6133.0,6107.0,6118.0,6110.0
mean,78.62762,92.340932,2128.468195,1160.788845,2.761719,4.428152,5297.366104,17.044722,410.785673,1006.032774
std,24.333126,11.296524,1258.95868,580.298316,1.120774,2.2411,5056.885653,8.499005,160.090006,332.053084
min,19.33,71.304001,676.0,255.100006,0.604,1.482,809.5,4.026,174.75,418.5
25%,60.057501,82.404999,1193.0,631.700012,2.0935,2.776,1693.0,11.4775,310.5625,779.25
50%,76.754997,92.411003,1513.25,1233.900024,2.9905,3.786,2769.75,16.747999,375.25,979.75
75%,100.324999,99.589996,2781.0,1615.199951,3.594,5.597,7215.0,22.682,501.9375,1295.6875
max,146.080002,121.209999,6099.0,2788.5,5.119,15.378,22110.25,48.584,831.25,1771.0


In [4]:
close_data.head()

Ticker,Crude,USD,S&P_500,Gold,Copper,Nat_Gas,Nasdaq_100,Silver,Corn,Soybeans
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2000-06-22,,107.400002,,,,,,,,
2000-06-23,,107.720001,,,,,,,,
2000-06-26,,107.769997,,,,,,,,
2000-06-27,,106.980003,,,,,,,,
2000-06-28,,107.25,,,,,,,,


In [5]:
close_data.tail()

Ticker,Crude,USD,S&P_500,Gold,Copper,Nat_Gas,Nasdaq_100,Silver,Corn,Soybeans
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2024-12-23,72.629997,107.806999,6036.0,2612.300049,4.023,3.656,21753.25,29.888,447.75,969.5
2024-12-24,73.580002,108.023003,6098.0,2620.0,4.0495,3.946,22028.5,29.974001,448.5,975.25
2024-12-26,73.260002,107.892998,6095.25,2638.800049,4.072,3.715,22008.0,30.047001,453.75,988.0
2024-12-27,74.169998,107.792,6027.0,2617.199951,4.0625,3.514,21698.5,29.655001,454.0,980.0
2024-12-30,74.389999,107.932999,5958.75,2606.100098,4.0455,3.936,21416.25,29.106001,452.25,982.0


In [6]:
close_data.shape

(6209, 10)

Lets create our derived variabels: Rolling averages (3 day, 7 day, 10 day, 14 day, 30 day), std deviations

In [None]:
column_names = ["USD", "Crude", "Nat_Gas", "Gold", "Silver", "Copper", "Corn", "Soybeans", "S&P_500", "Nasdaq_100"]
windows = [3, 7, 10, 14, 30]
sd_windows = [30, 90, 180]
roi

# Close block as MultiIndex
base = close_data.reindex(columns=column_names).copy()
base.columns = pd.MultiIndex.from_product([["Close"], column_names])

# SMA blocks as MultiIndex, one per window
sma_frames = {}
for w in windows:
    sma = close_data.reindex(columns=column_names).rolling(window=w).mean()
    sma.columns = pd.MultiIndex.from_product([[f"{w}-day SMA"], column_names])
    sma_frames[w] = sma

# SD blocks, longer windows
sd_frames = {}
for w in sd_windows:
    sd = close_data.rolling(window=w).std()
    sd.columns = pd.MultiIndex.from_product([[f"{w}-day SD"], column_names])
    sd_frames[w] = sd

# Add 1 month, 3 month, and 6 month returns to dataset


# Add volume traded


# Concatenate and round
data = pd.concat([base] + [sma_frames[w] for w in windows] + [sd_frames[w] for w in sd_windows], axis=1).round(3)

data.tail()

Unnamed: 0_level_0,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,...,180-day SD,180-day SD,180-day SD,180-day SD,180-day SD,180-day SD,180-day SD,180-day SD,180-day SD,180-day SD
Unnamed: 0_level_1,USD,Crude,Nat_Gas,Gold,Silver,Copper,Corn,Soybeans,S&P_500,Nasdaq_100,...,USD,Crude,Nat_Gas,Gold,Silver,Copper,Corn,Soybeans,S&P_500,Nasdaq_100
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2024-12-23,107.807,72.63,3.656,2612.3,29.888,4.023,447.75,969.5,6036.0,21753.25,...,5.597,1.922,292.879,145.285,0.251,0.459,1125.752,1.776,25.433,92.215
2024-12-24,108.023,73.58,3.946,2620.0,29.974,4.049,448.5,975.25,6098.0,22028.5,...,5.557,1.943,294.147,145.136,0.252,0.469,1134.898,1.769,25.512,92.161
2024-12-26,107.893,73.26,3.715,2638.8,30.047,4.072,453.75,988.0,6095.25,22008.0,...,5.504,1.962,294.94,144.979,0.252,0.476,1142.225,1.762,25.619,92.087
2024-12-27,107.792,74.17,3.514,2617.2,29.655,4.062,454.0,980.0,6027.0,21698.5,...,5.454,1.98,295.327,144.847,0.253,0.478,1147.896,1.757,25.741,92.074
2024-12-30,107.933,74.39,3.936,2606.1,29.106,4.045,452.25,982.0,5958.75,21416.25,...,5.392,1.995,294.574,144.662,0.254,0.486,1148.68,1.754,25.83,91.95


In [9]:
data.shape

(6209, 90)

In [15]:
data.describe()

Unnamed: 0_level_0,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,...,180-day SD,180-day SD,180-day SD,180-day SD,180-day SD,180-day SD,180-day SD,180-day SD,180-day SD,180-day SD
Unnamed: 0_level_1,USD,Crude,Nat_Gas,Gold,Silver,Copper,Corn,Soybeans,S&P_500,Nasdaq_100,...,USD,Crude,Nat_Gas,Gold,Silver,Copper,Corn,Soybeans,S&P_500,Nasdaq_100
count,6193.0,4332.0,6111.0,6105.0,6107.0,6110.0,6118.0,6110.0,6133.0,6133.0,...,2703.0,4577.0,5078.0,3990.0,4196.0,4398.0,5078.0,3990.0,4076.0,4257.0
mean,92.340932,78.62762,4.428152,1160.788845,17.044722,2.761719,410.785673,1006.032774,2128.468195,5297.366105,...,8.09895,2.144196,111.050934,71.614411,0.255091,0.655108,381.939274,1.995565,47.673265,92.173265
std,11.296524,24.333126,2.2411,580.298317,8.499005,1.120776,160.090006,332.053084,1258.958679,5056.885653,...,4.212237,0.954888,80.24493,30.821596,0.156443,0.557664,393.763017,1.279339,29.756834,52.857945
min,71.304,19.33,1.482,255.1,4.026,0.604,174.75,418.5,676.0,809.5,...,2.993,0.48,19.689,19.598,0.064,0.123,43.143,0.492,9.536,25.273
25%,82.405,60.0575,2.776,631.7,11.4775,2.093,310.5625,779.25,1193.0,1693.0,...,5.1355,1.426,57.0895,49.26,0.152,0.308,113.818,1.07725,23.3005,47.339
50%,92.411,76.755,3.786,1233.9,16.748,2.99,375.25,979.75,1513.25,2769.75,...,6.458,1.977,77.292,66.4575,0.211,0.452,192.708,1.6055,37.8915,85.612
75%,99.59,100.325,5.597,1615.2,22.682,3.594,501.9375,1295.6875,2781.0,7215.0,...,10.0625,2.743,151.55575,82.29725,0.333,0.716,479.632,2.52075,70.83325,120.377
max,121.21,146.08,15.378,2788.5,48.584,5.119,831.25,1771.0,6099.0,22110.25,...,21.672,5.119,358.258,179.053,0.974,2.652,1634.071,7.139,126.718,254.843


Account for seasonality and trend if possible:

K-Means Clustering

Lets set our data to scale: