### Financial Instrument K-Means Clustering

In [None]:
# Import common libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [None]:
# Load the price CSV; the 'Date' column is parsed as dates and used as the index
df = pd.read_csv(
    'QES_1.0_dprices.csv',
    sep=',',
    index_col='Date',
    parse_dates=['Date'],
)
df.head()

In [None]:
# Feature columns and target column.
# 'Date' (used as the frame index when the CSV is loaded) and 'TB3' were
# commented out of the original list and are therefore not features.
features = [
    'NQNL', 'NQAU', 'NQCA', 'NQCN', 'NQCZ', 'NQEG',
    'NQFI', 'NQFR', 'NQDE', 'NQGR', 'NQHK', 'NQIN',
    'NQID', 'NQIL', 'NQJP', 'NQKR', 'NQMY', 'NQPH',
    'NQPL', 'NQPT', 'NQRU', 'NQSG', 'NQES', 'NQSE',
    'NQCH', 'NQTW', 'NQTH', 'NQTR', 'NQGB',
]

# Single-element list holding the target column name
target = ['NQGI']

In [None]:
# Build the prices frame: every row of `df`, restricted to the feature columns
prices_df = pd.DataFrame(df.loc[:, features])
prices_df.head()

In [None]:
# Compute returns and returns volatility

from math import sqrt
from scipy.cluster.vq import kmeans,vq
from sklearn.cluster import KMeans

# Period-over-period percentage change of every price column, computed once
period_returns = prices_df.pct_change()

# One row per instrument: scaled mean return and scaled standard deviation.
# NOTE(review): 252 is presumably the number of trading days per year -- confirm.
returns = pd.DataFrame({
    'Returns': period_returns.mean() * 252,
    'Volatility': period_returns.std() * sqrt(252),
})
 

In [None]:
# Format the data as a 2-column numpy array to feed into the K-Means algorithm:
# one row per instrument, columns are 'Returns' and 'Volatility'.
data = returns[['Returns', 'Volatility']].to_numpy()

X = data

# Fit K-Means for k = 2..19 and record the inertia (within-cluster sum of
# squared distances) of each fit for the elbow curve.
# random_state makes the curve reproducible across runs; n_init is pinned
# because the library default changed between scikit-learn releases.
distorsions = []
for k in range(2, 20):
    k_means = KMeans(n_clusters=k, n_init=10, random_state=42)
    k_means.fit(X)
    distorsions.append(k_means.inertia_)

The “Elbow Curve” shows how the Sum of Squared Errors (SSE) changes with the number of clusters we choose. The “elbow” — the point beyond which adding more clusters yields only a small further reduction in SSE — suggests a reasonable number of clusters to use.

In [None]:
# Plot elbow curve: K-Means inertia against the number of clusters tried.
# The x values must match the k range used when `distorsions` was filled.

fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(range(2, 20), distorsions)
ax.set_xticks(range(2, 20))  # cluster counts are integers; one tick per k
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('Inertia (within-cluster sum of squared errors)')
ax.set_title('Elbow curve')
ax.grid(True)
plt.show()  # ends the cell without echoing a Text(...) repr

In [None]:
# Plot K-Means clustering

from pylab import plot,show

# Compute K-Means with K = 5 on the (Returns, Volatility) points.
# A fixed seed keeps the centroids -- and therefore the cluster labels and
# colours below -- identical from run to run (`seed` needs SciPy >= 1.7).
centroids, _ = kmeans(data, 5, seed=42)
# assign each sample to its nearest centroid
idx, _ = vq(data, centroids)

# one marker style per cluster label, selected with numpy's logical indexing
for label, style in enumerate(['ob', 'oy', 'or', 'og', 'om']):
    members = data[idx == label]
    plot(members[:, 0], members[:, 1], style)
# centroids drawn as green squares on top of the samples
plot(centroids[:, 0], centroids[:, 1], 'sg', markersize=8)
plt.xlabel('Returns')
plt.ylabel('Volatility')
plt.title('K-Means of Global Indices Before Outlier Removal')
show()

In [None]:
# Identify the outlier: for each column of `returns`, the index label at
# which that column reaches its maximum
column_maxima = returns.idxmax()
print(column_maxima)

In [None]:
# Drop the relevant outlier rows from our data.
# The frame is rebound instead of mutated in place, and errors='ignore' makes
# the cell safe to re-run: once the labels are gone, a second run is a no-op
# rather than a KeyError.
outlier_labels = ['NQCZ', 'NQDE']
returns = returns.drop(index=outlier_labels, errors='ignore')

In [None]:
# Recreate the 2-column (Returns, Volatility) array from the current `returns`
# table to feed into the algorithm
data = returns[['Returns', 'Volatility']].to_numpy()
# Compute K-Means with K = 5 (5 clusters). A fixed seed keeps the centroids
# and the resulting cluster labels reproducible (`seed` needs SciPy >= 1.7).
centroids, _ = kmeans(data, 5, seed=42)
# assign each sample to its nearest centroid
idx, _ = vq(data, centroids)

In [None]:
# Plot updated K-Means result: one marker style per cluster label, selected
# with numpy's logical indexing
for label, style in enumerate(['ob', 'oy', 'or', 'og', 'om']):
    members = data[idx == label]
    plot(members[:, 0], members[:, 1], style)
# centroids drawn as green squares on top of the samples
plot(centroids[:, 0], centroids[:, 1], 'sg', markersize=8)
plt.xlabel('Returns')
plt.ylabel('Volatility')
plt.title('K-Means of Global Indices After Outlier Removal')
show()

In [None]:
# Pair every remaining row label of `returns` with its assigned cluster index
# and print one (name, cluster) tuple per line
details = list(zip(returns.index, idx))

for pair in details:
    print(pair)