In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from niftystocks import ns
from sklearn.decomposition import PCA

In [None]:
# Look up the Nifty 50 constituents via the niftystocks helper (presumably
# returned as Yahoo Finance symbols with the ".NS" suffix — verify), then
# download their price history for the 2020-01-01 .. 2021-01-01 window.
get_nifty50 = ns.get_nifty50_with_ns()
# auto_adjust=False keeps the separate 'Adj Close' column that the returns
# cell selects. Recent yfinance releases default to auto_adjust=True, which
# omits that column and would make data['Adj Close'] raise a KeyError.
data = yf.download(get_nifty50, start="2020-01-01", end="2021-01-01", auto_adjust=False)

In [None]:
# Daily simple returns per stock, computed from adjusted closing prices.
# Recent yfinance releases default to auto_adjust=True and then return no
# 'Adj Close' column (the adjustment is applied to 'Close' instead — confirm
# against the installed version), so fall back to 'Close' when 'Adj Close'
# is absent rather than failing with a KeyError.
price_field = 'Adj Close' if 'Adj Close' in data.columns else 'Close'
prices = data[price_field]

# Forward-fill gaps explicitly and switch off pct_change's implicit padding.
# The implicit fill_method='pad' default is deprecated in pandas; this form
# yields the same numbers without depending on that default.
daily_returns = prices.ffill().pct_change(fill_method=None)

# The first row has no previous price and is always NaN, so drop it; then
# drop every stock that still has a missing return so that the per-stock
# statistics computed later are based on complete columns.
returns = daily_returns.iloc[1:].dropna(axis=1)

In [None]:
# Per-stock feature table used for clustering: one row per stock (the columns
# of `returns`), one column per feature listed in `features`.
features = ["Mean Returns", "Volatility", "Sharpe Ratio"]

mean_returns = returns.mean()  # average daily return of each stock
volatility = returns.std()     # standard deviation of each stock's daily returns

cluster_data = pd.DataFrame(
    {
        "Mean Returns": mean_returns,
        "Volatility": volatility,
        # Ratio of mean return to volatility (no risk-free rate is subtracted).
        "Sharpe Ratio": mean_returns / volatility,
    },
    index=returns.columns,
    columns=features,
)


In [None]:
# Fit a StandardScaler on the feature table and keep the scaled values in a
# DataFrame that carries the same row labels and column names as the input.
scaler = StandardScaler()

# Fail early with a descriptive message when there is nothing to scale.
if cluster_data.empty:
    raise ValueError("The cluster DataFrame is empty. Ensure that the data is correctly downloaded and processed.")

standardized_values = scaler.fit_transform(cluster_data)
scaled_returns = pd.DataFrame(
    standardized_values,
    index=cluster_data.index,
    columns=cluster_data.columns,
)


In [None]:
# Partition the scaled stock features into four groups with k-means.
# random_state fixes the centroid initialisation so that a fresh
# "Restart & Run All" reproduces the same labels; n_init=10 pins the number
# of restarts explicitly instead of relying on the library default, which
# differs between scikit-learn versions.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
# fit_predict fits the model and returns one cluster label per input row.
clusters = kmeans.fit_predict(scaled_returns)

In [None]:
# Summary of the clustering result: how many stocks were clustered and how
# many distinct clusters the labels actually contain.
print("Number of stocks:", len(returns.columns))
# len(clusters) is the number of labels (one per stock), not the number of
# clusters, so count the distinct label values instead.
print("Number of clusters:", len(set(clusters)))

# Pair every stock with its cluster label. Building the frame from equal-length
# columns raises on a length mismatch, whereas zip() would silently truncate.
clustered_stocks = pd.DataFrame({'Stock': list(returns.columns), 'Cluster': clusters})
# Reassign instead of sorting in place so re-running the cell is idempotent.
clustered_stocks = clustered_stocks.sort_values('Cluster')
clustered_stocks.head()

In [None]:
# Project the scaled features onto two principal components so the clusters
# can be drawn on a flat scatter plot, coloured by cluster label.
pca = PCA(n_components=2)
pca_components = pca.fit_transform(scaled_returns)

# Transposing the (n_stocks, 2) array yields one row per component.
first_component, second_component = pca_components.T

plt.scatter(first_component, second_component, c=clusters, cmap='viridis')
plt.title('Nifty 50 Clustering based upon Returns')
plt.show()