In [27]:
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
#import all the dependencies

In [28]:
# Read the current S&P 500 constituents from Wikipedia; the constituents
# table is taken to be the first table that pandas parses from the page.
wiki_url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
tables = pd.read_html(wiki_url, header=0)
sp500 = tables[0]
# Swap every literal "." in a symbol for "-" (e.g. BRK.B -> BRK-B) before the
# symbols are handed to yfinance. A plain (non-regex) replace is sufficient.
symbols = sp500["Symbol"].str.replace(".", "-", regex=False).tolist()

In [29]:
# Rank the S&P 500 members by average daily share volume over the most recent
# 60 days of data and keep the TOP_N most heavily traded names.
# No exchange filter is applied: NYSE- and Nasdaq-listed members are both kept.
TOP_N = 100

vol_data = yf.download(symbols, period="60d", group_by="ticker", auto_adjust=True)

# Only tickers that actually appear in the downloaded frame are considered.
downloaded = set(vol_data.columns.get_level_values(0))
avg_vol = {
    sym: vol_data[sym]["Volume"].mean()
    for sym in symbols
    if sym in downloaded
}

# A ticker whose download failed has an all-NaN Volume column and therefore a
# NaN mean. NaN compares False against everything, which makes a plain sort
# order unreliable, so such tickers are dropped before ranking.
top100 = (
    pd.Series(avg_vol, dtype="float64")
    .dropna()
    .nlargest(TOP_N)
    .index
    .tolist()
)
# Report the real number of names kept rather than a hard-coded figure.
print(f"Using top {len(top100)} by volume: {top100}\n")

[*********************100%***********************]  503 of 503 completed


Using top 50 by volume: ['NVDA', 'F', 'TSLA', 'INTC', 'PLTR', 'SMCI', 'AAPL', 'PFE', 'AMZN', 'BAC', 'T', 'WBD', 'AMD', 'AMCR', 'GOOGL', 'AVGO', 'CMCSA', 'CCL', 'MSFT', 'VZ', 'MU', 'CSCO', 'WMT', 'HBAN', 'GOOG', 'HPE', 'UBER', 'NKE', 'KO', 'WFC', 'WBA', 'PCG', 'FCX', 'META', 'SLB', 'C', 'XOM', 'MRK', 'KVUE', 'CSX', 'NCLH', 'AES', 'KEY', 'KDP', 'GM', 'VTRS', 'KMI', 'CMG', 'BMY', 'LUV', 'LRCX', 'OXY', 'NEM', 'HAL', 'SCHW', 'MCHP', 'DAL', 'PYPL', 'NEE', 'ANET', 'USB', 'JPM', 'SBUX', 'KHC', 'CVS', 'ORCL', 'HST', 'ON', 'DIS', 'MO', 'MDLZ', 'TFC', 'MRNA', 'DVN', 'JNJ', 'GILD', 'UAL', 'COP', 'RF', 'CVX', 'APA', 'PARA', 'WDC', 'QCOM', 'EQT', 'BA', 'DOW', 'EXC', 'HPQ', 'APH', 'PG', 'DELL', 'BSX', 'BKR', 'TGT', 'VST', 'TXN', 'AMAT', 'MS', 'PEP']



In [30]:
# One year of daily bars for every ticker selected in `top100`.
# group_by="ticker" makes the result addressable as prices[ticker][field];
# auto_adjust=True asks yfinance for adjusted prices.
price_request = dict(
    tickers=top100,
    period="1y",
    group_by="ticker",
    auto_adjust=True,
    threads=True,
)
prices = yf.download(**price_request)

[*********************100%***********************]  100 of 100 completed


In [31]:
# Daily SPY returns, used as the market series when estimating beta.
# NOTE: despite its name, `spy_prices` holds percentage returns, not price
# levels; the name is kept because later cells reference it.
spy_close = yf.download("SPY", period="1y", auto_adjust=True)["Close"]
if isinstance(spy_close, pd.DataFrame):
    # Depending on the yfinance version, a single-ticker download can come back
    # with MultiIndex columns, in which case ["Close"] is a one-column frame
    # rather than a Series. Collapse it so the result is always a Series.
    spy_close = spy_close.iloc[:, 0]

# rename() guarantees the Series is called "SPY", which is the column name the
# beta calculation looks up after concatenation.
spy_prices = spy_close.pct_change().dropna().rename("SPY")


[*********************100%***********************]  1 of 1 completed


In [32]:
# Build one row of price-based features per ticker, taken from the most recent
# date on which every feature is defined for that ticker.
feat_list = []
skipped = []  # tickers with no row where all features are available
for sym in top100:
    close = prices[sym]["Close"]

    # a) daily simple returns
    df = close.pct_change().to_frame("ret").dropna()

    # b) rolling 30-day volatility (standard deviation of daily returns)
    df["vol30"] = df["ret"].rolling(30).std()

    # c) 30-day momentum: price change over 30 rows, aligned to df by date
    df["mom30"] = close.pct_change(30)

    # d) rolling 60-day beta vs SPY: cov(stock, SPY) / var(SPY), computed on
    #    the dates both series share
    combined = pd.concat([df["ret"], spy_prices], axis=1, join="inner").dropna()
    cov = combined["ret"].rolling(60).cov(combined["SPY"])
    var = combined["SPY"].rolling(60).var()
    df["beta60"] = (cov / var).reindex(df.index)

    # e) grab the most recent fully populated row. A ticker with too little
    #    history (or a failed download) has none; skip it instead of letting
    #    .iloc[-1] raise IndexError and abort the whole loop.
    valid = df.dropna()
    if valid.empty:
        skipped.append(sym)
        continue
    last = valid.iloc[-1].to_dict()
    last["ticker"] = sym
    feat_list.append(last)

if skipped:
    print(f"Skipped (insufficient price history): {skipped}")
if not feat_list:
    raise ValueError("No ticker had enough price history to compute features")

# assemble into DataFrame indexed by ticker
feat_df = pd.DataFrame(feat_list).set_index("ticker")
print("Price‑based features:")
display(feat_df.head())


Price‑based features:


Unnamed: 0_level_0,ret,vol30,mom30,beta60
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NVDA,0.025894,0.049991,-0.034,1.748606
F,0.009823,0.031267,0.026973,0.84222
TSLA,0.023849,0.064963,0.215652,2.159177
INTC,0.032032,0.052082,-0.139399,1.419522
PLTR,0.069535,0.056246,0.422131,1.957583


In [33]:
# Collect valuation fundamentals from yfinance for every ticker that has price
# features. `info_keys` maps our column name to the key looked up in the
# ticker's `.info` dict; a key that is absent yields NaN.
info_keys = {
    "pe": "trailingPE",
    "pb": "priceToBook",
    "div_yield": "dividendYield",
}

funds = []
for sym in feat_df.index:
    info = yf.Ticker(sym).info
    row = {"ticker": sym}
    for col, key in info_keys.items():
        row[col] = info.get(key, np.nan)
    funds.append(row)

fund_df = pd.DataFrame(funds).set_index("ticker")
print("Fundamentals (PE, PB, dividend yield):")
print(fund_df.head(), "\n")


Fundamentals (PE, PB, dividend yield):
                pe         pb  div_yield
ticker                                  
NVDA     38.945576  35.328600       0.03
F         7.041096   0.908769       7.30
TSLA    164.120000  12.388285        NaN
INTC           NaN   0.901657        NaN
PLTR    654.105300  58.101917        NaN 



In [34]:
# Attach the fundamentals to the price features (left join on the ticker
# index), then count how many values each column is missing.
data = feat_df.join(fund_df, how="left")
missing_before = data.isna().sum()
print("Missing values before imputation:")
print(missing_before, "\n")


Missing values before imputation:
ret           0
vol30         0
mom30         0
beta60        0
pe            9
pb            0
div_yield    18
dtype: int64 



In [35]:
from sklearn.impute import SimpleImputer

# Fill each column's missing entries with that column's median, then wrap the
# resulting array back into a DataFrame with the original labels.
imputer = SimpleImputer(strategy="median")
filled = imputer.fit_transform(data)
data_imputed = pd.DataFrame(filled, index=data.index, columns=data.columns)

# Confirm that no gaps remain.
print("Missing values after imputation:")
print(data_imputed.isna().sum(), "\n")


Missing values after imputation:
ret          0
vol30        0
mom30        0
beta60       0
pe           0
pb           0
div_yield    0
dtype: int64 



In [36]:
# Standardise the features and partition the stocks into three K-Means clusters.
# Label columns written by this cell ("cluster") and by the mapping cell
# ("risk_label") are excluded, so re-running this cell never feeds a previous
# run's cluster ids or text labels back in as features.
cluster_features = data_imputed.drop(
    columns=["cluster", "risk_label"], errors="ignore"
)

scaler = StandardScaler()
X = scaler.fit_transform(cluster_features)

kmeans = KMeans(n_clusters=3, random_state=42)
data_imputed["cluster"] = kmeans.fit_predict(X)

In [37]:
# Translate cluster ids into risk labels: the centroids are converted back to
# original feature units and ranked by 30-day volatility (lowest -> "Low").
# Feature names are chosen by excluding the label columns rather than by
# position, so the cell still works when re-run after "risk_label" has been
# appended to data_imputed.
feature_names = data_imputed.columns.drop(["cluster", "risk_label"], errors="ignore")
centroids = pd.DataFrame(
    scaler.inverse_transform(kmeans.cluster_centers_),
    columns=feature_names,
)
order = centroids["vol30"].sort_values().index.tolist()
risk_map = dict(zip(order, ["Low", "Medium", "High"]))
data_imputed["risk_label"] = data_imputed["cluster"].map(risk_map)

In [39]:
from sklearn.preprocessing import RobustScaler
from sklearn.cluster import KMeans
import numpy as np

# Second clustering pass: three price-based features only, scaled with
# RobustScaler. Results are written onto `data` as new columns.

# 1) log1p-transformed volatility.
# NOTE(review): for volatilities of the size shown in the feature table
# (roughly 0.03-0.07) log1p(x) is very close to x, so this may compress
# extreme values far less than intended - confirm the transform is the one wanted.
data["vol30_log"] = np.log1p(data["vol30"])

# 2) Columns that enter the clustering
feat_cols = ["vol30_log", "mom30", "beta60"]

# 3) Robust scaling of those columns
rs = RobustScaler()
X_rs = rs.fit_transform(data[feat_cols])

# 4) Three-cluster K-Means on the scaled matrix
kmeans2 = KMeans(n_clusters=3, random_state=42)
kmeans2.fit(X_rs)
data["cluster2"] = kmeans2.labels_

# 5) Centroids expressed in the original (unscaled) feature units
centroids_unscaled = rs.inverse_transform(kmeans2.cluster_centers_)
centroids = pd.DataFrame(centroids_unscaled, columns=feat_cols)

# 6) Rank clusters by centroid vol30_log, lowest first, and label them
#    Low / Medium / High in that order
order = centroids.sort_values("vol30_log").index.tolist()
risk_map = dict(zip(order, ["Low", "Medium", "High"]))
data["risk_label2"] = data["cluster2"].map(risk_map)

# 7) How many stocks landed in each risk bucket
print(data["risk_label2"].value_counts())


risk_label2
Low       52
Medium    28
High      20
Name: count, dtype: int64


In [40]:
# Write the robust-scaled clustering inputs and the resulting assignments to
# CSV; the ticker index is written as the first column.
out_path = "stock_risk_kmeans_robust.csv"
export_cols = [
    "vol30", "mom30", "beta60", "vol30_log",
    "cluster2", "risk_label2",
]
data.loc[:, export_cols].to_csv(out_path)
print(f" Saved robust‑scaled K‑Means results to '{out_path}'")


 Saved robust‑scaled K‑Means results to 'stock_risk_kmeans_robust.csv'
