In [1]:
# Mount Google Drive into the Colab filesystem so files under MyDrive can be read by path.
from google.colab import drive
drive.mount('/content/drive')
# Location of the Excel dataset on the mounted drive (not read by any cell visible in this notebook).
xlsx_path = "/content/drive/MyDrive/Dataset.xlsx"

Mounted at /content/drive


In [2]:
import pandas as pd
from google.colab import drive
# Mounting again when Drive is already mounted only prints a notice (see the recorded output below).
drive.mount('/content/drive')
# Price panel; later cells slice its columns on a "metric" level (e.g. "PX_LAST").
# NOTE(review): the recorded run of this cell ended in FileNotFoundError for this path, so
# px_all and everything derived from it is undefined until the file exists at this location.
px_all = pd.read_parquet("/content/drive/MyDrive/price_metrics.parquet")
px_all.head()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/price_metrics.parquet'

In [None]:
# Pull the PX_LAST slice out of the MultiIndex columns (level "metric")
px = px_all.xs(key="PX_LAST", level="metric", axis=1)
# Month-end snapshot: keep the last observation of every calendar month
px_m = px.resample(rule="M").last()
px_m

In [None]:
# numpy is needed here, but the notebook's other `import numpy as np` lines only appear in
# later cells; importing it in this cell keeps a fresh-kernel "Run All" from raising NameError.
import numpy as np

# Compute 6-1 momentum: log price one month ago minus log price seven months ago,
# i.e. the six-month log return that skips the most recent month.
mom_6_1 = np.log(px_m.shift(1)) - np.log(px_m.shift(7))

# Winsorize
def winsorize(row, lower=0.01, upper=0.99):
    """Clip one cross-section (a Series) to its own quantile band.

    Parameters
    ----------
    row : pandas.Series
        Values for a single date.
    lower, upper : float
        Quantile levels used as the clipping floor and ceiling.

    Returns
    -------
    pandas.Series
        ``row`` clipped to ``[q_lower, q_upper]``; a row with no valid
        observation is handed back untouched.
    """
    has_data = row.notna().any()
    if not has_data:
        return row
    lo_cut, hi_cut = row.quantile([lower, upper]).to_numpy()
    return row.clip(lo_cut, hi_cut)

# axis=1 hands winsorize one row (one date's cross-section) at a time.
mom6_w = mom_6_1.apply(winsorize, axis=1)


In [None]:
# Z-score: demean and scale every row by its own cross-sectional mean and std
mom6_z = (
    mom6_w
    .sub(mom6_w.mean(axis=1), axis=0)
    .div(mom6_w.std(axis=1), axis=0)
)


# Baseline portfolio
def build_positions(signal, long_q=0.8, short_q=0.2):
    """Convert a signal panel into +1 / 0 / -1 long-short positions.

    Each row is ranked on its own (percentile rank across columns). Names
    whose percentile rank is at or above ``long_q`` get +1, names at or below
    ``short_q`` get -1, everything else (including missing signals) gets 0.

    Parameters
    ----------
    signal : pandas.DataFrame
        One row per rebalance date, one column per name.
    long_q, short_q : float
        Percentile-rank thresholds for the long and short legs.

    Returns
    -------
    pandas.DataFrame
        Integer positions with the same index and columns as ``signal``.
    """
    # Row-wise percentile rank in one call; NaN signals keep a NaN rank, which
    # fails both comparisons below and therefore ends up with a 0 position.
    pct_rank = signal.rank(axis=1, pct=True)
    longs = (pct_rank >= long_q).astype(int)
    shorts = (pct_rank <= short_q).astype(int)
    # Numeric result (the row-by-row fill into an empty frame produced object dtype).
    return longs - shorts

pos6_m  = build_positions(mom6_z)


# Compute daily return
daily_ret = px.pct_change()

# In-sample window
px_bt = px.loc["2010-01-01":"2020-12-31"]
daily_ret_bt = daily_ret.loc["2010-01-01":"2020-12-31"]

# Map the monthly positions onto the daily return dates.
# reindex(..., method="ffill") gives every day the most recent month-end position at or
# before it. A plain reindex followed by ffill() would first drop every month-end label
# that is missing from the daily index and then carry the previous month's book forward.
# astype(float) keeps the arithmetic below numeric whatever dtype build_positions returned.
pos6_d = pos6_m.astype(float).reindex(daily_ret_bt.index, method="ffill")


# Compute strategy returns
# shift(1): the position known on one row is applied to the next row's return.
strategy6 = (pos6_d.shift(1) * daily_ret_bt).mean(axis=1).fillna(0)


# Cum returns
cum6 = (1 + strategy6).cumprod()


# Plot
import matplotlib.pyplot as plt

plt.figure(figsize=(12,6))
plt.plot(cum6, label="6-1 Momentum")
# Title previously carried a mis-encoded en dash ("â€“").
plt.title("Baseline Momentum Backtest (2010–2020)")
plt.grid(True)
plt.legend()
plt.show()


In [None]:
# Load the factor panel; the next cells filter its columns on a "metric" level.
# NOTE(review): presumably already standardised, going by the file name — confirm.
factors_std = pd.read_parquet("/content/drive/MyDrive/factors_std.parquet")

In [None]:
# Metric names retained as clustering features, grouped by theme.
keep_metrics = (
    # VALUE
    [
        "PE_RATIO",
        "PX_TO_BOOK_RATIO",
        "PX_TO_SALES_RATIO",
        "CURRENT_EV_TO_T12M_EBITDA",
        "FREE_CASH_FLOW_YIELD",
        "EQY_DVD_YLD_12M",
    ]
    # QUALITY
    + [
        "EBITDA_MARGIN",
        "GROSS_MARGIN",
        "OPER_MARGIN",
        "PROF_MARGIN",
        "RETURN_ON_ASSET",
    ]
    # LEVERAGE
    + ["TOT_DEBT_TO_EBITDA", "TOT_DEBT_TO_TOT_EQY"]
    # SIZE
    + ["CURRENT_MARKET_CAP_SHARE_CLASS"]
    # RISK
    + [
        "BETA_ADJ_OVERRIDABLE",
        "VOLATILITY_30D",
        "VOLATILITY_90D",
        "VOLATILITY_180D",
        "VOLATILITY_360D",
    ]
    # TAIL RISK
    + [
        "RET_SKEW_30D",
        "RET_SKEW_90D",
        "RET_SKEW_180D",
        "RET_SKEW_360D",
        "RET_KURT_30D",
        "RET_KURT_180D",
        "RET_KURT_360D",
        "RET_KURT_90D",
    ]
    # LIQUIDITY
    + ["TURNOVER"]
    # Trailing 30-day return (listed without a category in the original)
    + ["RET_30D"]
)


In [None]:
# Keep only the columns whose "metric" level is one of the selected clustering features
factors_kmeans = factors_std.loc[
    :,
    factors_std.columns.get_level_values("metric").isin(keep_metrics),
]

In [None]:
# Forward-fill down the rows so every column carries its last observed value across gaps.
factors_kmeans = factors_kmeans.ffill()

In [None]:
# Preview the first rows with the notebook's rich table display instead of a plain-text print
factors_kmeans.head()

In [None]:
# Collapse K-means features: median across stocks per date.
# The frame is transposed so the grouping runs over rows; DataFrame.groupby(axis=1)
# is deprecated in pandas, and this form gives the same per-date, per-metric medians.
X_kmeans = factors_kmeans.T.groupby(level="metric").median().T
# Collapse cross-section by taking median across stocks
mom6_feat  = mom6_z.median(axis=1)

X_mom = pd.DataFrame({
    "MOM_6_1": mom6_feat,
})
# Combine kmeans factors + momentum (rows are aligned on the date index)
X_all = pd.concat([X_kmeans, X_mom], axis=1)

corr_all = X_all.corr()
corr_all  # last row/column holds the MOM_6_1 correlations


In [None]:
import numpy as np
import pandas as pd

# Magnitude of every pairwise correlation
abs_corr = corr_all.abs()

# Pairs at or above this level are reported
thr = 0.7

# Strict upper triangle: each pair appears once and the diagonal is excluded
mask = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)

# Long-format table of the pairs kept by the mask
high_corr_pairs = abs_corr.where(mask).stack().reset_index()
high_corr_pairs.columns = ["Feature1", "Feature2", "Correlation"]

# Apply the threshold, then list the strongest relationships first
high_corr_pairs = high_corr_pairs.loc[
    high_corr_pairs["Correlation"] >= thr
].sort_values("Correlation", ascending=False)

high_corr_pairs


In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def choose_k_silhouette(X, k_min=5, k_max=20):
    """Pick the number of K-means clusters with the highest silhouette score.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix to cluster.
    k_min, k_max : int
        Inclusive range of candidate cluster counts.

    Returns
    -------
    int
        The candidate ``k`` with the best silhouette score. If no candidate is
        admissible for this sample size, ``k_min`` is returned unchanged.
    """
    n_samples = X.shape[0] if hasattr(X, "shape") else len(X)

    # silhouette_score needs between 2 and n_samples - 1 distinct labels, and KMeans
    # cannot fit more clusters than samples; clamp the scan instead of raising mid-loop.
    k_lo = max(2, k_min)
    k_hi = min(k_max, n_samples - 1)

    best_k = k_min
    best_score = -1

    for k in range(k_lo, k_hi + 1):
        # Fixed seed so repeated runs select the same k.
        km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
        score = silhouette_score(X, km.labels_)
        if score > best_score:
            best_score = score
            best_k = k

    return best_k


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans


# Feature smoothing (EWMA)

lambda_ = 0.94
alpha = 1 - lambda_

# ewm() works column by column, so a single call smooths every (metric, stock) series;
# no per-metric loop is needed. The result goes into a new frame rather than being written
# back into factors_kmeans, so re-running this cell does not smooth already-smoothed data.
factors_kmeans_smooth = factors_kmeans.ewm(
    alpha=alpha, adjust=False, min_periods=1
).mean()

# Monthly snapshot: last smoothed value of each calendar month
factors_kmeans_m = factors_kmeans_smooth.resample("M").last()


# Momentum signal (6-1)

def winsorize_cs(row, lower=0.01, upper=0.99):
    """Cross-sectional winsorisation of a single row.

    Values below the ``lower`` quantile or above the ``upper`` quantile of
    ``row`` are pulled back to those quantiles. A row consisting only of NaNs
    is returned as-is.
    """
    if row.isna().all():
        return row
    cutoffs = row.quantile([lower, upper])
    floor, cap = cutoffs.iloc[0], cutoffs.iloc[1]
    return row.clip(lower=floor, upper=cap)

# Row-wise winsorisation of the raw 6-1 momentum; reassigns mom6_w from the earlier cell.
mom6_w = mom_6_1.apply(winsorize_cs, axis=1)

def zscore_cs(X):
    """Standardise every row of ``X`` by that row's own mean and std.

    Rows whose standard deviation is exactly zero are divided by NaN instead,
    so they come back as all-NaN rather than triggering a division by zero.
    """
    row_mean = X.mean(axis=1)
    row_std = X.std(axis=1).replace(0, np.nan)
    centred = X.sub(row_mean, axis=0)
    return centred.div(row_std, axis=0)

# Cross-sectional z-score of the winsorised signal; reassigns mom6_z from the earlier cell.
mom6_z = zscore_cs(mom6_w)

# Forward monthly return
# Simple (arithmetic) return from this month-end to the next. Downstream code averages
# these across names and compounds them with (1 + r).cumprod(); both operations are only
# valid for simple returns, not for log returns.
fwd_ret_m = px_m.shift(-1) / px_m - 1



# Long-short return function
def long_short_return(signal_cs, fwd_ret_cs, long_frac=0.2, short_frac=0.2):
    """Equal-weighted long-minus-short forward return for one cross-section.

    Parameters
    ----------
    signal_cs : pandas.Series
        Signal per name.
    fwd_ret_cs : pandas.Series
        Forward return per name.
    long_frac, short_frac : float
        Share of the usable names placed in the long / short leg (at least
        one name each).

    Returns
    -------
    float
        Mean forward return of the highest-signal names minus that of the
        lowest-signal names, or NaN when fewer than 10 names have both a
        signal and a forward return.
    """
    clean_sig = signal_cs.dropna()
    shared = clean_sig.index.intersection(fwd_ret_cs.dropna().index)
    if len(shared) < 10:
        return np.nan

    # Names ordered from weakest to strongest signal
    ordered = clean_sig.loc[shared].sort_values().index
    realised = fwd_ret_cs.loc[shared]

    count = len(ordered)
    top = ordered[-max(1, int(count * long_frac)):]
    bottom = ordered[:max(1, int(count * short_frac))]

    return realised.loc[top].mean() - realised.loc[bottom].mean()


# Rolling K-means backtest
K = 30  # number of clusters

# Valid monthly dates: present in the signal (with at least one value), the monthly
# feature snapshot and the forward-return panel
rebalance_dates = (
    mom6_z.dropna(how="all").index
    .intersection(factors_kmeans_m.index)
    .intersection(fwd_ret_m.index)
)

# Restrict to the evaluation window
rebalance_dates = rebalance_dates[
    (rebalance_dates >= "2011-01-31") & (rebalance_dates <= "2020-12-31")
]

# Accumulators filled by the loops below
baseline_returns, cluster_returns, cluster_dates = [], [], []

# Baseline
# full-df baseline dates (no intersection with cluster dates); the final rebalance date
# is left out
baseline_dates = rebalance_dates[:-1]
baseline_returns = [
    long_short_return(mom6_z.loc[d], fwd_ret_m.loc[d]) for d in baseline_dates
]

# Clustered strategy

for t, t_next in zip(rebalance_dates[:-1], rebalance_dates[1:]):
    # Month-end feature snapshot, reshaped so each metric becomes a column
    X_t = factors_kmeans_m.loc[t].unstack("metric")

    # Keep stocks with >=70% non-NaN features; the gaps that remain are set to 0
    min_valid = int(0.7 * X_t.shape[1])
    X_t = X_t.dropna(axis=0, thresh=min_valid).fillna(0)
    if X_t.shape[0] < K + 5:
        continue

    # Fit K-means (fixed seed) and label every surviving stock
    km = KMeans(n_clusters=K, n_init=50, random_state=0)
    clusters_t = pd.Series(km.fit_predict(X_t.values), index=X_t.index, name="cluster")

    sig6_t = mom6_z.loc[t]
    fwd_t = fwd_ret_m.loc[t]

    # Align to available universe: clustered names that also have a signal and a forward return
    universe = (
        X_t.index
        .intersection(sig6_t.dropna().index)
        .intersection(fwd_t.dropna().index)
    )
    if len(universe) < K * 5:
        continue
    clusters_t = clusters_t.loc[universe]

    # Drop clusters with <5 names
    sizes = clusters_t.value_counts()
    clusters_t = clusters_t[clusters_t.isin(sizes[sizes >= 5].index)]
    sig6_t = sig6_t.loc[clusters_t.index]
    fwd_t = fwd_t.loc[clusters_t.index]

    # Equal-weight long/short across clusters; clusters with fewer than 10 names are skipped
    cluster_ls = []
    for cid in clusters_t.unique():
        members = clusters_t.index[clusters_t == cid]
        if len(members) >= 10:
            ret_c = long_short_return(sig6_t.loc[members], fwd_t.loc[members])
            if not np.isnan(ret_c):
                cluster_ls.append(ret_c)

    # Record the month only when enough clusters produced a return
    if len(cluster_ls) >= max(3, int(0.3 * K)):
        cluster_returns.append(np.mean(cluster_ls))
        cluster_dates.append(t_next)


# Build results dataframe

# Both strategies earn the forward return of formation date t over (t, t_next]. The cluster
# loop labels its returns with t_next, so the baseline list (one entry per formation date in
# rebalance_dates[:-1]) is labelled with the matching next date as well. Indexing the frame
# by formation date instead would shift the cluster column one month against the baseline
# and drop its last observation.
realized_dates = rebalance_dates[1:]

results = pd.DataFrame(index=realized_dates)
results["Baseline_6_1"] = baseline_returns
# dtype=float keeps the column numeric even if no month produced a cluster return.
results["Cluster_6_1"] = pd.Series(cluster_returns, index=cluster_dates, dtype=float)

# Growth of 1 unit; months without a cluster return stay NaN in that column.
cum = (1 + results).cumprod()

# Cumulative-return comparison of the two strategies
fig, ax = plt.subplots(figsize=(10, 6))
for column, label in (
    ("Baseline_6_1", "Baseline 6-1"),
    ("Cluster_6_1", "Clustered 6-1"),
):
    ax.plot(cum.index, cum[column], label=label)
ax.legend()
ax.set_title("Cumulative Returns: Baseline vs Clustered Momentum (Dollar-Neutral)")
ax.grid(True)
plt.show()
