In [72]:
import os
from dotenv import load_dotenv
from binance.client import Client

# Load the keys from the local .env file into the process environment
load_dotenv()

api_key = os.getenv("BINANCE_API_KEY")
api_secret = os.getenv("BINANCE_SECRET")

# Safety check: warn about every missing credential (never print the values
# themselves). This only warns and still builds the client, exactly as before.
if not api_key:
    print("CRITICAL ERROR: API Key not found!")
if not api_secret:
    print("CRITICAL ERROR: API Secret not found!")

client = Client(api_key, api_secret)

In [73]:
import requests
import pandas as pd

url = "https://api.coingecko.com/api/v3/coins/markets"

params = {
    "vs_currency": "usd",
    "order": "market_cap_desc",
    "per_page": 50,   # pull more so filtering still leaves ~20
    "page": 1,
    "sparkline": False
}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (e.g. rate limiting) right here
# instead of as a confusing KeyError when an error payload is turned into a
# DataFrame below.
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()
data = response.json()

# .copy() yields an independent frame so the column assignments below cannot
# trigger pandas' SettingWithCopyWarning.
df = pd.DataFrame(data)[
    ["id", "market_cap_rank", "symbol", "name", "market_cap", "current_price"]
].copy()

# -----------------------------
# Filters
# -----------------------------

stablecoins = {
    "usdt", "usdc", "busd", "dai", "tusd",
    "usdp", "fdusd", "gusd", "lusd",
    "ust", "ustc", "usdd", "eurc", "usds", "usde", "usd1", "bsc-usd"
}

# normalize text
df["symbol"] = df["symbol"].str.lower()
df["name"] = df["name"].str.lower()

# Symbols present in the raw listing; used to recognise "w" + <listed symbol>
# tickers (e.g. "wbtc" when "btc" is listed) as wrappers.
listed_symbols = set(df["symbol"].dropna())

# remove stablecoins
df = df[~df["symbol"].isin(stablecoins)]

# remove wrapped coins
# A coin counts as wrapped when its name says so, or when its symbol is "w"
# followed by another listed symbol. Dropping *every* symbol that merely
# starts with "w" would also discard unrelated assets.
name_says_wrapped = df["name"].str.contains("wrapped", na=False)
wraps_listed_coin = (
    df["symbol"].str.startswith("w", na=False)
    & df["symbol"].str[1:].isin(listed_symbols)
)
df = df[~(name_says_wrapped | wraps_listed_coin)]

# take top 20 after filtering
df = df.sort_values("market_cap", ascending=False).head(20)

print(df)

                  id  market_cap_rank      symbol          name  \
0            bitcoin                1         btc       bitcoin   
1           ethereum                2         eth      ethereum   
3             ripple                4         xrp           xrp   
4        binancecoin                5         bnb           bnb   
6             solana                7         sol        solana   
7               tron                8         trx          tron   
8       figure-heloc                9  figr_heloc  figure heloc   
9           dogecoin               10        doge      dogecoin   
11      bitcoin-cash               12         bch  bitcoin cash   
13           cardano               14         ada       cardano   
14         leo-token               15         leo     leo token   
15       hyperliquid               16        hype   hyperliquid   
17            monero               18         xmr        monero   
18    canton-network               19          cc        canto

In [74]:
# Binance symbols are uppercase; quote everything against USDT.
# .assign() returns a new frame, so no column is written into a slice of the
# CoinGecko frame (avoids pandas' SettingWithCopyWarning).
df = df.assign(binance_symbol=df["symbol"].str.upper() + "USDT")

# Check which ones actually trade on Binance.
# Only pairs whose status is "TRADING" are kept; a pair that is merely listed
# in the exchange info but not trading would otherwise pass this filter.
exchange_info = client.get_exchange_info()
binance_pairs = {
    s["symbol"]
    for s in exchange_info["symbols"]
    if s.get("status") == "TRADING"
}

df = df[df["binance_symbol"].isin(binance_pairs)]

print(df[["symbol", "binance_symbol"]])

   symbol binance_symbol
0     btc        BTCUSDT
1     eth        ETHUSDT
3     xrp        XRPUSDT
4     bnb        BNBUSDT
6     sol        SOLUSDT
7     trx        TRXUSDT
9    doge       DOGEUSDT
11    bch        BCHUSDT
13    ada        ADAUSDT
17    xmr        XMRUSDT
19   link       LINKUSDT
21    xlm        XLMUSDT
23    ltc        LTCUSDT
24    zec        ZECUSDT
26   hbar       HBARUSDT


In [75]:
import numpy as np
from statsmodels.tsa.stattools import coint

##Fetch historical klines from Binance

def get_binance_klines(symbol, interval=Client.KLINE_INTERVAL_1DAY, limit=365):
    """Download recent candles for one Binance pair and return its closes.

    Parameters
    ----------
    symbol
        Binance pair name, e.g. "ETHUSDT".
    interval
        Candle interval, passed straight through to ``client.get_klines``.
        Defaults to ``Client.KLINE_INTERVAL_1DAY``.
    limit
        Number of candles requested from the API.

    Returns
    -------
    pandas.DataFrame
        A single float ``close`` column indexed by ``open_time`` (parsed from
        millisecond timestamps).

    NOTE(review): the most recent row looks like it can be the still-forming
    candle of the current period -- confirm before treating it as a settled
    close.
    """
    kline_columns = [
        "open_time", "open", "high", "low", "close", "volume",
        "close_time", "qav", "num_trades", "taker_base_vol",
        "taker_quote_vol", "ignore",
    ]

    raw_rows = client.get_klines(symbol=symbol, interval=interval, limit=limit)
    candles = pd.DataFrame(raw_rows, columns=kline_columns)

    # The API reports open_time as epoch milliseconds.
    candles["open_time"] = pd.to_datetime(candles["open_time"], unit="ms")

    closes = candles.set_index("open_time")[["close"]]
    return closes.astype({"close": float})

In [76]:
# Sanity check: show the ten most recent ETH/USDT candles returned by the helper.
print(get_binance_klines(symbol="ETHUSDT", limit=10))

              close
open_time          
2026-02-02  2347.02
2026-02-03  2233.72
2026-02-04  2148.26
2026-02-05  1826.83
2026-02-06  2063.38
2026-02-07  2087.08
2026-02-08  2089.74
2026-02-09  2105.02
2026-02-10  2022.67
2026-02-11  1949.05


In [77]:
## Set BTC price series (benchmark)

LOOKBACK=30
min_overlap = 10  # minimum number of shared bars required to run the test

btc = get_binance_klines("BTCUSDT", limit=LOOKBACK)

##Run Engle-Granger test vs BTC

results = []

for symbol in df["binance_symbol"]:
    # BTC is the benchmark itself; skip it before spending an API call on it.
    if symbol == "BTCUSDT":
        continue

    try:
        alt = get_binance_klines(symbol, limit=LOOKBACK)

        # Inner join keeps only the timestamps present in both series.
        merged = btc.join(alt, how="inner", lsuffix="_btc", rsuffix="_alt")

        if len(merged) < min_overlap:
            # Report the drop so a missing coin in the table is explainable.
            print(f"Skipping {symbol}: only {len(merged)} bars overlap with BTC")
            continue

        btc_log = np.log(merged["close_btc"])
        alt_log = np.log(merged["close_alt"])

        test_stat, p_value, crit_vals = coint(btc_log, alt_log)

        results.append({
            "coin": symbol.replace("USDT", ""),
            "binance_pair": symbol,
            "p_value": p_value,
            "cointegrated_5pct": p_value < 0.05
        })

    except Exception as e:
        # Best-effort: one failing pair must not abort the whole scan.
        print(f"Skipping {symbol}: {e}")

# Explicit columns keep sort_values("p_value") from raising KeyError when
# every pair was skipped and `results` is empty.
results_df = (
    pd.DataFrame(
        results,
        columns=["coin", "binance_pair", "p_value", "cointegrated_5pct"],
    )
    .sort_values("p_value")
    .reset_index(drop=True)
)

print(results_df)

    coin binance_pair   p_value  cointegrated_5pct
0    SOL      SOLUSDT  0.000007               True
1    XLM      XLMUSDT  0.003547               True
2    ETH      ETHUSDT  0.021716               True
3    ZEC      ZECUSDT  0.036409               True
4    TRX      TRXUSDT  0.105604              False
5    XRP      XRPUSDT  0.288696              False
6    BCH      BCHUSDT  0.341787              False
7   DOGE     DOGEUSDT  0.385476              False
8    ADA      ADAUSDT  0.423575              False
9   LINK     LINKUSDT  0.618650              False
10   BNB      BNBUSDT  0.622503              False
11  HBAR     HBARUSDT  0.869936              False
12   LTC      LTCUSDT  0.913645              False


In [78]:
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller

# -----------------------------
# Build price matrix
# -----------------------------

# 1. Setup
LOOKBACK = 180
target_symbol = "BTC"  # We will standardize on this name for the target column

price_data = {}

# 2. Fetch BTC (the benchmark).
# Without BTC there is no regression target, so stop right away instead of
# downloading every altcoin first and failing later on the missing column.
try:
    btc_df = get_binance_klines("BTCUSDT", limit=LOOKBACK)
except Exception as e:
    print(f"CRITICAL: Could not fetch BTC. {e}")
    raise ValueError(
        "CRITICAL ERROR: BTC column is missing! Check your connection or symbol name."
    ) from e
price_data[target_symbol] = btc_df["close"]

# 3. Fetch Alts
print("Fetching altcoin data...")
for symbol in df["binance_symbol"]:
    if symbol == "BTCUSDT":
        continue

    # Clean the coin name (e.g., "ETHUSDT" -> "ETH")
    coin_name = symbol.replace("USDT", "")

    try:
        alt_df = get_binance_klines(symbol, limit=LOOKBACK)
    except Exception as e:
        # Best-effort: a single failing alt should not abort the matrix build.
        print(f"Skipping {symbol}: {e}")
        continue

    # Store using the short name (e.g. "ETH")
    price_data[coin_name] = alt_df["close"]

# 4. Create DataFrame
# pd.concat with axis=1 aligns all series by their Date index automatically
prices = pd.concat(price_data, axis=1)

# Drop rows where BTC is missing (since we need 'y' to exist)
prices = prices[prices[target_symbol].notna()]

# 5. Smart Filtering (Protect BTC)
# BTC is always kept; an alt is kept only if it has at least `min_obs`
# observations on the dates where BTC exists.
min_obs = 100
valid_coins = [
    col
    for col in prices.columns
    if col == target_symbol or prices[col].notna().sum() >= min_obs
]

prices = prices[valid_coins]

# Fill forward small gaps, then drop remaining NaNs (leading gaps cannot be
# forward-filled, so those rows are removed).
prices = prices.ffill().dropna()

print("-" * 30)
print(f"Final Matrix Shape: {prices.shape}")
print(f"Coins included: {prices.columns.tolist()}")
print("-" * 30)

# 6. Lasso Regression
# Regress log BTC on the standardized log prices of the alts; the L1 penalty
# shrinks the coefficients of uninformative coins to exactly zero.
log_prices = np.log(prices)

y = log_prices[target_symbol]
X = log_prices.drop(columns=[target_symbol])

# Standardize so the penalty treats every coin on the same scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# The explicit n_alphas=100 was dropped: 100 is the default grid size and the
# keyword is deprecated in recent scikit-learn releases, so the search is the
# same without the deprecation warning.
# NOTE(review): an integer `cv` splits the rows into contiguous folds without
# regard to time order -- consider a time-series splitter for this data.
lasso = LassoCV(
    cv=5,
    max_iter=10_000,
    random_state=42
)

lasso.fit(X_scaled, y)

# 7. Output Results
coefs = pd.Series(lasso.coef_, index=X.columns)
print("\n--- Significant Co-movers (Lasso Coefficients) ---")
print(coefs[coefs != 0].sort_values(ascending=False))

Fetching altcoin data...
------------------------------
Final Matrix Shape: (180, 14)
Coins included: ['BTC', 'ETH', 'XRP', 'BNB', 'SOL', 'TRX', 'DOGE', 'BCH', 'ADA', 'LINK', 'XLM', 'LTC', 'ZEC', 'HBAR']
------------------------------

--- Significant Co-movers (Lasso Coefficients) ---
ETH     0.063309
BNB     0.032161
ADA     0.030184
XRP     0.018592
HBAR    0.005791
TRX     0.001931
BCH    -0.000791
dtype: float64


In [79]:
lasso_coefs = pd.Series(lasso.coef_, index=X.columns)

# Coins Lasso actually kept (non-zero coefficient).
nonzero_coefs = lasso_coefs[lasso_coefs != 0]
selected_coins = nonzero_coefs.index.tolist()

print("LASSO-selected coins:", selected_coins)

# Rank only the kept coins by absolute coefficient. Ranking the full series
# would let zero-coefficient coins slip into the "top 3" whenever Lasso keeps
# fewer than three.
selected_coins = (
    nonzero_coefs.abs()
    .sort_values(ascending=False)
    .head(3)
    .index
    .tolist()
)

print("Top 3 selected coins:", selected_coins)

LASSO-selected coins: ['ETH', 'XRP', 'BNB', 'TRX', 'BCH', 'ADA', 'HBAR']
Top 3 selected coins: ['ETH', 'BNB', 'ADA']


In [80]:
# Refit by plain OLS on the selected coins only (with an intercept) to get
# unpenalized hedge ratios and the usual regression diagnostics.
X_sel = log_prices.loc[:, selected_coins]
X_const = sm.add_constant(X_sel)

ols_model = sm.OLS(endog=y, exog=X_const).fit()
print(ols_model.summary())

                            OLS Regression Results                            
Dep. Variable:                    BTC   R-squared:                       0.988
Model:                            OLS   Adj. R-squared:                  0.988
Method:                 Least Squares   F-statistic:                     4743.
Date:                Wed, 11 Feb 2026   Prob (F-statistic):          4.78e-168
Time:                        14:58:14   Log-Likelihood:                 491.61
No. Observations:                 180   AIC:                            -975.2
Df Residuals:                     176   BIC:                            -962.5
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          7.2675      0.174     41.774      0.0

In [81]:
# 1. Create the Spread
# The spread is the OLS residual: actual log BTC minus its fitted value.
fitted_y = ols_model.predict(X_const)
spread = y - fitted_y

# 2. Run Augmented Dickey-Fuller Test
# NOTE(review): the spread is the residual of an estimated regression; plain
# ADF p-values are presumably too optimistic in that setting -- confirm
# against Engle-Granger critical values.
adf_stat, p_value, _used_lag, _n_obs, crit_vals, _ic_best = adfuller(spread)

print(f"ADF Statistic: {adf_stat:.4f}")
print(f"p-value: {p_value:.4f}")
print("Critical Values:", crit_vals)

# 3. Calculate Half-Life (Mean Reversion Speed)
# Regression: Delta_spread ~ const + gamma * lag_spread + error
spread_lag = spread.shift(periods=1).dropna()
spread_diff = spread.diff(periods=1).dropna()

# Keep only the timestamps present in both series
valid_index = spread_lag.index.intersection(spread_diff.index)
spread_lag, spread_diff = spread_lag.loc[valid_index], spread_diff.loc[valid_index]

# Run OLS for Half-Life
spread_lag_const = sm.add_constant(spread_lag)
hl_model = sm.OLS(spread_diff, spread_lag_const).fit()

# Slope on the lagged spread; position 1 follows the constant at position 0
gamma = hl_model.params.iloc[1]

print(f"Mean Reversion Strength (Gamma): {gamma:.4f}")

if not gamma < 0:
    print("Warning: Spread is NOT mean-reverting (Gamma >= 0). Half-life is undefined.")
else:
    half_life = -np.log(2) / gamma
    print(f"Half-life: {half_life:.2f} days")

ADF Statistic: -5.7177
p-value: 0.0000
Critical Values: {'1%': np.float64(-3.4674201432469816), '5%': np.float64(-2.877826051844538), '10%': np.float64(-2.575452082332012)}
Mean Reversion Strength (Gamma): -0.2909
Half-life: 2.38 days
