In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import datetime
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [None]:
# Import necessary libraries
import yfinance as yf
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Step 1: Define stock and market index tickers
stock_ticker = "PLTR"       # Change this to any stock
benchmark_ticker = "^GSPC"  # S&P 500 index
risk_free_ticker = "^TNX"   # 10-year Treasury bond yield as risk-free rate

# Three-year window ending today
end_date = pd.Timestamp.today()
start_date = end_date - pd.DateOffset(years=3)

# Fetch the closing price of each ticker as a flat, single-level column.
# Newer yfinance releases return two-level (Price, Ticker) columns even for a
# single ticker; selecting "Close" then yields a one-column DataFrame instead of
# a Series. Reducing it to a named Series here keeps every downstream
# `data["..."]` lookup a plain Series regardless of the installed version.
price_frames = {}
for column_name, ticker in [
    ("Stock_Close", stock_ticker),
    ("Market_Close", benchmark_ticker),
    ("Risk_Free_Close", risk_free_ticker),
]:
    # auto_adjust=True is passed explicitly so "Close" is the split/dividend
    # adjusted price on every yfinance version, not only where it is the default.
    close = yf.download(ticker, start=start_date, end=end_date, auto_adjust=True)["Close"]
    if close.empty:
        # Fail here with a clear message instead of deep inside the regression
        raise ValueError(f"No price data returned for {ticker}")
    if isinstance(close, pd.DataFrame):
        close = close.iloc[:, 0]
    price_frames[column_name] = close.rename(column_name).to_frame()

stock_data = price_frames["Stock_Close"]
market_data = price_frames["Market_Close"]
risk_free_data = price_frames["Risk_Free_Close"]

# Debugging: Print available columns to ensure proper data loading
print("Stock Data Columns:", stock_data.columns)
print("Market Data Columns:", market_data.columns)
print("Risk-Free Data Columns:", risk_free_data.columns)

# Merge on the date index, keeping only days present in all three series
data = stock_data.join(market_data, how="inner").join(risk_free_data, how="inner")
data.dropna(inplace=True)  # Remove missing values

# Step 2: Daily simple returns for the stock and the benchmark
for price_col, return_col in (
    ("Stock_Close", "Stock_Returns"),
    ("Market_Close", "Market_Returns"),
):
    data[return_col] = data[price_col].pct_change()

# The yield is quoted in percent per year: /100 gives a decimal, /252 spreads it
# over the trading days of a year
data["Risk_Free_Rate"] = data["Risk_Free_Close"].div(100).div(252)

# pct_change() leaves the first row without a return; remove it
data.dropna(inplace=True)

# Returns in excess of the daily risk-free rate
risk_free = data["Risk_Free_Rate"]
data["Excess_Stock_Returns"] = data["Stock_Returns"].sub(risk_free)
data["Excess_Market_Returns"] = data["Market_Returns"].sub(risk_free)

# Step 3: Regression input holding only the two excess-return series
excess_cols = ["Excess_Stock_Returns", "Excess_Market_Returns"]
df_excess_returns = data[excess_cols].dropna()

# Step 4: CAPM Estimation using OLS Regression
# Regress the stock's excess return on the market's excess return:
#   excess_stock = alpha + beta * excess_market + error
X = sm.add_constant(df_excess_returns["Excess_Market_Returns"])  # Add intercept term
Y = df_excess_returns["Excess_Stock_Returns"]

model = sm.OLS(Y, X)
results = model.fit()

# Extract regression coefficients.
# `results.params` is a pandas Series indexed by regressor name, so integer keys
# inside `[]` rely on pandas' deprecated positional fallback. `.iloc` states the
# positional intent explicitly: position 0 is the constant added above,
# position 1 is the market slope.
alpha = results.params.iloc[0]
beta = results.params.iloc[1]
r_squared = results.rsquared

# Step 5: Scatter of the observations with the fitted CAPM line on top
market_excess = df_excess_returns["Excess_Market_Returns"]
stock_excess = df_excess_returns["Excess_Stock_Returns"]
fitted_values = results.predict(X)

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(market_excess, stock_excess, alpha=0.5, label="Data Points")
ax.plot(market_excess, fitted_values, color='red', label="Regression Line")

ax.set_xlabel("Market Excess Returns")
ax.set_ylabel(f"{stock_ticker} Excess Returns")
ax.set_title(f"CAPM Regression: {stock_ticker} vs. {benchmark_ticker}")
ax.legend()
plt.show()

# Report the fitted coefficients
summary_lines = [
    "CAPM Regression Results:",
    f"Alpha (Intercept): {alpha:.6f}",
    f"Beta (Slope): {beta:.6f}",
    f"R-squared: {r_squared:.4f}",
]
print("\n".join(summary_lines))

# Translate beta into a plain-language comparison with the market
volatility_message = (
    "more volatile than the market" if beta > 1
    else "less volatile than the market" if beta < 1
    else "as volatile as the market"
)

print(f"\nInterpretation: The beta of {stock_ticker} is {beta:.4f}, meaning it is {volatility_message}.")


## 1.2 Problem 2: Fama-French Three-Factor Model
# Objective: Extend the analysis to the Fama-French Three-Factor Model.


In [None]:
# Import necessary libraries
import yfinance as yf
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt


In [None]:
# Tickers used throughout this section:
#   stock_ticker     - the stock under study (change this to any stock)
#   benchmark_ticker - S&P 500 index
#   risk_free_ticker - 10-year Treasury yield, used as the risk-free rate
stock_ticker, benchmark_ticker, risk_free_ticker = "PLTR", "^GSPC", "^TNX"


In [None]:
# Define time range for 3 years
end_date = pd.Timestamp.today()
start_date = end_date - pd.DateOffset(years=3)

# Fetch the closing price of each ticker as a flat, single-level column.
# Newer yfinance releases return two-level (Price, Ticker) columns even for a
# single ticker; selecting "Close" then yields a one-column DataFrame instead of
# a Series. Reducing it to a named Series keeps the merged frame's column names
# plain strings regardless of the installed version.
price_frames = {}
for column_name, ticker in [
    ("Stock_Close", stock_ticker),
    ("Market_Close", benchmark_ticker),
    ("Risk_Free_Close", risk_free_ticker),
]:
    # auto_adjust=True is passed explicitly so "Close" is the split/dividend
    # adjusted price on every yfinance version, not only where it is the default.
    close = yf.download(ticker, start=start_date, end=end_date, auto_adjust=True)["Close"]
    if close.empty:
        # Fail here with a clear message instead of deep inside the regression
        raise ValueError(f"No price data returned for {ticker}")
    if isinstance(close, pd.DataFrame):
        close = close.iloc[:, 0]
    price_frames[column_name] = close.rename(column_name).to_frame()

stock_data = price_frames["Stock_Close"]
market_data = price_frames["Market_Close"]
risk_free_data = price_frames["Risk_Free_Close"]

# Debugging: Print available columns
print("Stock Data Columns:", stock_data.columns)
print("Market Data Columns:", market_data.columns)
print("Risk-Free Data Columns:", risk_free_data.columns)

# Merge datasets on the date index, keeping only days present in all three series
data = stock_data.join(market_data, how="inner").join(risk_free_data, how="inner")
data.dropna(inplace=True)  # Remove missing values


In [None]:
# Load Fama-French three-factor data
ff_url = "https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Research_Data_Factors_daily_CSV.zip"

# Read the zipped CSV, skipping the leading description rows and trailing footer text
ff_raw = pd.read_csv(ff_url, compression="zip", skiprows=3, skipfooter=5, engine="python")

# The first column arrives without a usable header; call it "Date"
ff_raw = ff_raw.rename(columns={ff_raw.columns[0]: "Date"})

# Work with the dates as text so non-date rows can be recognised
ff_raw["Date"] = ff_raw["Date"].astype(str)

# Keep only rows whose "Date" is purely numeric (drops any remaining footer text).
# .copy() makes the filtered frame independent of `ff_raw`, so the column
# assignment below does not raise pandas' SettingWithCopyWarning.
ff_data = ff_raw[ff_raw["Date"].str.match(r"^\d+$")].copy()

# Parse the YYYYMMDD strings into timestamps
ff_data["Date"] = pd.to_datetime(ff_data["Date"], format="%Y%m%d")

# Restrict the factors to the stock's date range (both ends inclusive)
ff_data = ff_data[ff_data["Date"].between(start_date, end_date)]

# Set Date as index
ff_data = ff_data.set_index("Date")

# Convert factor values from percent to decimal so they are on the same scale
# as the pct_change() returns
ff_data = ff_data / 100

# Debugging: Display first rows to ensure correct loading
print(ff_data.head())




In [None]:
# Daily simple returns from the closing prices
return_sources = {"Stock_Returns": "Stock_Close", "Market_Returns": "Market_Close"}
for return_col, price_col in return_sources.items():
    data[return_col] = data[price_col].pct_change()

# Yield is in percent per year: /100 -> decimal, /252 -> per trading day
data["Risk_Free_Rate"] = data["Risk_Free_Close"].div(100).div(252)

# The first row has no prior close, so its returns are NaN; drop it
data.dropna(inplace=True)

# Excess returns over the daily risk-free rate
daily_rf = data["Risk_Free_Rate"]
data["Excess_Stock_Returns"] = data["Stock_Returns"].sub(daily_rf)
data["Excess_Market_Returns"] = data["Market_Returns"].sub(daily_rf)

# Debug: show the first rows to verify the excess return calculations
print(data.head())


In [None]:
# Compare the two indices before joining: level names, leading values, and type
data_index = data.index
ff_index = ff_data.index

# Level names
print("Data Index Levels:", data_index.names)
print("FF Data Index Levels:", ff_index.names)

# Leading values
print("\nFirst few dates in `data`:", data_index[:5])
print("\nFirst few dates in `ff_data`:", ff_index[:5])

# Index classes
print("\nData index type:", type(data_index))
print("FF Data index type:", type(ff_index))








In [None]:
# Collapse multi-level column labels in `data` into single strings by joining
# the levels with "_"
if isinstance(data.columns, pd.MultiIndex):
    print("Flattening MultiIndex columns in `data`...")
    flat_names = []
    for col in data.columns:
        if isinstance(col, tuple):
            flat_names.append('_'.join(col).strip())
        else:
            flat_names.append(col)
    data.columns = flat_names

# Make sure the Fama-French column labels are strings as well
ff_data.columns = ff_data.columns.astype(str)

# Join on the date index, keeping only dates present in both frames
final_data = data.join(ff_data, how="inner")

# Remove rows with any missing value
final_data.dropna(inplace=True)

# Debug: Check first few rows
print(final_data.head())



In [None]:
# Strip any trailing underscores from the column labels
final_data.columns = final_data.columns.map(lambda name: name.rstrip('_'))

# Debug: Check updated column names
print("Updated columns in final_data:", final_data.columns)




In [None]:
# Define dependent variable (stock excess returns).
# The excess return is taken over the RF column shipped with the factor data, so
# both sides of the regression are measured against the same risk-free rate
# (the Mkt-RF regressor is already net of that RF), rather than mixing it with
# the Treasury-yield proxy used for the CAPM section.
Y = (final_data["Stock_Returns"] - final_data["RF"]).rename("Excess_Stock_Returns")

# Define independent variables (Fama-French three factors)
X = final_data[["Mkt-RF", "SMB", "HML"]]
X = sm.add_constant(X)  # Add intercept term

# Run multiple regression (Three-Factor Model)
model_ff = sm.OLS(Y, X)
results_ff = model_ff.fit()

# Display regression results
print(results_ff.summary())


In [None]:
# One-factor (CAPM) benchmark, re-estimated here instead of a hand-copied number.
# It is fitted on exactly the same observations and the same dependent variable
# as the three-factor model, using only the market factor, so the two fits are
# directly comparable and stay in sync whenever the data is refreshed.
capm_exog = sm.add_constant(final_data[["Mkt-RF"]])
results_capm = sm.OLS(results_ff.model.endog, capm_exog).fit()
r_squared_capm = results_capm.rsquared

# Fama-French Three-Factor Model R-squared
r_squared_ff = results_ff.rsquared

# Compare the models
print(f"CAPM R-squared: {r_squared_capm:.4f}")
print(f"Fama-French R-squared: {r_squared_ff:.4f}")
print(f"CAPM adjusted R-squared: {results_capm.rsquared_adj:.4f}")
print(f"Fama-French adjusted R-squared: {results_ff.rsquared_adj:.4f}")

# Plain R-squared cannot decrease when regressors are added, so the verdict is
# based on adjusted R-squared, which penalises the two extra factors.
if results_ff.rsquared_adj > results_capm.rsquared_adj:
    print(" The Three-Factor Model explains more variation in returns than CAPM.")
else:
    print(" The CAPM model explains returns better in this case.")


In [None]:
# Loadings on the size (SMB) and value (HML) factors from the three-factor fit
factor_loadings = results_ff.params
smb_coef = factor_loadings["SMB"]
hml_coef = factor_loadings["HML"]

# Size factor: a positive loading is read as small-cap-like behaviour
size_effect = (
    "✅ The stock behaves more like small-cap stocks."
    if smb_coef > 0
    else "❌ The stock behaves more like large-cap stocks."
)

# Value factor: a positive loading is read as value-like behaviour
value_effect = (
    "✅ The stock behaves more like value stocks."
    if hml_coef > 0
    else "❌ The stock behaves more like growth stocks."
)

print(f"\nSMB Coefficient: {smb_coef:.4f} → {size_effect}")
print(f"HML Coefficient: {hml_coef:.4f} → {value_effect}")


# Section 1.3: Clustering Stocks Based on Three-Factor Model Betas

In [None]:
# Import necessary libraries
import yfinance as yf
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


In [None]:
# Define stock tickers
requested_tickers = ["NVDA", "MSFT", "PLTR", "TSLA", "GOOGL", "COST", "BABA","HOLX","HM","ABNB"]

# Define time range
start_date = "2010-02-01"
end_date = "2025-02-01"

# Download adjusted closing prices (auto_adjust=True makes "Close" the adjusted
# price on every yfinance version, not only where it is the default)
stock_data = yf.download(requested_tickers, start=start_date, end=end_date, auto_adjust=True)["Close"]

# A ticker that returned no prices at all is an all-NaN column; left in place it
# would make the row-wise dropna() below discard every date. Remove such columns
# and keep `tickers` in sync so later per-ticker loops only see usable symbols.
stock_data = stock_data.dropna(axis=1, how="all")
tickers = [ticker for ticker in requested_tickers if ticker in stock_data.columns]
missing_tickers = [ticker for ticker in requested_tickers if ticker not in tickers]
if missing_tickers:
    print("No price data returned for:", missing_tickers)

# Compute daily returns. fill_method=None stops pandas from forward-filling gaps
# (which would create artificial 0% returns). dropna() then keeps only dates on
# which every remaining ticker has a return, so the sample effectively starts
# when the most recently listed ticker begins trading.
stock_returns = stock_data.pct_change(fill_method=None).dropna()
print(f"Common sample: {stock_returns.index.min()} to {stock_returns.index.max()} ({len(stock_returns)} rows)")

# Debugging: Display first rows
print(stock_returns.head())


In [None]:
# Load Fama-French Three-Factor Data (zipped CSV; skip the leading description
# rows and trailing footer text)
ff_raw = pd.read_csv("https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Research_Data_Factors_daily_CSV.zip",
                     compression="zip", skiprows=3, skipfooter=5, engine="python")

# The first column arrives without a usable header; call it "Date"
ff_raw = ff_raw.rename(columns={ff_raw.columns[0]: "Date"})

# Work with the dates as text so non-date rows can be recognised
ff_raw["Date"] = ff_raw["Date"].astype(str)

# Keep only rows whose "Date" is purely numeric (drops any remaining footer text).
# .copy() makes the filtered frame independent of `ff_raw`, so the column
# assignments below do not raise pandas' SettingWithCopyWarning.
ff_data = ff_raw[ff_raw["Date"].str.match(r"^\d+$")].copy()

# Parse the YYYYMMDD strings into timestamps
ff_data["Date"] = pd.to_datetime(ff_data["Date"], format="%Y%m%d")

# Convert percentages to decimal format
factor_cols = ["Mkt-RF", "SMB", "HML", "RF"]
ff_data[factor_cols] = ff_data[factor_cols] / 100

# Set Date as index
ff_data = ff_data.set_index("Date")

# Merge stock returns with Fama-French factors.
# Factor columns left over from a previous run of this cell are dropped first;
# otherwise re-running would merge them twice and produce `_x` / `_y` suffixed
# duplicates that break the later `stock_returns["RF"]` lookups.
stock_returns = (
    stock_returns
    .drop(columns=factor_cols, errors="ignore")
    .merge(ff_data, how="inner", left_index=True, right_index=True)
)

# Debug: Display first few rows
print(stock_returns.head())



In [None]:
# Factor regressors (plus intercept) are the same for every stock, so build them once
factor_cols = ["Mkt-RF", "SMB", "HML"]
X = sm.add_constant(stock_returns[factor_cols])

# One row per stock: [ticker, market beta, size beta, value beta]
betas = []
for stock in tickers:
    # Dependent variable: the stock's return in excess of the risk-free rate
    Y = stock_returns[stock].sub(stock_returns["RF"])

    # Three-factor OLS fit for this stock
    model = sm.OLS(Y, X).fit()

    betas.append([stock, *(model.params[factor] for factor in factor_cols)])

# Tabulate the estimated loadings
betas_df = pd.DataFrame(betas, columns=["Stock", "Beta_MktRF", "Beta_SMB", "Beta_HML"])

# Debug: Display first few betas
print(betas_df.head())


In [None]:
# Columns holding the three factor loadings (everything except the stock name)
beta_cols = ["Beta_MktRF", "Beta_SMB", "Beta_HML"]

# Put the loadings on a common scale before clustering
scaler = StandardScaler()
betas_scaled = scaler.fit_transform(betas_df[beta_cols])

# Rebuild a labelled frame and re-attach the stock names
betas_scaled_df = pd.DataFrame(betas_scaled, columns=beta_cols).assign(Stock=betas_df["Stock"])

# K-Means with 3 clusters, fitted on the scaled loadings only
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
betas_scaled_df["Cluster"] = kmeans.fit_predict(betas_scaled_df[beta_cols])

# Debug: Display first few rows
print(betas_scaled_df.head())


In [None]:
# Market beta against size beta, one colour per cluster
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=betas_scaled_df,
    x="Beta_MktRF",
    y="Beta_SMB",
    hue="Cluster",
    palette="Set1",
    alpha=0.7,
    ax=ax,
)

ax.set_xlabel("Beta (Market-RF)")
ax.set_ylabel("Beta (SMB - Size Factor)")
ax.set_title("Stock Clusters Based on Fama-French Betas")
ax.legend(title="Cluster")
plt.show()


In [None]:
# Average (scaled) loading on each factor within every cluster
beta_cols = ["Beta_MktRF", "Beta_SMB", "Beta_HML"]
cluster_means = betas_scaled_df[beta_cols].groupby(betas_scaled_df["Cluster"]).mean()

# Debugging: Display cluster characteristics
print(cluster_means)


In [None]:
# One boxplot panel per factor loading, grouped by cluster
panels = [
    ("Beta_MktRF", "Market Beta (Mkt-RF) by Cluster"),  # market exposure
    ("Beta_SMB", "Size Beta (SMB) by Cluster"),         # size effect
    ("Beta_HML", "Value Beta (HML) by Cluster"),        # value effect
]

fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for ax, (beta_col, panel_title) in zip(axes, panels):
    sns.boxplot(x="Cluster", y=beta_col, data=betas_scaled_df, ax=ax)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()


In [None]:
# Collect the ticker names that fall into each cluster
stocks_by_cluster = betas_scaled_df.groupby("Cluster")["Stock"].agg(list)

# Debugging: one line per cluster listing its members
for cluster, stocks in zip(stocks_by_cluster.index, stocks_by_cluster.values):
    print(f"Cluster {cluster}: {stocks}")


In [None]:
from sklearn.decomposition import PCA
import plotly.express as px

# Inputs for the projection: scaled loadings, plus names and cluster ids for labelling
beta_cols = ["Beta_MktRF", "Beta_SMB", "Beta_HML"]
X_betas_only = betas_scaled_df[beta_cols].copy()
stock_names = betas_scaled_df["Stock"].copy()
cluster_labels = betas_scaled_df["Cluster"].copy()

# Project the three loadings onto two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_betas_only)

# Labelled frame of the projected points
pca_df = pd.DataFrame(X_pca, columns=["PCA1", "PCA2"]).assign(
    Stock=stock_names,
    Cluster=cluster_labels,
)

# Interactive scatter; the cluster id is cast to text so it is coloured as a category
scatter_options = {
    "x": "PCA1",
    "y": "PCA2",
    "color": pca_df["Cluster"].astype(str),
    "hover_data": ["Stock"],
    "title": "PCA Projection of Clusters (Colored by Cluster, Labeled by Company Name)",
    "labels": {"PCA1": "PCA Component 1", "PCA2": "PCA Component 2"},
}
fig = px.scatter(pca_df, **scatter_options)

fig.show()

