In [None]:
# Cell 0: Install Required Packages and Import Libraries
# The two shell escapes below invoke pip through the interpreter named by
# sys.executable: the first upgrades pip itself, the second installs the
# third-party packages imported further down in this cell.
# NOTE(review): no versions are pinned, so a fresh run may pick up newer
# releases than the ones this notebook was written against — consider pinning.
import sys
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install yfinance statsmodels seaborn scikit-learn pandas matplotlib scipy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import scipy.stats as stats
import requests, io, zipfile
import datetime

# Notebook-wide plotting defaults: seaborn's white-grid style first, then a
# 10x6 inch default figure size applied on top of it.
sns.set(style='whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})



In [None]:
# Cell 1: Define Date Range and Common Variables

# Analysis window: the 3 x 365 calendar days ending today.
end_date = datetime.date.today()
lookback = datetime.timedelta(days=3 * 365)
start_date = end_date - lookback

# Convert a 2% annual risk-free rate into the equivalent compounded rate per
# trading day, using 252 trading days per year.
annual_rf = 0.02
trading_days_per_year = 252
daily_rf = (1 + annual_rf) ** (1 / trading_days_per_year) - 1
print(f"Daily Risk-Free Rate: {daily_rf:.6f}")


In [None]:
# Cell 2: Problem 1 – CAPM Model: Data Retrieval
ticker_stock, ticker_index = 'AAPL', '^GSPC'

# Download the stock first, then the index, over the shared date window.
df_stock, df_index = (
    yf.download(symbol, start=start_date, end=end_date)
    for symbol in (ticker_stock, ticker_index)
)

# Show the first rows of each download as a quick sanity check.
for heading, frame in (
    ("Stock data sample:", df_stock),
    ("\nIndex data sample:", df_index),
):
    print(heading)
    print(frame.head())


In [None]:
# Cell 3: Problem 1 – CAPM Model: Calculate Excess Returns


def _with_excess_returns(prices, rf_daily):
    """Return a copy of ``prices`` with 'Return' and 'Excess_Return' columns.

    Parameters
    ----------
    prices : pandas.DataFrame
        Price frame containing a 'Close' column. If the columns are a
        MultiIndex they are flattened to their first level.
    rf_daily : float
        Per-day risk-free rate subtracted from the daily return.

    Returns
    -------
    pandas.DataFrame
        A new frame; rows whose 'Excess_Return' is NaN are removed.

    Notes
    -----
    'Return' is only derived when it is not already present. Recomputing
    ``pct_change()`` on a frame whose first row was already dropped would
    produce a new leading NaN, and dropping it again would silently discard
    one more observation on every re-run of this cell.
    """
    out = prices.copy()

    # Flatten MultiIndex columns by taking only the first level.
    if isinstance(out.columns, pd.MultiIndex):
        out.columns = out.columns.get_level_values(0)

    # Daily simple returns from the "Close" column (first run only).
    if 'Return' not in out.columns:
        out['Return'] = out['Close'].pct_change()

    # Excess return is always refreshed so a changed risk-free rate is honoured.
    out['Excess_Return'] = out['Return'] - rf_daily

    # Drop the NaN row(s) produced by pct_change.
    return out.dropna(subset=['Excess_Return'])


df_stock = _with_excess_returns(df_stock, daily_rf)
df_index = _with_excess_returns(df_index, daily_rf)

print(df_stock.head())
print(df_index.head())



In [None]:
# Cell 4: CAPM Regression Analysis

# Align stock and index excess returns on their shared Date index; the
# suffixes distinguish the two identically named columns.
stock_excess = df_stock[['Excess_Return']]
index_excess = df_index[['Excess_Return']]
df_capm = stock_excess.merge(
    index_excess,
    left_index=True,
    right_index=True,
    suffixes=('_stock', '_index'),
)

# Response: stock excess return. Regressors: intercept + index excess return.
y_capm = df_capm['Excess_Return_stock']
X_capm = sm.add_constant(df_capm['Excess_Return_index'])

# Ordinary least squares fit, followed by the full regression summary.
model_capm = sm.OLS(y_capm, X_capm).fit()
print(model_capm.summary())


In [None]:
# Cell 5: Plot the CAPM Regression

market_excess = df_capm['Excess_Return_index']
asset_excess = df_capm['Excess_Return_stock']

# Scatter of the observed (index, stock) excess-return pairs.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(market_excess, asset_excess, alpha=0.5, label="Data Points")

# Fitted line evaluated on 100 evenly spaced points across the observed
# index range; coefficients are looked up by name rather than by position.
x_vals = np.linspace(market_excess.min(), market_excess.max(), 100)
intercept = model_capm.params["const"]
slope = model_capm.params["Excess_Return_index"]
y_vals = intercept + slope * x_vals
ax.plot(x_vals, y_vals, color="red", label="CAPM Regression Line")

# Axis labels, title and legend.
ax.set_xlabel("Index Excess Return (S&P 500)")
ax.set_ylabel("Stock Excess Return (AAPL)")
ax.set_title("CAPM Regression: AAPL vs. S&P 500")
ax.legend()
plt.show()



In [None]:
# Cell 6: Problem 2 – Fama–French Three-Factor Model: Data Retrieval
# (requests, io and zipfile are imported in Cell 0.)

# URL for Fama–French daily factors data (CSV zipped)
ff_url = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/F-F_Research_Data_Factors_daily_CSV.zip'

# Bound the wait with a timeout and fail on HTTP errors here, so that an error
# page is reported as an HTTP failure rather than as a corrupt zip archive.
r = requests.get(ff_url, timeout=30)
r.raise_for_status()

# Read the first archive member, skipping the header rows; the context
# managers close both the archive and the member once the CSV is parsed.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    fname = z.namelist()[0]
    with z.open(fname) as f:
        ff_df = pd.read_csv(f, skiprows=3)

# If the first column isn't labeled "Date", rename it
if 'Date' not in ff_df.columns:
    print("Columns before renaming:", ff_df.columns)
    ff_df = ff_df.rename(columns={ff_df.columns[0]: "Date"})
    print("Columns after renaming:", ff_df.columns)

# Keep only data rows, i.e. those whose 'Date' is all digits once surrounding
# whitespace is removed. The same stripped text is used for parsing below so
# that a padded value which passes this filter cannot fail the date conversion.
date_text = ff_df['Date'].astype(str).str.strip()
is_data_row = date_text.str.isdigit()
ff_df = ff_df[is_data_row].copy()

# Convert the Date column to datetime format (YYYYMMDD)
ff_df['Date'] = pd.to_datetime(date_text[is_data_row], format='%Y%m%d')

# Convert factor columns from percentages to decimals
ff_cols = ['Mkt-RF', 'SMB', 'HML', 'RF']
ff_df[ff_cols] = ff_df[ff_cols].apply(pd.to_numeric) / 100

# Filter for our date range (both ends inclusive) and set Date as index
in_window = ff_df['Date'].between(pd.Timestamp(start_date), pd.Timestamp(end_date))
ff_df = ff_df.loc[in_window].set_index('Date')

print("Fama–French Factors data sample:")
print(ff_df.head())


In [None]:
# Cell 7: Problem 2 – Fama–French Three-Factor Model: Merge and Regression Analysis

# Inner-join the stock's daily returns with the factor table on the Date
# index, so only dates present in both sources are kept.
stock_returns = df_stock[['Return']]
df_ff_merge = stock_returns.merge(ff_df, left_index=True, right_index=True, how='inner')

# Excess return here uses the Fama–French risk-free rate (RF column).
df_ff_merge['Excess_Return'] = df_ff_merge['Return'] - df_ff_merge['RF']

# Response and regressors (intercept + Mkt-RF, SMB, HML).
factor_names = ['Mkt-RF', 'SMB', 'HML']
y_ff = df_ff_merge['Excess_Return']
X_ff = sm.add_constant(df_ff_merge[factor_names])

# Ordinary least squares fit of the three-factor model, then its summary.
model_ff = sm.OLS(y_ff, X_ff).fit()
print(model_ff.summary())


In [None]:
# Cell 8: Problem 3 – Clustering Stocks: Data Retrieval & Feature Engineering

# The ten tickers to cluster.
tickers = ['AAPL', 'MSFT', 'AMZN', 'TSLA', 'JPM', 'PFE', 'KO', 'XOM', 'NVDA', 'META']

# Daily price data for all tickers over the shared date window.
df_prices_all = yf.download(tickers, start=start_date, end=end_date)

# Choose the price field: 'Adj Close' is preferred, 'Close' is the fallback.
# For each candidate, a direct column lookup is tried before a lookup on the
# first column level.
for price_field in ('Adj Close', 'Close'):
    if price_field in df_prices_all.columns:
        df_prices = df_prices_all[price_field]
        break
    if price_field in df_prices_all.columns.get_level_values(0):
        df_prices = df_prices_all.xs(price_field, axis=1, level=0)
        break
else:
    raise KeyError("Neither 'Adj Close' nor 'Close' columns found in the downloaded data.")

# Daily returns; rows containing any NaN (including the first row) are dropped.
df_returns = df_prices.pct_change().dropna()

# Per-stock summary statistics, one row per column of df_returns.
stats_df = pd.DataFrame(
    {
        'Mean_Return': df_returns.mean(),
        'Std_Dev': df_returns.std(),
        'Skew': df_returns.skew(),
        'Kurtosis': df_returns.kurtosis(),
    },
    index=df_returns.columns,
)

print("Summary statistics for clustering:")
print(stats_df)


In [None]:
# Cell 9: Problem 3 – Clustering Stocks: Clustering Analysis & Visualization
# (StandardScaler and KMeans are imported in Cell 0.)

import os
import warnings

# Ask OpenMP for a single thread, intended to avoid the KMeans/MKL memory-leak
# warning on Windows.
# NOTE(review): numpy and scikit-learn were already imported in Cell 0, so this
# setting may not take effect in the running kernel — the warning filter below
# is what reliably silences the message.
os.environ["OMP_NUM_THREADS"] = "1"

# Filter out the specific KMeans memory leak warning
warnings.filterwarnings("ignore", message="KMeans is known to have a memory leak on Windows with MKL")

# Cluster on the four return statistics only. This cell appends a 'Cluster'
# column to stats_df, so passing the whole frame to the scaler would feed the
# previous labels back in as a fifth feature whenever the cell is re-run.
feature_cols = ['Mean_Return', 'Std_Dev', 'Skew', 'Kurtosis']

# Normalize the summary statistics
scaler = StandardScaler()
stats_scaled = scaler.fit_transform(stats_df[feature_cols])

# Apply k-means clustering with 3 clusters; n_init and random_state are set
# explicitly so the assignment is reproducible.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
stats_df['Cluster'] = kmeans.fit_predict(stats_scaled)

print("Cluster assignments:")
print(stats_df[['Cluster']])

# Create a scatter plot: Mean Return vs. Standard Deviation colored by cluster
plt.figure(figsize=(8,5))
sns.scatterplot(x='Mean_Return', y='Std_Dev', hue='Cluster', data=stats_df, palette='Set1', s=100)

# Annotate each point with its ticker symbol
for ticker in stats_df.index:
    plt.annotate(ticker, (stats_df.loc[ticker, 'Mean_Return'], stats_df.loc[ticker, 'Std_Dev']),
                 textcoords="offset points", xytext=(5,5), ha='left')

plt.xlabel("Mean Daily Return")
plt.ylabel("Standard Deviation of Daily Return")
plt.title("Stock Clusters based on Daily Return Statistics")
plt.legend(title="Cluster")
plt.show()



In [None]:
# Cell 10: Final Analysis and Conclusions

# The closing narrative, one entry per output line (empty strings are the
# blank separator lines between sections).
conclusion_lines = (
    "Final Analysis and Conclusions:",
    "--------------------------------------------------",
    "CAPM Analysis:",
    " - The beta from the CAPM regression indicates the sensitivity of AAPL's returns to the market (S&P 500).",
    " - The R-squared value shows that a significant portion of the stock's excess returns is explained by market movements.",
    "",
    "Fama–French Analysis:",
    " - The Fama–French three-factor model includes additional factors (SMB and HML) that capture size and value effects.",
    " - Comparing the coefficients and R-squared values with the CAPM model can highlight the added explanatory power of these factors.",
    "",
    "Clustering Analysis:",
    " - The k-means clustering groups stocks based on their mean returns, volatility, skewness, and kurtosis.",
    " - The scatter plot shows how stocks with similar return characteristics tend to cluster together.",
    "",
    "Overall, this project demonstrates multiple methods for analyzing stock performance and grouping stocks based on their historical return behavior.",
)

for conclusion_line in conclusion_lines:
    print(conclusion_line)
