# In [None]:
# Problem 1 CAPM Model
# 1. Data Retrieval
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import datetime
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

# Step 1: Fetch Data
# Sample window shared by every download in this section.
start_date = "2022-01-01"
end_date = "2024-12-31"

# Symbols: 13-week Treasury yield (risk-free proxy), S&P 500 index (market),
# and the equities under study (edit the list to analyse other stocks).
risk_free_ticker = "^IRX"
market_index_ticker = "^GSPC"
equity_tickers = ["NVDA"]

# Download each series over the same window and keep only the "Close" field.
download_window = {"start": start_date, "end": end_date}
risk_free_data = yf.download(risk_free_ticker, **download_window)["Close"]
market_data = yf.download(market_index_ticker, **download_window)["Close"]
stock_data = yf.download(equity_tickers, **download_window)["Close"]
#1.2 Excess Returns
# Step 2: Prepare Data
# Day-over-day percentage changes; rows with missing values (e.g. the first
# row, which has no prior price) are dropped.
stock_returns = stock_data.pct_change().dropna()
market_returns = market_data.pct_change().dropna()

# Scale the quoted rate from percent to a per-trading-day decimal (252 days
# per year), then align it to the market return dates, forward-filling gaps.
risk_free_rate = (risk_free_data / 100 / 252).reindex(
    market_returns.index, method="ffill"
)

# Make sure we hold a DataFrame and discard dates that still have no rate.
risk_free_rate = pd.DataFrame(risk_free_rate).dropna()

# Inner-join the three return sets on their shared date index.
market_and_rf = market_returns.merge(
    risk_free_rate, left_index=True, right_index=True
)
merged_df = stock_returns.merge(
    market_and_rf, left_index=True, right_index=True
)

# Compound each series into a running growth factor. The column list is
# snapshotted first because new columns are added while looping.
for col in list(merged_df.columns):
    merged_df[f"{col}_cumu_return"] = merged_df[col].add(1).cumprod()
import plotly.graph_objects as go

# Line chart of the NVDA column of stock_data against its index
# (presumably trading dates -- confirm against the download output).
nvda_price_trace = go.Scatter(
    x=stock_data.index,
    y=stock_data['NVDA'],
    mode='lines',
    name='NVIDIA Stock Prices',
)

# Build the figure directly from the trace, then apply titles and theme.
fig = go.Figure(data=[nvda_price_trace])
fig.update_layout(
    title='NVIDIA Stock Price Over Time',
    xaxis_title='Date',
    yaxis_title='Price',
    template='plotly_white',
    showlegend=True,
)

# Render the chart.
fig.show()
# Plot the cumulative-return series of `selected`, optionally overlaid with
# `comparable`. Set `comparable` to '' to plot the selected series alone.
selected = 'NVDA'
comparable = '^GSPC'

fig = go.Figure()

# Cumulative return of the selected ticker.
fig.add_trace(go.Scatter(
    x=merged_df.index,
    y=merged_df[f'{selected}_cumu_return'],
    mode='lines',
    name=f'{selected} Cumulative Return'
))

if comparable:
    # Overlay the comparison series on the same axes.
    fig.add_trace(go.Scatter(
        x=merged_df.index,
        y=merged_df[f'{comparable}_cumu_return'],
        mode='lines',
        name=f'{comparable} Cumulative Return'
    ))
    plot_title = f'{selected} and {comparable} Cumulative Return'
else:
    plot_title = f'{selected} Cumulative Return'

# The plotted columns are compounded (1 + r) products, so the title and axis
# describe cumulative return rather than daily return or price.
fig.update_layout(
    title=plot_title,
    xaxis_title='Date',
    yaxis_title='Cumulative Return (growth of 1)',
    template='plotly_white',
    showlegend=True
)

# Show the plot
fig.show()
# Market excess return = market return minus the daily risk-free rate.
merged_df['market_excess_return'] = merged_df['^GSPC'].sub(merged_df['^IRX'])
# Keep a copy of the index as a regular column.
merged_df['dt'] = merged_df.index
#1.3 CAPM Estimation
# CAPM: regress the stock's excess return on the market's excess return.
stock = 'NVDA'

y = merged_df[stock] - merged_df['^IRX']
x = sm.add_constant(merged_df['market_excess_return'])
model = sm.OLS(y, x).fit()

# Parameters come back in design-matrix order: intercept first, then slope.
alpha, beta = model.params.tolist()
r_squared = model.rsquared

# Report the fit.
print(
    f"CAPM Model for {stock}:",
    f"Alpha (Intercept):{alpha}",
    f"Beta (Slope):{beta}",
    f"R-squared: {r_squared}",
    sep="\n",
)
#1.4 Analysis
# ****Analysis: Because the Beta for Nvidia is 2.3, it is significantly more volatile than the market. The graph below shows that a slight change in the market return will cause a large change in Nvidia's return. ***

# Scatter of stock vs. market excess returns with the fitted CAPM line.
capm_fig, capm_ax = plt.subplots(figsize=(8, 5))
market_excess = merged_df['market_excess_return']
capm_ax.scatter(market_excess, y, alpha=0.6, label="Excess Returns Data")

# Evaluate the fitted line on 100 evenly spaced points across the observed
# range of market excess returns.
x_range = np.linspace(min(market_excess), max(market_excess), 100)
y_pred = alpha + beta * x_range
capm_ax.plot(
    x_range,
    y_pred,
    color='red',
    label=f"Regression Line: y = {alpha} + {beta}x",
)

# Axis labels, title, legend and grid.
capm_ax.set_xlabel("Market Excess Return")
capm_ax.set_ylabel(f"{stock} Excess Return")
capm_ax.set_title(f"CAPM Regression: {stock} vs Market Excess Returns")
capm_ax.legend()
capm_ax.grid()
plt.show()

# Problem 2: Fama-French Three-Factor Model
# 2.1 Data Retrieval - downloaded .csv name F-F_Research_Data_Factors_daily
# 2.2 Excess Returns
FF_df = pd.read_csv('F-F_Research_Data_Factors_daily.CSV')
# Dates are stored as YYYYMMDD; parse them into timestamps.
FF_df['Date'] = pd.to_datetime(FF_df['Date'], format='%Y%m%d')

# Divide every factor column by 100 (presumably percent -> decimal; confirm
# against the data file).
ff_value_cols = ['Mkt-RF', 'SMB', 'HML', 'RF']
FF_df[ff_value_cols] = FF_df[ff_value_cols] / 100

# Attach the factors to the stock returns, keeping only dates in both.
stock_returns = pd.merge(stock_returns, FF_df, on='Date', how='inner')

# NVDA excess return = stock return minus the factor file's risk-free rate.
stock = 'NVDA'
stock_returns['NVDA_Excess_Return'] = stock_returns[stock].sub(stock_returns['RF'])

preview_cols = ['Date', stock, 'RF', 'NVDA_Excess_Return']
print(stock_returns[preview_cols].head())

# 2.3 Model Estimation
# Dependent variable: stock return in excess of the risk-free rate.
y = stock_returns[stock] - stock_returns['RF']
# Regressors: the three Fama-French factors.
ff_factor_names = ['Mkt-RF', 'SMB', 'HML']
x = stock_returns[ff_factor_names]
# Add an intercept column so the fit also estimates alpha.
ffx = sm.add_constant(x)
model = sm.OLS(endog=y, exog=ffx).fit()
print("FF model")
print(model.summary())
#2.4 Analysis: Compare the R-squared values of the CAPM and Three-Factor Model.
# The R-squared of the CAPM model was .515, while the Three-Factor Model had an R-squared of .585. This means that the Three-Factor Model explains more of the variation in Nvidia's excess returns than CAPM does. This is because the Three-Factor Model adds the size (SMB) and value (HML) factors to the market factor, so it captures variation that the market factor alone cannot explain.

# Interpret the SMB and HML coefficients to discuss size and value effects.
# The SMB coefficient is -.54, which indicates that the stock behaves like a large-cap stock. This makes sense because Nvidia is among the five largest companies by market cap and is currently valued at $3.4 trillion.
# The HML coefficient is almost negative 1, which means that the stock has a very low book-to-market ratio and is a growth stock. In this case, the HML coefficient means that the market expects Nvidia to have significant growth in the coming years.
# Both coefficients also have a p-value of 0.000, so the coefficients are statistically significant.
#3.1 Data Retrieval
# Ten-stock universe used for the per-stock regressions and the clustering.
# This cell appeared twice verbatim; a single copy avoids repeating the
# download and the printed preview.
equity_ticker_new = ["AAPL", "MSFT", "AMZN", "TSLA", "JPM", "PFE", "KO", "XOM", "NVDA", "META"]
stock_data_new = yf.download(equity_ticker_new, start=start_date, end=end_date, progress=False)['Close']
#3.2 Calculate daily returns
# Day-over-day percentage changes; rows containing missing values are dropped.
stock_returns_new = stock_data_new.pct_change().dropna()
print(stock_returns_new.head())

#3.2.2 Compute summary statistics (done completing regression for all 10 stocks)
# Attach the Fama-French factors to the ten-stock returns (dates in both only).
stock_returns_new = pd.merge(stock_returns_new, FF_df, on='Date', how='inner')

# The design matrix (intercept + three factors) is the same for every stock,
# so it is built once up front.
x = sm.add_constant(stock_returns_new[['Mkt-RF', 'SMB', 'HML']])

for stock in equity_ticker_new:
    print(f"Summary statistics for {stock}:")
    # Stock return in excess of the risk-free rate.
    y = stock_returns_new[stock] - stock_returns_new['RF']
    model = sm.OLS(y, x).fit()

    # Full regression report for this stock.
    print(model.summary())
#3.3 Clustering
# Estimate each stock's three factor loadings; these are the clustering
# features.
# statsmodels' OLS does not add an intercept by itself, so one is added here.
# This keeps the betas consistent with the 3.2.2 regressions; without it the
# loadings come from a regression forced through the origin.
ff_factor_cols = ['Mkt-RF', 'SMB', 'HML']
x = sm.add_constant(stock_returns_new[ff_factor_cols])

all_betas = []
for stock in equity_ticker_new:
    # Stock return in excess of the risk-free rate.
    y = stock_returns_new[stock] - stock_returns_new['RF']
    model = sm.OLS(y, x).fit()
    all_betas.append({
        'Ticker': stock,
        'MKT_excess_beta': model.params['Mkt-RF'],
        'SMB_beta': model.params['SMB'],
        'HML_beta': model.params['HML'],
    })

# One row per ticker with its three betas.
all_beta_df = pd.DataFrame(all_betas)
# Fetch name, market cap, sector and industry for each ticker and merge them
# onto the beta table.
stock_info = []
for ticker in equity_ticker_new:
    # Local names are chosen so they do not overwrite the `stock_data` price
    # frame or the `stock` variable used by the earlier sections.
    ticker_info = yf.Ticker(ticker).info  # metadata mapping for this ticker

    stock_info.append({
        'Ticker': ticker,
        'Name': ticker_info.get('longName', 'N/A'),
        # A missing market cap is recorded as NaN, not the string 'N/A', so
        # the column stays numeric and can be averaged per cluster.
        'Market Cap': ticker_info.get('marketCap', np.nan),
        'Sector': ticker_info.get('sector', 'N/A'),
        'Industry': ticker_info.get('industry', 'N/A'),
    })

stock_info_df = pd.DataFrame(
    stock_info,
    columns=['Ticker', 'Name', 'Market Cap', 'Sector', 'Industry'],
)
# Coerce anything non-numeric (e.g. None) to NaN.
stock_info_df['Market Cap'] = pd.to_numeric(stock_info_df['Market Cap'], errors='coerce')

# Left join so every ticker in the beta table is kept.
all_beta_df = all_beta_df.merge(stock_info_df, on='Ticker', how='left')

print(all_beta_df)

#3.3 continued Normalize and use k-means clustering to group the stocks into 3 clusters
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Features for clustering: the three estimated factor betas per stock.
columns_for_clustering = ['MKT_excess_beta', 'SMB_beta', 'HML_beta']
X = all_beta_df[columns_for_clustering]

# Standardise each beta column; the fitted scaler is kept for later reuse.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
#3.4 Visualize
optimal_clusters = 3

# K-Means Clustering
# n_init is pinned explicitly: its default differs between scikit-learn
# releases, and several restarts reduce the chance of a poor solution.
kmeans = KMeans(n_clusters=optimal_clusters, n_init=10, random_state=20)
all_beta_df['Cluster'] = kmeans.fit_predict(X_scaled)
centroids = kmeans.cluster_centers_

# Scatter of the first two standardised features (market beta vs. SMB beta),
# coloured by assigned cluster. The HML beta is clustered on but not drawn.
plt.figure(figsize=(8, 6))
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=all_beta_df['Cluster'], cmap='viridis', edgecolors='k', s=100)
plt.scatter(centroids[:, 0], centroids[:, 1], s=250, marker='X', c='red', edgecolors='k', label="Centroids")

# Label every point with its ticker.
for i, ticker in enumerate(all_beta_df['Ticker']):
    plt.text(X_scaled[i, 0], X_scaled[i, 1], ticker, fontsize=9, ha='right', va='bottom')

# Label every centroid with its cluster id, slightly below the marker.
# Loop names avoid `x`/`y` so the regression variables are not overwritten.
for cluster_id, (cx, cy) in enumerate(zip(centroids[:, 0], centroids[:, 1])):
    plt.text(
        cx, cy - 0.15, f"{cluster_id}",
        fontsize=10, fontweight='bold', ha='center', va='top', color='black'
    )

plt.title(f"K-Means Clustering of Stocks ({optimal_clusters} Clusters)")
plt.xlabel('MKT_excess_beta (Normalized)')
plt.ylabel('SMB_beta (Normalized)')
plt.legend()
plt.show()

# Record the fitted cluster label of each stock on the beta table.
all_beta_df['Cluster'] = kmeans.labels_

# Per-cluster summary: mean of each beta, mean market cap, and member count.
cluster_summary_spec = {
    'mean_MKT_excess_beta': pd.NamedAgg(column='MKT_excess_beta', aggfunc='mean'),
    'mean_SMB_beta': pd.NamedAgg(column='SMB_beta', aggfunc='mean'),
    'mean_HML_beta': pd.NamedAgg(column='HML_beta', aggfunc='mean'),
    'mean_market_cap': pd.NamedAgg(column='Market Cap', aggfunc='mean'),
    'num_comp': pd.NamedAgg(column='Ticker', aggfunc='count'),
}
clusters_grouped = all_beta_df.groupby('Cluster')
cluster_analysis = clusters_grouped.agg(**cluster_summary_spec).reset_index()

print("Cluster Analysis (Means and Record Counts):\n", cluster_analysis)
# Bare expression kept from the notebook export (notebook cell display).
cluster_analysis
def _top_n_per_cluster(counts, count_col, n=3):
    """Return the ``n`` rows with the largest ``count_col`` in each cluster.

    ``counts`` must have a 'Cluster' column. Rows come back ordered by
    cluster, then by descending count; ties keep their original order.
    'Cluster' stays a regular column in the result.

    This replaces ``groupby('Cluster').apply(nlargest)`` followed by
    ``reset_index(drop=True)``, which relies on the deprecated behaviour of
    passing the grouping column into the applied function and otherwise loses
    'Cluster' from the output.
    """
    # Two stable single-key sorts: count descending first, then cluster.
    ranked = counts.sort_values(count_col, ascending=False, kind='stable')
    ranked = ranked.sort_values('Cluster', kind='stable')
    return ranked.groupby('Cluster').head(n).reset_index(drop=True)


# Distribution of sectors across clusters
sector_distribution = all_beta_df.groupby(['Cluster', 'Sector']).size().reset_index(name='sector_count')

# Distribution of industries across clusters
industry_distribution = all_beta_df.groupby(['Cluster', 'Industry']).size().reset_index(name='industry_count')

# Top 3 most frequent sectors per cluster
top_sectors = _top_n_per_cluster(sector_distribution, 'sector_count')

# Top 3 most frequent industries per cluster
top_industries = _top_n_per_cluster(industry_distribution, 'industry_count')

# Print the results
print("Top 3 Most Frequent Sectors per Cluster:\n", top_sectors)
print("\nTop 3 Most Frequent Industries per Cluster:\n", top_industries)
#3.5 - Analysis:Interpret the clusters and discuss potential similarities among stocks in the same cluster.
# The clusters of stocks appear to be organized into tech, auto, and other categories. Because Tesla is the only car company in the data and is a very unique company, it has its own cluster. Apple, Amazon, Microsoft, Meta, and Nvidia form another cluster. These are all tech stocks with similar movement in the stock market. The last cluster appears to contain all of the stocks that did not fit in the other two. If we were to add more stocks and clusters, they would be organized into more detailed categories.
