In [None]:
# Install required libraries (only needed if they aren't already installed)
!pip install yfinance
!pip install seaborn
!pip install scikit-learn

# Import libraries
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import datetime
from scipy.stats import skew, kurtosis
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Optional: Configure matplotlib for Jupyter Notebook
%matplotlib inline

In [None]:
# Universe of stocks to analyse, drawn from several different sectors
tickers = [
    'AAPL',  # Apple Inc.
    'MSFT',  # Microsoft Corporation
    'AMZN',  # Amazon.com, Inc.
    'TSLA',  # Tesla, Inc.
    'BAC',   # Bank of America Corporation
    'LYV',   # Live Nation Entertainment, Inc.
    'PFE',   # Pfizer Inc.
    'ASB',   # Associated Banc-Corp
    'XOM',   # Exxon Mobil Corporation
    'NVDA',  # NVIDIA Corporation
    'META',  # Meta Platforms, Inc. (formerly Facebook)
]

# Analysis window: the three years ending at the moment this cell is run
end_date = datetime.now()
start_date = end_date - pd.DateOffset(years=3)

In [None]:
# Download stock data. The raw download (one column level per price field)
# is kept under its own name so `data` always refers to the per-ticker
# price table used by the rest of the notebook.
raw_data = yf.download(tickers, start=start_date, end=end_date)

# Check if 'Adj Close' is available
price_fields = raw_data.columns.get_level_values(0)
if 'Adj Close' in price_fields:
    # Use Adjusted Close prices
    data = raw_data['Adj Close']
elif 'Close' in price_fields:
    # Use Close prices
    print("'Adj Close' not available. Using 'Close' prices instead.")
    data = raw_data['Close']
else:
    raise KeyError("Neither 'Adj Close' nor 'Close' found in the downloaded data.")

# A ticker with no prices at all would make the later `dropna()` on the
# returns discard every row, so fail here with a clear message instead of
# letting NaNs propagate into the statistics and clustering.
failed_tickers = data.columns[data.isna().all()].tolist()
if failed_tickers:
    raise ValueError(f"No price data was downloaded for: {failed_tickers}")

# Display the first few rows to verify
print("Price Data:")
print(data.head())

In [None]:
# Calculate daily returns for each stock.
# fill_method=None is passed explicitly: relying on the implicit forward-fill
# default is deprecated in pandas and would turn a missing price into a
# fabricated 0% return. Rows with any missing return (including the first
# row, which has no previous price) are dropped.
returns = data.pct_change(fill_method=None).dropna()

# Display the first few rows of returns
print("Daily Returns:")
print(returns.head())

In [None]:
# Select the return columns in the same order as `tickers` so the rows of the
# summary table follow the ticker list (a missing ticker raises KeyError).
ticker_returns = returns[tickers]
return_values = ticker_returns.to_numpy()

# Build the summary table in one vectorized pass instead of filling an
# object-dtype frame cell by cell and converting it to numeric afterwards.
# skew/kurtosis are scipy's functions with their default settings, applied
# column-wise (axis=0), i.e. one value per ticker.
stats_df = pd.DataFrame(
    {
        'Mean': ticker_returns.mean(),
        'StdDev': ticker_returns.std(),
        'Skewness': skew(return_values, axis=0),
        'Kurtosis': kurtosis(return_values, axis=0),
    },
    index=tickers,
)

# Display the summary statistics
print("Summary Statistics:")
print(stats_df)

In [None]:
# Columns of the summary table that are used as clustering inputs
features = ['Mean', 'StdDev', 'Skewness', 'Kurtosis']
X = stats_df[features]

# Standardize the features: learn the scaling from X, then apply it to X
scaler = StandardScaler()
scaler.fit(X)
scaled_features = scaler.transform(X)

# Wrap the scaled array in a DataFrame that keeps the ticker index and
# the feature names
scaled_stats_df = pd.DataFrame(
    data=scaled_features,
    columns=features,
    index=stats_df.index,
)

# Show the scaled feature table
print("Scaled Features:")
print(scaled_stats_df)

In [None]:
# Define the number of clusters
num_clusters = 3

# Apply K-Means clustering.
# n_init is set explicitly: the library default changed between scikit-learn
# releases, so leaving it implicit gives version-dependent clusters (and a
# FutureWarning on some versions). random_state fixes the initialization seed.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(scaled_features)

# Assign cluster labels to the stocks (labels_ is in the same row order as
# scaled_features, which was built from stats_df)
stats_df['Cluster'] = kmeans.labels_

# Display the clusters
print("Cluster Assignments:")
print(stats_df['Cluster'])


In [None]:
# Assign colors to clusters.
# NOTE: this mapping only covers cluster labels 0-2; extend it if
# num_clusters is raised above 3.
cluster_colors = {0: 'red', 1: 'green', 2: 'blue'}
stats_df['Color'] = stats_df['Cluster'].map(cluster_colors)

# Create a scatter plot (Mean Return vs. Standard Deviation)
fig, ax = plt.subplots(figsize=(10, 7))

# One scatter call per cluster so each cluster gets its own legend entry
for cluster in range(num_clusters):
    clustered_data = stats_df[stats_df['Cluster'] == cluster]
    ax.scatter(clustered_data['StdDev'], clustered_data['Mean'],
               color=cluster_colors[cluster], label=f'Cluster {cluster}', s=100)

# Annotate each point with the stock ticker.
# Iterate over the values directly: indexing a ticker-labelled Series with an
# integer position (stats_df['StdDev'][i]) is deprecated in pandas and is
# treated as a label lookup in newer versions.
for ticker, std_dev, mean_return in zip(stats_df.index, stats_df['StdDev'], stats_df['Mean']):
    ax.annotate(ticker,
                (std_dev, mean_return),
                textcoords="offset points", xytext=(0, 10), ha='center')

ax.set_xlabel('Standard Deviation (Volatility)')
ax.set_ylabel('Mean Daily Return')
ax.set_title('Clustering of Stocks Based on Return Characteristics')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# List the tickers that ended up in each cluster, one line per cluster
for cluster_id in range(num_clusters):
    in_cluster = stats_df['Cluster'] == cluster_id
    members = stats_df.loc[in_cluster].index.tolist()
    print(f"\nStocks in Cluster {cluster_id}: {members}")

In [None]:
# Pairwise scatter plots of the scaled statistics, colored by cluster label
pairplot_input = scaled_stats_df.join(stats_df['Cluster'])
sns.pairplot(
    pairplot_input,
    hue='Cluster',
    palette=cluster_colors,
)
plt.suptitle(
    'Pairplot of Scaled Summary Statistics Colored by Cluster',
    y=1.02,
)
plt.show()

This grouping makes sense. For instance, NVIDIA and Tesla are grouped together in the green cluster. Both have seen the most rapid growth and the highest volatility of the group.

Copilot was used to assist with code generation.