In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Retrieve the data of S&P 500 stocks
sp500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]
tickers = sp500.Symbol.tolist()

# Initialize an empty DataFrame to store the recommendation scores for each stock
recommendation_scores = pd.DataFrame(columns=['Ticker', 'Score'])

# Loop through each stock ticker
for ticker in tickers:
    try:
        # Retrieve the historical stock data for the past 1 year
        stock_data = yf.download(ticker, period='1y')
        
        # Calculate the average daily return for the past year
        avg_return = stock_data['Adj Close'].pct_change().mean()
        
        # Calculate the average daily trading volume for the past year
        avg_volume = stock_data['Volume'].mean()
        
        # Calculate the recommendation score as the product of the average return and volume
        recommendation_score = avg_return * avg_volume
        
        # Append the recommendation score for this stock to the DataFrame
        recommendation_scores = recommendation_scores.append({'Ticker': ticker, 'Score': recommendation_score}, ignore_index=True)
    except:
        # If there is an error retrieving data for this stock, skip it
        pass

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [6]:
data = recommendation_scores.copy()
data.dropna(inplace=True)
data.set_index('Ticker',inplace=True)

In [15]:
data.to_csv('/content/data.csv')

In [10]:
# Feature selection
corr_matrix = data.corr()
features = ['Score']
selected_features = []
for feature in features:
    similar_features = corr_matrix[feature][corr_matrix[feature] > 0.8].index.tolist()
    if feature in similar_features:
        similar_features.remove(feature)
    if similar_features not in selected_features:
        selected_features.append([feature] + similar_features)


# Cluster analysis
X = data[selected_features[0]]
kmeans = KMeans(n_clusters=10, random_state=42).fit(X)
cluster_labels = kmeans.labels_
clustered_data = pd.concat([data[selected_features[0]].reset_index(), pd.Series(cluster_labels, name='Cluster')], axis=1)

# Stock ranking
clustered_data['Rank'] = clustered_data.groupby('Cluster')['Score'].rank(method='min', ascending=True)
top10 = clustered_data[clustered_data['Rank'] <= 10].sort_values(['Cluster', 'Rank'])
# print(top10)

In [17]:
top10.head(10)

Unnamed: 0,Ticker,Score,Cluster,Rank
280,KR,-3115.722973,0,1.0
477,WBA,-3113.107896,0,2.0
96,CNC,-3091.844297,0,3.0
261,INVH,-3072.608249,0,4.0
147,DLR,-3041.260673,0,5.0
290,LYV,-3008.366477,0,6.0
90,CTLT,-2981.804426,0,7.0
163,EBAY,-2908.735019,0,8.0
24,MO,-2895.149558,0,9.0
376,PNC,-2874.484366,0,10.0


In [12]:
import pickle
# saving the model
pickle.dump(kmeans, open('/content/model.pkl','wb'))