In [176]:
# In this section we adopt K-Means Clustering to get similar stocks for recommendation

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans

# First load the dataset on stock basics
dataset = pd.read_csv('./data/stock_basics.csv')
# The first CSV column is treated as the stock identifier; all remaining
# columns are treated as features.
stocks = dataset.iloc[:,0].values
print("Below is stock list in dataset:")
print(stocks)
print("-------------------------------------------------------------")

# Take the feature columns directly (no ndarray round-trip) and give them
# the positional names used by the rest of the notebook. This assignment
# raises if the CSV does not have exactly six feature columns.
features = dataset.iloc[:,1:].copy()
features.columns = ["Price", "Volume", "Market Cap", "Beta", "PE Ratio", "EPS"]
cols = features.columns
# Anything that cannot be parsed as a number becomes NaN and is imputed below.
features[cols] = features[cols].apply(pd.to_numeric, errors='coerce')
print("Below is the data of stock features: ")
print(features)

# Second we eliminate null values in the dataset
# Fill each column with its exact mean. The previous int(mean) truncated the
# fill value toward zero (a mean of 1.2 became 1, a mean of 0.8 became 0) and
# raised ValueError on a column that was entirely NaN.
# NOTE: a column that is entirely NaN has no mean and therefore stays NaN.
features = features.fillna(features.mean())

Below is stock list in dataset:
['a' 'aa' 'aaba' ... 'zts' 'zuo' 'zyne']
-------------------------------------------------------------
Below is the data of stock features: 
       Price      Volume  Market Cap  Beta  PE Ratio     EPS
0      81.47    783350.0   25938.000  1.27     23.26   3.510
1      29.55   2375490.0    5502.000  1.38     24.72   1.200
2      74.01  16611754.0   42253.000  1.50       NaN -15.900
3      34.06   6219761.0   15214.000  1.84     11.18   3.030
4      12.72    639136.0     248.532  2.43       NaN  -0.110
5     176.78    830258.0   12655.000  1.29     30.99   5.730
6     197.00  25881697.0  943528.000  0.91     16.51  12.120
7      57.73    791443.0    3549.000  0.79    120.64   0.500
8      29.49     88499.0    2834.000  1.06     11.75   2.500
9      19.58   1773966.0   42287.000  1.03     19.38   1.020
10     83.45   3499353.0  124136.000  0.99     22.95   3.660
11     77.27   1717291.0   16490.000  1.53     14.33   5.450
12      7.65    279782.0     363.9

In [194]:
def gen_clusters(features, columns, clusterNum, clusterPrint, stock_names=None):
    """Group stocks into K-Means clusters computed on the selected columns.

    Parameters
    ----------
    features : pandas.DataFrame
        One row per stock. Only the columns named in ``columns`` are used.
    columns : list
        Column labels of ``features`` to cluster on.
    clusterNum : int
        Number of clusters requested from KMeans (``n_clusters``).
    clusterPrint : int
        When equal to 1, the members of every cluster are printed.
    stock_names : sequence, optional
        Label of each row of ``features``, in row order. Defaults to the
        module-level ``stocks`` array, which is what the function always
        used before this parameter existed.

    Returns
    -------
    list of list
        ``clusters[k]`` holds the labels of the rows assigned to cluster ``k``.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of feature rows.
    """
    if stock_names is None:
        stock_names = stocks

    # Labels are matched to rows purely by position, so a length mismatch
    # would either mislabel clusters or fail half-way through. Check before
    # paying for the fit.
    if len(stock_names) != len(features):
        raise ValueError(
            "stock_names has " + str(len(stock_names)) + " entries but features has "
            + str(len(features)) + " rows; they must be aligned row-for-row"
        )

    # random_state is fixed so repeated runs yield the same assignment.
    kmeans = KMeans(n_clusters=clusterNum,init='k-means++',max_iter=300,n_init=10,random_state=0)
    clusterIds = kmeans.fit_predict(features[columns])

    # One bucket per cluster id, then drop each stock into its bucket.
    clusters = [[] for _ in range(clusterNum)]
    for stock, clusterId in zip(stock_names, clusterIds):
        clusters[clusterId].append(stock)

    # Print out cluster (numbered from 1 for display)
    if clusterPrint == 1:
        print("Here are generated clusters:\n")
        for number, members in enumerate(clusters, start=1):
            # str() keeps the join working if a label is not a string.
            print("cluster-" + str(number) + ": " + ",".join(map(str, members)))
    return clusters

In [196]:
# Now we start to recommend stocks
# First input a stock symbol (e.g. "googl" for Google, or "amzn" as used below) and the stock features you care about; the system will return a list of stocks in the same cluster
# You are free to change the stock symbol and the features of interest to test other inputs
# 
# Available feature ID and name pairs are listed below:
# 1: "Price", 
# 2: "Volume", 
# 3: "Market Cap", 
# 4: "Beta", 
# 5: "PE Ratio"
# 6: "EPS"

# Here the user wants recommendations based on price and volume for Amazon Inc.
stock_input = "amzn"
fids_concerned = [1, 2]

# The cluster count is set to 100; the original note says this targets about
# 15 stocks per cluster on average (this depends on the dataset size).
clusterNum = 100
clusterPrint = 0
# Feature IDs are 1-based, column positions are 0-based.
columns = [features.columns[fid - 1] for fid in fids_concerned]
clusters = gen_clusters(features, columns, clusterNum, clusterPrint)

print("Stock you have input: " + stock_input)
print("Features you are concerned about: " + ', '.join(columns))
print("---------------------------------------------------")

# Find the first cluster that contains the requested stock and has more than
# one member; None means there is nothing to recommend.
matched_cluster = next(
    (members for members in clusters if stock_input in members and len(members) > 1),
    None,
)
isfound = 0 if matched_cluster is None else 1
count = 0
if isfound:
    print("Here are a list of stocks you may be interested in:\n")
    peers = (name for name in matched_cluster if name != stock_input)
    for count, peer in enumerate(peers, start=1):
        print("Stock-" + str(count) + ": " + peer)
        print("=====================================")
else:
    print("Sorry, we can not make any recommendation based on your input")

Stock you have input: amzn
Features you are concerned about: Price, Volume
---------------------------------------------------
Here are a list of stocks you may be interested in:

Stock-1: agnc
Stock-2: amrn
Stock-3: bidu
Stock-4: cbs
Stock-5: disca
