In [1]:
import yfinance as yf
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
from pandas.io.html import read_html
import re
import requests
import gurobipy as gp
from gurobipy import GRB
from math import *
from tqdm import tqdm
from pandas.core.common import flatten

import nltk
from nltk.corpus import stopwords

In [2]:
base_url = "https://www.investopedia.com/top-stocks-4581225"

content = requests.get(base_url)

urls_raw = re.findall("<li class=\"journey-nav__sublist-item \">\n.*\n.*\n", content.text) 

urls=[]

for url in urls_raw:
    url_clean = re.findall("https.*\"",url)
    url_cleaner = url_clean[0][:-1]
    urls.append(url_cleaner)

In [3]:
urls_m = urls[0:10+1] + urls[18:]
pages = []
try:
    for url in urls_m:
        page = read_html(url)
        pages.append(page)
except Exception as e:
    print(url)
    
# pages

In [4]:
# Check if very webpage follow the value, growth, momentum format.
for page in pages:
    if(len(page) != 3):
        print(len(page))

In [5]:
def get_ticker(stocks_raw):
    tickers = []
    for i in range(len(stocks_raw)):
        ticker_raw = stocks_raw.iloc[i]
        ticker = re.findall("\(.*\)",ticker_raw)[0].replace("(","").replace(")","")
        tickers.append(ticker)
    return(tickers)

def get_info(info_raw):
    
    check_value = ["Value"]
    check_growth = ["Growth", "Growing", "Least Profit Decline"]
    check_momentum = ["Momentum","Performance"]
    
    check = 0
    
    
    
    if ([check  for word in check_value if re.findall(word,info_raw)] != []):
        cat = "Value"
    
    elif (["1" for word in check_growth if re.findall(word,info_raw)] != []):
        cat = "Growth"
    
    elif (["1" for word in check_momentum if re.findall(word,info_raw)] != []):
        cat = "Momentum"
    
    return(cat,info_raw)
    

In [6]:
tickers = []
cats = []
tags = []

for page in pages:
    tables = page
    for table in tables:
        stocks_raw = table.iloc[1:3+1,0]
        info_raw = stocks_raw.name
        ticker = get_ticker(stocks_raw)
        cat, tag = get_info(info_raw)
        
        tickers.append(ticker)
        cats.append(cat)
        tags.append(tag)


In [7]:
df = pd.DataFrame( 
    {"ticker" : tickers,
    "cat" : cats,
    "tag" : tags}
)

df = df.explode("ticker").reset_index().iloc[:,1:]

In [8]:
cat = df.groupby(df["ticker"])['cat'].apply(list)
tag = df.groupby(df["ticker"])['tag'].apply(list)

for i in range(len(tag)):
    val = tag[i]
    val = [word.split(" ") for word in val]
    val = list(flatten(val))
    tag[tag.index[i]] = val

df = pd.concat([cat,tag],axis = 1)
df = df.reset_index()

In [9]:
tickers = []
for ticker in df["ticker"]:
    if(ticker[-2:] == ".A"):
        tickers.append(ticker.replace(".","-"))
    else:
        tickers.append(ticker)
df['ticker'] = tickers

In [10]:
# Can't invest in assets ending with .TO or .V

tickers_tradable = []

for ticker in df['ticker']:
    if(ticker[-2:] != ".V" and ticker[-3:] != ".TO"):
        tickers_tradable.append(ticker)
        
df_tradable = df[df["ticker"].isin(tickers_tradable)]
df_tradable = df_tradable.reset_index().iloc[:,1:]

In [11]:
maindata = yf.download(list(df_tradable["ticker"]),start = "2010-01-01")["Adj Close"]

[*********************100%***********************]  161 of 161 completed


In [12]:
data = maindata.dropna().pct_change().dropna().copy()

In [13]:
def run_model(data,minrisk_return):
    # define variables
    stock_volatility = data.std()
    stock_return = data.mean()
    sigma = data.cov()
    stocks = data.columns
    
    m = gp.Model('portfolio')
    n = len(df_tradable)

    # Upper bound set to 0.1 to ensure a minimum of 10 stocks, to ensure diversification.
    weights = pd.Series(m.addVars(stocks, vtype = "S", lb = 0.001, ub = 0.1, name = "weights"), index=stocks)

    portfolio_risk = sigma.dot(weights).dot(weights)
    portfolio_return = stock_return.dot(weights)
    m.setObjective(portfolio_risk, GRB.MINIMIZE)
    
    m.addConstr(weights.sum() == 1, 'budget')
    # Ensure a sizable stake in value stock for stability
    checklist_cat = [int(("Value" in df_tradable.cat[i])) for i in range(len(df_tradable.cat))]
    m.addConstr(gp.quicksum(weights[i] * checklist_cat[i] for i in range(n)) >= 0.3, "Value")
    # Given the current situation, our group would like to invest substaintiably into Tech, Alterntive Energy and Pharmaceutical    
    checklist_favor = [int(("Tech" in df_tradable.tag[i]) or ("Alternative" in df_tradable.tag[i]) or ("Pharmaceutical" in df_tradable.tag[i])) for i in range(len(df_tradable.tag))]
    m.addConstr(gp.quicksum(weights[i] * checklist_favor[i] for i in range(n)) >= 0.3, "Situational")

    
    m.addConstr(portfolio_return >= minrisk_return, 'target')

    m.setParam('OutputFlag', 0) # don't print the whole Chunk out
    
    try: 
        m.optimize()
        sharpe = portfolio_risk.getValue()/portfolio_return.getValue()
        result = sharpe
        weights_result = []
        for weight in weights:
            if weight.x > 0:
                weights_result.append([weight.varname[8:-1],weight.x])
                
    except: 
        result = 0
        weights_result = 0
    m.reset()
    return([result,weights_result])

In [14]:
def sim(data):
    data_ = data.iloc[np.random.randint(len(data), size=2520)]
    stock_return = data_.mean()
    ret = np.linspace(0,stock_return.max(), 500)
    
    sharpes = []
    weights = []
    
    result = 1
    i=0
    
    while (result > 0):
        val = run_model(data_,ret[i])
        i = i+1
        result = val[0]
        sharpes.append(val[0])
        weights.append(val[1])
    
    
    return([
        max(sharpes),
        weights[sharpes.index(max(sharpes))]
    ])

In [15]:
# %timeit sim(data) returns 9.59 s ± 3.33 s per loop , 6 per min, 10 min can sample 60 times
%timeit sim(data)

Using license file c:\gurobi\gurobi.lic
Academic license - for non-commercial use only
The slowest run took 21.70 times longer than the fastest. This could mean that an intermediate result is being cached.
5.37 s ± 4.3 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
data_test = data.copy()
data_test = data_test["2020-01-01":]

In [17]:
top = np.sum(data_test,axis=1).sort_values(ascending = False).index[0:28]
bot = np.sum(data_test,axis=1).sort_values().index[0:28]

data_top = data_test[data_test.index.isin(top)].mean()
data_bot = data_test[data_test.index.isin(bot)].mean()
data_average = data_test[~data_test.index.isin((list(top)+list(bot)))].mean()


In [18]:
score = []
weights_ = []
for i in tqdm(range(60)):
    best_sharpe, weights = sim(data)
    total_ret = 0
    for ticker, weight in weights:
        # Given the uncertainty of the current climate, Our group decide to apply the naive method, simply giving an equal weightage to all three possible possibilities 
        total_ret = 1/3*(total_ret + data_top[ticker] * weight) + 1/3*(total_ret + data_bot[ticker] * weight) + 1/3*(total_ret + data_average[ticker] * weight)
    score.append(total_ret)
    weights_.append(weights)


100%|██████████| 60/60 [16:33<00:00, 16.56s/it]  


In [19]:
weights_[score.index(max(score))]


[['CSIQ', 0.1],
 ['CVNA', 0.1],
 ['DQ', 0.1],
 ['ENPH', 0.1],
 ['FSLY', 0.08955362782623981],
 ['JKS', 0.1],
 ['QDEL', 0.1],
 ['RUN', 0.1],
 ['SEDG', 0.1],
 ['TWLO', 0.010446372173760335],
 ['VSLR', 0.1]]