In [1226]:
## Import Dependencies
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import date, timedelta, datetime
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [1227]:
## Ticker List, start date
tickers = ["BTC-USD", #Bitcoin (deleted - "ETH-USD", "LTC-USD", "BITW", ether, litecoin, top 10 crypto index fund)
           "GLD", "SLV", "CL=F", #Gold, silver, crude
           "VIXY", #VIX Short-Term Futures ETF
           "^IXIC", "^GSPC", "^DJI", #Nasdaq, s&p, dow
           "META", "AMZN", "AAPL", "NFLX", "GOOG", "TSLA", #faangs, telsa
           "JPM", "WFC", "C", "BAC", #Big US Banks JPM, WFC, C, BAC
           "UUP", #usd bull fund
           "IEF" #iShares 7-10 Year Treasury Bond 
           ]
start_date = "2014-07-01"

In [1228]:
## Interval lengths
long = 50
med = 15
short = 5

In [1229]:
## Quantiles
quantiles = [.01, .05, .1, .5, .9, .95, .99]
model_q = .5

In [1230]:
## Look up ticker and create csv
for ticker in tickers:
    data = pd.DataFrame(yf.download(ticker, start=start_date))
    filename = ticker + '.csv'
    data.to_csv(filename)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [1243]:
## Create comprehensive list of dates
start = datetime.strptime(start_date, '%Y-%m-%d').date() - timedelta(days = long)
end = date.today() - timedelta(days = 1)
delta = timedelta(days=1)

dates = []
while start <= end:
    dates.append(start.isoformat())
    start += delta

dates = pd.DataFrame(dates)
dates.columns =['Date']
dates['Date'] = pd.to_datetime(dates['Date'])
dates

Unnamed: 0,Date
0,2014-05-12
1,2014-05-13
2,2014-05-14
3,2014-05-15
4,2014-05-16
...,...
3251,2023-04-06
3252,2023-04-07
3253,2023-04-08
3254,2023-04-09


In [1232]:
## Create empty pd df for classification analysis
df_class = pd.DataFrame(columns=['Ticker', 'avgvol_last_' + str(long), 'hi_to_lo_last_' + str(long), 'avgvol_last_' + str(med), 'hi_to_lo_last_' + str(med),
                                 'avgvol_last_' + str(short), 'hi_to_lo_last_' + str(short), 'avgvol_last_1', 'hi_to_lo_last_1'])

## Create empty pd df for quantile comparison
df_quant = pd.DataFrame(index=quantiles)

## Create empty pd df for logistic regression comparison
df_logist = pd.DataFrame(columns=['Ticker', 'Accuracy'])

## Loop thru asset files and add metrics
for ticker in tickers:
    ## Upload csv datasets
    filename = ticker + '.csv'
    temp = pd.read_csv(filename)
    
    ## Format date
    temp['Date'] = pd.to_datetime(temp['Date'])
    
    ## Merge dfs to capture all dates including weekends and market holidays
    df = pd.merge(dates, temp, how = 'left', on='Date')
    
    ## Fill in weekends and market holidays using last trading day
    df['Open'].fillna(method='ffill', inplace=True)
    df['High'].fillna(method='ffill', inplace=True)
    df['Low'].fillna(method='ffill', inplace=True)
    df['Close'].fillna(method='ffill', inplace=True)
    df['Adj Close'].fillna(method='ffill', inplace=True)
    df['Volume'].fillna(method='ffill', inplace=True)

    ## Shift metrics to get yesterday's value
    df['hi_shift'] = df['High'].shift(1)
    df['lo_shift'] = df['Low'].shift(1)
    df['adjclose_shift'] = df['Adj Close'].shift(1)
    df['vol_shift'] = df['Volume'].shift(1)
    
    ## Calculate 'Adj Close' for past and future
    df['close_last_long'] = df['Adj Close'].shift(long)
    df['close_last_med'] = df['Adj Close'].shift(med)
    df['close_last_short'] = df['Adj Close'].shift(short)
    df['close_last_1'] = df['Adj Close'].shift(2)
    df['close_next_1'] = df['Adj Close']
    df['close_next_short'] = df['Adj Close'].shift(-short + 1)
    df['close_next_med'] = df['Adj Close'].shift(-med + 1)
    df['close_next_long'] = df['Adj Close'].shift(-long + 1)
    
    ## Calculate short, med, long hi/lo, avg vol, return
    ## Past
    df['avgvol_last_long'] = df['vol_shift'].rolling(long).sum() / long
    
    df['hi_to_lo_last_long'] = (df['adjclose_shift'].rolling(long).max() / df['adjclose_shift'].rolling(long).min()) - 1
    df['return_last_long'] = (df['adjclose_shift'] / df['close_last_long']) - 1
    df['avgvol_last_med'] = df['vol_shift'].rolling(med).sum() / med
    
    df['hi_to_lo_last_med'] = (df['adjclose_shift'].rolling(med).max() / df['adjclose_shift'].rolling(med).min()) - 1
    df['return_last_med'] = (df['adjclose_shift'] / df['close_last_med']) - 1
    df['avgvol_last_short'] = df['vol_shift'].rolling(short).sum() / short
    
    df['hi_to_lo_last_short'] = (df['adjclose_shift'].rolling(short).max() / df['adjclose_shift'].rolling(short).min()) - 1
    df['return_last_short'] = (df['adjclose_shift'] / df['close_last_short']) - 1
    df['hi_to_lo_last_1'] = (df['hi_shift'] / df['lo_shift']) - 1
    df['return_last_1'] = (df['adjclose_shift'] / df['close_last_1']) - 1
    ## Future
    df['return_next_1'] = (df['close_next_1'] / df['adjclose_shift']) - 1
    df['return_next_short'] = (df['close_next_short'] / df['adjclose_shift']) - 1
    df['return_next_med'] = (df['close_next_med'] / df['adjclose_shift']) - 1
    df['return_next_long'] = (df['close_next_long'] / df['adjclose_shift']) - 1
    
    ## Filter out Nulls
    df = df.dropna()

    ## Find last date in dfs
    max_date = df['Date'].max()
    
    ## Drop unused fields
    df = df.drop(columns=['hi_shift', 'lo_shift', 'adjclose_shift',
                          'close_last_long', 'close_last_med', 'close_last_short', 'close_last_1', 'close_next_1', 'close_next_long', 'close_next_med', 'close_next_short'])
    
    ## Rename, add ticker, and rearrange columns
    ## Rename
    df = df.rename(columns={"vol_shift": "avgvol_last_1",
                            "avgvol_last_long": "avgvol_last_" + str(long), 
                            "hi_to_lo_last_long": "hi_to_lo_last_" + str(long),
                            "return_last_long": "return_last_" + str(long),
                            "avgvol_last_med": "avgvol_last_" + str(med), 
                            "hi_to_lo_last_med": "hi_to_lo_last_" + str(med),
                            "return_last_med": "return_last_" + str(med),
                            "avgvol_last_short": "avgvol_last_" + str(short), 
                            "hi_to_lo_last_short": "hi_to_lo_last_" + str(short),
                            "return_last_short": "return_last_" + str(short),
                            "return_next_long": "return_next_" + str(long),
                            "return_next_med": "return_next_" + str(med),
                            "return_next_short": "return_next_" + str(short)})
    ## Add ticker
    df['Ticker'] = ticker

    ## Rearrange
    df = df[['Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 
             'avgvol_last_' + str(long), 'hi_to_lo_last_' + str(long), 'return_last_' + str(long),
             'avgvol_last_' + str(med), 'hi_to_lo_last_' + str(med), 'return_last_' + str(med),
             'avgvol_last_' + str(short), 'hi_to_lo_last_' + str(short), 'return_last_' + str(short),
             'avgvol_last_1', 'hi_to_lo_last_1', 'return_last_1',
             'return_next_1', 'return_next_' + str(short), 'return_next_' + str(med), 'return_next_' + str(long)]]
    
    ## Write to individual ticker csv
    df.to_csv(filename)

    

    ## df_class
    ## Filter df for max_date
    df_temp = df.loc[df['Date'] == max_date]
    df_temp = df_temp[['Ticker', 'avgvol_last_' + str(long), 'hi_to_lo_last_' + str(long), 'avgvol_last_' + str(med), 'hi_to_lo_last_' + str(med),
                       'avgvol_last_' + str(short), 'hi_to_lo_last_' + str(short), 'avgvol_last_1', 'hi_to_lo_last_1']]

    ## Add row to df_class
    df_class = df_class.append(df_temp, ignore_index=True)

    ###########################################
    
    ## df_quant
    ## Create df for quantiles of each ticker
    df_quant_temp = df[['return_next_' + str(long)]]
    df_quant_temp = df_quant_temp.quantile(quantiles)
    df_quant_temp = df_quant_temp.rename(columns={'return_next_' + str(long): ticker})

    # Merge df with df_blank
    df_quant = pd.merge(df_quant, df_quant_temp, left_index=True, right_index=True)

    ###########################################

    ## df_logist
    ## Get the quantile to use for binary classification
    q_model = df_quant.filter(items=[model_q], axis=0).iloc[0][ticker]

    ## Add 'Outcome' field 
    df['Outcome'] = np.where(df['return_next_' + str(long)] >= q_model, 1, 0)

    ## Establish y and X variables
    y = df['Outcome']
    X = df.drop(columns=['Outcome', 'Date', 'Ticker', 
                         'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
                         'return_next_1', 'return_next_' + str(short), 'return_next_' + str(med)])

    ## Split into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

    ## Create Logistic Regression Model
    classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=1)

    ## Fit model on training data
    classifier.fit(X_train, y_train)

    ## Make predictions
    predictions = classifier.predict(X_test)
    
    ## Calculate accuracy score
    accuracy = accuracy_score(y_test, predictions)

    ## Append df_logist with each asset score
    df_logist_temp = {'Ticker':ticker, 'Accuracy':accuracy}
    df_logist = df_logist.append(df_logist_temp, ignore_index=True)



## Write to classification csv
df_class.to_csv('df_class.csv')

## Write to quantile csv
df_quant.to_csv('df_quant.csv')

## Write to logistic csv
df_logist.to_csv('df_logist.csv')

In [1233]:
######################

In [1234]:
## K-means Classification

## Create new variables for log of average volume
df_class['sqrt_avgvol_last_1'] = np.sqrt(df_class['avgvol_last_1'])
df_class['sqrt_avgvol_last_' + str(short)] = np.sqrt(df_class['avgvol_last_' + str(short)])
df_class['sqrt_avgvol_last_' + str(med)] = np.sqrt(df_class['avgvol_last_' + str(med)])
df_class['sqrt_avgvol_last_' + str(long)] = np.sqrt(df_class['avgvol_last_' + str(long)])

## Variable list
class_cols = ['sqrt_avgvol_last_' + str(long), 'hi_to_lo_last_' + str(long), 'sqrt_avgvol_last_' + str(med), 'hi_to_lo_last_' + str(med),
              'sqrt_avgvol_last_' + str(short), 'hi_to_lo_last_' + str(short), 'sqrt_avgvol_last_1', 'hi_to_lo_last_1']

## Scale the data
df_class_scaled = StandardScaler().fit_transform(df_class[class_cols])

# Create a DataFrame with the scaled data
df_class_scaled = pd.DataFrame(df_class_scaled, columns= class_cols)

# Copy the tickers names from the original data
df_class_scaled["Ticker"] = df_class["Ticker"]

# Set the Ticker column as index
df_class_scaled = df_class_scaled.set_index("Ticker")

# Initialize the K-Means model with n_clusters
model = KMeans(n_clusters=4)

# Fit the model for the df_stocks_scaled DataFrame
model.fit(df_class_scaled)

# Predict the model segments (clusters)
asset_clusters = model.predict(df_class_scaled)

# Create a new column in the DataFrame with the predicted clusters
df_class_scaled["Assigned_Cluster"] = asset_clusters

# Create a scatter plot with x="AnnualVariance:,  y="AnnualReturn"
df_class_scaled.hvplot.scatter(
    x='sqrt_avgvol_last_' + str(long),
    y='hi_to_lo_last_' + str(long),
    by="Assigned_Cluster",
    hover_cols = ["Ticker"], 
    title = "Scatter Plot by Asset Segment")

In [1235]:
######################

In [1236]:
## Additional features data sources
# Consumer Sentiment Data
# http://www.sca.isr.umich.edu/tables.html

# Gallup polls instituional confidence 
# https://news.gallup.com/poll/1597/confidence-institutions.aspx 

# Pew polls instituional confidence
# https://www.pewresearch.org/politics/2022/06/06/public-trust-in-government-1958-2022/

# OECD trust in government
# https://data.oecd.org/gga/trust-in-government.htm

In [1237]:
## Consumer Sentiment Data
# http://www.sca.isr.umich.edu/tables.html

## Load file
csi = pd.read_csv('Resources/tbmics.csv')

## Convert month to month number
d = {'January':1, 'February':2, 'March':3, 'April':4, 'May':5, 'June':6, 'July':7, 'August':8, 'September':9, 'October':10, 'November':11, 'December':12}
csi['Month'] = csi['Month'].map(d)

## Create date field, using first day of the month
csi['Date'] = pd.to_datetime(dict(year=csi['YYYY'], month=csi['Month'], day=1))

## Exclude unnecessary columns
csi = csi[['Date', 'ICS_ALL']]

## Rename columns
csi = csi.rename(columns={"ICS_ALL": "CSI"})

## Merge with date list
csi = pd.merge(dates, csi, how = 'left', on='Date')

## Interpolate in between readings
csi['CSI'].interpolate(inplace=True)

In [1238]:
## Gallup Polling Data
# https://news.gallup.com/poll/1597/confidence-institutions.aspx

## File list
polls = ['Resources/gallup_banks.xlsx', 'Resources/gallup_bigbiz.xlsx', 'Resources/gallup_police.xlsx', 'Resources/gallup_pres.xlsx']

## Date List
gallup = dates

for poll in polls:
    ## Load file and take first 30 rows
    gallup_temp = pd.read_excel(poll)
    gallup_temp = gallup_temp.iloc[:30]

    ## Create date field, using first day of the month
    gallup_temp['Date'] = pd.to_datetime(dict(year=gallup_temp['Year'], month=7, day=1))

    ## Exclude unnecessary columns
    gallup_temp = gallup_temp[['Date', 'Great deal/Quite a lot']]

    ## Rename columns
    gallup_temp = gallup_temp.rename(columns={"Great deal/Quite a lot": poll[10:len(poll)-5]})

    ## Merge with date list
    gallup_temp = pd.merge(dates, gallup_temp, how = 'left', on='Date')

    ## Interpolate in between readings
    gallup_temp[poll[10:len(poll)-5]].interpolate(inplace=True)

    ## Merge into gallup df
    gallup = pd.merge(gallup, gallup_temp, how = 'left', on='Date')

In [1239]:
## Additional data features df (merge all)
df_addl = pd.merge(csi, gallup, how='left', on='Date')

## Write to additional csv
df_addl.to_csv('df_addl.csv')

In [1240]:
######################

In [1241]:
###### SCRATCH ###############

In [1242]:
## Logistic regression
df_logist = pd.DataFrame(columns=['Ticker', 'Accuracy'])

## Get the quantile to use for binary classification
q_model = df_quant.filter(items=[model_q], axis=0).iloc[0][ticker]

## Add 'Outcome' field 
df['Outcome'] = np.where(df['return_next_' + str(long)] >= q_model, 1, 0)

## Establish y and X variables
y = df['Outcome']
X = df.drop(columns=['Outcome', 'Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
                     'return_next_1', 'return_next_' + str(short), 'return_next_' + str(med)])

## Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)

## Create Logistic Regression Model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

## Fit model on training data
classifier.fit(X_train, y_train)

## Make predictions
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)


## Calculate accuracy score
accuracy = accuracy_score(y_test, predictions)

df_logist_temp = {'Ticker':ticker, 'Accuracy':accuracy}
df_logist = df_logist.append(df_logist_temp, ignore_index=True)

## Confusion Matrix
confusion_matrix(y_test, predictions)

array([[347,  41],
       [338,  51]], dtype=int64)