In [419]:
## Import Dependencies
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import date, timedelta, datetime
import hvplot.pandas
from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [420]:
## Ticker List, start date
tickers = ["BTC-USD", #Bitcoin (deleted - "ETH-USD", "LTC-USD", "BITW", ether, litecoin, top 10 crypto index fund)
           "GLD", "SLV", "CL=F", #Gold, silver, crude
           "VIXY", #VIX Short-Term Futures ETF
           "^IXIC", "^GSPC", "^DJI", #Nasdaq, s&p, dow
           "META", "AMZN", "AAPL", "NFLX", "GOOG", "TSLA", #faangs, telsa
           "JPM", "WFC", "C", "BAC", #Big US Banks JPM, WFC, C, BAC
           "UUP", #usd bull fund
           "IEF" #iShares 7-10 Year Treasury Bond 
           ]
start_date = "2013-12-31"
filter_date = "2014-11-06" #"2014-11-06" - first day BTC tracked on yfinance, "2018-04-16" - first day to include entire sentiment analysis

In [421]:
## Interval lengths, dates
long = 50
med = 15
short = 5
start_date = datetime.strptime(start_date, '%Y-%m-%d').date() - timedelta(days = long)
filter_date = datetime.strptime(filter_date, '%Y-%m-%d').date()

In [422]:
## Quantiles, target return
quantiles = [.01, .05, .1, .5, .9, .95, .99]
model_q = .5
#target_return = 0

In [423]:
## Look up ticker and create csv
for ticker in tickers:
    data = pd.DataFrame(yf.download(ticker, start=start_date))
    filename = ticker + '.csv'
    data.to_csv(filename)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [424]:
## Create comprehensive list of dates
#start = datetime.strptime(start_date, '%Y-%m-%d').date() - timedelta(days = long)
start = start_date
end = date.today() - timedelta(days = 1)
delta = timedelta(days=1)

dates = []
while start <= end:
    dates.append(start.isoformat())
    start += delta

dates = pd.DataFrame(dates)
dates.columns =['Date']
dates['Date'] = pd.to_datetime(dates['Date'])
dates

Unnamed: 0,Date
0,2013-11-11
1,2013-11-12
2,2013-11-13
3,2013-11-14
4,2013-11-15
...,...
3434,2023-04-07
3435,2023-04-08
3436,2023-04-09
3437,2023-04-10


In [425]:
## Create empty pd df for classification analysis
df_class = pd.DataFrame(columns=['Ticker', 'avgvol_last_' + str(long), 'hi_to_lo_last_' + str(long), 'avgvol_last_' + str(med), 'hi_to_lo_last_' + str(med),
                                 'avgvol_last_' + str(short), 'hi_to_lo_last_' + str(short), 'avgvol_last_1', 'hi_to_lo_last_1'])

## Create empty pd df for quantile comparison
df_quant = pd.DataFrame(index=quantiles)

## Create empty pd df for logistic regression comparison
df_logist = pd.DataFrame(columns=['Ticker', 'percentile_' + str(model_q), 'Accuracy', 'TP_rate', 'TN_rate', 'Pos_Predicted_Value', 'Neg_Predicted_Value'])
df_logist_profit = pd.DataFrame(columns=['Ticker', 'target_return', 'Accuracy', 'TP_rate', 'TN_rate', 'Pos_Predicted_Value', 'Neg_Predicted_Value'])

## Loop thru asset files and add metrics
for ticker in tickers:
    ## Upload csv datasets
    filename = ticker + '.csv'
    temp = pd.read_csv(filename)
    
    ## Format date
    temp['Date'] = pd.to_datetime(temp['Date'])
    
    ## Merge dfs to capture all dates including weekends and market holidays
    df = pd.merge(dates, temp, how = 'left', on='Date')
    
    ## Fill in weekends and market holidays using last trading day
    df['Open'].fillna(method='ffill', inplace=True)
    df['High'].fillna(method='ffill', inplace=True)
    df['Low'].fillna(method='ffill', inplace=True)
    df['Close'].fillna(method='ffill', inplace=True)
    df['Adj Close'].fillna(method='ffill', inplace=True)
    df['Volume'].fillna(method='ffill', inplace=True)

    ## Shift metrics to get yesterday's value
    df['hi_shift'] = df['High'].shift(1)
    df['lo_shift'] = df['Low'].shift(1)
    df['adjclose_shift'] = df['Adj Close'].shift(1)
    df['vol_shift'] = df['Volume'].shift(1)
    
    ## Calculate 'Adj Close' for past and future
    df['close_last_long'] = df['Adj Close'].shift(long)
    df['close_last_med'] = df['Adj Close'].shift(med)
    df['close_last_short'] = df['Adj Close'].shift(short)
    df['close_last_1'] = df['Adj Close'].shift(2)
    df['close_next_1'] = df['Adj Close']
    df['close_next_short'] = df['Adj Close'].shift(-short + 1)
    df['close_next_med'] = df['Adj Close'].shift(-med + 1)
    df['close_next_long'] = df['Adj Close'].shift(-long + 1)
    
    ## Calculate short, med, long hi/lo, avg vol, return
    ## Past
    df['avgvol_last_long'] = df['vol_shift'].rolling(long).sum() / long
    df['vol_hi_lo_last_long'] = (df['vol_shift'].rolling(long).max() / df['vol_shift'].rolling(long).min()) - 1
    df['hi_to_lo_last_long'] = (df['adjclose_shift'].rolling(long).max() / df['adjclose_shift'].rolling(long).min()) - 1
    df['return_last_long'] = (df['adjclose_shift'] / df['close_last_long']) - 1
    df['avgvol_last_med'] = df['vol_shift'].rolling(med).sum() / med
    df['vol_hi_lo_last_med'] = (df['vol_shift'].rolling(med).max() / df['vol_shift'].rolling(med).min()) - 1
    df['hi_to_lo_last_med'] = (df['adjclose_shift'].rolling(med).max() / df['adjclose_shift'].rolling(med).min()) - 1
    df['return_last_med'] = (df['adjclose_shift'] / df['close_last_med']) - 1
    df['avgvol_last_short'] = df['vol_shift'].rolling(short).sum() / short
    df['vol_hi_lo_last_short'] = (df['vol_shift'].rolling(short).max() / df['vol_shift'].rolling(short).min()) - 1
    df['hi_to_lo_last_short'] = (df['adjclose_shift'].rolling(short).max() / df['adjclose_shift'].rolling(short).min()) - 1
    df['return_last_short'] = (df['adjclose_shift'] / df['close_last_short']) - 1
    df['hi_to_lo_last_1'] = (df['hi_shift'] / df['lo_shift']) - 1
    df['return_last_1'] = (df['adjclose_shift'] / df['close_last_1']) - 1
    ## Future
    df['return_next_1'] = (df['close_next_1'] / df['adjclose_shift']) - 1
    df['return_next_short'] = (df['close_next_short'] / df['adjclose_shift']) - 1
    df['return_next_med'] = (df['close_next_med'] / df['adjclose_shift']) - 1
    df['return_next_long'] = (df['close_next_long'] / df['adjclose_shift']) - 1
    
    ## Filter out Nulls
    df = df.dropna()

    ## Find last date in dfs
    max_date = df['Date'].max()
    
    ## Drop unused fields
    df = df.drop(columns=['hi_shift', 'lo_shift', 'adjclose_shift',
                          'close_last_long', 'close_last_med', 'close_last_short', 'close_last_1', 'close_next_1', 'close_next_long', 'close_next_med', 'close_next_short'])
    
    ## Rename, add ticker, and rearrange columns
    ## Rename
    df = df.rename(columns={"vol_shift": "avgvol_last_1",
                            "avgvol_last_long": "avgvol_last_" + str(long), 
                            "vol_hi_lo_last_long": "vol_hi_lo_last_" + str(long),
                            "hi_to_lo_last_long": "hi_to_lo_last_" + str(long),
                            "return_last_long": "return_last_" + str(long),
                            "avgvol_last_med": "avgvol_last_" + str(med), 
                            "vol_hi_lo_last_med": "vol_hi_lo_last_" + str(med),
                            "hi_to_lo_last_med": "hi_to_lo_last_" + str(med),
                            "return_last_med": "return_last_" + str(med),
                            "avgvol_last_short": "avgvol_last_" + str(short), 
                            "vol_hi_lo_last_short": "vol_hi_lo_last_" + str(short),
                            "hi_to_lo_last_short": "hi_to_lo_last_" + str(short),
                            "return_last_short": "return_last_" + str(short),
                            "return_next_long": "return_next_" + str(long),
                            "return_next_med": "return_next_" + str(med),
                            "return_next_short": "return_next_" + str(short)})
    ## Add ticker
    df['Ticker'] = ticker

    ## Rearrange
    df = df[['Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 
             'avgvol_last_' + str(long), 'vol_hi_lo_last_' + str(long), 'hi_to_lo_last_' + str(long), 'return_last_' + str(long),
             'avgvol_last_' + str(med), 'vol_hi_lo_last_' + str(med), 'hi_to_lo_last_' + str(med), 'return_last_' + str(med),
             'avgvol_last_' + str(short), 'vol_hi_lo_last_' + str(short), 'hi_to_lo_last_' + str(short), 'return_last_' + str(short),
             'avgvol_last_1', 'hi_to_lo_last_1', 'return_last_1',
             'return_next_1', 'return_next_' + str(short), 'return_next_' + str(med), 'return_next_' + str(long)]]
    
    ## Filter dates
    df = df.loc[df['Date'] >= pd.to_datetime(filter_date)]

    ## Write to individual ticker csv
    df.to_csv(filename)

    ###########################################

    ## df_class
    ## Filter df for max_date
    df_temp = df.loc[df['Date'] == max_date]
    df_temp = df_temp[['Ticker', 'avgvol_last_' + str(long), 'vol_hi_lo_last_' + str(long), 'hi_to_lo_last_' + str(long), 
                                 'avgvol_last_' + str(med), 'vol_hi_lo_last_' + str(med), 'hi_to_lo_last_' + str(med),
                                 'avgvol_last_' + str(short), 'vol_hi_lo_last_' + str(short),  'hi_to_lo_last_' + str(short), 
                                 'avgvol_last_1', 'hi_to_lo_last_1']]

    ## Add row to df_class
    df_class = df_class.append(df_temp, ignore_index=True)

    ###########################################
    
    ## df_quant
    ## Create df for quantiles of each ticker
    df_quant_temp = df[['return_next_' + str(long)]]
    df_quant_temp = df_quant_temp.quantile(quantiles)
    df_quant_temp = df_quant_temp.rename(columns={'return_next_' + str(long): ticker})

    # Merge df with df_blank
    df_quant = pd.merge(df_quant, df_quant_temp, left_index=True, right_index=True)

    ###########################################

    ## df_logist
    ## Get the quantile to use for binary classification
    q_model = df_quant.filter(items=[model_q], axis=0).iloc[0][ticker]

    ## Add 'Outcome' field 
    df['Outcome'] = np.where(df['return_next_' + str(long)] >= q_model, 1, 0)

    ## Establish y and X variables
    y = df['Outcome']
    X = df.drop(columns=['Outcome', 'Date', 'Ticker', 
                         'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
                         'return_next_1', 'return_next_' + str(short), 'return_next_' + str(med)])

    ## Split into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=88, stratify=y)

    ## Create Logistic Regression Model
    classifier = LogisticRegression(solver='lbfgs', max_iter=200, random_state=88)

    ## Fit model on training data
    classifier.fit(X_train, y_train)

    ## Make predictions
    predictions = classifier.predict(X_test)
    
    ## Calculate confusion matrix, accuracy score, TP_rate, TN_rate
    cm = confusion_matrix(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions)
    TP = cm[0][0]
    FN = cm[0][1]
    FP = cm[1][0]
    TN = cm[1][1]
    TP_rate = TP / (TP + FN)
    TN_rate = TN / (TN + FP)
    PPV_rate = TP / (TP + FP)
    NPV_rate = TN / (TN + FN)

    ## Append df_logist with each asset score
    df_logist_temp = {'Ticker':ticker, 'percentile_' + str(model_q): q_model, 'Accuracy':accuracy, 'TP_rate': TP_rate, 'TN_rate': TN_rate, 'Pos_Predicted_Value': PPV_rate, 'Neg_Predicted_Value': NPV_rate}
    df_logist = df_logist.append(df_logist_temp, ignore_index=True)

    ###########################################
    
    # ## df_logist_prof
    # ## Add 'Outcome' field 
    # df['Outcome2'] = np.where(df['return_next_' + str(long)] >= target_return, 1, 0)

    # ## Establish y and X variables
    # y2 = df['Outcome2']
    # X2 = df.drop(columns=['Outcome', 'Outcome2', 'Date', 'Ticker', 
    #                      'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
    #                      'return_next_1', 'return_next_' + str(short), 'return_next_' + str(med)])

    # ## Split into train and test
    # X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, random_state=88, stratify=y2)

    # ## Create Logistic Regression Model
    # classifier2 = LogisticRegression(solver='lbfgs', max_iter=200, random_state=88)

    # ## Fit model on training data
    # classifier2.fit(X_train2, y_train2)

    # ## Make predictions
    # predictions2 = classifier2.predict(X_test2)
    
    # ## Calculate confusion matrix, accuracy score, TP_rate, TN_rate
    # cm2 = confusion_matrix(y_test2, predictions2)
    # accuracy2 = accuracy_score(y_test2, predictions2)
    # TP2 = cm2[0][0]
    # FN2 = cm2[0][1]
    # FP2 = cm2[1][0]
    # TN2 = cm2[1][1]
    # TP_rate2 = TP2 / (TP2 + FN2)
    # TN_rate2 = TN2 / (TN2 + FP2)
    # PPV_rate2 = TP2 / (TP2 + FP2)
    # NPV_rate2 = TN2 / (TN2 + FN2)

    # ## Append df_logist with each asset score
    # df_logist_temp2 = {'Ticker':ticker, 'target_return': target_return, 'Accuracy':accuracy2, 'TP_rate': TP_rate2, 'TN_rate': TN_rate2, 'Pos_Predicted_Value': PPV_rate2, 'Neg_Predicted_Value': NPV_rate2}
    # df_logist_profit = df_logist_profit.append(df_logist_temp2, ignore_index=True)


###########################################

## Write to classification csv
df_class.to_csv('df_class.csv')

## Write to quantile csv
df_quant.to_csv('df_quant.csv')

## Write to logistic csv
df_logist.to_csv('df_logist.csv')

## Write to logistic profit csv
# df_logist_profit.to_csv('df_logist_profit.csv')

In [426]:
#########################################################################################################################

In [427]:
## K-means Classification

## Create new variables for log of average volume
df_class['trans_avgvol_last_1'] = np.log2(df_class['avgvol_last_1'])
df_class['trans_avgvol_last_' + str(short)] = np.log2(df_class['avgvol_last_' + str(short)])
df_class['trans_avgvol_last_' + str(med)] = np.log2(df_class['avgvol_last_' + str(med)])
df_class['trans_avgvol_last_' + str(long)] = np.log2(df_class['avgvol_last_' + str(long)])

## Variable list
class_cols = ['trans_avgvol_last_' + str(long), 'vol_hi_lo_last_' + str(long), 'hi_to_lo_last_' + str(long), 
              'trans_avgvol_last_' + str(med), 'vol_hi_lo_last_' + str(med), 'hi_to_lo_last_' + str(med),
              'trans_avgvol_last_' + str(short), 'vol_hi_lo_last_' + str(short), 'hi_to_lo_last_' + str(short), 
              'trans_avgvol_last_1', 'hi_to_lo_last_1']

## Scale the data
df_class_scaled = StandardScaler().fit_transform(df_class[class_cols])

# Create a DataFrame with the scaled data
df_class_scaled = pd.DataFrame(df_class_scaled, columns= class_cols)

# Copy the tickers names from the original data
df_class_scaled["Ticker"] = df_class["Ticker"]

# Set the Ticker column as index
df_class_scaled = df_class_scaled.set_index("Ticker")

# Initialize the K-Means model with n_clusters
model = KMeans(n_clusters=4, random_state=88)

# Fit the model for the df_stocks_scaled DataFrame
model.fit(df_class_scaled)

# Predict the model segments (clusters)
asset_clusters = model.predict(df_class_scaled)

# Create a new column in the DataFrame with the predicted clusters
df_class_scaled["Assigned_Cluster"] = asset_clusters

# Create a scatter plot with x="AnnualVariance:,  y="AnnualReturn"
df_class_scaled.hvplot.scatter(
    x='trans_avgvol_last_' + str(long),
    y='hi_to_lo_last_' + str(long),
    by="Assigned_Cluster",
    hover_cols = ["Ticker"], 
    title = "Scatter Plot by Asset")

In [428]:
#########################################################################################################################

In [429]:
## Additional features data sources
# Consumer Sentiment Data
# http://www.sca.isr.umich.edu/tables.html

# Gallup polls instituional confidence 
# https://news.gallup.com/poll/1597/confidence-institutions.aspx 

# CPI Data
# https://www.bls.gov/regions/mid-atlantic/data/consumerpriceindexhistorical_us_table.htm

# US GDP
# https://www.macrotrends.net/countries/USA/united-states/gdp-gross-domestic-product


## Unused #############

# Pew polls instituional confidence
# https://www.pewresearch.org/politics/2022/06/06/public-trust-in-government-1958-2022/

# OECD trust in government
# https://data.oecd.org/gga/trust-in-government.htm

In [430]:
## Date / Time
dates2 = dates.copy()
dates2['Day'] = pd.to_datetime(dates2['Date']).dt.dayofweek
dates2['Month'] = pd.to_datetime(dates2['Date']).dt.month
dates2.loc[dates2['Month'] >= 10, 'Quarter'] = 4
dates2.loc[dates2['Month'] <= 9, 'Quarter'] = 3
dates2.loc[dates2['Month'] <= 6, 'Quarter'] = 2
dates2.loc[dates2['Month'] <= 3, 'Quarter'] = 1
dates2

Unnamed: 0,Date,Day,Month,Quarter
0,2013-11-11,0,11,4.0
1,2013-11-12,1,11,4.0
2,2013-11-13,2,11,4.0
3,2013-11-14,3,11,4.0
4,2013-11-15,4,11,4.0
...,...,...,...,...
3434,2023-04-07,4,4,2.0
3435,2023-04-08,5,4,2.0
3436,2023-04-09,6,4,2.0
3437,2023-04-10,0,4,2.0


In [431]:
## Consumer Sentiment Data
# http://www.sca.isr.umich.edu/tables.html

## Load file
csi = pd.read_csv('Resources/tbmics.csv')

## Convert month to month number
d = {'January':1, 'February':2, 'March':3, 'April':4, 'May':5, 'June':6, 'July':7, 'August':8, 'September':9, 'October':10, 'November':11, 'December':12}
csi['Month'] = csi['Month'].map(d)

## Create date field, using first day of the month
csi['Date'] = pd.to_datetime(dict(year=csi['YYYY'], month=csi['Month'], day=1))

## Exclude unnecessary columns
csi = csi[['Date', 'ICS_ALL']]

## Rename columns
csi = csi.rename(columns={"ICS_ALL": "CSI"})

## Merge with date list
csi = pd.merge(dates, csi, how = 'left', on='Date')

## Interpolate in between readings
csi['CSI'].interpolate(inplace=True)

In [432]:
## Gallup Polling Data
# https://news.gallup.com/poll/1597/confidence-institutions.aspx

## File list
polls = ['Resources/gallup_banks.xlsx', 'Resources/gallup_bigbiz.xlsx', 'Resources/gallup_police.xlsx', 'Resources/gallup_pres.xlsx']

## Date List
gallup = dates

for poll in polls:
    ## Load file and take first 30 rows
    gallup_temp = pd.read_excel(poll)
    gallup_temp = gallup_temp.iloc[:30]

    ## Create date field, using first day of the month
    gallup_temp['Date'] = pd.to_datetime(dict(year=gallup_temp['Year'], month=7, day=1))

    ## Exclude unnecessary columns
    gallup_temp = gallup_temp[['Date', 'Great deal/Quite a lot']]

    ## Rename columns
    gallup_temp = gallup_temp.rename(columns={"Great deal/Quite a lot": poll[10:len(poll)-5]})

    ## Merge with date list
    gallup_temp = pd.merge(dates, gallup_temp, how = 'left', on='Date')

    ## Interpolate in between readings
    gallup_temp[poll[10:len(poll)-5]].interpolate(inplace=True)

    ## Merge into gallup df
    gallup = pd.merge(gallup, gallup_temp, how = 'left', on='Date')

In [433]:
## CPI Data
# https://www.bls.gov/regions/mid-atlantic/data/consumerpriceindexhistorical_us_table.htm

## Load file
cpi = pd.read_excel('Resources/cpi.xlsx')

## Convert month to month number
d = {'Jan':1, 'Feb':2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, 'Jul':7, 'Aug':8, 'Sep':9, 'Oct':10, 'Nov':11, 'Dec':12}
cpi['Month'] = cpi['Month'].map(d)

## Create date field, using first day of the month
cpi['Date'] = pd.to_datetime(dict(year=cpi['Year'], month=cpi['Month'], day=1))

## Exclude unnecessary columns
cpi = cpi[['Date', 'CPI']]

## Merge with date list
cpi = pd.merge(dates, cpi, how = 'left', on='Date')

## Interpolate in between readings
cpi['CPI'].interpolate(inplace=True)

In [434]:
## US GDP
# https://www.macrotrends.net/countries/USA/united-states/gdp-gross-domestic-product

## Load file
gdp = pd.read_csv('Resources/united-states-gdp-gross-domestic-product.csv')

## Convert Date to proper date format
gdp['date'] = pd.to_datetime(gdp['date'])

## Exclude unnecessary columns
gdp = gdp[['date', ' Annual % Change']]

## Rename columns
gdp = gdp.rename(columns={"date": 'Date',
                          ' Annual % Change': 'GDP'})

## Merge with date list
gdp = pd.merge(dates, gdp, how = 'left', on='Date')

## Interpolate in between readings
gdp['GDP'].interpolate(inplace=True)

gdp

Unnamed: 0,Date,GDP
0,2013-11-11,
1,2013-11-12,
2,2013-11-13,
3,2013-11-14,
4,2013-11-15,
...,...,...
3434,2023-04-07,5.9455
3435,2023-04-08,5.9455
3436,2023-04-09,5.9455
3437,2023-04-10,5.9455


In [435]:
## Sentiment Analysis

## Load file
vader = pd.read_csv('../JennS/Sentiments/all_crypto_sentiments.csv')

## Convert Date to proper date format
vader['Date'] = pd.to_datetime(vader['begins_at'])

## Rename columns
vader['vader_shift'] = vader['vader_prediction'].shift(1)
vader['sentiment_shift'] = vader['sentiment'].shift(1)

## Calculate rolling values
vader['vader_avg_last_long'] = vader['vader_shift'].rolling(long).sum() / long
vader['vader_avg_last_med'] = vader['vader_shift'].rolling(med).sum() / med
vader['vader_avg_last_short'] = vader['vader_shift'].rolling(short).sum() / short

## Rename columns
vader = vader.rename(columns={'vader_shift': 'vader_avg_last_1',
                              'vader_avg_last_long': 'vader_avg_last_' + str(long),
                              'vader_avg_last_med': 'vader_avg_last_' + str(med),
                              'vader_avg_last_short': 'vader_avg_last_' + str(short)})

## Exclude unnecessary columns
vader = vader[['Date', 'vader_avg_last_' + str(long), 'vader_avg_last_' + str(med), 'vader_avg_last_' + str(short), 'vader_avg_last_1']]

vader

Unnamed: 0,Date,vader_avg_last_50,vader_avg_last_15,vader_avg_last_5,vader_avg_last_1
0,2018-02-25,,,,
1,2018-02-26,,,,0.010650
2,2018-02-27,,,,0.055795
3,2018-02-28,,,,-0.034722
4,2018-03-01,,,,0.065192
...,...,...,...,...,...
1834,2023-03-05,0.068715,0.082024,0.098283,0.058089
1835,2023-03-06,0.068340,0.085956,0.094568,0.093306
1836,2023-03-07,0.067922,0.084390,0.080840,0.061426
1837,2023-03-08,0.067206,0.081315,0.062457,0.049406


In [436]:
## Additional data features df (merge all)
df_addl = pd.merge(dates, dates2, how='left', on='Date')
df_addl = pd.merge(df_addl, csi, how = 'left', on='Date')
df_addl = pd.merge(df_addl, gallup, how='left', on='Date')
df_addl = pd.merge(df_addl, cpi, how='left', on='Date')
df_addl = pd.merge(df_addl, gdp, how='left', on='Date')
#df_addl = pd.merge(df_addl, vader, how='left', on='Date')

## Filter dates
df_addl = df_addl.loc[df_addl['Date'] >= pd.to_datetime(filter_date)]

## Write to additional csv
df_addl.to_csv('df_addl.csv')

In [437]:
#########################################################################################################################

In [438]:
## MODEL 1 - Logistic regression - BTC (WITHOUT additional variables / Recreation of original modeling above, just isolating BTC)

## Read BTC data
df_btc = pd.read_csv('BTC-USD.csv')

## Convert 'Date' to date format
df_btc['Date'] = pd.to_datetime(df_btc['Date'])

## Quantile variable
q_model = df_quant.filter(items=[model_q], axis=0).iloc[0]['BTC-USD']

## Add 'Outcome' field 
df_btc['Outcome'] = np.where(df_btc['return_next_' + str(long)] >= q_model, 1, 0)

## Establish y and X variables
y = df_btc['Outcome']
X = df_btc.drop(columns=['Unnamed: 0', 'Outcome', 'Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
                         'return_next_1', 'return_next_' + str(short), 'return_next_' + str(med)])

## Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=88, 
                                                    stratify=y)

## Create Logistic Regression Model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=88)

## Fit model on training data
classifier.fit(X_train, y_train)

## Make predictions
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

## Calculate accuracy score
accuracy = accuracy_score(y_test, predictions)

## Confusion Matrix
cm = confusion_matrix(y_test, predictions)
TP = cm[0][0]
FN = cm[0][1]
FP = cm[1][0]
TN = cm[1][1]
TP_rate = TP / (TP + FN)
TN_rate = TN / (TN + FP)
PPV_rate = TP / (TP + FP)
NPV_rate = TN / (TN + FN)

print('Confusion Matrix: \n' + str(cm) + '\n\n' + 
      'Accuracy: ' + str(round(accuracy*100,1)) + '%\n' +
      'TP_rate: ' + str(round(TP_rate*100,1)) + '%\n' +
      'TN_rate: ' + str(round(TN_rate*100,1)) + '%\n' +
      'PPV_rate: ' + str(round(PPV_rate*100,1)) + '%\n' +
      'NPV_rate: ' + str(round(NPV_rate*100,1)) + '%\n')

Confusion Matrix: 
[[318  61]
 [286  93]]

Accuracy: 54.2%
TP_rate: 83.9%
TN_rate: 24.5%
PPV_rate: 52.6%
NPV_rate: 60.4%



In [439]:
#########################################################################################################################

In [440]:
## MODEL 2 - Logistic regression - BTC (with additional variables / new start date)
## Filter date
logistic_filter = filter_date # filter_date = '2014-11-06' - first dat BTC tracked on yfinance, '2018-04-16' - first day of sentiment analysis
logistic_filter = pd.to_datetime(logistic_filter)

## Read BTC data
df_btc = pd.read_csv('BTC-USD.csv')

## Convert 'Date' to date format
df_btc['Date'] = pd.to_datetime(df_btc['Date'])

## Merge with additional variables
df_logist_model = pd.merge(df_btc, df_addl, how='left', on='Date')

## Filter dates
df_logist_model = df_logist_model.loc[df_logist_model['Date'] >= pd.to_datetime(logistic_filter)]

## Quantile variable
q_model = df_quant.filter(items=[model_q], axis=0).iloc[0]['BTC-USD']

## Add 'Outcome' field 
df_logist_model['Outcome'] = np.where(df_logist_model['return_next_' + str(long)] >= q_model, 1, 0)

## Establish y and X variables
y = df_logist_model['Outcome']
X = df_logist_model.drop(columns=['Unnamed: 0', 'Outcome', 'Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
                                  'return_next_1', 'return_next_' + str(short), 'return_next_' + str(med)])

## Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=88, 
                                                    stratify=y)

## Create Logistic Regression Model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=88)

## Fit model on training data
classifier.fit(X_train, y_train)

## Make predictions
predictions = classifier.predict(X_test)
results2 = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

## Calculate accuracy score
accuracy2 = accuracy_score(y_test, predictions)

## Confusion Matrix
cm2 = confusion_matrix(y_test, predictions)
TP2 = cm2[0][0]
FN2 = cm2[0][1]
FP2 = cm2[1][0]
TN2 = cm2[1][1]
TP_rate2 = TP2 / (TP2 + FN2)
TN_rate2 = TN2 / (TN2 + FP2)
PPV_rate2 = TP2 / (TP2 + FP2)
NPV_rate2 = TN2 / (TN2 + FN2)

print('Confusion Matrix: \n' + str(cm2) + '\n\n' + 
      'Accuracy: ' + str(round(accuracy2*100,1)) + '%\n' +
      'TP_rate: ' + str(round(TP_rate2*100,1)) + '%\n' +
      'TN_rate: ' + str(round(TN_rate2*100,1)) + '%\n' +
      'PPV_rate: ' + str(round(PPV_rate2*100,1)) + '%\n' +
      'NPV_rate: ' + str(round(NPV_rate2*100,1)) + '%\n')

Confusion Matrix: 
[[318  61]
 [286  93]]

Accuracy: 54.2%
TP_rate: 83.9%
TN_rate: 24.5%
PPV_rate: 52.6%
NPV_rate: 60.4%



In [441]:
#########################################################################################################################

In [442]:
## Compare MODEL 1 vs MODEL 2
df_compare = pd.DataFrame({'Metric': ['Accuracy', 'TP_Rate', 'TN_Rate', 'PPV_Rate', 'NPV_Rate'],
                        'MODEL 1': [round(accuracy*100,1), round(TP_rate*100,1), round(TN_rate*100,1), round(PPV_rate*100,1), round(NPV_rate*100,1)],
                        'MODEL 2': [round(accuracy2*100,1), round(TP_rate2*100,1), round(TN_rate2*100,1), round(PPV_rate2*100,1), round(NPV_rate2*100,1)]})

df_compare.to_csv('df_compare.csv')
df_compare

Unnamed: 0,Metric,MODEL 1,MODEL 2
0,Accuracy,54.2,54.2
1,TP_Rate,83.9,83.9
2,TN_Rate,24.5,24.5
3,PPV_Rate,52.6,52.6
4,NPV_Rate,60.4,60.4


In [443]:
#########################################################################################################################

In [444]:
## MODEL 3 - Nueral Net