In [1]:
import pandas as pd
import plotly as plt
import seaborn as sns
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from IPython.display import display, HTML, Markdown, Latex
from tqdm import tqdm, trange
from typing import *
from dataclasses import dataclass
from scipy import stats
import plotly.io as pio
import sys
sys.path.append('..')
from analysis.code import project_functions1 as pf

In [2]:
# Global plotting style.
# seaborn's set_theme re-applies context, style and palette on every call, so
# stacked calls simply overwrite one another (and `sns.set` is only an alias of
# `set_theme`). One call expresses the state that actually takes effect:
# "ticks" style, "talk" context, and 300-dpi on-screen / saved figures.
sns.set_theme(
    style="ticks",
    context="talk",
    rc={"figure.dpi": 300, "savefig.dpi": 300},
)
# Applied after seaborn so the dark matplotlib style sheet wins for any
# rcParams that both of them set.
plt.style.use("dark_background")

# Plotly figure configuration: options for the exported image (PNG,
# 2000x800 at 2x scale, default file name "custom_image").
# NOTE(review): presumably meant to be passed as `config=config` to a Plotly
# `show()` call; it is not referenced in the cells visible here - confirm.
config = {
  'toImageButtonOptions': {
    'format': 'png',
    'filename': 'custom_image',
    'height': 800,
    'width': 2000,
    'scale': 2
  }
}

In [3]:
# Load every equity data set through the project's EquityData helper.
# `exclude_columns` lists the columns each call is asked to leave out; the
# income statement, balance sheet and margins sets are loaded without exclusions.
equities = pf.EquityData()

overview_df = equities.load_and_process(
    "overview",
    exclude_columns=[
        'Change %',
        'Change',
        'Technical Rating',
        'Volume',
        'Volume*Price',
    ],
)
income_statement_df = equities.load_and_process("income_statement")
balance_sheet_df = equities.load_and_process("balance_sheet")
dividends_df = equities.load_and_process(
    "dividends",
    exclude_columns=['Price'],
)
margins_df = equities.load_and_process("margins")
performance_df = equities.load_and_process(
    "performance",
    exclude_columns=[
        'Change 1m, %',
        'Change 5m, %',
        'Change 15m, %',
        'Change 1h, %',
        'Change 4h, %',
        'Change 1W, %',
        'Change 1M, %',
        'Change %',
    ],
)
valuation_df = equities.load_and_process(
    "valuation",
    exclude_columns=[
        'Price',
        'Market Capitalization',
        'Price to Earnings Ratio (TTM)',
        'Basic EPS (TTM)',
        'EPS Diluted (FY)',
    ],
)

In [4]:
# Keep each frame next to its display label in ONE mapping so the two parallel
# lists below cannot drift out of order. Later cells iterate
# `zip(dfs, dfs_names)` to title their plots, so a label must sit at the same
# position as the frame it describes (dicts preserve insertion order).
labelled_frames = {
    "Overview Data": overview_df,
    "Income Statement Data": income_statement_df,
    "Balance Sheet Data": balance_sheet_df,
    "Dividends Data": dividends_df,
    "Margins Data": margins_df,
    "Performance Data": performance_df,
    "Valuation Data": valuation_df,
}
dfs = list(labelled_frames.values())
dfs_names = list(labelled_frames.keys())

# Attach the 3-month performance column to every other frame.
# Assignment aligns on the index, and the source frames are modified in place.
for frame in dfs:
    if frame is not performance_df:
        frame['3-Month Performance'] = performance_df['3-Month Performance']

# Combine all frames column-wise, keep only the first occurrence of each
# repeated column name, then drop every row that has any missing value.
mega_df = pd.concat(dfs, axis=1)
mega_df = mega_df.loc[:,~mega_df.columns.duplicated()].copy()
mega_df = mega_df.dropna()
# Snapshot of the non-object (non-string) columns; taken before the
# assignments below.
mega_df_no_strings = mega_df.select_dtypes(exclude='object')

# (Re)assign the longer-horizon performance columns from the performance frame;
# values are aligned to the rows that survived the dropna above.
mega_df['6-Month Performance'] = performance_df['6-Month Performance']
mega_df['YTD Performance'] = performance_df['YTD Performance']
mega_df['Yearly Performance'] = performance_df['Yearly Performance']

In [5]:
# Helper objects from the project module `pf`, reused by the cells below:
# `quant` for the linear-regression calls and `viz` for the plotting calls.
quant = pf.QuantitativeAnalysis()
viz = pf.DataVisualization()

In [6]:
# Sanity check: inspect the type returned by the coefficient-of-determination
# helper for the 3-month target (the recorded output shows numpy.float64).
type(quant.lin_reg_coef_determination(mega_df, '3-Month Performance'))

numpy.float64

In [7]:
# Candidate predictors: every non-string column except the price, the
# performance columns and the beta column listed below.
all_columns = list(mega_df_no_strings.columns)
for column in (
    'Price',
    '3-Month Performance',
    '6-Month Performance',
    'YTD Performance',
    'Yearly Performance',
    '1-Year Beta',
):
    # list.remove raises ValueError if the column is absent, so an unexpected
    # schema change is reported here instead of passing silently.
    all_columns.remove(column)

# One bar chart per target column.
y_values = ['3-Month Performance', 'YTD Performance']
for y_value in y_values:
    df = quant.get_lin_reg_coefs(mega_df, x_values=all_columns, y_value=y_value)
    # px.bar reads these two columns from the frame returned above.
    fig = px.bar(
        df,
        x=f'Equity Data Against {y_value}',
        y='Coefficient of Determination',
    )
    fig.update_layout(
        title_text=f'Coefficients of Determination for Equity Data Against {y_value}',
        template='plotly_dark',
    )
    fig.show()

Constructing linear regression models: 100%|██████████| 35/35 [00:00<00:00, 289.96it/s]


Constructing linear regression models: 100%|██████████| 35/35 [00:00<00:00, 212.51it/s]


As anticipated, the coefficients of determination are extremely low and no correlation is evident. Therefore, the next step is to construct a multiple linear regression model that selects only the predictors with the highest correlation in the tests conducted above. The goal of this analysis is to normalize the coefficients of determination and use them as multipliers for the default ranked scores applied to each equity data column.

In [8]:
# One score-density figure per data set, titled with its label.
for df, name in zip(dfs, dfs_names):
    # In-place on purpose: the frames held in `dfs` are the same objects the
    # later cells iterate over, so they see these NaN-free versions too.
    df.dropna(inplace=True)
    density_fig = viz.score_density_plot(df, name)
    density_fig.show()

These plots reveal a number of things. First, they indicate that for certain financial ratios most companies perform either reasonably well or generally poorly, as represented by the width of the distribution plot for each ratio: the narrower the distribution, the more likely it is that most companies perform similarly. Second, most graphs contain outliers in each category—companies that perform exceptionally well compared to others for a certain financial ratio, making them potentially strong picks.

The next step is to investigate if a correlation exists between cases where most companies score low for a certain ratio, and outliers of that segment performing exceptionally well.

In [9]:
# Heatmaps: the combined frame first, then each individual data set, all drawn
# through the same call.
# NOTE(review): the third argument (50) is passed through unchanged; its meaning
# is defined by pf.DataVisualization.heatmap_plot - confirm there.
heatmap_inputs = [
    (mega_df, f'Complete Equity Data ({len(mega_df.columns)} Data Points)'),
    *zip(dfs, dfs_names),
]
for df, names in heatmap_inputs:
    viz.heatmap_plot(df, names, 50).show()

These heat plots reveal that for certain categories of data, particularly valuation, income statement and balance sheet data, the top companies by market capitalization tend to have the highest scores in those categories. Although this may indicate that the companies with the highest aggregated normalized scores are the best picks for an investment portfolio, this assumption must be validated against their past 3-month performance to see whether these scores did indeed coincide with a positive change in the price of an equity—indicating a positive return on investment. A multiple linear regression can be used for this, but first, a 3D plot can be used to examine how two of the most important pieces of equity data for stock picking relate to the corresponding change in the price of those stocks.

In [10]:
# 3D scatter of two score columns against the 3-month performance score.
scatter_axes = (
    'Price to Earnings Ratio (TTM) Score',
    'Free Cash Flow Margin (FY) Score',
    '3-Month Performance Score',
)
scatter_fig = viz.scatter_3d(mega_df, *scatter_axes)
scatter_fig.show()

There appears to be a reasonably strong correlation between the normalized Free Cash Flow Margin (FY) score and the corresponding 3-Month Performance score, indicating that a high Free Cash Flow Margin (FY) score may be a good metric to consider when picking a stock. The same applies to the correlation between the Price to Earnings Ratio (TTM) score and the 3-Month Performance score. Plotting a regression line can be used to validate this hypothesis.