In [1]:
import sys
import os
import datetime
sys.path.append(os.path.abspath('../../fin_data'))
from utils.postgresql_conn import get_session
from utils.postgresql_tables import *
from sklearn.linear_model import LinearRegression, TheilSenRegressor
from sklearn.preprocessing import StandardScaler
from fin_statement_values_ml import main as extract_financial_data
from utils.date_functions import time_elapsed
from sqlalchemy import desc
from datetime import date
from tqdm import tqdm
import pandas as pd
import numpy as np
import time

[2024-10-06 23:31:30.852868] INFO: Norgate Data: NorgateData package v1.0.74: Init complete
connected to: dbmaster


#### Setup Basics:

In [2]:
# Run configuration read by the cells below.

# Reporting currency passed to the extraction routine and embedded in the result file names.
currency_reporting = 'USD' # or None
# Date on which this notebook is run.
model_date = date.today()
# As-of date of the financial-statement data; also used as the YYYYMMDD stamp in file names.
data_date = pd.to_datetime('2024-10-04')
# True: load the CSV at file_data_path; any falsy value (False/None): run a fresh extraction.
use_ready_data = True # or False / None
# Theil-Sen sample counts to run; one result file is written per entry.
iter_range = [10000]
# Folders are concatenated directly into file names below, so keep the trailing slash.
data_path = '/Users/VadimKovshov/Dropbox/INVESTMENTS/EVALUTE/STOCKS/MODEL_OUTPUTS/FUNDAMENTAL_OVER_UNDER/DATA/'
output_path = '/Users/VadimKovshov/Dropbox/INVESTMENTS/EVALUTE/STOCKS/MODEL_OUTPUTS/FUNDAMENTAL_OVER_UNDER/OUTPUT/'
# CSV read when use_ready_data is truthy.
file_data_path = f'{data_path}aggregated_fin_statements_{data_date.strftime("%Y%m%d")}.csv'

#### Data Extraction: Pick one...

In [3]:
if use_ready_data:
    # Reuse an earlier extraction stored as CSV at file_data_path
    extracted_data = pd.read_csv(file_data_path)
    print("Done with data extraction...")
    print(len(extracted_data))
    # A bare expression inside an `if` block is never displayed by the
    # notebook, so the preview has to be printed explicitly.
    print(extracted_data.head(3))
else:
    # Extract fresh data instead of reading the saved CSV
    session = get_session()
    try:
        extracted_data = extract_financial_data(date=data_date, exclude_financial_sector=False,
                                                currency_reporting=currency_reporting)
    except Exception as e:
        print(f"Error during data extraction: {e}")
        session.rollback()
        # Re-raise: continuing with extracted_data = None only postpones the
        # failure to an unrelated TypeError on the next line and in every
        # later cell that uses extracted_data.
        raise

    print("Done with data extraction...")
    print(f"Tickers extracted: {len(extracted_data['ticker'])}")

Done with data extraction...
1658


In [4]:
# Build the list of accounting variables used as regressors: the line items
# named below that are actually present in the extracted data, kept in the
# column order of the extracted data.
relevant_headers = [
    'cashneq', 'inventory', 'receivables', 'ppnenet', 'assets',
    'payables', 'debtc', 'debtnc', 'debt', 'equity', 'retearn',
    'revenue', 'cor', 'gp', 'sgna', 'rnd', 'opinc', 'intexp',
    'taxexp', 'netinccmn', 'epsdil', 'dps', 'depamor', 'ncfo',
    'capex', 'ncfi', 'ncff', 'fcf',
]

headers = extracted_data.columns
headers_list = headers.tolist()

accounting_variables = [column for column in headers_list if column in relevant_headers]
print("Accounting Variables:", accounting_variables)

Accounting Variables: ['cashneq', 'inventory', 'receivables', 'ppnenet', 'assets', 'payables', 'debtc', 'debtnc', 'debt', 'equity', 'retearn', 'revenue', 'cor', 'gp', 'sgna', 'rnd', 'opinc', 'intexp', 'taxexp', 'netinccmn', 'epsdil', 'dps', 'depamor', 'ncfo', 'capex', 'ncfi', 'ncff', 'fcf']


In [5]:
### functions
def estimate_fair_value_ols(data, accounting_variables):
    """
    Fit an ordinary-least-squares model of market cap on the accounting variables.

    The regressors are standardised with ``StandardScaler`` and the model is
    fitted and evaluated on the same rows (in-sample predictions).

    Args:
        data: DataFrame containing a 'market_cap' column and every column
            named in ``accounting_variables``. It is modified in place.
        accounting_variables: Names of the regressor columns.

    Returns:
        The same DataFrame with an added 'fair_value_ols' column holding the
        fitted values.
    """
    features = StandardScaler().fit_transform(data[accounting_variables])
    target = data['market_cap']

    regression = LinearRegression()
    regression.fit(features, target)

    data['fair_value_ols'] = regression.predict(features)
    return data

def estimate_fair_value_ts_with_sampling(data, accounting_variables, num_samples=100000, sample_size=100):
    """
    Estimate fair values as the median of many Theil-Sen fits on random subsamples.

    For each of ``num_samples`` iterations, ``sample_size`` rows are drawn with
    replacement, a ``TheilSenRegressor`` is fitted on them, and the fitted model
    predicts market cap for every row. Each row's fair value is the median of
    its predictions across all iterations.

    Args:
        data: DataFrame containing a 'market_cap' column and every column
            named in ``accounting_variables``. It is modified in place.
        accounting_variables: Names of the regressor columns.
        num_samples: Number of subsample fits to run.
        sample_size: Number of rows drawn (with replacement) per fit.

    Returns:
        The same DataFrame with an added 'fair_value_ts' column.
    """
    X_all_scaled = StandardScaler().fit_transform(data[accounting_variables])
    y_all = data['market_cap'].to_numpy()

    num_firms = len(data)

    # One row per fit, one column per firm. A single 2-D array replaces the
    # per-firm Python lists and lets the median be taken in one call.
    predictions = np.empty((num_samples, num_firms), dtype=float)

    for draw in tqdm(range(num_samples), desc="Theil-Sen Sampling"):
        # Draw row *positions*, not index labels: X_all_scaled is a plain
        # ndarray, so labels only happened to work for a 0..n-1 index.
        sample_positions = np.random.choice(num_firms, size=sample_size, replace=True)

        model = TheilSenRegressor()
        model.fit(X_all_scaled[sample_positions], y_all[sample_positions])

        # Predict all firms in one vectorised call instead of one call per firm.
        predictions[draw] = model.predict(X_all_scaled)

    # predictions is local and not needed afterwards, so let median reuse its memory.
    data['fair_value_ts'] = np.median(predictions, axis=0, overwrite_input=True)

    return data

def calculate_mispricing(data, fair_value_column):
    """
    Add absolute and percentage mispricing columns relative to a fair-value column.

    The new column names are derived from the last underscore-separated token
    of ``fair_value_column`` (e.g. 'fair_value_ts' -> 'mispricing_ts' and
    'mispricing_ts_pct').

    Percentage mispricing is ``(market_cap / fair_value - 1) * 100`` where the
    fair value is strictly positive, a fixed 1000 where the fair value is
    negative, and NaN where the fair value is zero or missing (the ratio is
    undefined there).

    Args:
        data: DataFrame containing 'market_cap' and ``fair_value_column``.
            It is modified in place.
        fair_value_column: Name of the column holding the estimated fair value.

    Returns:
        The same DataFrame with the two mispricing columns added.

    Raises:
        KeyError: If ``fair_value_column`` is not a column of ``data``.
    """
    if fair_value_column not in data.columns:
        raise KeyError(f"'{fair_value_column}' does not exist in the DataFrame")

    mispricing_column = f'mispricing_{fair_value_column.split("_")[-1]}'
    mispricing_pct_column = f'{mispricing_column}_pct'

    fair_value = data[fair_value_column]

    # Absolute mispricing: positive means the market values the firm above fair value
    data[mispricing_column] = data['market_cap'] - fair_value

    # Start from NaN so rows with an undefined ratio stay visibly undefined
    data[mispricing_pct_column] = np.nan

    # A negative fair value makes the ratio meaningless; flag it with a 1000% cap
    negative_fair_value_mask = fair_value < 0
    data.loc[negative_fair_value_mask, mispricing_pct_column] = 1000

    # Divide only where the fair value is strictly positive. The mask must test
    # the fair value itself: testing the mispricing let a zero fair value
    # through (division by zero) and skipped rows where market cap equals fair
    # value, leaving them NaN instead of 0%.
    positive_fair_value_mask = fair_value > 0
    data.loc[positive_fair_value_mask, mispricing_pct_column] = (
        (data.loc[positive_fair_value_mask, 'market_cap']
         / data.loc[positive_fair_value_mask, fair_value_column] - 1) * 100
    )

    return data

def assign_mispricing_decile(data, mispricing_pct_column, decile_column='mispricing_ts_decile'):
    """
    Bucket rows into quantile-based deciles of a mispricing-percentage column.

    Deciles are numbered from 1 (lowest values) upward. Duplicate bin edges are
    dropped, so fewer than ten distinct buckets can result when many rows share
    the same value.

    Args:
        data: DataFrame containing ``mispricing_pct_column``. It is modified
            in place.
        mispricing_pct_column: Name of the column to bucket.
        decile_column: Name of the column that receives the decile number.
            Defaults to 'mispricing_ts_decile', the name previously hard-coded,
            so existing callers are unaffected.

    Returns:
        The same DataFrame with ``decile_column`` added.
    """
    # labels=False yields 0-based bin codes; shift so the lowest decile is 1
    data[decile_column] = pd.qcut(data[mispricing_pct_column], 10, labels=False, duplicates='drop') + 1
    return data

def split_data_by_sector(data):
    """
    Partition ``data`` into financial-services rows and all remaining rows.

    A row is treated as financial when its 'sector' value, compared
    case-insensitively, equals 'financial services'. Both returned frames get
    a fresh 0-based index.

    Args:
        data: DataFrame with a string 'sector' column.

    Returns:
        A tuple ``(financial_data, non_financial_data)``.
    """
    is_financial = data['sector'].str.lower() == 'financial services'

    financial_data = data[is_financial].reset_index(drop=True)
    non_financial_data = data[~is_financial].reset_index(drop=True)

    return financial_data, non_financial_data

def main(data=None, acc_variables=None, random_seed=42, samples=1000, sample_size=100):
    """
    Run the OLS and Theil-Sen fair-value models and build a mispricing table.

    Args:
        data: DataFrame with 'compid', 'ticker', 'market_cap', 'sector' and the
            columns named in ``acc_variables``. When None, a random synthetic
            dataset (1700 firms, 28 variables) is generated for testing.
        acc_variables: Names of the regressor columns. Required when ``data``
            is provided; defaults to the synthetic variable names otherwise.
        random_seed: Seed for NumPy's global random state.
        samples: Number of Theil-Sen subsample fits.
        sample_size: Number of rows drawn per Theil-Sen fit.

    Returns:
        DataFrame with compid, ticker, sector, market cap and both fair values
        formatted as comma-separated strings, the OLS and Theil-Sen mispricing
        percentages rounded to two decimals, and 'mispricing_ts_decile',
        sorted by that decile in descending order.

    Raises:
        ValueError: If ``data`` is provided without ``acc_variables`` or
            without a 'sector' column.
    """
    # Seed once, up front, on every path. Previously the seed was applied only
    # when synthetic data was generated or the 'returns' column was missing, so
    # the Theil-Sen sampling was not reproducible for data that already had
    # 'returns'. The two previously seeded paths draw the same numbers as before.
    np.random.seed(random_seed)

    # Check if data is provided or create a sample dataset for testing purposes
    if data is None:
        num_firms = 1700
        num_vars = 28
        sectors = ['Technology', 'Healthcare', 'Financial Services', 'Consumer Cyclical',
                   'Energy', 'Real Estate', 'Utilities', 'Industrials']  # Example sectors

        # Create a sample dataset with sector information
        data = pd.DataFrame({
            'compid': np.arange(num_firms),
            'ticker': [f'TKR{i}' for i in range(num_firms)],
            'market_cap': np.random.rand(num_firms) * 1000,
            'returns': np.random.randn(num_firms),
            'sector': np.random.choice(sectors, num_firms)  # Assign random sectors
        })

        for i in range(1, num_vars + 1):
            data[f'accounting_var{i}'] = np.random.rand(num_firms) * 100

        if acc_variables is None:
            acc_variables = [f'accounting_var{i}' for i in range(1, num_vars + 1)]

    else:
        if acc_variables is None:
            raise ValueError("acc_variables must be provided if data is provided")
        # Work on a copy so the caller's frame is never modified
        data = data.copy()

        if 'returns' not in data.columns:
            # Simulate returns column if it doesn't exist. The column is not
            # used below, but the draw is kept so the random stream (and so
            # the sampling results) match earlier runs.
            data['returns'] = np.random.randn(data.shape[0])

        if 'sector' not in data.columns:
            raise ValueError("Sector information is missing from the provided data")

    # Estimate fair values using OLS
    ols_data = estimate_fair_value_ols(data.copy(), accounting_variables=acc_variables)

    # Estimate fair values using Theil-Sen with sampling and time the process
    start_time = time.time()
    num_samples = samples
    ts_data = estimate_fair_value_ts_with_sampling(data.copy(), accounting_variables=acc_variables,
                                                   num_samples=num_samples, sample_size=sample_size)
    end_time = time.time()

    # Extrapolate the measured per-sample cost to a 100,000-sample run.
    # Skipped for zero samples, where the per-sample cost is undefined.
    if num_samples > 0:
        elapsed_time = end_time - start_time
        estimated_time_100000 = (elapsed_time / num_samples) * 100000
        print(f"Estimated time for 100,000 samples: {estimated_time_100000 / 60:.2f} minutes")

    # Calculate mispricing for OLS and TS
    ts_data = calculate_mispricing(ts_data, 'fair_value_ts')
    ols_data = calculate_mispricing(ols_data, 'fair_value_ols')

    # Combine OLS and TS data into a single DataFrame, including sector info
    combined_data = data[['compid', 'ticker', 'market_cap', 'sector']].copy()

    # Ensure 'market_cap', 'fair_value_ols', 'fair_value_ts' columns are numeric before applying formatting
    combined_data['market_cap'] = pd.to_numeric(combined_data['market_cap'], errors='coerce').fillna(0)
    ols_data['fair_value_ols'] = pd.to_numeric(ols_data['fair_value_ols'], errors='coerce').fillna(0)
    ts_data['fair_value_ts'] = pd.to_numeric(ts_data['fair_value_ts'], errors='coerce').fillna(0)

    # Format values as strings with commas (these three columns are text from here on)
    combined_data['market_cap'] = combined_data['market_cap'].apply(lambda x: f"{x:,.0f}")
    combined_data['fair_value_ols'] = ols_data['fair_value_ols'].apply(lambda x: f"{x:,.0f}")
    combined_data['fair_value_ts'] = ts_data['fair_value_ts'].apply(lambda x: f"{x:,.0f}")

    # Ensure 'mispricing_ols_pct' and 'mispricing_ts_pct' columns are numeric; undefined values become 0
    ols_data['mispricing_ols_pct'] = pd.to_numeric(ols_data['mispricing_ols_pct'], errors='coerce').fillna(0)
    ts_data['mispricing_ts_pct'] = pd.to_numeric(ts_data['mispricing_ts_pct'], errors='coerce').fillna(0)

    # Now round the numeric columns
    combined_data['mispricing_ols_pct'] = ols_data['mispricing_ols_pct'].round(2)
    combined_data['mispricing_ts_pct'] = ts_data['mispricing_ts_pct'].round(2)

    # Calculate the average mispricing percentage (optional)
    # combined_data['mispricing_avg_pct'] = ((combined_data['mispricing_ols_pct'] + combined_data['mispricing_ts_pct']) / 2).round(2)

    # Assign mispricing deciles based on the ts mispricing percentage
    combined_data = assign_mispricing_decile(combined_data, 'mispricing_ts_pct')

    # Sort the DataFrame by mispricing decile in descending order
    combined_data = combined_data.sort_values(by='mispricing_ts_decile', ascending=False)

    print(combined_data.head())

    return combined_data

In [6]:
# Partition the extracted rows into financial-services firms and all other firms
financial_data, non_financial_data = split_data_by_sector(extracted_data)

In [None]:
# Process financials
# Create the output folder before the long-running sampling starts, so a
# missing directory cannot discard the results when the CSV is written.
os.makedirs(output_path, exist_ok=True)
for n in iter_range:
    # One full model run and one result file per sample count in iter_range
    combined_data = main(data=financial_data, acc_variables=accounting_variables, random_seed=42, samples=n, sample_size=20)
    result_file_path = f'{output_path}ols_ts_mispricing_{data_date.strftime("%Y%m%d")}_{n}_financials_{currency_reporting}.csv'
    combined_data.to_csv(result_file_path, index=False)
    print(f"Saved result to {result_file_path}")

In [8]:
# Process non-financials
# Create the output folder before the long-running sampling starts, so a
# missing directory cannot discard the results when the CSV is written.
os.makedirs(output_path, exist_ok=True)
for n in iter_range:
    # One full model run and one result file per sample count in iter_range
    combined_data = main(data=non_financial_data, acc_variables=accounting_variables, random_seed=42, samples=n, sample_size=100)
    result_file_path = f'{output_path}ols_ts_mispricing_{data_date.strftime("%Y%m%d")}_{n}_non_financials_{currency_reporting}.csv'
    combined_data.to_csv(result_file_path, index=False)
    print(f"Saved result to {result_file_path}")

Theil-Sen Sampling: 100%|██████████| 10000/10000 [3:45:36<00:00,  1.35s/it] 


Estimated time for 100,000 samples: 2256.39 minutes
       compid ticker market_cap             sector fair_value_ols   
611      7139    HHH      3,844        Real Estate            825  \
1047      846   ELAN      7,247         Healthcare        -20,237   
162     14033    NVR     29,283  Consumer Cyclical         48,682   
164   2638936    BHP    153,591    Basic Materials         32,862   
1328     7492    VRT     39,466        Industrials         35,994   

     fair_value_ts  mispricing_ols_pct  mispricing_ts_pct   
611         -2,150              366.12            1000.00  \
1047         3,294             1000.00             120.01   
162        -48,354              -39.85            1000.00   
164         26,863              367.39             471.76   
1328        13,918                9.64             183.56   

      mispricing_ts_decile  
611                     10  
1047                    10  
162                     10  
164                     10  
1328                 

In [9]:
# combined_data.to_clipboard()