# IPO Random Forest Regression
Predicting Excess Returns using pre-IPO Data and Random Forest Regressions

In [256]:
# Library imports
import psycopg2
import os
import pandas as pd
import numpy as np
import requests
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import plotly.graph_objects as go

## Gathering data

In [115]:
# Establish connection to PostgreSQL.
# The connection string is read from the DB_CONNECTION_STRING environment
# variable, so no credentials are stored in the notebook. os.environ.get
# returns None when the variable is unset, and that None is passed to connect().
conn = psycopg2.connect(os.environ.get('DB_CONNECTION_STRING'))

In [116]:
# pre-IPO data
# DISTINCT ON ("companyName") combined with this ORDER BY keeps one row per
# company: the row with the latest "createdAt" (NULL "createdAt" values sort last).
query = 'SELECT DISTINCT ON ("companyName") * FROM ipos ORDER BY "companyName", "createdAt" DESC NULLS LAST;'
ipos = pd.read_sql(query, conn)
# Column names, non-null counts and dtypes of the loaded frame
ipos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188 entries, 0 to 187
Data columns (total 39 columns):
 #   Column                  Non-Null Count  Dtype              
---  ------                  --------------  -----              
 0   id                      188 non-null    int64              
 1   symbol                  188 non-null    object             
 2   companyName             188 non-null    object             
 3   expectedDate            188 non-null    datetime64[ns, UTC]
 4   auditor                 188 non-null    object             
 5   market                  188 non-null    object             
 6   cik                     188 non-null    object             
 7   address                 188 non-null    object             
 8   city                    188 non-null    object             
 9   state                   188 non-null    object             
 10  zip                     188 non-null    object             
 11  phone                   188 non-null    objec

In [117]:
# Load the full companies table, sorted by company name
query = 'SELECT * FROM companies ORDER BY "companyName";'
companies = pd.read_sql(sql=query, con=conn)
# Show the first record to see which fields are available
companies.iloc[0]

id                                                              369
symbol                                                          TXG
companyName                                      10X Genomics, Inc.
exchange                                                     NASDAQ
industry                                              Biotechnology
website                                  http://www.10xgenomics.com
description       10X Genomics, Inc. is a life science technolog...
CEO                                                   Serge Saxonov
securityName                               10x Genomics Inc Class A
issueType                                                        cs
sector                                            Health Technology
primarySicCode                                                 2836
employees                                                       584
address                                   6230 Stoneridge Mall Road
address2                                        

In [118]:
# Company price data
# The filter values are passed as bound parameters instead of being formatted
# into the SQL text: a formatted tuple breaks for a single id ("(369,)") and
# for NumPy scalars whose repr is not a bare number ("np.int64(369)").
date_string = "'2019-07-22'"  # kept in its quoted form; the quotes are stripped before binding
companies_ids = tuple(companies['id'].tolist())  # plain Python ints
query = (
    'SELECT * FROM prices '
    'WHERE "companyId" = ANY(%(company_ids)s) AND "date" >= %(start_date)s '
    'ORDER BY "companyId", "date";'
)
prices = pd.read_sql(
    query,
    conn,
    # psycopg2 adapts a Python list to a PostgreSQL array, which ANY() accepts
    params={'company_ids': list(companies_ids), 'start_date': date_string.strip("'")},
)
prices['date'] = pd.to_datetime(prices['date'])

In [119]:
# Benchmark price data
# The timeout keeps the cell from hanging indefinitely on an unresponsive
# server, and raise_for_status() surfaces HTTP errors here instead of letting
# an error payload flow into the benchmark DataFrame.
r = requests.get(os.environ.get('BENCHMARK_QUERY_STRING'), timeout=30)
r.raise_for_status()
raw_benchmark_data = r.json()

In [120]:
# Build the benchmark frame and order it by date: period_return() requires
# ascending order, and nothing in this notebook guarantees the order in which
# the API returns its rows.
benchmark = pd.DataFrame.from_dict(raw_benchmark_data)
benchmark['date'] = pd.to_datetime(benchmark['date'])
benchmark = benchmark.sort_values('date').reset_index(drop=True)

## Build Labeled Series

In [121]:
# 1. Create 21-trading day return series for SPY (average # of trading days per month)
# 2. Extract return for each IPO and its excess return to SPY over the same period
#    this should be a function that takes a trading day size and benchmark return series
# 3. Combine returns with pre-IPO data
# 4. Remove companies that do not have return data available given trading window size
# 5. Convert categorical data to one-hot or binary encoding
# 6. Finalize sample data (X, y)

In [122]:
# Series variables
price_column = 'close'  # price field the returns are computed from
period = 21  # return window in trading days (average number of trading days per month)
returns_groupby = 'companyId'  # key passed as the groupby argument when computing per-company returns

In [123]:
# Create a period return series
def period_return(df, period, price_column, return_type='discrete', zero_idx=True):
    '''
    Create a period return series from DataFrame (requires ASC order).

    Each returned value compares the price in a row with the price `offset`
    rows earlier, where offset is `period - 1` when `zero_idx` is truthy
    (the starting row counts as the first day of the window) and `period`
    otherwise.

    :param df: DataFrame holding the prices, sorted oldest to newest.
    :param period: Window length in rows.
    :param price_column: Name of the column in `df` to read prices from.
    :param return_type: 'discrete' for (end - start) / start, or 'continuous'
        for log(end) - log(start).
    :param zero_idx: Whether the window includes its starting row.
    :returns: Series carrying the index labels of `df` from position `offset`
        onward; it is empty when `df` has no more than `offset` rows.
    :raises TypeError: If `return_type` is not a supported value.
    :raises ValueError: If the resulting row offset is smaller than 1.
    '''
    types = ('discrete', 'continuous')
    if return_type not in types:
        raise TypeError(f'return_type needs to be of type: {types}')

    offset = period - 1 if zero_idx else period
    if offset < 1:
        # A zero offset makes prices[:-0] empty, and a negative one pairs
        # unrelated rows; neither describes a return window.
        raise ValueError(
            f'period={period} with zero_idx={zero_idx} gives a row offset of '
            f'{offset}; the offset must be at least 1'
        )

    prices = df[price_column]
    end_prices = prices.iloc[offset:]
    # Plain array: rows must be paired by position, not by index label.
    start_prices = prices.iloc[:-offset].to_numpy()

    if return_type == 'discrete':
        return (end_prices - start_prices) / start_prices
    return np.log(end_prices) - np.log(start_prices)

In [124]:
# 1. Benchmark Returns
# period_return() yields no value for the earliest rows (less than a full
# window of history), so the index-aligned assignment leaves them as NaN.
benchmark['returns21d'] = period_return(benchmark, period, price_column)

In [125]:
# 2. IPO returns
def add_returns_groupby(df, groupby, price_column, period, returns_column='returns'):
    '''
    Adds a return column to original DataFrame. Calculations are done according to groupby key.

    The period return is computed separately inside every group so that a
    window never spans two groups. The per-group results are then joined back
    onto `df` by index label; rows without a full window get NaN.

    :param df: DataFrame with the prices (ASC order within each group).
    :param groupby: Column name(s) to group the rows by.
    :param price_column: Name of the price column handed to period_return.
    :param period: Window length handed to period_return.
    :param returns_column: Name of the column that is added.
    :returns: Original DataFrame with new column.
    '''
    group_returns = [
        period_return(group, period, price_column).rename(returns_column)
        for _, group in df.groupby(groupby)
    ]

    if group_returns:
        # pd.concat replaces Series.append, which was removed in pandas 2.0.
        combined = pd.concat(group_returns)
    else:
        # No groups (e.g. empty input): still add the column, with no values.
        combined = pd.Series(dtype='float64', name=returns_column)

    return df.join(combined)

In [126]:
# 2. IPO returns
# Drop any 'returns' column left by a previous run of this cell so it can be
# re-executed; DataFrame.join raises on overlapping column names otherwise.
prices = add_returns_groupby(
    prices.drop(columns='returns', errors='ignore'),
    returns_groupby,
    price_column,
    period,
)

In [127]:
# Excess returns
# Merge explicitly on 'date' (inner join: price rows without a benchmark
# observation for that date are dropped). Columns from an earlier run are
# removed first so re-executing the cell does not change the join keys.
prices = (
    prices
    .drop(columns=['returns21d', 'ex_returns'], errors='ignore')
    .merge(benchmark[['date', 'returns21d']], on='date')
)
prices['ex_returns'] = prices['returns'] - prices['returns21d']

In [128]:
# Match symbol to id
# Both frames have an 'id' column, so the merged frame carries 'id_x' (from
# prices) and 'id_y' (from companies, equal to 'companyId').
prices = prices.merge(companies[['id', 'symbol']], left_on='companyId', right_on='id')
prices

Unnamed: 0,id_x,date,high,low,volume,open,close,uHigh,uLow,uVolume,...,change,changePercent,createdAt,updatedAt,companyId,returns,returns21d,ex_returns,id_y,symbol
0,20936,2019-08-01,13.22,7.66,9314399.0,13.01,8.48,13.22,7.66,9314399.0,...,0.00,0.0000,2020-02-06 23:34:00.006000+00:00,2020-02-06 23:34:00.006000+00:00,332,,0.168240,,332,SNDL
1,20937,2019-08-02,10.48,8.42,2693863.0,8.42,10.45,10.48,8.42,2693863.0,...,1.97,23.2311,2020-02-06 23:34:00.006000+00:00,2020-02-06 23:34:00.006000+00:00,332,,0.182069,,332,SNDL
2,20938,2019-08-05,11.82,10.47,2206717.0,10.69,11.70,11.82,10.47,2206717.0,...,1.25,11.9617,2020-02-06 23:34:00.006000+00:00,2020-02-06 23:34:00.006000+00:00,332,,0.107545,,332,SNDL
3,20939,2019-08-06,13.21,11.99,2180774.0,12.00,13.00,13.21,11.99,2180774.0,...,1.30,11.1111,2020-02-06 23:34:00.006000+00:00,2020-02-06 23:34:00.006000+00:00,332,,0.097477,,332,SNDL
4,20940,2019-08-07,13.22,12.20,1611203.0,13.05,12.85,13.22,12.20,1611203.0,...,-0.15,-1.1538,2020-02-06 23:34:00.006000+00:00,2020-02-06 23:34:00.006000+00:00,332,,0.113691,,332,SNDL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15821,31903,2020-07-06,96.51,73.11,13467931.0,73.39,81.19,96.51,73.11,13467931.0,...,11.78,16.9700,2020-07-07 09:30:18.182000+00:00,2020-07-07 09:30:18.182000+00:00,548,,-0.072555,,548,LMND
15822,32029,2020-07-07,89.38,75.00,4602812.0,83.80,78.79,89.38,75.00,4602812.0,...,-2.40,-2.9600,2020-07-08 09:30:11.720000+00:00,2020-07-08 09:30:11.720000+00:00,548,,-0.099618,,548,LMND
15823,32155,2020-07-08,79.39,68.06,3499175.0,79.00,68.51,79.39,68.06,3499175.0,...,-10.28,-13.0500,2020-07-09 09:30:12.058000+00:00,2020-07-09 09:30:12.058000+00:00,548,,-0.014198,,548,LMND
15824,32282,2020-07-09,79.91,69.03,4178671.0,73.97,77.01,79.91,69.03,4178671.0,...,8.50,12.4100,2020-07-10 09:30:14.207000+00:00,2020-07-10 09:30:14.207000+00:00,548,,0.016505,,548,LMND


In [129]:
# 3/4. Combine the first return observation with pre-IPO data
def combine_return_data(prices, ipos, return_col):
    '''
    Finds a single return to match to the pre-ipo data

    For every symbol in `prices`, the first non-missing value of `return_col`
    (in row order) is taken and merged onto `ipos` as column 'ex_returns'.
    Symbols without any usable return are left out of the result.

    :param prices: DataFrame with a 'symbol' column and the `return_col` column.
    :param ipos: DataFrame with a 'symbol' column holding the pre-IPO data.
    :param return_col: Name of the column in `prices` to read returns from.
    :returns: Inner merge of `ipos` with the per-symbol returns; it has no rows
        (but still has the 'ex_returns' column) when no symbol has a return.
    '''
    first_returns = (
        prices
        .dropna(subset=[return_col])
        .groupby('symbol')[return_col]
        .first()
        # The output column is always named 'ex_returns', whatever return_col is.
        .rename('ex_returns')
        # Always yields both 'symbol' and 'ex_returns' columns, even when empty,
        # so the merge below cannot fail on a missing key.
        .reset_index()
    )
    return ipos.merge(first_returns, on='symbol')

In [130]:
# Drop a stale 'ex_returns' column first so re-running this cell does not
# produce suffixed duplicates ('ex_returns_x' / 'ex_returns_y').
ipos = combine_return_data(prices, ipos.drop(columns='ex_returns', errors='ignore'), 'ex_returns')

In [224]:
# Binary label for classification: 1 when the excess return is positive, else 0
ipos2 = ipos.assign(ex_returns=(ipos['ex_returns'] > 0).astype(int))
ipos2

Unnamed: 0,id,symbol,companyName,expectedDate,auditor,market,cik,address,city,state,...,stockholderEquity,companyDescription,businessDescription,useOfProceeds,competition,amount,percentOffered,createdAt,updatedAt,ex_returns
0,173,TXG,"10X GENOMICS, INC.",2019-09-12 00:00:00+00:00,"American Stock Transfer & Trust Company, LLC",NASDAQ Global Select,0001770787,6230 STONERIDGE MALL ROAD,PLEASANTON,CA,...,-2.279480e+08,Our mission is to accelerate the mastery of bi...,Our mission is to accelerate the mastery of bi...,We estimate that the net proceeds to us from t...,The life sciences market is highly competitive...,3.330000e+08,52.65,2019-09-12 11:00:00.457000+00:00,2019-09-12 11:00:00.457000+00:00,1
1,753,ONEM,1LIFE HEALTHCARE INC,2020-01-31 00:00:00+00:00,"American Stock Transfer & Trust Company, LLC",NASDAQ Global Select,0001404123,"ONE EMBARCADERO CENTER, SUITE 1900",San Francisco,CA,...,-1.734710e+08,Our vision is to delight millions of members w...,Our vision is to delight millions of members w...,We estimate that we will receive net proceeds ...,We compete in a highly fragmented primary care...,2.625000e+08,14.30,2020-01-31 11:00:00.409000+00:00,2020-01-31 11:00:00.409000+00:00,0
2,530,KRKR,36KR HOLDINGS INC.,2019-11-08 00:00:00+00:00,Not Specified,NASDAQ Global,0001779476,"5-6/F, TWR A1, JUNHAO CENTRAL PARK PLAZA","CHAOYANG DISTRICT, BEIJING 00000",,...,-1.233470e+08,Our mission is to empower New Economy particip...,Our mission is to empower New Economy particip...,We estimate that we will receive net proceeds ...,We operate in the New Economy-focused business...,2.208000e+07,4.10,2019-11-08 11:00:00.776000+00:00,2019-11-08 11:00:00.776000+00:00,0
3,507,ETNB,"89BIO, INC.",2019-11-06 00:00:00+00:00,American Stock Transfer and Trust Company LLC,NASDAQ Global,0001785173,535 MISSION STREET,SAN FRANCISCO,CA,...,-3.535800e+07,We are a clinical-stage biopharmaceutical comp...,We are a clinical-stage biopharmaceutical comp...,We estimate that we will receive net proceeds ...,The biopharmaceutical industry is intensely co...,7.000000e+07,36.27,2019-11-06 11:00:00.480000+00:00,2019-11-06 11:00:00.480000+00:00,1
4,107,JFU,9F INC.,2019-08-15 00:00:00+00:00,"Citibank, N.A",NYSE,0001619544,"JIUFU BUILDING,RONGXIN TECHNOLOGY CENTER",BEIJING 100102,,...,1.008548e+09,We are a leading digital financial account pla...,We are a leading digital financial account pla...,We estimate that we will receive net proceeds ...,The industries we are operating in are competi...,7.565000e+07,100.00,2019-08-13 11:00:00.515000+00:00,2019-08-13 11:00:00.515000+00:00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,856,WIMI,WIMI HOLOGRAM CLOUD INC.,2020-04-01 00:00:00+00:00,Not Specified,NASDAQ Global,0001770088,NO. 6 XIAOZHUANG,BEIJING 100005,,...,5.795378e+07,"We offer augmented reality (""AR"")-based hologr...","We offer augmented reality (""AR"")-based hologr...",Based on the assumed initial public offering p...,There are many other companies addressing vari...,3.250000e+07,100.00,2020-04-01 11:00:00.392000+00:00,2020-04-01 11:00:00.392000+00:00,0
106,636,XP,XP INC.,2019-12-11 00:00:00+00:00,Not Specified,NASDAQ Global Select,0001787425,"AV. CHEDID JAFET 75, TORRE SUL",SAO PAULO 00000,,...,6.672470e+08,Our mission is to transform the financial mark...,Our mission is to transform the financial mark...,We estimate that the net proceeds from our iss...,The Brazilian financial services industry is h...,1.704000e+09,20.86,2019-12-11 11:00:01.108000+00:00,2019-12-11 11:00:01.108000+00:00,1
107,413,DAO,"YOUDAO, INC.",2019-10-25 00:00:00+00:00,Not Specified,NYSE,0001781753,"NO. 399, WANGSHANG ROAD","BINJIANG DISTRICT, HANGZHOU 310051",,...,-1.835560e+08,What is Youdao &#10; &#10; Youdao makes learn...,What is Youdao &#10; &#10; Youdao makes learn...,We expect to receive total estimated net proce...,We operate in the highly competitive intellige...,9.240000e+07,24.50,2019-10-25 11:00:00.429000+00:00,2019-10-25 11:00:00.429000+00:00,1
108,861,ZNTL,"ZENTALIS PHARMACEUTICALS, LLC",2020-04-03 00:00:00+00:00,"American Stock Transfer & Trust, LLC",NASDAQ Global,0001725160,"530 SEVENTH AVENUE, SUITE 2201",NEW YORK,NY,...,-8.010600e+07,We are a clinical-stage biopharmaceutical comp...,We are a clinical-stage biopharmaceutical comp...,We estimate that the net proceeds to us from i...,The biotechnology and pharmaceutical industrie...,1.300500e+08,23.27,2020-04-03 11:00:00.902000+00:00,2020-04-03 11:00:00.902000+00:00,1


## Input Features - Pre-IPO Data Variables
#### Categorical
1. market
2. state
3. industry (v2)
4. sector (v2)

#### Numerical
1. employees
2. sharesOffered
3. priceLow
4. totalExpenses
5. sharesOutstanding
6. revenue
7. netIncome
8. totalAssets
9. totalLiabilities
10. stockholderEquity
11. amount
12. percentOffered

In [225]:
def cast_columns_dtype(df, dtype, columns=[], all_columns=True):
    '''
    Return a copy of `df` with the selected columns cast to `dtype`.

    When `all_columns` is True every column of `df` is cast and `columns` is
    ignored; otherwise only the names listed in `columns` are cast.
    '''
    targets = list(df.columns) if all_columns == True else columns
    return df.astype(dict.fromkeys(targets, dtype))
    

def create_labeled_data(data, target_col, categorical=[], numerical=[]):
    '''
    Build the feature matrix and target vector for machine learning methods.

    Numerical columns are taken as they are, categorical columns are one-hot
    encoded with pd.get_dummies, and all features are cast to float64.

    :returns: Tuple of (X array, flattened y array, list of feature names).
    '''
    has_numerical = len(numerical) > 0
    has_categorical = len(categorical) > 0

    if has_numerical:
        X = data[numerical].copy()
        if has_categorical:
            # Append the dummy columns, matching rows on the index
            dummies = pd.get_dummies(data[categorical])
            X = X.merge(dummies, right_index=True, left_index=True)
    else:
        # Categorical features only
        X = pd.get_dummies(data[categorical])

    X = cast_columns_dtype(X, 'float64')
    feature_names = list(X.columns)

    y = data[target_col].copy()

    return X.to_numpy(), y.to_numpy().ravel(), feature_names
    
# Feature and target selection for the labeled data set
categorical = ['state', 'market']  # one-hot encoded by create_labeled_data
numerical = [
    'employees',
    'sharesOffered',
    'priceLow',
    'totalExpenses',
    'sharesOutstanding',
    'revenue',
    'netIncome',
    'totalAssets',
    'totalLiabilities',
    'stockholderEquity',
    'amount',
    'percentOffered'
]
target_col = ['ex_returns']  # single-element list; create_labeled_data flattens the target to 1-D
X, y, X_columns = create_labeled_data(ipos, target_col, categorical, numerical)

In [226]:
# Persist the feature matrix and the target to CSV files in the working directory.
# NOTE(review): np.savetxt prefixes the header line with '# ' by default, and the
# X header is joined with ', ' while the data delimiter is ',' — confirm that
# whatever reads these files expects that.
np.savetxt('X.csv', X, delimiter=',', header=', '.join(X_columns))
np.savetxt('y.csv', y, delimiter=',', header=target_col[0])

## Fitting a Random Forest Model
---

**fit(self, X, y, sample_weight=None)**

Build a forest of trees from the training set (X, y).

|Parameters|Description|
|:--|:--|
|**X : {array-like, sparse matrix} of shape (n_samples, n_features)**|The training input samples.|
|**y : array-like of shape (n_samples,) or (n_samples, n_outputs)**|The target values (class labels in classification, real numbers in regression).|
|**sample_weight : array-like of shape (n_samples,), default=None**|Sample weights. If None, then samples are equally weighted. Splits that would create child nodes with net zero or negative weight are ignored while searching for a split in each node. In the case of classification, splits are also ignored if they would result in any single class carrying a negative weight in either child node.|

In [236]:
# Divide the data into train and dev sets (70% / 30%)

test_size = 0.30
random_state = 0  # fixed seed so the split is reproducible

X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=test_size, random_state=random_state)

### Initial Regression
Fit an initial random forest regression on the full list of input variables to see whether the inputs have any explanatory or predictive power for the target variable: the one-month excess return over the market.

In [237]:
# 1. 1st try - shallow trees (max_depth=2); every other hyperparameter is left at its default
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(X_train, y_train)
# Score containers shared by the regression experiments; the 'default' key is
# kept as the name of this first run.
train_scores = {}
dev_scores = {}
train_scores['default'] = regr.score(X_train, y_train)
dev_scores['default'] = regr.score(X_dev, y_dev)

# Results
print('R^2 Scores - Shallow Trees (max_depth=2)')
print('Train score:', '{:.4f}'.format(train_scores['default']))
print('Dev score:', '{:.4f}'.format(dev_scores['default']))

R^2 Scores - Default Hyperparameters
Train score: 0.3596
Dev score: -0.0534


**Horrible first regression!** Training performance is low, and the out-of-sample R² is negative, i.e. worse than always predicting the mean. Our goal now is to improve performance across the board and see whether the input variables have any predictive power. Some of the steps we are going to try are:
- Train using full depth trees
- Simplify the set of input variables
- Setting `max_features` to a lower value
- Modify RF class to replace standard bootstrapping with sequential bootstrapping


In [238]:
# 2. Full-depth trees: max_depth=None puts no limit on how deep each tree grows
regr = RandomForestRegressor(random_state=0, max_depth=None).fit(X_train, y_train)
train_scores.update(fullDepth=regr.score(X_train, y_train))
dev_scores.update(fullDepth=regr.score(X_dev, y_dev))

# Report R^2 on both splits
print('R^2 Scores - Full Depth Trees')
print('Train score:', '{:.4f}'.format(train_scores['fullDepth']))
print('Dev score:', '{:.4f}'.format(dev_scores['fullDepth']))

R^2 Scores - Full Depth Trees
Train score: 0.8413
Dev score: -0.0931


Large improvement in the training score, but the dev score did not improve (-0.0931 vs. -0.0534). The model is not able to generalize to out-of-sample data.

In [239]:
# 3. Lower max_features than n_features: each split considers only
#    sqrt(n_features) candidate features
regr = RandomForestRegressor(max_depth=None, random_state=0, max_features='sqrt')
regr.fit(X_train, y_train)
train_scores['sqrtMaxF'] = regr.score(X_train, y_train)
dev_scores['sqrtMaxF'] = regr.score(X_dev, y_dev)

# Results (labelled for this experiment rather than repeating the full-depth title)
print('R^2 Scores - Full Depth Trees, max_features=sqrt')
print('Train score:', '{:.4f}'.format(train_scores['sqrtMaxF']))
print('Dev score:', '{:.4f}'.format(dev_scores['sqrtMaxF']))

R^2 Scores - Full Depth Trees
Train score: 0.8413
Dev score: -0.0619


In [240]:
# 4. Simplify the set of input variables: numerical features only.
# X_columns is reassigned together with X so the column list keeps describing
# the current feature matrix.
X, y, X_columns = create_labeled_data(ipos, target_col, [], numerical)

In [241]:
# Divide the data into train and dev sets (70% / 30%)

test_size = 0.30
# NOTE(review): this split uses seed 1 while the first split in this notebook
# uses seed 0, so the scores below come from a different train/dev partition.
random_state = 1

X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=test_size, random_state=random_state)

In [242]:
# Full-depth trees on the numerical-only feature set. The scores are stored
# under their own key so the 'fullDepth' entries of the all-features model are
# not overwritten.
regr = RandomForestRegressor(max_depth=None, random_state=0)
regr.fit(X_train, y_train)
train_scores['fullDepthNumerical'] = regr.score(X_train, y_train)
dev_scores['fullDepthNumerical'] = regr.score(X_dev, y_dev)

# Results
print('R^2 Scores - Full Depth Trees (numerical features only)')
print('Train score:', '{:.4f}'.format(train_scores['fullDepthNumerical']))
print('Dev score:', '{:.4f}'.format(dev_scores['fullDepthNumerical']))

R^2 Scores - Full Depth Trees
Train score: 0.8591
Dev score: -0.2056


## Random Forest Classification

In [262]:
# Build the classification data set from ipos2 (binary ex_returns label), then
# divide it into train and dev sets (70% / 30%)
X, y, X_columns = create_labeled_data(ipos2, target_col, categorical, numerical)

test_size = 0.30
random_state = 0  # also passed as the seed of the classifiers fitted on this split

X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=test_size, random_state=random_state)

In [263]:
# Score containers for the classification models
train_scores_clf, dev_scores_clf = {}, {}

# Model: random forest classifier with no limit on tree depth
clf = RandomForestClassifier(random_state=random_state, max_depth=None).fit(X_train, y_train)
train_scores_clf.update(maxDepth=clf.score(X_train, y_train))
dev_scores_clf.update(maxDepth=clf.score(X_dev, y_dev))

# Report the scores on both splits
print('Classification Scores - Full Depth Trees')
print('Train score:', '{:.4f}'.format(train_scores_clf['maxDepth']))
print('Dev score:', '{:.4f}'.format(dev_scores_clf['maxDepth']))

Classification Scores - Full Depth Trees
Train score: 1.0000
Dev score: 0.5758


## Logistic Regression for Comparison

In [261]:
# Model: logistic regression baseline fitted on the same train/dev split
clf = LogisticRegression(random_state=random_state, solver='liblinear').fit(X_train, y_train)
train_scores_clf.update(logit=clf.score(X_train, y_train))
dev_scores_clf.update(logit=clf.score(X_dev, y_dev))

# Report the scores on both splits
print('Classification Scores - Logit Regression')
print('Train score:', '{:.4f}'.format(train_scores_clf['logit']))
print('Dev score:', '{:.4f}'.format(dev_scores_clf['logit']))


Classification Scores - Logit Regression
Train score: 0.6104
Dev score: 0.5152
