In [20]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

In [21]:
def process_company(company_data):
    """Compute the net return multiple of every funding round of one company.

    Parameters
    ----------
    company_data : pandas.DataFrame
        All rounds of a single company, ordered from earliest to latest.
        Columns read: ``uuid``, ``money_raised_usd``,
        ``post_money_valuation_usd``, ``exit_value`` and ``exit_type``
        (the two exit columns are read from the first row only).

    Returns
    -------
    pandas.DataFrame
        A single column ``0`` indexed by the round ``uuid`` holding
        ``(proceeds - invested) / invested`` for each round:

        * ``-1`` for every round when ``exit_type == 3`` (the code this
          notebook assigns to closed companies);
        * otherwise the round's stake at entry, diluted by every *later*
          round, times the exit value;
        * ``NaN`` for every round if the computation raises.
    """
    try:
        raised = company_data.money_raised_usd.values
        post_money = company_data.post_money_valuation_usd.values
        exit_value = company_data.exit_value.values[0]

        if company_data.exit_type.values[0] == 3:
            # Total loss. Built as a float array of the right length instead of
            # deriving it from the (string) uuid column.
            ret = np.full(len(company_data), -1.0)
        else:
            # Fraction of the company bought by each round at the time it closed.
            stake = raised / post_money
            # Fraction kept by existing holders when each later round closes.
            retained = (post_money[1:] - raised[1:]) / post_money[1:]
            # Round i is diluted only by rounds i+1..n-1, i.e. by the *suffix*
            # product of `retained`; the last round is not diluted at all.
            # For a single round `retained` is empty and the factor is just 1.
            dilution = np.append(np.cumprod(retained[::-1])[::-1], 1.0)
            ret = (exit_value * stake * dilution - raised) / raised
        return pd.DataFrame(data=ret, index=company_data.uuid)
    except Exception as exc:
        # Best effort: report the offending company and mark its rounds as NaN
        # so that one malformed group does not abort the whole groupby.
        print(f"process_company failed with {exc!r} for:")
        print(company_data)
        return pd.DataFrame(data=[np.nan] * company_data.uuid.size, index=company_data.uuid)


def filter_dates(funding_round):
    """Tell whether a round was announced strictly before the exit date.

    ``funding_round`` is anything indexable by the keys ``'announced_on'`` and
    ``'exit_date'``; the result is whatever ``<`` yields for those two values.
    """
    announced = funding_round['announced_on']
    exited = funding_round['exit_date']
    return announced < exited

In [50]:
# Switch read by later cells: True loads 'obs_predicted_data.csv' and writes
# 'data.csv'; False loads 'observed.csv' and writes 'data_nopred.csv'.
predicted = False

In [51]:
# Load the funding-round table that matches the `predicted` switch.
rounds_data = pd.read_csv('obs_predicted_data.csv' if predicted else 'observed.csv')

In [52]:
# Display the loaded funding-round table as a visual sanity check.
rounds_data

Unnamed: 0.1,Unnamed: 0,uuid,org_uuid,announced_on,money_raised_usd,num_investors,investment_type,lead_investor_rank,post_money_valuation_usd,prev_rounds,country_code,amount_delta,holding_time
0,256063,773c9fb9-0e27-4115-8bc1-343e24ee28c8,0002aa63-f21b-4c54-9495-895a1d09e0d4,2946326400,1500000.0,7.0,seed,189123.0,4.500000e+06,0.0,USA,1500000.0,498.0
1,327293,e5e06be5-4b9a-4040-9ee4-bb45920a1f99,00060741-9e4b-44cd-880a-8d0fdd8046c8,2969827200,215983.0,2.0,pre_seed,59277.0,2.215982e+06,0.0,SGP,215983.0,40.0
2,289310,23263b3b-0c61-4980-aee2-47f00f12e357,000b7667-bf70-4298-bae9-94f44d27dc10,2966716800,8700000.0,1.0,seed,3023139.0,1.970000e+07,0.0,CHN,8700000.0,157.0
3,229099,befaba3e-2bae-4fd5-b71e-6d03a71b980a,0010d353-dfe4-483b-86a7-5e11e9996e63,2887401600,500000.0,1.0,pre_seed,215024.0,4.500000e+06,1.0,USA,200000.0,122.0
4,185548,10e2bc7d-f5a6-4cdd-acf3-6c87b167f83f,0011daf2-bf81-2365-e08a-d896bdc186d6,2902262400,1900000.0,1.0,series_a,481.0,6.400000e+06,1.0,IND,1795000.0,622.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9072,275085,08c66219-c526-4c25-8f01-38207c91ad8c,ffe232c5-fcf4-e08e-b43d-c7dd1d5fc32c,2957126400,160000000.0,4.0,series_d,26457.0,2.200000e+09,3.0,USA,60000000.0,522.0
9073,423090,43f30694-cee9-407c-9ea5-2fefc744f6ce,ffe232c5-fcf4-e08e-b43d-c7dd1d5fc32c,3018988800,250000000.0,10.0,series_e,54004.0,4.600000e+09,4.0,USA,90000000.0,716.0
9074,388007,426c4234-8ee5-4014-88eb-80db82191fa3,ffe7218c-5328-b5a9-cd5a-c5947b9d9276,3005510400,90000000.0,3.0,series_e,7821.0,1.000000e+09,7.0,CHN,-10000000.0,62.0
9075,107449,6c1cc509-c026-9398-ddf8-bc25c3535bbe,ffe74ad7-43eb-bd06-32ad-19de354a67bd,2838067200,65000000.0,2.0,private_equity,55551.0,8.600000e+08,4.0,USA,25000000.0,600.0


In [54]:
# Raw Crunchbase exports: organizations, acquisitions and IPOs (read in that order).
orgs, acqs, ipos = (
    pd.read_csv("../../data/crunchbase/organizations.csv"),
    pd.read_csv("../../data/crunchbase/acquisitions.csv"),
    pd.read_csv("../../data/crunchbase/ipos.csv"),
)

In [55]:
# Keep only the organization id, the event date and the event value of each exit.
# Explicit copies are taken because later cells rename the columns of these
# frames and add an `exit_type` column to them; mutating a bare column subset
# can raise SettingWithCopyWarning on pandas without Copy-on-Write.
ipos = ipos[['org_uuid', 'went_public_on', 'valuation_price_usd']].copy()
acqs = acqs[['acquiree_uuid', 'acquired_on', 'price_usd']].copy()

In [56]:
# Working copy of the round-level columns needed for the return computation.
rounds = rounds_data[['uuid', 'org_uuid', 'announced_on', 'money_raised_usd', 'post_money_valuation_usd']].copy()

# Give the IPO and acquisition tables one common schema (renamed by position).
ipos.columns = ['org_uuid', 'exit_date', 'exit_value']
acqs.columns = ['org_uuid', 'exit_date', 'exit_value']

# Organizations whose status is 'closed'; their closing date is used as the exit date.
clos = pd.DataFrame(orgs[orgs.status == 'closed'][['uuid', 'closed_on']])
clos.columns = ['org_uuid', 'exit_date']

# Exit type codes: 1 for rows from `ipos`, 2 for rows from `acqs`, 3 for closed organizations.
ipos["exit_type"] = 1
acqs["exit_type"] = 2
clos["exit_type"] = 3
# Closed organizations carry no exit value.
clos["exit_value"] = np.nan

# One table of all exit events, indexed by organization id.
# NOTE(review): an org_uuid that occurs in more than one source table (or more than
# once in one of them) appears several times in this index — confirm that the
# lookups built from `exits` pick the intended event for such organizations.
exits = pd.concat([acqs, ipos, clos]).set_index("org_uuid")

In [57]:
# Attach the exit information of the owning organization to every round.
# Organizations without any exit record get exit_type 4; their exit_date and
# exit_value stay missing.
rounds = rounds.assign(
    exit_date=pd.to_datetime(rounds.org_uuid.map(exits["exit_date"].to_dict())),
    exit_type=rounds.org_uuid.map(exits["exit_type"].to_dict()).fillna(value=4).astype(int),
    exit_value=rounds.org_uuid.map(exits["exit_value"].to_dict()),
)

In [58]:
# `announced_on` is stored as a number of seconds counted from this reference date.
ref_date = '1926-01-01'
rounds['announced_on'] = pd.Timestamp(ref_date) + rounds['announced_on'] * pd.Timedelta('1s')

# Keep only rounds announced strictly before the exit (rounds whose exit_date is
# missing compare as False and are dropped), then order them chronologically
# within each organization.
rounds = (
    rounds.loc[rounds['announced_on'] < rounds['exit_date']]
    .sort_values(by=['org_uuid', 'announced_on'])
)

In [59]:
# Run process_company on each organization's rounds, drop the org level of the
# resulting index so the values are keyed by round uuid, look every round up by
# its uuid, and add 1 to the looked-up value.
rounds['return_usd'] = rounds['uuid'].map(
    rounds.groupby('org_uuid')
    .apply(process_company)
    .droplevel(0)[0]
) + 1

In [60]:
# The round uuid is no longer needed once returns are attached.
rounds = rounds.drop(columns=['uuid'])

# Empty placeholder columns right after the company identifier.
rounds.insert(1, 'group_num', np.nan)
rounds.insert(2, 'seg_num', np.nan)

# Switch to the output schema; columns not listed here already carry their final name.
rounds = rounds.rename(columns={
    'org_uuid': 'company_num',
    'announced_on': 'round_date',
    'money_raised_usd': 'raised_usd',
    'post_money_valuation_usd': 'postvalue_usd',
})

# Discard rounds dated on or before 1990-01-01.
rounds = rounds.loc[rounds['round_date'] > '1990-01-01']

In [61]:
# Rounds of every organization that has no row in `rounds`; they are emitted
# with exit_type 4 and no exit date, exit value or return.
operating = rounds_data[['org_uuid', 'announced_on', 'money_raised_usd', 'post_money_valuation_usd']].copy()
# Copy after filtering: the frame is modified below, and modifying a filtered
# slice can raise SettingWithCopyWarning on pandas without Copy-on-Write.
operating = operating[~operating.org_uuid.isin(rounds.company_num)].copy()

# `announced_on` is stored as seconds counted from `ref_date`.
operating.announced_on = pd.Timestamp(ref_date) + operating.announced_on * pd.Timedelta('1s')
operating.insert(1, 'group_num', np.nan)
operating.insert(2, 'seg_num', np.nan)
operating.insert(operating.shape[1], 'exit_date', np.nan)
operating.insert(operating.shape[1], 'exit_type', 4)
operating.insert(operating.shape[1], 'exit_value', np.nan)
operating.insert(operating.shape[1], 'return_usd', np.nan)

# Same output schema as `rounds` (renamed by position).
operating.columns = ['company_num', 'group_num', 'seg_num', 'round_date', 'raised_usd',
                  'postvalue_usd', 'exit_date', 'exit_type', 'exit_value', 'return_usd']
# Use the same date cutoff as the one applied to `rounds`; the previous
# `dt.year > 1990` test dropped all rounds dated in 1990 for operating
# companies only, so the two halves of the dataset covered different periods.
operating = operating[operating.round_date > '1990-01-01']

In [62]:
# Stack exited and still-operating rounds into one table.
complete = pd.concat([rounds, operating], ignore_index=True)
# A return of exactly 0 is stored as missing.
complete.loc[complete.return_usd == 0, 'return_usd'] = np.nan
# Drop rows whose return is 3000 or more; rows with a missing return are kept.
# Copy the result: a column is assigned on it right below, and assigning on a
# filtered slice can raise SettingWithCopyWarning on pandas without Copy-on-Write.
complete = complete[(complete.return_usd < 3000) | (complete.return_usd.isna())].copy()
# Normalise exit_date to datetimes; values that cannot be parsed become NaT.
complete.exit_date = pd.to_datetime(complete.exit_date, errors="coerce")

In [63]:
# Write the assembled table to the file that matches the `predicted` switch.
complete.to_csv('../../data/data.csv' if predicted else '../../data/data_nopred.csv', index=False)