In [237]:
import pandas as pd
import numpy as np

In [238]:
data = pd.read_csv('../../data/rounds_data.csv', parse_dates=True)
data = data.dropna(subset=['announced_on', 'money_raised_usd', 'num_investors',
                           'funded_organization_categories', 'funded_organization_location',
                           'investment_type', 'lead_investor_identifiers'])

In [239]:
data.funded_organization_categories = data.funded_organization_categories.apply(lambda x: list(map(str, x.strip("']['").split("', '"))))
all_categories = data.funded_organization_categories.explode().value_counts()
data.announced_on = (pd.to_datetime(data.announced_on) - pd.Timestamp("1926-01-01")) // pd.Timedelta('1s')

def get_most_relevant_cat(categories):
    mapped = cat_freqs[cat_freqs.index.isin(categories)]
    return categories[np.argmax(mapped)]

In [240]:
pd.Series([
    data.lead_investor_identifiers.unique().size,
    data.investment_type.unique().size,
    data.funded_organization_location.unique().size,
    len(all_categories)
], index=['investors', 'round types', 'locaitons', 'categories']) # Curse of dimensionnality!

investors      24535
round types       28
locaitons        150
categories       823
dtype: int64

In [241]:
round_freqs = data.investment_type.value_counts() / data.investment_type.size
others = list(round_freqs[round_freqs < 0.01].index.values) + ['series_unknown', 'undisclosed']
data.investment_type = data.investment_type.apply(lambda x: 'other' if x in others else x)

In [242]:
cat_freqs = data.funded_organization_categories.explode().value_counts() / data.funded_organization_categories.size
data.funded_organization_categories = data.funded_organization_categories.map(get_most_relevant_cat)

In [243]:
data

Unnamed: 0,uuid,announced_on,money_raised_usd,num_investors,funded_organization_categories,funded_organization_location,investment_type,lead_investor_identifiers,post_money_valuation_usd,funded_organization_identifier.uuid
0,89da2fef-059b-4ec8-a2dc-e18fad3e9697,2954966400,16000000.0,8,E-Commerce,United States,series_b,Accel,,c7916a39-10cb-485a-b364-7fe414d9868d
4,89daa3bc-98ef-4d91-98f4-7d46e5ade50f,2987452800,3700000.0,3,Artificial Intelligence,Germany,seed,Shasta Ventures,,9fcbc4c7-5a74-43f6-8df1-e0c2917a6657
5,89daa934-05ad-f167-7adb-e6bb5abe07ab,2736460800,104351864.0,1,Biopharma,Germany,series_d,dievini Hopp Biotech Holding,,dd139f96-52cc-112a-fdb4-aae2d1a08a2a
6,89dac972-b1eb-48ae-a216-76e1c47a7b3b,3012249600,422423953.0,1,Finance,United Kingdom,debt_financing,NatWest Group,,9f9eaedd-2860-4570-af08-f687bb99b667
9,89db2d4f-729d-444a-a913-141e16e1ca76,2922393600,5427.0,1,Assisted Living,India,grant,DTU Innovation & Incubation Foundation,,6bd56d23-32af-4050-aa98-714685173443
...,...,...,...,...,...,...,...,...,...,...
451989,3c04f126-28d7-4f59-86ac-5c0dc08cd934,2822083200,1111353.0,1,Food Processing,Latvia,other,ZGI Capital,11113531.0,173842bf-8568-41ec-91ae-485e4b6e8346
451990,3c04fdb0-05f6-4892-9c32-d8f772418e70,3028233600,1400000.0,1,Apps,South Africa,seed,Imvelo Ventures,,1cf97acc-3d7a-4f86-8bdd-af2029930a25
451992,3c054e87-628c-2bee-41c0-90999a31e67e,2863987200,300000.0,1,Business Development,United States,seed,Cofounders Capital,,b4171162-cd6f-1e34-d6e1-288fe61575a6
451995,3c0587fa-46de-421d-8ea7-7c2aed9352c7,3028060800,47113512.0,6,Artificial Intelligence,China,series_b,Cathay Capital,,c9cf2757-aba4-4a08-be2d-0757e7a079ea


In [244]:
observed = data[data.post_money_valuation_usd.notna()].drop(columns=['uuid', 'funded_organization_identifier.uuid'])
unobserved = data[data.post_money_valuation_usd.isna()].drop(columns=['uuid', 'funded_organization_identifier.uuid'])

In [349]:
X = observed[['announced_on', 'money_raised_usd', 'num_investors', 'investment_type']]
X = pd.concat([X.drop(columns='investment_type'), pd.get_dummies(X.investment_type, prefix='rnd_type')], axis=1)

y = observed.post_money_valuation_usd

In [391]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators': range(100, 1000, 100),
    'subsample': np.append(np.arange(0.8, 1, 0.05), [1]),
    'max_depth': range(3, 10)
}

reg = GridSearchCV(GradientBoostingRegressor(), params, n_jobs=-1, verbose=1).fit(X, y)
print(reg.best_score_)
print(reg.best_estimator_)

0.6552940929130775
GradientBoostingRegressor(max_depth=4, n_estimators=700, subsample=0.85)
