## Label output according to funding event 
- From the second window onwards, a window that contains a funding event is labelled positive; every other window is labelled negative
- Only funding rounds above series A (series A itself excluded) count as 'positive'

In [24]:
# Setup
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from scipy import stats
from sklearn import preprocessing
# Lift pandas' display truncation so DataFrames are shown with every row and column
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [37]:
# Load the cleaned funding events (one row per funding event of a company)
funding_csv_path = './cleaned_csv/cleaned_funding.csv'
df_inves = pd.read_csv(funding_csv_path)

# Each labelling window spans 6 months
window_size = 6

In [51]:
# Prefix every company id with 'c:'.
# Ids that already carry the prefix are left untouched, so re-running this cell
# (easy to do in a notebook) does not turn 'c:11060' into 'c:c:11060'.
company_ids = df_inves["company_id"].astype(str)
already_prefixed = company_ids.str.startswith('c:')
df_inves['company_id'] = company_ids.where(already_prefixed, 'c:' + company_ids)
df_inves.head()

Unnamed: 0,company_id,investment_age,funding_round_code,funding_amount_mil,IPO,a,acquired,angel,b,c,d,e,f,merged,neeq,post-IPO,delisted,seed,stg_invse,num_investor
0,c:11060,139.0,acquired,15500.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1.0
1,c:11060,133.0,stg_invse,3317.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,5.0
2,c:11060,80.0,stg_invse,829.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2.0
3,c:11060,57.0,a,20.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,4.0
4,c:11058,72.0,acquired,780.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,


In [52]:
# Drop the columns that are not used when labelling the funding windows.
columns_to_drop = ["funding_round_code","funding_amount_mil","angel","seed",
                   "a","merged","delisted","post-IPO","num_investor"]
# Rebind instead of mutating in place, and ignore columns that are already gone,
# so running this cell a second time is a no-op rather than a KeyError.
df_inves = df_inves.drop(columns=columns_to_drop, errors="ignore")

In [49]:
# Remove outliers using the z-score of investment_age (e.g. remove company founded in 1860).
# A row is kept when its |z-score| is below 5 and its investment_age is strictly positive.
age_zscore = np.abs(stats.zscore(df_inves["investment_age"]))
df_inves_bound = df_inves.copy()
df_inves_bound = df_inves_bound[age_zscore < 5]
positive_age = df_inves_bound["investment_age"] > 0
df_inves_bound = df_inves_bound[positive_age]
largest_age = df_inves_bound["investment_age"].max()
print('Largest investment age:', largest_age)

Largest investment age: 383.0


### Label positive window

In [53]:
# Keep only funding events whose investment_age lies in [0, 383]
# (383 is the largest investment age printed by the outlier check).
age_in_range = (df_inves['investment_age'] >= 0) & (df_inves['investment_age'] <= 383)
df_inves = df_inves[age_in_range]

In [55]:
# Count the distinct companies that still have at least one funding event
num_companies = df_inves['company_id'].nunique()
print('Total number of companies:', num_companies)

Total number of companies: 38812


In [56]:
# Generate windows
# The window count is derived from the largest retained investment_age and from
# `window_size` instead of repeating the literals 383 and 6, so it stays in sync
# with the age filter and the window definition. Run this after the age filter.
max_investment_age = df_inves["investment_age"].max()
num_window = int(max_investment_age/window_size+1)

# One label column per window, numbered from 1 to num_window-1
allcolumns = ["company_id"] + ['funding_window' + str(i) for i in range(1,num_window)]

# One row per distinct company; the window columns start out empty
df_label = pd.DataFrame(columns=allcolumns)
df_label["company_id"] = df_inves.drop_duplicates("company_id")["company_id"]

In [57]:
# Preview the label frame: one row per company, window columns not filled in yet
df_label.head()

Unnamed: 0,company_id,funding_window1,funding_window2,funding_window3,funding_window4,funding_window5,funding_window6,funding_window7,funding_window8,funding_window9,...,funding_window54,funding_window55,funding_window56,funding_window57,funding_window58,funding_window59,funding_window60,funding_window61,funding_window62,funding_window63
0,c:11060,,,,,,,,,,...,,,,,,,,,,
4,c:11058,,,,,,,,,,...,,,,,,,,,,
6,c:11051,,,,,,,,,,...,,,,,,,,,,
10,c:11047,,,,,,,,,,...,,,,,,,,,,
11,c:11022,,,,,,,,,,...,,,,,,,,,,


In [13]:
# Register tqdm's `progress_apply` on pandas objects; the bar is labelled "my bar!"
tqdm.pandas(desc="my bar!")

# Columns of df_inves whose non-zero value marks a "positive" funding event.
# Selecting them by name (rather than by position) keeps the labels correct even
# if df_inves carries extra columns or a different column order.
_POSITIVE_EVENT_COLUMNS = ["IPO", "acquired", "b", "c", "d", "e", "f", "neeq", "stg_invse"]

# Label output window (positive/negative)
def inves_output(x):
    """Return the 0/1 funding labels of one company for windows 1 .. num_window-1.

    Window ``i`` covers investment ages in ``[i * window_size, (i + 1) * window_size)``.
    A window is labelled 1 when the company has at least one funding event in it
    with a non-zero value in any of ``_POSITIVE_EVENT_COLUMNS``, otherwise 0.
    Window 0 (ages below ``window_size``) is never labelled.

    Reads the notebook globals ``df_inves``, ``window_size`` and ``num_window``.

    Parameters
    ----------
    x : mapping or pandas.Series
        Anything indexable by ``"company_id"``, e.g. a row of ``df_label``.

    Returns
    -------
    list of int
        ``num_window - 1`` labels; position ``k`` holds the label of window ``k + 1``.
        A company without any row in ``df_inves`` gets all zeros.

    Notes
    -----
    Assumes the event columns are non-negative indicator flags (the visible sample
    rows contain only 0/1) - TODO confirm. Missing flags are treated as 0.
    Events that would fall beyond the last window are ignored.
    """
    df_row = df_inves.loc[df_inves["company_id"] == x["company_id"]]
    labels = np.zeros(num_window - 1, dtype=int)

    # Rows that carry at least one positive funding event
    flags = df_row[_POSITIVE_EVENT_COLUMNS].fillna(0).values
    has_event = (flags != 0).any(axis=1)

    # Map each such event straight to its window index instead of scanning every window
    event_ages = df_row["investment_age"].values[has_event]
    event_ages = event_ages[pd.notna(event_ages)]
    windows = (event_ages // window_size).astype(int)

    # Keep labelled windows only: skip window 0 and anything past the last column
    windows = windows[(windows >= 1) & (windows < num_window)]
    labels[windows - 1] = 1

    return labels.tolist()

# Label every company row by row (slow: one pass over df_label with a progress bar)
window_labels = df_label.progress_apply(inves_output,axis=1,result_type="expand").values
# Fill all window columns; the upper bound follows num_window instead of a literal 64
df_label.iloc[:,1:num_window] = window_labels

  from pandas import Panel
my bar!: 100%|██████████| 38812/38812 [20:33<00:00, 31.46it/s] 


In [19]:
# Persist the window labels (every column except company_id) as Y.npy.
# df_label was created from an empty frame, so its columns may still be object
# dtype; cast to a plain integer array so the file is stored without pickle and
# can be read back with a default np.load. The cast fails loudly on unfilled cells.
label_matrix = df_label.iloc[:,1:].to_numpy(dtype=np.int64)
np.save("Y.npy", label_matrix, allow_pickle=False)