In [6]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
import random

In [7]:
# Load the fund-level data set (venture-capital and buyout funds) from the working directory
df = pd.read_csv("Latest_Data_VC_BO.csv")

In [8]:
# Preview the first two rows of the raw data
df.head(2)

Unnamed: 0,Fund_Id,Firm_Id,Fund_Name,Firm_Name,Vintage,Fund_Type,Local_Currency,Fund_Size,Fund_Status,Final_Close_Date,...,Benchmark_ID,Called_Pcent,Distr_DPI_Pcent,Value_RVPI_Pcent,Multiple,Net_IRR_Pcent,FirmName,LastUpdated,FirmCountry,Established
0,3,152327,1818 Fund II,Brown Brothers Harriman,1993.0,Buyout,USD,475.0,Liquidated,19930630.0,...,18,88.5,180.74,0.0,1.8074,11.4,Brown Brothers Harriman,20190514.0,US,1989.0
1,9,741,Riverside Capital Appreciation Fund 2000,Riverside Company,2000.0,Buyout,USD,412.8,Liquidated,20000630.0,...,18,91.5,175.8,0.0,1.758,18.4,Riverside Company,20190718.0,US,1988.0


In [9]:
# (rows, columns) of the raw data
df.shape

(1723, 25)

In [10]:
# These columns are removed up front because it is not yet clear how to turn them
# into features.
unused_columns = [
    "Firm_Id",
    "LastUpdated",
    "Benchmark_ID",
    "FirmName",
    "Vintage",
    "Final_Close_Date",
    "Established",
]
df = df.drop(columns=unused_columns)

# Adding GDP year on year + Treasury rate

In [11]:
# Load the per-fund macro variables (one row per Fund_Id with GDP_yoy and TR_10yrs)
df_gdp_trsy = pd.read_csv("gdp_trsry.csv")
# Preview the first two rows
df_gdp_trsy.head(2)

Unnamed: 0,Fund_Id,GDP_yoy,TR_10yrs
0,3,5.88,7.008845
1,9,6.27,5.646135


In [12]:
# Inner join on Fund_Id: funds without a GDP / treasury record are discarded
df = pd.merge(df, df_gdp_trsy, on="Fund_Id", how="inner")

In [13]:
# Check that GDP_yoy and TR_10yrs now appear as the last columns
df.head(2)

Unnamed: 0,Fund_Id,Fund_Name,Firm_Name,Fund_Type,Local_Currency,Fund_Size,Fund_Status,Fund_Focus,Fund_Number_Overall,Fund_Number_Series,Geographic_Scope,Industry,Called_Pcent,Distr_DPI_Pcent,Value_RVPI_Pcent,Multiple,Net_IRR_Pcent,FirmCountry,GDP_yoy,TR_10yrs
0,3,1818 Fund II,Brown Brothers Harriman,Buyout,USD,475.0,Liquidated,US,2,2,Country-Specific,"Telecoms, Healthcare, Media, Financial Service...",88.5,180.74,0.0,1.8074,11.4,US,5.88,7.008845
1,9,Riverside Capital Appreciation Fund 2000,Riverside Company,Buyout,USD,412.8,Liquidated,US,4,3,Country-Specific,"Healthcare, Consumer Services, Manufacturing, ...",91.5,175.8,0.0,1.758,18.4,US,6.27,5.646135


# Adding year-on-year MSCI World Index

In [14]:
# Load the per-fund MSCI variable (one row per Fund_Id with yoy_MSCI)
df_msci = pd.read_csv("msci_yoy.csv")
# Preview the first two rows
df_msci.head(2)

Unnamed: 0,Fund_Id,yoy_MSCI
0,3,-7.141309
1,9,23.560375


In [15]:
# Inner join on Fund_Id: funds without an MSCI record are discarded
df = pd.merge(df, df_msci, on="Fund_Id", how="inner")

In [16]:
# Check that yoy_MSCI now appears as the last column
df.head(2)

Unnamed: 0,Fund_Id,Fund_Name,Firm_Name,Fund_Type,Local_Currency,Fund_Size,Fund_Status,Fund_Focus,Fund_Number_Overall,Fund_Number_Series,...,Industry,Called_Pcent,Distr_DPI_Pcent,Value_RVPI_Pcent,Multiple,Net_IRR_Pcent,FirmCountry,GDP_yoy,TR_10yrs,yoy_MSCI
0,3,1818 Fund II,Brown Brothers Harriman,Buyout,USD,475.0,Liquidated,US,2,2,...,"Telecoms, Healthcare, Media, Financial Service...",88.5,180.74,0.0,1.8074,11.4,US,5.88,7.008845,-7.141309
1,9,Riverside Capital Appreciation Fund 2000,Riverside Company,Buyout,USD,412.8,Liquidated,US,4,3,...,"Healthcare, Consumer Services, Manufacturing, ...",91.5,175.8,0.0,1.758,18.4,US,6.27,5.646135,23.560375


# Adding new variables


In [17]:
# Load the additional variables and discard the 'Unnamed: 0' column
# (presumably a row index that was written out with the CSV -- confirm).
df_nv = pd.read_csv("New_Variables.csv").drop(columns=["Unnamed: 0"])
df_nv.shape

(1719, 4)

In [18]:
# Inner join on Fund_Id: funds without an entry in the new-variables table are discarded
df = pd.merge(df, df_nv, on="Fund_Id", how="inner")

In [19]:
# Check that Firm_Age, Funds_Raised_Last_Year and Pcent_Increase_Funds_Last_Year were appended
df.head(2)

Unnamed: 0,Fund_Id,Fund_Name,Firm_Name,Fund_Type,Local_Currency,Fund_Size,Fund_Status,Fund_Focus,Fund_Number_Overall,Fund_Number_Series,...,Value_RVPI_Pcent,Multiple,Net_IRR_Pcent,FirmCountry,GDP_yoy,TR_10yrs,yoy_MSCI,Firm_Age,Funds_Raised_Last_Year,Pcent_Increase_Funds_Last_Year
0,3,1818 Fund II,Brown Brothers Harriman,Buyout,USD,475.0,Liquidated,US,2,2,...,0.0,1.8074,11.4,US,5.88,7.008845,-7.141309,4,101,0.90566
1,9,Riverside Capital Appreciation Fund 2000,Riverside Company,Buyout,USD,412.8,Liquidated,US,4,3,...,0.0,1.758,18.4,US,6.27,5.646135,23.560375,12,532,0.217391


In [20]:
# One-hot encoding of the chosen labels of a column, with a catch-all "other" column
def one_hot(df, variable, top_x_labels):
    """
    Add 0/1 indicator columns to the data frame, in place, for selected labels of one column.

    df: data frame to modify in place
    variable: column name (feature name or variable) to encode
    top_x_labels: the 'x' labels that each get their own indicator column; any iterable
                  (list, index, generator) of labels, which need not be strings
    return: None. "x + 1" new columns are added to the data frame: one
            "<variable>_<label>" column per label, plus "<variable>_other", which is 1
            for every row whose value is not among the given labels
    """
    # Materialise once: the labels are iterated in the loop and again by isin(),
    # so a one-shot iterable would otherwise be exhausted before the "other" column.
    labels = list(top_x_labels)
    for label in labels:
        # str() allows numeric or other non-string labels in the column name
        df[variable + '_' + str(label)] = np.where(df[variable] == label, 1, 0)
    df[variable + '_' + 'other'] = np.where(~df[variable].isin(labels), 1, 0)

In [21]:
# One-hot encoding where we don't want the "other" column
def one_hot_code(df, variable, top_x_labels):
    """
    Add 0/1 indicator columns to the data frame, in place, for selected labels of one column.

    df: data frame to modify in place
    variable: column name (feature name or variable) to encode
    top_x_labels: the 'x' labels that each get their own indicator column; the labels
                  need not be strings. Rows with any other value get 0 in every new column
    return: None. "x" new columns named "<variable>_<label>" are added to the data frame
    """
    for label in top_x_labels:
        # str() allows numeric or other non-string labels in the column name
        df[variable + '_' + str(label)] = np.where(df[variable] == label, 1, 0)

# Fund Number Series / Overall
Should this be treated as a categorical or a continuous variable? The question is whether the difference between, say, overall fund numbers 3 and 4 is the same as the difference between 6 and 7. In the present representation we treat the variable as continuous, so these differences are considered equal. Refer to the end of this document for more exploratory analysis.

In [22]:
# Number of funds per overall fund number, most frequent first
df.Fund_Number_Overall.value_counts().sort_values(ascending=False)

2     291
3     287
4     235
1     199
10    182
5     174
6     136
7      94
8      77
9      44
Name: Fund_Number_Overall, dtype: int64

# Fund_Type
1. Find the number of distinct labels and, if there are blanks, fill them randomly with the other labels <br>
2. One-hot encode the labels

In [23]:
# Number of funds per fund type, most frequent first
df.Fund_Type.value_counts().sort_values(ascending=False)

Buyout         1058
Venture_Cap     661
Name: Fund_Type, dtype: int64

In [24]:
# Fund_Type has only two labels (Buyout, Venture_Cap), so every label gets its own
# indicator column and no "other" column is needed.
fund_type_counts = df["Fund_Type"].value_counts().sort_values(ascending=False)
list_fund_type = fund_type_counts.index.tolist()
one_hot_code(df, "Fund_Type", list_fund_type)

# Local_Currency

In [25]:
# Number of funds per local currency, most frequent first
df.Local_Currency.value_counts().sort_values(ascending=False)

USD    1451
EUR     174
GBP      57
CAD      12
AUD      11
SEK       6
ZAR       3
NZD       2
DKK       2
NOK       1
Name: Local_Currency, dtype: int64

In [26]:
# Local_Currency is reduced to three labels: the two most frequent currencies
# (USD, EUR) plus "other" for everything else.
currency_counts = df["Local_Currency"].value_counts().sort_values(ascending=False)
list_loc_cur = currency_counts.head(2).index.tolist()
one_hot(df, "Local_Currency", list_loc_cur)

# Fund_Status 

In [27]:
# Number of funds per fund status, most frequent first
df.Fund_Status.value_counts().sort_values(ascending=False)

Closed        1221
Liquidated     498
Name: Fund_Status, dtype: int64

In [28]:
# Fund_Status has only two labels (Closed, Liquidated); head() keeps at most five,
# so both are encoded and no "other" column is created.
fund_status_counts = df["Fund_Status"].value_counts().sort_values(ascending=False)
list_fund_status = fund_status_counts.head().index.tolist()
one_hot_code(df, "Fund_Status", list_fund_status)

# Fund_Focus

In [29]:
# Number of funds per fund focus, most frequent first
df.Fund_Focus.value_counts().sort_values(ascending=False)

US                      1316
Europe                   286
Asia                      59
Middle East & Israel      20
Americas                  19
Australasia               14
Africa                     3
Diversified Multi-Re       2
Name: Fund_Focus, dtype: int64

In [30]:
# Fund_Focus is reduced to four labels: the three most frequent regions
# (US, Europe, Asia) plus "other" for the rest.
fund_focus_counts = df["Fund_Focus"].value_counts().sort_values(ascending=False)
list_fund_focus = fund_focus_counts.head(3).index.tolist()
one_hot(df, "Fund_Focus", list_fund_focus)

# Geographic_Scope

In [31]:
# Number of funds per geographic scope, most frequent first
df.Geographic_Scope.value_counts().sort_values(ascending=False)

Country-Specific     774
Multi-Continental    381
Continental          368
Regional             156
US Regional           40
Name: Geographic_Scope, dtype: int64

In [32]:
# Collapse the five geographic scopes into two groups: continental and
# multi-continental funds become "Diversified", all narrower scopes become "Others".
geo_scope_groups = {
    "Multi-Continental": "Diversified",
    "Continental": "Diversified",
    "Country-Specific": "Others",
    "Regional": "Others",
    "US Regional": "Others",
}
df = df.replace({"Geographic_Scope": geo_scope_groups})

In [33]:
# After the regrouping only two labels remain (Diversified, Others); head(3) therefore
# keeps both, and each gets its own indicator column without an extra "other" column.
geo_scope_counts = df["Geographic_Scope"].value_counts().sort_values(ascending=False)
list_geo_scope = geo_scope_counts.head(3).index.tolist()
one_hot_code(df, "Geographic_Scope", list_geo_scope)

# Industry
1. Industry has 1052 distinct labels in total, so one-hot encoding all of them would create 1000+ new features, which would add a lot of noise. <br>
2. It therefore makes sense to keep only the top 5 or top 10 labels, depending on the distribution of counts

In [34]:
# Ten most frequent industry labels
df.Industry.value_counts().sort_values(ascending=False).head(10)

Diversified                                       326
Technology                                         42
Distribution, Consumer Services, Manufacturing     22
IT, Life Sciences                                  16
Healthcare                                         15
Financial Services                                 14
Life Sciences                                      13
Communications, Media                              11
Technology, Software                               11
Software                                           10
Name: Industry, dtype: int64

In [35]:
# "Diversified" leads the chart while the remaining labels are more or less equal in size,
# so Industry is reduced to a binary split: diversified vs. non-diversified (focused) funds.
# A fund's focus also changes with the year; for instance, 2020 was the year of SPACs.
industry_counts = df["Industry"].value_counts().sort_values(ascending=False)
list_industry = industry_counts.head(1).index.tolist()
one_hot(df, "Industry", list_industry)

# FirmCountry 

In [36]:
# Five most frequent firm countries
df.FirmCountry.value_counts().sort_values(ascending=False).head(5)

US             1329
UK              142
Canada           23
France           21
Switzerland      20
Name: FirmCountry, dtype: int64

In [37]:
# Firm_Country has 44 labels, so it would be better if we can segregate countries into regions such as US, Eur or Others
# However, such approach will require mapping of US, and European countries. To do later. For now, I will segregate into
# US,Non-US(Others)
list_firm_country = list(df.FirmCountry.value_counts().sort_values(ascending=False).head(1).index)
one_hot(df, "FirmCountry", list_firm_country)

# Fund_Number_Series

In [38]:
# Number of funds per series fund number, most frequent first
df.Fund_Number_Series.value_counts(sort=True)

2     343
3     318
1     286
4     243
5     172
6     121
7      86
8      62
10     56
9      32
Name: Fund_Number_Series, dtype: int64

This is a proxy for management experience, as it shows the number of funds launched to date.

# Dropping main variables
This drops the main variables, as they are already one-hot encoded.

In [39]:
# Original categorical columns that have been replaced by indicator columns.
# NOTE(review): "Fund_Type" is not in this list although it was encoded as well,
# so it stays in the data set -- confirm this is intended.
encoded_source_columns = [
    "Local_Currency",
    "Fund_Status",
    "Fund_Focus",
    "Geographic_Scope",
    "Industry",
    "FirmCountry",
]
# Indicator columns that are not carried into the final data set.
# NOTE(review): both FirmCountry indicators are removed, so no FirmCountry
# feature remains -- confirm this is intended.
unwanted_indicator_columns = [
    "Local_Currency_other",
    "Fund_Focus_other",
    "Geographic_Scope_Others",
    "Industry_other",
    "FirmCountry_other",
    "FirmCountry_US",
]
df = df.drop(columns=encoded_source_columns + unwanted_indicator_columns)

In [40]:
# Remove every row that still has a missing value in any column
df = df.dropna()

In [41]:
# Save the engineered data set; index=False keeps the row index out of the CSV
df.to_csv("FinalDataSet_VC_BO.csv", index=False)

In [42]:
# (rows, columns) of the final data set
df.shape

(1719, 29)