In [1]:
#import necessary libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn import tree
import graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

# Load data

In [2]:
# importing data and reading data from csv file
data = pd.read_csv('data/investments_VC.csv')
# show dataset
data.head(2)

Unnamed: 0,permalink,name,homepage_url,category_list,market,funding_total_usd,status,country_code,state_code,region,...,secondary_market,product_crowdfunding,round_A,round_B,round_C,round_D,round_E,round_F,round_G,round_H
0,/organization/waywire,#waywire,http://www.waywire.com,|Entertainment|Politics|Social Media|News|,News,1750000,acquired,USA,NY,New York City,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,/organization/tv-communications,&TV Communications,http://enjoyandtv.com,|Games|,Games,4000000,operating,USA,CA,Los Angeles,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Describe dataset

In [3]:
desc = ['Static hyperlink for the startup on Crunchbase\'s website','name of the startup','Website address of the startup',
       'in which category the startups fall','which market the startup caters to','total funding received(in USD)',
        'current operating status','country of origin','state of origin','region','city of origin','total rounds of funding',
        'date of founding','month of founding','quarter of founding','year of founding','date of first funding','date of last funding',
        'seed funding received(in USD)','venture funding received(in USD)','funding received by diluting equity',
        'other undisclosed funding sources','funding received from convertible notes','funding received from debts',
        'funding received from angel investors','funding from grants','funding from private equity',
        'funding from equity dilution after IPO','funding from debts after IPO','funding from secondary markets',
        'funding from crowdfunding','round A funding','round B funding','round C funding','round D funding','round E funding',
       'round F funding', 'round G funding', 'round H funding']
data_details = pd.DataFrame(list(zip(data.columns, desc)), columns =['Column', 'Description'])
data_details

Unnamed: 0,Column,Description
0,permalink,Static hyperlink for the startup on Crunchbase...
1,name,name of the startup
2,homepage_url,Website address of the startup
3,category_list,in which category the startups fall
4,market,which market the startup caters to
5,funding_total_usd,total funding received(in USD)
6,status,current operating status
7,country_code,country of origin
8,state_code,state of origin
9,region,region


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54294 entries, 0 to 54293
Data columns (total 39 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   permalink             49438 non-null  object 
 1   name                  49437 non-null  object 
 2   homepage_url          45989 non-null  object 
 3   category_list         45477 non-null  object 
 4    market               45470 non-null  object 
 5    funding_total_usd    49438 non-null  object 
 6   status                48124 non-null  object 
 7   country_code          44165 non-null  object 
 8   state_code            30161 non-null  object 
 9   region                44165 non-null  object 
 10  city                  43322 non-null  object 
 11  funding_rounds        49438 non-null  float64
 12  founded_at            38554 non-null  object 
 13  founded_month         38482 non-null  object 
 14  founded_quarter       38482 non-null  object 
 15  founded_year       

In [5]:
# market and funcing contain spaces
# rename columns and remove spaces
data.rename(columns={' market ':'market',' funding_total_usd ':'funding_total_usd'},inplace=True)
data.head(2)

Unnamed: 0,permalink,name,homepage_url,category_list,market,funding_total_usd,status,country_code,state_code,region,...,secondary_market,product_crowdfunding,round_A,round_B,round_C,round_D,round_E,round_F,round_G,round_H
0,/organization/waywire,#waywire,http://www.waywire.com,|Entertainment|Politics|Social Media|News|,News,1750000,acquired,USA,NY,New York City,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,/organization/tv-communications,&TV Communications,http://enjoyandtv.com,|Games|,Games,4000000,operating,USA,CA,Los Angeles,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# show all duplicated entries of the dataset
print('Duplicated entries:',data.duplicated().sum())
# check if duplicated entries are NaN
# mean is the evidence, that all entries are NaN (because mean = 1)
data[data.duplicated()].isna().mean()

Duplicated entries: 4855


permalink               1.0
name                    1.0
homepage_url            1.0
category_list           1.0
market                  1.0
funding_total_usd       1.0
status                  1.0
country_code            1.0
state_code              1.0
region                  1.0
city                    1.0
funding_rounds          1.0
founded_at              1.0
founded_month           1.0
founded_quarter         1.0
founded_year            1.0
first_funding_at        1.0
last_funding_at         1.0
seed                    1.0
venture                 1.0
equity_crowdfunding     1.0
undisclosed             1.0
convertible_note        1.0
debt_financing          1.0
angel                   1.0
grant                   1.0
private_equity          1.0
post_ipo_equity         1.0
post_ipo_debt           1.0
secondary_market        1.0
product_crowdfunding    1.0
round_A                 1.0
round_B                 1.0
round_C                 1.0
round_D                 1.0
round_E             

# Preprocess data

In [7]:
# drop all duplicates because we have shown that all columns of the duplicated entries are NaN 
print(data.shape)
data = data.drop_duplicates()
#data.dropna(how='all', inplace=True)
print(data.shape)

(54294, 39)
(49439, 39)


In [8]:
data.isna().sum()

permalink                   1
name                        2
homepage_url             3450
category_list            3962
market                   3969
funding_total_usd           1
status                   1315
country_code             5274
state_code              19278
region                   5274
city                     6117
funding_rounds              1
founded_at              10885
founded_month           10957
founded_quarter         10957
founded_year            10957
first_funding_at            1
last_funding_at             1
seed                        1
venture                     1
equity_crowdfunding         1
undisclosed                 1
convertible_note            1
debt_financing              1
angel                       1
grant                       1
private_equity              1
post_ipo_equity             1
post_ipo_debt               1
secondary_market            1
product_crowdfunding        1
round_A                     1
round_B                     1
round_C   

In [9]:
print(data.shape)
print(data[data['status'].isna()].shape)
# drop all entries where status is NaN
# at the end we want to predict the status, so entries with no status aren't helpful
data = data[data['status'].notna()]
print(data.shape)

(49439, 39)
(1315, 39)
(48124, 39)


In [10]:
data.isna().sum()

permalink                   0
name                        1
homepage_url             3377
category_list            3582
market                   3589
funding_total_usd           0
status                      0
country_code             5067
state_code              18574
region                   5067
city                     5857
funding_rounds              0
founded_at              10488
founded_month           10560
founded_quarter         10560
founded_year            10560
first_funding_at            0
last_funding_at             0
seed                        0
venture                     0
equity_crowdfunding         0
undisclosed                 0
convertible_note            0
debt_financing              0
angel                       0
grant                       0
private_equity              0
post_ipo_equity             0
post_ipo_debt               0
secondary_market            0
product_crowdfunding        0
round_A                     0
round_B                     0
round_C   

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48124 entries, 0 to 49437
Data columns (total 39 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   permalink             48124 non-null  object 
 1   name                  48123 non-null  object 
 2   homepage_url          44747 non-null  object 
 3   category_list         44542 non-null  object 
 4   market                44535 non-null  object 
 5   funding_total_usd     48124 non-null  object 
 6   status                48124 non-null  object 
 7   country_code          43057 non-null  object 
 8   state_code            29550 non-null  object 
 9   region                43057 non-null  object 
 10  city                  42267 non-null  object 
 11  funding_rounds        48124 non-null  float64
 12  founded_at            37636 non-null  object 
 13  founded_month         37564 non-null  object 
 14  founded_quarter       37564 non-null  object 
 15  founded_year       

In [12]:
#Extracting year value from "first_funding_at" and changing to int
data['first_funding_at'] = data.first_funding_at.str.split("-").str[0]
data['first_funding_at'] = data['first_funding_at'].astype(int)
#Extracting year value from "last_funding_at" and changing to int
data['last_funding_at'] = data.last_funding_at.str.split("-").str[0]
data['last_funding_at'] = data['last_funding_at'].astype(int)
#Changing the values in column "funding_total_usd" from string to float
data['funding_total_usd'] = data['funding_total_usd'].str.strip().str.replace(",","")
data['funding_total_usd'] = data['funding_total_usd'].replace("-",0).astype("float")

In [13]:
data.isna().sum()

permalink                   0
name                        1
homepage_url             3377
category_list            3582
market                   3589
funding_total_usd           0
status                      0
country_code             5067
state_code              18574
region                   5067
city                     5857
funding_rounds              0
founded_at              10488
founded_month           10560
founded_quarter         10560
founded_year            10560
first_funding_at            0
last_funding_at             0
seed                        0
venture                     0
equity_crowdfunding         0
undisclosed                 0
convertible_note            0
debt_financing              0
angel                       0
grant                       0
private_equity              0
post_ipo_equity             0
post_ipo_debt               0
secondary_market            0
product_crowdfunding        0
round_A                     0
round_B                     0
round_C   

In [14]:
print(data.shape)
# check if columns are unique
print(f'Permalink: {data.permalink.unique().shape}')
print(f'Name: {data.name.unique().shape}')
print(f'Hompage url: {data.homepage_url.unique().shape}')
print(f'Category: {data.category_list.unique().shape}')
print(f'Market: {data.market.unique().shape}')
print(f'Funding usd: {data.funding_total_usd.unique().shape}')
print(f'Status: {data.status.unique().shape}')
print(f'Country code: {data.country_code.unique().shape}')
print(f'State code: {data.state_code.unique().shape}')
print(f'Region: {data.region.unique().shape}')
print(f'City: {data.city.unique().shape}')
print(f'Funding rounds: {data.funding_rounds.unique().shape}')
print(f'Founded at: {data.founded_at.unique().shape}')
print(f'Founded month: {data.founded_month.unique().shape}')
print(f'Founded quater: {data.founded_quarter.unique().shape}')
print(f'Founded year: {data.founded_year.unique().shape}')
print(f'First funding at: {data.first_funding_at.unique().shape}')
print(f'Last funding at: {data.last_funding_at.unique().shape}')
print(f'Seed: {data.seed.unique().shape}')
print(f'Venture: {data.venture.unique().shape}')
print(f'Equity_crowdfunding: {data.equity_crowdfunding.unique().shape}')
print(f'Undisclosed: {data.undisclosed.unique().shape}')
print(f'Convertible note: {data.convertible_note.unique().shape}')
print(f'Debt financing: {data.debt_financing.unique().shape}')
print(f'Angel: {data.angel.unique().shape}')
print(f'Grant: {data.grant.unique().shape}')
print(f'Private equity: {data.private_equity.unique().shape}')
print(f'Post ipo equity: {data.post_ipo_equity.unique().shape}')
print(f'Post ipo debt: {data.post_ipo_debt.unique().shape}')
print(f'Secondary market: {data.secondary_market.unique().shape}')
print(f'Product crowdfunding: {data.product_crowdfunding.unique().shape}')
print(f'Round A: {data.round_A.unique().shape}')
print(f'Round B: {data.round_B.unique().shape}')
print(f'Round C: {data.round_C.unique().shape}')
print(f'Round D: {data.round_D.unique().shape}')
print(f'Round E: {data.round_E.unique().shape}')
print(f'Round F: {data.round_F.unique().shape}')
print(f'Round G: {data.round_G.unique().shape}')
print(f'Round H: {data.round_H.unique().shape}')

(48124, 39)
Permalink: (48122,)
Name: (48039,)
Hompage url: (44623,)
Category: (16425,)
Market: (753,)
Funding usd: (14293,)
Status: (3,)
Country code: (116,)
State code: (62,)
Region: (1087,)
City: (4128,)
Funding rounds: (17,)
Founded at: (3352,)
Founded month: (419,)
Founded quater: (219,)
Founded year: (104,)
First funding at: (43,)
Last funding at: (40,)
Seed: (3246,)
Venture: (9120,)
Equity_crowdfunding: (249,)
Undisclosed: (676,)
Convertible note: (297,)
Debt financing: (1834,)
Angel: (988,)
Grant: (509,)
Private equity: (816,)
Post ipo equity: (203,)
Post ipo debt: (47,)
Secondary market: (19,)
Product crowdfunding: (171,)
Round A: (1994,)
Round B: (1252,)
Round C: (731,)
Round D: (455,)
Round E: (224,)
Round F: (109,)
Round G: (32,)
Round H: (5,)


In [15]:
data = data.drop(['permalink', 'name', 'homepage_url'], axis=1)
data.head(2)

category_list            3582
market                   3589
funding_total_usd           0
status                      0
country_code             5067
state_code              18574
region                   5067
city                     5857
funding_rounds              0
founded_at              10488
founded_month           10560
founded_quarter         10560
founded_year            10560
first_funding_at            0
last_funding_at             0
seed                        0
venture                     0
equity_crowdfunding         0
undisclosed                 0
convertible_note            0
debt_financing              0
angel                       0
grant                       0
private_equity              0
post_ipo_equity             0
post_ipo_debt               0
secondary_market            0
product_crowdfunding        0
round_A                     0
round_B                     0
round_C                     0
round_D                     0
round_E                     0
round_F   

In [10]:
data.isna().sum()

permalink                   1
name                        2
homepage_url             3450
category_list            3962
market                   3969
funding_total_usd           1
status                   1315
country_code             5274
state_code              19278
region                   5274
city                     6117
funding_rounds              1
founded_at              10885
founded_month           10957
founded_quarter         10957
founded_year            10957
first_funding_at            1
last_funding_at             1
seed                        1
venture                     1
equity_crowdfunding         1
undisclosed                 1
convertible_note            1
debt_financing              1
angel                       1
grant                       1
private_equity              1
post_ipo_equity             1
post_ipo_debt               1
secondary_market            1
product_crowdfunding        1
round_A                     1
round_B                     1
round_C   

In [10]:
reducedData = data.loc[:, ['status']]
reducedData.head()

Unnamed: 0,status
0,acquired
1,operating
2,operating
3,operating
4,operating


In [11]:
y = reducedData.loc[:, ['status']]