In [231]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

import tensorflow as tf
from tensorflow import keras
from keras import layers

import gensim
from gensim.utils import simple_preprocess
from scipy.spatial.distance import cosine as cosine_distance

In [2]:
df = pd.read_csv('YCombinatorStartups.csv', index_col=0)

In [3]:
pd.set_option('display.max_columns', 500)

In [4]:
df.head()

Unnamed: 0,id,name,slug,former_names,small_logo_thumb_url,website,all_locations,long_description,one_liner,team_size,highlight_black,highlight_latinx,highlight_women,industry,subindustry,launched_at,tags,tags_highlighted,top_company,top_company_by_revenue,isHiring,nonprofit,batch,status,industries,regions,stage,app_video_public,demo_day_video_public,app_answers,question_answers,objectID
0,325,Dropbox,dropbox,[],https://bookface-images.s3.amazonaws.com/small...,http://dropbox.com,"San Francisco, CA, USA",Dropbox is building the world’s first smart wo...,Backup and share files in the cloud.,4000.0,False,False,False,B2B,B2B -> Productivity,1326791328,[],[],True,True,False,False,S07,Public,"['B2B', 'Productivity']","['United States of America', 'America / Canada']",Growth,False,False,,False,325
1,379,Reddit,reddit,[],https://bookface-images.s3.amazonaws.com/small...,http://reddit.com,"San Francisco, CA, USA",Founded by Steve Huffman and Alexis Ohanian in...,The frontpage of the internet.,201.0,False,False,False,Consumer,Consumer -> Content,1326791708,"['Community', 'Social Media', 'Social', 'Socia...",[],True,True,False,False,S05,Acquired,"['Consumer', 'Content']","['United States of America', 'America / Canada']",Early,False,False,,False,379
2,383,Twitch,twitch,"['Justin.tv', 'Twitch']",https://bookface-images.s3.amazonaws.com/small...,http://twitch.com,"San Francisco, CA, USA",Twitch is the world’s leading video platform a...,A global community creating the future of live...,2000.0,True,False,False,Consumer,Consumer -> Content,1326791723,"['Community', 'Gaming', 'Social Media', 'Video...",[],True,False,False,False,W07,Acquired,"['Consumer', 'Content']","['United States of America', 'America / Canada...",Growth,False,False,,False,383
3,356,Scribd,scribd,[],https://bookface-images.s3.amazonaws.com/small...,http://scribd.com,"San Francisco, CA, USA",Read and listen without limits. Unlimited* aud...,World's largest online library.,300.0,False,False,False,Consumer,Consumer -> Content,1326791580,[],[],True,True,True,False,S06,Active,"['Consumer', 'Content']","['United States of America', 'America / Canada']",Growth,False,False,,False,356
4,344,Weebly,weebly,[],https://bookface-images.s3.amazonaws.com/small...,http://weebly.com,"San Francisco, CA, USA",Company Information\r\nWeebly is a consumer se...,Build a free website that grows with your busi...,201.0,False,False,False,B2B,B2B -> Marketing,1326791493,[],[],True,False,False,False,W07,Acquired,"['B2B', 'Marketing']","['United States of America', 'America / Canada']",Growth,False,False,,False,344


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4410 entries, 0 to 4409
Data columns (total 32 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      4410 non-null   int64  
 1   name                    4410 non-null   object 
 2   slug                    4410 non-null   object 
 3   former_names            4410 non-null   object 
 4   small_logo_thumb_url    4410 non-null   object 
 5   website                 4388 non-null   object 
 6   all_locations           4253 non-null   object 
 7   long_description        4072 non-null   object 
 8   one_liner               4216 non-null   object 
 9   team_size               4344 non-null   float64
 10  highlight_black         4410 non-null   bool   
 11  highlight_latinx        4410 non-null   bool   
 12  highlight_women         4410 non-null   bool   
 13  industry                4410 non-null   object 
 14  subindustry             4410 non-null   objec

In [6]:
for col in df.columns:
    if df[col].dtype != 'O':
        print(col, 'sum:', df[col].sum(), '\t\taverage:', df[col].mean())

id sum: 61628171 		average: 13974.641950113379
team_size sum: 231161.0 		average: 53.213858195211785
highlight_black sum: 246 		average: 0.055782312925170066
highlight_latinx sum: 452 		average: 0.10249433106575964
highlight_women sum: 642 		average: 0.145578231292517
launched_at sum: 6820780942422 		average: 1546662345.2204082
top_company sum: 340 		average: 0.07709750566893424
top_company_by_revenue sum: 48 		average: 0.010884353741496598
isHiring sum: 947 		average: 0.2147392290249433
nonprofit sum: 43 		average: 0.009750566893424037
app_video_public sum: 51 		average: 0.011564625850340135
demo_day_video_public sum: 111 		average: 0.025170068027210883
question_answers sum: 62 		average: 0.014058956916099773
objectID sum: 61628171 		average: 13974.641950113379


In [7]:
for col in df.columns:
    if df[col].dtype == 'O':
        print(df[col].value_counts(), '\n')

name
Index             3
Apollo            3
Atlas             3
Haven             3
Feather           3
                 ..
Reebeez           1
ApolloShield      1
Yoshi Mobility    1
OMG Digital       1
Stellar           1
Name: count, Length: 4358, dtype: int64 

slug
dropbox                      1
dealls-jobs-and-mentoring    1
nara                         1
powerhouse-ai                1
circular                     1
                            ..
ready-education              1
smartpath                    1
curtsy                       1
airfordable                  1
stellar                      1
Name: count, Length: 4410, dtype: int64 

former_names
[]                                             2549
['Spot']                                          2
['Stack']                                         2
['Kyte']                                          2
['Zelos']                                         2
                                               ... 
['Root Software']   

In [8]:
df.isna().sum()

id                           0
name                         0
slug                         0
former_names                 0
small_logo_thumb_url         0
website                     22
all_locations              157
long_description           338
one_liner                  194
team_size                   66
highlight_black              0
highlight_latinx             0
highlight_women              0
industry                     0
subindustry                  0
launched_at                  0
tags                         0
tags_highlighted             0
top_company                  0
top_company_by_revenue       0
isHiring                     0
nonprofit                    0
batch                        0
status                       0
industries                   0
regions                      0
stage                        0
app_video_public             0
demo_day_video_public        0
app_answers               4298
question_answers             0
objectID                     0
dtype: i

I want to get rid of unneccessary columns or columns overly specific to Y Combinator. I'll drop the name column instead of slug because all the slugs were unique, but I'll probably drop the slugs later on too before modelling. I will create a column of the number of former names instead of the current list of former names. Logo, website, and one liner I'll make into binary columns of has it or not. For nulls in team size column, I will probably set these values to the mean of the column. Locations/regions and industry/subindustry/tags I'll need to do more complicated one-hot-encoding on. Descriptions I need to do word embeddings for. I'll use launched at and batch to get years since founding, but this may be a measure of success instead of a feature to train the model on. Top company and top company by revenue I'll either use as measures for success or drop. Status and stage will be dummied; either or both of these will also be measure of success. 

In [9]:
df.drop(columns=['id', 'name', 'tags_highlighted', 'app_video_public', 'demo_day_video_public', 'app_answers', 'question_answers', 'objectID'], inplace=True)

In [10]:
# creating a new column for number of former names
df['num_former_names'] = df['former_names'].apply(lambda row: 0 if len(row) == 2 else len(row.replace('[', '').replace(']', '').replace("'", "").replace(' ', '').split(',')))

In [11]:
# creating new column for has logo or not
df['has_logo'] = np.where(df['small_logo_thumb_url'] == '/company/thumb/missing.png', False, True)

In [12]:
# creating new column for has website or not
df['has_website'] = np.where(df['website'].isna() | df['website'] == 'http://', False, True)

In [13]:
# creating new column for has one liner or not
df['has_one_liner'] = np.where(df['one_liner'].isna(), False, True)

In [14]:
# check to see that this all worked
df.head()

Unnamed: 0,slug,former_names,small_logo_thumb_url,website,all_locations,long_description,one_liner,team_size,highlight_black,highlight_latinx,highlight_women,industry,subindustry,launched_at,tags,top_company,top_company_by_revenue,isHiring,nonprofit,batch,status,industries,regions,stage,num_former_names,has_logo,has_website,has_one_liner
0,dropbox,[],https://bookface-images.s3.amazonaws.com/small...,http://dropbox.com,"San Francisco, CA, USA",Dropbox is building the world’s first smart wo...,Backup and share files in the cloud.,4000.0,False,False,False,B2B,B2B -> Productivity,1326791328,[],True,True,False,False,S07,Public,"['B2B', 'Productivity']","['United States of America', 'America / Canada']",Growth,0,True,True,True
1,reddit,[],https://bookface-images.s3.amazonaws.com/small...,http://reddit.com,"San Francisco, CA, USA",Founded by Steve Huffman and Alexis Ohanian in...,The frontpage of the internet.,201.0,False,False,False,Consumer,Consumer -> Content,1326791708,"['Community', 'Social Media', 'Social', 'Socia...",True,True,False,False,S05,Acquired,"['Consumer', 'Content']","['United States of America', 'America / Canada']",Early,0,True,True,True
2,twitch,"['Justin.tv', 'Twitch']",https://bookface-images.s3.amazonaws.com/small...,http://twitch.com,"San Francisco, CA, USA",Twitch is the world’s leading video platform a...,A global community creating the future of live...,2000.0,True,False,False,Consumer,Consumer -> Content,1326791723,"['Community', 'Gaming', 'Social Media', 'Video...",True,False,False,False,W07,Acquired,"['Consumer', 'Content']","['United States of America', 'America / Canada...",Growth,2,True,True,True
3,scribd,[],https://bookface-images.s3.amazonaws.com/small...,http://scribd.com,"San Francisco, CA, USA",Read and listen without limits. Unlimited* aud...,World's largest online library.,300.0,False,False,False,Consumer,Consumer -> Content,1326791580,[],True,True,True,False,S06,Active,"['Consumer', 'Content']","['United States of America', 'America / Canada']",Growth,0,True,True,True
4,weebly,[],https://bookface-images.s3.amazonaws.com/small...,http://weebly.com,"San Francisco, CA, USA",Company Information\r\nWeebly is a consumer se...,Build a free website that grows with your busi...,201.0,False,False,False,B2B,B2B -> Marketing,1326791493,[],True,False,False,False,W07,Acquired,"['B2B', 'Marketing']","['United States of America', 'America / Canada']",Growth,0,True,True,True


In [15]:
# 2549 should have 0 former names
df['num_former_names'].value_counts()

num_former_names
0     2549
1     1065
2      439
3      203
4       82
5       42
6       10
7       10
8        5
14       2
10       2
11       1
Name: count, dtype: int64

In [16]:
# 714 should not have a logo, 27 should not have a website, 194 should not have a one liner
df[['has_logo', 'has_website', 'has_one_liner']].sum() - 4410

has_logo        -714
has_website        0
has_one_liner   -194
dtype: int64

In [17]:
# website didn't work, what happened?
df['website'].isna().sum() 

22

In [18]:
(df['website'] == 'http://').sum()

5

In [19]:
(df['website'].isna() | (df['website'] == 'http://')).sum()

27

In [20]:
# try putting parantheses around each clause
df['has_website'] = np.where((df['website'].isna()) | (df['website'] == 'http://'), False, True)

In [21]:
# check again
df['has_website'].sum() - 4410

-27

In [22]:
# set nulls in team size to the mean of the column
for i in range(len(df['team_size'])):
    if np.isnan(df['team_size'][i]):
        df.at[i, 'team_size'] = 53.2

In [23]:
df.isna().sum()

slug                        0
former_names                0
small_logo_thumb_url        0
website                    22
all_locations             157
long_description          338
one_liner                 194
team_size                   0
highlight_black             0
highlight_latinx            0
highlight_women             0
industry                    0
subindustry                 0
launched_at                 0
tags                        0
top_company                 0
top_company_by_revenue      0
isHiring                    0
nonprofit                   0
batch                       0
status                      0
industries                  0
regions                     0
stage                       0
num_former_names            0
has_logo                    0
has_website                 0
has_one_liner               0
dtype: int64

Now I want to figure out how to deal with the location columns. Do I want information from both all_locations and regions?

In [24]:
df['all_locations']

0                                  San Francisco, CA, USA
1                                  San Francisco, CA, USA
2                                  San Francisco, CA, USA
3                                  San Francisco, CA, USA
4                                  San Francisco, CA, USA
                              ...                        
4405                                               Remote
4406    San Francisco, CA, USA; Sunnyvale, CA, USA; Re...
4407                       San Francisco, CA, USA; Remote
4408    College Park, MD, USA; Rockville, MD, USA; Remote
4409                               San Francisco, CA, USA
Name: all_locations, Length: 4410, dtype: object

In [25]:
df['regions']

0        ['United States of America', 'America / Canada']
1        ['United States of America', 'America / Canada']
2       ['United States of America', 'America / Canada...
3        ['United States of America', 'America / Canada']
4        ['United States of America', 'America / Canada']
                              ...                        
4405                           ['Remote', 'Fully Remote']
4406    ['United States of America', 'America / Canada...
4407    ['United States of America', 'America / Canada...
4408    ['United States of America', 'America / Canada...
4409    ['United States of America', 'America / Canada...
Name: regions, Length: 4410, dtype: object

all_locations has cities, states, and countries; regions has only countries. Ideally I'd like to get the cities out of all_locations and the countries out of regions. I'll get the cities first.

In [26]:
df['all_locations'][0].split(';')

['San Francisco, CA, USA']

In [27]:
type(df['all_locations'][0].split(';'))

list

In [28]:
df['location_lists'] = df['all_locations'].apply(lambda row: row.split(';') if type(row)==str else [])

In [29]:
df.tail()

Unnamed: 0,slug,former_names,small_logo_thumb_url,website,all_locations,long_description,one_liner,team_size,highlight_black,highlight_latinx,highlight_women,industry,subindustry,launched_at,tags,top_company,top_company_by_revenue,isHiring,nonprofit,batch,status,industries,regions,stage,num_former_names,has_logo,has_website,has_one_liner,location_lists
4405,emerge-career,"['Ameelio', 'Ameelio Emerge', 'Emerge']",https://bookface-images.s3.amazonaws.com/small...,https://www.emergecareer.com/,Remote,Emerge sells online job training to the govern...,Selling online job training to the government,3.0,True,True,False,Education,Education,1657112954,"['Education', 'GovTech']",False,False,False,False,S22,Active,['Education'],"['Remote', 'Fully Remote']",Early,3,True,True,True,[Remote]
4406,opensight,['FastFile'],https://bookface-images.s3.amazonaws.com/small...,https://www.opensight.ai,"San Francisco, CA, USA; Sunnyvale, CA, USA; Re...",OpenSight helps companies quickly scale up the...,AI-powered customer support automation for fas...,3.0,False,False,True,B2B,B2B -> Operations,1673668209,"['Generative AI', 'B2B', 'Customer Success', '...",False,False,False,False,W23,Inactive,"['B2B', 'Operations']","['United States of America', 'America / Canada...",Early,1,True,True,True,"[San Francisco, CA, USA, Sunnyvale, CA, USA, ..."
4407,mogara,[],https://bookface-images.s3.amazonaws.com/small...,https://mogara.com,"San Francisco, CA, USA; Remote",Mogara automates software R&D capitalization.\...,Automatic software R&D capitalization,2.0,False,False,True,B2B,B2B -> Finance and Accounting,1674525873,"['Finance', 'B2B']",False,False,False,False,W23,Inactive,"['B2B', 'Finance and Accounting']","['United States of America', 'America / Canada...",Early,0,True,True,True,"[San Francisco, CA, USA, Remote]"
4408,manatee,"['Quandry', 'PreStacks']",https://bookface-images.s3.amazonaws.com/small...,https://www.trymanatee.com/?utm_source=bookface,"College Park, MD, USA; Rockville, MD, USA; Remote",We let you see everything about how prospects ...,DocSend for demos.,2.0,False,False,False,B2B,B2B -> Sales,1657678584,"['B2B', 'Sales']",False,False,False,False,S22,Inactive,"['B2B', 'Sales']","['United States of America', 'America / Canada...",Early,2,True,True,True,"[College Park, MD, USA, Rockville, MD, USA, ..."
4409,stellar,[],https://bookface-images.s3.amazonaws.com/small...,http://www.stellarapp.io,"San Francisco, CA, USA",A fool-proof goals platform using GPT to help ...,AI-powered business goals,3.0,False,False,True,B2B,B2B,1662916743,"['SaaS', 'B2B', 'Productivity', 'Analytics']",False,False,False,False,S22,Inactive,['B2B'],"['United States of America', 'America / Canada...",Early,0,True,True,True,"[San Francisco, CA, USA]"


In [30]:
df.explode('location_lists')['location_lists'].value_counts()

location_lists
San Francisco, CA, USA             1352
 Remote                            1110
New York, NY, USA                   407
 San Francisco, CA, USA             174
London, England, United Kingdom     117
                                   ... 
 Fort Worth, TX, USA                  1
Algeria                               1
 Charleston, SC, USA                  1
Kiel, SH, Germany                     1
 Rockville, MD, USA                   1
Name: count, Length: 621, dtype: int64

In [31]:
# exploding on location_lists yields 621 unique locations but there are leading whitespaces in a lot of them. How many locations are there really?
locations = []

for loc_list in df['location_lists']:
    for loc in loc_list:
        locations.append(loc.lstrip())
        
len(set(locations))

475

In [32]:
# looks like I need to remove these whitespaces
df['location_lists'] = df['location_lists'].apply(lambda row: [x.lstrip() for x in row]) 

In [33]:
df['location_lists'][4406]

['San Francisco, CA, USA', 'Sunnyvale, CA, USA', 'Remote']

In [34]:
# check
df.explode('location_lists')['location_lists'].value_counts()

location_lists
San Francisco, CA, USA             1526
Remote                             1155
New York, NY, USA                   480
London, England, United Kingdom     164
Bengaluru, KA, India                136
                                   ... 
Des Moines, IA, USA                   1
SC, Brazil                            1
Hanoi, Vietnam                        1
Aarhus, Denmark                       1
Rockville, MD, USA                    1
Name: count, Length: 475, dtype: int64

In [35]:
# that worked, now how many cities do I want to dummy?
df.explode('location_lists')['location_lists'].value_counts(normalize=True)[:10].sum()

0.6305661577608141

In [36]:
# the top 10 cities account for 63% of all mentioned cities, how many are there that are mentioned at least 1% of the time
(df.explode('location_lists')['location_lists'].value_counts(normalize=True) >= .01).sum()

14

In [37]:
df.explode('location_lists')['location_lists'].value_counts(normalize=True)[:14]

location_lists
San Francisco, CA, USA             0.242684
Remote                             0.183683
New York, NY, USA                  0.076336
London, England, United Kingdom    0.026081
Bengaluru, KA, India               0.021628
Los Angeles, CA, USA               0.021310
Palo Alto, CA, USA                 0.017653
Mountain View, CA, USA             0.014631
Toronto, ON, Canada                0.014472
Boston, MA, USA                    0.012087
Seattle, WA, USA                   0.011609
Mexico City, CDMX, Mexico          0.011609
Paris, Île-de-France, France       0.010814
Austin, TX, USA                    0.010496
Name: proportion, dtype: float64

I will make columns for these 14 cities and another for other cities, so 15 total city columns.

In [38]:
# first get list of these cities
list(df.explode('location_lists')['location_lists'].value_counts()[:14].index)

['San Francisco, CA, USA',
 'Remote',
 'New York, NY, USA',
 'London, England, United Kingdom',
 'Bengaluru, KA, India',
 'Los Angeles, CA, USA',
 'Palo Alto, CA, USA',
 'Mountain View, CA, USA',
 'Toronto, ON, Canada',
 'Boston, MA, USA',
 'Seattle, WA, USA',
 'Mexico City, CDMX, Mexico',
 'Paris, Île-de-France, France',
 'Austin, TX, USA']

In [39]:
# creating the dummy columns and populating them based on the location_lists column 
top_cities = list(df.explode('location_lists')['location_lists'].value_counts()[:14].index)
top_cities.sort()
for city in top_cities:
    df[city] = df.apply(lambda row: 1 if city in row['location_lists'] else 0, axis=1)

In [40]:
# the other_city column gets populated for any row that does not already have a top city column checked off
df['other_city'] = df.apply(lambda row: 1 if len(np.setdiff1d(row['location_lists'], top_cities)) == len(row['location_lists']) else 0, axis=1)

In [41]:
df.tail()

Unnamed: 0,slug,former_names,small_logo_thumb_url,website,all_locations,long_description,one_liner,team_size,highlight_black,highlight_latinx,highlight_women,industry,subindustry,launched_at,tags,top_company,top_company_by_revenue,isHiring,nonprofit,batch,status,industries,regions,stage,num_former_names,has_logo,has_website,has_one_liner,location_lists,"Austin, TX, USA","Bengaluru, KA, India","Boston, MA, USA","London, England, United Kingdom","Los Angeles, CA, USA","Mexico City, CDMX, Mexico","Mountain View, CA, USA","New York, NY, USA","Palo Alto, CA, USA","Paris, Île-de-France, France",Remote,"San Francisco, CA, USA","Seattle, WA, USA","Toronto, ON, Canada",other_city
4405,emerge-career,"['Ameelio', 'Ameelio Emerge', 'Emerge']",https://bookface-images.s3.amazonaws.com/small...,https://www.emergecareer.com/,Remote,Emerge sells online job training to the govern...,Selling online job training to the government,3.0,True,True,False,Education,Education,1657112954,"['Education', 'GovTech']",False,False,False,False,S22,Active,['Education'],"['Remote', 'Fully Remote']",Early,3,True,True,True,[Remote],0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4406,opensight,['FastFile'],https://bookface-images.s3.amazonaws.com/small...,https://www.opensight.ai,"San Francisco, CA, USA; Sunnyvale, CA, USA; Re...",OpenSight helps companies quickly scale up the...,AI-powered customer support automation for fas...,3.0,False,False,True,B2B,B2B -> Operations,1673668209,"['Generative AI', 'B2B', 'Customer Success', '...",False,False,False,False,W23,Inactive,"['B2B', 'Operations']","['United States of America', 'America / Canada...",Early,1,True,True,True,"[San Francisco, CA, USA, Sunnyvale, CA, USA, R...",0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
4407,mogara,[],https://bookface-images.s3.amazonaws.com/small...,https://mogara.com,"San Francisco, CA, USA; Remote",Mogara automates software R&D capitalization.\...,Automatic software R&D capitalization,2.0,False,False,True,B2B,B2B -> Finance and Accounting,1674525873,"['Finance', 'B2B']",False,False,False,False,W23,Inactive,"['B2B', 'Finance and Accounting']","['United States of America', 'America / Canada...",Early,0,True,True,True,"[San Francisco, CA, USA, Remote]",0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
4408,manatee,"['Quandry', 'PreStacks']",https://bookface-images.s3.amazonaws.com/small...,https://www.trymanatee.com/?utm_source=bookface,"College Park, MD, USA; Rockville, MD, USA; Remote",We let you see everything about how prospects ...,DocSend for demos.,2.0,False,False,False,B2B,B2B -> Sales,1657678584,"['B2B', 'Sales']",False,False,False,False,S22,Inactive,"['B2B', 'Sales']","['United States of America', 'America / Canada...",Early,2,True,True,True,"[College Park, MD, USA, Rockville, MD, USA, Re...",0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4409,stellar,[],https://bookface-images.s3.amazonaws.com/small...,http://www.stellarapp.io,"San Francisco, CA, USA",A fool-proof goals platform using GPT to help ...,AI-powered business goals,3.0,False,False,True,B2B,B2B,1662916743,"['SaaS', 'B2B', 'Productivity', 'Analytics']",False,False,False,False,S22,Inactive,['B2B'],"['United States of America', 'America / Canada...",Early,0,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [42]:
# the other column didn't populate correctly, what happened?
np.setdiff1d(df['location_lists'][4408], top_cities)

array(['College Park, MD, USA', 'Rockville, MD, USA'], dtype='<U21')

In [43]:
len(np.setdiff1d(df['location_lists'][4408], top_cities))

2

In [44]:
df['location_lists'][4408]

['College Park, MD, USA', 'Rockville, MD, USA', 'Remote']

In [45]:
len(df['location_lists'][4408])

3

In [46]:
# the logic above wasn't correct, try this instead
df['other_city'] = df.apply(lambda row: 1 if len(np.setdiff1d(row['location_lists'], top_cities)) > 0 else 0, axis=1)

In [47]:
df.tail()

Unnamed: 0,slug,former_names,small_logo_thumb_url,website,all_locations,long_description,one_liner,team_size,highlight_black,highlight_latinx,highlight_women,industry,subindustry,launched_at,tags,top_company,top_company_by_revenue,isHiring,nonprofit,batch,status,industries,regions,stage,num_former_names,has_logo,has_website,has_one_liner,location_lists,"Austin, TX, USA","Bengaluru, KA, India","Boston, MA, USA","London, England, United Kingdom","Los Angeles, CA, USA","Mexico City, CDMX, Mexico","Mountain View, CA, USA","New York, NY, USA","Palo Alto, CA, USA","Paris, Île-de-France, France",Remote,"San Francisco, CA, USA","Seattle, WA, USA","Toronto, ON, Canada",other_city
4405,emerge-career,"['Ameelio', 'Ameelio Emerge', 'Emerge']",https://bookface-images.s3.amazonaws.com/small...,https://www.emergecareer.com/,Remote,Emerge sells online job training to the govern...,Selling online job training to the government,3.0,True,True,False,Education,Education,1657112954,"['Education', 'GovTech']",False,False,False,False,S22,Active,['Education'],"['Remote', 'Fully Remote']",Early,3,True,True,True,[Remote],0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4406,opensight,['FastFile'],https://bookface-images.s3.amazonaws.com/small...,https://www.opensight.ai,"San Francisco, CA, USA; Sunnyvale, CA, USA; Re...",OpenSight helps companies quickly scale up the...,AI-powered customer support automation for fas...,3.0,False,False,True,B2B,B2B -> Operations,1673668209,"['Generative AI', 'B2B', 'Customer Success', '...",False,False,False,False,W23,Inactive,"['B2B', 'Operations']","['United States of America', 'America / Canada...",Early,1,True,True,True,"[San Francisco, CA, USA, Sunnyvale, CA, USA, R...",0,0,0,0,0,0,0,0,0,0,1,1,0,0,1
4407,mogara,[],https://bookface-images.s3.amazonaws.com/small...,https://mogara.com,"San Francisco, CA, USA; Remote",Mogara automates software R&D capitalization.\...,Automatic software R&D capitalization,2.0,False,False,True,B2B,B2B -> Finance and Accounting,1674525873,"['Finance', 'B2B']",False,False,False,False,W23,Inactive,"['B2B', 'Finance and Accounting']","['United States of America', 'America / Canada...",Early,0,True,True,True,"[San Francisco, CA, USA, Remote]",0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
4408,manatee,"['Quandry', 'PreStacks']",https://bookface-images.s3.amazonaws.com/small...,https://www.trymanatee.com/?utm_source=bookface,"College Park, MD, USA; Rockville, MD, USA; Remote",We let you see everything about how prospects ...,DocSend for demos.,2.0,False,False,False,B2B,B2B -> Sales,1657678584,"['B2B', 'Sales']",False,False,False,False,S22,Inactive,"['B2B', 'Sales']","['United States of America', 'America / Canada...",Early,2,True,True,True,"[College Park, MD, USA, Rockville, MD, USA, Re...",0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
4409,stellar,[],https://bookface-images.s3.amazonaws.com/small...,http://www.stellarapp.io,"San Francisco, CA, USA",A fool-proof goals platform using GPT to help ...,AI-powered business goals,3.0,False,False,True,B2B,B2B,1662916743,"['SaaS', 'B2B', 'Productivity', 'Analytics']",False,False,False,False,S22,Inactive,['B2B'],"['United States of America', 'America / Canada...",Early,0,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


Now do the same for the regions column.

In [48]:
df['regions']

0        ['United States of America', 'America / Canada']
1        ['United States of America', 'America / Canada']
2       ['United States of America', 'America / Canada...
3        ['United States of America', 'America / Canada']
4        ['United States of America', 'America / Canada']
                              ...                        
4405                           ['Remote', 'Fully Remote']
4406    ['United States of America', 'America / Canada...
4407    ['United States of America', 'America / Canada...
4408    ['United States of America', 'America / Canada...
4409    ['United States of America', 'America / Canada...
Name: regions, Length: 4410, dtype: object

In [49]:
df['regions'][0].replace('[', '').replace(']', '').replace("'", "").split(',')

['United States of America', ' America / Canada']

In [50]:
type(df['regions'][0].replace('[', '').replace(']', '').replace("'", "").split(','))

list

In [51]:
df['region_lists'] = df['regions'].apply(lambda row: row.replace('[', '').replace(']', '').replace("'", "").split(',') if type(row)==str else [])

In [52]:
df.head()

Unnamed: 0,slug,former_names,small_logo_thumb_url,website,all_locations,long_description,one_liner,team_size,highlight_black,highlight_latinx,highlight_women,industry,subindustry,launched_at,tags,top_company,top_company_by_revenue,isHiring,nonprofit,batch,status,industries,regions,stage,num_former_names,has_logo,has_website,has_one_liner,location_lists,"Austin, TX, USA","Bengaluru, KA, India","Boston, MA, USA","London, England, United Kingdom","Los Angeles, CA, USA","Mexico City, CDMX, Mexico","Mountain View, CA, USA","New York, NY, USA","Palo Alto, CA, USA","Paris, Île-de-France, France",Remote,"San Francisco, CA, USA","Seattle, WA, USA","Toronto, ON, Canada",other_city,region_lists
0,dropbox,[],https://bookface-images.s3.amazonaws.com/small...,http://dropbox.com,"San Francisco, CA, USA",Dropbox is building the world’s first smart wo...,Backup and share files in the cloud.,4000.0,False,False,False,B2B,B2B -> Productivity,1326791328,[],True,True,False,False,S07,Public,"['B2B', 'Productivity']","['United States of America', 'America / Canada']",Growth,0,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada]"
1,reddit,[],https://bookface-images.s3.amazonaws.com/small...,http://reddit.com,"San Francisco, CA, USA",Founded by Steve Huffman and Alexis Ohanian in...,The frontpage of the internet.,201.0,False,False,False,Consumer,Consumer -> Content,1326791708,"['Community', 'Social Media', 'Social', 'Socia...",True,True,False,False,S05,Acquired,"['Consumer', 'Content']","['United States of America', 'America / Canada']",Early,0,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada]"
2,twitch,"['Justin.tv', 'Twitch']",https://bookface-images.s3.amazonaws.com/small...,http://twitch.com,"San Francisco, CA, USA",Twitch is the world’s leading video platform a...,A global community creating the future of live...,2000.0,True,False,False,Consumer,Consumer -> Content,1326791723,"['Community', 'Gaming', 'Social Media', 'Video...",True,False,False,False,W07,Acquired,"['Consumer', 'Content']","['United States of America', 'America / Canada...",Growth,2,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada, ..."
3,scribd,[],https://bookface-images.s3.amazonaws.com/small...,http://scribd.com,"San Francisco, CA, USA",Read and listen without limits. Unlimited* aud...,World's largest online library.,300.0,False,False,False,Consumer,Consumer -> Content,1326791580,[],True,True,True,False,S06,Active,"['Consumer', 'Content']","['United States of America', 'America / Canada']",Growth,0,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada]"
4,weebly,[],https://bookface-images.s3.amazonaws.com/small...,http://weebly.com,"San Francisco, CA, USA",Company Information\r\nWeebly is a consumer se...,Build a free website that grows with your busi...,201.0,False,False,False,B2B,B2B -> Marketing,1326791493,[],True,False,False,False,W07,Acquired,"['B2B', 'Marketing']","['United States of America', 'America / Canada']",Growth,0,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada]"


In [53]:
df.explode('region_lists')['region_lists'].value_counts()

region_lists
 America / Canada           3233
United States of America    2988
 Remote                     2655
 Partly Remote              1572
 Fully Remote               1155
                            ... 
Ivory Coast                    1
Lithuania                      1
Croatia                        1
Belgium                        1
Japan                          1
Name: count, Length: 155, dtype: int64

In [54]:
# exploding on region_lists yields 155 unique regions but there are leading whitespaces in a lot of them. How many regions are there really?
regions = []

for reg_list in df['region_lists']:
    for reg in reg_list:
        regions.append(reg.lstrip())
        
len(set(regions))

103

In [55]:
# looks like I need to remove these whitespaces
df['region_lists'] = df['region_lists'].apply(lambda row: [x.lstrip() for x in row]) 

In [56]:
df['region_lists'][4406]

['United States of America', 'America / Canada', 'Remote', 'Fully Remote']

In [57]:
# check
df.explode('region_lists')['region_lists'].value_counts()

region_lists
America / Canada            3233
United States of America    3097
Remote                      2727
Partly Remote               1572
Fully Remote                1155
                            ... 
Latvia                         1
Bangladesh                     1
Nepal                          1
Thailand                       1
Benin                          1
Name: count, Length: 103, dtype: int64

In [58]:
# that worked, now how many countries/regions do I want to dummy?
df.explode('region_lists')['region_lists'].value_counts(normalize=True)[:10].sum()

0.892147523135547

In [59]:
# the top 10 regions account for 89% of all mentioned regions, how many are there that are mentioned at least 1% of the time
(df.explode('region_lists')['region_lists'].value_counts(normalize=True) >= .01).sum()

11

In [60]:
df.explode('region_lists')['region_lists'].value_counts(normalize=True)[:11]

region_lists
America / Canada            0.219992
United States of America    0.210738
Remote                      0.185561
Partly Remote               0.106968
Fully Remote                0.078593
Europe                      0.028987
South Asia                  0.016739
Latin America               0.015991
India                       0.015923
United Kingdom              0.012657
Canada                      0.011159
Name: proportion, dtype: float64

I will make columns for these 11 regions and another for other regions, so 12 total region columns. I expect that I will have to drop some of these, specifically the remote columns, because they will probably be collinear with each other or the remote city column.

In [61]:
# first get list of these cities
list(df.explode('region_lists')['region_lists'].value_counts()[:11].index)

['America / Canada',
 'United States of America',
 'Remote',
 'Partly Remote',
 'Fully Remote',
 'Europe',
 'South Asia',
 'Latin America',
 'India',
 'United Kingdom',
 'Canada']

In [62]:
# creating the dummy columns and populating them based on the region_lists column 
top_regions = list(df.explode('region_lists')['region_lists'].value_counts()[:11].index)
top_regions.sort()
for region in top_regions:
    df[region + ' - region'] = df.apply(lambda row: 1 if region in row['region_lists'] else 0, axis=1)

In [63]:
# the other_region column gets populated for any row that does not already have a top region column checked off
df['other_region'] = df.apply(lambda row: 1 if len(np.setdiff1d(row['region_lists'], top_regions)) > 0 else 0, axis=1)

In [64]:
# check
df[df['other_region'] == 1]

Unnamed: 0,slug,former_names,small_logo_thumb_url,website,all_locations,long_description,one_liner,team_size,highlight_black,highlight_latinx,highlight_women,industry,subindustry,launched_at,tags,top_company,top_company_by_revenue,isHiring,nonprofit,batch,status,industries,regions,stage,num_former_names,has_logo,has_website,has_one_liner,location_lists,"Austin, TX, USA","Bengaluru, KA, India","Boston, MA, USA","London, England, United Kingdom","Los Angeles, CA, USA","Mexico City, CDMX, Mexico","Mountain View, CA, USA","New York, NY, USA","Palo Alto, CA, USA","Paris, Île-de-France, France",Remote,"San Francisco, CA, USA","Seattle, WA, USA","Toronto, ON, Canada",other_city,region_lists,America / Canada - region,Canada - region,Europe - region,Fully Remote - region,India - region,Latin America - region,Partly Remote - region,Remote - region,South Asia - region,United Kingdom - region,United States of America - region,other_region
22,anywhere-fm,[],/company/thumb/missing.png,http://anywhere.fm,,Anywhere.fm lets you upload your music collect...,,11.0,False,False,False,Consumer,Consumer -> Content,1326791428,[],False,False,False,False,S07,Acquired,"['Consumer', 'Content']",['Unspecified'],Early,0,False,True,False,[],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,[Unspecified],0,0,0,0,0,0,0,0,0,0,0,1
33,snipshot,[],https://bookface-images.s3.amazonaws.com/small...,https://snipshot.com,,Snipshot is a photo editor that lets you start...,We sold Snipshot to Ansa in 2013.,0.0,False,False,False,Consumer,Consumer -> Content,1326791640,['Media'],False,False,False,False,W06,Inactive,"['Consumer', 'Content']",['Unspecified'],Growth,0,True,True,True,[],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,[Unspecified],0,0,0,0,0,0,0,0,0,0,0,1
37,pollground,[],/company/thumb/missing.png,http://pollground.com,,A social polling site. Pollground allows you t...,Online polls.,0.0,False,False,False,Consumer,Consumer -> Social,1326791614,['Market Research'],False,False,False,False,S06,Inactive,"['Consumer', 'Social']",['Unspecified'],Early,0,False,True,True,[],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,[Unspecified],0,0,0,0,0,0,0,0,0,0,0,1
39,talkito,[],/company/thumb/missing.png,http://talkito.com,,,,0.0,False,False,False,Consumer,Consumer -> Social,1326791607,[],False,False,False,False,S06,Inactive,"['Consumer', 'Social']",['Unspecified'],Early,0,False,True,False,[],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,[Unspecified],0,0,0,0,0,0,0,0,0,0,0,1
40,thinkature,[],/company/thumb/missing.png,http://thinkature.com,,,,0.0,False,False,False,B2B,B2B -> Productivity,1326791603,[],False,False,False,False,S06,Inactive,"['B2B', 'Productivity']",['Unspecified'],Early,0,False,True,False,[],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,[Unspecified],0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4359,blitz,[],https://bookface-images.s3.amazonaws.com/small...,https://www.blitznocode.com/,"Paris, Île-de-France, France",Blitz is a no-code platform to build internal ...,"Build apps, automate tasks, and scale operatio...",4.0,False,True,False,B2B,B2B -> Operations,1656995564,"['SaaS', 'B2B', 'No-code']",False,False,False,False,S22,Active,"['B2B', 'Operations']","['France', 'Europe', 'Remote', 'Partly Remote']",Early,0,True,True,True,"[Paris, Île-de-France, France]",0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,"[France, Europe, Remote, Partly Remote]",0,0,1,0,0,0,1,1,0,0,0,1
4360,erad,[],https://bookface-images.s3.amazonaws.com/small...,https://erad.co/,"Riyadh, Riyadh Province, Saudi Arabia",erad provides startups in the Middle East with...,Non-dilutive funding and payments in the Middl...,5.0,False,False,False,Fintech,Fintech,1653900418,"['Fintech', 'Payments', 'B2B']",False,False,True,False,S22,Active,['Fintech'],"['Saudi Arabia', 'Middle East and North Africa...",Early,0,True,True,True,"[Riyadh, Riyadh Province, Saudi Arabia]",0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,"[Saudi Arabia, Middle East and North Africa, R...",0,0,0,0,0,0,1,1,0,0,0,1
4365,tailor,[],https://bookface-images.s3.amazonaws.com/small...,https://www.tailor.tech/,"Tokyo, Tokyo, Japan",Tailor is a backend platform that helps enterp...,Highly Customizable Headless ERP,16.0,False,False,False,B2B,B2B,1654046243,"['Developer Tools', 'SaaS', 'B2B', 'API', 'Ent...",False,False,True,False,S22,Active,['B2B'],"['Japan', 'East Asia', 'Remote', 'Partly Remote']",Early,0,True,True,True,"[Tokyo, Tokyo, Japan]",0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,"[Japan, East Asia, Remote, Partly Remote]",0,0,0,0,0,0,1,1,0,0,0,1
4370,stackup,[],https://bookface-images.s3.amazonaws.com/small...,https://www.stackup.sh,"Los Angeles, CA, USA; Melbourne, VIC, Australi...",Stackup is a suite of open-source developer to...,Stackup is a platform for building user-friend...,3.0,False,False,False,B2B,"B2B -> Engineering, Product and Design",1653642506,"['Developer Tools', 'SaaS', 'Crypto / Web3', '...",False,False,True,False,S22,Active,"['B2B', 'Engineering, Product and Design']","['United States of America', 'Australia', 'Ame...",Early,0,True,True,True,"[Los Angeles, CA, USA, Melbourne, VIC, Austral...",0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,"[United States of America, Australia, America ...",1,0,0,1,0,0,0,1,0,0,1,1


In [65]:
# the column populated correctly, but it looks like there were regions listed as Unspecified, how many?
(df.explode('region_lists')['region_lists'] == 'Unspecified').sum()

131

In [66]:
# how many instances were there of the least common region I included?
(df.explode('region_lists')['region_lists'] == 'Canada').sum()

164

In [67]:
# I will make a column for Unspecified, redo what I did before to achieve this
top_regions = list(df.explode('region_lists')['region_lists'].value_counts()[:11].index) + ['Unspecified']
top_regions.sort()
for region in top_regions:
    df[region + ' - region'] = df.apply(lambda row: 1 if region in row['region_lists'] else 0, axis=1)

In [68]:
df['other_region'] = df.apply(lambda row: 1 if len(np.setdiff1d(row['region_lists'], top_regions)) > 0 else 0, axis=1)

In [69]:
# check
df[df['other_region'] == 1]

Unnamed: 0,slug,former_names,small_logo_thumb_url,website,all_locations,long_description,one_liner,team_size,highlight_black,highlight_latinx,highlight_women,industry,subindustry,launched_at,tags,top_company,top_company_by_revenue,isHiring,nonprofit,batch,status,industries,regions,stage,num_former_names,has_logo,has_website,has_one_liner,location_lists,"Austin, TX, USA","Bengaluru, KA, India","Boston, MA, USA","London, England, United Kingdom","Los Angeles, CA, USA","Mexico City, CDMX, Mexico","Mountain View, CA, USA","New York, NY, USA","Palo Alto, CA, USA","Paris, Île-de-France, France",Remote,"San Francisco, CA, USA","Seattle, WA, USA","Toronto, ON, Canada",other_city,region_lists,America / Canada - region,Canada - region,Europe - region,Fully Remote - region,India - region,Latin America - region,Partly Remote - region,Remote - region,South Asia - region,United Kingdom - region,United States of America - region,other_region,Unspecified - region
77,benchling,[],https://bookface-images.s3.amazonaws.com/small...,http://benchling.com,"San Francisco, CA, USA; Boston, MA, USA; Züric...","Biotechnology is rewriting life as we know it,...",Unlocking the power of biotech with modern sof...,850.0,False,False,False,B2B,B2B,1336096997,"['SaaS', 'B2B', 'Biotech']",True,True,True,False,S12,Active,['B2B'],"['United States of America', 'Switzerland', 'U...",Growth,0,True,True,True,"[San Francisco, CA, USA, Boston, MA, USA, Züri...",0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,"[United States of America, Switzerland, United...",1,0,1,0,0,0,1,1,0,1,1,1,0
79,rappi,[],https://bookface-images.s3.amazonaws.com/small...,http://www.rappi.com,"Bogotá, Bogota, Colombia","Rappi is a mega high growth, Series B, consume...",On-demand delivery and financial services for ...,5700.0,False,True,False,Consumer,Consumer -> Food and Beverage,1453256409,"['Fintech', 'Delivery', 'Latin America']",True,True,True,False,W16,Active,"['Consumer', 'Food and Beverage']","['Colombia', 'Latin America']",Growth,0,True,True,True,"[Bogotá, Bogota, Colombia]",0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,"[Colombia, Latin America]",0,0,0,0,0,1,0,0,0,0,0,1,0
90,xendit,[],https://bookface-images.s3.amazonaws.com/small...,https://www.xendit.co/,"Jakarta, Jakarta, Indonesia",Xendit provides payment solutions that simplif...,Provides payment infrastructure for Southeast ...,700.0,False,False,True,Fintech,Fintech -> Payments,1430156131,"['Fintech', 'Payments']",True,False,True,False,S15,Active,"['Fintech', 'Payments']","['Indonesia', 'Southeast Asia', 'Remote', 'Par...",Growth,0,True,True,True,"[Jakarta, Jakarta, Indonesia]",0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,"[Indonesia, Southeast Asia, Remote, Partly Rem...",0,0,0,0,0,0,1,1,0,0,0,1,0
95,algolia,[],https://bookface-images.s3.amazonaws.com/small...,http://www.algolia.com,"San Francisco, CA, USA; Paris, Île-de-France, ...",Our mission is to make every search interactio...,A developer-friendly and enterprise-grade sear...,810.0,False,False,False,B2B,"B2B -> Engineering, Product and Design",1384980277,"['Developer Tools', 'SaaS', 'B2B']",True,True,True,False,W14,Active,"['B2B', 'Engineering, Product and Design']","['United States of America', 'France', 'Americ...",Growth,0,True,True,True,"[San Francisco, CA, USA, Paris, Île-de-France,...",0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,"[United States of America, France, America / C...",1,0,1,0,0,0,1,1,0,0,1,1,0
101,wave,"['Wave', 'Wave Mobile Money']",https://bookface-images.s3.amazonaws.com/small...,https://wave.com,"Dakar, Dakar Region, Senegal; Remote",Building extremely affordable financial infras...,Mobile money app for Africa,1500.0,False,False,False,Fintech,Fintech -> Payments,1638361727,['Fintech'],True,True,False,False,W12,Active,"['Fintech', 'Payments']","['Senegal', 'Africa', 'Remote', 'Fully Remote']",Early,2,True,True,True,"[Dakar, Dakar Region, Senegal, Remote]",0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,"[Senegal, Africa, Remote, Fully Remote]",0,0,0,1,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4359,blitz,[],https://bookface-images.s3.amazonaws.com/small...,https://www.blitznocode.com/,"Paris, Île-de-France, France",Blitz is a no-code platform to build internal ...,"Build apps, automate tasks, and scale operatio...",4.0,False,True,False,B2B,B2B -> Operations,1656995564,"['SaaS', 'B2B', 'No-code']",False,False,False,False,S22,Active,"['B2B', 'Operations']","['France', 'Europe', 'Remote', 'Partly Remote']",Early,0,True,True,True,"[Paris, Île-de-France, France]",0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,"[France, Europe, Remote, Partly Remote]",0,0,1,0,0,0,1,1,0,0,0,1,0
4360,erad,[],https://bookface-images.s3.amazonaws.com/small...,https://erad.co/,"Riyadh, Riyadh Province, Saudi Arabia",erad provides startups in the Middle East with...,Non-dilutive funding and payments in the Middl...,5.0,False,False,False,Fintech,Fintech,1653900418,"['Fintech', 'Payments', 'B2B']",False,False,True,False,S22,Active,['Fintech'],"['Saudi Arabia', 'Middle East and North Africa...",Early,0,True,True,True,"[Riyadh, Riyadh Province, Saudi Arabia]",0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,"[Saudi Arabia, Middle East and North Africa, R...",0,0,0,0,0,0,1,1,0,0,0,1,0
4365,tailor,[],https://bookface-images.s3.amazonaws.com/small...,https://www.tailor.tech/,"Tokyo, Tokyo, Japan",Tailor is a backend platform that helps enterp...,Highly Customizable Headless ERP,16.0,False,False,False,B2B,B2B,1654046243,"['Developer Tools', 'SaaS', 'B2B', 'API', 'Ent...",False,False,True,False,S22,Active,['B2B'],"['Japan', 'East Asia', 'Remote', 'Partly Remote']",Early,0,True,True,True,"[Tokyo, Tokyo, Japan]",0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,"[Japan, East Asia, Remote, Partly Remote]",0,0,0,0,0,0,1,1,0,0,0,1,0
4370,stackup,[],https://bookface-images.s3.amazonaws.com/small...,https://www.stackup.sh,"Los Angeles, CA, USA; Melbourne, VIC, Australi...",Stackup is a suite of open-source developer to...,Stackup is a platform for building user-friend...,3.0,False,False,False,B2B,"B2B -> Engineering, Product and Design",1653642506,"['Developer Tools', 'SaaS', 'Crypto / Web3', '...",False,False,True,False,S22,Active,"['B2B', 'Engineering, Product and Design']","['United States of America', 'Australia', 'Ame...",Early,0,True,True,True,"[Los Angeles, CA, USA, Melbourne, VIC, Austral...",0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,"[United States of America, Australia, America ...",1,0,0,1,0,0,0,1,0,0,1,1,0


In [70]:
# looks better, checking how many rows have the unspecifed column checked off
df[df['Unspecified - region'] == 1]

Unnamed: 0,slug,former_names,small_logo_thumb_url,website,all_locations,long_description,one_liner,team_size,highlight_black,highlight_latinx,highlight_women,industry,subindustry,launched_at,tags,top_company,top_company_by_revenue,isHiring,nonprofit,batch,status,industries,regions,stage,num_former_names,has_logo,has_website,has_one_liner,location_lists,"Austin, TX, USA","Bengaluru, KA, India","Boston, MA, USA","London, England, United Kingdom","Los Angeles, CA, USA","Mexico City, CDMX, Mexico","Mountain View, CA, USA","New York, NY, USA","Palo Alto, CA, USA","Paris, Île-de-France, France",Remote,"San Francisco, CA, USA","Seattle, WA, USA","Toronto, ON, Canada",other_city,region_lists,America / Canada - region,Canada - region,Europe - region,Fully Remote - region,India - region,Latin America - region,Partly Remote - region,Remote - region,South Asia - region,United Kingdom - region,United States of America - region,other_region,Unspecified - region
22,anywhere-fm,[],/company/thumb/missing.png,http://anywhere.fm,,Anywhere.fm lets you upload your music collect...,,11.0,False,False,False,Consumer,Consumer -> Content,1326791428,[],False,False,False,False,S07,Acquired,"['Consumer', 'Content']",['Unspecified'],Early,0,False,True,False,[],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,[Unspecified],0,0,0,0,0,0,0,0,0,0,0,0,1
33,snipshot,[],https://bookface-images.s3.amazonaws.com/small...,https://snipshot.com,,Snipshot is a photo editor that lets you start...,We sold Snipshot to Ansa in 2013.,0.0,False,False,False,Consumer,Consumer -> Content,1326791640,['Media'],False,False,False,False,W06,Inactive,"['Consumer', 'Content']",['Unspecified'],Growth,0,True,True,True,[],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,[Unspecified],0,0,0,0,0,0,0,0,0,0,0,0,1
37,pollground,[],/company/thumb/missing.png,http://pollground.com,,A social polling site. Pollground allows you t...,Online polls.,0.0,False,False,False,Consumer,Consumer -> Social,1326791614,['Market Research'],False,False,False,False,S06,Inactive,"['Consumer', 'Social']",['Unspecified'],Early,0,False,True,True,[],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,[Unspecified],0,0,0,0,0,0,0,0,0,0,0,0,1
39,talkito,[],/company/thumb/missing.png,http://talkito.com,,,,0.0,False,False,False,Consumer,Consumer -> Social,1326791607,[],False,False,False,False,S06,Inactive,"['Consumer', 'Social']",['Unspecified'],Early,0,False,True,False,[],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,[Unspecified],0,0,0,0,0,0,0,0,0,0,0,0,1
40,thinkature,[],/company/thumb/missing.png,http://thinkature.com,,,,0.0,False,False,False,B2B,B2B -> Productivity,1326791603,[],False,False,False,False,S06,Inactive,"['B2B', 'Productivity']",['Unspecified'],Early,0,False,True,False,[],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,[Unspecified],0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4167,rubbrband,[],https://bookface-images.s3.amazonaws.com/small...,https://rubbrband.com,,Rubbrband is building an AI that qualitatively...,Evaluation for AI-generated images,3.0,False,False,False,B2B,"B2B -> Engineering, Product and Design",1666397912,[],False,False,False,False,W23,Active,"['B2B', 'Engineering, Product and Design']",['Unspecified'],Early,0,True,True,True,[],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,[Unspecified],0,0,0,0,0,0,0,0,0,0,0,0,1
4195,lavo-life-sciences,[],https://bookface-images.s3.amazonaws.com/small...,https://www.lavo.ai/,,Lavo Life Sciences runs simulations of drug mo...,AI for drug formulation,3.0,False,False,False,Healthcare,Healthcare -> Drug Discovery and Delivery,1674082073,"['AI-powered Drug Discovery', 'Machine Learnin...",False,False,False,False,W23,Active,"['Healthcare', 'Drug Discovery and Delivery']",['Unspecified'],Early,0,True,True,True,[],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,[Unspecified],0,0,0,0,0,0,0,0,0,0,0,0,1
4301,hypeshot,['the402'],https://bookface-images.s3.amazonaws.com/small...,https://www.hypeshot.io/,,,create and sell NFTs live,2.0,False,False,False,Consumer,Consumer -> Content,1662071336,"['Crypto / Web3', 'Consumer', 'Entertainment']",False,False,True,False,S22,Active,"['Consumer', 'Content']",['Unspecified'],Early,1,True,True,True,[],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,[Unspecified],0,0,0,0,0,0,0,0,0,0,0,0,1
4303,ten-lives,['Terraferma Foods'],https://bookface-images.s3.amazonaws.com/small...,https://tenlives.com,,"We make cat food with animal protein, without ...",Decarbonizing pet food with AI.,2.0,False,False,True,Consumer,Consumer -> Food and Beverage,1658109207,[],False,False,False,False,S22,Active,"['Consumer', 'Food and Beverage']",['Unspecified'],Early,1,True,True,True,[],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,[Unspecified],0,0,0,0,0,0,0,0,0,0,0,0,1


In [71]:
df['num_locations'] = df['location_lists'].apply(len)
df['num_regions'] = df['region_lists'].apply(len)

Now that the locations have been dummied, I'll deal with the industry/subindustry/tags now. There are a bunch of columns relating to these, let's look at them.

In [72]:
df.columns

Index(['slug', 'former_names', 'small_logo_thumb_url', 'website',
       'all_locations', 'long_description', 'one_liner', 'team_size',
       'highlight_black', 'highlight_latinx', 'highlight_women', 'industry',
       'subindustry', 'launched_at', 'tags', 'top_company',
       'top_company_by_revenue', 'isHiring', 'nonprofit', 'batch', 'status',
       'industries', 'regions', 'stage', 'num_former_names', 'has_logo',
       'has_website', 'has_one_liner', 'location_lists', 'Austin, TX, USA',
       'Bengaluru, KA, India', 'Boston, MA, USA',
       'London, England, United Kingdom', 'Los Angeles, CA, USA',
       'Mexico City, CDMX, Mexico', 'Mountain View, CA, USA',
       'New York, NY, USA', 'Palo Alto, CA, USA',
       'Paris, Île-de-France, France', 'Remote', 'San Francisco, CA, USA',
       'Seattle, WA, USA', 'Toronto, ON, Canada', 'other_city', 'region_lists',
       'America / Canada - region', 'Canada - region', 'Europe - region',
       'Fully Remote - region', 'India - reg

In [73]:
df[['industry', 'subindustry', 'tags', 'industries']]

Unnamed: 0,industry,subindustry,tags,industries
0,B2B,B2B -> Productivity,[],"['B2B', 'Productivity']"
1,Consumer,Consumer -> Content,"['Community', 'Social Media', 'Social', 'Socia...","['Consumer', 'Content']"
2,Consumer,Consumer -> Content,"['Community', 'Gaming', 'Social Media', 'Video...","['Consumer', 'Content']"
3,Consumer,Consumer -> Content,[],"['Consumer', 'Content']"
4,B2B,B2B -> Marketing,[],"['B2B', 'Marketing']"
...,...,...,...,...
4405,Education,Education,"['Education', 'GovTech']",['Education']
4406,B2B,B2B -> Operations,"['Generative AI', 'B2B', 'Customer Success', '...","['B2B', 'Operations']"
4407,B2B,B2B -> Finance and Accounting,"['Finance', 'B2B']","['B2B', 'Finance and Accounting']"
4408,B2B,B2B -> Sales,"['B2B', 'Sales']","['B2B', 'Sales']"


the industries column looks like a combination of the industry and subindustry columns, so I'll focus on that column. If I make list type columns for both industries and tags then I should be able to create a third column that simply adds them together. Once I have that I can follow the same process as above to create dummy columns. 

In [74]:
df['industries'][0].replace('[', '').replace(']', '').replace("'", "").split(',')

['B2B', ' Productivity']

In [75]:
type(df['industries'][0].replace('[', '').replace(']', '').replace("'", "").split(','))

list

In [76]:
df['industries_lists'] = df['industries'].apply(lambda row: row.replace('[', '').replace(']', '').replace("'", "").split(',') if type(row)==str else [])

In [77]:
# get rid of leading whitespaces
df['industries_lists'] = df['industries_lists'].apply(lambda row: [x.lstrip() for x in row])

In [78]:
# check
df['industries_lists'][0]

['B2B', 'Productivity']

In [79]:
# now create a lists column for tags
df['tags'][1].replace('[', '').replace(']', '').replace("'", "").split(',')

['Community', ' Social Media', ' Social', ' Social Network']

In [80]:
type(df['tags'][1].replace('[', '').replace(']', '').replace("'", "").split(','))

list

In [81]:
df['tags_lists'] = df['tags'].apply(lambda row: row.replace('[', '').replace(']', '').replace("'", "").split(',') if type(row)==str else [])

In [82]:
# get rid of leading whitespaces
df['tags_lists'] = df['tags_lists'].apply(lambda row: [x.lstrip() for x in row])

In [83]:
# check
df['tags_lists'][1]

['Community', 'Social Media', 'Social', 'Social Network']

In [84]:
# create third column that combines these two, removing duplicates along the way. First see how to achieve this
df['industries_lists'][5] + df['tags_lists'][5]

['Consumer', 'Gaming', 'Gaming', 'Social']

In [85]:
type(df['industries_lists'][5] + df['tags_lists'][5])

list

In [86]:
set(df['industries_lists'][5] + df['tags_lists'][5])

{'Consumer', 'Gaming', 'Social'}

In [87]:
list(set(df['industries_lists'][5] + df['tags_lists'][5]))

['Consumer', 'Social', 'Gaming']

In [88]:
# create the column
df['industries_and_tags'] = df['industries_lists'] + df['tags_lists']

In [89]:
df['industries_and_tags'][5]

['Consumer', 'Gaming', 'Gaming', 'Social']

In [90]:
# remove duplicates
df['industries_and_tags'] = df['industries_and_tags'].apply(set).apply(list)

In [91]:
# check
df['industries_and_tags'][5]

['Consumer', 'Social', 'Gaming']

In [92]:
df.head()

Unnamed: 0,slug,former_names,small_logo_thumb_url,website,all_locations,long_description,one_liner,team_size,highlight_black,highlight_latinx,highlight_women,industry,subindustry,launched_at,tags,top_company,top_company_by_revenue,isHiring,nonprofit,batch,status,industries,regions,stage,num_former_names,has_logo,has_website,has_one_liner,location_lists,"Austin, TX, USA","Bengaluru, KA, India","Boston, MA, USA","London, England, United Kingdom","Los Angeles, CA, USA","Mexico City, CDMX, Mexico","Mountain View, CA, USA","New York, NY, USA","Palo Alto, CA, USA","Paris, Île-de-France, France",Remote,"San Francisco, CA, USA","Seattle, WA, USA","Toronto, ON, Canada",other_city,region_lists,America / Canada - region,Canada - region,Europe - region,Fully Remote - region,India - region,Latin America - region,Partly Remote - region,Remote - region,South Asia - region,United Kingdom - region,United States of America - region,other_region,Unspecified - region,num_locations,num_regions,industries_lists,tags_lists,industries_and_tags
0,dropbox,[],https://bookface-images.s3.amazonaws.com/small...,http://dropbox.com,"San Francisco, CA, USA",Dropbox is building the world’s first smart wo...,Backup and share files in the cloud.,4000.0,False,False,False,B2B,B2B -> Productivity,1326791328,[],True,True,False,False,S07,Public,"['B2B', 'Productivity']","['United States of America', 'America / Canada']",Growth,0,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada]",1,0,0,0,0,0,0,0,0,0,1,0,0,1,2,"[B2B, Productivity]",[],"[, B2B, Productivity]"
1,reddit,[],https://bookface-images.s3.amazonaws.com/small...,http://reddit.com,"San Francisco, CA, USA",Founded by Steve Huffman and Alexis Ohanian in...,The frontpage of the internet.,201.0,False,False,False,Consumer,Consumer -> Content,1326791708,"['Community', 'Social Media', 'Social', 'Socia...",True,True,False,False,S05,Acquired,"['Consumer', 'Content']","['United States of America', 'America / Canada']",Early,0,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada]",1,0,0,0,0,0,0,0,0,0,1,0,0,1,2,"[Consumer, Content]","[Community, Social Media, Social, Social Network]","[Community, Content, Social, Social Network, C..."
2,twitch,"['Justin.tv', 'Twitch']",https://bookface-images.s3.amazonaws.com/small...,http://twitch.com,"San Francisco, CA, USA",Twitch is the world’s leading video platform a...,A global community creating the future of live...,2000.0,True,False,False,Consumer,Consumer -> Content,1326791723,"['Community', 'Gaming', 'Social Media', 'Video...",True,False,False,False,W07,Acquired,"['Consumer', 'Content']","['United States of America', 'America / Canada...",Growth,2,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada, R...",1,0,0,0,0,0,1,1,0,0,1,0,0,1,4,"[Consumer, Content]","[Community, Gaming, Social Media, Video, Socia...","[Video, Community, Content, Social Network, Co..."
3,scribd,[],https://bookface-images.s3.amazonaws.com/small...,http://scribd.com,"San Francisco, CA, USA",Read and listen without limits. Unlimited* aud...,World's largest online library.,300.0,False,False,False,Consumer,Consumer -> Content,1326791580,[],True,True,True,False,S06,Active,"['Consumer', 'Content']","['United States of America', 'America / Canada']",Growth,0,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada]",1,0,0,0,0,0,0,0,0,0,1,0,0,1,2,"[Consumer, Content]",[],"[Content, Consumer, ]"
4,weebly,[],https://bookface-images.s3.amazonaws.com/small...,http://weebly.com,"San Francisco, CA, USA",Company Information\r\nWeebly is a consumer se...,Build a free website that grows with your busi...,201.0,False,False,False,B2B,B2B -> Marketing,1326791493,[],True,False,False,False,W07,Acquired,"['B2B', 'Marketing']","['United States of America', 'America / Canada']",Growth,0,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada]",1,0,0,0,0,0,0,0,0,0,1,0,0,1,2,"[B2B, Marketing]",[],"[, B2B, Marketing]"


In [93]:
df.at[4, 'industries_and_tags']

['', 'B2B', 'Marketing']

In [94]:
df.at[4, 'industries_and_tags'] = [x for x in df.at[4, 'industries_and_tags'] if x]

In [95]:
df.at[4, 'industries_and_tags']

['B2B', 'Marketing']

In [96]:
df.at[4, 'industries_and_tags']

['B2B', 'Marketing']

In [97]:
# there are empty strings in some of the industries_and_tags lists, remove these
for i in range(len(df['industries_and_tags'])):
    df.at[i, 'industries_and_tags'] = [x for x in df.at[i, 'industries_and_tags'] if x]

In [98]:
df.head()

Unnamed: 0,slug,former_names,small_logo_thumb_url,website,all_locations,long_description,one_liner,team_size,highlight_black,highlight_latinx,highlight_women,industry,subindustry,launched_at,tags,top_company,top_company_by_revenue,isHiring,nonprofit,batch,status,industries,regions,stage,num_former_names,has_logo,has_website,has_one_liner,location_lists,"Austin, TX, USA","Bengaluru, KA, India","Boston, MA, USA","London, England, United Kingdom","Los Angeles, CA, USA","Mexico City, CDMX, Mexico","Mountain View, CA, USA","New York, NY, USA","Palo Alto, CA, USA","Paris, Île-de-France, France",Remote,"San Francisco, CA, USA","Seattle, WA, USA","Toronto, ON, Canada",other_city,region_lists,America / Canada - region,Canada - region,Europe - region,Fully Remote - region,India - region,Latin America - region,Partly Remote - region,Remote - region,South Asia - region,United Kingdom - region,United States of America - region,other_region,Unspecified - region,num_locations,num_regions,industries_lists,tags_lists,industries_and_tags
0,dropbox,[],https://bookface-images.s3.amazonaws.com/small...,http://dropbox.com,"San Francisco, CA, USA",Dropbox is building the world’s first smart wo...,Backup and share files in the cloud.,4000.0,False,False,False,B2B,B2B -> Productivity,1326791328,[],True,True,False,False,S07,Public,"['B2B', 'Productivity']","['United States of America', 'America / Canada']",Growth,0,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada]",1,0,0,0,0,0,0,0,0,0,1,0,0,1,2,"[B2B, Productivity]",[],"[B2B, Productivity]"
1,reddit,[],https://bookface-images.s3.amazonaws.com/small...,http://reddit.com,"San Francisco, CA, USA",Founded by Steve Huffman and Alexis Ohanian in...,The frontpage of the internet.,201.0,False,False,False,Consumer,Consumer -> Content,1326791708,"['Community', 'Social Media', 'Social', 'Socia...",True,True,False,False,S05,Acquired,"['Consumer', 'Content']","['United States of America', 'America / Canada']",Early,0,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada]",1,0,0,0,0,0,0,0,0,0,1,0,0,1,2,"[Consumer, Content]","[Community, Social Media, Social, Social Network]","[Community, Content, Social, Social Network, C..."
2,twitch,"['Justin.tv', 'Twitch']",https://bookface-images.s3.amazonaws.com/small...,http://twitch.com,"San Francisco, CA, USA",Twitch is the world’s leading video platform a...,A global community creating the future of live...,2000.0,True,False,False,Consumer,Consumer -> Content,1326791723,"['Community', 'Gaming', 'Social Media', 'Video...",True,False,False,False,W07,Acquired,"['Consumer', 'Content']","['United States of America', 'America / Canada...",Growth,2,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada, R...",1,0,0,0,0,0,1,1,0,0,1,0,0,1,4,"[Consumer, Content]","[Community, Gaming, Social Media, Video, Socia...","[Video, Community, Content, Social Network, Co..."
3,scribd,[],https://bookface-images.s3.amazonaws.com/small...,http://scribd.com,"San Francisco, CA, USA",Read and listen without limits. Unlimited* aud...,World's largest online library.,300.0,False,False,False,Consumer,Consumer -> Content,1326791580,[],True,True,True,False,S06,Active,"['Consumer', 'Content']","['United States of America', 'America / Canada']",Growth,0,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada]",1,0,0,0,0,0,0,0,0,0,1,0,0,1,2,"[Consumer, Content]",[],"[Content, Consumer]"
4,weebly,[],https://bookface-images.s3.amazonaws.com/small...,http://weebly.com,"San Francisco, CA, USA",Company Information\r\nWeebly is a consumer se...,Build a free website that grows with your busi...,201.0,False,False,False,B2B,B2B -> Marketing,1326791493,[],True,False,False,False,W07,Acquired,"['B2B', 'Marketing']","['United States of America', 'America / Canada']",Growth,0,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada]",1,0,0,0,0,0,0,0,0,0,1,0,0,1,2,"[B2B, Marketing]",[],"[B2B, Marketing]"


In [99]:
# create columns that specifies length of this column
df['num_tags'] = df['industries_and_tags'].apply(len)

In [100]:
(df['num_tags'] == 0).sum()

0

In [101]:
# how many tags are there now?
df.explode('industries_and_tags')['industries_and_tags']

0                B2B
0       Productivity
1          Community
1            Content
1             Social
            ...     
4408           Sales
4409             B2B
4409            SaaS
4409       Analytics
4409    Productivity
Name: industries_and_tags, Length: 17446, dtype: object

In [102]:
# how many do I want to dummy?
df.explode('industries_and_tags')['industries_and_tags'].value_counts(normalize=True)[:50].sum()

0.7139172303106729

In [103]:
# the top 50 tags account for 72% of mentioned tags, I'll go with this
list(df.explode('industries_and_tags')['industries_and_tags'].value_counts()[:50].index)

['B2B',
 'SaaS',
 'Consumer',
 'Fintech',
 'Healthcare',
 'Developer Tools',
 'Product and Design',
 'Engineering',
 'Artificial Intelligence',
 'Marketplace',
 'Education',
 'Productivity',
 'Industrials',
 'Infrastructure',
 'AI',
 'Machine Learning',
 'Analytics',
 'Payments',
 'E-commerce',
 'Social',
 'Climate',
 'Generative AI',
 'Retail',
 'Marketing',
 'Open Source',
 'Real Estate and Construction',
 'API',
 'Home and Personal',
 'Health Tech',
 'Sales',
 'Operations',
 'Consumer Health and Wellness',
 'Biotech',
 'Security',
 'Supply Chain and Logistics',
 'Gaming',
 'Consumer Health Services',
 'Content',
 'Crypto / Web3',
 'Healthcare IT',
 'Food and Beverage',
 'Logistics',
 'Hardware',
 'Finance and Accounting',
 'Enterprise',
 'Consumer Finance',
 'Data Engineering',
 'Human Resources',
 'Proptech',
 'Housing and Real Estate']

I will make columns for these 50 tags and another for other tags, so 51 total tag columns.

In [104]:
# creating the dummy columns and populating them based on the industries_and_tags column 
top_tags = list(df.explode('industries_and_tags')['industries_and_tags'].value_counts()[:50].index)
top_tags.sort()
for tag in top_tags:
    df[tag] = df.apply(lambda row: 1 if tag in row['industries_and_tags'] else 0, axis=1)

In [105]:
# the other_tag column gets populated for any row that does not already have a top tag column checked off
df['other_tag'] = df.apply(lambda row: 1 if len(np.setdiff1d(row['industries_and_tags'], top_tags)) > 0 else 0, axis=1)

In [106]:
# check
df[df['other_tag'] == 1]

Unnamed: 0,slug,former_names,small_logo_thumb_url,website,all_locations,long_description,one_liner,team_size,highlight_black,highlight_latinx,highlight_women,industry,subindustry,launched_at,tags,top_company,top_company_by_revenue,isHiring,nonprofit,batch,status,industries,regions,stage,num_former_names,has_logo,has_website,has_one_liner,location_lists,"Austin, TX, USA","Bengaluru, KA, India","Boston, MA, USA","London, England, United Kingdom","Los Angeles, CA, USA","Mexico City, CDMX, Mexico","Mountain View, CA, USA","New York, NY, USA","Palo Alto, CA, USA","Paris, Île-de-France, France",Remote,"San Francisco, CA, USA","Seattle, WA, USA","Toronto, ON, Canada",other_city,region_lists,America / Canada - region,Canada - region,Europe - region,Fully Remote - region,India - region,Latin America - region,Partly Remote - region,Remote - region,South Asia - region,United Kingdom - region,United States of America - region,other_region,Unspecified - region,num_locations,num_regions,industries_lists,tags_lists,industries_and_tags,num_tags,AI,API,Analytics,Artificial Intelligence,B2B,Biotech,Climate,Consumer,Consumer Finance,Consumer Health Services,Consumer Health and Wellness,Content,Crypto / Web3,Data Engineering,Developer Tools,E-commerce,Education,Engineering,Enterprise,Finance and Accounting,Fintech,Food and Beverage,Gaming,Generative AI,Hardware,Health Tech,Healthcare,Healthcare IT,Home and Personal,Housing and Real Estate,Human Resources,Industrials,Infrastructure,Logistics,Machine Learning,Marketing,Marketplace,Open Source,Operations,Payments,Product and Design,Productivity,Proptech,Real Estate and Construction,Retail,SaaS,Sales,Security,Social,Supply Chain and Logistics,other_tag
1,reddit,[],https://bookface-images.s3.amazonaws.com/small...,http://reddit.com,"San Francisco, CA, USA",Founded by Steve Huffman and Alexis Ohanian in...,The frontpage of the internet.,201.0,False,False,False,Consumer,Consumer -> Content,1326791708,"['Community', 'Social Media', 'Social', 'Socia...",True,True,False,False,S05,Acquired,"['Consumer', 'Content']","['United States of America', 'America / Canada']",Early,0,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada]",1,0,0,0,0,0,0,0,0,0,1,0,0,1,2,"[Consumer, Content]","[Community, Social Media, Social, Social Network]","[Community, Content, Social, Social Network, C...",6,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
2,twitch,"['Justin.tv', 'Twitch']",https://bookface-images.s3.amazonaws.com/small...,http://twitch.com,"San Francisco, CA, USA",Twitch is the world’s leading video platform a...,A global community creating the future of live...,2000.0,True,False,False,Consumer,Consumer -> Content,1326791723,"['Community', 'Gaming', 'Social Media', 'Video...",True,False,False,False,W07,Acquired,"['Consumer', 'Content']","['United States of America', 'America / Canada...",Growth,2,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada, R...",1,0,0,0,0,0,1,1,0,0,1,0,0,1,4,"[Consumer, Content]","[Community, Gaming, Social Media, Video, Socia...","[Video, Community, Content, Social Network, Co...",7,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
9,kiko,[],https://bookface-images.s3.amazonaws.com/small...,http://kiko.com,"Cambridge, MA, USA",Kiko is a web calendar for anyone who wants to...,We're the best online calendar solution to eve...,0.0,False,False,False,B2B,B2B -> Productivity,1326791700,['Calendar'],False,False,False,False,S05,Acquired,"['B2B', 'Productivity']","['United States of America', 'America / Canada']",Early,0,True,True,True,"[Cambridge, MA, USA]",0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,"[United States of America, America / Canada]",1,0,0,0,0,0,0,0,0,0,1,0,0,1,2,"[B2B, Productivity]",[Calendar],"[B2B, Calendar, Productivity]",3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
18,heysan,[],https://bookface-images.s3.amazonaws.com/small...,http://heysan.com,"San Francisco, CA, USA",Heysan was a free mobile messaging and communi...,Heysan was a mobile messenger and community,0.0,False,False,False,Consumer,Consumer -> Social,1326791558,"['Community', 'Messaging']",False,False,False,False,W07,Acquired,"['Consumer', 'Social']","['United States of America', 'America / Canada']",Early,0,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada]",1,0,0,0,0,0,0,0,0,0,1,0,0,1,2,"[Consumer, Social]","[Community, Messaging]","[Consumer, Social, Community, Messaging]",4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
20,zenter,[],https://bookface-images.s3.amazonaws.com/small...,https://googleblog.blogspot.com/2007/06/more-s...,"Mountain View, CA, USA",Zenter is an web based presentation app that p...,Zenter is the gmail of slides,2.0,False,False,False,B2B,B2B -> Productivity,1326791525,"['Documents', 'SaaS', 'B2B', 'Productivity']",False,False,False,False,W07,Acquired,"['B2B', 'Productivity']","['United States of America', 'America / Canada']",Early,0,True,True,True,"[Mountain View, CA, USA]",0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,"[United States of America, America / Canada]",1,0,0,0,0,0,0,0,0,0,1,0,0,1,2,"[B2B, Productivity]","[Documents, SaaS, B2B, Productivity]","[B2B, SaaS, Documents, Productivity]",4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4403,silkchart,['SandyLane'],https://bookface-images.s3.amazonaws.com/small...,https://www.silkchart.com,"San Francisco, CA, USA; Remote",SilkChart helps reps improve their performance...,AI-powered sales coach,4.0,True,False,True,B2B,B2B -> Sales,1657442843,"['Artificial Intelligence', 'SaaS', 'B2B', 'Sa...",False,False,True,False,S22,Active,"['B2B', 'Sales']","['United States of America', 'America / Canada...",Early,1,True,True,True,"[San Francisco, CA, USA, Remote]",0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,"[United States of America, America / Canada, R...",1,0,0,1,0,0,0,1,0,0,1,0,0,2,4,"[B2B, Sales]","[Artificial Intelligence, SaaS, B2B, Sales, Sa...","[Sales, B2B, Sales Enablement, SaaS, Artificia...",5,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1
4404,keylika,[],https://bookface-images.s3.amazonaws.com/small...,https://www.keylika.com,"Walnut Creek, CA, USA",Keylika is an early stage biopharma startup de...,Redefining the Standard of Care for Iron Defic...,1.0,False,False,False,Healthcare,Healthcare -> Therapeutics,1658519436,"['Medical Devices', 'Biotech', 'Nanotechnology...",False,False,False,False,S22,Active,"['Healthcare', 'Therapeutics']","['United States of America', 'America / Canada...",Early,0,True,True,True,"[Walnut Creek, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,"[United States of America, America / Canada, R...",1,0,0,0,0,0,1,1,0,0,1,0,0,1,4,"[Healthcare, Therapeutics]","[Medical Devices, Biotech, Nanotechnology, The...","[Therapeutics, Drug discovery, Medical Devices...",6,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4405,emerge-career,"['Ameelio', 'Ameelio Emerge', 'Emerge']",https://bookface-images.s3.amazonaws.com/small...,https://www.emergecareer.com/,Remote,Emerge sells online job training to the govern...,Selling online job training to the government,3.0,True,True,False,Education,Education,1657112954,"['Education', 'GovTech']",False,False,False,False,S22,Active,['Education'],"['Remote', 'Fully Remote']",Early,3,True,True,True,[Remote],0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,"[Remote, Fully Remote]",0,0,0,1,0,0,0,1,0,0,0,0,0,1,2,[Education],"[Education, GovTech]","[Education, GovTech]",2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4406,opensight,['FastFile'],https://bookface-images.s3.amazonaws.com/small...,https://www.opensight.ai,"San Francisco, CA, USA; Sunnyvale, CA, USA; Re...",OpenSight helps companies quickly scale up the...,AI-powered customer support automation for fas...,3.0,False,False,True,B2B,B2B -> Operations,1673668209,"['Generative AI', 'B2B', 'Customer Success', '...",False,False,False,False,W23,Inactive,"['B2B', 'Operations']","['United States of America', 'America / Canada...",Early,1,True,True,True,"[San Francisco, CA, USA, Sunnyvale, CA, USA, R...",0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,"[United States of America, America / Canada, R...",1,0,0,1,0,0,0,1,0,0,1,0,0,3,4,"[B2B, Operations]","[Generative AI, B2B, Customer Success, Custome...","[Operations, Customer Success, B2B, Generative...",5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1


In [107]:
# looks like other might have gotten checked off for every row
df['other_tag'].sum()

2981

Moving on to getting the age of the companies now. First I'll check the timestamps to see if I can get the founding date from those. If that doesn't work then I'll use the batch to get the year and season of founding.

In [108]:
from datetime import datetime

# using reddit as an example
timestamp = 1326791708

# convert the timestamp to a datetime object in the local timezone
datetime.fromtimestamp(timestamp)

datetime.datetime(2012, 1, 17, 4, 15, 8)

This says reddit was launched in 2012 but it was actually launched in 2005 (which is accurately identified by the batch). I'll ignore timestamps and get the years since founding from the batch.

In [109]:
df['years_since_founding'] = df['batch'].apply(lambda row: 23 - int(row.replace('W', '').replace('S', '').replace('IK', '')))

In [110]:
df.head()

Unnamed: 0,slug,former_names,small_logo_thumb_url,website,all_locations,long_description,one_liner,team_size,highlight_black,highlight_latinx,highlight_women,industry,subindustry,launched_at,tags,top_company,top_company_by_revenue,isHiring,nonprofit,batch,status,industries,regions,stage,num_former_names,has_logo,has_website,has_one_liner,location_lists,"Austin, TX, USA","Bengaluru, KA, India","Boston, MA, USA","London, England, United Kingdom","Los Angeles, CA, USA","Mexico City, CDMX, Mexico","Mountain View, CA, USA","New York, NY, USA","Palo Alto, CA, USA","Paris, Île-de-France, France",Remote,"San Francisco, CA, USA","Seattle, WA, USA","Toronto, ON, Canada",other_city,region_lists,America / Canada - region,Canada - region,Europe - region,Fully Remote - region,India - region,Latin America - region,Partly Remote - region,Remote - region,South Asia - region,United Kingdom - region,United States of America - region,other_region,Unspecified - region,num_locations,num_regions,industries_lists,tags_lists,industries_and_tags,num_tags,AI,API,Analytics,Artificial Intelligence,B2B,Biotech,Climate,Consumer,Consumer Finance,Consumer Health Services,Consumer Health and Wellness,Content,Crypto / Web3,Data Engineering,Developer Tools,E-commerce,Education,Engineering,Enterprise,Finance and Accounting,Fintech,Food and Beverage,Gaming,Generative AI,Hardware,Health Tech,Healthcare,Healthcare IT,Home and Personal,Housing and Real Estate,Human Resources,Industrials,Infrastructure,Logistics,Machine Learning,Marketing,Marketplace,Open Source,Operations,Payments,Product and Design,Productivity,Proptech,Real Estate and Construction,Retail,SaaS,Sales,Security,Social,Supply Chain and Logistics,other_tag,years_since_founding
0,dropbox,[],https://bookface-images.s3.amazonaws.com/small...,http://dropbox.com,"San Francisco, CA, USA",Dropbox is building the world’s first smart wo...,Backup and share files in the cloud.,4000.0,False,False,False,B2B,B2B -> Productivity,1326791328,[],True,True,False,False,S07,Public,"['B2B', 'Productivity']","['United States of America', 'America / Canada']",Growth,0,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada]",1,0,0,0,0,0,0,0,0,0,1,0,0,1,2,"[B2B, Productivity]",[],"[B2B, Productivity]",2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,16
1,reddit,[],https://bookface-images.s3.amazonaws.com/small...,http://reddit.com,"San Francisco, CA, USA",Founded by Steve Huffman and Alexis Ohanian in...,The frontpage of the internet.,201.0,False,False,False,Consumer,Consumer -> Content,1326791708,"['Community', 'Social Media', 'Social', 'Socia...",True,True,False,False,S05,Acquired,"['Consumer', 'Content']","['United States of America', 'America / Canada']",Early,0,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada]",1,0,0,0,0,0,0,0,0,0,1,0,0,1,2,"[Consumer, Content]","[Community, Social Media, Social, Social Network]","[Community, Content, Social, Social Network, C...",6,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,18
2,twitch,"['Justin.tv', 'Twitch']",https://bookface-images.s3.amazonaws.com/small...,http://twitch.com,"San Francisco, CA, USA",Twitch is the world’s leading video platform a...,A global community creating the future of live...,2000.0,True,False,False,Consumer,Consumer -> Content,1326791723,"['Community', 'Gaming', 'Social Media', 'Video...",True,False,False,False,W07,Acquired,"['Consumer', 'Content']","['United States of America', 'America / Canada...",Growth,2,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada, R...",1,0,0,0,0,0,1,1,0,0,1,0,0,1,4,"[Consumer, Content]","[Community, Gaming, Social Media, Video, Socia...","[Video, Community, Content, Social Network, Co...",7,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,16
3,scribd,[],https://bookface-images.s3.amazonaws.com/small...,http://scribd.com,"San Francisco, CA, USA",Read and listen without limits. Unlimited* aud...,World's largest online library.,300.0,False,False,False,Consumer,Consumer -> Content,1326791580,[],True,True,True,False,S06,Active,"['Consumer', 'Content']","['United States of America', 'America / Canada']",Growth,0,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada]",1,0,0,0,0,0,0,0,0,0,1,0,0,1,2,"[Consumer, Content]",[],"[Content, Consumer]",2,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17
4,weebly,[],https://bookface-images.s3.amazonaws.com/small...,http://weebly.com,"San Francisco, CA, USA",Company Information\r\nWeebly is a consumer se...,Build a free website that grows with your busi...,201.0,False,False,False,B2B,B2B -> Marketing,1326791493,[],True,False,False,False,W07,Acquired,"['B2B', 'Marketing']","['United States of America', 'America / Canada']",Growth,0,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada]",1,0,0,0,0,0,0,0,0,0,1,0,0,1,2,"[B2B, Marketing]",[],"[B2B, Marketing]",2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16


I will dummy the stage column now.

In [111]:
#status_dummy = pd.get_dummies(df['status'])

In [112]:
#df = pd.concat([df, status_dummy], axis=1)

In [113]:
stage_dummy = pd.get_dummies(df['stage'])

In [114]:
df = pd.concat([df, stage_dummy], axis=1)

In [115]:
df.head()

Unnamed: 0,slug,former_names,small_logo_thumb_url,website,all_locations,long_description,one_liner,team_size,highlight_black,highlight_latinx,highlight_women,industry,subindustry,launched_at,tags,top_company,top_company_by_revenue,isHiring,nonprofit,batch,status,industries,regions,stage,num_former_names,has_logo,has_website,has_one_liner,location_lists,"Austin, TX, USA","Bengaluru, KA, India","Boston, MA, USA","London, England, United Kingdom","Los Angeles, CA, USA","Mexico City, CDMX, Mexico","Mountain View, CA, USA","New York, NY, USA","Palo Alto, CA, USA","Paris, Île-de-France, France",Remote,"San Francisco, CA, USA","Seattle, WA, USA","Toronto, ON, Canada",other_city,region_lists,America / Canada - region,Canada - region,Europe - region,Fully Remote - region,India - region,Latin America - region,Partly Remote - region,Remote - region,South Asia - region,United Kingdom - region,United States of America - region,other_region,Unspecified - region,num_locations,num_regions,industries_lists,tags_lists,industries_and_tags,num_tags,AI,API,Analytics,Artificial Intelligence,B2B,Biotech,Climate,Consumer,Consumer Finance,Consumer Health Services,Consumer Health and Wellness,Content,Crypto / Web3,Data Engineering,Developer Tools,E-commerce,Education,Engineering,Enterprise,Finance and Accounting,Fintech,Food and Beverage,Gaming,Generative AI,Hardware,Health Tech,Healthcare,Healthcare IT,Home and Personal,Housing and Real Estate,Human Resources,Industrials,Infrastructure,Logistics,Machine Learning,Marketing,Marketplace,Open Source,Operations,Payments,Product and Design,Productivity,Proptech,Real Estate and Construction,Retail,SaaS,Sales,Security,Social,Supply Chain and Logistics,other_tag,years_since_founding,Early,Growth
0,dropbox,[],https://bookface-images.s3.amazonaws.com/small...,http://dropbox.com,"San Francisco, CA, USA",Dropbox is building the world’s first smart wo...,Backup and share files in the cloud.,4000.0,False,False,False,B2B,B2B -> Productivity,1326791328,[],True,True,False,False,S07,Public,"['B2B', 'Productivity']","['United States of America', 'America / Canada']",Growth,0,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada]",1,0,0,0,0,0,0,0,0,0,1,0,0,1,2,"[B2B, Productivity]",[],"[B2B, Productivity]",2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,16,False,True
1,reddit,[],https://bookface-images.s3.amazonaws.com/small...,http://reddit.com,"San Francisco, CA, USA",Founded by Steve Huffman and Alexis Ohanian in...,The frontpage of the internet.,201.0,False,False,False,Consumer,Consumer -> Content,1326791708,"['Community', 'Social Media', 'Social', 'Socia...",True,True,False,False,S05,Acquired,"['Consumer', 'Content']","['United States of America', 'America / Canada']",Early,0,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada]",1,0,0,0,0,0,0,0,0,0,1,0,0,1,2,"[Consumer, Content]","[Community, Social Media, Social, Social Network]","[Community, Content, Social, Social Network, C...",6,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,18,True,False
2,twitch,"['Justin.tv', 'Twitch']",https://bookface-images.s3.amazonaws.com/small...,http://twitch.com,"San Francisco, CA, USA",Twitch is the world’s leading video platform a...,A global community creating the future of live...,2000.0,True,False,False,Consumer,Consumer -> Content,1326791723,"['Community', 'Gaming', 'Social Media', 'Video...",True,False,False,False,W07,Acquired,"['Consumer', 'Content']","['United States of America', 'America / Canada...",Growth,2,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada, R...",1,0,0,0,0,0,1,1,0,0,1,0,0,1,4,"[Consumer, Content]","[Community, Gaming, Social Media, Video, Socia...","[Video, Community, Content, Social Network, Co...",7,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,16,False,True
3,scribd,[],https://bookface-images.s3.amazonaws.com/small...,http://scribd.com,"San Francisco, CA, USA",Read and listen without limits. Unlimited* aud...,World's largest online library.,300.0,False,False,False,Consumer,Consumer -> Content,1326791580,[],True,True,True,False,S06,Active,"['Consumer', 'Content']","['United States of America', 'America / Canada']",Growth,0,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada]",1,0,0,0,0,0,0,0,0,0,1,0,0,1,2,"[Consumer, Content]",[],"[Content, Consumer]",2,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17,False,True
4,weebly,[],https://bookface-images.s3.amazonaws.com/small...,http://weebly.com,"San Francisco, CA, USA",Company Information\r\nWeebly is a consumer se...,Build a free website that grows with your busi...,201.0,False,False,False,B2B,B2B -> Marketing,1326791493,[],True,False,False,False,W07,Acquired,"['B2B', 'Marketing']","['United States of America', 'America / Canada']",Growth,0,True,True,True,"[San Francisco, CA, USA]",0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,"[United States of America, America / Canada]",1,0,0,0,0,0,0,0,0,0,1,0,0,1,2,"[B2B, Marketing]",[],"[B2B, Marketing]",2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,False,True


The only things left to deal with are processing the description column and creating a success column. I'll push off processing the description column since I want to do an analysis without it first anyway. However, I do need to drop the rows where description is missing since I want to be able to make a direct comparison between the models with and without it. I just want to quickly check if dropping these rows changes the proportion of statuses, since that is the main measure of success.

In [116]:
df['status'].value_counts(normalize=True)

status
Active      0.716780
Inactive    0.160998
Acquired    0.118367
Public      0.003855
Name: proportion, dtype: float64

In [117]:
df.dropna(subset='long_description')['status'].value_counts(normalize=True)

status
Active      0.726916
Inactive    0.148084
Acquired    0.120825
Public      0.004175
Name: proportion, dtype: float64

In [118]:
# close enough, I'll drop these nulls
df.dropna(subset='long_description', inplace=True)

In [119]:
# reset index
df.reset_index(drop=True, inplace=True)

In [120]:
df.shape

(4072, 118)

I will train my models on the subset of the data where success is known (so on those companies where the status is one of public, acquired, or inactive) and test them on the rest of the data (where the status is active). I still need to check for multicollinearity first. I also want to iterate on this process with and without information that intuitively wouldn't be known at a companies inception. For example, team size or age of company would both be low numbers when a company is new. First I'll use all the columns then redo the models without the potentially problematic columns.

In [121]:
df['status'].value_counts()

status
Active      2960
Inactive     603
Acquired     492
Public        17
Name: count, dtype: int64

In [122]:
# create column for success
df['success'] = np.where((df['status'] == 'Public') | (df['status'] == 'Acquired'), 1, 0)

In [123]:
# create column that indicates if status is active or not
df['active'] = np.where(df['status'] == 'Active', 1, 0)

In [124]:
df['success'].sum()

509

In [125]:
# check using numbers above
492+17

509

In [126]:
df['active'].sum()

2960

In [127]:
# now drop the non-numeric columns
mod_df = df.drop(columns=['slug', 'former_names', 'small_logo_thumb_url', 'website',
       'all_locations', 'long_description', 'one_liner', 'industry',
       'subindustry', 'launched_at', 'tags', 'batch', 'status',
       'industries', 'regions', 'stage', 'location_lists', 'region_lists', 'industries_lists', 'tags_lists', 'industries_and_tags'])

In [128]:
mod_df.head()

Unnamed: 0,team_size,highlight_black,highlight_latinx,highlight_women,top_company,top_company_by_revenue,isHiring,nonprofit,num_former_names,has_logo,has_website,has_one_liner,"Austin, TX, USA","Bengaluru, KA, India","Boston, MA, USA","London, England, United Kingdom","Los Angeles, CA, USA","Mexico City, CDMX, Mexico","Mountain View, CA, USA","New York, NY, USA","Palo Alto, CA, USA","Paris, Île-de-France, France",Remote,"San Francisco, CA, USA","Seattle, WA, USA","Toronto, ON, Canada",other_city,America / Canada - region,Canada - region,Europe - region,Fully Remote - region,India - region,Latin America - region,Partly Remote - region,Remote - region,South Asia - region,United Kingdom - region,United States of America - region,other_region,Unspecified - region,num_locations,num_regions,num_tags,AI,API,Analytics,Artificial Intelligence,B2B,Biotech,Climate,Consumer,Consumer Finance,Consumer Health Services,Consumer Health and Wellness,Content,Crypto / Web3,Data Engineering,Developer Tools,E-commerce,Education,Engineering,Enterprise,Finance and Accounting,Fintech,Food and Beverage,Gaming,Generative AI,Hardware,Health Tech,Healthcare,Healthcare IT,Home and Personal,Housing and Real Estate,Human Resources,Industrials,Infrastructure,Logistics,Machine Learning,Marketing,Marketplace,Open Source,Operations,Payments,Product and Design,Productivity,Proptech,Real Estate and Construction,Retail,SaaS,Sales,Security,Social,Supply Chain and Logistics,other_tag,years_since_founding,Early,Growth,success,active
0,4000.0,False,False,False,True,True,False,False,0,True,True,True,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,2,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,16,False,True,1,0
1,201.0,False,False,False,True,True,False,False,0,True,True,True,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,2,6,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,18,True,False,1,0
2,2000.0,True,False,False,True,False,False,False,2,True,True,True,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,1,4,7,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,16,False,True,1,0
3,300.0,False,False,False,True,True,True,False,0,True,True,True,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,2,2,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17,False,True,0,1
4,201.0,False,False,False,True,False,False,False,0,True,True,True,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,2,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,False,True,1,0


In [134]:
# removing columns whose VIF was too high
mod_df.drop(columns=['Fully Remote - region', 'Partly Remote - region', 'Remote - region', 'Engineering', 'Product and Design', 'America / Canada - region', 'Growth', 'South Asia - region', 'United Kingdom - region', 'num_locations'], inplace=True)

In [153]:
# try again
X_withC = sm.add_constant(mod_df.drop(columns=['success', 'active']))

In [154]:
X_withC.head()

Unnamed: 0,const,team_size,highlight_black,highlight_latinx,highlight_women,top_company,top_company_by_revenue,isHiring,nonprofit,num_former_names,has_logo,has_website,has_one_liner,"Austin, TX, USA","Bengaluru, KA, India","Boston, MA, USA","London, England, United Kingdom","Los Angeles, CA, USA","Mexico City, CDMX, Mexico","Mountain View, CA, USA","New York, NY, USA","Palo Alto, CA, USA","Paris, Île-de-France, France",Remote,"San Francisco, CA, USA","Seattle, WA, USA","Toronto, ON, Canada",other_city,Canada - region,Europe - region,India - region,Latin America - region,United States of America - region,other_region,Unspecified - region,num_regions,num_tags,AI,API,Analytics,Artificial Intelligence,B2B,Biotech,Climate,Consumer,Consumer Finance,Consumer Health Services,Consumer Health and Wellness,Content,Crypto / Web3,Data Engineering,Developer Tools,E-commerce,Education,Enterprise,Finance and Accounting,Fintech,Food and Beverage,Gaming,Generative AI,Hardware,Health Tech,Healthcare,Healthcare IT,Home and Personal,Housing and Real Estate,Human Resources,Industrials,Infrastructure,Logistics,Machine Learning,Marketing,Marketplace,Open Source,Operations,Payments,Productivity,Proptech,Real Estate and Construction,Retail,SaaS,Sales,Security,Social,Supply Chain and Logistics,other_tag,years_since_founding,Early
0,1.0,4000.0,False,False,False,True,True,False,False,0,True,True,True,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,2,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,16,False
1,1.0,201.0,False,False,False,True,True,False,False,0,True,True,True,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,2,6,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,18,True
2,1.0,2000.0,True,False,False,True,False,False,False,2,True,True,True,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,4,7,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,16,False
3,1.0,300.0,False,False,False,True,True,True,False,0,True,True,True,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,2,2,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17,False
4,1.0,201.0,False,False,False,True,False,False,False,0,True,True,True,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,2,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,False


In [155]:
for col in X_withC.columns:
    X_withC[col] = X_withC[col].astype(np.float64)

In [156]:
X_withC.head()

Unnamed: 0,const,team_size,highlight_black,highlight_latinx,highlight_women,top_company,top_company_by_revenue,isHiring,nonprofit,num_former_names,has_logo,has_website,has_one_liner,"Austin, TX, USA","Bengaluru, KA, India","Boston, MA, USA","London, England, United Kingdom","Los Angeles, CA, USA","Mexico City, CDMX, Mexico","Mountain View, CA, USA","New York, NY, USA","Palo Alto, CA, USA","Paris, Île-de-France, France",Remote,"San Francisco, CA, USA","Seattle, WA, USA","Toronto, ON, Canada",other_city,Canada - region,Europe - region,India - region,Latin America - region,United States of America - region,other_region,Unspecified - region,num_regions,num_tags,AI,API,Analytics,Artificial Intelligence,B2B,Biotech,Climate,Consumer,Consumer Finance,Consumer Health Services,Consumer Health and Wellness,Content,Crypto / Web3,Data Engineering,Developer Tools,E-commerce,Education,Enterprise,Finance and Accounting,Fintech,Food and Beverage,Gaming,Generative AI,Hardware,Health Tech,Healthcare,Healthcare IT,Home and Personal,Housing and Real Estate,Human Resources,Industrials,Infrastructure,Logistics,Machine Learning,Marketing,Marketplace,Open Source,Operations,Payments,Productivity,Proptech,Real Estate and Construction,Retail,SaaS,Sales,Security,Social,Supply Chain and Logistics,other_tag,years_since_founding,Early
0,1.0,4000.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0
1,1.0,201.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,18.0,1.0
2,1.0,2000.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,16.0,0.0
3,1.0,300.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0
4,1.0,201.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0


In [165]:
# create the initial train and test sets
train = mod_df[mod_df['active'] == 0]
test = mod_df[mod_df['active'] == 1]

In [166]:
# drop the active column from these sets
train = train.drop(columns='active')
test = test.drop(columns='active')

In [169]:
# create X and y from train dataframe
X = train.drop(columns='success')
y = train['success']

In [173]:
# do train test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Modelling

## Logistic Regression Model <a class="anchor" id="logregmodel"></a>

In [188]:
# instantiate a logistic regression model
logreg_model = LogisticRegression(max_iter=10000)

# fit it to training data
logreg_model.fit(X_train, y_train)

# score the accuracy of the model
lr_train_score = logreg_model.score(X_train, y_train)
lr_test_score = logreg_model.score(X_test, y_test)

print(f"Score on training set: {lr_train_score}")
print(f"Score on test set: {lr_test_score}")

Score on training set: 0.7457817772778402
Score on test set: 0.6771300448430493


In [189]:
# Generate confusion matrix
lr_pred = logreg_model.predict(X_test)
cf_matrix = confusion_matrix(y_test, lr_pred)

# label rows and columns
cf_df = pd.DataFrame(
    cf_matrix, 
    columns=["Predicted Failure", "Predicted Success"],
    index=["True Failure", "True Success"]
)

display(cf_df)

Unnamed: 0,Predicted Failure,Predicted Success
True Failure,94,15
True Success,57,57


In [190]:
lr_precision = precision_score(y_test, lr_pred)
lr_recall = recall_score(y_test, lr_pred)
lr_f1 = f1_score(y_test, lr_pred)
print(f'precision = {lr_precision}, recall = {lr_recall}, f1 = {lr_f1}')

precision = 0.7916666666666666, recall = 0.5, f1 = 0.6129032258064516


In [191]:
# creating a dataframe of the coefficients
coef = pd.DataFrame(logreg_model.coef_,  columns = X_train.columns)

In [192]:
coef

Unnamed: 0,team_size,highlight_black,highlight_latinx,highlight_women,top_company,top_company_by_revenue,isHiring,nonprofit,num_former_names,has_logo,has_website,has_one_liner,"Austin, TX, USA","Bengaluru, KA, India","Boston, MA, USA","London, England, United Kingdom","Los Angeles, CA, USA","Mexico City, CDMX, Mexico","Mountain View, CA, USA","New York, NY, USA","Palo Alto, CA, USA","Paris, Île-de-France, France",Remote,"San Francisco, CA, USA","Seattle, WA, USA","Toronto, ON, Canada",other_city,Canada - region,Europe - region,India - region,Latin America - region,United States of America - region,other_region,Unspecified - region,num_regions,num_tags,AI,API,Analytics,Artificial Intelligence,B2B,Biotech,Climate,Consumer,Consumer Finance,Consumer Health Services,Consumer Health and Wellness,Content,Crypto / Web3,Data Engineering,Developer Tools,E-commerce,Education,Enterprise,Finance and Accounting,Fintech,Food and Beverage,Gaming,Generative AI,Hardware,Health Tech,Healthcare,Healthcare IT,Home and Personal,Housing and Real Estate,Human Resources,Industrials,Infrastructure,Logistics,Machine Learning,Marketing,Marketplace,Open Source,Operations,Payments,Productivity,Proptech,Real Estate and Construction,Retail,SaaS,Sales,Security,Social,Supply Chain and Logistics,other_tag,years_since_founding,Early
0,0.015454,-0.019778,-0.205002,0.350843,1.787762,0.142749,3e-06,0.124754,-0.154924,0.599989,-0.005632,0.627414,1.277441,-0.422325,0.48861,0.074449,0.029944,0.021456,0.047286,-0.038499,-0.200841,-0.457027,-1.023146,0.428756,-0.706172,0.137623,0.204059,-0.325683,-0.014237,0.239458,-0.540421,-0.354583,-0.00953,-1.083418,0.285586,-0.116256,-0.074384,0.564202,0.416462,0.504231,0.679185,0.465626,-1.429672,0.394608,0.592336,-0.123076,-0.669097,0.197086,-0.07092,-0.638793,0.03168,0.014293,0.031829,-0.373571,0.113362,0.336524,-0.427214,-0.438395,-0.153775,-0.543041,-0.165967,0.092225,0.642281,0.087857,0.413016,-0.591035,-0.251231,0.609682,0.22966,-0.205986,0.527996,-0.501616,0.331,0.357707,0.585814,0.017887,-0.670411,-0.074284,-0.461505,0.436826,-0.11848,0.589802,-0.068472,-0.148151,-0.02631,0.043551,-0.985987


In [193]:
# adding the odds ratios to the dataframe to look at them side by side with the coefficients
coef.loc[1] = np.exp(logreg_model.coef_)[0]
coef.rename({0:'coefficients', 1:'odds_ratios'}, inplace=True)

In [194]:
# sorting by coefficients, descending --> positively correlates with success
coef.sort_values(axis=1, by='coefficients', ascending=False)

Unnamed: 0,top_company,"Austin, TX, USA",B2B,Healthcare IT,has_one_liner,Infrastructure,has_logo,Consumer Finance,Security,Payments,API,Marketing,Artificial Intelligence,"Boston, MA, USA",Biotech,SaaS,"San Francisco, CA, USA",Analytics,Housing and Real Estate,Consumer,Operations,highlight_women,Fintech,Open Source,num_regions,India - region,Logistics,other_city,Content,top_company_by_revenue,"Toronto, ON, Canada",nonprofit,Finance and Accounting,Healthcare,Home and Personal,"London, England, United Kingdom","Mountain View, CA, USA",years_since_founding,Education,Developer Tools,"Los Angeles, CA, USA","Mexico City, CDMX, Mexico",Productivity,team_size,E-commerce,isHiring,has_website,other_region,Europe - region,highlight_black,other_tag,"New York, NY, USA",Social,Crypto / Web3,Real Estate and Construction,AI,num_tags,Sales,Consumer Health Services,Supply Chain and Logistics,Generative AI,num_former_names,Health Tech,"Palo Alto, CA, USA",highlight_latinx,Machine Learning,Industrials,Canada - region,United States of America - region,Enterprise,"Bengaluru, KA, India",Food and Beverage,Gaming,"Paris, Île-de-France, France",Retail,Marketplace,Latin America - region,Hardware,Human Resources,Data Engineering,Consumer Health and Wellness,Proptech,"Seattle, WA, USA",Early,Remote,Unspecified - region,Climate
coefficients,1.787762,1.277441,0.679185,0.642281,0.627414,0.609682,0.599989,0.592336,0.589802,0.585814,0.564202,0.527996,0.504231,0.48861,0.465626,0.436826,0.428756,0.416462,0.413016,0.394608,0.357707,0.350843,0.336524,0.331,0.285586,0.239458,0.22966,0.204059,0.197086,0.142749,0.137623,0.124754,0.113362,0.092225,0.087857,0.074449,0.047286,0.043551,0.031829,0.03168,0.029944,0.021456,0.017887,0.015454,0.014293,3e-06,-0.005632,-0.00953,-0.014237,-0.019778,-0.02631,-0.038499,-0.068472,-0.07092,-0.074284,-0.074384,-0.116256,-0.11848,-0.123076,-0.148151,-0.153775,-0.154924,-0.165967,-0.200841,-0.205002,-0.205986,-0.251231,-0.325683,-0.354583,-0.373571,-0.422325,-0.427214,-0.438395,-0.457027,-0.461505,-0.501616,-0.540421,-0.543041,-0.591035,-0.638793,-0.669097,-0.670411,-0.706172,-0.985987,-1.023146,-1.083418,-1.429672
odds_ratios,5.976065,3.587447,1.972269,1.900812,1.872761,1.839847,1.822099,1.808208,1.803631,1.796452,1.758044,1.695531,1.655713,1.630049,1.593012,1.547787,1.535346,1.516587,1.511369,1.483802,1.430047,1.420264,1.400073,1.39236,1.330541,1.27056,1.258172,1.226371,1.217849,1.153441,1.147543,1.13287,1.120037,1.096612,1.091832,1.07729,1.048422,1.044513,1.032341,1.032187,1.030397,1.021687,1.018048,1.015574,1.014395,1.000003,0.994384,0.990515,0.985863,0.980417,0.974033,0.962232,0.93382,0.931536,0.928408,0.928315,0.890247,0.888269,0.884197,0.862301,0.857465,0.85648,0.847074,0.818042,0.814646,0.813845,0.777843,0.722034,0.701466,0.688272,0.655521,0.652324,0.645071,0.633163,0.630334,0.605552,0.582503,0.580979,0.553754,0.527929,0.512171,0.511498,0.49353,0.373071,0.359462,0.338437,0.239387


In [195]:
# sorting by coefficients, ascending --> negatively correlates with success
coef.sort_values(axis=1, by='coefficients')

Unnamed: 0,Climate,Unspecified - region,Remote,Early,"Seattle, WA, USA",Proptech,Consumer Health and Wellness,Data Engineering,Human Resources,Hardware,Latin America - region,Marketplace,Retail,"Paris, Île-de-France, France",Gaming,Food and Beverage,"Bengaluru, KA, India",Enterprise,United States of America - region,Canada - region,Industrials,Machine Learning,highlight_latinx,"Palo Alto, CA, USA",Health Tech,num_former_names,Generative AI,Supply Chain and Logistics,Consumer Health Services,Sales,num_tags,AI,Real Estate and Construction,Crypto / Web3,Social,"New York, NY, USA",other_tag,highlight_black,Europe - region,other_region,has_website,isHiring,E-commerce,team_size,Productivity,"Mexico City, CDMX, Mexico","Los Angeles, CA, USA",Developer Tools,Education,years_since_founding,"Mountain View, CA, USA","London, England, United Kingdom",Home and Personal,Healthcare,Finance and Accounting,nonprofit,"Toronto, ON, Canada",top_company_by_revenue,Content,other_city,Logistics,India - region,num_regions,Open Source,Fintech,highlight_women,Operations,Consumer,Housing and Real Estate,Analytics,"San Francisco, CA, USA",SaaS,Biotech,"Boston, MA, USA",Artificial Intelligence,Marketing,API,Payments,Security,Consumer Finance,has_logo,Infrastructure,has_one_liner,Healthcare IT,B2B,"Austin, TX, USA",top_company
coefficients,-1.429672,-1.083418,-1.023146,-0.985987,-0.706172,-0.670411,-0.669097,-0.638793,-0.591035,-0.543041,-0.540421,-0.501616,-0.461505,-0.457027,-0.438395,-0.427214,-0.422325,-0.373571,-0.354583,-0.325683,-0.251231,-0.205986,-0.205002,-0.200841,-0.165967,-0.154924,-0.153775,-0.148151,-0.123076,-0.11848,-0.116256,-0.074384,-0.074284,-0.07092,-0.068472,-0.038499,-0.02631,-0.019778,-0.014237,-0.00953,-0.005632,3e-06,0.014293,0.015454,0.017887,0.021456,0.029944,0.03168,0.031829,0.043551,0.047286,0.074449,0.087857,0.092225,0.113362,0.124754,0.137623,0.142749,0.197086,0.204059,0.22966,0.239458,0.285586,0.331,0.336524,0.350843,0.357707,0.394608,0.413016,0.416462,0.428756,0.436826,0.465626,0.48861,0.504231,0.527996,0.564202,0.585814,0.589802,0.592336,0.599989,0.609682,0.627414,0.642281,0.679185,1.277441,1.787762
odds_ratios,0.239387,0.338437,0.359462,0.373071,0.49353,0.511498,0.512171,0.527929,0.553754,0.580979,0.582503,0.605552,0.630334,0.633163,0.645071,0.652324,0.655521,0.688272,0.701466,0.722034,0.777843,0.813845,0.814646,0.818042,0.847074,0.85648,0.857465,0.862301,0.884197,0.888269,0.890247,0.928315,0.928408,0.931536,0.93382,0.962232,0.974033,0.980417,0.985863,0.990515,0.994384,1.000003,1.014395,1.015574,1.018048,1.021687,1.030397,1.032187,1.032341,1.044513,1.048422,1.07729,1.091832,1.096612,1.120037,1.13287,1.147543,1.153441,1.217849,1.226371,1.258172,1.27056,1.330541,1.39236,1.400073,1.420264,1.430047,1.483802,1.511369,1.516587,1.535346,1.547787,1.593012,1.630049,1.655713,1.695531,1.758044,1.796452,1.803631,1.808208,1.822099,1.839847,1.872761,1.900812,1.972269,3.587447,5.976065


## Decision Tree Model <a class="anchor" id="dtmodel"></a>

In [196]:
# instantiate decision tree
dt_model = DecisionTreeClassifier(max_depth=7)

# fit it to training data
dt_model.fit(X_train, y_train)

# score the accuracy
dt_train_score = dt_model.score(X_train, y_train)
dt_test_score = dt_model.score(X_test, y_test)

print(f"Score on training set: {dt_train_score}")
print(f"Score on test set: {dt_test_score}")

Score on training set: 0.8042744656917885
Score on test set: 0.6502242152466368


In [197]:
dt_pred = dt_model.predict(X_test)
dt_precision = precision_score(y_test, dt_pred)
dt_recall = recall_score(y_test, dt_pred)
dt_f1 = f1_score(y_test, dt_pred)
print(f'precision = {dt_precision}, recall = {dt_recall}, f1 = {dt_f1}')

precision = 0.6836734693877551, recall = 0.5877192982456141, f1 = 0.6320754716981133


## Random Forest Model <a class="anchor" id="rfmodel"></a>

In [198]:
# instantiate
rf_model = RandomForestClassifier(n_estimators=50, max_depth=9)

# fit
rf_model.fit(X_train, y_train)

# score
rf_train_score = rf_model.score(X_train, y_train)
rf_test_score = rf_model.score(X_test, y_test)

print(f"Score on training set: {rf_train_score}")
print(f"Score on test set: {rf_test_score}")

Score on training set: 0.8233970753655793
Score on test set: 0.695067264573991


In [199]:
rf_pred = rf_model.predict(X_test)
rf_precision = precision_score(y_test, rf_pred)
rf_recall = recall_score(y_test, rf_pred)
rf_f1 = f1_score(y_test, rf_pred)
print(f'precision = {rf_precision}, recall = {rf_recall}, f1 = {rf_f1}')

precision = 0.8108108108108109, recall = 0.5263157894736842, f1 = 0.6382978723404256


## Support Vector Machine Model <a class="anchor" id="svmmodel"></a>

In [200]:
# instantiate
svm_model = LinearSVC(dual='auto')

# fit
svm_model.fit(X_train, y_train)

# score
svm_train_score = svm_model.score(X_train, y_train)
svm_test_score = svm_model.score(X_test, y_test)

print(f"Score on training set: {svm_train_score}")
print(f"Score on test set: {svm_test_score}")

Score on training set: 0.7401574803149606
Score on test set: 0.6681614349775785


In [201]:
svm_pred = svm_model.predict(X_test)
svm_precision = precision_score(y_test, svm_pred)
svm_recall = recall_score(y_test, svm_pred)
svm_f1 = f1_score(y_test, svm_pred)
print(f'precision = {svm_precision}, recall = {svm_recall}, f1 = {svm_f1}')

precision = 0.7702702702702703, recall = 0.5, f1 = 0.6063829787234043


In [202]:
# instantiate
svc_model = SVC()

# fit
svc_model.fit(X_train, y_train)

# score
svc_train_score = svc_model.score(X_train, y_train)
svc_test_score = svc_model.score(X_test, y_test)

print(f"Score on training set: {svc_train_score}")
print(f"Score on test set: {svc_test_score}")

Score on training set: 0.6479190101237345
Score on test set: 0.5605381165919282


# Word Embeddings

In [211]:
model = gensim.models.KeyedVectors.load_word2vec_format(
    'lexvec.commoncrawl.300d.W.pos.vectors', binary=False
)

In [None]:
model2 = gensim.models.KeyedVectors.load_word2vec_format(
    'lexvec-wikipedia-word-vectors', binary=False
)

In [213]:
model.vectors.shape

(2000000, 300)

In [215]:
df['long_description'][0]

'Dropbox is building the world’s first smart workspace.\r\nBack in 2007, making work better for people meant designing a simpler way to keep files in sync. Today, it means designing products that reduce busywork so you can focus on the work that matters.\r\n\r\nMost “productivity tools” get in your way. They constantly ping, distract, and disrupt your team’s flow, so you spend your days switching between apps and tracking down feedback. It’s busywork, not the meaningful stuff. We want to change this. \r\n\r\nWe believe there’s a more enlightened way to work. Dropbox helps people be organized, stay focused, and get in sync with their teams.'

In [217]:
desc = df['long_description'][0]

tokenized = simple_preprocess(desc)

print(tokenized)

['dropbox', 'is', 'building', 'the', 'world', 'first', 'smart', 'workspace', 'back', 'in', 'making', 'work', 'better', 'for', 'people', 'meant', 'designing', 'simpler', 'way', 'to', 'keep', 'files', 'in', 'sync', 'today', 'it', 'means', 'designing', 'products', 'that', 'reduce', 'busywork', 'so', 'you', 'can', 'focus', 'on', 'the', 'work', 'that', 'matters', 'most', 'productivity', 'tools', 'get', 'in', 'your', 'way', 'they', 'constantly', 'ping', 'distract', 'and', 'disrupt', 'your', 'team', 'flow', 'so', 'you', 'spend', 'your', 'days', 'switching', 'between', 'apps', 'and', 'tracking', 'down', 'feedback', 'it', 'busywork', 'not', 'the', 'meaningful', 'stuff', 'we', 'want', 'to', 'change', 'this', 'we', 'believe', 'there', 'more', 'enlightened', 'way', 'to', 'work', 'dropbox', 'helps', 'people', 'be', 'organized', 'stay', 'focused', 'and', 'get', 'in', 'sync', 'with', 'their', 'teams']


In [218]:
word_embeddings = np.stack([model[word] for word in tokenized])
word_embeddings.shape

(102, 300)

In [219]:
word_embeddings.mean(axis=0).shape

(300,)

In [220]:
def sentence2vec(text):
    """
    Embed a sentence by averaging the word vectors of the tokenized text. Out-of-vocabulary words are replaced by the zero-vector.
    -----
    
    Input: text (string)
    Output: embedding vector (np.array)
    """
    tokenized = simple_preprocess(text)
    
    word_embeddings = [np.zeros(300)]
    for word in tokenized:
        # if the word is in the model then embed
        if word in model:
            vector = model[word]
        # add zeros for out-of-vocab words
        else:
            vector = np.zeros(300)
            
        word_embeddings.append(vector)
    
    # average the word vectors
    sentence_embedding = np.stack(word_embeddings).mean(axis=0)
    
    return sentence_embedding

In [228]:
dropbox = sentence2vec(df['long_description'][0])

In [230]:
reddit = sentence2vec(df['long_description'][1])

In [240]:
reddit.shape

(300,)

In [232]:
cosine_distance(dropbox, reddit)

0.26934127757553317

In [260]:
# now create a new dataframe that has all the features from before concatenated with the word embeddings for each row's description
# column names will be embed 1 through 300
embeddings = pd.DataFrame(columns=[str(x) for x in range(1, 301)])

In [261]:
embeddings

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300


In [262]:
for i in range(len(df['long_description'])):
    embeddings.loc[i] = sentence2vec(df['long_description'][i])

In [263]:
embeddings.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300
0,-0.012317,0.057962,-0.00428,-0.065468,0.017166,0.020322,0.01026,-0.041826,0.033459,-0.025339,-0.009442,0.05963,0.0198,0.110621,-0.032039,-0.04004,-0.026598,0.029308,0.012961,-0.051861,0.015379,0.019517,-0.015711,0.00015,-0.009759,0.010246,-0.064791,0.07874,-0.005576,0.00299,0.061068,0.026288,0.051026,0.081943,0.05992,0.002325,0.026035,0.052339,0.046128,-0.038038,-0.077274,-0.075645,0.057859,-0.101632,-0.03746,-0.002661,0.088954,-0.015326,-0.014149,-0.052736,0.069397,-0.072194,-0.052505,-0.030569,-0.046104,0.044662,0.032549,0.034161,-0.046095,0.007733,0.005278,-0.006667,-0.023111,0.001431,0.010152,-0.053458,0.005141,-0.043019,-0.029078,-0.007225,-0.014356,-0.035448,-0.063953,-0.034618,0.004803,-0.01329,-0.015247,-0.042822,-0.011008,-0.068611,-0.004259,-0.103977,-0.096593,-0.00471,0.077831,-0.056452,0.057874,-0.004201,-0.058885,-0.014621,0.029993,-0.01161,0.029127,0.05453,-0.039637,0.103173,-0.024941,0.039605,-0.118019,-0.026757,-0.049257,-0.015806,0.045254,-0.000693,-0.008152,0.002924,0.046726,0.004465,-0.058077,-0.032932,0.023592,-0.006918,0.008864,-0.054953,-0.050144,0.095811,-0.059375,0.028743,0.037478,-0.035895,-0.002667,0.100011,-0.034884,0.006224,-0.027502,-0.107062,0.008726,-0.038902,0.002447,0.01199,-0.059408,0.067119,-0.029586,0.074272,0.087575,-0.016203,-0.104529,0.020069,-0.005061,-0.02029,-0.057631,-0.085919,-0.006956,-0.014459,0.001429,-0.081658,0.005445,0.048047,0.104376,-0.008559,0.037547,0.010641,0.049654,-0.017577,0.012316,-0.051718,0.002168,-0.00877,-0.085678,-0.00287,-0.039238,0.038353,0.073392,0.031152,0.043569,0.090239,0.069478,-0.03544,0.11009,-0.062988,-0.044831,-0.006027,-0.091348,-0.038392,0.023617,0.00519,-0.097036,-0.003612,0.033736,-0.053642,0.035948,-0.031467,0.081459,0.022531,-0.080701,0.013001,-0.092177,-0.02716,0.044867,0.008185,0.012569,0.012237,0.043546,0.043198,-0.007195,-0.006776,0.002999,0.087368,-0.031379,0.182042,0.025828,-0.057733,-0.073188,0.015434,0.013741,-0.008775,0.007391,-0.018836,0.041056,-0.011617,-0.034941,0.046269,0.003306,-0.078995,0.000796,0.003666,0.00878,-0.061906,0.130383,0.063447,-0.03918,-0.053853,-0.015598,0.030893,-0.054284,0.029541,0.025251,0.042739,0.006253,-0.007237,-0.010026,-0.024071,-0.049525,0.031683,0.00389,-0.02399,-0.035286,-0.039391,-0.075753,0.008394,-0.060022,0.051096,0.04822,0.060955,-0.020574,-0.063227,0.04472,-0.000654,-0.039825,-0.00081,-0.03081,0.002471,-0.029655,0.009088,-0.041395,-0.097371,0.054033,0.019108,-0.01401,0.008799,-0.099116,0.022856,0.03729,0.082297,-0.066029,0.068852,0.014328,0.020137,0.01045,0.019573,0.061073,-0.035276,0.019441,-0.003396,0.058954,0.070687,0.101444,-0.082844,0.053795,-0.029197,0.012972,-0.043696,-0.009466,0.004437,-0.044073,0.029016,-0.020407,-0.004954,-0.011953,0.01385,-0.065447,-0.042941,-0.091139,-0.009111,-0.056382,-0.008814,-0.094955,-0.013726,-0.039655,-0.007641
1,-0.012925,0.068325,0.012582,-0.070207,-0.006114,0.071036,0.011452,0.025784,0.05202,-0.004709,-0.024533,-0.003633,0.005719,0.122335,-0.034514,-0.030305,-0.060178,-0.003238,0.043227,-0.006075,0.020879,-0.02879,-0.023512,-0.001608,0.011362,0.007179,-0.107738,0.027416,-0.007873,-0.021935,0.032564,-0.005588,0.082752,0.039168,0.072265,0.004811,0.01022,0.009857,0.040663,-0.063294,-0.060462,-0.072189,0.046104,-0.11193,-0.026987,-0.000943,0.078608,0.058025,0.023301,-0.019724,0.022918,-0.036565,-0.111122,-0.027927,-0.06402,0.039201,0.046129,-0.017671,-0.076886,0.04347,-0.015192,-0.002078,0.031842,-0.037026,-0.021741,-0.028309,-0.000319,-0.018193,-0.014527,0.032582,-0.034241,0.008494,-0.08921,0.028734,-0.005046,0.075087,0.0071,-0.053404,0.011009,-0.024965,0.02324,-0.060663,-0.052475,-0.02347,0.052054,-0.027264,0.080446,0.069228,-0.039326,-0.079565,0.037417,-0.025408,-0.001506,0.022264,-0.0656,0.03992,-0.010281,0.04723,-0.094832,-0.02719,-0.023838,-0.017591,0.054413,-0.010517,-0.017039,-0.006603,0.10561,-0.059525,-0.030704,-0.042872,-0.002478,-0.037133,0.016353,-0.03199,-0.077429,0.058486,-0.009783,0.007172,0.014928,-0.029913,0.018931,0.082843,-0.055909,-0.014278,-0.023734,-0.131666,-0.010279,0.002609,-0.003505,-0.021328,-0.009261,0.084499,-0.014882,0.018882,0.082696,-0.044539,-0.053992,-0.022757,-0.027735,0.036891,-0.041944,-0.004116,0.010547,-0.033496,-0.070917,-0.103963,-0.026561,0.041257,0.025513,0.011483,0.037078,0.029244,0.013852,-0.027163,0.062535,-0.034476,0.010595,-0.04044,-0.010511,0.016163,-0.089664,-0.019296,0.075761,0.067993,0.085181,0.071149,-0.017012,-0.017615,0.015798,-0.084797,-0.044507,-0.040418,-0.03715,-0.030255,-0.012973,-0.007074,-0.09043,0.033601,0.047784,0.006313,-0.030555,0.017526,0.018222,-0.026743,-0.030122,-0.017233,-0.093659,-0.035227,0.09703,0.011413,0.055526,0.002365,0.052427,0.024868,-0.006487,0.051826,0.000865,0.052408,-0.014941,0.128541,0.032864,-0.062775,-0.059052,0.034846,0.011675,-0.015743,0.054379,0.032923,0.021307,-0.03115,0.004337,0.046209,0.012611,-0.07051,-0.048028,0.058425,-0.020251,-0.019029,0.074983,0.083061,-0.04996,-0.062724,0.026844,-0.024707,-0.069096,0.00658,-0.057327,0.039397,0.012258,0.01542,-0.046398,0.007405,-0.072151,0.037831,0.034621,-0.025803,-0.00514,-0.009319,-0.027796,-0.007878,-0.013354,0.053684,0.055932,0.039703,-0.039781,-0.016011,0.027415,-0.024483,-0.053968,0.03345,-0.007738,-0.066576,0.005537,0.001885,-0.039934,-0.068756,0.037087,-0.001129,-0.010486,-0.005867,-0.042985,0.045079,0.034643,0.042481,-0.001885,0.08048,0.007911,0.00874,-0.001496,0.022673,0.0448,-0.02077,0.035179,-0.042033,0.043001,0.053765,0.066017,-0.014329,-0.033259,-0.038259,0.026634,-0.064755,0.00726,-0.008814,0.044452,0.018872,-0.041313,0.067779,0.045921,0.043908,-0.057447,-0.002487,-0.069005,-0.009714,-0.046641,0.010742,-0.057428,-0.057351,0.027911,-0.048602
2,-0.029181,0.089071,0.01515,-0.096253,0.000133,0.054852,0.015016,0.040473,0.003746,-0.0334,0.008702,0.020586,-0.004574,0.129011,-0.056941,0.009195,-0.057836,-0.017242,-0.035053,-0.033125,0.052225,-0.050664,-0.028922,-0.014833,0.00906,0.039198,-0.111285,0.022531,0.010937,0.003754,0.05244,0.06807,0.045806,0.064925,0.072028,0.017937,0.013093,0.019399,0.003008,-0.054742,-0.048361,-0.044113,0.060665,-0.081479,-0.068766,0.076939,0.077312,0.018426,0.064554,0.021598,0.052803,-0.005911,-0.108612,-0.063006,-0.050817,0.074147,0.063231,-0.016047,-0.052114,-0.018032,-0.009582,-0.038365,-0.002102,-0.035351,-0.025087,-0.029383,-0.0297,-0.029463,0.008663,0.080649,0.005357,-0.022571,-0.093611,-0.021234,-0.013611,0.002509,-0.023165,-0.082038,-0.055828,-0.008474,0.04722,-0.067384,-0.041188,-0.047526,0.093196,-0.099472,0.050333,0.017146,-0.030982,-0.032998,0.092624,-0.01124,0.006113,0.028093,-0.063653,0.050621,-0.017388,0.038956,-0.032984,-0.048004,-0.059146,-0.075851,0.039941,0.006904,0.002847,0.053438,0.115178,-0.035026,-0.016118,-0.038,-0.0725,-0.018949,0.063706,-0.01324,-0.126617,0.009197,-0.030123,-0.027377,0.054129,-0.043818,0.00387,0.080212,-0.040174,0.047089,-0.018194,-0.108438,-0.039702,-0.037866,-0.009613,-0.03342,-0.038282,0.058471,-0.019447,0.038302,0.109962,0.017062,-0.076576,0.043384,-0.049199,-0.010697,-0.035717,-0.065439,0.028339,-0.002434,0.002149,-0.042937,-0.024223,0.05671,0.104877,-0.013064,0.043854,0.03313,0.029689,0.000603,0.015301,-0.050891,0.003282,-0.068314,-0.012024,0.00361,-0.084194,-0.008695,0.061739,0.048132,0.060997,0.025957,0.069691,-0.036494,0.07588,-0.108493,-0.057049,-0.057617,-0.031346,0.028427,0.054989,0.00807,-0.068675,-0.01507,0.057834,0.024277,-0.044628,0.060331,0.031633,-0.063601,0.001935,-0.020711,-0.101364,-0.01009,0.097091,0.050297,0.032007,-0.027513,0.054518,-0.004226,-0.057694,0.034912,-0.004414,0.044882,-0.065989,0.143545,-0.032316,-0.041998,-0.015124,0.036994,-0.00953,-0.049543,0.008269,0.025122,0.02974,-0.028407,0.063032,0.047257,-0.021762,-0.120383,-0.042622,0.026721,-0.010975,-0.05684,0.062169,0.133135,-0.056864,-0.016516,0.007985,0.002118,-0.079442,0.029553,0.015503,0.019077,0.01637,-0.04205,-0.032902,0.011067,-0.040924,0.041928,0.015556,0.017211,0.02847,0.005642,-0.044495,-0.002462,-0.006012,0.059784,0.075199,0.045789,-0.029441,0.019732,-0.001593,-0.015809,-0.062697,-0.003406,-0.013217,-0.035968,-0.003262,-0.042702,-0.078845,-0.047939,0.056677,-0.018103,-0.022322,-0.000669,-0.05722,0.074602,0.030007,0.106048,-0.01352,0.068457,0.014476,0.028526,-0.001098,0.036018,0.046017,-0.016539,0.031878,0.004717,0.060658,-0.013691,0.063258,-0.019897,0.051765,-0.024048,0.034322,-0.032805,-0.039179,-0.044717,-0.015082,0.021996,0.013447,0.047895,0.003928,0.01185,-0.062292,0.038654,-0.050963,0.015019,-0.033382,0.006723,-0.034423,-0.064644,-0.043332,-0.008316
3,0.000565,0.081831,-0.023884,-0.062085,-0.004643,0.002487,-0.024667,-0.01843,-0.00168,-0.02489,-0.013761,0.012421,0.05053,0.146332,-0.048669,-0.038185,-0.017294,0.030368,0.055898,-0.048022,0.029388,-0.007646,0.002491,-0.003677,0.026631,0.029276,-0.095247,0.065438,-0.000917,-0.020601,0.04583,0.005318,0.061817,0.056365,0.097407,-0.001812,0.013194,-0.00376,0.014847,-0.068991,-0.079257,-0.068577,0.079831,-0.053023,-0.06996,0.035004,0.085839,0.002184,-0.037845,-0.01847,0.033375,-0.06208,-0.142034,-0.031675,-0.051455,0.026113,0.058289,0.016082,-0.038199,-0.011879,0.015461,-0.015013,0.023052,0.00294,0.011858,-0.061447,-0.006705,-0.050642,-0.053772,-0.015263,0.020349,-0.016082,-0.033813,0.005487,-0.040068,0.051573,0.008212,-0.079654,-0.007213,-0.023425,0.02686,-0.067554,-0.064925,0.007495,0.067808,-0.015241,0.060687,0.010666,-0.011276,-0.084183,0.023167,0.015084,-0.001079,0.059728,-0.064013,0.069736,-0.016537,0.016044,-0.089359,-0.018737,-0.029255,-0.053422,0.045195,-0.006384,-0.027963,0.007602,0.051566,-0.024279,-0.02791,-0.041073,-0.004481,-0.021417,-0.020258,-0.015045,-0.080685,0.053854,-0.076386,0.021436,0.055555,-0.043951,0.031706,0.095553,-0.022395,-0.013888,-0.028245,-0.118269,-0.003378,-0.011548,0.038565,-0.009736,-0.037318,0.063481,0.000865,0.023888,0.067544,-0.010241,-0.04641,0.01599,-0.003399,0.003943,-0.022837,-0.058131,0.012285,-0.061609,-0.021375,-0.103423,0.006479,0.036656,0.079912,0.066505,0.075789,0.020034,0.06541,0.028328,-0.003942,-0.044858,-0.002396,-0.022516,-0.037839,-0.021627,-0.10628,-0.000782,0.08784,0.056394,0.061429,0.047131,0.054796,-0.011076,0.078151,-0.090052,-0.045226,-0.000783,-0.045408,-0.0269,0.018668,-0.020646,-0.094224,0.018914,0.064063,0.003783,-0.021872,-0.001703,0.057606,0.017829,-0.041111,-0.003645,-0.048168,-0.031757,0.078562,0.048802,0.051103,0.001612,0.040735,0.054167,0.016101,0.038673,-0.04933,0.069623,-0.042776,0.108023,0.007575,-0.055501,-0.056711,-0.006731,0.003692,-0.013554,0.044648,-0.004804,-0.009801,-0.022916,-0.03351,0.067465,0.031865,-0.09985,-0.026145,0.001841,0.012262,-0.053901,0.096106,0.110842,-0.068507,-0.036928,2.7e-05,0.034136,-0.030734,0.013342,0.004869,0.035869,0.032193,-0.009326,-0.054101,-0.048267,-0.031649,0.043119,-0.011225,-0.034764,-0.01026,-0.013023,-0.010645,0.016932,-0.036283,0.070133,0.065875,0.05812,0.000468,-0.037147,0.037283,-0.012312,-0.064457,-0.019237,-0.029979,-0.051699,-0.01278,-0.010797,-0.054104,-0.096128,0.043332,0.020209,0.024109,0.034667,-0.038525,-0.000579,-0.002408,0.070042,-0.038923,0.059514,0.017851,0.044447,0.01938,0.042349,-0.001268,-0.035257,0.031711,-0.038535,0.055307,0.063442,0.087636,0.014155,-0.016443,-0.042355,0.032029,-0.065171,0.024269,0.020719,0.030495,0.035247,-0.028253,0.030109,0.038719,0.025247,-0.062824,-0.001652,-0.082428,0.048707,-0.024009,0.018111,-0.048337,-0.037789,-0.018611,0.011336
4,0.007876,0.06305,-0.017276,-0.077545,-0.019022,0.017432,0.027112,0.011188,0.035279,-0.021088,-0.013109,0.028218,0.033441,0.15495,-0.01547,-0.039279,-0.009749,0.009035,0.019892,-0.056281,0.028947,-0.018572,0.01064,-0.012964,-0.001425,0.031276,-0.069615,0.01841,0.037617,-0.042898,0.051519,0.019778,0.050116,0.063972,0.090834,0.005196,0.052382,-0.009045,0.017207,-0.06229,-0.07997,-0.05383,0.038189,-0.081633,-0.066215,0.052787,0.104077,0.025096,0.026491,-0.014731,0.012101,-0.023659,-0.073216,-0.035542,-0.104285,0.032864,0.048078,-0.001116,-0.054142,0.022082,-0.009627,0.014647,-0.000815,-0.036124,-0.005699,-0.025199,0.025575,-0.008682,0.016861,-0.04532,-0.013889,-0.043297,-0.079856,0.024288,-0.017155,0.0364,-0.014145,-0.083429,0.017569,-0.030499,0.029601,-0.070537,-0.065231,-0.01289,0.067462,-0.037502,0.010791,0.049747,-0.046371,-0.065296,0.026081,-0.032364,0.019569,0.006795,-0.041912,0.04541,-0.018157,0.059466,-0.054445,-0.026229,-0.041554,0.020493,0.071074,0.031906,-0.014925,-0.017365,0.057727,-0.019537,-0.017075,-0.020095,0.030538,0.010147,-0.007434,-0.043473,-0.053317,0.077024,-0.028011,0.023106,0.029856,-0.010197,-0.005054,0.041942,-0.002008,-0.006141,-0.046635,-0.111161,-0.017975,-0.014036,0.027196,-0.020361,-0.009124,0.059861,-0.027153,0.044617,0.100163,-0.01721,-0.028003,-0.010327,-0.010363,0.024975,-0.058736,-0.067962,0.016231,-0.047341,-0.002873,-0.112224,-0.021316,0.048766,0.078449,0.019122,0.06452,0.0091,0.077022,0.026073,0.043289,-0.024171,-0.018261,-0.038977,-0.060643,-0.030725,-0.08477,0.015668,0.091074,0.041432,0.037227,0.057856,0.032466,-0.010286,0.038812,-0.077287,-0.049205,0.005122,-0.03095,-0.020938,0.025032,0.002624,-0.078252,0.007227,0.044513,0.00535,-0.006332,0.014094,0.076125,-0.007806,-0.027211,-0.064854,-0.066903,0.001068,0.04657,0.017761,0.033944,0.04416,0.076674,-0.004111,-0.011887,0.033293,0.00703,0.052233,-0.029528,0.154325,0.040341,-0.056313,-0.07185,0.002547,0.006832,-0.030825,0.060665,-0.009865,0.013256,-0.047004,-0.004811,0.038247,0.018905,-0.099373,-0.023546,0.037968,-0.007057,-0.038502,0.070323,0.088535,-0.072756,-0.045297,0.008133,0.018626,-0.020049,0.003498,0.034088,0.05924,-0.005049,0.019282,-0.028091,0.001909,-0.067206,0.041854,0.016912,-0.007389,0.005633,-0.032644,-0.05489,-0.032116,-0.015984,0.007899,0.049997,0.055743,0.020651,-0.061688,0.009789,-0.002601,-0.069934,0.00755,0.010509,-0.030204,-0.004872,0.029648,-0.035615,-0.083204,0.035749,0.039674,-0.003712,0.012814,-0.054307,0.01689,-0.023737,0.038391,-0.059822,0.096328,0.018987,0.008253,0.029995,0.029852,0.0203,-0.010143,0.035545,-0.006307,0.054816,0.02417,0.07648,-0.056344,-0.025782,-0.018064,0.017894,-0.055717,-0.015519,0.011895,-0.000575,0.029044,-0.041722,0.02981,0.019966,-0.016637,-0.102845,0.012527,-0.04477,-0.000489,-0.057994,0.03025,-0.031844,-0.048505,0.008678,-0.021535


In [264]:
embeddings.shape

(4072, 300)

In [265]:
test = mod_df.copy()

In [266]:
test.shape

(4072, 89)

In [267]:
pd.concat([test, embeddings], axis=1)

Unnamed: 0,team_size,highlight_black,highlight_latinx,highlight_women,top_company,top_company_by_revenue,isHiring,nonprofit,num_former_names,has_logo,has_website,has_one_liner,"Austin, TX, USA","Bengaluru, KA, India","Boston, MA, USA","London, England, United Kingdom","Los Angeles, CA, USA","Mexico City, CDMX, Mexico","Mountain View, CA, USA","New York, NY, USA","Palo Alto, CA, USA","Paris, Île-de-France, France",Remote,"San Francisco, CA, USA","Seattle, WA, USA","Toronto, ON, Canada",other_city,Canada - region,Europe - region,India - region,Latin America - region,United States of America - region,other_region,Unspecified - region,num_regions,num_tags,AI,API,Analytics,Artificial Intelligence,B2B,Biotech,Climate,Consumer,Consumer Finance,Consumer Health Services,Consumer Health and Wellness,Content,Crypto / Web3,Data Engineering,Developer Tools,E-commerce,Education,Enterprise,Finance and Accounting,Fintech,Food and Beverage,Gaming,Generative AI,Hardware,Health Tech,Healthcare,Healthcare IT,Home and Personal,Housing and Real Estate,Human Resources,Industrials,Infrastructure,Logistics,Machine Learning,Marketing,Marketplace,Open Source,Operations,Payments,Productivity,Proptech,Real Estate and Construction,Retail,SaaS,Sales,Security,Social,Supply Chain and Logistics,other_tag,years_since_founding,Early,success,active,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300
0,4000.0,False,False,False,True,True,False,False,0,True,True,True,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,2,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,16,False,1,0,-0.012317,0.057962,-0.004280,-0.065468,0.017166,0.020322,0.010260,-0.041826,0.033459,-0.025339,-0.009442,0.059630,0.019800,0.110621,-0.032039,-0.040040,-0.026598,0.029308,0.012961,-0.051861,0.015379,0.019517,-0.015711,0.000150,-0.009759,0.010246,-0.064791,0.078740,-0.005576,0.002990,0.061068,0.026288,0.051026,0.081943,0.059920,0.002325,0.026035,0.052339,0.046128,-0.038038,-0.077274,-0.075645,0.057859,-0.101632,-0.037460,-0.002661,0.088954,-0.015326,-0.014149,-0.052736,0.069397,-0.072194,-0.052505,-0.030569,-0.046104,0.044662,0.032549,0.034161,-0.046095,0.007733,0.005278,-0.006667,-0.023111,0.001431,0.010152,-0.053458,0.005141,-0.043019,-0.029078,-0.007225,-0.014356,-0.035448,-0.063953,-0.034618,0.004803,-0.013290,-0.015247,-0.042822,-0.011008,-0.068611,-0.004259,-0.103977,-0.096593,-0.004710,0.077831,-0.056452,0.057874,-0.004201,-0.058885,-0.014621,0.029993,-0.011610,0.029127,0.054530,-0.039637,0.103173,-0.024941,0.039605,-0.118019,-0.026757,-0.049257,-0.015806,0.045254,-0.000693,-0.008152,0.002924,0.046726,0.004465,-0.058077,-0.032932,0.023592,-0.006918,0.008864,-0.054953,-0.050144,0.095811,-0.059375,0.028743,0.037478,-0.035895,-0.002667,0.100011,-0.034884,0.006224,-0.027502,-0.107062,0.008726,-0.038902,0.002447,0.011990,-0.059408,0.067119,-0.029586,0.074272,0.087575,-0.016203,-0.104529,0.020069,-0.005061,-0.020290,-0.057631,-0.085919,-0.006956,-0.014459,0.001429,-0.081658,0.005445,0.048047,0.104376,-0.008559,0.037547,0.010641,0.049654,-0.017577,0.012316,-0.051718,0.002168,-0.008770,-0.085678,-0.002870,-0.039238,0.038353,0.073392,0.031152,0.043569,0.090239,0.069478,-0.035440,0.110090,-0.062988,-0.044831,-0.006027,-0.091348,-0.038392,0.023617,0.005190,-0.097036,-0.003612,0.033736,-0.053642,0.035948,-0.031467,0.081459,0.022531,-0.080701,0.013001,-0.092177,-0.027160,0.044867,0.008185,0.012569,0.012237,0.043546,0.043198,-0.007195,-0.006776,0.002999,0.087368,-0.031379,0.182042,0.025828,-0.057733,-0.073188,0.015434,0.013741,-0.008775,0.007391,-0.018836,0.041056,-0.011617,-0.034941,0.046269,0.003306,-0.078995,0.000796,0.003666,0.008780,-0.061906,0.130383,0.063447,-0.039180,-0.053853,-0.015598,0.030893,-0.054284,0.029541,0.025251,0.042739,0.006253,-0.007237,-0.010026,-0.024071,-0.049525,0.031683,0.003890,-0.023990,-0.035286,-0.039391,-0.075753,0.008394,-0.060022,0.051096,0.048220,0.060955,-0.020574,-0.063227,0.044720,-0.000654,-0.039825,-0.000810,-0.030810,0.002471,-0.029655,0.009088,-0.041395,-0.097371,0.054033,0.019108,-0.014010,0.008799,-0.099116,0.022856,0.037290,0.082297,-0.066029,0.068852,0.014328,0.020137,0.010450,0.019573,0.061073,-0.035276,0.019441,-0.003396,0.058954,0.070687,0.101444,-0.082844,0.053795,-0.029197,0.012972,-0.043696,-0.009466,0.004437,-0.044073,0.029016,-0.020407,-0.004954,-0.011953,0.013850,-0.065447,-0.042941,-0.091139,-0.009111,-0.056382,-0.008814,-0.094955,-0.013726,-0.039655,-0.007641
1,201.0,False,False,False,True,True,False,False,0,True,True,True,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,2,6,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,18,True,1,0,-0.012925,0.068325,0.012582,-0.070207,-0.006114,0.071036,0.011452,0.025784,0.052020,-0.004709,-0.024533,-0.003633,0.005719,0.122335,-0.034514,-0.030305,-0.060178,-0.003238,0.043227,-0.006075,0.020879,-0.028790,-0.023512,-0.001608,0.011362,0.007179,-0.107738,0.027416,-0.007873,-0.021935,0.032564,-0.005588,0.082752,0.039168,0.072265,0.004811,0.010220,0.009857,0.040663,-0.063294,-0.060462,-0.072189,0.046104,-0.111930,-0.026987,-0.000943,0.078608,0.058025,0.023301,-0.019724,0.022918,-0.036565,-0.111122,-0.027927,-0.064020,0.039201,0.046129,-0.017671,-0.076886,0.043470,-0.015192,-0.002078,0.031842,-0.037026,-0.021741,-0.028309,-0.000319,-0.018193,-0.014527,0.032582,-0.034241,0.008494,-0.089210,0.028734,-0.005046,0.075087,0.007100,-0.053404,0.011009,-0.024965,0.023240,-0.060663,-0.052475,-0.023470,0.052054,-0.027264,0.080446,0.069228,-0.039326,-0.079565,0.037417,-0.025408,-0.001506,0.022264,-0.065600,0.039920,-0.010281,0.047230,-0.094832,-0.027190,-0.023838,-0.017591,0.054413,-0.010517,-0.017039,-0.006603,0.105610,-0.059525,-0.030704,-0.042872,-0.002478,-0.037133,0.016353,-0.031990,-0.077429,0.058486,-0.009783,0.007172,0.014928,-0.029913,0.018931,0.082843,-0.055909,-0.014278,-0.023734,-0.131666,-0.010279,0.002609,-0.003505,-0.021328,-0.009261,0.084499,-0.014882,0.018882,0.082696,-0.044539,-0.053992,-0.022757,-0.027735,0.036891,-0.041944,-0.004116,0.010547,-0.033496,-0.070917,-0.103963,-0.026561,0.041257,0.025513,0.011483,0.037078,0.029244,0.013852,-0.027163,0.062535,-0.034476,0.010595,-0.040440,-0.010511,0.016163,-0.089664,-0.019296,0.075761,0.067993,0.085181,0.071149,-0.017012,-0.017615,0.015798,-0.084797,-0.044507,-0.040418,-0.037150,-0.030255,-0.012973,-0.007074,-0.090430,0.033601,0.047784,0.006313,-0.030555,0.017526,0.018222,-0.026743,-0.030122,-0.017233,-0.093659,-0.035227,0.097030,0.011413,0.055526,0.002365,0.052427,0.024868,-0.006487,0.051826,0.000865,0.052408,-0.014941,0.128541,0.032864,-0.062775,-0.059052,0.034846,0.011675,-0.015743,0.054379,0.032923,0.021307,-0.031150,0.004337,0.046209,0.012611,-0.070510,-0.048028,0.058425,-0.020251,-0.019029,0.074983,0.083061,-0.049960,-0.062724,0.026844,-0.024707,-0.069096,0.006580,-0.057327,0.039397,0.012258,0.015420,-0.046398,0.007405,-0.072151,0.037831,0.034621,-0.025803,-0.005140,-0.009319,-0.027796,-0.007878,-0.013354,0.053684,0.055932,0.039703,-0.039781,-0.016011,0.027415,-0.024483,-0.053968,0.033450,-0.007738,-0.066576,0.005537,0.001885,-0.039934,-0.068756,0.037087,-0.001129,-0.010486,-0.005867,-0.042985,0.045079,0.034643,0.042481,-0.001885,0.080480,0.007911,0.008740,-0.001496,0.022673,0.044800,-0.020770,0.035179,-0.042033,0.043001,0.053765,0.066017,-0.014329,-0.033259,-0.038259,0.026634,-0.064755,0.007260,-0.008814,0.044452,0.018872,-0.041313,0.067779,0.045921,0.043908,-0.057447,-0.002487,-0.069005,-0.009714,-0.046641,0.010742,-0.057428,-0.057351,0.027911,-0.048602
2,2000.0,True,False,False,True,False,False,False,2,True,True,True,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,4,7,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,16,False,1,0,-0.029181,0.089071,0.015150,-0.096253,0.000133,0.054852,0.015016,0.040473,0.003746,-0.033400,0.008702,0.020586,-0.004574,0.129011,-0.056941,0.009195,-0.057836,-0.017242,-0.035053,-0.033125,0.052225,-0.050664,-0.028922,-0.014833,0.009060,0.039198,-0.111285,0.022531,0.010937,0.003754,0.052440,0.068070,0.045806,0.064925,0.072028,0.017937,0.013093,0.019399,0.003008,-0.054742,-0.048361,-0.044113,0.060665,-0.081479,-0.068766,0.076939,0.077312,0.018426,0.064554,0.021598,0.052803,-0.005911,-0.108612,-0.063006,-0.050817,0.074147,0.063231,-0.016047,-0.052114,-0.018032,-0.009582,-0.038365,-0.002102,-0.035351,-0.025087,-0.029383,-0.029700,-0.029463,0.008663,0.080649,0.005357,-0.022571,-0.093611,-0.021234,-0.013611,0.002509,-0.023165,-0.082038,-0.055828,-0.008474,0.047220,-0.067384,-0.041188,-0.047526,0.093196,-0.099472,0.050333,0.017146,-0.030982,-0.032998,0.092624,-0.011240,0.006113,0.028093,-0.063653,0.050621,-0.017388,0.038956,-0.032984,-0.048004,-0.059146,-0.075851,0.039941,0.006904,0.002847,0.053438,0.115178,-0.035026,-0.016118,-0.038000,-0.072500,-0.018949,0.063706,-0.013240,-0.126617,0.009197,-0.030123,-0.027377,0.054129,-0.043818,0.003870,0.080212,-0.040174,0.047089,-0.018194,-0.108438,-0.039702,-0.037866,-0.009613,-0.033420,-0.038282,0.058471,-0.019447,0.038302,0.109962,0.017062,-0.076576,0.043384,-0.049199,-0.010697,-0.035717,-0.065439,0.028339,-0.002434,0.002149,-0.042937,-0.024223,0.056710,0.104877,-0.013064,0.043854,0.033130,0.029689,0.000603,0.015301,-0.050891,0.003282,-0.068314,-0.012024,0.003610,-0.084194,-0.008695,0.061739,0.048132,0.060997,0.025957,0.069691,-0.036494,0.075880,-0.108493,-0.057049,-0.057617,-0.031346,0.028427,0.054989,0.008070,-0.068675,-0.015070,0.057834,0.024277,-0.044628,0.060331,0.031633,-0.063601,0.001935,-0.020711,-0.101364,-0.010090,0.097091,0.050297,0.032007,-0.027513,0.054518,-0.004226,-0.057694,0.034912,-0.004414,0.044882,-0.065989,0.143545,-0.032316,-0.041998,-0.015124,0.036994,-0.009530,-0.049543,0.008269,0.025122,0.029740,-0.028407,0.063032,0.047257,-0.021762,-0.120383,-0.042622,0.026721,-0.010975,-0.056840,0.062169,0.133135,-0.056864,-0.016516,0.007985,0.002118,-0.079442,0.029553,0.015503,0.019077,0.016370,-0.042050,-0.032902,0.011067,-0.040924,0.041928,0.015556,0.017211,0.028470,0.005642,-0.044495,-0.002462,-0.006012,0.059784,0.075199,0.045789,-0.029441,0.019732,-0.001593,-0.015809,-0.062697,-0.003406,-0.013217,-0.035968,-0.003262,-0.042702,-0.078845,-0.047939,0.056677,-0.018103,-0.022322,-0.000669,-0.057220,0.074602,0.030007,0.106048,-0.013520,0.068457,0.014476,0.028526,-0.001098,0.036018,0.046017,-0.016539,0.031878,0.004717,0.060658,-0.013691,0.063258,-0.019897,0.051765,-0.024048,0.034322,-0.032805,-0.039179,-0.044717,-0.015082,0.021996,0.013447,0.047895,0.003928,0.011850,-0.062292,0.038654,-0.050963,0.015019,-0.033382,0.006723,-0.034423,-0.064644,-0.043332,-0.008316
3,300.0,False,False,False,True,True,True,False,0,True,True,True,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,2,2,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17,False,0,1,0.000565,0.081831,-0.023884,-0.062085,-0.004643,0.002487,-0.024667,-0.018430,-0.001680,-0.024890,-0.013761,0.012421,0.050530,0.146332,-0.048669,-0.038185,-0.017294,0.030368,0.055898,-0.048022,0.029388,-0.007646,0.002491,-0.003677,0.026631,0.029276,-0.095247,0.065438,-0.000917,-0.020601,0.045830,0.005318,0.061817,0.056365,0.097407,-0.001812,0.013194,-0.003760,0.014847,-0.068991,-0.079257,-0.068577,0.079831,-0.053023,-0.069960,0.035004,0.085839,0.002184,-0.037845,-0.018470,0.033375,-0.062080,-0.142034,-0.031675,-0.051455,0.026113,0.058289,0.016082,-0.038199,-0.011879,0.015461,-0.015013,0.023052,0.002940,0.011858,-0.061447,-0.006705,-0.050642,-0.053772,-0.015263,0.020349,-0.016082,-0.033813,0.005487,-0.040068,0.051573,0.008212,-0.079654,-0.007213,-0.023425,0.026860,-0.067554,-0.064925,0.007495,0.067808,-0.015241,0.060687,0.010666,-0.011276,-0.084183,0.023167,0.015084,-0.001079,0.059728,-0.064013,0.069736,-0.016537,0.016044,-0.089359,-0.018737,-0.029255,-0.053422,0.045195,-0.006384,-0.027963,0.007602,0.051566,-0.024279,-0.027910,-0.041073,-0.004481,-0.021417,-0.020258,-0.015045,-0.080685,0.053854,-0.076386,0.021436,0.055555,-0.043951,0.031706,0.095553,-0.022395,-0.013888,-0.028245,-0.118269,-0.003378,-0.011548,0.038565,-0.009736,-0.037318,0.063481,0.000865,0.023888,0.067544,-0.010241,-0.046410,0.015990,-0.003399,0.003943,-0.022837,-0.058131,0.012285,-0.061609,-0.021375,-0.103423,0.006479,0.036656,0.079912,0.066505,0.075789,0.020034,0.065410,0.028328,-0.003942,-0.044858,-0.002396,-0.022516,-0.037839,-0.021627,-0.106280,-0.000782,0.087840,0.056394,0.061429,0.047131,0.054796,-0.011076,0.078151,-0.090052,-0.045226,-0.000783,-0.045408,-0.026900,0.018668,-0.020646,-0.094224,0.018914,0.064063,0.003783,-0.021872,-0.001703,0.057606,0.017829,-0.041111,-0.003645,-0.048168,-0.031757,0.078562,0.048802,0.051103,0.001612,0.040735,0.054167,0.016101,0.038673,-0.049330,0.069623,-0.042776,0.108023,0.007575,-0.055501,-0.056711,-0.006731,0.003692,-0.013554,0.044648,-0.004804,-0.009801,-0.022916,-0.033510,0.067465,0.031865,-0.099850,-0.026145,0.001841,0.012262,-0.053901,0.096106,0.110842,-0.068507,-0.036928,0.000027,0.034136,-0.030734,0.013342,0.004869,0.035869,0.032193,-0.009326,-0.054101,-0.048267,-0.031649,0.043119,-0.011225,-0.034764,-0.010260,-0.013023,-0.010645,0.016932,-0.036283,0.070133,0.065875,0.058120,0.000468,-0.037147,0.037283,-0.012312,-0.064457,-0.019237,-0.029979,-0.051699,-0.012780,-0.010797,-0.054104,-0.096128,0.043332,0.020209,0.024109,0.034667,-0.038525,-0.000579,-0.002408,0.070042,-0.038923,0.059514,0.017851,0.044447,0.019380,0.042349,-0.001268,-0.035257,0.031711,-0.038535,0.055307,0.063442,0.087636,0.014155,-0.016443,-0.042355,0.032029,-0.065171,0.024269,0.020719,0.030495,0.035247,-0.028253,0.030109,0.038719,0.025247,-0.062824,-0.001652,-0.082428,0.048707,-0.024009,0.018111,-0.048337,-0.037789,-0.018611,0.011336
4,201.0,False,False,False,True,False,False,False,0,True,True,True,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,2,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16,False,1,0,0.007876,0.063050,-0.017276,-0.077545,-0.019022,0.017432,0.027112,0.011188,0.035279,-0.021088,-0.013109,0.028218,0.033441,0.154950,-0.015470,-0.039279,-0.009749,0.009035,0.019892,-0.056281,0.028947,-0.018572,0.010640,-0.012964,-0.001425,0.031276,-0.069615,0.018410,0.037617,-0.042898,0.051519,0.019778,0.050116,0.063972,0.090834,0.005196,0.052382,-0.009045,0.017207,-0.062290,-0.079970,-0.053830,0.038189,-0.081633,-0.066215,0.052787,0.104077,0.025096,0.026491,-0.014731,0.012101,-0.023659,-0.073216,-0.035542,-0.104285,0.032864,0.048078,-0.001116,-0.054142,0.022082,-0.009627,0.014647,-0.000815,-0.036124,-0.005699,-0.025199,0.025575,-0.008682,0.016861,-0.045320,-0.013889,-0.043297,-0.079856,0.024288,-0.017155,0.036400,-0.014145,-0.083429,0.017569,-0.030499,0.029601,-0.070537,-0.065231,-0.012890,0.067462,-0.037502,0.010791,0.049747,-0.046371,-0.065296,0.026081,-0.032364,0.019569,0.006795,-0.041912,0.045410,-0.018157,0.059466,-0.054445,-0.026229,-0.041554,0.020493,0.071074,0.031906,-0.014925,-0.017365,0.057727,-0.019537,-0.017075,-0.020095,0.030538,0.010147,-0.007434,-0.043473,-0.053317,0.077024,-0.028011,0.023106,0.029856,-0.010197,-0.005054,0.041942,-0.002008,-0.006141,-0.046635,-0.111161,-0.017975,-0.014036,0.027196,-0.020361,-0.009124,0.059861,-0.027153,0.044617,0.100163,-0.017210,-0.028003,-0.010327,-0.010363,0.024975,-0.058736,-0.067962,0.016231,-0.047341,-0.002873,-0.112224,-0.021316,0.048766,0.078449,0.019122,0.064520,0.009100,0.077022,0.026073,0.043289,-0.024171,-0.018261,-0.038977,-0.060643,-0.030725,-0.084770,0.015668,0.091074,0.041432,0.037227,0.057856,0.032466,-0.010286,0.038812,-0.077287,-0.049205,0.005122,-0.030950,-0.020938,0.025032,0.002624,-0.078252,0.007227,0.044513,0.005350,-0.006332,0.014094,0.076125,-0.007806,-0.027211,-0.064854,-0.066903,0.001068,0.046570,0.017761,0.033944,0.044160,0.076674,-0.004111,-0.011887,0.033293,0.007030,0.052233,-0.029528,0.154325,0.040341,-0.056313,-0.071850,0.002547,0.006832,-0.030825,0.060665,-0.009865,0.013256,-0.047004,-0.004811,0.038247,0.018905,-0.099373,-0.023546,0.037968,-0.007057,-0.038502,0.070323,0.088535,-0.072756,-0.045297,0.008133,0.018626,-0.020049,0.003498,0.034088,0.059240,-0.005049,0.019282,-0.028091,0.001909,-0.067206,0.041854,0.016912,-0.007389,0.005633,-0.032644,-0.054890,-0.032116,-0.015984,0.007899,0.049997,0.055743,0.020651,-0.061688,0.009789,-0.002601,-0.069934,0.007550,0.010509,-0.030204,-0.004872,0.029648,-0.035615,-0.083204,0.035749,0.039674,-0.003712,0.012814,-0.054307,0.016890,-0.023737,0.038391,-0.059822,0.096328,0.018987,0.008253,0.029995,0.029852,0.020300,-0.010143,0.035545,-0.006307,0.054816,0.024170,0.076480,-0.056344,-0.025782,-0.018064,0.017894,-0.055717,-0.015519,0.011895,-0.000575,0.029044,-0.041722,0.029810,0.019966,-0.016637,-0.102845,0.012527,-0.044770,-0.000489,-0.057994,0.030250,-0.031844,-0.048505,0.008678,-0.021535
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4067,3.0,True,True,False,False,False,False,False,3,True,True,True,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,True,0,1,-0.003618,0.058121,-0.010971,-0.024560,-0.015104,0.040289,0.011123,-0.004842,0.028892,-0.014071,-0.009573,-0.019000,-0.001425,0.084456,-0.047262,0.004229,-0.022657,0.014259,0.004051,-0.036760,0.036307,0.007745,-0.026160,0.030271,-0.025824,-0.018264,-0.074786,0.067256,0.012912,-0.015506,0.027679,0.046435,0.047247,0.055328,0.065786,-0.005883,0.007250,0.058249,0.019177,-0.051131,-0.058190,-0.074958,0.012585,-0.066766,-0.024250,0.027603,0.094502,-0.023830,-0.039747,-0.019473,0.036096,-0.017493,-0.103207,-0.010490,-0.068213,0.020208,0.042067,0.036954,-0.046426,-0.040378,0.020492,-0.022694,-0.031587,-0.019285,-0.003914,-0.049035,0.008381,-0.078895,-0.049145,-0.016321,-0.023660,-0.019178,-0.058027,0.038432,-0.006494,0.012289,0.035825,-0.065233,0.011675,-0.063603,0.008960,-0.029043,-0.092501,0.002612,0.063076,-0.093855,0.049929,0.034873,-0.064413,-0.029655,-0.001210,-0.044726,0.003086,0.024552,-0.029560,0.062172,-0.037534,0.036951,-0.071543,-0.004343,0.005381,-0.001627,0.049217,-0.005211,-0.023806,-0.001619,0.038702,-0.021721,-0.054323,-0.037373,0.003218,-0.024405,-0.041997,-0.002499,-0.091692,0.063253,-0.053883,0.071613,0.084124,-0.018033,-0.002207,0.077441,-0.001185,0.022344,-0.089386,-0.136807,0.000102,-0.038499,0.028815,-0.028279,-0.040579,0.014224,-0.031377,-0.010033,0.059499,-0.010832,-0.068315,0.019107,0.030841,-0.002021,-0.061625,-0.055829,0.005653,-0.042676,-0.009903,-0.085082,-0.000370,0.030083,0.072561,-0.018381,0.026247,0.044361,0.043541,-0.004058,0.029527,-0.040057,-0.023306,-0.035046,-0.017580,0.000214,-0.077315,0.008209,0.022940,0.051983,0.033607,0.036275,0.036112,0.013021,0.067226,-0.059304,-0.059929,-0.062082,-0.038676,-0.031955,0.007846,-0.001383,-0.078254,-0.017301,0.035594,0.025077,0.011419,0.021580,0.063512,-0.006164,-0.059959,-0.046148,-0.087352,-0.018497,0.061335,0.008955,0.031225,0.004025,0.126064,0.096040,0.013127,0.025161,0.016345,0.072884,-0.064425,0.152090,0.005331,-0.077882,-0.055195,-0.003500,0.044775,-0.013743,0.036336,0.010037,0.017660,-0.039402,0.014682,0.060670,0.038012,-0.052585,-0.004137,-0.000904,0.021169,-0.044585,0.075546,0.095417,-0.058069,-0.010409,0.043557,0.010566,-0.052371,0.015817,0.001390,0.076573,-0.001478,0.004719,0.006272,0.061773,-0.044828,-0.018122,-0.052251,0.017361,-0.006368,0.003209,-0.067311,0.002905,0.002059,0.066850,0.076607,0.036245,-0.082742,-0.004649,0.037634,-0.005843,-0.078178,-0.011012,0.028959,-0.032400,0.014301,0.026113,-0.046829,-0.064672,0.027336,0.045000,0.011680,0.040761,-0.072008,0.028874,0.010766,0.083118,-0.021534,0.055529,-0.007600,0.035426,0.028404,0.031806,0.000300,0.026255,0.048048,0.013583,0.044660,0.076731,0.067596,-0.021959,-0.015123,-0.032280,0.052793,-0.040780,-0.012044,-0.017591,-0.022850,0.020426,0.029919,0.003826,0.007851,0.021381,-0.084199,-0.028158,-0.090240,0.029053,-0.018574,-0.013479,-0.044561,-0.071171,0.069025,0.004547
4068,3.0,False,False,True,False,False,False,False,1,True,True,True,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,1,0,0,4,5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,True,0,0,-0.029949,0.082896,-0.034953,-0.026650,-0.007026,0.048386,0.103976,0.053069,0.028544,0.037824,0.010498,0.003027,0.041195,0.062536,-0.069469,0.033833,-0.026721,0.012704,0.039190,-0.000819,0.075880,-0.037937,0.012587,-0.079431,-0.020758,-0.046560,-0.033649,0.063246,0.109619,-0.027868,0.080242,0.089619,0.046940,0.053514,0.062016,0.078524,0.057739,0.039785,0.061485,0.001224,-0.129764,-0.075119,0.130026,-0.122084,-0.065537,-0.005310,0.097052,0.031745,0.037714,-0.059286,0.036847,-0.025701,-0.040034,-0.054438,-0.074192,0.028409,0.061883,0.039251,-0.051789,-0.033047,0.029015,-0.012153,-0.033272,-0.053394,0.024368,0.060886,0.035054,-0.045672,0.004662,0.018941,-0.036971,-0.016653,-0.105880,-0.007407,0.041904,0.026454,0.036862,-0.073709,-0.110962,-0.055900,-0.066555,-0.128753,-0.036556,0.027923,0.107993,-0.056767,0.055282,0.065534,-0.090170,0.034045,0.072998,0.024888,0.014867,-0.007259,0.019076,0.076609,0.003849,0.057210,-0.157038,-0.103749,-0.069695,0.026046,0.046664,0.019272,-0.026096,-0.003092,0.002056,-0.024532,-0.049025,-0.022419,0.032409,-0.020819,0.010792,-0.071955,-0.052534,0.085120,-0.002010,-0.069554,0.082184,-0.004843,-0.045449,0.034508,0.002139,0.054922,-0.033568,-0.133518,-0.032905,0.008476,0.004921,-0.018112,-0.066921,0.107561,-0.027423,0.035733,0.058635,-0.060647,-0.060344,0.039578,-0.049325,0.059377,-0.065392,-0.051362,0.005142,-0.076968,-0.086815,-0.063364,-0.082725,0.074013,0.081778,0.026348,0.082820,0.057292,0.061720,0.022944,0.039204,-0.020933,-0.026428,-0.056681,-0.095046,-0.042045,-0.027099,0.073696,0.140192,0.046226,0.033139,0.097401,0.043727,0.027574,0.004586,-0.102492,-0.050484,0.014189,-0.012544,-0.047203,-0.038654,0.004412,-0.024103,0.004007,0.026544,0.007521,-0.039020,-0.045092,0.062847,0.095151,0.043744,-0.028076,-0.020914,-0.088718,0.039027,0.051227,0.052905,0.061713,0.050377,0.034627,-0.036857,-0.066364,-0.008437,0.061294,-0.040958,0.143536,0.066859,-0.075993,-0.181539,0.027681,0.002268,-0.029905,-0.013638,0.011276,-0.018016,-0.038703,0.077696,-0.022799,0.055586,-0.050645,-0.009741,0.050082,0.068166,-0.030589,0.088519,0.045833,-0.061168,-0.037759,-0.059199,-0.021840,-0.066296,-0.000624,0.045270,0.186016,0.054999,0.022974,-0.063671,-0.045497,-0.060417,0.108035,-0.010742,0.000037,0.016505,0.013031,-0.064794,-0.035717,-0.030115,0.060364,0.092549,-0.022847,-0.072762,-0.089851,0.014038,0.025565,-0.029788,0.069465,0.014966,-0.033615,-0.015868,0.018298,-0.098117,-0.069492,-0.034801,-0.018127,-0.013888,0.044347,-0.103274,-0.011247,-0.007543,0.063533,-0.037185,-0.000851,0.010465,0.029443,0.051954,0.038196,0.083863,-0.057757,0.060645,0.043028,0.023664,0.039786,0.074929,-0.111183,0.020566,-0.074858,-0.025110,-0.057587,0.031941,-0.011975,-0.047322,-0.008559,-0.037636,0.123590,-0.045866,0.063978,-0.061941,0.010169,-0.079048,0.027468,-0.083977,0.008454,-0.096723,-0.099934,-0.009760,0.032551
4069,2.0,False,False,True,False,False,False,False,0,True,True,True,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,4,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,True,0,0,0.002470,0.078219,-0.094662,-0.012177,-0.076930,-0.002619,0.032509,-0.027175,0.029885,-0.042286,0.016125,0.026415,-0.009085,0.099511,-0.056172,0.020821,-0.085178,-0.032424,0.002648,-0.010356,0.037666,0.025260,-0.012897,-0.083087,0.031865,0.007742,-0.053793,0.095234,0.013345,-0.087426,0.042700,0.033889,0.051042,0.051886,0.053449,-0.015664,0.068701,0.033437,0.058960,-0.003502,-0.063360,-0.063908,0.015674,-0.156984,-0.018325,-0.012474,0.027586,-0.025988,0.033386,-0.027181,-0.002337,-0.071353,-0.019098,-0.045506,-0.000024,0.054360,0.004709,-0.024216,-0.097498,-0.015610,0.033205,-0.031178,-0.028429,-0.071186,-0.063911,0.045220,-0.008236,-0.004236,0.001260,0.004577,-0.002433,-0.016931,-0.056844,-0.040174,0.077710,0.047567,0.044537,-0.039319,-0.010966,-0.125610,0.004114,-0.060320,-0.004474,-0.098593,0.093088,-0.037905,0.079398,-0.008017,-0.116566,0.004442,0.060225,0.009568,0.008126,0.010461,-0.024298,0.068034,0.023374,0.040502,-0.039376,-0.056563,-0.066905,-0.018515,0.058605,0.005041,-0.027970,0.049607,0.060908,-0.046879,-0.117245,0.005469,-0.036090,-0.038039,-0.009751,-0.062473,-0.058788,0.095638,-0.006192,0.038069,0.004033,-0.012175,0.007719,0.055829,-0.041768,0.023460,-0.036569,-0.072462,-0.005519,-0.027219,-0.009686,-0.043106,-0.008501,0.118518,0.009638,0.024207,0.092814,-0.040495,-0.075250,0.057235,-0.041603,0.031816,-0.070149,-0.060391,0.047628,-0.050623,0.037164,-0.080856,-0.079186,0.010856,0.088415,-0.021069,0.018515,0.026620,-0.016855,0.002003,0.023489,-0.041261,-0.029764,0.020696,-0.031480,-0.116578,-0.016483,-0.031590,0.021655,0.029812,0.017514,0.012964,0.062687,0.010609,0.090952,-0.029409,-0.058296,0.009926,-0.020065,-0.090135,-0.027468,-0.017014,-0.004630,0.056401,0.057741,-0.015593,0.034602,-0.027373,0.093255,0.023636,-0.022094,0.008009,-0.108164,-0.036808,-0.017492,0.021963,0.058332,0.028106,0.057428,0.067753,-0.050060,-0.046828,-0.034821,0.072258,-0.067812,0.096966,0.035189,-0.065327,-0.064208,0.034975,0.024156,0.067379,0.021940,0.048778,-0.013128,-0.047728,-0.043967,0.021754,0.016653,-0.004015,-0.041430,0.024690,0.023948,-0.049344,0.070377,0.000449,-0.015777,-0.009330,0.006234,0.031886,-0.011636,-0.065732,-0.011940,0.034118,0.009939,0.016589,-0.024905,-0.016157,-0.102922,0.032914,0.011198,-0.010007,-0.050044,-0.010692,-0.058734,-0.009958,0.005162,0.004729,0.069264,-0.028750,-0.006300,-0.026528,0.068210,0.057302,-0.048453,0.025970,0.028511,-0.016983,-0.029838,-0.001373,-0.101447,-0.108972,-0.026558,0.013271,0.059763,0.031733,-0.080480,-0.023863,-0.010001,0.144193,-0.057489,0.042888,0.032461,-0.030626,0.003628,0.006203,0.038288,-0.090810,0.145676,0.043105,0.012280,0.035813,0.108851,-0.054189,0.023854,-0.028440,0.008076,-0.043433,-0.001047,-0.050102,-0.042989,-0.054117,-0.000704,0.070600,-0.034405,0.038071,-0.021460,0.044687,-0.079540,0.009942,-0.092061,-0.034113,-0.072306,-0.059114,-0.020850,-0.053418
4070,2.0,False,False,False,False,False,False,False,2,True,True,True,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,4,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,True,0,0,-0.034177,0.077556,-0.011058,-0.086996,0.034279,-0.014344,-0.033914,-0.063777,0.038160,0.001994,-0.010802,0.015610,-0.014797,0.119807,-0.047338,-0.047526,0.052441,0.034317,0.044201,-0.032743,-0.013010,0.027590,0.021144,-0.028237,0.027407,0.011987,-0.106980,0.046202,-0.060642,-0.023930,0.040782,0.033085,0.034930,0.063875,0.042414,-0.008231,0.032132,-0.026464,0.045044,-0.067457,-0.080955,-0.092966,0.070665,-0.101826,-0.044300,0.013691,0.109338,0.013994,0.004600,-0.077820,0.050898,-0.042021,-0.061644,-0.047945,-0.032206,0.048376,0.034962,0.037574,-0.059392,-0.003809,0.000099,0.003013,0.035145,-0.010850,0.061110,-0.046176,0.022002,-0.048406,-0.038230,-0.029888,-0.040896,-0.063607,-0.071575,-0.045971,-0.027302,-0.027493,-0.041108,-0.029638,-0.033369,-0.101046,0.009872,-0.103276,-0.085856,0.010434,0.068429,-0.060297,0.047251,0.018481,-0.119975,-0.063696,0.036831,-0.065960,0.071737,0.058739,-0.022634,0.121302,-0.020011,0.020333,-0.093293,-0.058198,-0.030954,-0.002540,0.009700,0.008960,-0.028818,0.009814,-0.008781,0.029520,-0.065504,0.000216,-0.034041,-0.006437,-0.006701,-0.037755,-0.094280,0.050431,-0.046982,0.037827,0.014955,-0.025313,0.021132,0.128940,-0.031854,0.042546,0.029081,-0.113783,0.003567,0.020273,0.023910,0.008032,-0.026092,0.050834,-0.003301,0.060502,0.118287,0.014981,-0.083238,0.024805,0.021399,-0.006884,-0.047415,-0.089563,-0.039146,-0.033937,-0.022923,-0.096591,-0.058916,0.068429,0.145869,0.014704,0.025027,-0.001154,0.036798,0.013003,-0.074694,-0.028948,0.007459,-0.009501,-0.059447,0.003394,-0.058953,0.076933,0.129002,-0.001593,0.043301,0.120252,0.088731,-0.047179,0.110565,-0.027096,-0.046521,0.045704,-0.057037,0.013652,-0.023586,-0.022696,-0.119700,0.083819,0.088893,-0.001201,0.050255,-0.000538,0.093466,0.006873,-0.034463,0.058760,-0.081374,-0.062011,0.025962,0.017156,0.026481,0.052565,0.044887,0.032397,-0.031832,-0.030265,-0.049936,0.105454,-0.013589,0.187587,0.050343,-0.063932,-0.072270,-0.062362,0.007774,-0.041319,0.024528,0.004675,0.039101,-0.027696,0.027059,0.065727,0.018227,-0.079131,-0.014140,0.014906,-0.004415,-0.060057,0.143039,0.025383,-0.070740,-0.047148,-0.004544,0.008094,-0.008650,0.084743,0.042808,0.067586,0.043256,0.001937,-0.025178,-0.035849,-0.082572,0.058315,0.039340,-0.034671,-0.041601,-0.008235,-0.073277,-0.021149,-0.026429,0.066358,0.077550,0.037130,-0.012708,-0.121888,0.036602,-0.029955,-0.080198,0.012543,-0.053428,0.004718,-0.004358,0.014091,-0.102437,-0.069218,0.054956,-0.019504,-0.054644,0.013445,-0.096140,-0.024145,-0.005013,0.077182,-0.028164,0.048575,0.051049,0.032784,0.019019,-0.005926,-0.002140,-0.054473,0.053678,-0.009380,0.103730,0.009677,0.117700,-0.042859,0.062222,-0.025698,0.020666,-0.091414,-0.006184,0.034034,-0.029137,0.054535,-0.028652,-0.023722,-0.000195,-0.020736,-0.060990,-0.054584,-0.093164,-0.007858,-0.082264,0.005616,-0.134510,-0.045910,-0.018475,-0.055004


In [268]:
embeddingsdf = pd.concat([test, embeddings], axis=1)

In [269]:
embeddingsdf.shape

(4072, 389)

In [270]:
# create the initial train and test sets
train = embeddingsdf[embeddingsdf['active'] == 0]
test = embeddingsdf[embeddingsdf['active'] == 1]

In [271]:
# drop the active column from these sets
train = train.drop(columns='active')
test = test.drop(columns='active')

In [272]:
# create X and y from train dataframe
X = train.drop(columns='success')
y = train['success']

In [273]:
# do train test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Modelling

## Logistic Regression Model <a class="anchor" id="logregmodel"></a>

In [274]:
# instantiate a logistic regression model
logreg_model = LogisticRegression(max_iter=10000)

# fit it to training data
logreg_model.fit(X_train, y_train)

# score the accuracy of the model
lr_train_score = logreg_model.score(X_train, y_train)
lr_test_score = logreg_model.score(X_test, y_test)

print(f"Score on training set: {lr_train_score}")
print(f"Score on test set: {lr_test_score}")

Score on training set: 0.7626546681664792
Score on test set: 0.7219730941704036


In [275]:
# Generate confusion matrix
lr_pred = logreg_model.predict(X_test)
cf_matrix = confusion_matrix(y_test, lr_pred)

# label rows and columns
cf_df = pd.DataFrame(
    cf_matrix, 
    columns=["Predicted Failure", "Predicted Success"],
    index=["True Failure", "True Success"]
)

display(cf_df)

Unnamed: 0,Predicted Failure,Predicted Success
True Failure,96,26
True Success,36,65


In [276]:
lr_precision = precision_score(y_test, lr_pred)
lr_recall = recall_score(y_test, lr_pred)
lr_f1 = f1_score(y_test, lr_pred)
print(f'precision = {lr_precision}, recall = {lr_recall}, f1 = {lr_f1}')

precision = 0.7142857142857143, recall = 0.6435643564356436, f1 = 0.6770833333333333


In [277]:
# creating a dataframe of the coefficients
coef = pd.DataFrame(logreg_model.coef_,  columns = X_train.columns)

In [278]:
coef

Unnamed: 0,team_size,highlight_black,highlight_latinx,highlight_women,top_company,top_company_by_revenue,isHiring,nonprofit,num_former_names,has_logo,has_website,has_one_liner,"Austin, TX, USA","Bengaluru, KA, India","Boston, MA, USA","London, England, United Kingdom","Los Angeles, CA, USA","Mexico City, CDMX, Mexico","Mountain View, CA, USA","New York, NY, USA","Palo Alto, CA, USA","Paris, Île-de-France, France",Remote,"San Francisco, CA, USA","Seattle, WA, USA","Toronto, ON, Canada",other_city,Canada - region,Europe - region,India - region,Latin America - region,United States of America - region,other_region,Unspecified - region,num_regions,num_tags,AI,API,Analytics,Artificial Intelligence,B2B,Biotech,Climate,Consumer,Consumer Finance,Consumer Health Services,Consumer Health and Wellness,Content,Crypto / Web3,Data Engineering,Developer Tools,E-commerce,Education,Enterprise,Finance and Accounting,Fintech,Food and Beverage,Gaming,Generative AI,Hardware,Health Tech,Healthcare,Healthcare IT,Home and Personal,Housing and Real Estate,Human Resources,Industrials,Infrastructure,Logistics,Machine Learning,Marketing,Marketplace,Open Source,Operations,Payments,Productivity,Proptech,Real Estate and Construction,Retail,SaaS,Sales,Security,Social,Supply Chain and Logistics,other_tag,years_since_founding,Early,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299,300
0,0.014836,0.058983,0.003637,0.642867,1.574596,0.150562,6.031115e-07,0.145513,-0.105389,0.814051,0.439096,0.812909,1.09314,0.321479,0.627711,-0.187209,0.329988,-0.008995,-0.134255,-0.173809,-0.51045,-0.607272,-0.781506,0.220193,-0.941183,0.462018,0.219602,-0.043273,-0.048359,-0.235319,-0.594887,-0.006504,-0.178004,-0.805312,0.182977,-0.065327,-0.190055,0.319825,0.64417,0.370638,0.494243,-0.281973,-0.092127,0.101335,0.707284,-0.709087,-1.174333,0.325031,-0.279491,0.268267,0.219118,0.112493,0.452915,0.027388,0.155605,0.415747,-0.180151,0.010754,-0.369201,0.473844,-0.341154,0.068034,0.778057,0.308941,0.279076,-0.544893,-0.217575,0.50658,0.235555,0.107141,0.248865,-0.350743,0.526985,0.28806,0.67506,0.363039,-0.129144,0.127087,-0.430741,0.470986,-0.168615,0.481108,-0.209852,-0.352042,0.078918,0.082059,-0.781947,0.116021,0.102196,0.125022,-0.109791,-0.150798,0.090398,-0.449208,-0.140191,0.321688,-0.269477,-0.500971,0.026321,-0.075764,0.22437,-0.097969,-0.330248,-0.297067,-0.270931,0.157518,0.107434,-0.185574,0.699329,0.020456,0.168738,-0.343215,-0.55309,-0.645704,0.076458,-0.053703,-0.082728,0.085792,0.376069,0.533296,-0.1343,0.009599,-0.122491,0.008944,0.065679,0.417993,0.189458,-0.064293,-0.435965,0.137331,0.172101,-0.176781,0.052645,-0.007492,0.227059,0.481327,-0.012267,0.101448,-0.059461,-0.321168,-0.201997,-0.118561,0.252561,0.431152,-0.2216,-0.141229,-0.141882,0.133181,0.224785,0.020042,0.275905,0.660231,0.512992,0.113265,0.558409,-0.095075,-0.15605,-0.053228,-0.255002,-0.142812,-0.128853,0.041052,0.246101,0.146542,0.365889,-0.195451,-0.060164,0.094755,-0.152643,-0.178628,0.236956,0.368243,0.37368,-0.229908,0.569059,-0.129025,-0.135591,-0.114551,-0.141687,-0.074972,0.172927,0.186786,-0.258062,0.384336,0.053802,-0.275151,-0.636318,0.336292,-0.368604,0.439548,0.112137,-0.014987,-0.395489,-0.013981,-0.357515,0.549634,-0.058959,0.226079,-0.264936,0.09458,0.123332,-0.171864,0.314163,0.071738,-0.452409,0.025112,-0.453606,0.516842,0.072797,-0.233114,0.118733,0.545109,-0.08491,-0.091206,0.102594,0.26464,0.475281,0.329905,0.295036,0.111253,-0.131051,-0.205103,-0.215995,0.125314,-0.089859,0.561641,-0.258302,-0.285967,0.119401,-0.050386,0.048003,-0.109225,0.145268,-0.372858,0.33354,0.039844,-0.011183,-0.063527,0.204816,0.128153,0.431445,-0.003335,0.489564,-0.197658,0.012752,-0.294448,0.307753,-0.261288,0.044595,-0.082277,0.048696,-0.038524,0.447438,-0.049429,0.337395,-0.254978,0.149392,-0.231608,-0.57411,0.386302,0.263937,-0.222461,-0.138555,-0.23631,0.029566,-0.178965,-0.447188,-0.136971,-0.19533,0.075262,-0.172854,-0.194322,-0.181937,-0.250686,-0.236472,0.321731,-0.086818,-0.053453,0.053751,0.378193,0.151743,-0.107838,-0.160681,0.004104,0.05295,-0.220276,0.081803,0.06289,-0.277103,-0.265811,-0.66851,-0.065633,0.013,0.009009,0.151109,0.162399,-0.267,0.039833,0.094303,0.089388,-0.222614,-0.19783,-0.317321,-0.049631,0.034714,0.121864,-0.032735,-0.121889,0.01098,0.143556,-0.26471,0.022145,0.064517,-0.11588,-0.204713,0.075385,0.027062,0.308163,-0.150097,0.169924,-0.267511,-0.185675,0.087072,-0.212931,0.427404,-0.330861,0.194286,-0.067736,0.373351,-0.066224,0.131342,-0.807535,-0.39955,0.31621,-0.033992,-0.42832,0.121469,-0.293139,0.056375,0.186933,0.155162,0.441178,-0.630441,-0.03252,0.481451,-0.019095,-0.322993,0.170666,0.17585,0.113388,-0.084125,0.02835,-0.255521,-0.031467,0.360148,0.387954,-0.059626,0.079704,-0.03307,0.663315,0.404507,-0.143166,0.448243,-0.217103,-0.052994,-0.088362,0.162327,0.438305,0.435842,-0.159254,-0.400431,0.129347,-0.024294,0.064391,0.107734,0.217678,0.297923,-0.031071,0.133038,-0.285872,0.12444,-0.428339,-0.385296,-0.367293,0.267734,0.369788,-0.566963


In [279]:
# adding the odds ratios to the dataframe to look at them side by side with the coefficients
coef.loc[1] = np.exp(logreg_model.coef_)[0]
coef.rename({0:'coefficients', 1:'odds_ratios'}, inplace=True)

In [280]:
# sorting by coefficients, descending --> positively correlates with success
coef.sort_values(axis=1, by='coefficients', ascending=False)

Unnamed: 0,top_company,"Austin, TX, USA",has_logo,has_one_liner,Healthcare IT,Consumer Finance,22,Payments,273,65,Analytics,highlight_women,"Boston, MA, USA",88,139,68,109,125,33,Open Source,121,66,Infrastructure,B2B,156,258,49,Security,130,Hardware,SaaS,"Toronto, ON, Canada",Education,276,166,255,103,has_website,281,282,154,57,238,39,Fintech,274,269,173,97,193,32,86,242,Artificial Intelligence,299,85,78,Productivity,268,168,101,148,"Los Angeles, CA, USA",131,Content,189,9,"Bengaluru, KA, India",API,247,116,Home and Personal,231,160,290,132,Operations,Housing and Real Estate,64,Data Engineering,298,129,174,56,Marketing,76,84,Logistics,48,111,62,14,"San Francisco, CA, USA",other_city,Developer Tools,289,152,240,40,253,95,num_regions,262,94,44,261,233,24,209,280,19,Finance and Accounting,254,194,208,top_company_by_revenue,170,77,nonprofit,146,223,43,61,292,244,285,153,Real Estate and Construction,137,3,294,114,219,250,142,124,1,263,67,E-commerce,104,133,288,20,Machine Learning,128,2,51,Consumer,81,113,212,6,213,236,31,years_since_founding,200,271,other_tag,28,229,183,122,117,Healthcare,38,226,287,201,highlight_black,252,98,192,198,46,164,144,162,75,149,211,218,178,265,Enterprise,230,12,119,225,23,63,team_size,206,158,222,Gaming,35,207,37,197,highlight_latinx,isHiring,155,United States of America - region,47,"Mexico City, CDMX, Mexico",150,50,107,105,259,286,291,267,257,220,272,248,165,Canada - region,Europe - region,167,217,143,278,71,191,29,110,52,270,80,151,41,num_tags,205,243,241,93,13,163,30,264,126,190,279,138,127,Climate,69,15,num_former_names,195,145,4,91,227,55,221,36,74,89,Proptech,134,"Mountain View, CA, USA",34,90,181,176,8,59,92,60,73,275,232,5,82,70,283,196,Sales,115,184,"New York, NY, USA",45,other_region,83,179,Food and Beverage,186,21,235,"London, England, United Kingdom",AI,185,182,79,157,215,54,228,135,Social,237,136,277,Industrials,199,58,175,214,87,171,123,India - region,177,188,187,169,72,266,96,140,161,224,112,203,210,234,10,18,99,202,Crypto / Web3,Biotech,293,141,251,159,17,216,53,260,16,239,Health Tech,25,Marketplace,Supply Chain and Logistics,108,297,102,Generative AI,147,296,106,246,284,249,295,Retail,42,180,7,118,120,11,"Palo Alto, CA, USA",Human Resources,26,300,172,Latin America - region,"Paris, Île-de-France, France",256,100,27,204,Consumer Health Services,Remote,Early,Unspecified - region,245,"Seattle, WA, USA",Consumer Health and Wellness
coefficients,1.574596,1.09314,0.814051,0.812909,0.778057,0.707284,0.699329,0.67506,0.663315,0.660231,0.64417,0.642867,0.627711,0.569059,0.561641,0.558409,0.549634,0.545109,0.533296,0.526985,0.516842,0.512992,0.50658,0.494243,0.489564,0.481451,0.481327,0.481108,0.475281,0.473844,0.470986,0.462018,0.452915,0.448243,0.447438,0.441178,0.439548,0.439096,0.438305,0.435842,0.431445,0.431152,0.427404,0.417993,0.415747,0.404507,0.387954,0.386302,0.384336,0.378193,0.376069,0.37368,0.373351,0.370638,0.369788,0.368243,0.365889,0.363039,0.360148,0.337395,0.336292,0.33354,0.329988,0.329905,0.325031,0.321731,0.321688,0.321479,0.319825,0.31621,0.314163,0.308941,0.308163,0.307753,0.297923,0.295036,0.28806,0.279076,0.275905,0.268267,0.267734,0.26464,0.263937,0.252561,0.248865,0.246101,0.236956,0.235555,0.227059,0.226079,0.224785,0.22437,0.220193,0.219602,0.219118,0.217678,0.204816,0.194286,0.189458,0.186933,0.186786,0.182977,0.17585,0.172927,0.172101,0.170666,0.169924,0.168738,0.162399,0.162327,0.157518,0.155605,0.155162,0.151743,0.151109,0.150562,0.149392,0.146542,0.145513,0.145268,0.143556,0.137331,0.133181,0.133038,0.131342,0.129347,0.128153,0.127087,0.125314,0.125022,0.12444,0.123332,0.121864,0.121469,0.119401,0.118733,0.116021,0.113388,0.113265,0.112493,0.112137,0.111253,0.107734,0.107434,0.107141,0.102594,0.102196,0.101448,0.101335,0.094755,0.09458,0.094303,0.090398,0.089388,0.087072,0.085792,0.082059,0.081803,0.079704,0.078918,0.076458,0.075385,0.075262,0.072797,0.071738,0.068034,0.065679,0.064517,0.064391,0.06289,0.058983,0.056375,0.053802,0.053751,0.05295,0.052645,0.048696,0.048003,0.044595,0.041052,0.039844,0.039833,0.034714,0.029566,0.02835,0.027388,0.027062,0.026321,0.025112,0.022145,0.020456,0.020042,0.014836,0.013,0.012752,0.01098,0.010754,0.009599,0.009009,0.008944,0.004104,0.003637,6.031115e-07,-0.003335,-0.006504,-0.007492,-0.008995,-0.011183,-0.012267,-0.013981,-0.014987,-0.019095,-0.024294,-0.031071,-0.031467,-0.03252,-0.032735,-0.03307,-0.033992,-0.038524,-0.043273,-0.048359,-0.049429,-0.049631,-0.050386,-0.052994,-0.053228,-0.053453,-0.053703,-0.058959,-0.059461,-0.059626,-0.060164,-0.063527,-0.064293,-0.065327,-0.065633,-0.066224,-0.067736,-0.074972,-0.075764,-0.082277,-0.082728,-0.084125,-0.08491,-0.086818,-0.088362,-0.089859,-0.091206,-0.092127,-0.095075,-0.097969,-0.105389,-0.107838,-0.109225,-0.109791,-0.114551,-0.11588,-0.118561,-0.121889,-0.122491,-0.128853,-0.129025,-0.129144,-0.131051,-0.134255,-0.1343,-0.135591,-0.136971,-0.138555,-0.140191,-0.141229,-0.141687,-0.141882,-0.142812,-0.143166,-0.150097,-0.150798,-0.152643,-0.15605,-0.159254,-0.160681,-0.168615,-0.171864,-0.172854,-0.173809,-0.176781,-0.178004,-0.178628,-0.178965,-0.180151,-0.181937,-0.185574,-0.185675,-0.187209,-0.190055,-0.194322,-0.19533,-0.195451,-0.197658,-0.19783,-0.201997,-0.204713,-0.205103,-0.209852,-0.212931,-0.215995,-0.217103,-0.217575,-0.220276,-0.2216,-0.222461,-0.222614,-0.229908,-0.231608,-0.233114,-0.235319,-0.23631,-0.236472,-0.250686,-0.254978,-0.255002,-0.255521,-0.258062,-0.258302,-0.261288,-0.26471,-0.264936,-0.265811,-0.267,-0.267511,-0.269477,-0.270931,-0.275151,-0.277103,-0.279491,-0.281973,-0.285872,-0.285967,-0.293139,-0.294448,-0.297067,-0.317321,-0.321168,-0.322993,-0.330248,-0.330861,-0.341154,-0.343215,-0.350743,-0.352042,-0.357515,-0.367293,-0.368604,-0.369201,-0.372858,-0.385296,-0.395489,-0.39955,-0.400431,-0.42832,-0.428339,-0.430741,-0.435965,-0.447188,-0.449208,-0.452409,-0.453606,-0.500971,-0.51045,-0.544893,-0.55309,-0.566963,-0.57411,-0.594887,-0.607272,-0.630441,-0.636318,-0.645704,-0.66851,-0.709087,-0.781506,-0.781947,-0.805312,-0.807535,-0.941183,-1.174333
odds_ratios,4.828792,2.983627,2.257033,2.254457,2.177238,2.028475,2.012402,1.96415,1.941216,1.935239,1.904406,1.901925,1.873317,1.766605,1.753547,1.74789,1.732618,1.724797,1.704542,1.693818,1.676724,1.67028,1.659606,1.639256,1.631604,1.618422,1.61822,1.617865,1.608466,1.606157,1.601573,1.587274,1.572891,1.565559,1.5643,1.554538,1.552006,1.551304,1.550077,1.546264,1.539481,1.539029,1.533272,1.51891,1.515503,1.498564,1.473962,1.471528,1.468639,1.459645,1.456547,1.453071,1.452593,1.448658,1.447428,1.445194,1.441795,1.437692,1.433542,1.401292,1.399748,1.395901,1.390952,1.390836,1.384073,1.379514,1.379454,1.379166,1.376887,1.371918,1.369113,1.361982,1.360923,1.360364,1.347058,1.343174,1.333837,1.321908,1.317723,1.307697,1.306999,1.302961,1.302047,1.287318,1.282569,1.279029,1.267386,1.265611,1.254904,1.253674,1.252053,1.251534,1.246318,1.245581,1.244978,1.243187,1.227299,1.214443,1.208595,1.205547,1.205369,1.200787,1.192259,1.18878,1.187798,1.186095,1.185214,1.183811,1.176329,1.176245,1.170602,1.168364,1.167847,1.163861,1.163123,1.162487,1.161128,1.157823,1.156633,1.15635,1.154371,1.147208,1.142457,1.142294,1.140358,1.138084,1.136727,1.135515,1.133504,1.133173,1.132514,1.13126,1.1296,1.129155,1.126822,1.126069,1.12302,1.120066,1.119929,1.119065,1.118666,1.117678,1.113752,1.113417,1.113092,1.108042,1.107601,1.106773,1.106647,1.099389,1.099197,1.098892,1.09461,1.093505,1.090976,1.08958,1.08552,1.085242,1.082967,1.082115,1.079456,1.0783,1.078167,1.075512,1.074374,1.070401,1.067884,1.066644,1.06651,1.06491,1.060758,1.057994,1.055276,1.055222,1.054377,1.054056,1.049901,1.049173,1.045604,1.041906,1.040648,1.040637,1.035323,1.030008,1.028756,1.027766,1.027432,1.026671,1.025429,1.022392,1.020667,1.020244,1.014947,1.013085,1.012834,1.011041,1.010812,1.009645,1.009049,1.008984,1.004112,1.003644,1.000001,0.99667,0.993517,0.992536,0.991046,0.98888,0.987808,0.986117,0.985125,0.981086,0.975999,0.969406,0.969023,0.968003,0.967795,0.967471,0.966579,0.962208,0.95765,0.952792,0.951773,0.95158,0.950862,0.948386,0.948163,0.94795,0.947714,0.942746,0.942272,0.942117,0.94161,0.938449,0.93773,0.936761,0.936475,0.935921,0.934507,0.927769,0.927035,0.921017,0.920602,0.919316,0.918595,0.916844,0.91543,0.91406,0.912829,0.911989,0.909305,0.906677,0.899974,0.897773,0.896528,0.896021,0.891766,0.890582,0.888198,0.885247,0.884714,0.879104,0.878952,0.878847,0.877173,0.874367,0.874328,0.8732,0.871995,0.870615,0.869192,0.868291,0.867893,0.867724,0.866917,0.866611,0.860624,0.860022,0.858436,0.855517,0.852779,0.851564,0.844834,0.842094,0.84126,0.840458,0.837963,0.836939,0.836417,0.836135,0.835144,0.833654,0.830628,0.830544,0.829271,0.826914,0.823393,0.822564,0.822464,0.82065,0.820509,0.817097,0.814882,0.814564,0.810704,0.808212,0.805739,0.804847,0.804467,0.802298,0.801236,0.800547,0.800424,0.794607,0.793257,0.792063,0.790319,0.789536,0.789408,0.778267,0.774934,0.774915,0.774513,0.772547,0.772362,0.770059,0.767428,0.767255,0.766584,0.765673,0.765282,0.763779,0.762669,0.759457,0.757976,0.756168,0.754294,0.751359,0.751287,0.745918,0.744943,0.742994,0.728097,0.725301,0.723979,0.718746,0.718305,0.71095,0.709486,0.704165,0.703251,0.699412,0.692607,0.691699,0.691286,0.688763,0.680249,0.673351,0.670622,0.670031,0.651603,0.65159,0.650027,0.64664,0.639423,0.638133,0.636094,0.635333,0.605942,0.600225,0.579904,0.57517,0.567246,0.563206,0.551625,0.544835,0.532357,0.529237,0.524293,0.512472,0.492093,0.457716,0.457515,0.446948,0.445956,0.390166,0.309025


In [281]:
# sorting by coefficients, ascending --> negatively correlates with success
coef.sort_values(axis=1, by='coefficients')

Unnamed: 0,Consumer Health and Wellness,"Seattle, WA, USA",245,Unspecified - region,Early,Remote,Consumer Health Services,204,27,100,256,"Paris, Île-de-France, France",Latin America - region,172,300,26,Human Resources,"Palo Alto, CA, USA",11,120,118,7,180,42,Retail,295,249,284,246,106,296,147,Generative AI,102,297,108,Supply Chain and Logistics,Marketplace,25,Health Tech,239,16,260,53,216,17,159,251,141,293,Biotech,Crypto / Web3,202,99,18,10,234,210,203,112,224,161,140,96,266,72,169,187,188,177,India - region,123,171,87,214,175,58,199,Industrials,277,136,237,Social,135,228,54,215,157,79,182,185,AI,"London, England, United Kingdom",235,21,186,Food and Beverage,179,83,other_region,45,"New York, NY, USA",184,115,Sales,196,283,70,82,5,232,275,73,60,92,59,8,176,181,90,34,"Mountain View, CA, USA",134,Proptech,89,74,36,221,55,227,91,4,145,195,num_former_names,15,69,Climate,127,138,279,190,126,264,30,163,13,93,241,243,205,num_tags,41,151,80,270,52,110,29,191,71,278,143,217,167,Europe - region,Canada - region,165,248,272,220,257,267,291,286,259,105,107,50,150,"Mexico City, CDMX, Mexico",47,United States of America - region,155,isHiring,highlight_latinx,197,37,207,35,Gaming,222,158,206,team_size,63,23,225,119,12,230,Enterprise,265,178,218,211,149,75,162,144,164,46,198,192,98,252,highlight_black,201,287,226,38,Healthcare,117,122,183,229,28,other_tag,271,200,years_since_founding,31,236,213,6,212,113,81,Consumer,51,2,128,Machine Learning,20,288,133,104,E-commerce,67,263,1,124,142,250,219,114,294,3,137,Real Estate and Construction,153,285,244,292,61,43,223,146,nonprofit,77,170,top_company_by_revenue,208,194,254,Finance and Accounting,19,280,209,24,233,261,44,94,262,num_regions,95,253,40,240,152,289,Developer Tools,other_city,"San Francisco, CA, USA",14,62,111,48,Logistics,84,76,Marketing,56,174,129,298,Data Engineering,64,Housing and Real Estate,Operations,132,290,160,231,Home and Personal,116,247,API,"Bengaluru, KA, India",9,189,Content,131,"Los Angeles, CA, USA",148,101,168,268,Productivity,78,85,299,Artificial Intelligence,242,86,32,193,97,173,269,274,Fintech,39,238,57,154,282,281,has_website,103,255,166,276,Education,"Toronto, ON, Canada",SaaS,Hardware,130,Security,49,258,156,B2B,Infrastructure,66,121,Open Source,33,125,109,68,139,88,"Boston, MA, USA",highlight_women,Analytics,65,273,Payments,22,Consumer Finance,Healthcare IT,has_one_liner,has_logo,"Austin, TX, USA",top_company
coefficients,-1.174333,-0.941183,-0.807535,-0.805312,-0.781947,-0.781506,-0.709087,-0.66851,-0.645704,-0.636318,-0.630441,-0.607272,-0.594887,-0.57411,-0.566963,-0.55309,-0.544893,-0.51045,-0.500971,-0.453606,-0.452409,-0.449208,-0.447188,-0.435965,-0.430741,-0.428339,-0.42832,-0.400431,-0.39955,-0.395489,-0.385296,-0.372858,-0.369201,-0.368604,-0.367293,-0.357515,-0.352042,-0.350743,-0.343215,-0.341154,-0.330861,-0.330248,-0.322993,-0.321168,-0.317321,-0.297067,-0.294448,-0.293139,-0.285967,-0.285872,-0.281973,-0.279491,-0.277103,-0.275151,-0.270931,-0.269477,-0.267511,-0.267,-0.265811,-0.264936,-0.26471,-0.261288,-0.258302,-0.258062,-0.255521,-0.255002,-0.254978,-0.250686,-0.236472,-0.23631,-0.235319,-0.233114,-0.231608,-0.229908,-0.222614,-0.222461,-0.2216,-0.220276,-0.217575,-0.217103,-0.215995,-0.212931,-0.209852,-0.205103,-0.204713,-0.201997,-0.19783,-0.197658,-0.195451,-0.19533,-0.194322,-0.190055,-0.187209,-0.185675,-0.185574,-0.181937,-0.180151,-0.178965,-0.178628,-0.178004,-0.176781,-0.173809,-0.172854,-0.171864,-0.168615,-0.160681,-0.159254,-0.15605,-0.152643,-0.150798,-0.150097,-0.143166,-0.142812,-0.141882,-0.141687,-0.141229,-0.140191,-0.138555,-0.136971,-0.135591,-0.1343,-0.134255,-0.131051,-0.129144,-0.129025,-0.128853,-0.122491,-0.121889,-0.118561,-0.11588,-0.114551,-0.109791,-0.109225,-0.107838,-0.105389,-0.097969,-0.095075,-0.092127,-0.091206,-0.089859,-0.088362,-0.086818,-0.08491,-0.084125,-0.082728,-0.082277,-0.075764,-0.074972,-0.067736,-0.066224,-0.065633,-0.065327,-0.064293,-0.063527,-0.060164,-0.059626,-0.059461,-0.058959,-0.053703,-0.053453,-0.053228,-0.052994,-0.050386,-0.049631,-0.049429,-0.048359,-0.043273,-0.038524,-0.033992,-0.03307,-0.032735,-0.03252,-0.031467,-0.031071,-0.024294,-0.019095,-0.014987,-0.013981,-0.012267,-0.011183,-0.008995,-0.007492,-0.006504,-0.003335,6.031115e-07,0.003637,0.004104,0.008944,0.009009,0.009599,0.010754,0.01098,0.012752,0.013,0.014836,0.020042,0.020456,0.022145,0.025112,0.026321,0.027062,0.027388,0.02835,0.029566,0.034714,0.039833,0.039844,0.041052,0.044595,0.048003,0.048696,0.052645,0.05295,0.053751,0.053802,0.056375,0.058983,0.06289,0.064391,0.064517,0.065679,0.068034,0.071738,0.072797,0.075262,0.075385,0.076458,0.078918,0.079704,0.081803,0.082059,0.085792,0.087072,0.089388,0.090398,0.094303,0.09458,0.094755,0.101335,0.101448,0.102196,0.102594,0.107141,0.107434,0.107734,0.111253,0.112137,0.112493,0.113265,0.113388,0.116021,0.118733,0.119401,0.121469,0.121864,0.123332,0.12444,0.125022,0.125314,0.127087,0.128153,0.129347,0.131342,0.133038,0.133181,0.137331,0.143556,0.145268,0.145513,0.146542,0.149392,0.150562,0.151109,0.151743,0.155162,0.155605,0.157518,0.162327,0.162399,0.168738,0.169924,0.170666,0.172101,0.172927,0.17585,0.182977,0.186786,0.186933,0.189458,0.194286,0.204816,0.217678,0.219118,0.219602,0.220193,0.22437,0.224785,0.226079,0.227059,0.235555,0.236956,0.246101,0.248865,0.252561,0.263937,0.26464,0.267734,0.268267,0.275905,0.279076,0.28806,0.295036,0.297923,0.307753,0.308163,0.308941,0.314163,0.31621,0.319825,0.321479,0.321688,0.321731,0.325031,0.329905,0.329988,0.33354,0.336292,0.337395,0.360148,0.363039,0.365889,0.368243,0.369788,0.370638,0.373351,0.37368,0.376069,0.378193,0.384336,0.386302,0.387954,0.404507,0.415747,0.417993,0.427404,0.431152,0.431445,0.435842,0.438305,0.439096,0.439548,0.441178,0.447438,0.448243,0.452915,0.462018,0.470986,0.473844,0.475281,0.481108,0.481327,0.481451,0.489564,0.494243,0.50658,0.512992,0.516842,0.526985,0.533296,0.545109,0.549634,0.558409,0.561641,0.569059,0.627711,0.642867,0.64417,0.660231,0.663315,0.67506,0.699329,0.707284,0.778057,0.812909,0.814051,1.09314,1.574596
odds_ratios,0.309025,0.390166,0.445956,0.446948,0.457515,0.457716,0.492093,0.512472,0.524293,0.529237,0.532357,0.544835,0.551625,0.563206,0.567246,0.57517,0.579904,0.600225,0.605942,0.635333,0.636094,0.638133,0.639423,0.64664,0.650027,0.65159,0.651603,0.670031,0.670622,0.673351,0.680249,0.688763,0.691286,0.691699,0.692607,0.699412,0.703251,0.704165,0.709486,0.71095,0.718305,0.718746,0.723979,0.725301,0.728097,0.742994,0.744943,0.745918,0.751287,0.751359,0.754294,0.756168,0.757976,0.759457,0.762669,0.763779,0.765282,0.765673,0.766584,0.767255,0.767428,0.770059,0.772362,0.772547,0.774513,0.774915,0.774934,0.778267,0.789408,0.789536,0.790319,0.792063,0.793257,0.794607,0.800424,0.800547,0.801236,0.802298,0.804467,0.804847,0.805739,0.808212,0.810704,0.814564,0.814882,0.817097,0.820509,0.82065,0.822464,0.822564,0.823393,0.826914,0.829271,0.830544,0.830628,0.833654,0.835144,0.836135,0.836417,0.836939,0.837963,0.840458,0.84126,0.842094,0.844834,0.851564,0.852779,0.855517,0.858436,0.860022,0.860624,0.866611,0.866917,0.867724,0.867893,0.868291,0.869192,0.870615,0.871995,0.8732,0.874328,0.874367,0.877173,0.878847,0.878952,0.879104,0.884714,0.885247,0.888198,0.890582,0.891766,0.896021,0.896528,0.897773,0.899974,0.906677,0.909305,0.911989,0.912829,0.91406,0.91543,0.916844,0.918595,0.919316,0.920602,0.921017,0.927035,0.927769,0.934507,0.935921,0.936475,0.936761,0.93773,0.938449,0.94161,0.942117,0.942272,0.942746,0.947714,0.94795,0.948163,0.948386,0.950862,0.95158,0.951773,0.952792,0.95765,0.962208,0.966579,0.967471,0.967795,0.968003,0.969023,0.969406,0.975999,0.981086,0.985125,0.986117,0.987808,0.98888,0.991046,0.992536,0.993517,0.99667,1.000001,1.003644,1.004112,1.008984,1.009049,1.009645,1.010812,1.011041,1.012834,1.013085,1.014947,1.020244,1.020667,1.022392,1.025429,1.026671,1.027432,1.027766,1.028756,1.030008,1.035323,1.040637,1.040648,1.041906,1.045604,1.049173,1.049901,1.054056,1.054377,1.055222,1.055276,1.057994,1.060758,1.06491,1.06651,1.066644,1.067884,1.070401,1.074374,1.075512,1.078167,1.0783,1.079456,1.082115,1.082967,1.085242,1.08552,1.08958,1.090976,1.093505,1.09461,1.098892,1.099197,1.099389,1.106647,1.106773,1.107601,1.108042,1.113092,1.113417,1.113752,1.117678,1.118666,1.119065,1.119929,1.120066,1.12302,1.126069,1.126822,1.129155,1.1296,1.13126,1.132514,1.133173,1.133504,1.135515,1.136727,1.138084,1.140358,1.142294,1.142457,1.147208,1.154371,1.15635,1.156633,1.157823,1.161128,1.162487,1.163123,1.163861,1.167847,1.168364,1.170602,1.176245,1.176329,1.183811,1.185214,1.186095,1.187798,1.18878,1.192259,1.200787,1.205369,1.205547,1.208595,1.214443,1.227299,1.243187,1.244978,1.245581,1.246318,1.251534,1.252053,1.253674,1.254904,1.265611,1.267386,1.279029,1.282569,1.287318,1.302047,1.302961,1.306999,1.307697,1.317723,1.321908,1.333837,1.343174,1.347058,1.360364,1.360923,1.361982,1.369113,1.371918,1.376887,1.379166,1.379454,1.379514,1.384073,1.390836,1.390952,1.395901,1.399748,1.401292,1.433542,1.437692,1.441795,1.445194,1.447428,1.448658,1.452593,1.453071,1.456547,1.459645,1.468639,1.471528,1.473962,1.498564,1.515503,1.51891,1.533272,1.539029,1.539481,1.546264,1.550077,1.551304,1.552006,1.554538,1.5643,1.565559,1.572891,1.587274,1.601573,1.606157,1.608466,1.617865,1.61822,1.618422,1.631604,1.639256,1.659606,1.67028,1.676724,1.693818,1.704542,1.724797,1.732618,1.74789,1.753547,1.766605,1.873317,1.901925,1.904406,1.935239,1.941216,1.96415,2.012402,2.028475,2.177238,2.254457,2.257033,2.983627,4.828792


## Decision Tree Model <a class="anchor" id="dtmodel"></a>

In [282]:
# instantiate decision tree
dt_model = DecisionTreeClassifier(max_depth=7)

# fit it to training data
dt_model.fit(X_train, y_train)

# score the accuracy
dt_train_score = dt_model.score(X_train, y_train)
dt_test_score = dt_model.score(X_test, y_test)

print(f"Score on training set: {dt_train_score}")
print(f"Score on test set: {dt_test_score}")

Score on training set: 0.8976377952755905
Score on test set: 0.6502242152466368


In [283]:
dt_pred = dt_model.predict(X_test)
dt_precision = precision_score(y_test, dt_pred)
dt_recall = recall_score(y_test, dt_pred)
dt_f1 = f1_score(y_test, dt_pred)
print(f'precision = {dt_precision}, recall = {dt_recall}, f1 = {dt_f1}')

precision = 0.6236559139784946, recall = 0.5742574257425742, f1 = 0.5979381443298969


## Random Forest Model <a class="anchor" id="rfmodel"></a>

In [284]:
# instantiate
rf_model = RandomForestClassifier(n_estimators=50, max_depth=9)

# fit
rf_model.fit(X_train, y_train)

# score
rf_train_score = rf_model.score(X_train, y_train)
rf_test_score = rf_model.score(X_test, y_test)

print(f"Score on training set: {rf_train_score}")
print(f"Score on test set: {rf_test_score}")

Score on training set: 0.9966254218222722
Score on test set: 0.6591928251121076


In [285]:
rf_pred = rf_model.predict(X_test)
rf_precision = precision_score(y_test, rf_pred)
rf_recall = recall_score(y_test, rf_pred)
rf_f1 = f1_score(y_test, rf_pred)
print(f'precision = {rf_precision}, recall = {rf_recall}, f1 = {rf_f1}')

precision = 0.6984126984126984, recall = 0.43564356435643564, f1 = 0.5365853658536586


## Support Vector Machine Model <a class="anchor" id="svmmodel"></a>

In [286]:
# instantiate
svm_model = LinearSVC(dual='auto')

# fit
svm_model.fit(X_train, y_train)

# score
svm_train_score = svm_model.score(X_train, y_train)
svm_test_score = svm_model.score(X_test, y_test)

print(f"Score on training set: {svm_train_score}")
print(f"Score on test set: {svm_test_score}")

Score on training set: 0.8098987626546682
Score on test set: 0.7040358744394619


In [287]:
svm_pred = svm_model.predict(X_test)
svm_precision = precision_score(y_test, svm_pred)
svm_recall = recall_score(y_test, svm_pred)
svm_f1 = f1_score(y_test, svm_pred)
print(f'precision = {svm_precision}, recall = {svm_recall}, f1 = {svm_f1}')

precision = 0.7011494252873564, recall = 0.6039603960396039, f1 = 0.648936170212766


In [288]:
# instantiate
svc_model = SVC()

# fit
svc_model.fit(X_train, y_train)

# score
svc_train_score = svc_model.score(X_train, y_train)
svc_test_score = svc_model.score(X_test, y_test)

print(f"Score on training set: {svc_train_score}")
print(f"Score on test set: {svc_test_score}")

Score on training set: 0.6186726659167604
Score on test set: 0.6457399103139013


In [289]:
svc_pred = svc_model.predict(X_test)
svc_precision = precision_score(y_test, svc_pred)
svc_recall = recall_score(y_test, svc_pred)
svc_f1 = f1_score(y_test, svc_pred)
print(f'precision = {svc_precision}, recall = {svc_recall}, f1 = {svc_f1}')

precision = 0.9230769230769231, recall = 0.2376237623762376, f1 = 0.3779527559055118


# Iteration without certain columns

In [325]:
success_measures = df[['team_size', 'years_since_founding', 'top_company', 'top_company_by_revenue', 'Acquired', 'Active', 'Inactive', 'Public', 'Early', 'Growth', 'isHiring', 'num_former_names', 'num_locations', 'num_regions']]