## step 1: imports

In [81]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [82]:
pd.set_option('display.max_columns', None)

In [83]:
companies = pd.read_csv('Modified_Unicorn_Companies.csv')
companies.head()

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors
0,Bytedance,180,2017-04-07,Artificial intelligence,Beijing,China,Asia,2012,$8B,"Sequoia Capital China, SIG Asia Investments, S..."
1,SpaceX,100,2012-12-01,Other,Hawthorne,United States,North America,2002,$7B,"Founders Fund, Draper Fisher Jurvetson, Rothen..."
2,SHEIN,100,2018-07-03,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,$2B,"Tiger Global Management, Sequoia Capital China..."
3,Stripe,95,2014-01-23,FinTech,San Francisco,United States,North America,2010,$2B,"Khosla Ventures, LowercaseCapital, capitalG"
4,Klarna,46,2011-12-12,Fintech,Stockholm,Sweden,Europe,2005,$4B,"Institutional Venture Partners, Sequoia Capita..."


## step 2: data cleaning 

In [84]:
companies.dtypes

Company             object
Valuation            int64
Date Joined         object
Industry            object
City                object
Country/Region      object
Continent           object
Year Founded         int64
Funding             object
Select Investors    object
dtype: object

In [85]:
# converting the 'object' dtype column to 'datetime'
companies["Date Joined"] = pd.to_datetime(companies["Date Joined"])

In [86]:
# adding a column called 'Years To Unicorn'
companies["Years To Unicorn"] = companies["Date Joined"].dt.year - companies["Year Founded"]

In [87]:
companies.head()

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors,Years To Unicorn
0,Bytedance,180,2017-04-07,Artificial intelligence,Beijing,China,Asia,2012,$8B,"Sequoia Capital China, SIG Asia Investments, S...",5
1,SpaceX,100,2012-12-01,Other,Hawthorne,United States,North America,2002,$7B,"Founders Fund, Draper Fisher Jurvetson, Rothen...",10
2,SHEIN,100,2018-07-03,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,$2B,"Tiger Global Management, Sequoia Capital China...",10
3,Stripe,95,2014-01-23,FinTech,San Francisco,United States,North America,2010,$2B,"Khosla Ventures, LowercaseCapital, capitalG",4
4,Klarna,46,2011-12-12,Fintech,Stockholm,Sweden,Europe,2005,$4B,"Institutional Venture Partners, Sequoia Capita...",6


#### input validation

##### correcting bad data

In [88]:
companies["Years To Unicorn"].describe()

count    1074.000000
mean        7.013035
std         5.331842
min        -3.000000
25%         4.000000
50%         6.000000
75%         9.000000
max        98.000000
Name: Years To Unicorn, dtype: float64

'Years To Unicorn' cannot be negative.

In [89]:
companies[companies['Years To Unicorn'] < 0]

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors,Years To Unicorn
527,InVision,2,2017-11-01,Internet software & services,New York,United States,North America,2020,$349M,"FirstMark Capital, Tiger Global Management, IC...",-3


In [90]:
print(companies.loc[companies['Company']=='InVision', 'Year Founded'])
companies.loc[companies['Company']=='InVision', 'Year Founded'] = 2011
# an internet search reveals that InVision was founded in 2011

527    2020
Name: Year Founded, dtype: int64


In [91]:
companies[companies['Company']=='InVision']
# checking out again

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors,Years To Unicorn
527,InVision,2,2017-11-01,Internet software & services,New York,United States,North America,2011,$349M,"FirstMark Capital, Tiger Global Management, IC...",-3


In [92]:
# recalculating all the values for 'Years To Unicorn'
companies["Years To Unicorn"] = companies["Date Joined"].dt.year - companies["Year Founded"]

# checking the final result
companies.loc[companies['Company']=='InVision', 'Years To Unicorn']

527    6
Name: Years To Unicorn, dtype: int64

In [93]:
# a list of all the industry name that should be in the dataset. any labels in the `Industry` column that are not in `industry_list` are misspellings.

industry_list = ['Artificial intelligence', 'Other','E-commerce & direct-to-consumer', 'Fintech',
       'Internet software & services','Supply chain, logistics, & delivery', 'Consumer & retail',
       'Data management & analytics', 'Edtech', 'Health', 'Hardware','Auto & transportation',
        'Travel', 'Cybersecurity','Mobile & telecommunications']

In [94]:
# checking which values are in 'Industry' but not in industry_list.

In [95]:
# first way
set(companies['Industry']) - set(industry_list)

{'Artificial Intelligence', 'Data management and analytics', 'FinTech'}

In [96]:
# second way
mask = ~companies['Industry'].isin(industry_list)
companies[mask]["Industry"].unique()

array(['FinTech', 'Data management and analytics',
       'Artificial Intelligence'], dtype=object)

In [97]:
## correcting the bad entries with 'replace()' method.

# companies["Industry"] = companies["Industry"].replace("Artificial Intelligence", "Artificial intelligence")
#
# companies["Industry"] = companies["Industry"].replace("Data management and analytics", "Data management & analytics")
#
# companies["Industry"] = companies["Industry"].replace("FinTech", "Fintech")


## a way simpler and way shorter way to do this operation:

replacement_dict = {
    "Artificial Intelligence" : "Artificial intelligence",
    "Data management and analytics" : "Data management & analytics",
    "FinTech" : "Fintech",
}

companies["Industry"] = companies["Industry"].replace(replacement_dict)

In [98]:
# checking if there are andy misspelling values still
set(companies['Industry']) - set(industry_list)

set()

In [99]:
companies[companies.duplicated(subset=['Company'], keep=False)]

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors,Years To Unicorn
385,BrewDog,2,2017-04-10,Consumer & retail,Aberdeen,United Kingdom,Europe,2007,$233M,"TSG Consumer Partners, Crowdcube",10
386,BrewDog,2,2017-04-10,Consumer & retail,Aberdeen,UnitedKingdom,Europe,2007,$233M,TSG Consumer Partners,10
510,ZocDoc,2,2015-08-20,Health,New York,United States,North America,2007,$374M,"Founders Fund, Khosla Ventures, Goldman Sachs",8
511,ZocDoc,2,2015-08-20,Health,,United States,North America,2007,$374M,Founders Fund,8
1031,SoundHound,1,2018-05-03,Artificial intelligence,Santa Clara,United States,North America,2005,$215M,"Tencent Holdings, Walden Venture Capital, Glob...",13
1032,SoundHound,1,2018-05-03,Other,Santa Clara,United States,North America,2005,$215M,Tencent Holdings,13


In [100]:
# dropping rows of duplicate companies after their first occurrence

companies = companies.drop_duplicates(subset=['Company'], keep="first")

### convert numerical data into categorical data

In [104]:
companies["High Valuation"] = pd.qcut(
    companies["Valuation"],
    2,
    labels=["low", "high"],
)
companies.sample(10)

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors,Years To Unicorn,High Valuation
714,Yotpo,1,2021-03-18,Internet software & services,New York,United States,North America,2011,$436M,"Bessemer Venture Partners, Vintage Investment ...",10,low
374,Ximalaya FM,2,2017-09-22,Mobile & telecommunications,Shanghai,China,Asia,2012,$71M,"China Creation Ventures, Sierra Ventures, Xing...",5,low
732,DistroKid,1,2021-08-16,Internet software & services,New York,United States,North America,2013,Unknown,"Insight Partners, Silversmith Capital Partners...",8,low
521,CoinSwitch Kuber,2,2021-10-06,Fintech,Bangalore,India,Asia,2017,$301M,"Tiger Global Management, Sequoia Capital India...",4,low
559,Druva,2,2019-06-20,Data management & analytics,Sunnyvale,United States,North America,2007,$475M,"Nexus Venture Partners, Tenaya Capital, Sequoi...",12,low
479,Reltio,2,2021-11-04,Data management & analytics,Redwood City,United States,North America,2011,$237M,"Crosslink Capital, .406 Ventures, Sapphire Ven...",10,low
113,Cityblock Health,6,2020-12-10,Health,Brooklyn,United States,North America,2017,$891M,"Thrive Capital, Maverick Ventures, Redpoint Ve...",3,high
302,Blockstream,3,2021-08-24,Fintech,Vancouver,Canada,North America,2014,$299M,"AME Cloud Ventures, Future Perfect Ventures, B...",7,high
441,Clio,2,2021-04-27,Internet software & services,Burnaby,Canada,North America,2008,$386M,"OMERS Private Equity, T. Rowe Price, Technolog...",13,low
370,Pagaya,2,2020-06-17,Fintech,Tel Aviv,Israel,Asia,2015,$417M,"Oak HC/FT Partners, GF Investments, Harvey Gol...",5,low


#### converting continent to numeric

the investment group has specified that they want to give more weight to continents with fewer unicorn companies because they believe this could indicate unrealized market potential

In [111]:
companies.value_counts(["Continent"])

Continent    
North America    586
Asia             310
Europe           143
South America     21
Oceania            8
Africa             3
Name: count, dtype: int64

In [119]:
continent_number_mapping = {
    "Africa" : 6,
    "Oceania" : 5,
    "South America" : 4,
    "Europe" : 3,
    "Asia" : 2,
    "North America" : 1,

}
companies["Continent Number"] = companies["Continent"].replace(continent_number_mapping)

  companies["Continent Number"] = companies["Continent"].replace(continent_number_mapping)


In [118]:
companies.head()

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors,Years To Unicorn,High Valuation,Continent Number
0,Bytedance,180,2017-04-07,Artificial intelligence,Beijing,China,Asia,2012,$8B,"Sequoia Capital China, SIG Asia Investments, S...",5,high,5
1,SpaceX,100,2012-12-01,Other,Hawthorne,United States,North America,2002,$7B,"Founders Fund, Draper Fisher Jurvetson, Rothen...",10,high,6
2,SHEIN,100,2018-07-03,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,$2B,"Tiger Global Management, Sequoia Capital China...",10,high,5
3,Stripe,95,2014-01-23,Fintech,San Francisco,United States,North America,2010,$2B,"Khosla Ventures, LowercaseCapital, capitalG",4,high,6
4,Klarna,46,2011-12-12,Fintech,Stockholm,Sweden,Europe,2005,$4B,"Institutional Venture Partners, Sequoia Capita...",6,high,4


In [120]:
companies["Country/Region Numeric"] = companies['Country/Region'].astype('category').cat.codes

In [121]:
companies["Country/Region Numeric"].head()

0     9
1    44
2     9
3    44
4    38
Name: Country/Region Numeric, dtype: int8

In [124]:
industry_encoded = pd.get_dummies(companies['Industry'])
companies = pd.concat([companies, industry_encoded], axis=1)
companies.head()

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors,Years To Unicorn,High Valuation,Continent Number,Country/Region Numeric,Artificial intelligence,Auto & transportation,Consumer & retail,Cybersecurity,Data management & analytics,E-commerce & direct-to-consumer,Edtech,Fintech,Hardware,Health,Internet software & services,Mobile & telecommunications,Other,"Supply chain, logistics, & delivery",Travel,Artificial intelligence.1,Auto & transportation.1,Consumer & retail.1,Cybersecurity.1,Data management & analytics.1,E-commerce & direct-to-consumer.1,Edtech.1,Fintech.1,Hardware.1,Health.1,Internet software & services.1,Mobile & telecommunications.1,Other.1,"Supply chain, logistics, & delivery.1",Travel.1,Artificial intelligence.2,Auto & transportation.2,Consumer & retail.2,Cybersecurity.2,Data management & analytics.2,E-commerce & direct-to-consumer.2,Edtech.2,Fintech.2,Hardware.2,Health.2,Internet software & services.2,Mobile & telecommunications.2,Other.2,"Supply chain, logistics, & delivery.2",Travel.2
0,Bytedance,180,2017-04-07,Artificial intelligence,Beijing,China,Asia,2012,$8B,"Sequoia Capital China, SIG Asia Investments, S...",5,high,2,9,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,SpaceX,100,2012-12-01,Other,Hawthorne,United States,North America,2002,$7B,"Founders Fund, Draper Fisher Jurvetson, Rothen...",10,high,1,44,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
2,SHEIN,100,2018-07-03,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,$2B,"Tiger Global Management, Sequoia Capital China...",10,high,2,9,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
3,Stripe,95,2014-01-23,Fintech,San Francisco,United States,North America,2010,$2B,"Khosla Ventures, LowercaseCapital, capitalG",4,high,1,44,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False
4,Klarna,46,2011-12-12,Fintech,Stockholm,Sweden,Europe,2005,$4B,"Institutional Venture Partners, Sequoia Capita...",6,high,3,38,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False
