# Unicorns (Python)

- exploratory data analysis (EDA)

In [116]:
# Libraries
import numpy as np
import pandas as pd
import re

from skimpy import clean_columns

In [117]:
# Data
data = pd.read_csv(
    r"data\unicorn_companies.csv",
    delimiter=None,
    header=0, 
    skiprows=None, 
    nrows=None
)

In [118]:
# Display the first 10 rows of the data
data.head(n=10)

Unnamed: 0,Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors
0,Bytedance,$180B,4/7/17,Artificial intelligence,Beijing,China,Asia,2012,$8B,"Sequoia Capital China, SIG Asia Investments, S..."
1,SpaceX,$100B,12/1/12,Other,Hawthorne,United States,North America,2002,$7B,"Founders Fund, Draper Fisher Jurvetson, Rothen..."
2,SHEIN,$100B,7/3/18,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,$2B,"Tiger Global Management, Sequoia Capital China..."
3,Stripe,$95B,1/23/14,Fintech,San Francisco,United States,North America,2010,$2B,"Khosla Ventures, LowercaseCapital, capitalG"
4,Klarna,$46B,12/12/11,Fintech,Stockholm,Sweden,Europe,2005,$4B,"Institutional Venture Partners, Sequoia Capita..."
5,Canva,$40B,1/8/18,Internet software & services,Surry Hills,Australia,Oceania,2012,$572M,"Sequoia Capital China, Blackbird Ventures, Mat..."
6,Checkout.com,$40B,5/2/19,Fintech,London,United Kingdom,Europe,2012,$2B,"Tiger Global Management, Insight Partners, DST..."
7,Instacart,$39B,12/30/14,"Supply chain, logistics, & delivery",San Francisco,United States,North America,2012,$3B,"Khosla Ventures, Kleiner Perkins Caufield & By..."
8,JUUL Labs,$38B,12/20/17,Consumer & retail,San Francisco,United States,North America,2015,$14B,Tiger Global Management
9,Databricks,$38B,2/5/19,Data management & analytics,San Francisco,United States,North America,2013,$3B,"Andreessen Horowitz, New Enterprise Associates..."


In [119]:
# Get the shape
df_shape = data.shape
print("The df has {r} rows and {c} columns.".format(r = df_shape[0], c = df_shape[1]))

The df has 1074 rows and 10 columns.


In [120]:
# List the columns
data_cols = data.columns
data_cols_str = ", ".join(data_cols.tolist())
print("List of columns:", data_cols_str)

List of columns: Company, Valuation, Date Joined, Industry, City, Country/Region, Continent, Year Founded, Funding, Select Investors


In [121]:
# Tidy up col names
df = clean_columns(data)
print(df.columns.tolist())

['company', 'valuation', 'date_joined', 'industry', 'city', 'country_region', 'continent', 'year_founded', 'funding', 'select_investors']


In [122]:
# Survey the data frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1074 entries, 0 to 1073
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   company           1074 non-null   object
 1   valuation         1074 non-null   object
 2   date_joined       1074 non-null   object
 3   industry          1074 non-null   object
 4   city              1058 non-null   object
 5   country_region    1074 non-null   object
 6   continent         1074 non-null   object
 7   year_founded      1074 non-null   int64 
 8   funding           1074 non-null   object
 9   select_investors  1073 non-null   object
dtypes: int64(1), object(9)
memory usage: 84.0+ KB


In [123]:
# Fix the data types before surveying the statistics
    # Convert the valuation to floats
df["valuation"] = df["valuation"].str.extract(r"(\d+\.*\d*)")
df["valuation"] = df["valuation"].astype("float32")

In [124]:
    # Convert date_joined to dates
df["date_joined"] = df["date_joined"].str.replace(" *", "")
df["date_joined"] = pd.to_datetime(df["date_joined"], format = r"%m/%d/%y")

In [125]:
    # Convert year_founded to integers
df["year_founded"] = df["year_founded"].astype("Int32")

In [126]:
    # Convert funding to floats
df["funding"] = df["funding"].str.extract(r"(\d+\.*\d*)")
df["funding"] = df["funding"].astype("float32")

In [127]:
    # Add year_joined col and group _joined cols together
date_joined_idx = df.columns.get_loc("date_joined")
df.insert(date_joined_idx + 1, "year_joined", df["date_joined"].dt.year)

In [128]:
    # Rename cols to indicate to account for scale and currency
df = df.rename(columns={"valuation": "valuation_bn_usd", "funding": "funding_bn_usd"})

In [129]:
# Get NA counts
df.apply(lambda col: col.isna().sum())

company              0
valuation_bn_usd     0
date_joined          0
year_joined          0
industry             0
city                16
country_region       0
continent            0
year_founded         0
funding_bn_usd      12
select_investors     1
dtype: int64

In [130]:
# Survey summary statistics
df.select_dtypes(include=["float32"]).describe()

Unnamed: 0,valuation_bn_usd,funding_bn_usd
count,1074.0,1062.0
mean,3.455307,338.091339
std,8.547022,237.333145
min,1.0,0.0
25%,1.0,166.0
50%,2.0,300.0
75%,3.0,491.5
max,180.0,999.0


In [131]:
# Sample data
df.sample(n=10)

Unnamed: 0,company,valuation_bn_usd,date_joined,year_joined,industry,city,country_region,continent,year_founded,funding_bn_usd,select_investors
35,Biosplice Therapeutics,12.0,2018-08-06,2018,Health,San Diego,United States,North America,2008,799.0,"Vickers Venture Partners, IKEA GreenTech"
347,Remote,3.0,2021-07-13,2021,Fintech,San Francisco,United States,North America,2016,496.0,"Index Ventures, Sequoia Capital, General Catalyst"
198,Noom,4.0,2021-05-24,2021,Health,New York,United States,North America,2006,657.0,"Qualcomm Ventures, Samsung Ventures, Silver Lake"
66,Kavak,9.0,2020-10-01,2020,E-commerce & direct-to-consumer,Lerma de Villada,Mexico,North America,2016,2.0,"DST Global, SoftBank Group, Mountain Nazca"
338,Grafana Labs,3.0,2021-03-25,2021,Internet software & services,New York,United States,North America,2014,535.0,"Lightspeed Venture Partners, Lead Edge Capital..."
829,Pharmapacks,1.0,2020-11-12,2020,E-commerce & direct-to-consumer,Islandia,United States,North America,2010,433.0,The Carlyle Group
370,Jusfoun Big Data,2.0,2018-07-09,2018,Data management & analytics,Beijing,China,Asia,2010,137.0,"Boxin Capital, DT Capital Partners, IDG Capital"
971,Moka,1.0,2021-11-02,2021,Internet software & services,Beijing,China,Asia,2015,144.0,"GGV Capital, GSR Ventures, FreesFund"
556,Cloudinary,2.0,2022-02-15,2022,Internet software & services,Santa Clara,United States,North America,2011,100.0,"Blackstone, Bessemer Venture Partners"
1020,Shift Technology,1.0,2021-05-06,2021,Artificial intelligence,Paris,France,Europe,2014,545.0,"Griffin Gaming Partners, Andreessen Horowitz, ..."
