# Unicorns (R)

- exploratory data analysis (EDA)

In [52]:
# Libraries
library(tidyverse)
library(readxl)
library(janitor)

In [53]:
# Data
data <- read_csv(
    file.path(r"(data\unicorn_companies.csv)"),
    skip = 0,
    col_names = TRUE,
    col_types = NULL,
    show_col_types = NULL
)

[1mRows: [22m[34m1074[39m [1mColumns: [22m[34m10[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (9): Company, Valuation, Date Joined, Industry, City, Country/Region, Co...
[32mdbl[39m (1): Year Founded

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [54]:
# Display the first 10 rows of the data
data |> head(n=10)

Company,Valuation,Date Joined,Industry,City,Country/Region,Continent,Year Founded,Funding,Select Investors
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>
Bytedance,$180B,4/7/17,Artificial intelligence,Beijing,China,Asia,2012,$8B,"Sequoia Capital China, SIG Asia Investments, Sina Weibo, Softbank Group"
SpaceX,$100B,12/1/12,Other,Hawthorne,United States,North America,2002,$7B,"Founders Fund, Draper Fisher Jurvetson, Rothenberg Ventures"
SHEIN,$100B,7/3/18,E-commerce & direct-to-consumer,Shenzhen,China,Asia,2008,$2B,"Tiger Global Management, Sequoia Capital China, Shunwei Capital Partners"
Stripe,$95B,1/23/14,Fintech,San Francisco,United States,North America,2010,$2B,"Khosla Ventures, LowercaseCapital, capitalG"
Klarna,$46B,12/12/11,Fintech,Stockholm,Sweden,Europe,2005,$4B,"Institutional Venture Partners, Sequoia Capital, General Atlantic"
Canva,$40B,1/8/18,Internet software & services,Surry Hills,Australia,Oceania,2012,$572M,"Sequoia Capital China, Blackbird Ventures, Matrix Partners"
Checkout.com,$40B,5/2/19,Fintech,London,United Kingdom,Europe,2012,$2B,"Tiger Global Management, Insight Partners, DST Global"
Instacart,$39B,12/30/14,"Supply chain, logistics, & delivery",San Francisco,United States,North America,2012,$3B,"Khosla Ventures, Kleiner Perkins Caufield & Byers, Collaborative Fund"
JUUL Labs,$38B,12/20/17,Consumer & retail,San Francisco,United States,North America,2015,$14B,Tiger Global Management
Databricks,$38B,2/5/19,Data management & analytics,San Francisco,United States,North America,2013,$3B,"Andreessen Horowitz, New Enterprise Associates, Battery Ventures"


In [55]:
# Get the shape
df_shape <- dim(data)
print(sprintf("The df has %d rows and %d columns.", df_shape[1], df_shape[2]))

[1] "The df has 1074 rows and 10 columns."


In [56]:
# List of columns
data_cols <- colnames(data)
data_cols_str <- paste(data_cols, collapse = ", ")
cat("List of columns:", data_cols_str)

List of columns: Company, Valuation, Date Joined, Industry, City, Country/Region, Continent, Year Founded, Funding, Select Investors

In [57]:
# Tidy up the col names
df <- data |>
    clean_names()

df |> colnames()

In [58]:
# Survey the data frame
df |> str()

spc_tbl_ [1,074 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ company         : chr [1:1074] "Bytedance" "SpaceX" "SHEIN" "Stripe" ...
 $ valuation       : chr [1:1074] "$180B" "$100B" "$100B" "$95B" ...
 $ date_joined     : chr [1:1074] "4/7/17" "12/1/12" "7/3/18" "1/23/14" ...
 $ industry        : chr [1:1074] "Artificial intelligence" "Other" "E-commerce & direct-to-consumer" "Fintech" ...
 $ city            : chr [1:1074] "Beijing" "Hawthorne" "Shenzhen" "San Francisco" ...
 $ country_region  : chr [1:1074] "China" "United States" "China" "United States" ...
 $ continent       : chr [1:1074] "Asia" "North America" "Asia" "North America" ...
 $ year_founded    : num [1:1074] 2012 2002 2008 2010 2005 ...
 $ funding         : chr [1:1074] "$8B" "$7B" "$2B" "$2B" ...
 $ select_investors: chr [1:1074] "Sequoia Capital China, SIG Asia Investments, Sina Weibo, Softbank Group" "Founders Fund, Draper Fisher Jurvetson, Rothenberg Ventures" "Tiger Global Management, Sequoia Capital China, 

In [None]:
# Fix the data types before surveying the statistics
df <- df |>
    mutate(
        # Convert the valuation to floats
        valuation = as.numeric(
            str_extract(valuation, r"(\d+\.*\d*)")
        ),
        # Convert date_joined to dates
        date_joined = parse_date_time(date_joined, orders = c(r"(%m/%d/%y)"), exact = TRUE),
        # Convert year_founded to integers
        year_founded = as.integer(year_founded),
        # Convert funding to floats
        funding = as.numeric(
            str_extract(funding, r"(\d+\.*\d*)")
        ),
        # Add year_joined col
        year_joined = year(date_joined)
    ) |>
    # Group _joined cols together
    relocate(
        year_joined,
        .after = "date_joined"
    ) |>
    # Rename cols to indicate to account for scale and currency
    rename_with(
        .cols = c("valuation", "funding"),
        .fn = ~ paste0(.x, "_bn_usd")
    )

In [60]:
# Get NA counts
df |>
    summarise(
        across(
            .cols = everything(),
            .fns = ~ sum(is.na(.x))
        )
    )

company,valuation_bn_usd,date_joined,year_joined,industry,city,country_region,continent,year_founded,funding_bn_usd,select_investors
<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
0,0,0,0,0,16,0,0,0,12,0


In [61]:
# Survey summary statistics
df |>
    select(
        where(is.numeric),
        -c(year_joined, year_founded)
    ) |>
    summary()

 valuation_bn_usd  funding_bn_usd 
 Min.   :  1.000   Min.   :  0.0  
 1st Qu.:  1.000   1st Qu.:166.0  
 Median :  2.000   Median :300.0  
 Mean   :  3.455   Mean   :338.1  
 3rd Qu.:  3.000   3rd Qu.:491.5  
 Max.   :180.000   Max.   :999.0  
                   NA's   :12     

In [62]:
# Sample data
df |>
    slice_sample(n=10)

company,valuation_bn_usd,date_joined,year_joined,industry,city,country_region,continent,year_founded,funding_bn_usd,select_investors
<chr>,<dbl>,<dttm>,<dbl>,<chr>,<chr>,<chr>,<chr>,<int>,<dbl>,<chr>
Maven Clinic,1,2021-08-17,2021,Health,New York,United States,North America,2014,202,"Female Founders Fund, Oak HC/FT Partners, Sequoia Capital"
Wayflyer,2,2022-02-01,2022,Fintech,Dublin,Ireland,Europe,2019,236,"QED Investors, DST Global, Left Lane Capital"
Quizlet,1,2020-05-13,2020,Edtech,San Francisco,United States,North America,2005,62,"Union Square Ventures, Altos Ventures, Costanoa Ventures"
Intercom,1,2018-03-27,2018,Internet software & services,San Francisco,United States,North America,2011,241,"FirstMark Capital, Tiger Global Management"
AgentSync,1,2021-12-07,2021,Fintech,Denver,United States,North America,2018,111,"Craft Ventures, Caffeinated Capital, Operator Collective"
Checkr,5,2019-09-19,2019,Internet software & services,San Francisco,United States,North America,2014,559,"Y Combinator, Accel, T. Rowe Price"
China Cloud,1,2018-06-11,2018,Hardware,Wuxi,China,Asia,2010,523,"V Star Capital, GF Xinde Investment Management Co., Haitong Leading Capital Management"
Cybereason,3,2019-08-06,2019,Cybersecurity,Boston,United States,North America,2012,714,"SoftBank Group, CRV, Spark Capital"
MEGVII,4,2017-10-31,2017,Artificial intelligence,Beijing,China,Asia,2011,1,"Ant Financial Services Group, Russia-China Investment Fund, Foxconn Technology Company"
CircleCI,2,2021-05-11,2021,Internet software & services,San Francisco,United States,North America,2011,315,"Threshold Ventures, Baseline Ventures, Harrison Metal"
