#### Imports

In [2]:
import pandas as pd
import requests
import dotenv
import json

from tqdm import tqdm

In [3]:
# Base endpoint that the search requests further down append a collection to.
url = "https://api.crunchbase.com/api/v4/searches"

# The key is read from the local .env file so it never appears in the notebook.
API_KEY = dotenv.get_key('./.env', "CRUNCHBASE_API_KEY")

# Passed to requests as `params`, i.e. sent in the query string of each call.
params = dict(user_key=API_KEY)

### Bulk export

In [None]:
import tarfile
from tqdm.auto import tqdm
import dotenv
import os

dir_path = "../../data/crunchbase"
data_path = dir_path + '/bulk_export.tar.gz'

# A missing or empty directory means there is no local copy yet. Everything
# below is skipped otherwise, so re-running the notebook does not download and
# unpack the whole export again.
if not os.path.isdir(dir_path) or not os.listdir(dir_path):
    print('Crunchbase data not found. Downloading bulk export...')
    os.makedirs(dir_path, exist_ok=True)

    API_KEY = dotenv.get_key('.env', "CRUNCHBASE_API_KEY")
    # Deliberately not named `url`: the search cell further down builds its
    # requests from the global `url` and must keep seeing the searches endpoint.
    bulk_export_url = f"https://api.crunchbase.com/bulk/v4/bulk_export.tar.gz?user_key={API_KEY}"

    try:
        with requests.get(bulk_export_url, stream=True) as response:
            # Check the status by hand instead of raise_for_status(): that
            # exception message embeds the full URL, API key included.
            if not response.ok:
                raise RuntimeError(
                    f"Bulk export download failed with HTTP {response.status_code}"
                )
            with open(data_path, "wb") as fout, tqdm(
                    unit='B', unit_scale=True, unit_divisor=1024, miniters=1,
                    desc='bulk_export.tar.gz', total=int(response.headers.get('content-length', 0))
            ) as pbar:
                for chunk in response.iter_content(chunk_size=4096):
                    fout.write(chunk)
                    pbar.update(len(chunk))
    except BaseException:
        # Drop a partially written archive so the next run starts from scratch
        # instead of treating the directory as already populated.
        if os.path.exists(data_path):
            os.remove(data_path)
        raise

    # NOTE(review): extractall() trusts the member paths inside the archive;
    # consider an extraction filter if the Python version in use supports one.
    with tarfile.open(data_path, "r:gz") as tar:
        tar.extractall(dir_path)

### API functions

In [71]:
def get_query_for_uuids(properties, uuid_start, size_limit=1000):
    """Assemble the JSON body for one page of a funding-rounds search.

    Parameters
    ----------
    properties : list
        Field ids to request; placed under "field_ids" unchanged.
    uuid_start : str
        Uuid placed under "before_id" as the pagination anchor.
    size_limit : int, optional
        Value placed under "limit" (default 1000).

    Returns
    -------
    dict
        Body with the requested fields, an ascending sort on "uuid" and three
        "greater than zero" predicates (target_money_raised, money_raised,
        num_investors).
    """
    def _greater_than(field, threshold):
        # All three filters share this shape; only the field and the encoding
        # of the threshold differ.
        return {
            "type": "predicate",
            "field_id": field,
            "operator_id": "gt",
            "values": threshold,
        }

    sort_by_uuid = {"field_id": "uuid", "sort": "asc", "nulls": "last"}

    # The money fields compare on a {"value_usd": ...} mapping, the investor
    # count on a plain one-element list.
    filters = [
        _greater_than("target_money_raised", {"value_usd": 0}),
        _greater_than("money_raised", {"value_usd": 0}),
        _greater_than("num_investors", [0]),
    ]

    payload = {"field_ids": properties}
    payload["order"] = [sort_by_uuid]
    payload["query"] = filters
    payload["limit"] = size_limit
    payload["before_id"] = uuid_start
    return payload


def get_categories(funded_organization_categories):
    """Collect the 'value' of every entry in a categories cell.

    A float argument is treated as a missing cell (presumably the NaN pandas
    puts in place of an absent field) and yields None. Anything else is
    iterated and the 'value' entry of each item is returned in order.
    """
    is_missing = isinstance(funded_organization_categories, float)
    if is_missing:
        return None

    names = []
    for entry in funded_organization_categories:
        names.append(entry['value'])
    return names


def get_country(funded_organization_location):
    """Return the 'value' of the first location entry typed as a country.

    A float argument is treated as a missing cell (presumably pandas' NaN) and
    yields None, as does a list without any entry whose 'location_type' is
    'country'.
    """
    if not isinstance(funded_organization_location, float):
        # Lazily scan the entries and stop at the first country.
        countries = (
            place['value']
            for place in funded_organization_location
            if place['location_type'] == 'country'
        )
        return next(countries, None)
    return None


def get_lead_investor(lead_investor_identifiers):
    """Return the 'value' of the first lead investor, or None if there is none.

    Parameters
    ----------
    lead_investor_identifiers : list of dict, float or None
        The lead-investor cell of one funding round. A float is treated as a
        missing cell (presumably pandas' NaN).

    Returns
    -------
    The 'value' entry of the first element, or None when the cell is missing,
    None, or an empty list.
    """
    if lead_investor_identifiers is None or isinstance(lead_investor_identifiers, float):
        return None
    # An empty list would otherwise raise IndexError on [0]; treat it the same
    # as a missing cell.
    if len(lead_investor_identifiers) == 0:
        return None
    return lead_investor_identifiers[0]['value']


def get_df_from_response(json_response):
    """Flatten one funding-rounds search response into a DataFrame.

    Parameters
    ----------
    json_response : dict
        Decoded response body; its "entities" list is normalised into rows.

    Returns
    -------
    pandas.DataFrame
        One row per entity, restricted to the columns listed in
        ``rounds_columns``. Categories become a list of names, the location
        becomes the country name and the lead investors become the first
        investor's name. The 'properties.' prefix and '.value' infix are
        stripped from the labels (e.g. 'properties.money_raised.value_usd'
        becomes 'money_raised_usd').

    Raises
    ------
    KeyError
        If ``json_response`` has no "entities" key.
    """
    rounds_columns = ['uuid', 'properties.announced_on', 'properties.money_raised.value_usd',
                      'properties.num_investors', 'properties.funded_organization_categories',
                      'properties.funded_organization_location', 'properties.investment_type',
                      'properties.lead_investor_identifiers', 'properties.post_money_valuation.value_usd',
                      'properties.funded_organization_identifier.uuid']

    # reindex instead of plain selection: a page in which no entity carries an
    # optional field (or an empty page) still yields every column, filled with
    # NaN, rather than raising KeyError.
    data = pd.json_normalize(json_response["entities"]).reindex(columns=rounds_columns)

    categories_col = 'properties.funded_organization_categories'
    location_col = 'properties.funded_organization_location'
    lead_investor_col = 'properties.lead_investor_identifiers'

    data[categories_col] = data[categories_col].apply(get_categories)
    # The location list is reduced to its country entry; running it through
    # get_categories would keep every location level as a list instead.
    data[location_col] = data[location_col].apply(get_country)
    data[lead_investor_col] = data[lead_investor_col].apply(get_lead_investor)

    data.columns = [label.replace('properties.', '').replace('.value', '') for label in data.columns]
    return data

In [76]:
rounds_uuids = pd.read_csv("../../data/crunchbase/funding_rounds.csv")["uuid"].values
rounds_properties = [
    "funded_organization_identifier",
    "announced_on",
    "target_money_raised",
    "money_raised",
    "num_investors",
    "lead_investor_identifiers",
    "investment_type",
    "funded_organization_location",
    "funded_organization_categories",
    "post_money_valuation"
]

query_limit = 1000

# Pages are collected in a list and concatenated once at the end; growing the
# frame with pd.concat inside the loop re-copies all earlier rows on every page.
rounds_chunks = []

# Every query_limit-th uuid of the bulk export serves as a pagination anchor.
# NOTE(review): whether consecutive pages can overlap depends on how the API
# applies "before_id" together with the filters - confirm, and de-duplicate on
# uuid if they can.
for uuid in tqdm(rounds_uuids[::query_limit]):
    query = get_query_for_uuids(rounds_properties, uuid, query_limit)
    r = requests.post(url + "/funding_rounds", json=query, params=params)
    # Stop on HTTP errors instead of failing later on a body without
    # "entities". The status is checked by hand because the message of
    # raise_for_status() would include the request URL and with it the API key.
    if not r.ok:
        raise RuntimeError(f"Funding rounds search failed with HTTP {r.status_code}")
    rounds_chunks.append(get_df_from_response(r.json()))

# pd.concat rejects an empty list, so fall back to an empty frame.
rounds_data = pd.concat(rounds_chunks) if rounds_chunks else pd.DataFrame()


100%|██████████| 452/452 [29:27<00:00,  3.91s/it] 


In [82]:
# Show the collected funding rounds as this cell's output.
rounds_data

Unnamed: 0,uuid,announced_on,money_raised_usd,num_investors,funded_organization_categories,funded_organization_location,investment_type,lead_investor_identifiers,post_money_valuation_usd,funded_organization_identifier.uuid
0,89da2fef-059b-4ec8-a2dc-e18fad3e9697,2019-08-22,16000000.0,8,"[E-Commerce, Furniture, Home and Garden, Home ...",United States,series_b,Accel,,c7916a39-10cb-485a-b364-7fe414d9868d
1,89da8960-820d-fc16-0048-d277b1c74040,2009-03-10,780001.0,1,"[Analytics, Life Science, Nanotechnology]",United States,series_unknown,,,76f05830-746b-5ccc-5a2d-f39f8840e060
2,89daa031-180b-4e80-9d84-de6f77644f37,2020-01-01,,5,[E-Commerce],United States,seed,,,c6a64d14-0f54-4077-89e5-a07838f36265
3,89daa224-d40b-4696-bfe4-6824e18ff4d8,2018-06-01,,1,"[Food and Beverage, Hospitality, Restaurants]",United States,undisclosed,,,97d4ad30-abd4-47b3-8318-1f6a02b98a89
4,89daa3bc-98ef-4d91-98f4-7d46e5ade50f,2020-09-01,3700000.0,3,"[Artificial Intelligence, Computer Vision, Mac...",Germany,seed,Shasta Ventures,,9fcbc4c7-5a74-43f6-8df1-e0c2917a6657
...,...,...,...,...,...,...,...,...,...,...
995,3c0587fa-46de-421d-8ea7-7c2aed9352c7,2021-12-15,47113512.0,6,"[Artificial Intelligence, Machine Learning, Sa...",China,series_b,Cathay Capital,,c9cf2757-aba4-4a08-be2d-0757e7a079ea
996,3c05ac01-6b66-4d15-b7d9-f6501d6ed341,2021-08-05,3800000.0,6,"[Health Care, Personal Health]",United States,seed,Quiet Capital,,53c909e2-9b8b-424d-a2d4-c8608f851f8e
997,3c060241-856f-4eae-b77a-ab5c47c1608c,2014-09-15,50258.0,1,"[Software, Translation Service]",United States,convertible_note,,,8a29b56a-4a20-db0f-bd83-c85f8e3ffe6f
998,3c063583-bd4f-439f-9c41-2483d4ab1a28,2005-02-01,,1,[Manufacturing],United States,series_unknown,,,7b9b770d-d990-5e41-0a42-db770564f1c2


In [83]:
# Persist the collected rounds; index=False keeps the DataFrame index out of the CSV.
rounds_data.to_csv('../../data/rounds_data.csv', index=False)