#### Imports

In [6]:
import pandas as pd
import requests
import dotenv
import json

from tqdm import tqdm

In [7]:
# Read the Crunchbase API key from the project-level .env file.
# NOTE(review): the value is not validated here — confirm what dotenv.get_key
# yields when the key is missing.
API_KEY = dotenv.get_key('../../.env', "CRUNCHBASE_API_KEY")
# Query-string parameters carrying the key under the name `user_key`.
params = {"user_key" : API_KEY}
# Base URL of the Crunchbase v4 search API.
url = "https://api.crunchbase.com/api/v4/searches"

### Bulk export

In [8]:
import tarfile
from tqdm.auto import tqdm
import dotenv
import os

dir_path = "../../data/crunchbase"
data_path = dir_path + '/bulk_export.tar.gz'

# exist_ok keeps the cell re-runnable whether or not the directory is already there.
os.makedirs(dir_path, exist_ok=True)

# Only fetch the (large) archive when it is not already on disk, so that
# Restart & Run All does not re-download it every time. Delete the archive
# to force a refresh.
if not os.path.isfile(data_path):
    print('Crunchbase data not found. Downloading bulk export...')

    # Use a dedicated name here: `url` was assigned the search API base URL
    # earlier in the notebook, and reusing it would overwrite that value.
    bulk_export_url = f"https://api.crunchbase.com/bulk/v4/bulk_export.tar.gz?user_key={API_KEY}"

    # Stream into a temporary file and rename it once complete, so an
    # interrupted download never leaves a truncated archive at data_path.
    partial_path = data_path + '.part'

    with requests.get(bulk_export_url, stream=True) as response:
        # Fail on 4xx/5xx instead of saving the error body as the tarball.
        response.raise_for_status()
        with open(partial_path, "wb") as fout:
            with tqdm(
                    unit='B', unit_scale=True, unit_divisor=1024, miniters=1,
                    desc='bulk_export.tar.gz', total=int(response.headers.get('content-length', 0))
            ) as pbar:
                for chunk in response.iter_content(chunk_size=4096):
                    fout.write(chunk)
                    pbar.update(len(chunk))

    os.replace(partial_path, data_path)

# The context manager closes the archive even if extraction fails.
# NOTE(review): extractall writes whatever member paths the archive contains;
# consider passing an extraction filter on Python versions that support it.
with tarfile.open(data_path) as tar:
    tar.extractall(dir_path)

bulk_export.tar.gz: 100%|██████████| 1.50G/1.50G [02:15<00:00, 11.9MB/s]  


### API functions

In [None]:
def get_query_for_uuids(properties, uuid_start, size_limit=1000):
    """Build the JSON body of a search query.

    The query requests the fields in ``properties``, sorts results by ``uuid``
    ascending (nulls last) and keeps only entities whose
    ``target_money_raised`` and ``money_raised`` are greater than 0 USD and
    whose ``num_investors`` is greater than 0.

    Args:
        properties: Field ids to return for each entity.
        uuid_start: Value placed in the query's ``before_id`` key.
        size_limit: Value placed in the query's ``limit`` key.

    Returns:
        dict: The query body, ready to be sent as JSON.
    """
    def greater_than(field_id, values):
        # Every filter in this query is a "gt" predicate; only the field and
        # the shape of the comparison value differ.
        return {
            "type": "predicate",
            "field_id": field_id,
            "operator_id": "gt",
            "values": values,
        }

    sort_by_uuid = {"field_id": "uuid", "sort": "asc", "nulls": "last"}
    filters = [
        greater_than("target_money_raised", {"value_usd": 0}),
        greater_than("money_raised", {"value_usd": 0}),
        greater_than("num_investors", [0]),
    ]

    return dict(
        field_ids=properties,
        order=[sort_by_uuid],
        query=filters,
        limit=size_limit,
        before_id=uuid_start,
    )


def get_categories(funded_organization_categories):
    """Collect the ``'value'`` of every entry in a list of category dicts.

    Returns ``None`` when the input is a float (how a missing cell, NaN,
    shows up in a pandas column); otherwise a list of the ``'value'`` items
    in input order.
    """
    is_missing = isinstance(funded_organization_categories, float)
    if is_missing:
        return None

    values = []
    for entry in funded_organization_categories:
        values.append(entry['value'])
    return values


def get_country(funded_organization_location):
    """Return the ``'value'`` of the first entry whose ``'location_type'`` is ``'country'``.

    Returns ``None`` when the input is a float (a missing/NaN cell) or when
    no entry has location type ``'country'``.
    """
    if not isinstance(funded_organization_location, float):
        # Lazily scan the entries and stop at the first country match.
        return next(
            (
                place['value']
                for place in funded_organization_location
                if place['location_type'] == 'country'
            ),
            None,
        )
    return None


def get_lead_investor(lead_investor_identifiers):
    """Return the ``'value'`` of the first lead-investor entry.

    Args:
        lead_investor_identifiers: A list of dicts each carrying a ``'value'``
            key, or a float (NaN) when the cell is missing.

    Returns:
        The first entry's ``'value'``, or ``None`` when the input is a float,
        ``None``, or an empty list.
    """
    # A float means a missing (NaN) cell; an empty list has no first entry to
    # read and would otherwise raise IndexError.
    if isinstance(lead_investor_identifiers, float) or not lead_investor_identifiers:
        return None
    return lead_investor_identifiers[0]['value']


def get_df_from_response(json_response):
    """Flatten a search response into a DataFrame of funding-round columns.

    Args:
        json_response: Parsed response body; its ``"entities"`` list is
            normalized with ``pd.json_normalize``.

    Returns:
        pandas.DataFrame: One row per entity with the columns listed in
        ``rounds_columns``, renamed by stripping the ``properties.`` prefix
        and the ``.value`` fragment (e.g. ``money_raised.value_usd`` becomes
        ``money_raised_usd``). Categories are reduced to a list of values,
        location to the country value, and lead investors to the first
        investor's value.
    """
    rounds_columns = ['uuid', 'properties.announced_on', 'properties.money_raised.value_usd',
                      'properties.num_investors', 'properties.funded_organization_categories',
                      'properties.funded_organization_location', 'properties.investment_type',
                      'properties.lead_investor_identifiers', 'properties.post_money_valuation.value_usd',
                      'properties.funded_organization_identifier.uuid']

    data = pd.json_normalize(json_response["entities"])
    # json_normalize only creates columns for keys that occur in at least one
    # entity, so a page where a field is absent everywhere (or an empty page)
    # would raise KeyError below. reindex adds such columns filled with NaN,
    # which the helper functions already map to None.
    data = data.reindex(columns=rounds_columns)

    data['properties.funded_organization_categories'] = data['properties.funded_organization_categories'].apply(get_categories)
    # The location column is reduced with get_country; it was previously
    # passed through get_categories, leaving get_country unused.
    data['properties.funded_organization_location'] = data['properties.funded_organization_location'].apply(get_country)
    data['properties.lead_investor_identifiers'] = data['properties.lead_investor_identifiers'].apply(get_lead_investor)

    data.columns = [label.replace('properties.', '').replace('.value', '') for label in data.columns]
    return data

In [None]:
# uuids of the funding rounds listed in the extracted bulk export.
rounds_uuids = pd.read_csv("../../data/crunchbase/funding_rounds.csv")["uuid"].values
# Fields requested from the search endpoint for every funding round.
rounds_properties = [
    "funded_organization_identifier",
    "announced_on",
    "target_money_raised",
    "money_raised",
    "num_investors",
    "lead_investor_identifiers",
    "investment_type",
    "funded_organization_location",
    "funded_organization_categories",
    "post_money_valuation"
]

query_limit = 1000
# Spell the endpoint out instead of deriving it from the notebook-level `url`,
# which the bulk export cell also assigns to.
funding_rounds_url = "https://api.crunchbase.com/api/v4/searches/funding_rounds"

# Collect the pages in a list and concatenate once at the end; concatenating
# inside the loop re-copies all previously fetched rows on every iteration.
rounds_chunks = []
for uuid in tqdm(rounds_uuids[::query_limit]):
    query = get_query_for_uuids(rounds_properties, uuid, query_limit)
    r = requests.post(funding_rounds_url, json=query, params=params)
    # Stop on HTTP errors (bad key, rate limit, ...) rather than failing later
    # on a response body that has no "entities" key.
    r.raise_for_status()
    rounds_chunks.append(get_df_from_response(r.json()))

# pd.concat raises on an empty list, so fall back to an empty frame.
rounds_data = pd.concat(rounds_chunks, ignore_index=True) if rounds_chunks else pd.DataFrame()


In [None]:
# Display the collected funding-round rows as the cell output.
rounds_data

In [None]:
# Write the collected rows to CSV; index=False leaves the DataFrame index out of the file.
rounds_data.to_csv('../../data/rounds_data.csv', index=False)