In [None]:
import requests
import pandas as pd
import re
from fuzzywuzzy import fuzz
import string
from tqdm import tqdm
from IPython.display import clear_output

from decouple import config

tqdm.pandas()

Phase 1
1. Pass extracted list of names into autocomplete function
2. From API return Crunchbase name, country, link, permalink, UUID
3. Fuzzy match
    a. Do ratio match. If >90, directly return, else go to b
    b. Do token_set_ratio
    c. If the name has more than one word, remove the last word and token_set_ratio again. Return if >90
4. Manual review

Phase 2
- Using matched companies, pass UUID/permalink into entity lookup function
- Return highest funding round, founders identifiers

In [None]:
# Environment Variables
# CB_API_KEY is loaded through decouple's config() so the key is never hard-coded in the notebook.
# It is sent as the "X-cb-user-key" header on every Crunchbase request below.
CB_API_KEY = config("CB_API_KEY")
# Root URL that every endpoint path ("/autocompletes", "/entities/organizations/<uuid>") is appended to.
BASE = "https://api.crunchbase.com/api/v4"

In [None]:
# Crunchbase API test cell
# Fetches a single organization entity (fields, jobs and founders cards) and keeps the
# decoded JSON in `result` for inspection.
uuid = "f57b9762-20eb-4098-947f-dfa8c4a0db82"
action = "/entities/organizations/" + uuid
url = f"{BASE}{action}"
result = requests.get(
    url,
    params={"card_ids": "fields,jobs,founders"},
    headers={"X-cb-user-key": CB_API_KEY},
).json()

In [None]:
# Change the below filepath
# Give the path WITHOUT the ".csv" extension: the input is read from f"{filepath}.csv" and the
# outputs are written next to it as f"{filepath}_found.*" and f"{filepath}_missing.*".
filepath = "test"

In [None]:
# Load the startups to look up. Later cells read the "name", "country" and "summary" columns.
data = pd.read_csv(f"{filepath}.csv")

In [None]:
# Get a mapping of Alpha-2 country code to country name since Crunchbase uses Alpha-2 code
# e.g. {"DE": "Germany", "SG": "Singapore"}
#
# keep_default_na=False matters here: with pandas' default NA parsing the Alpha-2 code "NA"
# (Namibia) is read as a missing value, so it would never appear as a key in the mapping.

country_table = pd.read_html("https://www.iban.com/country-codes", keep_default_na=False)[0]
# Only the first two columns (country name, Alpha-2 code) are needed.
country_table = country_table.iloc[:, :2]
country_table.columns = ['country', 'code']
country_map = country_table.set_index("code")["country"].to_dict()

In [None]:
def _drop_last_word(name):
    """Strip punctuation from `name` and drop its last word; return None for single-word names."""
    words = name.translate(str.maketrans("", "", string.punctuation)).split()
    if len(words) > 1:
        return " ".join(words[:-1])
    return None


def phase1(rowdata):
    """
    Fuzzy-match one input row against the top Crunchbase autocomplete results.

    The row's "name" is sent to the `/autocompletes` endpoint (organizations only, at most
    10 results). Each result is then compared with the target name:

    1. `token_set_ratio` against the name as given; a score above 90 is a match.
    2. Otherwise, if the name has more than one word, punctuation is stripped, the last
       word is dropped and `token_set_ratio` is tried again; a score above 95 is a match.

    If nothing matches, each candidate's headquarters country is looked up and compared
    with the row's "country".

    Parameters
    ----------
    rowdata : pandas.Series or dict-like
        One row of the input data; "name" and "country" are read from it.

    Returns
    -------
    tuple
        (startup_name, startup_uuid, startup_description) for the first confident match.
    list of tuple
        When there is no confident match: the candidates located in the same country as
        the target startup, in the same tuple format. Candidates whose country cannot be
        determined (no headquarters card, no country code, or a code missing from
        `country_map`) are kept so they can still be reviewed manually.
    None
        When the autocomplete returned nothing usable.
    """
    name = rowdata["name"]
    headers = {"X-cb-user-key": CB_API_KEY}
    results = requests.get(
        BASE + "/autocompletes",
        headers=headers,
        params={"collection_ids": "organizations", "query": f"{name}", "limit": 10},
    ).json()
    # A missing or null description becomes "" so downstream string formatting never sees None.
    candidates = [
        (e['identifier']['value'], e['identifier']['uuid'], e.get('short_description') or "")
        for e in results['entities']
    ]
    if not candidates:
        return None

    # Computed once, and kept separate from `name`, so every candidate is scored against
    # the same two strings regardless of its position in the result list.
    shortened_name = _drop_last_word(name)

    for candidate in candidates:
        if fuzz.token_set_ratio(candidate[0], name) > 90:
            return candidate
        if shortened_name is not None and fuzz.token_set_ratio(candidate[0], shortened_name) > 95:
            return candidate

    candidates_2 = []

    for candidate in candidates:
        org_url = f"{BASE}/entities/organizations/{candidate[1]}"
        addresses = requests.get(
            org_url,
            headers=headers,
            params={"card_ids": ["headquarters_address"]},
        ).json()['cards']['headquarters_address']
        country_code = addresses[0].get('country_code') if addresses else None
        # .get() instead of [] so an unmapped code does not abort the whole run.
        country_name = country_map.get(country_code)
        if country_name is None or country_name == rowdata["country"]:
            candidates_2.append(candidate)

    if candidates_2:
        return candidates_2
    return None

# Run phase1 on every row (row-wise, hence axis=1) with a tqdm progress bar.
# Each row makes at least one Crunchbase request, so this cell can take a while.
data['basic_info'] = data.progress_apply(phase1,axis=1)

In [None]:
# Summary of phase1 outcomes by return type:
#   tuple    -> a confident match was found
#   list     -> several possible candidates, needs the manual check below
#   NoneType -> nothing found
data['basic_info'].apply(type).value_counts()

### Manual Check

User input is required for the next cell

In [None]:
# This cell allows the user to perform a manual check of the unclear startups
# Running this cell will print the target startup's details along with the details of possible startups
# The user inputs either the index number of the correct startup or blank string to skip (if there are no correct startups)
# Invalid input (not a number, or a number that is not listed) is rejected and asked again,
# so a typo does not abort the review halfway through.

df = data.copy()

# Rows where phase1 returned a list of candidates instead of a single match
unclear = df[df['basic_info'].apply(lambda x: isinstance(x,list))]

counter = 0
for r,row in tqdm(unclear.iterrows(),total=len(unclear)):
    counter += 1
    shortlist = row['basic_info'][:10]

    # Queue the clearing of the previous record; with wait=True it only happens once the
    # first line of this record is printed, so the screen never flashes empty.
    clear_output(wait=True)
    print(f"{counter}/{len(unclear)} records")
    print(row['name'])
    print(row['summary'])
    print()
    for n,candidate in enumerate(shortlist):
        # f-string rather than "+" so a missing (None) description cannot raise TypeError
        print(f"{n}: {candidate[0]}:\t{candidate[2]}")

    while True:
        correct = input("Which one is it?").strip()
        if correct=="":
            chosen = None
            break
        try:
            position = int(correct)
        except ValueError:
            position = -1
        # Only the indices that were actually printed are accepted (no negative indexing).
        if 0 <= position < len(shortlist):
            chosen = shortlist[position]
            break
        print(f"Please enter a number from 0 to {len(shortlist)-1}, or leave blank to skip")

    df.at[r,'basic_info'] = chosen

In [None]:
def phase2(rowdata):
    """
    Fetch website, last funding type and founder details for one matched startup.

    The organization UUID is taken from the second element of the row's "basic_info"
    and used to request the "fields", "founders" and "jobs" cards from Crunchbase.

    Returns None when the last funding type matches series B or later, or contains
    "ipo". Otherwise returns a dict with the keys:

    - "website": the organization's website_url, or None if absent
    - "funding": the last funding type, or "UNKNOWN" if the field is absent
    - "founders": None, or a list of
      (founder_name, founder_uuid, linkedin, title, started_year) tuples, where title and
      started_year come from the founder's entry in the jobs card when there is one
    """
    org_uuid = rowdata["basic_info"][1]
    endpoint = f"{BASE}/entities/organizations/{org_uuid}"
    cards = requests.get(
        endpoint,
        headers={"X-cb-user-key": CB_API_KEY},
        params={"card_ids": "fields,founders,jobs"},
    ).json()["cards"]

    info = {"website": None, "funding": None, "founders": None}
    fields = cards["fields"]
    if "website_url" in fields:
        info["website"] = fields["website_url"]

    # Funding eligibility: bail out for series B onwards and anything IPO-related
    if "last_funding_type" not in fields:
        info["funding"] = "UNKNOWN"
    else:
        funding_type = fields["last_funding_type"]
        if re.match(r"(series_[b-z])|(.*ipo.*)", funding_type):
            return None
        info["funding"] = funding_type

    # Index the jobs card by person uuid -> (title, year the job started)
    job_by_person = {}
    for job in cards['jobs']:
        job_title = job.get('title')
        start_year = job['started_on']['value'][:4] if 'started_on' in job else None
        job_by_person[job["person_identifier"]["uuid"]] = (job_title, start_year)

    # Founder details, enriched with the job title / start year when the founder has a job entry
    if cards.get("founders"):
        info["founders"] = [
            (
                person["identifier"]["value"],
                person["identifier"]["uuid"],
                person["linkedin"]["value"] if "linkedin" in person else None,
                *job_by_person.get(person["identifier"]["uuid"], (None, None)),
            )
            for person in cards["founders"]
        ]
    return info

In [None]:
# Keep only the startups that have a Crunchbase match.
# .copy() makes `found` an independent frame, so adding the column below is an unambiguous
# write and does not raise pandas' SettingWithCopyWarning.
found = df.dropna(subset=["basic_info"]).copy()
# One Crunchbase entity lookup per row; phase2 returns a dict, or None if not funding-eligible.
found['extra_info'] = found.progress_apply(phase2,axis=1)

In [None]:
# Spread the phase2 dict into one column per key; rows where phase2 returned None keep None.
for field in ('website', 'funding', 'founders'):
    found[field] = found['extra_info'].apply(lambda x, key=field: x[key] if x else x)
found = found.drop(columns='extra_info')

In [None]:
# Run this cell if you want to easily load the data into the Linkedin scraper
# Writes two pickles: the matched startups, and the rows that ended up without a match.

found.to_pickle(f"{filepath}_found.pkl")
df.loc[df['basic_info'].isna()].to_pickle(f"{filepath}_missing.pkl")

In [None]:
# Run this cell if you want to view the output in Excel
found_formatted = found.copy()
# Flatten each founders list into a single text cell: one founder per "\r"-separated entry,
# with that founder's non-empty details joined by ", ".
found_formatted["founders"] = found_formatted["founders"].apply(lambda x: "\r".join([", ".join([e for e in p if e]) for p in x]) if x else x)

# Write the formatted copy (not `found`), otherwise the flattening above has no effect.
found_formatted.to_excel(f"{filepath}_found.xlsx",index=False)
# Rows without a Crunchbase match are identified by a null 'basic_info'.
df[df['basic_info'].isnull()].to_excel(f"{filepath}_missing.xlsx",index=False)