# Founder Rank
This notebook implements all ranking workflows. 

In [1]:

import numpy as np
import json
import sys
import pandas as pd
import os
from dotenv import load_dotenv

sys.path.append("..")

from src.config.config import cfg
from src.clients.perplexity_client import PerplexityClient
from src.clients.proxycurl_client import ProxycurlClient
from src.data.profile_transforms import ProfileTransforms
from src.core.ranking import search_founders, rank_profiles, load_model
from src.utils.profile_utils import get_queried_urls


load_dotenv()
np.set_printoptions(precision=2, suppress=True, linewidth=120)

### Search 

We can provide a list of URLs (SEARCH =False) or let the script search for founders (SEARCH = True). 

Currently, need to loop over search in batches of 10... can get this up to 100 per batch by combining search w/o enrich and lookup which should be sufficient for our needs.
Maintaining already-search ids in initial-search.json currently, or whatever list-name can be ref'd.

In [19]:
px = ProxycurlClient()
pc = PerplexityClient()

N = 10

SEARCH = True

linkedin_urls = [
    "https://linkedin.com/in/george-goodfellow/",
    "https://linkedin.com/in/adithyagurunathan/",
    "https://linkedin.com/in/sarangpujari/",
    "https://linkedin.com/in/katelam8/",
    "https://linkedin.com/in/christopher-hur/",
    "https://linkedin.com/in/aliciajsteele/",
    "https://linkedin.com/in/charlesfatunbi/",
    "https://linkedin.com/in/tejal-dahake/",
    "https://linkedin.com/in/rohan-devraj/",
    "https://linkedin.com/in/skareer/",
    "https://linkedin.com/in/imgeorgiev/",
    "https://linkedin.com/in/viresh-pati/",
]

LIST_NAME = 'initial-search'
MODEL_PATH = '../models/founder_rank_with_ranking_loss-2.pkl'

In [26]:
with open(f"../data/proxycurl/{LIST_NAME}.json", "r") as json_file:
    data = json.load(json_file)
ids = ",".join([f['profile']['public_identifier'] for f in data['results']])

if SEARCH:
    data = search_founders(px=px, limit=N, ids=ids)
    
    existing_data = {"results": []}
    try:
        with open(f"../data/proxycurl/{LIST_NAME}.json", "r") as json_file:
            existing_data = json.load(json_file)
        
        existing_data["results"].extend(data["results"])
        data = existing_data
    except FileNotFoundError:
        # File doesn't exist yet, use new data as is
        pass
        
    # Save the combined data
    with open(f"../data/proxycurl/{LIST_NAME}.json", "w") as json_file:
        json.dump(data, json_file)
else:
    data = []
    with open(f"../data/proxycurl/{LIST_NAME}.json", "r") as json_file:
        data = json.load(json_file)
    processed = get_queried_urls(data)
    
    print(f'Found {len(processed)} profiles in specified dir')
    for url in linkedin_urls:
        if url in processed:
            print(f'already processed {url} ... skipping')
            continue
        print(f"Fetching profile: {url}")
        profile = px.fetch_linkedin_profile(url, use_cache="if-present")
        if profile:
            data.append({"profile": profile})



Found 10 profiles


In [27]:
ids = ",".join([f['profile']['public_identifier'] for f in data['results']])
print(len(ids.split(",")))
data

100


{'results': [{'linkedin_profile_url': 'https://www.linkedin.com/in/cantino',
   'profile': {'public_identifier': 'cantino',
    'profile_pic_url': 'https://s3.us-west-000.backblazeb2.com/proxycurl/person/cantino/profile?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=0004d7f56a0400b0000000001%2F20250313%2Fus-west-000%2Fs3%2Faws4_request&X-Amz-Date=20250313T231258Z&X-Amz-Expires=3600&X-Amz-SignedHeaders=host&X-Amz-Signature=c6264df851110661771467079eba434949ddc08feaa58818f05d78dd4d79bbef',
    'background_cover_image_url': None,
    'first_name': 'Andrew',
    'last_name': 'Cantino',
    'full_name': 'Andrew Cantino',
    'follower_count': 1258,
    'occupation': 'Founder / Chief Strategy Officer at Overview Energy',
    'headline': 'Climate ◊ Space ◊ Science ◊ Software',
    'summary': "I am a hands-on technical leader specializing in strategic planning, technical prioritization, and engineering practices. I've founded companies, raised money, and built teams. I've also managed phila

### Transforming Profiles

In [28]:
# %%timeit
T = ProfileTransforms(data)
DF_DIR = f'../data/sample_encodings/'
 
df = T.process_profiles(profiles=data, perplexity_client=pc, output_dir=None,batch_code=LIST_NAME)

import os

csv_path = f'{DF_DIR}{LIST_NAME}-profiles.csv'
file_exists = os.path.isfile(csv_path)

# If file exists, append without headers; otherwise create new file with headers
df.to_csv(csv_path, mode='a', header=not file_exists, index=False)

# df = pd.read_csv(f'{DF_DIR}{LIST_NAME}-profiles.csv', index_col=False)
# df['feature_vector'] = df['feature_vector'].apply(lambda x: np.fromstring(x.strip("[]"), sep=' '))
# T.df = df


Starting profile processing...
Processing 100 profiles
Could not extract JSON for Scott Davis. Full response: {
Creating feature matrix...


### Ranking

In [30]:

ranked_results = rank_profiles(df, T.get_feature_matrix(), model_dict=load_model(model_path=MODEL_PATH))
display(ranked_results[['Name','Linkedin','UNDERGRAD','GRADUATE','COMPANY','SENIORITY','EXPERTISE','EXIT','FOUNDER','STARTUP','score']].sort_values(by="score", ascending=False))

ranked_results[['Name','Linkedin','Current Title','Current Company', 'score', 'UNDERGRAD','GRADUATE','COMPANY','SENIORITY','EXPERTISE','EXIT','FOUNDER','STARTUP']].to_csv(f'../out/{LIST_NAME}.csv', index=False)

Unnamed: 0,Name,Linkedin,UNDERGRAD,GRADUATE,COMPANY,SENIORITY,EXPERTISE,EXIT,FOUNDER,STARTUP,score
0,Greg Alexander,https://www.linkedin.com/in/gpalexander,1,1,1,2,1,3,3,2,1.000000
1,Darren Rush,https://www.linkedin.com/in/darrenrush,1,0,1,3,3,2,3,3,0.986704
2,Neha Narkhede,https://www.linkedin.com/in/nehanarkhede,2,1,1,3,3,3,3,3,0.982399
3,Charlie Cichetti,https://www.linkedin.com/in/charliecichetti,2,0,1,3,3,2,3,3,0.894602
4,Scott Davis,https://www.linkedin.com/in/ksd415,2,3,1,3,3,0,1,1,0.837283
...,...,...,...,...,...,...,...,...,...,...,...
95,"Tim Campbell, P.E.",https://www.linkedin.com/in/tim-campbell-p-e-7...,1,0,1,3,3,0,1,1,0.209244
96,Brittany Ellison,https://www.linkedin.com/in/brittanyaellison,1,1,1,2,1,0,1,1,0.174245
97,Nathan Taylor,https://www.linkedin.com/in/tnathantaylor,1,0,1,3,2,0,1,1,0.165603
98,"Anthony Zeruto, CISM",https://www.linkedin.com/in/anthony-zeruto-cis...,1,0,1,2,1,0,1,1,0.156519
