In [1]:
#!/usr/bin/python3
import requests, json, time, csv

# TODO: Add your api key below
API_KEY = "c67af6f09a40fd32d5f409439e05fef62ea8c75490d99fc5223eb3d5850f367b"

# TODO: Set number of records to pull (-1 for all available records)
MAX_NUM_RECORDS = 100

# NO CHANGES NEEDED BELOW HERE
PDL_URL = "https://api.peopledatalabs.com/v5/person/search"
request_header = {
    "Content-Type": "application/json",
    "X-api-key": API_KEY
}

SQL_QUERY = "SELECT * FROM person WHERE (job_title = 'software engineer' and location_region = 'new york')"

num_records_to_request = 10
params = {
    "dataset": "all",
    "sql": SQL_QUERY,
    "size": num_records_to_request,
    "pretty": True
}

# Pull all results in multiple batches
batch = 1
all_records = []
start_time = time.time()
while batch == 1 or params["scroll_token"]:
    if MAX_NUM_RECORDS != -1:
        # Update num_records_to_request
        # Compute the number of records left to pull
        num_records_to_request = MAX_NUM_RECORDS - len(all_records)
        # Clamp this number between 0 and 100
        num_records_to_request = max(0, min(num_records_to_request, 100))

    if num_records_to_request == 0:
        break

    params["size"] = num_records_to_request
    response = requests.get(PDL_URL, headers=request_header, params=params).json()

    if batch == 1:
        print(f"{response['total']} available records in this search")

    all_records.extend(response.get("data", []))
    params["scroll_token"] = response.get("scroll_token")
    print(f"Retrieved {len(response.get('data', []))} records in batch {batch}")
    batch += 1

    if params["scroll_token"]:
        time.sleep(6)   # avoid hitting rate limit thresholds


end_time = time.time()
runtime = end_time - start_time

print(f"Successfully recovered {len(all_records)} profiles in "
      f"{batch} batches [{runtime} seconds]")


def save_profiles_to_csv(profiles, filename, fields=[], delim=","):
    """Save profiles to csv (utility function)"""

    # Define header fields
    if fields == [] and len(profiles) > 0:
        fields = profiles[0].keys()

    with open(filename, "w") as csvfile:
        # Write csv file
        writer = csv.writer(csvfile, delimiter=delim)

        # Write Header:
        writer.writerow(fields)

        count = 0
        for profile in profiles:
            # Write Body:
            writer.writerow([profile[field] for field in fields])
            count += 1
            print(f"Wrote {count} lines to: '{filename}'")

34906 available records in this search
Retrieved 100 records in batch 1
Successfully recovered 100 profiles in 2 batches [8.543031930923462 seconds]


In [12]:
import random

parsed_records = []
for record in all_records:
  parsed_records.append({
      'name': record['full_name'],
      'inferred_years_experience': record['inferred_years_experience'],
      'companies_worked_at': [e['company']['name'] for e in record['experience']],
      'skills': record['skills'],
      'inferred_salary': str(random.randint(100, 300)) + 'k',
      'recommended_personal_email': record['recommended_personal_email']
      })

In [21]:
import json

prisma_data = []
for record in parsed_records:
    prisma_record = {
        "name": record['name'],
        "inferred_years_experience": str(record['inferred_years_experience']),  # Convert to string
        "companies_worked_at": json.dumps(record['companies_worked_at']),  # Convert to JSON string
        "skills": json.dumps(record['skills']),  # Convert to JSON string
        "inferred_salary": record['inferred_salary'],
        "recommended_personal_email": 'n/a' if record['recommended_personal_email'] is None else record['recommended_personal_email'],
    }
    prisma_data.append(prisma_record)

with open('prisma_seed.json', 'w') as f:
    json.dump(prisma_data, f, indent=4)

In [23]:
all_companies = []
for record in parsed_records:
  for company in record['companies_worked_at']:
    if company not in all_companies:
      all_companies.append(company)