In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
from bs4 import BeautifulSoup


In [5]:
# scrape linkedin profile and extract education, experience, skills, etc. in a structured format

def scrape_linkedin_profile(linkedin_url):
    response = requests.get(linkedin_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    print(soup.prettify())

linkedin_url = 'https://www.linkedin.com/in/yanis-miraoui-54377a1b8/'
scrape_linkedin_profile(linkedin_url)

<html>
 <head>
  <script type="text/javascript">
   window.onload = function() {
  // Parse the tracking code from cookies.
  var trk = "bf";
  var trkInfo = "bf";
  var cookies = document.cookie.split("; ");
  for (var i = 0; i < cookies.length; ++i) {
    if ((cookies[i].indexOf("trkCode=") == 0) && (cookies[i].length > 8)) {
      trk = cookies[i].substring(8);
    }
    else if ((cookies[i].indexOf("trkInfo=") == 0) && (cookies[i].length > 8)) {
      trkInfo = cookies[i].substring(8);
    }
  }

  if (window.location.protocol == "http:") {
    // If "sl" cookie is set, redirect to https.
    for (var i = 0; i < cookies.length; ++i) {
      if ((cookies[i].indexOf("sl=") == 0) && (cookies[i].length > 3)) {
        window.location.href = "https:" + window.location.href.substring(window.location.protocol.length);
        return;
      }
    }
  }

  // Get the new domain. For international domains such as
  // fr.linkedin.com, we convert it to www.linkedin.com
  // treat .cn similar 

In [31]:
# use pymupdf to read cv and extract text in a structured format
import pymupdf

def extract_text_from_cv(cv_path):
    pdf = pymupdf.open(cv_path)
    text = ''
    for page in pdf.pages():
        text += page.get_text()
    return text

extract_text_from_cv('./cv_examples/CV Yanis MIRAOUI.pdf')


# use llm to structure the text extracted from the cv (using together ai)
from together import Together
import os
import toml

secrets_path = 'secrets.toml'
secrets = toml.load(secrets_path)
os.environ['TOGETHER_API_KEY'] = secrets['TOGETHER_API_KEY']

client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))

def structured_cv_text(text):
    prompt = """
        Extract the following information from the text: 
        {text}

        
        The output should be a structured json with the following fields:
        - education: list of tuples (degree, school, location, start date, end date)
        - experience: list of tuples (company, title, location, start date, end date)
        - skills: list of strings
        - projects: list of tuples (name, description, start date, end date)
        - languages: list of strings
        If you cannot find a field, leave it empty. Make sure that the information is correct, present in the text and not hallucinated.
        """       
    response = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
        messages=[{"role": "user", "content": prompt.format(text=text)}],
        stream=False
    )
    return response.choices[0].message.content

In [32]:
extract_text_from_cv('./cv_examples/CV Yanis MIRAOUI.pdf')

'Yanis MIRAOUI \nStanford, USA | +1 (650) 505 3509 | ymiraoui@stanford.edu | https://yanis.miraoui.com/ \nEDUCATION \nStanford University \nStanford, USA \nMS Statistics & Data Science \nSeptember 2023 - June 2025 \nImperial College London & ETH Zürich \nLondon, UK – Zürich, Switzerland \nMSci Mathematics, Grade: First-Class Honors \nOctober 2019 - June 2023 \nLANGUAGES\nFrench: Native  \n \nMoroccan: Fluent \nEnglish: Fluent \nGerman: Professional proficiency  \nSpanish: Fluent \nArabic: Intermediate\nPROFESSIONAL EXPERIENCES \nOracle \nRedwood City, USA \nResearch Data Scientist Intern \nSeptember 2024 - Present \n• \nDesigned, implemented, and evaluated fully integrated LLM Agents at Oracle Labs, improving workflow automation for data \nscientists \n• \nCollaborated with cross-functional teams to identify optimization opportunities, leveraging LLMs to improve agent accuracy \nStanford Computational Neuroscience Laboratory (CNS Lab) \n \nStanford, USA \nResearch Assistant \n \nSeptem

In [33]:
print(structured_cv_text(extract_text_from_cv('./cv_examples/CV Yanis MIRAOUI.pdf')))

id='8daecc75ff0715e9-SJC' object=<ObjectType.ChatCompletion: 'chat.completion'> created=1730326889 model='meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo' choices=[ChatCompletionChoicesData(index=0, logprobs=None, seed=14713538458809217000, finish_reason=<FinishReason.EOS: 'eos'>, message=ChatCompletionMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='```json\n{\n    "education": [\n        ("MS Statistics & Data Science", "Stanford University", "Stanford, USA", "September 2023", "June 2025"),\n        ("MSci Mathematics", "Imperial College London & ETH Zürich", "London, UK – Zürich, Switzerland", "October 2019", "June 2023")\n    ],\n    "experience": [\n        ("Oracle", "Research Data Scientist Intern", "Redwood City, USA", "September 2024", "Present"),\n        ("Stanford Computational Neuroscience Laboratory (CNS Lab)", "Research Assistant", "Stanford, USA", "September 2023", "Present"),\n        ("Snowflake", "Applied Research Data Scientist Intern", "San Mateo, USA", "Ju