## LinkedIn Profile Scraper in Python Using an LLM

YouTube Video: https://youtu.be/SQtwhuYJk3M

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

from dotenv import load_dotenv
load_dotenv()

True

In [3]:
driver = webdriver.Chrome()

In [4]:
driver.get('https://www.linkedin.com/login')

In [5]:
driver.title

'LinkedIn Login, Sign in | LinkedIn'

In [6]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Upper bound (seconds) to wait for LinkedIn to leave the login page after submit.
LOGIN_TIMEOUT_S = 30

# Remember where we are so we can detect the post-login redirect below.
login_url = driver.current_url

# Credentials are read from the environment (EMAIL / PASSWORD); a missing
# variable raises KeyError here rather than sending an empty value.
email = driver.find_element(By.ID, 'username')
email.send_keys(os.environ['EMAIL'])

password = driver.find_element(By.ID, 'password')
password.send_keys(os.environ['PASSWORD'])

password.submit()

# Block until the browser has navigated away from the login URL, so that the
# next cell's driver.get() does not race the form submission when the notebook
# is executed with "Run All". Raises TimeoutException if no navigation happens.
WebDriverWait(driver, LOGIN_TIMEOUT_S).until(EC.url_changes(login_url))

In [7]:
## MAKE SURE TO USE ONLY THIS URL TO AVOID BEING STUCK IN ERRORS
## NOTE(review): presumably this refers to keeping the plain
## https://www.linkedin.com/in/<handle> form of the profile URL — confirm.

url = "https://www.linkedin.com/in/laxmimerit"
# Load the profile page in the same browser session that was used for the login.
driver.get(url)

In [8]:
profile_data = {}

In [9]:
driver.title

'(23) Laxmi Kant | LinkedIn'

In [45]:
# Snapshot the DOM currently held by the browser and parse it with lxml.
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'lxml')

# The profile content is looked up as <main class="scaffold-layout__main">.
profile = soup.find('main', {'class': 'scaffold-layout__main'})

# soup.find returns None when nothing matches; fail here with a clear message
# instead of an AttributeError in a later cell.
if profile is None:
    raise RuntimeError(
        "Could not find <main class='scaffold-layout__main'> in the page source; "
        "the login may have failed or the page layout may have changed."
    )

In [46]:
# Collect every <section> carrying the "artdeco-card" class inside the profile
# container, then show how many were found.
sections = profile.find_all('section', class_='artdeco-card')
len(sections)

17

In [47]:
profile_text = profile.get_text().strip() 
# print(profile.get_text())
len(profile_text)

42504

In [69]:
import re

# remove multiple new lines and tabs
def clean_text(text):
    """Normalise newline and tab whitespace in ``text``.

    The substitutions are applied in this order:

    1. runs of newlines collapse to a single newline;
    2. runs of tabs collapse to a single tab;
    3. a tab followed by any whitespace becomes a single space;
    4. a newline followed by any whitespace becomes a single newline.

    Returns the cleaned string.
    """
    substitutions = (
        (r'\n+', '\n'),
        (r'\t+', '\t'),
        (r'\t\s+', ' '),
        (r'\n\s+', '\n'),
    )
    # Order matters: later patterns operate on the output of earlier ones.
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

# profile_text = clean_text(profile_text)



In [79]:
# Text content of each section card, with leading/trailing whitespace stripped.
sections_text = [section.get_text().strip() for section in sections]

In [80]:
# Normalise newlines/tabs in every section via clean_text; this rebinds sections_text.
sections_text = [clean_text(section) for section in sections_text]

In [82]:
def remove_duplicates(text):
    """Halve every line of ``text`` that is the same string written twice.

    The text is split on newlines. A line whose first half equals its second
    half (e.g. ``"AboutAbout"``) is replaced by that half; all other lines,
    including every odd-length line, are kept unchanged. The rule is purely
    textual, so short doubled lines such as ``"2020"`` are halved as well.

    Returns the lines joined back together with newlines.
    """
    def _halve_if_doubled(line):
        mid = len(line) // 2
        head, tail = line[:mid], line[mid:]
        return head if head == tail else line

    return '\n'.join(_halve_if_doubled(line) for line in text.split('\n'))

In [83]:
# Collapse doubled lines (the same text repeated back-to-back) in every section.
sections_text = [remove_duplicates(section) for section in sections_text]


In [91]:
# sections_text

### LLM Call

In [15]:
# The first section is given a fixed label; every other section is keyed by
# the first line of its text.
section_keys = ['Name and Headline'] + [
    text.partition('\n')[0] for text in sections_text[1:]
]

section_keys

['Name and Headline',
 'Highlights',
 'About',
 'Featured',
 'Activity',
 'Experience',
 'Education',
 'Licenses & certifications',
 'Projects',
 'Skills',
 'Recommendations',
 'Patents',
 'Courses',
 'Honors & awards',
 'Languages',
 'Interests',
 'Causes']

In [41]:
from langchain_ollama import ChatOllama

from langchain_core.prompts import (SystemMessagePromptTemplate, 
                                    HumanMessagePromptTemplate,
                                    ChatPromptTemplate)



from langchain_core.output_parsers import StrOutputParser

# Endpoint and model tag handed to ChatOllama below.
base_url = "http://localhost:11434"
# Alternative model tag, currently disabled:
# model = 'llama3.2:3b'
model = 'qwen2.5:7b'

# Chat model instance used by ask_llm.
llm = ChatOllama(base_url=base_url, model=model)


# System message placed first in every ask_llm conversation. The triple-quoted
# literal keeps the line break and the second line's leading spaces verbatim.
system = SystemMessagePromptTemplate.from_template("""You are helpful AI assistant who answer LinkedIn profile parsing related 
                                                    user question based on the provided profile text data.""")

def ask_llm(prompt):
    """Send ``prompt`` to the chat model as a human message and return the reply text.

    The message list is the module-level ``system`` message followed by the
    human message built from ``prompt``; the result of ``llm`` is passed through
    ``StrOutputParser``.

    Note: ``prompt`` is parsed as a prompt template and the chain is invoked
    with an empty variable mapping, so the text must not contain unfilled
    ``{placeholder}`` fields; callers with literal braces need to double them.
    """
    human_message = HumanMessagePromptTemplate.from_template(prompt)
    chat_prompt = ChatPromptTemplate([system, human_message])

    # prompt -> model -> plain string
    chain = chat_prompt | llm | StrOutputParser()
    return chain.invoke({})

In [42]:
# Per-section extraction prompt. The two positional "{}" slots are filled with
# the section text and the section name, in that order.
template = """
Extract and return the requested information from the LinkedIn profile data in a concise, point-by-point format (up to 5 points). Avoid preambles or any additional context.

### LinkedIn Profile Data:
{}

### Information to Extract:
Extract '{}' in bullet points, limiting the output to 5 points. Provide only the necessary details.
Remember, It is LinkedIn profile data.

### Extracted Data:"""

context = sections_text[0]

# ask_llm parses its argument as a prompt template, so any literal brace coming
# from the scraped text must be doubled or it is treated as an unfilled variable.
prompt = template.format(context, section_keys[0]).replace("{", "{{").replace("}", "}}")

response = ask_llm(prompt=prompt)
print(response)

- Name: Laxmi Kant
- Headline: Gen AI in Finance & Investment Services | Data Scientist | IIT Kharagpur | Asset Management | AI-Driven Financial Modeling | Search Ranking | NLP Python BERT AWS Elasticsearch GNN SQL LLM | AI in Investment Strategies


In [43]:
# Ask the model about every section and store each answer under its section name.
responses = {}

for k, context in zip(section_keys, sections_text):
    # Double literal braces from the scraped text: ask_llm parses the prompt as
    # a template and would otherwise treat "{...}" as an unfilled variable.
    prompt = template.format(context, k).replace("{", "{{").replace("}", "}}")
    response = ask_llm(prompt=prompt)
    responses[k] = response

In [44]:
print(responses)

{'Name and Headline': '- Name: Laxmi Kant\n- Headline: Gen AI in Finance & Investment Services | Data Scientist | IIT Kharagpur | Asset Management | AI-Driven Financial Modeling | Search Ranking | NLP Python BERT AWS Elasticsearch GNN SQL LLM | AI in Investment Strategies', 'Highlights': '- Laxmi Kant has viewed your profile.\n- Curious about who else viewed your profile? Unlock for more opportunities.\n- See all profile views.', 'About': '- Demonstrated 8+ years of expertise in advanced analytics as an AVP in Data Science.\n- Showcasing dynamic and impactful contributions in fields like customer behavior modeling, personalized marketing, product discovery & search optimization, and recommendations.\n- Strategically led initiatives resulting in a 30% increase in customer retention and a 10% reduction in advertising spending.\n- Pioneered the development of a Machine Learning product relevance ranking system leading to a 30% increase in conversion rates and 20% growth in revenue.\n- Ach

In [45]:
import json

# Final consolidation prompt; "{context}" is filled with the per-section answers.
template = """You are provided with LinkedIn profile data in JSON format.
            Parse the data according to the specified schema, correct any spelling errors,
            and condense the information if possible.

### LinkedIn Profile JSON Data:
{context}

### Schema You need to follow:
You need to extract
Name:
Headline:
About:
Experience:
Education:
Skills:
Projects:
Summary:

Do not return preambles or any other information.
### Parsed Data:"""

# The prompt promises JSON, so serialise the dict as real JSON rather than
# relying on Python's dict repr (single quotes, escaped newlines).
# ensure_ascii=False keeps non-ASCII profile text readable for the model.
context_json = json.dumps(responses, indent=2, ensure_ascii=False)

# ask_llm parses its argument as a prompt template, so the JSON braces must be
# doubled to be sent literally.
prompt = template.format(context=context_json).replace("{", "{{").replace("}", "}}")
response = ask_llm(prompt=prompt)
response

'Name: Laxmi Kant\n\nHeadline: Gen AI in Finance & Investment Services | Data Scientist | IIT Kharagpur | Asset Management | AI-Driven Financial Modeling | Search Ranking | NLP Python BERT AWS Elasticsearch GNN SQL LLM | AI in Investment Strategies\n\nAbout: Demonstrated 8+ years of expertise in advanced analytics as an AVP in Data Science. Showcasing dynamic and impactful contributions in fields like customer behavior modeling, personalized marketing, product discovery & search optimization, and recommendations. Strategically led initiatives resulting in a 30% increase in customer retention and a 10% reduction in advertising spending. Pioneered the development of a Machine Learning product relevance ranking system leading to a 30% increase in conversion rates and 20% growth in revenue. Achieved significant improvements through innovative techniques like Social Graph Analytics and Predictive Modeling.\n\nExperience: \n- Developed SleepDoc, an Android application using state-of-the-art 

In [46]:
print(response)

Name: Laxmi Kant

Headline: Gen AI in Finance & Investment Services | Data Scientist | IIT Kharagpur | Asset Management | AI-Driven Financial Modeling | Search Ranking | NLP Python BERT AWS Elasticsearch GNN SQL LLM | AI in Investment Strategies

About: Demonstrated 8+ years of expertise in advanced analytics as an AVP in Data Science. Showcasing dynamic and impactful contributions in fields like customer behavior modeling, personalized marketing, product discovery & search optimization, and recommendations. Strategically led initiatives resulting in a 30% increase in customer retention and a 10% reduction in advertising spending. Pioneered the development of a Machine Learning product relevance ranking system leading to a 30% increase in conversion rates and 20% growth in revenue. Achieved significant improvements through innovative techniques like Social Graph Analytics and Predictive Modeling.

Experience: 
- Developed SleepDoc, an Android application using state-of-the-art ML algor