Scraper Logic

In [1]:
import os
import time
import random
from typing import List, Dict
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import pandas as pd

# Optional dependency: record whether undetected_chromedriver can be imported.
# NOTE(review): neither `uc` nor `_HAS_UC` is referenced in the visible code.
try:
    import undetected_chromedriver as uc
    _HAS_UC = True
except Exception:
    _HAS_UC = False

# --- Scraper configuration -------------------------------------------------
INPUT_CSV = "linkedin_profiles.csv"   # Input CSV file with 'url' column
# NOTE(review): SAVE_RAW_HTML is not read anywhere in the visible code.
SAVE_RAW_HTML = False
# Passed to build_driver(); False keeps the browser window visible.
HEADLESS = False
# Bounds, in seconds, of the random pause taken after each profile.
MIN_DELAY = 3
MAX_DELAY = 6
# Seconds WebDriverWait allows for the login page elements to appear.
LOGIN_TIMEOUT = 30
# NOTE(review): PAGE_TIMEOUT is not read anywhere in the visible code.
PAGE_TIMEOUT = 30

# One entry is picked at random per run and passed to the browser.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:117.0) Gecko/20100101 Firefox/117.0",
]
# Proxy server addresses; when empty, no proxy is configured.
PROXIES = []


def get_credentials() -> Dict[str, str]:
    """Read the LinkedIn login credentials from the environment.

    The credentials are taken from the ``LI_USER`` and ``LI_PASS``
    environment variables and are never stored in source code, so the
    caller's environment is respected and secrets stay out of the file.

    Returns:
        A dict with the keys ``"username"`` and ``"password"``.

    Raises:
        RuntimeError: If either variable is unset or empty.
    """
    user = os.getenv("LI_USER")
    pwd = os.getenv("LI_PASS")
    if not user or not pwd:
        raise RuntimeError("Set LI_USER and LI_PASS environment variables.")
    return {"username": user, "password": pwd}


def build_driver(headless=True, user_agent=None, proxy=None, chromedriver_path=None):
    """Create a Chrome WebDriver configured for scraping.

    Args:
        headless: When true, Chrome is started with ``--headless=new``.
        user_agent: Optional User-Agent string handed to the browser.
        proxy: Optional proxy address handed to ``--proxy-server``.
        chromedriver_path: Optional path to the chromedriver executable.
            When omitted, the ``CHROMEDRIVER_PATH`` environment variable is
            used, falling back to ``C:\\ChromeDriver\\chromedriver.exe``.

    Returns:
        The started ``webdriver.Chrome`` instance.
    """
    if chromedriver_path is None:
        # Keep the historical location as the last-resort default so existing
        # setups keep working, but let other machines override it.
        chromedriver_path = os.getenv(
            "CHROMEDRIVER_PATH", "C:\\ChromeDriver\\chromedriver.exe"
        )

    options = Options()
    for flag in (
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--disable-blink-features=AutomationControlled",
        "--window-size=1920,1080",
    ):
        options.add_argument(flag)
    if user_agent:
        options.add_argument(f"user-agent={user_agent}")
    if proxy:
        options.add_argument(f"--proxy-server={proxy}")
    if headless:
        options.add_argument("--headless=new")

    service = Service(executable_path=chromedriver_path)
    return webdriver.Chrome(service=service, options=options)


def linkedin_login(driver: WebDriver, username: str, password: str) -> None:
    """Submit the LinkedIn login form with the given credentials.

    Opens the login page, fills the ``username`` and ``password`` fields and
    presses RETURN, then waits up to ``LOGIN_TIMEOUT`` seconds for the
    ``global-nav-search`` element to appear.

    The post-login wait is best-effort: if the element does not show up, a
    warning is printed and the function still returns instead of raising.

    Args:
        driver: WebDriver used to perform the login.
        username: Account e-mail / user name.
        password: Account password.
    """
    driver.get("https://www.linkedin.com/login")
    wait = WebDriverWait(driver, LOGIN_TIMEOUT)

    email_in = wait.until(EC.presence_of_element_located((By.ID, "username")))
    email_in.clear()
    email_in.send_keys(username)

    pwd_in = driver.find_element(By.ID, "password")
    pwd_in.clear()
    pwd_in.send_keys(password)
    pwd_in.send_keys(Keys.RETURN)

    try:
        wait.until(EC.presence_of_element_located((By.ID, "global-nav-search")))
    except Exception as exc:
        # Do not abort the run, but make the unconfirmed login visible
        # instead of hiding it.
        print(
            f"[WARN] Logged-in page not detected within {LOGIN_TIMEOUT}s "
            f"({type(exc).__name__}); continuing anyway."
        )
        time.sleep(3)
    # Short settle time before the caller starts navigating.
    time.sleep(2)


class Person:
    """Scrapes one LinkedIn profile through an already logged-in WebDriver.

    Attributes are filled by the ``extract_*`` methods (or all at once by
    ``scrape_all``) and flattened for tabular output by ``to_dict``.
    """

    def __init__(self, driver: "WebDriver", linkedin_url: str):
        """Store the driver and profile URL and reset all scraped fields.

        Args:
            driver: WebDriver used for every page load.
            linkedin_url: URL of the profile's main page.
        """
        self.driver = driver
        self.linkedin_url = linkedin_url
        self.name = None
        self.location = None
        self.about = None
        self.experiences = []
        self.educations = []
        self.skills = []
        self.publications = []
        self.volunteering = []

    @staticmethod
    def _strip_heading(text: str, heading: str) -> str:
        """Drop leading lines equal to ``heading`` and trim the remainder.

        Only heading lines at the start are removed, so occurrences of the
        same word inside the body text are left intact.
        """
        lines = text.splitlines()
        while lines and lines[0].strip() == heading:
            lines.pop(0)
        return "\n".join(lines).strip()

    def _section_url(self, subpath: str) -> str:
        """Return the URL of a detail page located under the profile URL.

        Any query string or fragment on the profile URL is dropped and exactly
        one ``/`` is placed between the profile URL and ``subpath``. Plain
        string handling is used because URL separators must not depend on the
        operating system.
        """
        base = self.linkedin_url.split("#", 1)[0].split("?", 1)[0].rstrip("/")
        return f"{base}/{subpath}"

    def _scroll_page(self):
        """Scroll down in three steps, pausing 1.5 s after each step."""
        for _ in range(3):
            self.driver.execute_script("window.scrollBy(0, document.body.scrollHeight/3);")
            time.sleep(1.5)

    def extract_basic_info(self):
        """Read name and location from the currently loaded page.

        Best-effort: a field that cannot be located keeps its previous value.
        """
        try:
            top_card = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "main"))
            )
            self.name = top_card.find_element(By.TAG_NAME, "h1").text.strip()
            self.location = top_card.find_element(
                By.XPATH, ".//span[contains(@class,'text-body-small') and contains(@class,'t-black--light')]"
            ).text.strip()
        except Exception:
            # Deliberately tolerant: a missing element must not abort the
            # rest of the profile scrape.
            pass

    def extract_about(self):
        """Read the text around the ``about`` element; ``None`` if not found."""
        try:
            about_section = self.driver.find_element(By.ID, "about")
            about_parent = about_section.find_element(By.XPATH, "..")
            self.about = self._strip_heading(about_parent.text, "About")
        except Exception:
            self.about = None

    def extract_section(self, subpath: str) -> List[str]:
        """Load a detail page and return the text of its list items.

        Args:
            subpath: Path of the detail page relative to the profile URL,
                e.g. ``"details/skills"``.

        Returns:
            The non-empty, stripped texts of all
            ``pvs-list__paged-list-item`` elements, or ``[]`` on any error.
        """
        try:
            self.driver.get(self._section_url(subpath))
            self._scroll_page()
            items = self.driver.find_elements(By.CLASS_NAME, "pvs-list__paged-list-item")
            texts = (item.text.strip() for item in items)
            return [text for text in texts if text]
        except Exception:
            return []

    def extract_experience(self):
        """Fill ``experiences`` from the experience detail page."""
        self.experiences = self.extract_section("details/experience")

    def extract_education(self):
        """Fill ``educations`` from the education detail page."""
        self.educations = self.extract_section("details/education")

    def extract_skills(self):
        """Fill ``skills`` from the skills detail page."""
        self.skills = self.extract_section("details/skills")

    def extract_publications(self):
        """Fill ``publications`` from the publications detail page."""
        self.publications = self.extract_section("details/accomplishments/publications")

    def extract_volunteering(self):
        """Fill ``volunteering`` from the volunteering detail page."""
        self.volunteering = self.extract_section("details/volunteering-experiences")

    def scrape_all(self):
        """Load the profile page and run every extractor in turn."""
        self.driver.get(self.linkedin_url)
        # These two read the main page, so they must run before the section
        # extractors navigate away from it.
        self.extract_basic_info()
        self.extract_about()
        self.extract_experience()
        self.extract_education()
        self.extract_skills()
        self.extract_publications()
        self.extract_volunteering()

    def to_dict(self):
        """Return the scraped fields as a flat dict of strings (or ``None``).

        List fields are joined with ``" | "``, except skills, which are
        joined with ``", "``.
        """
        return {
            "name": self.name,
            "location": self.location,
            "about": self.about,
            "experience": " | ".join(self.experiences),
            "education": " | ".join(self.educations),
            "skills": ", ".join(self.skills),
            "publications": " | ".join(self.publications),
            "volunteering": " | ".join(self.volunteering)
        }


def scrape_profiles_from_csv(csv_path: str):
    """Scrape every profile listed in a CSV and write the results back.

    The CSV must have a ``url`` (or ``URL``) column. For each row the profile
    is scraped with ``Person`` and the scraped fields are added as columns;
    the file at ``csv_path`` is then overwritten with the combined table.
    Columns left over from a previous run are replaced rather than
    duplicated, so the function can safely be re-run on its own output.

    Args:
        csv_path: Path of the input CSV, which is also the output path.

    Returns:
        The combined DataFrame that was written to ``csv_path``.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
        ValueError: If the CSV has neither a ``url`` nor a ``URL`` column.
        RuntimeError: If the login credentials are not configured.
    """
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV file not found: {csv_path}")

    df = pd.read_csv(csv_path)
    if "url" not in df.columns and "URL" not in df.columns:
        raise ValueError("CSV must contain a 'url' column with LinkedIn profile links.")
    if "url" not in df.columns:
        # Only rename when there is no 'url' column yet; otherwise the rename
        # would create two columns with the same name.
        df = df.rename(columns={"URL": "url"})
    df = df.reset_index(drop=True)

    scraped_fields = (
        "name", "location", "about", "experience",
        "education", "skills", "publications", "volunteering",
    )

    creds = get_credentials()
    ua = random.choice(USER_AGENTS)
    proxy = random.choice(PROXIES) if PROXIES else None

    scraped_data = []
    driver = build_driver(headless=HEADLESS, user_agent=ua, proxy=proxy)
    try:
        linkedin_login(driver, creds["username"], creds["password"])
        print("[INFO] Login complete.")

        total = len(df)
        # Count by position so the progress display is right for any index.
        for position, profile_url in enumerate(df["url"], start=1):
            print(f"[INFO] Scraping profile ({position}/{total}): {profile_url}")
            try:
                person = Person(driver, profile_url)
                person.scrape_all()
                scraped_data.append(person.to_dict())
            except Exception as e:
                print(f"[ERROR] Failed for {profile_url}: {e}")
                # Keep the row alignment with the input by adding a blank row.
                scraped_data.append(dict.fromkeys(scraped_fields))
            delay = random.uniform(MIN_DELAY, MAX_DELAY)
            print(f"[DEBUG] Sleeping for {delay:.1f}s...")
            time.sleep(delay)
    finally:
        # Always close the browser, even when login or scraping fails.
        driver.quit()

    scraped_df = pd.DataFrame(scraped_data, columns=list(scraped_fields))
    # Drop results from an earlier run so the new columns replace them.
    stale_columns = [column for column in scraped_fields if column in df.columns]
    updated_df = pd.concat([df.drop(columns=stale_columns), scraped_df], axis=1)
    updated_df.to_csv(csv_path, index=False)
    print(f"[INFO] Scraped data saved to {csv_path}")
    return updated_df


# Entry point: scrape every profile listed in INPUT_CSV and update that file.
if __name__ == "__main__":
    scrape_profiles_from_csv(INPUT_CSV)


[INFO] Login complete.
[INFO] Scraping profile (1/1): https://www.linkedin.com/in/vansh-singh-77988b235/
[DEBUG] Sleeping for 5.4s...
[INFO] Scraped data saved to linkedin_profiles.csv


Converting Raw Scraped Content into Meaningful Reports

In [6]:
import subprocess
import pandas as pd

# Generate one text report per scraped profile with a local Ollama model and
# store it next to the scraped data in OUTPUT_CSV.
INPUT_CSV = "linkedin_profiles.csv"
OUTPUT_CSV = "linkedin_profiles_with_report.csv"
TIME_LIMIT = 600  # seconds allowed for one model invocation


def _field(row, column):
    """Return the row's value for ``column`` as text, or "" if missing/NaN.

    Empty CSV cells are read by pandas as NaN; without this they would show
    up in the prompt as the literal text "nan".
    """
    value = row.get(column, "")
    return "" if pd.isna(value) else str(value)


df = pd.read_csv(INPUT_CSV)
reports = []

for idx, row in df.iterrows():
    prompt = f"""
Generate a detailed professional report for the following LinkedIn profile:

Full Name: {_field(row, 'name')}
Location: {_field(row, 'location')}
About: {_field(row, 'about')}
Experience: {_field(row, 'experience')}
Education: {_field(row, 'education')}
Skills: {_field(row, 'skills')}
Publications: {_field(row, 'publications')}
Volunteering: {_field(row, 'volunteering')}

Include sections:
1. Executive Summary
2. Career Timeline
3. Research & Technical Highlights
4. Key Skills & Strengths
5. Education
6. Final Impression

Write accurately, concisely, and only based on the above information.
"""

    try:
        result = subprocess.run(
            ["ollama", "run", "tinyllama"],
            input=prompt,
            # Explicit UTF-8 so non-ASCII profile text does not depend on the
            # platform's locale encoding; undecodable bytes are replaced.
            encoding="utf-8",
            errors="replace",
            capture_output=True,
            timeout=TIME_LIMIT,
        )
    except subprocess.TimeoutExpired:
        print(f"[ERROR] Model execution timed out for row {idx}!")
        report_text = ""
    else:
        if result.returncode != 0:
            # Surface model failures instead of silently storing empty text.
            print(
                f"[ERROR] ollama exited with code {result.returncode} "
                f"for row {idx}: {result.stderr.strip()}"
            )
        report_text = result.stdout.strip()
        print(report_text)

    reports.append(report_text)

# One report per input row, in row order.
df["Generated_Report"] = reports
df.to_csv(OUTPUT_CSV, index=False)



