In [32]:





from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup

def get_broker_data(link):

    options = webdriver.ChromeOptions()
    options.add_experimental_option('excludeSwitches', ['enable-logging'])
    driver = webdriver.Chrome(options=options)

    response = []

    timeout = False
    driver.get(link)

    try:
        element = WebDriverWait(driver, 2).until(
            EC.presence_of_element_located((By.XPATH, "/html/body/bc-root/div/bc-individual-container-page/bc-individual-detail-page/div[2]/investor-tools-individual-summary-template/div/div[1]/div[1]/div/investor-tools-big-name/div[1]/span[1]"))
        )
    except TimeoutException:
        timeout = True
    finally:
        if not timeout:
            response=(driver.page_source)
        else:
            response=(int(link.rsplit('/', 1)[1]),"IA")
            timeout = False
        

    driver.quit()
    
    scraped_data_partial = {}
    if type(response) == tuple:
        scraped_data_partial["Broker CRD"] = response[0]
        scraped_data_partial["Is a Broker"] = "null"
        scraped_data_partial["Is an Investment Adviser"] = True
    else:
        soup = BeautifulSoup(response, features="html.parser")
        scraped_data_partial["name"] = soup.find(
            "span", {"class": "text-lg sm:text-sm font-semibold"}
        )
        scraped_data_partial["status"] = soup.find_all(
            "span", {"class": "text-gray-80 text-xs font-medium"}
        )
        scraped_data_partial["crd"] = soup.find(
            "div",
            {
                "class": "text-gray-85 text-left font-semibold mt-2 text-sm ng-star-inserted"
            },
        )
        scraped_data_partial["firm"] = soup.find(
            "div", {"class": "flex flex-col text-sm"}
        )
        scraped_data_partial["background"] = soup.find_all(
            "div", {"class": "flex-1 flex flex-col justify-center"}
        )

    scrape = scraped_data_partial

    if len(scrape) > 3:
        clean_data = {}
        clean_data["Broker Name"] = scrape["name"].string.strip()

        raw_status = scrape["status"]

        if len(raw_status) == 1:
            if scrape["status"][0].find("span").string == "Broker":
                clean_data["Is a Broker"] = True
            else:
                clean_data["Is a Broker"] = False

            clean_data["Is an Investment Adviser"] = False
        else:
            if raw_status[1].find("span").string == "Broker":
                clean_data["Is a Broker"] = True
            else:
                clean_data["Is a Broker"] = False

            if (
                raw_status[0]
                .find("span", {"title": "Investment Adviser"})
                .string.strip()
                == "Investment Adviser"
            ):
                clean_data["Is an Investment Adviser"] = True
            else:
                clean_data["Is an Investment Adviser"] = False

        clean_data["Broker CRD"] = int(scrape["crd"].find("span").next_sibling.string)

        if scrape["firm"]:
            clean_data["Firm Name"] = scrape["firm"].find("span").string
            clean_data["Firm CRD"] = int(
                scrape["firm"]
                .find("span")
                .next_sibling.find("span")
                .next_sibling.string
            )

            rawAddress = scrape["firm"].find("investor-tools-address")

            rawStreetAddress = rawAddress.next_element

            for x in range(3):
                rawStreetAddress = rawStreetAddress.next_element

            clean_data["Firm Street"] = rawStreetAddress.strip()

            rawCityStateZip = rawAddress.find("br")

            for x in range(4):
                rawCityStateZip = rawCityStateZip.next_element

            rawCityStateZip = rawCityStateZip.strip()

            rawStateZip = rawCityStateZip.split(" ", 1)[1].split(" ", 1)

            clean_data["Firm State"] = rawStateZip[0]

            clean_data["Firm Zip"] = rawStateZip[1]
        else:
            clean_data["Firm Name"] = "none"
            clean_data["Firm CRD"] = "none"
            clean_data["Firm Street"] = "none"
            clean_data["Firm State"] = "none"
            clean_data["Firm Zip"] = "none"

        clean_data["Number of Disclosures"] = int(
            scrape["background"][0]
            .find(
                "span",
                {"class": "sm:text-lg sm:font-semibold text-3xl ng-star-inserted"},
            )
            .string.strip()
        )

        rawYearsFirms = scrape["background"][1].find_all(
            "span", {"class": "sm:text-lg sm:font-semibold text-3xl ng-star-inserted"}
        )

        if len(rawYearsFirms) == 2:
            clean_data["Years of Experience"] = int(rawYearsFirms[0].string.strip())
            clean_data["Number of Firms"] = rawYearsFirms[1].string.strip()
        else:
            clean_data["Years of Experience"] = int(rawYearsFirms[0].string.strip())
            clean_data["Number of Firms"] = (
                scrape["background"][1]
                .find(
                    "span",
                    {"class": "sm:text-lg sm:font-semibold text-xl ng-star-inserted"},
                )
                .string.strip()
            )

        scraped_data_clean = clean_data
    else:
        scraped_data_clean = scrape

    return scraped_data_clean
    


In [30]:
def parseData(response):
    """Takes in the raw set of data and parses it into a list of
       dictionaries, each dictionary containing the data for one
       broker.

    Args:
        response : raw scraped data

    Returns:
        clean set of data to be exported
    """

    scraped_data_partial = {}
    if type(response) == tuple:
        scraped_data_partial["Broker CRD"] = response[0]
        scraped_data_partial["Is a Broker"] = "null"
        scraped_data_partial["Is an Investment Adviser"] = True
    else:
        soup = BeautifulSoup(response, features="html.parser")
        scraped_data_partial["name"] = soup.find(
            "span", {"class": "text-lg sm:text-sm font-semibold"}
        )
        scraped_data_partial["status"] = soup.find_all(
            "span", {"class": "text-gray-80 text-xs font-medium"}
        )
        scraped_data_partial["crd"] = soup.find(
            "div",
            {
                "class": "text-gray-85 text-left font-semibold mt-2 text-sm ng-star-inserted"
            },
        )
        scraped_data_partial["firm"] = soup.find(
            "div", {"class": "flex flex-col text-sm"}
        )
        scraped_data_partial["background"] = soup.find_all(
            "div", {"class": "flex-1 flex flex-col justify-center"}
        )

    scrape = scraped_data_partial

    if len(scrape) > 3:
        clean_data = {}
        clean_data["Broker Name"] = scrape["name"].string.strip()

        raw_status = scrape["status"]

        if len(raw_status) == 1:
            if scrape["status"][0].find("span").string == "Broker":
                clean_data["Is a Broker"] = True
            else:
                clean_data["Is a Broker"] = False

            clean_data["Is an Investment Adviser"] = False
        else:
            if raw_status[1].find("span").string == "Broker":
                clean_data["Is a Broker"] = True
            else:
                clean_data["Is a Broker"] = False

            if (
                raw_status[0]
                .find("span", {"title": "Investment Adviser"})
                .string.strip()
                == "Investment Adviser"
            ):
                clean_data["Is an Investment Adviser"] = True
            else:
                clean_data["Is an Investment Adviser"] = False

        clean_data["Broker CRD"] = int(scrape["crd"].find("span").next_sibling.string)

        if scrape["firm"]:
            clean_data["Firm Name"] = scrape["firm"].find("span").string
            clean_data["Firm CRD"] = int(
                scrape["firm"]
                .find("span")
                .next_sibling.find("span")
                .next_sibling.string
            )

            rawAddress = scrape["firm"].find("investor-tools-address")

            rawStreetAddress = rawAddress.next_element

            for x in range(3):
                rawStreetAddress = rawStreetAddress.next_element

            clean_data["Firm Street"] = rawStreetAddress.strip()

            rawCityStateZip = rawAddress.find("br")

            for x in range(4):
                rawCityStateZip = rawCityStateZip.next_element

            rawCityStateZip = rawCityStateZip.strip()

            rawStateZip = rawCityStateZip.split(" ", 1)[1].split(" ", 1)

            clean_data["Firm State"] = rawStateZip[0]

            clean_data["Firm Zip"] = rawStateZip[1]
        else:
            clean_data["Firm Name"] = "none"
            clean_data["Firm CRD"] = "none"
            clean_data["Firm Street"] = "none"
            clean_data["Firm State"] = "none"
            clean_data["Firm Zip"] = "none"

        clean_data["Number of Disclosures"] = int(
            scrape["background"][0]
            .find(
                "span",
                {"class": "sm:text-lg sm:font-semibold text-3xl ng-star-inserted"},
            )
            .string.strip()
        )

        rawYearsFirms = scrape["background"][1].find_all(
            "span", {"class": "sm:text-lg sm:font-semibold text-3xl ng-star-inserted"}
        )

        if len(rawYearsFirms) == 2:
            clean_data["Years of Experience"] = int(rawYearsFirms[0].string.strip())
            clean_data["Number of Firms"] = rawYearsFirms[1].string.strip()
        else:
            clean_data["Years of Experience"] = int(rawYearsFirms[0].string.strip())
            clean_data["Number of Firms"] = (
                scrape["background"][1]
                .find(
                    "span",
                    {"class": "sm:text-lg sm:font-semibold text-xl ng-star-inserted"},
                )
                .string.strip()
            )

        scraped_data_clean = clean_data
    else:
        scraped_data_clean = scrape

    return scraped_data_clean

In [31]:
parseData(get_broker_data("https://brokercheck.finra.org/individual/summary/1000059"))

{'Broker Name': 'DOUGLAS PENDLETON GAINES',
 'Is a Broker': True,
 'Is an Investment Adviser': True,
 'Broker CRD': 1000059,
 'Firm Name': 'STIFEL, NICOLAUS & COMPANY, INCORPORATED',
 'Firm CRD': 793,
 'Firm Street': '10400 NE 4TH STREET, SUITE 2000',
 'Firm State': 'WA',
 'Firm Zip': '98004',
 'Number of Disclosures': 0,
 'Years of Experience': 42,
 'Number of Firms': '4'}

In [6]:
import requests
import fitz  # PyMuPDF

def get_broker_pdf_text(cdr_number):
    url = f"https://files.brokercheck.finra.org/individual/individual_{cdr_number}.pdf"
    response = requests.get(url)
    
    if response.status_code != 200:
        raise Exception(f"Failed to retrieve PDF for CDR number {cdr_number}. Status code: {response.status_code}")
    
    # Save the PDF to a temporary file
    pdf_filename = f"individual_{cdr_number}.pdf"
    with open(pdf_filename, 'wb') as pdf_file:
        pdf_file.write(response.content)
    
    # Open the PDF and extract text
    pdf_document = fitz.open(pdf_filename)
    pdf_text = ""
    
    for page_num in range(pdf_document.page_count):
        page = pdf_document.load_page(page_num)
        pdf_text += page.get_text()
    
    pdf_document.close()
    
    return pdf_text

# Example usage:
cdr_number = '1000059'
broker_pdf_text = get_broker_pdf_text(cdr_number)
print(broker_pdf_text)


BrokerCheck Report
DOUGLAS PENDLETON GAINES
Section Title
Report Summary
Broker Qualifications
Registration and Employment History
CRD# 1000059
1
2 - 4
6
Page(s)
Please be aware that fraudsters may link to BrokerCheck from phishing and similar scam websites, trying to steal your personal information or your money.
Make sure you know who you’re dealing with when investing, and contact FINRA with any concerns.
For more information read our investor alert on imposters.
i
About BrokerCheck®
BrokerCheck offers information on all current, and many former, registered securities brokers, and all current and former
registered securities firms. FINRA strongly encourages investors to use BrokerCheck to check the background of
securities brokers and brokerage firms before deciding to conduct, or continue to conduct, business with them.
·
What is included in a BrokerCheck report?
·
BrokerCheck reports for individual brokers include information such as employment history, professional
qualifications

In [11]:
def extract_broker_data_from_pdf(pdf_path):
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    pdf_text = ""

    # Extract text from each page of the PDF
    for page_num in range(pdf_document.page_count):
        page = pdf_document.load_page(page_num)
        pdf_text += page.get_text()
    
    pdf_document.close()

    # Initialize dictionary to store broker data
    broker_data = {}

    # Split text into lines for easier parsing
    lines = pdf_text.split('\n')

    # Parsing the text for relevant data
    for i, line in enumerate(lines):
        if "DOUGLAS PENDLETON GAINES" in line:
            broker_data['name'] = line.strip()
        elif "CRD# 1000059" in line:
            broker_data['crd_number'] = line.split('#')[1].strip()
        elif "Currently employed by and registered with the following Firm(s):" in line:
            broker_data['current_employment'] = {}
            broker_data['current_employment']['firm_name'] = lines[i+1].strip()
            broker_data['current_employment']['address'] = lines[i+2].strip()
            broker_data['current_employment']['crd_number'] = lines[i+3].split('#')[1].strip()
            broker_data['current_employment']['registered_since'] = lines[i+4].split(':')[1].strip()
        elif "Disclosure Events" in line:
            broker_data['disclosure_events'] = "No" if "No" in lines[i+1] else "Yes"
        elif "Broker Qualifications" in line:
            broker_data['qualifications'] = {}
            broker_data['qualifications']['self_regulatory_organizations'] = lines[i+2].strip()
            broker_data['qualifications']['us_states_and_territories'] = lines[i+3].strip()
            broker_data['qualifications']['principal_supervisory_exam'] = lines[i+4].strip()
            broker_data['qualifications']['general_industry_product_exams'] = lines[i+5].strip()
            broker_data['qualifications']['state_securities_law_exams'] = lines[i+6].strip()
        elif "Registration History" in line:
            broker_data['registration_history'] = []
            for j in range(i+1, len(lines)):
                if "Employment History" in lines[j]:
                    break
                if lines[j].strip():
                    registration_info = lines[j].split(' ')
                    registration_data = {
                        'firm_name': ' '.join(registration_info[:-3]),
                        'crd_number': registration_info[-1],
                        'location': registration_info[-1],
                        'dates': registration_info[-1]
                    }
                    broker_data['registration_history'].append(registration_data)
        elif "Employment History" in line:
            broker_data['employment_history'] = []
            for j in range(i+1, len(lines)):
                if "Other Business Activities" in lines[j]:
                    break
                if lines[j].strip():
                    employment_info = lines[j].split(' ')
                    employment_data = {
                        'employer_name': ' '.join(employment_info[:-2]),
                        'investment_related': employment_info[-1],
                        'location': employment_info[-1]
                    }
                    broker_data['employment_history'].append(employment_data)

    return broker_data

# Example usage:
pdf_path = '/Users/vatsalthakkar/Desktop/VATSAL_THAKKAR/MS-CS-UGA/VT Code/VTs-Lab/Coding-Challenge/AssetLink-Coding-Challenge/Notebooks/individual_1000059.pdf'
broker_data = extract_broker_data_from_pdf(pdf_path)
print(broker_data)

{'name': 'DOUGLAS PENDLETON GAINES', 'qualifications': {'self_regulatory_organizations': 'This section details that the representative has reported 0 professional designation(s).', 'us_states_and_territories': 'No information reported.', 'principal_supervisory_exam': '5', 'general_industry_product_exams': '௭2024 FINRA. All rights reserved. Report about DOUGLAS P. GAINES.', 'state_securities_law_exams': 'www.finra.org/brokercheck'}, 'employment_history': [{'employer_name': '', 'investment_related': 'Employment', 'location': 'Employment'}, {'employer_name': '', 'investment_related': 'Name', 'location': 'Name'}, {'employer_name': '', 'investment_related': 'Related', 'location': 'Related'}, {'employer_name': '', 'investment_related': 'Position', 'location': 'Position'}, {'employer_name': '', 'investment_related': 'Location', 'location': 'Location'}, {'employer_name': "This section provides up to 10 years of an individual broker's employment history as reported by the individual broker on t

In [5]:
from src.gui import create_gui
gui = create_gui()
pn.serve(gui)

OperationalError: (sqlite3.OperationalError) unable to open database file
(Background on this error at: https://sqlalche.me/e/20/e3q8)