# With Beautiful Soup & Requests

In [247]:
import re
from urllib.parse import urljoin

import bs4
import requests


In [139]:
class Companies:
    """Registry of scraped companies and of every department they hire for.

    Attributes:
        all_companies: list of the company objects added, in insertion order.
        all_departments: set of every department name seen on those companies.
    """

    def __init__(self):
        self.all_companies = []
        self.all_departments = set()

    def add_company(self, company):
        """Register ``company`` and merge its departments into the registry.

        Args:
            company: object exposing ``departments`` (an iterable of
                department names) and ``open_roles``.
        """
        self.all_companies.append(company)
        self.all_departments = self.all_departments.union(company.departments)

    def find_roles_by_department(self, department):
        """Return every open role whose department equals ``department``.

        Args:
            department: department name to match exactly (``==`` comparison).

        Returns:
            list: the matching role objects themselves, ordered by company
            insertion order and then by each company's ``open_roles`` order.
            Empty when nothing matches.
        """
        # Collect the role objects so callers can print them or read their
        # attributes; the previous placeholder `{ "": }` was not valid Python.
        return [
            role
            for company in self.all_companies
            for role in company.open_roles
            if role.department == department
        ]

In [120]:
class Company:
    """One employer plus the roles collected from its job board."""

    def __init__(self, name, job_portal):
        self.name = name
        self.open_roles = []
        self.departments = set()
        # Address of the job board; the scrapers fetch this URL.
        self.job_portal = job_portal

    def add_open_role(self, role):
        """Store ``role`` and record the department it belongs to."""
        self.open_roles.append(role)
        self.departments.add(role.department)

    def __str__(self):
        """Return the capitalized name followed by one line per open role."""
        listing = "".join(f"\n{role.__str__()}" for role in self.open_roles)
        return f"{self.name.capitalize()} has: {listing}"

In [185]:
class Role:
    """A single job opening scraped from a company's job board."""

    def __init__(self, title, location, link, department, company):
        self.title, self.location, self.link = title, location, link
        self.department, self.company = department, company

    def __str__(self):
        """Format as ``<title> in <location> | <department> | <company> | <link>``."""
        headline = f"{self.title} in {self.location}"
        fields = (headline, f"{self.department}", f"{self.company}", f"{self.link}")
        return " | ".join(fields)

In [154]:
def init_bs4(base_url, timeout=30):
    """Download ``base_url`` and parse the response body as HTML.

    Args:
        base_url: URL of the page to fetch.
        timeout: seconds to wait for the server before giving up. ``requests``
            applies no timeout unless one is passed, so without it a single
            stalled host would block the whole scraping run.

    Returns:
        bs4.BeautifulSoup: the page parsed with the built-in ``html.parser``.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    # No raise_for_status(): an HTTP error page is parsed like any other
    # document, so a caller that finds no matching elements just adds nothing.
    response = requests.get(base_url, timeout=timeout)
    return bs4.BeautifulSoup(response.text, "html.parser")

In [179]:
def grab_department(department, job_portal):
    """Extract the department name from one department section of a job board.

    Args:
        department: parsed element for the section. It must support
            ``select`` (used for Lever) or ``find`` (used for Greenhouse).
        job_portal: ``"lever"`` or ``"greenhouse"``; picks which markup to read.

    Returns:
        str: the heading text with surrounding whitespace removed, or
        ``"Other"`` when the heading is missing or ``job_portal`` is not one
        of the two recognised values.
    """
    name = None

    try:
        if job_portal == "lever":
            name = department.select('.posting-category-title')[0].get_text()
        elif job_portal == "greenhouse":
            name = department.find('h3').text
    except (IndexError, AttributeError):
        # IndexError: no title element matched the selector.
        # AttributeError: find() returned None (no <h3> in the section).
        name = None

    # An unrecognised portal leaves name as None; stripping None used to raise
    # AttributeError from the `finally` block, so fall back to the same
    # "Other" bucket that a missing heading gets.
    if name is None:
        return "Other"
    return name.strip()

In [176]:
def scrape_lever(company):
    """Populate ``company`` with the roles listed on its Lever job board.

    The page at ``company.job_portal`` is fetched; every ``.postings-group``
    is treated as a department and every ``.posting`` inside it becomes a
    ``Role`` handed to ``company.add_open_role``. Returns ``None``.
    """
    page = init_bs4(company.job_portal)

    for group in page.select('.postings-group'):
        group_name = grab_department(group, "lever")

        for posting in group.select('.posting'):
            job_title = posting.find('h5').text
            job_location = posting.select('.location')[0].get_text()
            apply_url = posting.select('.posting-apply a')[0]['href']

            company.add_open_role(
                Role(job_title, job_location, apply_url, group_name, company.name)
            )

def scrape_greenhouse(company):
    """Populate ``company`` with the roles listed on its Greenhouse job board.

    Fetches ``company.job_portal``, treats every ``section.level-0`` as a
    department and every ``.opening`` inside it as one role, and adds each
    role through ``company.add_open_role``.

    Args:
        company: object with ``job_portal`` (board URL), ``name`` and
            ``add_open_role``.

    Returns:
        None. ``company`` is mutated in place.

    Raises:
        AttributeError, IndexError, KeyError: when an opening lacks its
            link element, its ``.location`` element or the link's ``href``.
    """
    soup = init_bs4(company.job_portal)

    for department in soup.select('section.level-0'):
        department_name = grab_department(department, "greenhouse")

        for opening in department.select('.opening'):
            # First link in the opening: supplies both the title and the href.
            anchor = opening.find('a')
            title = anchor.text
            location = opening.select('.location')[0].get_text()
            # urljoin prefixes site-relative hrefs with the board host and
            # leaves absolute hrefs untouched; plain concatenation glued the
            # host in front of an already-complete URL.
            link = urljoin("https://boards.greenhouse.io", anchor['href'])

            role = Role(title, location, link, department_name, company.name)
            company.add_open_role(role)

# With Scrapy

In [186]:
# Build the registry: one Company per entry in company_data, scraped with
# whichever scraper matches its job-board URL.
# NOTE(review): company_data is not defined in any cell shown here. It is
# assumed to be an iterable of mappings with 'name' and 'link' keys; confirm
# where it is loaded so the notebook survives Restart & Run All.
companies = Companies()

for item in company_data:
    company = Company(item['name'], item['link'])
    print(f" Scraping {company.name}")
    # A substring match on the URL picks the scraper. A link matching neither
    # is still registered below, just with no open roles.
    if 'lever' in company.job_portal:
        scrape_lever(company)
    elif 'greenhouse' in company.job_portal:
        scrape_greenhouse(company)
    
    companies.add_company(company)

 Scraping OpenAI
 Scraping Memora Health
 Scraping MindsDB
 Scraping Shield AI
 Scraping Spot AI
 Scraping Fathom
 Scraping Deepgram
 Scraping AMP Robotics
 Scraping Kumo
 Scraping Abridge
 Scraping Aisera
 Scraping AssemblyAI
 Scraping Tecton.AI
 Scraping Optimal Dynamics
 Scraping Built Robotics
 Scraping Viz
 Scraping Lilt
 Scraping Cresta
 Scraping Deepcell
 Scraping Veriff
 Scraping Labelbox
 Scraping HyperScience
 Scraping Robust Intelligence
 Scraping Robin Healthcare
 Scraping Synthesia
 Scraping VergeSense
 Scraping Shift Technology
 Scraping copy.ai
 Scraping Osaro
 Scraping Centaur Labs
 Scraping Moveworks
 Scraping Neuralink
 Scraping Standard AI
 Scraping Scale AI
 Scraping Crosschq
 Scraping Ada
 Scraping AEye
 Scraping Databricks
 Scraping People.ai
 Scraping Sisu
 Scraping Dialpad
 Scraping AKASA
 Scraping Dyno Therapeutics
 Scraping Cape Analytics
 Scraping Vectra AI
 Scraping Deep Genomics
 Scraping Tempo
 Scraping Orbital Insight
 Scraping Tessian
 Scraping Domino Da

In [187]:
# Print every open role whose department is exactly 'Customer Success',
# one role per line using the role's string form.
roles = companies.find_roles_by_department('Customer Success')
for role in roles:
    print(role)

Client Growth in Remote | Customer Success | Fathom | https://boards.greenhouse.io/fathom/jobs/5536914003
Client Growth in San Francisco | Customer Success | Fathom | https://boards.greenhouse.io/fathom/jobs/5544517003
AI Services Engineer in Palo Alto, CA | Customer Success | Aisera | https://boards.greenhouse.io/aiserajobs/jobs/4899686004
AI Services Engineer, Remote in Bangalore, India | Customer Success | Aisera | https://boards.greenhouse.io/aiserajobs/jobs/4595857004
AI Services Manager in Palo Alto, CA | Customer Success | Aisera | https://boards.greenhouse.io/aiserajobs/jobs/4850507004
Customer Engineer in Palo Alto, California | Customer Success | Aisera | https://boards.greenhouse.io/aiserajobs/jobs/4857064004
Customer Success Engineer in Bangalore, India  | Customer Success | Aisera | https://boards.greenhouse.io/aiserajobs/jobs/4345512004
Customer Success Manager in Hyderabad, India | Customer Success | Aisera | https://boards.greenhouse.io/aiserajobs/jobs/4595855004
Custom

In [203]:
# Show every distinct department name collected across the registered companies.
companies.all_departments

{'1104 - Computer Vision',
 '1105 - Data Science',
 '130 - Customer Support',
 '1301 - Program Mgt - PS',
 '140 - Customer Success',
 '150 - Professional Services',
 '2101 - Sales - Commercial',
 '2103 - Sales - PS',
 '211 - Product Engineering',
 '213 - Telephony Engineering',
 '214 - AI Engineering',
 '220 - Design',
 '230 - Quality Assurance',
 '240 - Product Management',
 '244 - AI Data Annotation',
 '246 - Data - Product Management',
 '410 - Marketing',
 '4102 - UX-UI Engr',
 '4301 - Product - Content/Vendor',
 '435 - Marketing Growth',
 '450 - Demand Generation',
 '515 - Enterprise Sales',
 '520 - Sales Engineering',
 '526 - Business Technology',
 '530 - MidMarket Sales',
 '535 - SMB Sales',
 '540 - Sales Development',
 '550 - Channel Sales',
 '580 - VSB Sales',
 '610 - Accounting',
 '620 - Legal',
 '645 - Facilities',
 '650 - Data',
 'AI / ML Data Science',
 'AI/ML Engineering',
 'Active Learning',
 'Applied ML',
 'Applied ML or Research',
 'Business Development',
 'COGS',
 'Che

In [183]:
# Same 'Customer Success' query as the earlier cell.
# NOTE(review): this cell's execution count (183) precedes the Role definition
# cell (185), and its saved output has no link column. Presumably it ran
# against an earlier Role.__str__; re-run it or remove the duplicate.
roles = companies.find_roles_by_department('Customer Success')
for role in roles:
    print(role)

Client Growth in Remote | Customer Success | Fathom
Client Growth in San Francisco | Customer Success | Fathom
AI Services Engineer in Palo Alto, CA | Customer Success | Aisera
AI Services Engineer, Remote in Bangalore, India | Customer Success | Aisera
AI Services Manager in Palo Alto, CA | Customer Success | Aisera
Customer Engineer in Palo Alto, California | Customer Success | Aisera
Customer Success Engineer in Bangalore, India  | Customer Success | Aisera
Customer Success Manager in Hyderabad, India | Customer Success | Aisera
Customer Success Operations Manager in Palo Alto, CA | Customer Success | Aisera
Director of Customer Success in Hyderabad, India (Remote) | Customer Success | Aisera
Escalation Manager in Bangalore, India  | Customer Success | Aisera
Technical Project Manager in Hyderabad, India | Customer Success | Aisera
Technical Project Manager in Palo Alto, CA | Customer Success | Aisera
Technical Project Manager in Hyderabad, Bangalore (Remote) | Customer Success | Ai

In [213]:
# Scratch check: blank out every character that is not an ASCII letter in a
# department label that carries a numeric prefix.
string = "2101 - Sales - Commercial"
pattern = re.compile('[^a-zA-Z]')

pattern.sub(" ", string)

'       Sales   Commercial'

In [171]:
# Drop leading and trailing whitespace from `string`.
string = string.strip()

In [172]:
string

'sfksdm'

In [215]:
# Scratch check: membership test for "?", i.e. whether this URL contains a
# query-string separator (here it introduces the utm_source parameter).
url = "http://tripactions.com/?utm_source=topstartups.io"
"?" in url

True

In [250]:
data

<generator object <genexpr> at 0x1155a7ae0>

In [251]:
# Fails with TypeError: `data` is a generator (see its repr in the previous
# cell's output) and generators cannot be indexed. Use next(data) to pull the
# first item, or build a list where `data` is created if indexing is needed.
data[0]

TypeError: 'generator' object is not subscriptable