# With Beautiful Soup & Requests

In [278]:
import requests
import bs4
import re

# Forbes AI 50 list page -- presumably where the company names scraped below
# were taken from (TODO confirm); not referenced elsewhere in the visible code.
source = ['https://www.forbes.com/lists/ai50']

In [None]:
class Companies:
    """Container for a collection of companies.

    The collection is exposed as the ``list`` attribute, which starts out
    empty. It presumably holds ``Company`` instances -- confirm once the
    class is actually used.
    """

    def __init__(self):
        """Create an empty collection."""
        self.list = []
        # TODO: the original definition stopped at an unfinished ``self.``
        # attribute (a syntax error); add the missing attribute here once its
        # purpose is known.

In [376]:
class Company:
    """A company together with the job openings collected for it."""

    def __init__(self, name):
        """Start with no open roles and no known departments."""
        # A set keeps each department name only once, however many roles it has.
        self.departments = set()
        self.open_roles = []
        self.name = name

    def add_open_role(self, role):
        """Store ``role`` and register the department it belongs to."""
        self.open_roles.append(role)
        self.departments.add(role.department)

    def __str__(self):
        """Return the capitalized name followed by one line per open role."""
        listing = "".join(f"\n{str(role)}" for role in self.open_roles)
        return f"{self.name.capitalize()} has: {listing}"

In [377]:
class Role:
    """A single job opening: title, location, posting link and department."""

    def __init__(self, title, location, link, department):
        """Store the four descriptive fields unchanged."""
        self.title, self.location = title, location
        self.link, self.department = link, department

    def __str__(self):
        """Return a short ``<title> in <location>`` description."""
        what = self.title
        where = self.location
        return f"{what} in {where}"

In [378]:
def init_bs4(base_url, timeout=10):
    """Download ``base_url`` and parse the response body with Beautiful Soup.

    Args:
        base_url: URL of the page to fetch.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A ``bs4.BeautifulSoup`` tree built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: if the request fails or times
            out.
    """
    response = requests.get(base_url, timeout=timeout)
    # NOTE: the HTTP status is not checked, so an error page (e.g. a 404 for an
    # unknown board) is parsed like any other document.
    return bs4.BeautifulSoup(response.text, "html.parser")

In [396]:
def scrape_lever(company):
    """Add every opening on a company's Lever job board to ``company``.

    The page at ``https://jobs.lever.co/<company.name>`` is fetched, and each
    ``.posting`` inside a ``.postings-group`` becomes a ``Role`` tagged with
    that group's category title.
    """
    page = init_bs4(f"https://jobs.lever.co/{company.name}")

    for group in page.select('.postings-group'):
        # The group's heading is used as the department name for its postings.
        group_title = group.select('.posting-category-title')[0].get_text()

        for posting in group.select('.posting'):
            company.add_open_role(
                Role(
                    posting.find('h5').text,
                    posting.select('.location')[0].get_text(),
                    posting.select('.posting-apply a')[0]['href'],
                    group_title,
                )
            )

def scrape_greenhouse(company):
    """Add every opening on a company's Greenhouse job board to ``company``.

    The page at ``https://boards.greenhouse.io/<company.name>`` is fetched, and
    each ``.opening`` inside a ``section.level-0`` becomes a ``Role`` tagged
    with the text of that section's ``h3`` heading.
    """
    page = init_bs4(f"https://boards.greenhouse.io/{company.name}")

    for section in page.select('section.level-0'):
        section_title = section.find('h3').text

        for posting in section.select('.opening'):
            job_title = posting.find('a').text
            place = posting.select('.location')[0].get_text()
            # The anchor's href is appended directly to the board's host name.
            href = posting.select('a')[0]['href']
            url = f"https://boards.greenhouse.io{href}"

            company.add_open_role(Role(job_title, place, url, section_title))

In [397]:
# Scrape the Lever job boards of the listed companies and print a summary.
company_names = ['shieldai', 'quantummetric']
companies = []

for name in company_names:
    company = Company(name)
    scrape_lever(company)
    # Keep the populated company so the results remain available afterwards.
    companies.append(company)
    print("---")
    print(f"{company.name.capitalize()} : {len(company.open_roles)}")
    print("")
    # `departments` is a set attribute, not a method; calling it would raise
    # TypeError.
    print(company.departments)
    print("")
    for role in company.open_roles:
        print(role)

---
Shieldai : 40

{'Production Management', 'Hivemind - Hivemind Edge', 'Platform - Systems', 'Production and Supply Chain', 'Design', 'Platform - Mechanical', 'Deployed Operations', 'Fleet Asset Management', 'Platform - Test', 'Hivemind - Hivemind Design', 'Platform - Electrical', 'Hivemind - Test', 'Hivemind - Systems', 'Supply Chain Management', 'Finance', 'Service Operations', 'Technical Directors', 'Program Management', 'Enterprise Operations'}

Accounts Payable Manager (on site in Dallas) in Dallas Metro Area
Manager, Government Compliance (R2155) in United States
V-BAT Air Vehicle Operator (R1910) in Dallas Metro Area
V-BAT Operations Officer (R2142) in Dallas Metro Area
IT Services Manager in Washington DC Metro Area
Senior Cybersecurity Manager (R2037) in Dallas Metro Area
Senior Systems Administrator (R1941) in Washington DC Metro Area
Fleet Engineer (R1898) in Dallas Metro Area
Fleet Management Project Manager (R2144) in Dallas Metro Area
Continuous Improvement Engineer (R2

In [398]:
# Scrape the Greenhouse job boards of the listed companies and print a summary.
company_names = ['assemblyai', 'openai']
companies = []

for name in company_names:
    company = Company(name)
    scrape_greenhouse(company)
    # Keep the populated company so the results remain available afterwards.
    companies.append(company)
    print("---")
    print(f"{company.name.capitalize()} : {len(company.open_roles)}")
    print("")
    # `departments` is a set attribute, not a method; calling it would raise
    # TypeError.
    print(company.departments)
    print("")
    for role in company.open_roles:
        print(role)

---
Assemblyai : 7

{'Engineering', 'Marketing', 'Product'}

Technical Program Manager in Remote
IT Systems Administrator  in Remote
Software Engineer in Test (SDET) in Remote
Software Engineer, Python/Go  in Remote
Senior Technical Product Marketing Manager in Remote
Developer Educator - Python/Golang in Remote
Product Manager in Remote
---
Openai : 44

{'Finance', 'Legal', 'Public Policy', 'Go To Market', 'Communications Design', 'Trust & Safety', 'IT', 'Research, Engineering, Product'}

Full-Stack Developer, Communications & Design in San Francisco, California, United States
Consolidations Accounting Manager in San Francisco, California, United States
Head of Procurement in San Francisco, California, United States
Revenue Accounting Manager in San Francisco, California, United States
Senior Revenue Accountant, Deal Desk in San Francisco, California, United States
Account Engineer in San Francisco, California, United States
IT Application Engineer in San Francisco, California, United

# With Scrapy

In [302]:
import scrapy

In [301]:
# scrape webpage
import scrapy
from scrapy.crawler import CrawlerRunner
# text cleaning
import re
# Reactor restart: a Twisted reactor cannot be started again once it has
# stopped, which breaks re-running a crawl in the same session. crochet's
# setup() runs the reactor in a background thread instead, and wait_for lets
# a regular function block on the result.
from crochet import setup, wait_for
setup()

class QuotesToCsv(scrapy.Spider):
    """Scrape Maynard James Keenan quotes from Wikiquote into ``quotes.csv``.

    Every top-level list item of the article body is yielded as raw HTML under
    the ``quote`` key. Items are routed through the
    ``__main__.ExtractFirstLine`` pipeline and exported by the CSV feed
    configured in ``custom_settings``.
    """
    name = "MJKQuotesToCsv"
    start_urls = [
        'https://en.wikiquote.org/wiki/Maynard_James_Keenan',
    ]
    custom_settings = {
        'ITEM_PIPELINES': {
            '__main__.ExtractFirstLine': 1
        },
        'FEEDS': {
            'quotes.csv': {
                'format': 'csv',
                # Replace the file on each run instead of appending to it.
                'overwrite': True
            }
        }
    }

    def parse(self, response):
        """Yield one ``{'quote': <li html>}`` item per top-level list entry."""
        for quote in response.css('div.mw-parser-output > ul > li'):
            # `.get()` is the current spelling of the legacy `.extract()` and
            # returns the same serialized HTML for a single selector.
            yield {'quote': quote.get()}


class ExtractFirstLine:
    """Item pipeline that keeps only the tag-free first line of each quote."""

    # Non-greedy match for anything that looks like an HTML tag. Compiled once
    # for the class rather than on every call.
    _HTML_TAG = re.compile(r'<.*?>')

    def process_item(self, item, spider):
        """Reduce ``item['quote']`` to its first line with HTML tags removed.

        Args:
            item: Mapping-like item carrying the raw quote under ``'quote'``.
            spider: The spider that produced the item (unused).

        Returns:
            A new ``{'quote': first_line}`` dict. An empty quote yields an
            empty string instead of raising ``IndexError``.
        """
        lines = dict(item)["quote"].splitlines()
        # ``"".splitlines()`` is an empty list, so guard before indexing.
        first_line = self.__remove_html_tags__(lines[0]) if lines else ''

        return {'quote': first_line}

    def __remove_html_tags__(self, text):
        """Return ``text`` with every ``<...>`` tag stripped out."""
        return self._HTML_TAG.sub('', text)

# wait_for(10): block the caller until the crawl finishes, for at most 10 seconds.
@wait_for(10)
def run_spider():
    """Start a crawl with the ``QuotesToCsv`` spider and return its deferred."""
    runner = CrawlerRunner()
    return runner.crawl(QuotesToCsv)