In [1]:
# use the import keyword to import pandas, requests, and bs4 modules
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# assign the NY WARN notice url to a variable
url = "https://labor.ny.gov/app/warn/"

In [3]:
# define headers
headers = {'accept-encoding': 'deflate'}

In [4]:
# make a get request to the url using the requests library and assign the response to a variable called 'response'
response = requests.get(url, headers=headers)

In [6]:
# print out status code of response to confirm that your request worked
# response.status_code

In [7]:
# parse the response text using Beautiful Soup's html parser and assign output to a variable called 'soup'
# response.text

In [8]:
# scrape the first table on the page and assign it to a variable called 'table'
soup = BeautifulSoup(response.text, 'html.parser')

In [9]:
# grab all rows from the table and assign to a variable called 'rows'
table = soup.find("table")
# table

In [10]:
# print out the number of rows — this is how many WARN notices there were in 2020
rows = soup.find_all("tr")

In [11]:
len(rows)

1285

In [12]:
# make an array called 'results'
results = []
# loop through the rows using a for loop. each row here is a company
for row in rows:
    # grab the anchor tag (the link tag) in the row and then grab the href attribute from the tag
    a = row.find("a")['href']
    
    # concatenate the root url from above with this href attribute and assign to a variable called 'company_url'
    company_url = f'{url}{a}'
    
    # make a get request to the company url assign the response to a variable called 'company_response'
    company_response = requests.get(company_url, headers=headers)
    
    # parse the response text and assign output to a variable called 'company_soup'
    company_soup = BeautifulSoup(company_response.text, 'html.parser')

    # grab the first table on the page
    company_table = company_soup.find("table")

    # unwrap all of the spans
    
    # loop through all of the p tags
    paragraphs = company_table.find_all("p")
    for p in paragraphs:
        # grab all of the values we want
        text = p.get_text('\n')
        if 'Date of Notice:' in text:
            notice_date = text.split(":")[1].strip().split('\n')[0].strip().replace(',', '').replace(';', '')
            #print(notice_date)
        elif 'Reason Stated for Filing:' in text:
            reason = text.split(":")[1].strip()
            #print(reason)
        elif 'Company:' in text:
            split_company = text.split("\n")
            #print(split_company)
            company = split_company[1].strip()
            address = ''.join(split_company[2:])
#             print(company)
#             print(address)
        elif 'County:' in text:
            county = f'{text.split(":")[1].strip().split("|")[0].strip()} County'
            #print(county)
        elif 'Phone:' in text:
            phone = text.split(":")[1].strip()
            #print(phone)
        elif 'Business Type:' in text:
            business_type = text.split(":")[1].strip().replace('Restaurants', 'Restaurant')
            #print(business_type)
        elif 'Number Affected:' in text:
            if '-----' in text:
                affected = ''
            else:
                affected = text.split(":")[1].strip().split(" ")[0].strip().split('\n')[0].strip()
            #print(affected)
        elif 'Total Employees:' in text:
            if '-----' in text:
                total_employees = ''
            else:
                total_employees = text.split(":")[1].strip().split(" ")[0].strip().replace(',', '')
                #print(total_employees)
        elif 'Layoff Date:' in text:
            #print(text)
            layoff_date = text.split(":")[1].strip().split(" ")[0].strip().split(" ")[0].strip()
            #print(layoff_date)
        elif ('Reason for Dislocation:' in text):
            dislocation = text.split(":")[1].strip()
            #print(dislocation)
        elif ('Union:' in text):
            union = text.split(":")[1].strip()
            #print(union)
        elif ('Classification:' in text):
            classification = text.split(":")[1].strip()
            #print(classification)
            
    # store values in a result object
    result = {
        'notice_date': notice_date,
        'reason': reason,
        'company': company,
        'address': address,
        'county': county,
        'phone': phone,
        'business_type': business_type,
        'affected': affected,
        'total_employees': total_employees,
        'layoff_date': layoff_date,
        'dislocation': dislocation,
        'union': union,
        'classification': classification
     }
    
    # append result object to results
    results.append(result)
    # break

In [13]:
# wrap results in a dataframe
df = pd.DataFrame(results)

In [14]:
pd.options.display.max_rows = 1237
df['layoff_date'].unique()

array(['5/15/2020', '3/15/2020', '5/1/2020', 'Layoffs', '3/23/2020',
       '4/30/2020', '5/5/2020', 'These', '5/8/2020', '5/2/2020',
       '3/30/2020', '3/14/2020', '5/18/2020', '3/27/2020', '5/6/2020',
       '3/18/2020', '3/17/2020', '3/12/2020', '4/1/2020', 'March',
       '3/16/2020', 'The', '3/19/2020', '5/29/2020', '6/1/2020',
       '4/24/2020', '4/16/2020', '3/20/2020', '4/3/2020', '6/30/2020',
       '4/23/2020', '4/4/2020', '4/8/2020', '4/22/2020', '3/24/2020',
       '4/13/2020', '4/10/2020', 'Furloughs', '3/22/2020', 'Separation',
       '3/29/2020', '3/25/2020', 'Separations', '23', '7/30/2020',
       '4/25/2020', '7/19/2020', '6/19/2020', '7/3/2020', '4/20/2020',
       '3/31/2020', '4/14/2020', '3/28/2020', '72', '51', '86', '88',
       '37', 'Nine', '4/6/2020', '(21)', '(27)', '30', '18', '14',
       'Seven', '4/2/2020', '3/21/2020', '356', '34', '330', '69', '52',
       '87', '81', '-----', '4/7/2020', '48', '3/26/2020', '3/10/2020',
       '4/12/2020', '362', '3

In [16]:
# output dataframe to a csv
df.to_csv('../data/warn-gaffney.csv', index=False)