In [1]:
# structure nsl data as objects
from nsl_class import NSL

import os
from bs4 import BeautifulSoup
import requests
import time
from datetime import datetime

# parse apple nsl pages given the directory
def parse_nsl_pages_and_extract_urls(data_dir):
    """Parse saved Apple NSL pages and build one NSL object per table row.

    Every file in ``data_dir`` whose name ends with ``.html`` is read in
    sorted order. On each page, the last ``<table>`` carrying the class
    ``transparency-table no-header large-12`` is treated as the NSL listing;
    pages without such a table are skipped. After dropping the header cells,
    the remaining ``<td>`` cells are consumed in groups of three:
    letter name (with link), issued date, non-disclosure released date.

    Args:
        data_dir: Directory containing the saved HTML pages.

    Returns:
        A list of ``NSL`` objects, ordered by file name and then by table row.

    Raises:
        ValueError: If a date cell does not match the ``'%B %d, %Y'`` format.
        TypeError: If a name cell does not contain an ``<a href=...>`` link.
    """
    header_labels = ("NSL", "Issued Date", "Non-Disclosure Released Date")
    date_format = '%B %d, %Y'

    # endswith() rather than a substring test, so names such as
    # "page.html.bak" are not mistaken for HTML pages.
    files = sorted(f for f in os.listdir(data_dir) if f.endswith('.html'))

    # Save all links
    nsls = []
    for f in files:
        print('processing ' + f)
        # read the raw html
        with open(os.path.join(data_dir, f), encoding='utf-8') as fin:
            html = fin.read()
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(html, 'html.parser')

        # find the tables that list nsl letters; not all pages contain one
        tables = soup.find_all('table', class_='transparency-table no-header large-12')
        if not tables:
            continue
        # the last matching table is the one that contains the nsl pdfs
        table = tables[-1]

        # Header cells carry no data and must not advance the column counter.
        data_cells = [
            td for td in table.find_all('td')
            if td.text.strip() not in header_labels
        ]

        for idx, td in enumerate(data_cells):
            column = idx % 3
            if column == 0:
                nsl_file_name = td.text
                nsl_file_url = "https://www.apple.com{}".format(
                    td.find('a', href=True)['href']
                )
            elif column == 1:
                # strip(): surrounding whitespace would make strptime fail
                nsl_issue_date = datetime.strptime(td.text.strip(), date_format)
            else:
                nsl_release_date = datetime.strptime(td.text.strip(), date_format)
                # The third cell completes a row, so emit the record here.
                nsls.append(NSL(
                    nsl_issue_date,
                    nsl_file_name,
                    company="Apple",
                    year=nsl_issue_date.year,
                    release_date=nsl_release_date,
                    link_to_nsl_file=nsl_file_url,
                ))

    print('total urls: {}'.format(len(nsls)))
    return nsls

def download_file_at_url(url, data_dir=None, timeout=30):
    """Download ``url`` and save it under its last path segment.

    Args:
        url: Address of the file to fetch.
        data_dir: Directory to write into. When omitted, the module-level
            ``DATA_DIR`` is used, which is how existing callers work.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            an error page is never saved under the file's name.
    """
    target_dir = DATA_DIR if data_dir is None else data_dir
    file_name = url.split("/")[-1]

    print(url)
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Make sure the destination exists before writing into it.
    os.makedirs(target_dir, exist_ok=True)
    with open(os.path.join(target_dir, file_name), 'wb') as f:
        f.write(response.content)
    print(f"Downloaded {file_name}")
    # Pause between requests to avoid hammering the server.
    time.sleep(1)
    

In [2]:
import os
from bs4 import BeautifulSoup
import requests
import time

# Input: directory holding the saved html pages that link to the nsl pdfs.
# Output: CSV file that receives the parsed letters.
DATA_DIR = "../../data/raw/apple_nsl_pages"
APPLE_CSV_FILE = "../../data/extracted/nsl_letters_apple.csv"

# Build one NSL record per letter listed on the saved pages
nsls = parse_nsl_pages_and_extract_urls(data_dir=DATA_DIR)

# Write the parsed records to the CSV file
NSL.export_nsls_as_csv(nsls, APPLE_CSV_FILE)

processing apple_nsl_page10_accessed_03012024.html
File name NSL-17-435681 match: <re.Match object; span=(0, 13), match='NSL-17-435681'>
File name NSL-17-430590 match: <re.Match object; span=(0, 13), match='NSL-17-430590'>
File name NSL-17-432056 match: <re.Match object; span=(0, 13), match='NSL-17-432056'>
File name NSL-17-434038 match: <re.Match object; span=(0, 13), match='NSL-17-434038'>
File name NSL-17-435520 match: <re.Match object; span=(0, 13), match='NSL-17-435520'>
processing apple_nsl_page11_accessed_03012024.html
File name NSL-17-428677 match: <re.Match object; span=(0, 13), match='NSL-17-428677'>
File name NSL-17-430737 match: <re.Match object; span=(0, 13), match='NSL-17-430737'>
File name NSL-17-428654-1 match: <re.Match object; span=(0, 15), match='NSL-17-428654-1'>
File name NSL-17-428654-2 match: <re.Match object; span=(0, 15), match='NSL-17-428654-2'>
processing apple_nsl_page12_accessed_03012024.html
processing apple_nsl_page13_accessed_03012024.html
File name NSL-

In [None]:
# Download data
# download_file_at_url reads the module-level DATA_DIR, so it has to point at
# the pdf output directory before the loop starts.
DATA_DIR = "../../data/raw/nsl/apple"
# Create the output directory up front; without it the first file write
# fails with FileNotFoundError on a fresh checkout.
os.makedirs(DATA_DIR, exist_ok=True)
for nsl in nsls:
    download_file_at_url(nsl.link_to_nsl_file)