In [1]:
#TODO: patch faulty data for 2023 NSL

# structure nsl data as objects
from nsl_class import NSL

import os
from bs4 import BeautifulSoup
import requests
import time
from datetime import datetime

# Directory where we store all html pages that contain links to nsl
# here, data from March 1, 2024
DATA_DIR = "../../data/raw/google_nsl_pages/2024_03_01"

# Format of the issue/release dates as they are parsed from the table cells.
NSL_DATE_FORMAT = '%b %d, %Y'
# Each letter occupies three consecutive <td> cells:
# file name + link, issue date, release date + link.
CELLS_PER_NSL = 3

# Get all raw html pages (sorted so pages are processed in a stable order).
# endswith() avoids picking up names that merely contain ".html" (e.g. backups).
files = sorted(f for f in os.listdir(DATA_DIR) if f.endswith('.html'))

nsls = []
# Process NSL Letters
for f in files:
    print('Processing ' + f)
    # read the raw html
    # NOTE(review): assumes the saved pages are UTF-8 — confirm against how they were saved.
    with open(os.path.join(DATA_DIR, f), encoding='utf-8') as fin:
        html = fin.read()
    # parse html; the parser is named explicitly so the result does not depend
    # on which optional parsers happen to be installed (and bs4 does not warn).
    soup = BeautifulSoup(html, 'html.parser')

    # find the table that list nsl letters (the last matching table on the page)
    tables = soup.find_all('table', class_='google-visualization-table-table')
    if not tables:
        raise ValueError(f'No NSL table found in {f}')
    table = tables[-1]

    cells = table.find_all('td')
    if len(cells) % CELLS_PER_NSL:
        # A trailing partial group cannot form a complete record; report it
        # instead of silently carrying stale values into the next record.
        print(f'Warning: {f} has {len(cells) % CELLS_PER_NSL} trailing cell(s) that were ignored')

    # record data for each nsl letter, three cells at a time
    for start in range(0, len(cells) - CELLS_PER_NSL + 1, CELLS_PER_NSL):
        file_td, issue_td, release_td = cells[start:start + CELLS_PER_NSL]

        file_link = file_td.find('a', href=True)
        release_link = release_td.find('a', href=True)
        if file_link is None or release_link is None:
            raise ValueError(
                f'Missing link for NSL starting at cell {start} in {f}'
            )

        # Surrounding whitespace is stripped so strptime does not choke on it.
        nsl_file_name = file_td.text.strip()
        nsl_file_url = file_link['href']
        # Convert to obj
        nsl_issue_date = datetime.strptime(issue_td.text.strip(), NSL_DATE_FORMAT)
        nsl_release_date = datetime.strptime(release_td.text.strip(), NSL_DATE_FORMAT)
        nsl_release_url = release_link['href']

        # Structure this data as object.
        nsls.append(NSL(
            nsl_issue_date,
            nsl_file_name,
            company="Google",
            year=nsl_issue_date.year,
            release_date=nsl_release_date,
            link_to_nsl_file=nsl_file_url,
            link_to_release_letter=nsl_release_url,
        ))

print('total nsls: {}'.format(len(nsls)))


Processing google_nsl_page_01.html
File name NSL-20-508732 match: <re.Match object; span=(0, 13), match='NSL-20-508732'>
File name NSL-14-403836 match: <re.Match object; span=(0, 13), match='NSL-14-403836'>
File name NSL-19-492212 match: <re.Match object; span=(0, 13), match='NSL-19-492212'>
File name NSL-14-397200 match: <re.Match object; span=(0, 13), match='NSL-14-397200'>
File name NSL-17-429526 match: <re.Match object; span=(0, 13), match='NSL-17-429526'>
File name NSL-21-542008 match: <re.Match object; span=(0, 13), match='NSL-21-542008'>
File name NSL-20-517093 match: <re.Match object; span=(0, 13), match='NSL-20-517093'>
File name NSL-20-521515 match: <re.Match object; span=(0, 13), match='NSL-20-521515'>
File name NSL-22-546665 match: <re.Match object; span=(0, 13), match='NSL-22-546665'>
File name NSL-21-542755 match: <re.Match object; span=(0, 13), match='NSL-21-542755'>
File name NSL-21-533612 match: <re.Match object; span=(0, 13), match='NSL-21-533612'>
File name NSL-21-54

In [3]:
# Store parsed data in CSV file
# Output path for the records collected in `nsls` by the parsing cell.
GOOGLE_CSV_FILE = "../../data/extracted/nsl_letters_google.csv"
# export_nsls_as_csv comes from nsl_class; presumably it writes one row per
# NSL object — confirm the exact columns there.
NSL.export_nsls_as_csv(nsls, GOOGLE_CSV_FILE)

In [5]:
# Download data
# Destination directory for the downloaded files; read by download_file_at_url.
# NOTE(review): this rebinds DATA_DIR, which the parsing cell uses for the
# directory of saved html pages. After this cell runs the name refers to the
# download directory instead — consider a distinct name to avoid confusion.
DATA_DIR = "../../data/raw/nsl/google"

def download_file_at_url(url):
    """Download `url` into DATA_DIR and pause briefly afterwards.

    The file is saved under the last path segment of the URL.

    Args:
        url: Address of the file to fetch.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so
            that an error page is never saved in place of the real file.
        requests.Timeout: if the server does not respond within the timeout.
    """
    print(url)
    filename = url.split("/")[-1]
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url, timeout=60)
    # Fail loudly instead of writing an error body to disk.
    response.raise_for_status()
    # Create the destination on first use so a fresh checkout works.
    os.makedirs(DATA_DIR, exist_ok=True)
    with open(os.path.join(DATA_DIR, filename), 'wb') as f:
        f.write(response.content)
    print(f"Downloaded {filename}")
    # Be polite to the server: wait one second between requests.
    time.sleep(1)

# For every parsed letter, fetch the NSL file first and then its release letter.
for nsl in nsls:
    for pdf_url in (nsl.link_to_nsl_file, nsl.link_to_release_letter):
        download_file_at_url(pdf_url)

https://storage.googleapis.com/transparencyreport/legal/NSLs/20-508732/NSL-20-508732-request-redacted.pdf
Downloaded NSL-20-508732-request-redacted.pdf
https://storage.googleapis.com/transparencyreport/legal/NSLs/20-508732/NSL-20-508732-release-redacted.pdf
Downloaded NSL-20-508732-release-redacted.pdf
https://storage.googleapis.com/transparencyreport/legal/NSLs/14-403836/NSL-14-403836-request-redacted.pdf
Downloaded NSL-14-403836-request-redacted.pdf
https://storage.googleapis.com/transparencyreport/legal/NSLs/14-403836/NSL-14-403836-release-redacted.pdf
Downloaded NSL-14-403836-release-redacted.pdf
https://storage.googleapis.com/transparencyreport/legal/NSLs/19-492212/NSL-19-492212-request-redacted.pdf
Downloaded NSL-19-492212-request-redacted.pdf
https://storage.googleapis.com/transparencyreport/legal/NSLs/19-492212/NSL-19-492212-release-redacted.pdf
Downloaded NSL-19-492212-release-redacted.pdf
https://storage.googleapis.com/transparencyreport/legal/NSLs/14-397200/NSL-14-397200-req