In [76]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time
import re
import sys
import string

In [77]:
# Column names that rows scraped from the MIT page can carry. Rows from the
# other sources are given an empty string for each of these so that every
# row exposes the same set of keys.
mit_extra_fields = [
    'limitations',
    'how-its-accessed',
    'how-to-register',
    'result-format',
    'contact-for-technical-questions',
    'what-it-does',
]

In [78]:
# UCB

# Fetch the UC Berkeley API guide. The timeout keeps the cell from hanging on
# an unresponsive server, and raise_for_status() stops us from silently
# scraping an HTTP error page (which would just yield zero links).
req = requests.get('https://guides.lib.berkeley.edu/information-studies/apis', timeout=30)
req.raise_for_status()

# Decoded text of the server's response (not used below; handy for inspection)
src = req.text

soup = BeautifulSoup(req.content, 'html.parser')

# Grab all the links in the s-lg-content-74224 div enclosed in paragraphs
links = soup.select("#s-lg-content-74224 p a")

# Start the shared row list here; the cells for the other sources append to it.
# Each link becomes one row: its text is the name, its href the URL, and the
# MIT-only columns are left blank so every row has the same keys.
list_of_rows = []

for link in links:
    row_dict = {
        'name': link.text,
        # Tag.get() returns '' for anchors that have no href (e.g. named
        # anchors) instead of raising KeyError like attrs["href"] would.
        'for-more-information': link.get('href', ''),
        'source': 'UCB',
    }
    for field in mit_extra_fields:
        row_dict[field] = ''
    list_of_rows.append(row_dict)
 




In [79]:
# UCSD

# Fetch the UC San Diego API guide. The timeout keeps the cell from hanging on
# an unresponsive server, and raise_for_status() stops us from silently
# scraping an HTTP error page (which would just yield zero links).
req = requests.get('https://ucsd.libguides.com/data-statistics/apis', timeout=30)
req.raise_for_status()

# Decoded text of the server's response (not used below; handy for inspection)
src = req.text

soup = BeautifulSoup(req.content, 'html.parser')

# Grab all the links inside the list items of the s-lg-link-list-49042253 list
links = soup.select("#s-lg-link-list-49042253 li a")

# Drop any UCSD rows left over from a previous run of this cell so that
# re-running it does not duplicate entries. Slice assignment mutates the
# existing list object rather than rebinding the name.
list_of_rows[:] = [row for row in list_of_rows if row.get('source') != 'UCSD']

# Each link becomes one row: its text is the name, its href the URL, and the
# MIT-only columns are left blank so every row has the same keys.
for link in links:
    row_dict = {
        'name': link.text,
        # Tag.get() returns '' for anchors that have no href (e.g. named
        # anchors) instead of raising KeyError like attrs["href"] would.
        'for-more-information': link.get('href', ''),
        'source': 'UCSD',
    }
    for field in mit_extra_fields:
        row_dict[field] = ''
    list_of_rows.append(row_dict)




In [80]:
# MIT

# Fetch the MIT Libraries API page. The timeout keeps the cell from hanging on
# an unresponsive server, and raise_for_status() stops us from silently
# scraping an HTTP error page (which would just yield zero sections).
req = requests.get('https://libraries.mit.edu/scholarly/publishing/apis-for-scholarly-resources', timeout=30)
req.raise_for_status()

# Decoded text of the server's response (not used below; handy for inspection)
src = req.text

soup = BeautifulSoup(req.content, 'html.parser')

# Grab all expandable sections
sections = soup.find_all(class_="expandable")


def _header_key(label):
    """Turn a field label such as 'What it does:' into a header key ('what-it-does').

    ASCII punctuation and the curly apostrophe are removed, surrounding
    whitespace is trimmed, non-breaking spaces are treated as ordinary
    spaces, spaces become dashes, and the result is lower-cased.
    """
    cleaned = label.translate(str.maketrans('', '', string.punctuation))
    cleaned = cleaned.strip()
    # string.punctuation is ASCII-only, so the curly apostrophe is handled separately
    cleaned = cleaned.replace('’', '')
    cleaned = cleaned.replace('\xa0', ' ')
    cleaned = cleaned.replace(' ', '-')
    return cleaned.lower()


# For each (expandable) section:
#  The <h3> text is the API name
#  For each paragraph (p tag):
#   The text inside <strong> is the key
#   The remaining paragraph text is the value

# Never populated in this cell; kept in case other cells reference it.
num_of_apis = []

# Drop any MIT rows left over from a previous run of this cell so that
# re-running it does not duplicate entries. Slice assignment mutates the
# existing list object rather than rebinding the name.
list_of_rows[:] = [row for row in list_of_rows if row.get('source') != 'MIT']

for section in sections:
    header = section.find('h3')
    if header is None:
        # No heading means no API name to record for this section
        continue

    row_dict = {'name': header.text, 'source': 'MIT'}

    for paragraph in section.find_all('p'):
        key = paragraph.find('strong')
        if key is None:
            # Paragraph has no <strong> label, so there is no column to file
            # its text under; skip it rather than crash on key.text.
            continue

        label = key.text
        clean_key = _header_key(label)
        if not clean_key:
            # A label that is empty or only punctuation would give a '' column name
            continue

        # Remove only the first occurrence of the label so that the same
        # words appearing later inside the value are preserved.
        value = paragraph.text.replace(label, '', 1)
        clean_value = value.strip().replace('\xa0', ' ')
        row_dict[clean_key] = clean_value

    list_of_rows.append(row_dict)



In [81]:
# Write to CSV

import csv

csv_file = "urls.csv"

# Build the header. Start with the first row's keys followed by the three
# legacy MIT columns (same order as before), then append every other key
# that appears in any row. DictWriter raises ValueError for a row key that
# is missing from fieldnames, so the header must cover all rows.
csv_columns = list(list_of_rows[0].keys()) if list_of_rows else []
for legacy_column in ('what-they-do', 'how-theyre-accessed', 'notable-included-apis'):
    if legacy_column not in csv_columns:
        csv_columns.append(legacy_column)

seen_columns = set(csv_columns)
for row in list_of_rows:
    for column in row:
        if column not in seen_columns:
            seen_columns.add(column)
            csv_columns.append(column)

if not list_of_rows:
    # Nothing was scraped; avoid indexing an empty list or writing a misleading file
    print("No rows to write; skipping", csv_file)
else:
    try:
        # newline='' is required by the csv module to avoid blank lines on
        # Windows; an explicit UTF-8 encoding keeps non-ASCII text from the
        # scraped pages from failing under a narrower platform default.
        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
            writer.writeheader()
            # Keys a row lacks are filled with DictWriter's default restval ('')
            writer.writerows(list_of_rows)
    except OSError as err:
        # IOError is an alias of OSError in Python 3; report the cause too
        print("I/O error:", err)

