In [37]:
import json
import requests
import numpy as np
import time, datetime
from tqdm import tqdm

In [47]:
# Short university label -> title of the Wikipedia category that is queried
# for that university's alumni pages.
universities = {
    'CMU': "Category:Carnegie_Mellon_University_alumni",
    'Stanford': "Category:Stanford_University_alumni",
    'Harvard': "Category:Harvard_University_alumni",
    'Yale': "Category:Yale_University_alumni",
    'UCLA': "Category:University_of_California,_Los_Angeles_alumni",
    'MIT': "Category:Massachusetts_Institute_of_Technology_alumni",
    'Pitt': "Category:University_of_Pittsburgh_alumni",
}

In [48]:
def get_members(cat_name, cont=None):
    """Return the titles of every member of a Wikipedia category.

    Queries the MediaWiki ``categorymembers`` list on en.wikipedia.org,
    asking for up to 500 titles per request and following ``cmcontinue``
    tokens until the response no longer carries a ``continue`` block.

    Args:
        cat_name: Category title sent as ``cmtitle``
            (e.g. ``"Category:Yale_University_alumni"``).
        cont: Optional ``cmcontinue`` token to resume from; ``None`` starts
            with the first result page.

    Returns:
        list[str]: Member titles. Later result pages are placed before
        earlier ones, which is the order this function has always returned.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        KeyError: If the payload lacks ``query`` / ``categorymembers``.
    """
    url = 'https://en.wikipedia.org/w/api.php'
    params = {
        'action': 'query',
        'format': 'json',
        'list': 'categorymembers',
        'cmlimit': 500,
        'cmprop': 'title',
        'cmtitle': cat_name,
    }
    # Paginate with a loop rather than one recursive call per page, so the
    # number of pages is not bounded by the interpreter's recursion limit.
    pages = []
    while True:
        if cont is not None:
            params['cmcontinue'] = cont
        # A timeout keeps a stalled connection from hanging the whole scrape.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        data = json.loads(response.text)
        pages.append([x['title'] for x in data['query']['categorymembers']])
        if 'continue' not in data:
            break
        cont = data['continue']['cmcontinue']
    # Emit the last page first to keep the historical result ordering.
    ret_val = []
    for page in reversed(pages):
        ret_val += page
    return ret_val
def get_all_subcategories(cat_name, cont=None):
    """Return the titles of the subcategories listed directly in a category.

    Queries the MediaWiki ``categorymembers`` list on en.wikipedia.org with
    ``cmtype=subcat``, asking for up to 500 entries per request and following
    ``cmcontinue`` tokens until the response has no ``continue`` block.
    Only ``cat_name`` itself is queried; the returned subcategories are not
    descended into.

    Args:
        cat_name: Category title sent as ``cmtitle``.
        cont: Optional ``cmcontinue`` token to resume from; ``None`` starts
            with the first result page.

    Returns:
        list[str]: Subcategory titles. Later result pages are placed before
        earlier ones, which is the order this function has always returned.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        KeyError: If the payload lacks ``query`` / ``categorymembers``.
    """
    url = 'https://en.wikipedia.org/w/api.php'
    params = {
        'action': 'query',
        'format': 'json',
        'list': 'categorymembers',
        'cmlimit': 500,
        'cmtype': 'subcat',
        'cmtitle': cat_name,
    }
    # Paginate with a loop rather than one recursive call per page, so the
    # number of pages is not bounded by the interpreter's recursion limit.
    pages = []
    while True:
        if cont is not None:
            params['cmcontinue'] = cont
        # A timeout keeps a stalled connection from hanging the whole scrape.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        data = json.loads(response.text)
        pages.append([x['title'] for x in data['query']['categorymembers']])
        if 'continue' not in data:
            break
        cont = data['continue']['cmcontinue']
    # Emit the last page first to keep the historical result ordering.
    ret_val = []
    for page in reversed(pages):
        ret_val += page
    return ret_val
def get_all_members(cat_name):
    """Collect page titles from a category and each of its subcategories.

    The subcategories reported by ``get_all_subcategories`` are visited
    first, in the order given, followed by ``cat_name`` itself. Titles that
    start with ``"Category:"`` are removed from the combined result.

    Args:
        cat_name: Title of the top-level category.

    Returns:
        list[str]: The remaining titles in visit order; a title that shows up
        under more than one visited category is repeated.
    """
    collected = []
    for category in get_all_subcategories(cat_name) + [cat_name]:
        collected.extend(get_members(category))
    return [title for title in collected if not title.startswith("Category:")]
def get_info(titles):
    """Fetch plain-text intro extracts for a batch of Wikipedia pages.

    Sends a ``prop=extracts`` query (``exintro`` and ``explaintext`` set) to
    en.wikipedia.org and keeps requesting with the returned ``continue``
    values until the API stops sending a ``continue`` block, so extracts that
    arrive in later responses are collected as well.

    Args:
        titles: Page titles joined with ``'|'``, passed as the ``titles``
            parameter.

    Returns:
        dict[str, str]: Page title -> extract. Pages that come back without
        an extract (for example missing pages) or with an empty one are
        left out. An empty dict is returned when the response has no pages.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
    """
    url = 'https://en.wikipedia.org/w/api.php'
    base_params = {
        'action': 'query',
        'format': 'json',
        'prop': 'extracts',
        'exintro': '',
        'explaintext': '',
        'titles': titles,
    }
    ret_val = dict()
    continuation = {}
    while True:
        # A timeout keeps a stalled connection from hanging the whole scrape.
        response = requests.get(url, params={**base_params, **continuation}, timeout=30)
        response.raise_for_status()
        data = json.loads(response.text)
        # 'query' is absent when nothing was requested; treat that as no pages.
        pages = data.get('query', {}).get('pages', {})
        for page in pages.values():
            # Not every returned page carries an 'extract' key, so look it up
            # defensively instead of indexing.
            extract = page.get('extract')
            if extract:
                ret_val[page['title']] = extract
        if 'continue' not in data:
            break
        continuation = data['continue']
    return ret_val
def get_all_info(title_list, max_batch_chars=200):
    """Fetch extracts for many titles by sending them to ``get_info`` in batches.

    Titles are accumulated into a ``'|'``-joined string. A batch is sent as
    soon as the accumulated text (each earlier title plus its separator)
    together with the next title reaches ``max_batch_chars`` characters.
    Whatever is still pending when the input is exhausted is sent as a final
    batch, so trailing titles are not lost.

    Args:
        title_list: Iterable of page titles.
        max_batch_chars: Character threshold at which a batch is closed.

    Returns:
        dict: Merge of every ``get_info`` result; for a repeated key the
        value from the later batch wins.
    """
    data = dict()
    batch = []
    batch_len = 0  # length of the pending titles, each followed by one '|'
    for title in tqdm(title_list):
        batch.append(title)
        if batch_len + len(title) < max_batch_chars:
            batch_len += len(title) + 1
        else:
            # update() merges in place instead of rebuilding the whole dict
            # for every batch.
            data.update(get_info('|'.join(batch)))
            batch = []
            batch_len = 0
    if batch:
        # Flush the last, partially filled batch.
        data.update(get_info('|'.join(batch)))
    return data
def get_all_data(universities, append=False, path='data.json'):
    """Scrape every listed category and save the results as one JSON file.

    For each ``label -> category title`` pair the label is printed, the
    category's member titles are gathered with ``get_all_members`` and their
    extracts with ``get_all_info``. The combined mapping is written to
    ``path`` once all categories have been processed.

    Args:
        universities: Mapping of label to Wikipedia category title.
        append: When true, start from the JSON already stored at ``path``;
            entries for labels processed in this run replace stored ones.
        path: File to read (when ``append`` is true) and to write.

    Returns:
        None. The result is only written to ``path``.

    Raises:
        FileNotFoundError: If ``append`` is true and ``path`` does not exist.
    """
    data = dict()
    if append:
        with open(path, 'r') as f:
            data = json.load(f)
    for key, value in universities.items():
        print(key)
        data[key] = get_all_info(get_all_members(value))
    with open(path, 'w') as fp:
        json.dump(data, fp)

In [49]:
# Scrape every category in `universities` over the network and write the result to data.json.
get_all_data(universities)

CMU


100%|██████████| 801/801 [00:09<00:00, 86.31it/s]


Stanford


100%|██████████| 3865/3865 [00:46<00:00, 82.35it/s]


Harvard


100%|██████████| 19702/19702 [04:31<00:00, 72.68it/s]


Yale


100%|██████████| 7408/7408 [01:38<00:00, 75.57it/s]


UCLA


100%|██████████| 2726/2726 [00:34<00:00, 79.61it/s]


MIT


100%|██████████| 2995/2995 [00:37<00:00, 79.19it/s]


Pitt


100%|██████████| 796/796 [00:10<00:00, 77.38it/s]
