In [0]:

import pandas as pd
import numpy
from bs4 import BeautifulSoup, Comment
import requests
import sqlite3
import json
import os
import urllib
import re

In [0]:
# MediaWiki Action API "query" reference: https://www.mediawiki.org/wiki/API:Query

In [0]:
# First page of Wikipedia's "Living people" category listing; scraping starts here.
LIVING_PEOPLE_MAIN_PAGE = "https://en.wikipedia.org/wiki/Category:Living_people"

In [0]:
def get_names_each_page(url, data):
  """
  Scrape one Wikipedia category page for article names and their links.

  Inputs:
    url: link to the category webpage we want to scrape
    data: list of [name, url] pairs collected so far (left unmodified)

  Outputs:
    data: new list holding the incoming pairs followed by one
      [name, absolute url] pair for every link found on this page
    next_page_url: href of the "next page" link exactly as it appears in
      the page (the caller prefixes the host), or None when the page has
      no such link
  """
  # `import urllib` alone does not guarantee that the `request` submodule
  # is loaded, so import it explicitly where it is used.
  import urllib.request

  req = urllib.request.Request(url)
  # Release the HTTP connection as soon as the body has been read.
  with urllib.request.urlopen(req) as response:
    html_doc = response.read()
  soup = BeautifulSoup(html_doc, 'html.parser')

  # Copy once and append, instead of rebuilding the whole list per link.
  data = list(data)
  categories = soup.find_all(class_=re.compile("mw-category-group"))
  for group in categories:
    # href=True skips anchors that carry no link target.
    for link in group.find_all("a", href=True):
      data.append([link.text, 'https://en.wikipedia.org' + link['href']])

  # find() returns None when there is no "next page" anchor; hand None back
  # so the caller's `while next_page_url` loop stops instead of crashing
  # with "'NoneType' object is not subscriptable".
  next_link = soup.find("a", string="next page")
  next_page_url = next_link.get('href') if next_link is not None else None

  return data, next_page_url

In [28]:
# How many "next page" links to follow after the first category page.
# Each page contains about 200 names, so 20 pages = 4200 names,
# but 20 pages reach only 'E' (the listing starts with numbers).
MAX_NEXT_PAGES = 1

data, next_page_url = get_names_each_page(LIVING_PEOPLE_MAIN_PAGE, [])
counter = 0
while next_page_url and counter < MAX_NEXT_PAGES:
    data, next_page_url = get_names_each_page('https://en.wikipedia.org' + next_page_url, data)
    counter += 1

# Naming the columns in the constructor also works when nothing was scraped;
# assigning df.columns afterwards fails on an empty frame (length mismatch).
df = pd.DataFrame(data, columns=['name', 'link'])

df

Unnamed: 0,name,link
0,2 Chainz,https://en.wikipedia.org/wiki/2_Chainz
1,2 Cold Scorpio,https://en.wikipedia.org/wiki/2_Cold_Scorpio
2,2 Pistols,https://en.wikipedia.org/wiki/2_Pistols
3,2 Tuff Tony,https://en.wikipedia.org/wiki/2_Tuff_Tony
4,2Baba,https://en.wikipedia.org/wiki/2Baba
5,Mr 2Kay,https://en.wikipedia.org/wiki/Mr_2Kay
6,2Mex,https://en.wikipedia.org/wiki/2Mex
7,2Play,https://en.wikipedia.org/wiki/2Play
8,2wenty,https://en.wikipedia.org/wiki/2wenty
9,3D Na'Tee,https://en.wikipedia.org/wiki/3D_Na%27Tee


In [0]:
def get_plain_text():
  """
  Download the plain-text intro extract for every entry in names.json.

  Reads 'names.json' from the working directory; element 0 of each entry
  is used as the page title.  One MediaWiki API request is made per title
  and the results are accumulated as {title: extract}.  Titles for which
  the API returns no extract are stored with an empty string.

  The accumulated dict is written to 'plain_text<n>.json' after every
  500th title and once more after the final title, so a trailing partial
  batch is not lost.  The dict is never reset, so each file contains every
  extract gathered up to that point.
  """
  with open('names.json', 'r') as f:
    names = json.load(f)

  S = requests.Session()

  URL = "https://en.wikipedia.org/w/api.php"

  # Number of titles between two checkpoint files.
  BATCH_SIZE = 500

  plain_text = {}

  # INTERESTING PROPERTIES
  # length of each page -- from 'length' in 'info' prop
  # number of categories the page falls into -- count 'title' from "categories"
  # categories -- from "categories"
  # number of links in each page -- count 'title' in "links"
  # number of interlanguage links from each page -- count 'lang' in "langlinks"

  for counter, e in enumerate(names):
    # info = display basic information about the given page
    # categories = list all categories the pages belong to
    # links = returns all links from the given pages
    # linkshere: find all pages that link to the given pages
    # langlinks: returns all interlanguage links from the given pages
    title = e[0]
    # NOTE(review): these parameters are built but never sent in the code
    # visible here; kept in case code outside this view relies on them.
    NORMAL_PROP_PARAMS = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "info|categories|links|linkshere|langlinks"
    }

    # extract plain text from the first paragraph of the page
    # https://stackoverflow.com/questions/4452102/how-to-get-plain-text-out-of-wikipedia
    PLAIN_TEXT_PARAMS = {
        "action": "query",
        "format": "json",
        "titles": title,
        'prop': 'extracts',
        'exintro': True,
        'explaintext': True,
    }


    # Nine: I tried looking at the categoryinfo prop and it did not give much
    # info about each page (aka not many pages have the info)

    r = S.get(url=URL, params=PLAIN_TEXT_PARAMS)
    response = r.json()
    # One title was requested, so only the first entry of 'pages' is read.
    page = next(iter(response['query']['pages'].values()))
    # Entries returned without an 'extract' key (e.g. a title that no longer
    # exists) must not abort the whole run with a KeyError.
    plain_text[title] = page.get('extract', '')

    done = counter + 1
    # Checkpoint on every full batch and also after the very last title,
    # otherwise the titles past the last multiple of BATCH_SIZE are never saved.
    if done % BATCH_SIZE == 0 or done == len(names):
        # Ceiling division: a trailing partial batch gets the next file number.
        n = str((done + BATCH_SIZE - 1) // BATCH_SIZE)
        with open('plain_text' + n + '.json', 'w') as fp:
            json.dump(plain_text, fp)