#### Wikipedia Scraping

The Wikipedia API gives us the metadata of each company's Wikipedia page.

In [None]:
import pandas as pd
import wikipedia
import wptools
import requests
import datetime
import numpy as np
import bs4

In [None]:
# Load the basic information of the companies to look up; the 'name' column
# is what gets searched on Wikipedia.
df = pd.read_csv('latest.csv')
# Scraped Wikipedia facts, keyed by the company's row label in df.
wikis = dict()
# One shared HTTP session for every MediaWiki API call below.
S = requests.Session()
URL = "https://en.wikipedia.org/w/api.php"
# Scrape data from the Wikipedia API.  A company is stored in `wikis` only
# when every lookup below succeeds: any error raised inside the `try` skips
# the company with a "no wiki" message, so stored records are always complete.
for i in df.index:
    wiki = dict()
    search = wikipedia.search(df['name'][i])
    try:
        # The top search hit is the candidate page; an empty search result
        # raises IndexError here and the company is reported as "no wiki".
        title = search[0]
        page = wptools.page(title)
        print(i, "wiki")
        # Reuse the page object built above rather than constructing a second one.
        query = page.get_query()
        # Accept the hit only when the first word of the company name equals
        # the first word of the page title (case-insensitive).
        if df['name'][i].lower().split()[0] == query.data['title'].lower().split()[0]:
            # company information
            wiki['wikipedia'] = query.data['url']
            wiki['wiki_summary'] = query.data['extext']

            # meta information
            # page size
            PARAMS1 = {
                "format": "json",
                "action": "query",
                "prop": "info",
                "titles": title,
            }
            info = S.get(url=URL, params=PARAMS1).json()
            # A single title was requested, so the first key is used as the
            # page id in this and the later "query" responses.
            page_id = list(info['query']['pages'].keys())[0]
            wiki['page_size_bytes'] = info['query']['pages'][page_id]['length']

            # edits
            # At most 500 revisions are requested, so the statistics below
            # describe only the revisions returned by this one call.
            PARAMS2 = {
                "format": "json",
                "action": "query",
                "prop": "revisions",
                "titles": title,
                "rvprop": "ids|timestamp|size|userid|comment",
                "rvlimit": 500,
            }
            revisions = S.get(url=URL, params=PARAMS2).json()
            edits = pd.DataFrame(revisions['query']['pages'][page_id]['revisions'])
            # Parse the whole column in one step.  Writing element by element
            # through edits['timestamp'][index] is chained assignment, which
            # pandas does not guarantee to write back into `edits`.
            edits['timestamp'] = pd.to_datetime(edits['timestamp'], format='%Y-%m-%dT%H:%M:%SZ')
            # Row 0 is treated as the latest edit and each row is compared with
            # the next one -- assumes the API lists revisions newest first
            # (TODO confirm).
            wiki['latest_edit_time'] = edits['timestamp'].iloc[0].strftime("%Y-%m-%d")
            # With a single revision the mean gap is undefined and int() raises,
            # which skips the company like any other failure.
            wiki['avg_day_between_edits'] = int(
                edits['timestamp'].diff(periods=-1).mean() / np.timedelta64(1, 'D')
            )
            edits['year'] = edits['timestamp'].dt.strftime('%Y')
            wiki['avg_edits_per_year'] = int(edits.groupby('year').size().mean())
            for year in ('2019', '2018', '2017'):
                wiki['num_edits_' + year] = len(edits[edits['year'] == year])

            # page views
            PARAMS3 = {
                "format": "json",
                "action": "query",
                "prop": "pageviews",
                "titles": title,
                "pvipdays": 60,
            }
            pageviews = S.get(url=URL, params=PARAMS3).json()
            views = pd.DataFrame.from_dict(
                pageviews['query']['pages'][page_id]['pageviews'],
                orient='index',
                columns=['pageviews'],
            )
            wiki['pageviews_60d'] = views['pageviews'].sum()

            # sections
            PARAMS4 = {
                "format": "json",
                "action": "parse",
                "prop": "sections",
                "page": title,
            }
            sections = S.get(url=URL, params=PARAMS4).json()
            wiki['num_sections'] = len(sections['parse']['sections'])

            # redirect links
            PARAMS5 = {
                "format": "json",
                "action": "query",
                "prop": "redirects",
                "titles": title,
            }
            redirects = S.get(url=URL, params=PARAMS5).json()
            wiki['num_redir_links'] = len(redirects['query']['pages'][page_id]['redirects'])

            # references
            PARAMS6 = {
                "format": "json",
                "action": "parse",
                "prop": "text",
                "page": title,
            }
            text = S.get(url=URL, params=PARAMS6).json()
            # Name the parser explicitly so the result does not depend on which
            # optional parsers happen to be installed.
            html = bs4.BeautifulSoup(text['parse']['text']['*'], 'html.parser')
            references = html.find_all('ol', class_="references")
            # With exactly two reference lists the second one is counted,
            # otherwise the first (presumably the first of two holds notes
            # rather than citations -- TODO confirm).  A page without any
            # reference list raises IndexError and is skipped.
            if len(references) == 2:
                wiki['num_references'] = len(references[1].find_all('li'))
            else:
                wiki['num_references'] = len(references[0].find_all('li'))

            wikis[i] = wiki
        else:
            print(i, "name unmatch")
    except Exception:
        # Deliberately broad: any lookup or parsing failure drops the company.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt able
        # to stop the long-running scrape.
        print(i, "no wiki")
# Report how many companies ended up with a complete Wikipedia record
# (1831 in the original run).  A bare `len(wikis)` in the middle of a cell
# displays nothing, so the count is printed explicitly.
print(len(wikis))
# Build the result table: one row per company, one column per scraped field.
output = pd.DataFrame(wikis).transpose()
# output.to_csv('wikipedia_new.csv')
# Attach the 2017 edit count to the D&B company list.  The left join matches
# on the row index, so it assumes 'D&B_dataset.csv' rows line up with the row
# labels of 'latest.csv' -- TODO confirm.
dataset = pd.read_csv('D&B_dataset.csv')
dataset = dataset.join(output[['num_edits_2017']], how='left')
#dataset.to_excel('D&B_dataset.xlsx')