In [378]:
# Loading required packages
import pandas as pd
import json, time, urllib.parse
import requests

# Data Acquisition

## Step 1: Getting the Article, Population and Region Data

### Getting Article Data

Here we load the csv with the relevant articles for this assignment and save the article names as a list.

In [379]:
articles_df = pd.read_csv('/Users/zach/Jupyter/DATA 512/us_cities_by_state_SEPT.2023.csv')
# Note that duplicates are dropped based on the article name
articles_df = articles_df.drop_duplicates(subset=['page_title'])
articles = articles_df['page_title'].tolist()

Below we define the constants for the API pull to get the article information

In [380]:
#########
#
#    CONSTANTS
#

# The basic English Wikipedia API endpoint
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"

# We'll assume that there needs to be some throttling for these requests - we should always be nice to a free data resource
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making automated requests we should include something that is unique to the person making the request
# This should include an email - your UW email would be good to put in there
REQUEST_HEADERS = {
    'User-Agent': '<zprice12@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2023',
}

# This is just a list of English Wikipedia article titles that we can use for example requests
ARTICLE_TITLES = articles

# This is a string of additional page properties that can be returned see the Info documentation for
# what can be included. If you don't want any this can simply be the empty string
PAGEINFO_EXTENDED_PROPERTIES = "talkid|url|watched|watchers"
#PAGEINFO_EXTENDED_PROPERTIES = ""

# This template lists the basic parameters for making this
PAGEINFO_PARAMS_TEMPLATE = {
    "action": "query",
    "format": "json",
    "titles": "",           # to simplify this should be a single page title at a time
    "prop": "info",
    "inprop": PAGEINFO_EXTENDED_PROPERTIES
}


This defines the function for requesting page information from the API for an article.

In [381]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_pageinfo_per_article(article_title = None, 
                                 endpoint_url = API_ENWIKIPEDIA_ENDPOINT, 
                                 request_template = PAGEINFO_PARAMS_TEMPLATE,
                                 headers = REQUEST_HEADERS):
    
    # article title can be as a parameter to the call or in the request_template
    if article_title:
        request_template['titles'] = article_title

    if not request_template['titles']:
        raise Exception("Must supply an article title to make a pageinfo request.")

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or any other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(endpoint_url, headers=headers, params=request_template)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response


Below we take each article of interest, get the page information from the API, and then save that article and the revision ID in a dataframe.

In [None]:
info_df = pd.DataFrame(columns=['article', 'revid'])
for art in ARTICLE_TITLES:
    info = request_pageinfo_per_article(art)
    num = str(list(info['query']['pages'].keys())[0])
    revidAdd = info['query']['pages'][num].get('lastrevid')
    info_df.loc[len(info_df.index)] = [art, revidAdd]

### Getting State Data

Here we read in the state population data.

In [382]:
pop_df = pd.read_csv('/Users/zach/Jupyter/DATA 512/state_pops.csv')

### Getting Region Data

Here we read in the region data.

In [383]:
regions_df = pd.read_csv('/Users/zach/Jupyter/DATA 512/us_regions.csv')

## Step 2: Getting Article Quality Predictions

Below we define the constants for the API call to get an ORES score for an article

In [None]:
#########
#
#    CONSTANTS
#

#    The current LiftWing ORES API endpoint and prediction model
#
API_ORES_LIFTWING_ENDPOINT = "https://api.wikimedia.org/service/lw/inference/v1/models/{model_name}:predict"
API_ORES_EN_QUALITY_MODEL = "enwiki-articlequality"

#
#    The throttling rate is a function of the Access token that you are granted when you request the token. The constants
#    come from dissecting the token and getting the rate limits from the granted token. An example of that is below.
#
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (60.0/5000.0)-API_LATENCY_ASSUMED

#    When making automated requests we should include something that is unique to the person making the request
#    This should include an email - your UW email would be good to put in there
#    
#    Because all LiftWing API requests require some form of authentication, you need to provide your access token
#    as part of the header too
#
REQUEST_HEADER_TEMPLATE = {
    'User-Agent': "<zprice12@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2023",
    'Content-Type': 'application/json',
    'Authorization': "Bearer {access_token}"
}
#
#    This is a template for the parameters that we need to supply in the headers of an API request
#
REQUEST_HEADER_PARAMS_TEMPLATE = {
    'email_address' : "",         # your email address should go here
    'access_token'  : ""          # the access token you create will need to go here
}

#
#    A dictionary of English Wikipedia article titles (keys) and sample revision IDs that can be used for this ORES scoring example
#
ARTICLE_REVISIONS = dict(zip(info_df.article, info_df.revid))

#
#    This is a template of the data required as a payload when making a scoring request of the ORES model
#
ORES_REQUEST_DATA_TEMPLATE = {
    "lang":        "en",     # required that its english - we're scoring English Wikipedia revisions
    "rev_id":      "",       # this request requires a revision id
    "features":    True
}

#
#    These are used later - defined here so they, at least, have empty values
#
USERNAME = ""
ACCESS_TOKEN = ""
#

Here we save the access token for the API.

In [384]:
ACCESS_TOKEN = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJhdWQiOiIzN2JlNGY4NzFmZDViNzU1Njc1NmI2ZTY1ZWQ1MWMwMyIsImp0aSI6IjdkZDVlZmZkNzdiZjZlZmQ2YmE5M2UyNmFkYmNmN2EwNjczYzI4Y2Q4ZGQyMDkyYTlhZmU2YjhkYTJmY2UyMDMxNDU3ODY0MzI3ZjE1MjdkIiwiaWF0IjoxNjk3NDMzMDA5LjgyNzIxOCwibmJmIjoxNjk3NDMzMDA5LjgyNzIyMiwiZXhwIjozMzI1NDM0MTgwOS44MjU5NzQsInN1YiI6Ijc0MDE0Mzg5IiwiaXNzIjoiaHR0cHM6Ly9tZXRhLndpa2ltZWRpYS5vcmciLCJyYXRlbGltaXQiOnsicmVxdWVzdHNfcGVyX3VuaXQiOjUwMDAsInVuaXQiOiJIT1VSIn0sInNjb3BlcyI6WyJiYXNpYyJdfQ.EE1Xieff9g50pThxSrNZErIir5yqIy7bB1Dghd7esgINFKnLqePLYw6ZOeiQW9cA9yyojVrij-6XdwrvODssZzNbLAe-HGj8CVBSqZHuG2FWlxT0fEkl4GkuDP510CzDeWq2HRBAFNghes37N4H9N4I6c9V80CnMvVIEvPueUUPP6yxMMwsVCOyabJNedZhfGXYbQ4ZGKLpENIm3xu_JM5YF5mVW5pXdhx355w1NF41eRt0OKKI10XZCbrHKMDZz7qMkQvHf8dc1mJuoE0JCNSKVCSxSZgiu0WYr6J8Urey9gfUdEazASbkFL7bYrV0ZDBW12xSdg0ynWkB-z_jrFkTncrvGG-iXuJIbSS_h7Rnq2oCe4HNaPodeGVSRKIptkirQWf0mxqsvhIaVTjtkeK-qPu-NrTjDToNbeYyD2HrWxtPHtAA0Oiy17_0foDo1lJmIDyGOZRUlZLVd-e0jNeHFTKhR80tC4pdJkBb87wSTbDhTBagSTLSZ0qgbSa3Elj-ndfBzHyf-TCl_KgeHTy-2Air1m7-KwCDZXaEdtZJzZ3fgFbIh1ATIvqRNtpxumHu_YTJt2uysphPyKsr9XlUAUPPUkg-FM-k1Lt4c3iyLnZ3ZW_oXS-JyBU21Ng2V_hc9cqxOm4vQNegQERrw92LRuFxVpIpUrdowfmV0_04'

Below we define the function for retrieving an ORES score for an article from the API.

In [None]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_ores_score_per_article(article_revid = None, email_address=None, access_token=None,
                                   endpoint_url = API_ORES_LIFTWING_ENDPOINT, 
                                   model_name = API_ORES_EN_QUALITY_MODEL, 
                                   request_data = ORES_REQUEST_DATA_TEMPLATE, 
                                   header_format = REQUEST_HEADER_TEMPLATE, 
                                   header_params = REQUEST_HEADER_PARAMS_TEMPLATE):
    
    #    Make sure we have an article revision id, email and token
    #    This approach prioritizes the parameters passed in when making the call
    if article_revid:
        request_data['rev_id'] = article_revid
    if email_address:
        header_params['email_address'] = email_address
    if access_token:
        header_params['access_token'] = access_token
    
    #   Making a request requires a revision id - an email address - and the access token
    if not request_data['rev_id']:
        raise Exception("Must provide an article revision id (rev_id) to score articles")
    if not header_params['email_address']:
        raise Exception("Must provide an 'email_address' value")
    if not header_params['access_token']:
        raise Exception("Must provide an 'access_token' value")
    
    # Create the request URL with the specified model parameter - default is a article quality score request
    request_url = endpoint_url.format(model_name=model_name)
    
    # Create a compliant request header from the template and the supplied parameters
    headers = dict()
    for key in header_format.keys():
        headers[str(key)] = header_format[key].format(**header_params)
    
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free data
        # source like ORES - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        #response = requests.get(request_url, headers=headers)
        response = requests.post(request_url, headers=headers, data=json.dumps(request_data))
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

Here we calculate the ORES score for each article of interest using the article title and revision ID. If an ORES score can't be calculated, we save that article in a separate dataframe for later consideration.

In [None]:
info_final_df = pd.DataFrame(columns=['article_title', 'revision_id', 'article_quality'])
info_fail_df = pd.DataFrame(columns=['article_title', 'revision_id'])
for art in list(ARTICLE_REVISIONS.keys()):
    try:
        score = request_ores_score_per_article(article_revid=ARTICLE_REVISIONS[art],
                                       email_address="zprice12@uw.edu",
                                       access_token=ACCESS_TOKEN)
        score_val = score['enwiki']['scores'][str(ARTICLE_REVISIONS[art])]['articlequality']['score']['prediction']
        info_final_df.loc[len(info_final_df.index)] = [art, ARTICLE_REVISIONS[art], score_val]
    except:
        info_fail_df.loc[len(info_fail_df.index)] = [art, ARTICLE_REVISIONS[art]]

Below we see that the info_fail_df dataframe is empty, implying we were able to get scores for all of the articles

In [386]:
info_fail_df

Unnamed: 0,article_title,revision_id


## Step 3: Combining the Datasets

Here we create a new column for state of each article using the oringal csv with article names

In [387]:
info_final_df['state'] = articles_df['state']

Below we merge the article info with the region data.

In [388]:
all_info_df = info_final_df.merge(regions_df, left_on=['state'], right_on=['STATE'], how='left')

Below we merge the article info with the population data.

In [389]:
all_info_df = all_info_df.merge(pop_df, on=['state'], how='left')
# Reorder columns
all_info_df = all_info_df[['state','DIVISION', 'population', 'article_title', 'revision_id', 'article_quality']]
# Rename region column
all_info_df = all_info_df.rename(columns={'DIVISION': 'regional_division'})

Here we're finding the list of states that did not have an article present

In [390]:
states_used = all_info_df.state.unique().tolist()
states = pop_df.state.tolist()
list(set(states) - set(states_used))

['Nebraska',
 'New Hampshire',
 'District of Columbia',
 'North Dakota',
 'South Dakota',
 'Puerto Rico',
 'North Carolina',
 'New Jersey',
 'Connecticut',
 'New York',
 'New Mexico',
 'Rhode Island',
 'Georgia',
 'South Carolina',
 'West Virginia']

Notice that any state with a space does not have match. This is because our data used underscore instead of a space, we fix this wiht the code below. Also notice that non-states, such as District of Columbia and Puerto Rico do not have matches. Georgia has a strange spelling and also needs a mapping. Lastly, Nebraska and Connecticut simply don't have matches.

In [391]:
state_map = {'Georgia_(U.S._state)':'Georgia', 'New_Hampshire':'New Hampshire', 'North_Dakota':'North Dakota', 
            'South_Dakota':'South Dakota', 'North_Carolina':'North Carolina', 'New_Jersey':'New Jersey', 
            'New_York':'New York', 'New_Mexico':'New Mexico', 'South_Carolina': 'South Carolina', 
             'West_Virginia':'West Virginia', 'Rhode_Island':'Rhode Island'}
all_info_df['state'] = all_info_df['state'].replace(state_map)

Below we convert population to float type for later calculations.

In [392]:
all_info_df['population'] = all_info_df['population'].str.replace(',','').astype('float')

Here we drop duplicates of our final dataset as a final cleaning step.

In [393]:
all_info_df = all_info_df.drop_duplicates()

Here we save the article information file.

In [394]:
all_info_df.to_csv(r'/Users/zach/Jupyter/DATA 512/wp_scored_city_articles_by_state.csv', index=False)

## Step 4: Analysis / Step 5: Results

### 1. Top 10 US States By Coverage

Below we get the count of articles by state.

In [395]:
grp_df = all_info_df.groupby(['state'], as_index=False)['revision_id'].count()

Here we merge the population data for each state to its article count.

In [396]:
grp_df = grp_df.merge(pop_df, on=['state'], how='left')

Here we calculate articles per capita in each state.

In [397]:
grp_df['art_cap'] = grp_df['revision_id'].astype('float')/grp_df['population'].str.replace(',','').astype('float')

This displays the top 10 US states by coverage per capita.

In [398]:
grp_df.drop(columns=['revision_id','population'], axis=1).sort_values(by=['art_cap'], ascending=False)[:10]

Unnamed: 0,state,art_cap
42,Vermont,0.000508
31,North Dakota,0.000457
17,Maine,0.000349
38,South Dakota,0.000342
13,Iowa,0.000326
1,Alaska,0.000203
35,Pennsylvania,0.000197
20,Michigan,0.000177
47,Wyoming,0.00017
26,New Hampshire,0.000168


### 2. Bottom 10 US States By Coverage

This displays the bottom 10 US states by coverage per capita.

In [399]:
grp_df.drop(columns=['revision_id','population'], axis=1).sort_values(by=['art_cap'], ascending=True)[:10]

Unnamed: 0,state,art_cap
30,North Carolina,5e-06
25,Nevada,6e-06
4,California,1.2e-05
2,Arizona,1.2e-05
43,Virginia,1.5e-05
7,Florida,1.9e-05
33,Oklahoma,1.9e-05
14,Kansas,2.1e-05
18,Maryland,2.5e-05
46,Wisconsin,3.2e-05


### 3. Top 10 US States By High Quality

Here we calculate the number of featured articles and good articles per state and keep the group dataframe.

In [400]:
ga_df = all_info_df[all_info_df['article_quality']=='GA']
grp_df = ga_df.groupby(['state'], as_index=False)['revision_id'].count()
fa_df = all_info_df[all_info_df['article_quality']=='FA']
grp_df2 = fa_df.groupby(['state'], as_index=False)['revision_id'].count()
grp_df = grp_df.merge(grp_df2, on=['state'], how='left')
grp_df['revision_id_y'] = grp_df['revision_id_y'].fillna(value=0.0)
grp_df['revision_id'] = grp_df['revision_id_x'] + grp_df['revision_id_y']

Here we merge population data with the grouped dataframe and calculate articles per capita.

In [401]:
grp_df = grp_df.merge(pop_df, on=['state'], how='left')
grp_df['art_cap'] = grp_df['revision_id'].astype('float')/grp_df['population'].str.replace(',','').astype('float')

Below we display the top 10 US States by high quality articles per capita.

In [402]:
grp_df.drop(columns=['revision_id','population', 'revision_id_x', 'revision_id_y',], axis=1).sort_values(by=['art_cap'], ascending=False)[:10]

Unnamed: 0,state,art_cap
42,Vermont,7e-05
47,Wyoming,6.7e-05
38,South Dakota,6.2e-05
45,West Virginia,6e-05
24,Montana,4.9e-05
26,New Hampshire,4.5e-05
35,Pennsylvania,4.4e-05
23,Missouri,4.3e-05
1,Alaska,4.2e-05
27,New Jersey,4.1e-05


### 4. Bottom 10 US States By High Quality

Below we display the bottom 10 US States by high quality articles per capita.

In [403]:
grp_df.drop(columns=['revision_id','population', 'revision_id_x', 'revision_id_y',], axis=1).sort_values(by=['art_cap'], ascending=True)[:10]

Unnamed: 0,state,art_cap
30,North Carolina,2e-06
43,Virginia,2e-06
25,Nevada,3e-06
2,Arizona,3e-06
4,California,4e-06
7,Florida,5e-06
29,New York,6e-06
18,Maryland,7e-06
14,Kansas,7e-06
33,Oklahoma,8e-06


### 5. Census Divisions By Total Coverage

Below we merge the regional data with state populations to get a grouped dataframe that has the population for each region.

In [404]:
region_pops = regions_df.merge(pop_df, left_on=['STATE'], right_on=['state']).drop(columns=['STATE'], axis=1)
region_pops['population'] = region_pops['population'].str.replace(',','').astype('float')
region_pops = region_pops.groupby(['DIVISION'], as_index=False)['population'].sum().rename(columns={'DIVISION': 'regional_division'})

Here we calculate the articles for each region.

In [405]:
grp_df = all_info_df.groupby(['regional_division'], as_index=False)['revision_id'].count()

Below we merge the region populations with the article counts for each region and calculate articles per capita.

In [406]:
grp_df = grp_df.merge(region_pops, on=['regional_division'], how='left')
grp_df['art_cap'] = grp_df['revision_id'].astype('float')/grp_df['population']

Here we display the regions in descending order by articles per capita.

In [407]:
grp_df.drop(columns=['revision_id','population'], axis=1).sort_values(by=['art_cap'], ascending=False)

Unnamed: 0,regional_division,art_cap
7,West North Central,0.000134
0,East North Central,0.000101
1,East South Central,7.8e-05
4,New England,7.7e-05
2,Middle Atlantic,6.1e-05
8,West South Central,5e-05
3,Mountain,4.2e-05
5,Pacific,2.4e-05
6,South Atlantic,1.1e-05


### 6. Census Divisions By High Quality Coverage

Here we calculate the number of featured articles and good articles per region and keep the grouped dataframe.

In [408]:
ga_df = all_info_df[all_info_df['article_quality']=='GA']
grp_df = ga_df.groupby(['regional_division'], as_index=False)['revision_id'].count()
fa_df = all_info_df[all_info_df['article_quality']=='FA']
grp_df2 = fa_df.groupby(['regional_division'], as_index=False)['revision_id'].count()
grp_df = grp_df.merge(grp_df2, on=['regional_division'], how='left')
grp_df['revision_id_y'] = grp_df['revision_id_y'].fillna(value=0.0)
grp_df['revision_id'] = grp_df['revision_id_x'] + grp_df['revision_id_y']

Below we merge the regional data with state populations to get a grouped dataframe that has the population for each region.

In [409]:
region_pops = regions_df.merge(pop_df, left_on=['STATE'], right_on=['state']).drop(columns=['STATE'], axis=1)
region_pops['population'] = region_pops['population'].str.replace(',','').astype('float')
region_pops = region_pops.groupby(['DIVISION'], as_index=False)['population'].sum().rename(columns={'DIVISION': 'regional_division'})

Below we merge the region populations with the article counts for each region and calculate high quality articles per capita.

In [410]:
grp_df = grp_df.merge(region_pops, on=['regional_division'], how='left')
grp_df['art_cap'] = grp_df['revision_id'].astype('float')/grp_df['population']

Here we display the regions in descending order of high quality articles per capita.

In [411]:
grp_df.drop(columns=['revision_id','population', 'revision_id_x', 'revision_id_y'], axis=1).sort_values(by=['art_cap'], ascending=False)

Unnamed: 0,regional_division,art_cap
7,West North Central,2.6e-05
1,East South Central,1.6e-05
8,West South Central,1.5e-05
0,East North Central,1.5e-05
2,Middle Atlantic,1.4e-05
3,Mountain,1.2e-05
4,New England,1e-05
5,Pacific,9e-06
6,South Atlantic,3e-06
