# Data 512 HW2: Considering Bias in Data

Import relevant libraries

In [1]:
import os, json, time, urllib.parse
from dotenv import load_dotenv  # for API key
import requests
import pandas as pd

### Step 1: Getting the Article and Population Data
Wikipeidia articles of politicians and data for country populations

In [2]:
politicians_by_country = pd.read_csv("/Users/dianechiang/Desktop/data_512/data-512-homework_2/politicians_by_country_AUG.2024.csv")
politicians_by_country.head()

Unnamed: 0,name,url,country
0,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan
1,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan
2,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan
3,Khadija Zahra Ahmadi,https://en.wikipedia.org/wiki/Khadija_Zahra_Ah...,Afghanistan
4,Aziza Ahmadyar,https://en.wikipedia.org/wiki/Aziza_Ahmadyar,Afghanistan


In [45]:
population_by_country = pd.read_csv("/Users/dianechiang/Desktop/data_512/data-512-homework_2/population_by_country_AUG.2024.csv")
population_by_country.head()

Unnamed: 0,Geography,Population
0,WORLD,8009.0
1,AFRICA,1453.0
2,NORTHERN AFRICA,256.0
3,Algeria,46.8
4,Egypt,105.2


## Step 2: Getting Article Quality Predictions
Get the predicted quality scores for each article in Wikipedia dataset

In [4]:
#########
#
#    CONSTANTS
#

# The basic English Wikipedia API endpoint
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"
API_HEADER_AGENT = 'User-Agent'

# We'll assume that there needs to be some throttling for these requests - we should always be nice to a free data resource
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making automated requests we should include something that is unique to the person making the request
# This should include an email - your UW email would be good to put in there
REQUEST_HEADERS = {
    'User-Agent': '<tchiang0@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2024'
}

# This is a string of additional page properties that can be returned see the Info documentation for
# what can be included. If you don't want any this can simply be the empty string
PAGEINFO_EXTENDED_PROPERTIES = "talkid|url|watched|watchers"
#PAGEINFO_EXTENDED_PROPERTIES = ""

# This template lists the basic parameters for making this
PAGEINFO_PARAMS_TEMPLATE = {
    "action": "query",
    "format": "json",
    "titles": "",           # to simplify this should be a single page title at a time
    "prop": "info",
    "inprop": PAGEINFO_EXTENDED_PROPERTIES
}

In [5]:
def request_pageinfo_per_article(article_title = None, 
                                 endpoint_url = API_ENWIKIPEDIA_ENDPOINT, 
                                 request_template = PAGEINFO_PARAMS_TEMPLATE,
                                 headers = REQUEST_HEADERS):
    """
    Get page info of each politian article
    """
    # article title can be as a parameter to the call or in the request_template
    if article_title:
        request_template['titles'] = article_title

    if not request_template['titles']:
        raise Exception("Must supply an article title to make a pageinfo request.")

    if API_HEADER_AGENT not in headers:
        raise Exception(f"The header data should include a '{API_HEADER_AGENT}' field that contains your UW email address.")

    if 'uwnetid@uw' in headers[API_HEADER_AGENT]:
        raise Exception(f"Use your UW email address in the '{API_HEADER_AGENT}' field.")

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or any other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(endpoint_url, headers=headers, params=request_template)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

In [6]:
# List of English Wikipedia article titles that we can use for example requests
ARTICLE_TITLES = politicians_by_country['name'].tolist()

In [196]:
# Initialize list of article rev id as None
ARTICLE_REVISIONS = [None] * len(ARTICLE_TITLES)

Get rev_id for each politician article

In [None]:
for idx, article in enumerate(ARTICLE_TITLES):
    info = request_pageinfo_per_article(article)
    lastrevid = None
    for page_id, page_info in info['query']['pages'].items():
        if page_id != "-1":
            # print(f"{article} has page id {page_id}")  # commented out for readability
            lastrevid = page_info['lastrevid']
        ARTICLE_REVISIONS[idx] = lastrevid
    # print(f"{article} has lastrevid: {lastrevid}")  # commented out for readability

Make ORES request using the patge title and current revision id

In [9]:
#########
#
#    CONSTANTS
#

#    The current LiftWing ORES API endpoint and prediction model
#
API_ORES_LIFTWING_ENDPOINT = "https://api.wikimedia.org/service/lw/inference/v1/models/{model_name}:predict"
API_ORES_EN_QUALITY_MODEL = "enwiki-articlequality"

#
#    The throttling rate is a function of the Access token that you are granted when you request the token. The constants
#    come from dissecting the token and getting the rate limits from the granted token. An example of that is below.
#
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = ((60.0*60.0)/5000.0)-API_LATENCY_ASSUMED  # The key authorizes 5000 requests per hour

#    When making automated requests we should include something that is unique to the person making the request
#    This should include an email - your UW email would be good to put in there
#    
#    Because all LiftWing API requests require some form of authentication, you need to provide your access token
#    as part of the header too
#
REQUEST_HEADER_TEMPLATE = {
    'User-Agent': "<{email_address}>, University of Washington, MSDS DATA 512 - AUTUMN 2024",
    'Content-Type': 'application/json',
    'Authorization': "Bearer {access_token}"
}
#
#    This is a template for the parameters that we need to supply in the headers of an API request
#
REQUEST_HEADER_PARAMS_TEMPLATE = {
    'email_address' : "",         # your email address should go here
    'access_token'  : ""          # the access token you create will need to go here
}
#
#    This is a template of the data required as a payload when making a scoring request of the ORES model
#
ORES_REQUEST_DATA_TEMPLATE = {
    "lang":        "en",     # required that its english - we're scoring English Wikipedia revisions
    "rev_id":      "",       # this request requires a revision id
    "features":    True
}

load_dotenv()
# wikimedia API access token
ACCESS_TOKEN = os.getenv("ACCESS_TOKEN")

In [10]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_ores_score_per_article(article_revid = None, email_address=None, access_token=None,
                                   endpoint_url = API_ORES_LIFTWING_ENDPOINT, 
                                   model_name = API_ORES_EN_QUALITY_MODEL, 
                                   request_data = ORES_REQUEST_DATA_TEMPLATE, 
                                   header_format = REQUEST_HEADER_TEMPLATE, 
                                   header_params = REQUEST_HEADER_PARAMS_TEMPLATE):
    """
    Get predicted score for each politician article
    """
    #    Make sure we have an article revision id, email and token
    #    This approach prioritizes the parameters passed in when making the call
    if article_revid:
        request_data['rev_id'] = article_revid
    if email_address:
        header_params['email_address'] = email_address
    if access_token:
        header_params['access_token'] = access_token
    
    #   Making a request requires a revision id - an email address - and the access token
    if not request_data['rev_id']:
        raise Exception("Must provide an article revision id (rev_id) to score articles")
    if not header_params['email_address']:
        raise Exception("Must provide an 'email_address' value")
    if not header_params['access_token']:
        raise Exception("Must provide an 'access_token' value")
    
    # Create the request URL with the specified model parameter - default is a article quality score request
    request_url = endpoint_url.format(model_name=model_name)
    
    # Create a compliant request header from the template and the supplied parameters
    headers = dict()
    for key in header_format.keys():
        headers[str(key)] = header_format[key].format(**header_params)
    
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free data
        # source like ORES - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        #response = requests.get(request_url, headers=headers)
        response = requests.post(request_url, headers=headers, data=json.dumps(request_data))
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response


In [197]:
# Initialize list of article scrore as None
ARTICLE_SCORES = [None] * len(ARTICLE_TITLES)
RATINGS = ["B", "C", "FA", "GA", "Start", "Stub"]  # all ratings

In [23]:
for idx, article_title in enumerate(ARTICLE_TITLES):
    if not ARTICLE_REVISIONS[idx]:
        print(f"{article_title} is not found in wiki")
        continue
    # print(f"Getting LiftWing ORES scores for '{article_title}' with revid: {ARTICLE_REVISIONS[idx]:d}")  # commented out for readibility
    score = request_ores_score_per_article(article_revid=int(ARTICLE_REVISIONS[idx]),
                                           email_address="tchiang0@uw.edu",
                                           access_token=ACCESS_TOKEN)
    if score and 'enwiki' in score:
        rating = score['enwiki']['scores'][str(ARTICLE_REVISIONS[idx])]['articlequality']['score']['prediction']
        if rating in RATINGS:
            ARTICLE_SCORES[idx] = rating
        # print(f"The predicted rating of {article_title} is {ARTICLE_SCORES[idx]}")  # commented out for readibility
    # else:
    #     print(f"ERROR: {score}")  # commented out for readibility
    #     print(f"The predicted rating of {article_title} is {ARTICLE_SCORES[idx]}")  # commented out for readibility

Barbara Eibinger-Miedl is not found in wiki
Mehrali Gasimov is not found in wiki
Kyaw Myint is not found in wiki
André Ngongang Ouandji is not found in wiki
Tomás Pimentel is not found in wiki
Richard Sumah is not found in wiki
Segun ''Aeroland'' Adewale is not found in wiki
Bashir Bililiqo is not found in wiki


Save the rev id and score prediction to json to avoid re-running API calls

In [28]:
data = {
    "article_title": ARTICLE_TITLES, 
    "article_rev": ARTICLE_REVISIONS,
    "article_score": ARTICLE_SCORES
}

with open('lists.json', 'w') as outfile:
    json.dump(data, outfile)

Read from lists.json to populate ARTICLE_REVISIONS and ARTICLE_SCORES

In [201]:
list_json = {}
with open('lists.json', 'r') as infile:
    list_json = json.load(infile)

ARTICLE_REVISIONS = list_json['article_rev']
ARTICLE_SCORES = list_json['article_score']

In [202]:
# score error rate: ratio of the number of articles for which you were not able to get a score divided by the total number of articles
print(len(ARTICLE_SCORES))
print(len(ARTICLE_TITLES))
print(len(ARTICLE_REVISIONS))

count_no_score = 0
for article_score in ARTICLE_SCORES:
    if not article_score:
        count_no_score += 1

score_err_rate = count_no_score / len(ARTICLE_TITLES)
print(score_err_rate)

7155
7155
7155
0.0025157232704402514


## Step 3: Combining the Datasets

Merge the wikipedia data and population data together \
Add an additional `region` column to the population dataset

In [75]:
population_by_country['region'] = None
cur_region = None

meta_regions = ['AFRICA', 'LATIN AMERICA AND THE CARIBBEAN', 'ASIA']  # higher hierarchy regions
regions = population_by_country[population_by_country['Geography'].str.isupper()]['Geography'].tolist()  # all regions in population dataset (capitalized)

for idx, row in population_by_country.iterrows():
    # keep the low hierarchy region
    if row['Geography'] in regions and row['Geography'] not in meta_regions:
        cur_region = row['Geography']

    population_by_country.at[idx, 'region'] = cur_region

In [76]:
population_by_country

Unnamed: 0,Geography,Population,region
0,WORLD,8009,WORLD
1,AFRICA,1453,WORLD
2,NORTHERN AFRICA,256,NORTHERN AFRICA
3,Algeria,47,NORTHERN AFRICA
4,Egypt,105,NORTHERN AFRICA
...,...,...,...
228,Samoa,0,OCEANIA
229,Solomon Islands,1,OCEANIA
230,Tonga,0,OCEANIA
231,Tuvalu,0,OCEANIA


In [77]:
pd.set_option('display.float_format', '{:.0f}'.format)  # display the revision_id as "numbers" instead of scientific notation
politicians_by_country['revision_id'] = ARTICLE_REVISIONS
politicians_by_country['article_quality'] = ARTICLE_SCORES
politicians_by_country.head(10)

Unnamed: 0,name,url,country,revision_id,article_quality
0,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan,1233202991,Start
1,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan,1230459615,B
2,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan,1225661708,Start
3,Khadija Zahra Ahmadi,https://en.wikipedia.org/wiki/Khadija_Zahra_Ah...,Afghanistan,1234741562,Stub
4,Aziza Ahmadyar,https://en.wikipedia.org/wiki/Aziza_Ahmadyar,Afghanistan,1195651393,Start
5,Muqadasa Ahmadzai,https://en.wikipedia.org/wiki/Muqadasa_Ahmadzai,Afghanistan,1235521766,Start
6,Mohammad Sarwar Ahmedzai,https://en.wikipedia.org/wiki/Mohammad_Sarwar_...,Afghanistan,1176429234,Start
7,Amir Muhammad Akhundzada,https://en.wikipedia.org/wiki/Amir_Muhammad_Ak...,Afghanistan,1247931713,Start
8,Nasrullah Baryalai Arsalai,https://en.wikipedia.org/wiki/Nasrullah_Baryal...,Afghanistan,1225385278,Start
9,Abdul Rahim Ayoubi,https://en.wikipedia.org/wiki/Abdul_Rahim_Ayoubi,Afghanistan,1226326055,Start


In [78]:
# some countries are labeled differently (ex. Korea, South vs. Korea (South)) or are not present in the population_by_country.AUG.2024.csv file
# left merge; keeps all the article title
merged_df = pd.merge(politicians_by_country, population_by_country, left_on='country', right_on='Geography', how='left').drop(columns=['Geography', 'url'])
merged_df = merged_df.rename(columns={
    'name': 'article_title',
    'Population': 'population'
})

# reorder column names in the dataframe
merged_df = merged_df[['country', 'region', 'population', 'article_title', 'revision_id', 'article_quality']]
merged_df

Unnamed: 0,country,region,population,article_title,revision_id,article_quality
0,Afghanistan,SOUTH ASIA,42,Majah Ha Adrif,1233202991,Start
1,Afghanistan,SOUTH ASIA,42,Haroon al-Afghani,1230459615,B
2,Afghanistan,SOUTH ASIA,42,Tayyab Agha,1225661708,Start
3,Afghanistan,SOUTH ASIA,42,Khadija Zahra Ahmadi,1234741562,Stub
4,Afghanistan,SOUTH ASIA,42,Aziza Ahmadyar,1195651393,Start
...,...,...,...,...,...,...
7150,Zimbabwe,EASTERN AFRICA,17,Josiah Tongogara,1203429435,C
7151,Zimbabwe,EASTERN AFRICA,17,Langton Towungana,1246280093,Stub
7152,Zimbabwe,EASTERN AFRICA,17,Sengezo Tshabangu,1228478288,Start
7153,Zimbabwe,EASTERN AFRICA,17,Herbert Ushewokunze,959111842,Stub


In [100]:
# right merge; keeps all the country, region, and population
right_merged_df = pd.merge(politicians_by_country, population_by_country, left_on='country', right_on='Geography', how='right')
right_merged_df

Unnamed: 0,name,url,country,revision_id,article_quality,Geography,Population,region
0,,,,,,WORLD,8009,WORLD
1,,,,,,AFRICA,1453,WORLD
2,,,,,,NORTHERN AFRICA,256,NORTHERN AFRICA
3,Abdelkader Zoukh,https://en.wikipedia.org/wiki/Abdelkader_Zoukh,Algeria,1250270534,Start,Algeria,47,NORTHERN AFRICA
4,Chaâbane Aït Abderrahim,https://en.wikipedia.org/wiki/Chaâbane_Aït_Abd...,Algeria,1234096174,Start,Algeria,47,NORTHERN AFRICA
...,...,...,...,...,...,...,...,...
7075,Solofa Uota,https://en.wikipedia.org/wiki/Solofa_Uota,Tuvalu,1242819518,Stub,Tuvalu,0,OCEANIA
7076,Vincent Lunabek,https://en.wikipedia.org/wiki/Vincent_Lunabek,Vanuatu,1132674478,Stub,Vanuatu,0,OCEANIA
7077,President of Vanuatu,https://en.wikipedia.org/wiki/President_of_Van...,Vanuatu,1211211712,Start,Vanuatu,0,OCEANIA
7078,Jimmy Stevens (politician),https://en.wikipedia.org/wiki/Jimmy_Stevens_(p...,Vanuatu,1169502061,Start,Vanuatu,0,OCEANIA


Geography (regions + countries) that are not in the politician table

In [107]:
regions_not_in_politician = right_merged_df[right_merged_df['name'].isna()]['Geography'].drop_duplicates().values.tolist()
regions_not_in_politician[:5]

['WORLD', 'AFRICA', 'NORTHERN AFRICA', 'Western Sahara', 'WESTERN AFRICA']

Countries that are in the politician dataframe but not in the population table

In [102]:
countries_not_in_population_df = merged_df[merged_df['region'].isna() | (merged_df['article_title'].isna() & ~merged_df['country'].isna())]['country'].drop_duplicates().values.tolist()
countries_not_in_population_df

['Guinea-Bissau', 'Korean', 'Korea, South']

Write both lists to `wp_countries-no_match.txt`

In [108]:
with open('wp_countries-no_match.txt', 'w') as outfile:
    for region in regions_not_in_politician:
        outfile.write(region + '\n')
    
    for region in countries_not_in_population_df:
        outfile.write(region + '\n')

Remaining data into `wp_politicians_by_country.cs`

In [110]:
politicians_by_country_df = merged_df[~merged_df['region'].isna()]
politicians_by_country_df.to_csv('wp_politicians_by_country.csv')

## Step 4: Analysis

### Count number of articles by country
Population is in millions, so we multiplied the `population` by $10^6$ \
`total_articles_per_capita` is calculated by having `total_article_count` divided by `population`

In [149]:
pd.options.display.float_format = '{:12.5e}'.format  # represent large & small numbers in scientific notation
article_count_per_country = merged_df.groupby(['country'])['article_title'].count()  # group by country and count number of articles

# inner join on country/Geography
article_count_per_country_population = pd.merge(article_count_per_country, population_by_country, left_on='country', right_on='Geography').drop(columns=['region'])

# rename columns
article_count_per_country_population = article_count_per_country_population.rename(
    columns={
        'article_title': 'total_article_count',
        'Geography': 'country',
        'Population': 'population'
    }
)

# multiply population by 10^6 (in millions)
article_count_per_country_population['population'] = article_count_per_country_population['population'] * 10**6

# calculate total_articles_per_capita by dividing total_article_count by population
article_count_per_country_population['total_articles_per_capita'] = article_count_per_country_population['total_article_count'] / article_count_per_country_population['population']

# reorder the columns
article_count_per_country_population = article_count_per_country_population[['country', 'population', 'total_article_count', 'total_articles_per_capita']]
article_count_per_country_population

Unnamed: 0,country,population,total_article_count,total_articles_per_capita
0,Afghanistan,4.24000e+07,85,2.00472e-06
1,Albania,2.70000e+06,70,2.59259e-05
2,Algeria,4.68000e+07,71,1.51709e-06
3,Angola,3.67000e+07,58,1.58038e-06
4,Antigua and Barbuda,1.00000e+05,33,3.30000e-04
...,...,...,...,...
161,Venezuela,2.88000e+07,56,1.94444e-06
162,Vietnam,9.89000e+07,36,3.64004e-07
163,Yemen,3.44000e+07,32,9.30233e-07
164,Zambia,2.02000e+07,3,1.48515e-07


### Count number of high quality (FA and GA) articles by country
`total_articles_per_capita` is calculated by having `total_article_count` divided by `population`

In [132]:
# high quality articles
good_quality_article_score = ['FA', 'GA']

In [233]:
# count the number of high quality articles
high_quality_article_count_by_country_df = merged_df.groupby('country')['article_quality'].apply(lambda x: x.isin(good_quality_article_score).sum())

# inner join on country/Geography
high_quality_article_count_by_country_population_df = pd.merge(high_quality_article_count_by_country_df, population_by_country, left_on='country', right_on='Geography').drop(columns={'region'})

# rename columns
high_quality_article_count_by_country_population_df = high_quality_article_count_by_country_population_df.rename(
    columns={
        'article_quality': 'total_high_quality_article_count',
        'Geography': 'country',
        'Population': 'population'
    }
)

# high_quality_articles_per_capita is calculated by dividing total_high_quality_article_count by population
high_quality_article_count_by_country_population_df['high_quality_articles_per_capita'] = high_quality_article_count_by_country_population_df['total_high_quality_article_count'] / high_quality_article_count_by_country_population_df['population']
# reorder columns
high_quality_article_count_by_country_population_df = high_quality_article_count_by_country_population_df[['country', 'population', 'total_high_quality_article_count', 'high_quality_articles_per_capita']]
high_quality_article_count_by_country_population_df

Unnamed: 0,country,population,total_high_quality_article_count,high_quality_articles_per_capita
0,Afghanistan,4.24000e+01,3,7.07547e-02
1,Albania,2.70000e+00,7,2.59259e+00
2,Algeria,4.68000e+01,1,2.13675e-02
3,Angola,3.67000e+01,2,5.44959e-02
4,Antigua and Barbuda,1.00000e-01,0,0.00000e+00
...,...,...,...,...
161,Venezuela,2.88000e+01,1,3.47222e-02
162,Vietnam,9.89000e+01,2,2.02224e-02
163,Yemen,3.44000e+01,0,0.00000e+00
164,Zambia,2.02000e+01,0,0.00000e+00


### Count number of articles by region
Population is in millions, so we multiplied the `population` by $10^6$ \
`total_articles_per_capita` is calculated by having `total_article_count` divided by `population`

In [203]:
article_count_per_region = merged_df.groupby(['region'])['article_title'].count()  # count number of articles by region

# merge article_count_per_region and population_by_country by country/Geography
article_count_per_region_population = pd.merge(article_count_per_region, population_by_country, left_on='region', right_on='Geography').drop(columns=['region'])

# rename columns
article_count_per_region_population = article_count_per_region_population.rename(
    columns={
        'article_title': 'total_article_count',
        'Geography': 'region',
        'Population': 'population'
    }
)

# multiply population by 10^6 (in millions)
article_count_per_region_population['population'] = article_count_per_region_population['population'] * 10**6

# calculate total_articles_per_capita by dividing total_article_count by population
article_count_per_region_population['total_articles_per_capita'] = article_count_per_region_population['total_article_count'] / article_count_per_region_population['population']

# reorder columns
article_count_per_region_population = article_count_per_region_population[['region', 'population', 'total_article_count', 'total_articles_per_capita']]
article_count_per_region_population

Unnamed: 0,region,population,total_article_count,total_articles_per_capita
0,CARIBBEAN,44000000.0,219,4.97727e-06
1,CENTRAL AMERICA,182000000.0,188,1.03297e-06
2,CENTRAL ASIA,80000000.0,106,1.325e-06
3,EAST ASIA,1648000000.0,152,9.2233e-08
4,EASTERN AFRICA,483000000.0,665,1.37681e-06
5,EASTERN EUROPE,285000000.0,709,2.48772e-06
6,MIDDLE AFRICA,202000000.0,231,1.14356e-06
7,NORTHERN AFRICA,256000000.0,302,1.17969e-06
8,NORTHERN EUROPE,108000000.0,191,1.76852e-06
9,OCEANIA,45000000.0,72,1.6e-06


### Count number of high quality (FA and GA) articles by region
`total_articles_per_capita` is calculated by having `total_article_count` divided by `population`

In [169]:
# count number of high quality articles by region
high_quality_article_count_by_region_df = merged_df.groupby('region')['article_quality'].apply(lambda x: x.isin(good_quality_article_score).sum())

# inner join on country/Geography
high_quality_article_count_by_region_population_df = pd.merge(high_quality_article_count_by_region_df, population_by_country, left_on='region', right_on='Geography').drop(columns={'Geography'})

# rename columns
high_quality_article_count_by_region_population_df = high_quality_article_count_by_region_population_df.rename(
    columns={
        'article_quality': 'total_high_quality_article_count',
        'Population': 'population'
    }
)

# high_quality_articles_per_capita is calculated by dividing total_high_quality_article_count by population
high_quality_article_count_by_region_population_df['high_quality_articles_per_capita'] = high_quality_article_count_by_region_population_df['total_high_quality_article_count'] / high_quality_article_count_by_region_population_df['population']

# reorder columns
high_quality_article_count_by_region_population_df = high_quality_article_count_by_region_population_df[['region', 'population', 'total_high_quality_article_count', 'high_quality_articles_per_capita']]
high_quality_article_count_by_region_population_df

Unnamed: 0,region,population,total_high_quality_article_count,high_quality_articles_per_capita
0,CARIBBEAN,44.0,9,0.204545
1,CENTRAL AMERICA,182.0,10,0.0549451
2,CENTRAL ASIA,80.0,5,0.0625
3,EAST ASIA,1648.0,3,0.00182039
4,EASTERN AFRICA,483.0,17,0.0351967
5,EASTERN EUROPE,285.0,38,0.133333
6,MIDDLE AFRICA,202.0,8,0.039604
7,NORTHERN AFRICA,256.0,17,0.0664062
8,NORTHERN EUROPE,108.0,9,0.0833333
9,OCEANIA,45.0,1,0.0222222


Count the total of high quality articles out of 7155 articles

In [206]:
high_quality_article_count_by_region_population_df['total_high_quality_article_count'].sum()

np.int64(303)

## Step 5: Results

### Table 1: Top 10 countries by coverage
The 10 countries with the highest total articles per capita (desc) \
Since there are ties (ex. Montenegro and Seychelles) both have total_articles_per_capita = 6.00000e-05, we included the 10 ranked (dense rank)

In [212]:
top_10_countries_by_coverage = article_count_per_country_population
top_10_countries_by_coverage['dense_rank'] = top_10_countries_by_coverage['total_articles_per_capita'].rank(method='dense', ascending=False).astype(int)
top_10_countries_by_coverage[top_10_countries_by_coverage['dense_rank'].between(1, 10)].sort_values('dense_rank')

Unnamed: 0,country,population,total_article_count,total_articles_per_capita,dense_rank
96,Monaco,0.0,10,inf,1
154,Tuvalu,0.0,1,inf,1
4,Antigua and Barbuda,100000.0,33,0.00033,2
51,Federated States of Micronesia,100000.0,14,0.00014,3
93,Marshall Islands,100000.0,13,0.00013,4
149,Tonga,100000.0,10,0.0001,5
12,Barbados,300000.0,25,8.33333e-05,6
98,Montenegro,600000.0,36,6e-05,7
125,Seychelles,100000.0,6,6e-05,7
17,Bhutan,800000.0,44,5.5e-05,8


### Table 2: Bottom 10 countries by coverage
The 10 countries with the lowest total articles per capita (asc) \
No ties in the bottom 10 countires

In [217]:
bottom_10_countries_by_coverage = top_10_countries_by_coverage.nlargest(10, 'dense_rank')
bottom_10_countries_by_coverage

Unnamed: 0,country,population,total_article_count,total_articles_per_capita,dense_rank
31,China,1411300000.0,16,1.13371e-08,156
66,India,1428600000.0,151,1.05698e-07,155
57,Ghana,34100000.0,4,1.17302e-07,154
122,Saudi Arabia,36900000.0,5,1.35501e-07,153
164,Zambia,20200000.0,3,1.48515e-07,152
108,Norway,5500000.0,1,1.81818e-07,151
70,Israel,9800000.0,2,2.04082e-07,150
45,Egypt,105200000.0,32,3.04183e-07,149
37,Cote d'Ivoire,30900000.0,10,3.23625e-07,148
50,Ethiopia,126500000.0,44,3.47826e-07,147


### Table 3: Top 10 countries by high quality
The 10 countries with the highest high quality articles per capita (desc)

In [230]:
top_10_countries_by_high_quality = high_quality_article_count_by_country_population_df
top_10_countries_by_high_quality = top_10_countries_by_high_quality.fillna(0)
top_10_countries_by_high_quality['dense_rank'] = top_10_countries_by_high_quality['high_quality_articles_per_capita'].rank(method='dense', ascending=False).astype(int)
top_10_countries_by_high_quality[top_10_countries_by_high_quality['dense_rank'].between(1, 10)].sort_values('dense_rank')

Unnamed: 0,country,population,total_high_quality_article_count,high_quality_articles_per_capita,dense_rank
98,Montenegro,0.6,3,5.0,1
86,Luxembourg,0.7,2,2.85714,2
1,Albania,2.7,7,2.59259,3
76,Kosovo,1.7,3,1.76471,4
90,Maldives,0.6,1,1.66667,5
85,Lithuania,2.9,4,1.37931,6
38,Croatia,3.8,5,1.31579,7
62,Guyana,0.8,1,1.25,8
111,Palestinian Territory,5.5,6,1.09091,9
129,Slovenia,2.1,2,0.952381,10


### Table 4: Bottom 10 countries by high quality
The 10 countries with the lowest high quality articles per capita (asc)

In [231]:
bottom_10_countries_by_high_quality = top_10_countries_by_high_quality.nlargest(10, 'dense_rank')
bottom_10_countries_by_high_quality

Unnamed: 0,country,population,total_high_quality_article_count,high_quality_articles_per_capita,dense_rank
4,Antigua and Barbuda,0.1,0,0.0,98
9,Bahamas,0.4,0,0.0,98
12,Barbados,0.3,0,0.0,98
15,Belize,0.5,0,0.0,98
16,Benin,13.7,0,0.0,98
17,Bhutan,0.8,0,0.0,98
20,Botswana,2.7,0,0.0,98
27,Cape Verde,0.6,0,0.0,98
29,Chad,18.3,0,0.0,98
31,China,1411.3,0,0.0,98


### Table 5: Geographic regions by total coverage
A rank ordered list of geographic regions (desc) by total articles per capita

In [243]:
geo_regions_by_total_coverage = article_count_per_region_population
geo_regions_by_total_coverage['dense_rank'] = geo_regions_by_total_coverage['total_articles_per_capita'].rank(method='dense', ascending=False).astype(int)
geo_regions_by_total_coverage.sort_values('dense_rank')

Unnamed: 0,region,population,total_article_count,total_articles_per_capita,dense_rank
14,SOUTHERN EUROPE,152000000.0,797,5.24342e-06,1
0,CARIBBEAN,44000000.0,219,4.97727e-06,2
17,WESTERN EUROPE,199000000.0,498,2.50251e-06,3
5,EASTERN EUROPE,285000000.0,709,2.48772e-06,4
16,WESTERN ASIA,299000000.0,610,2.04013e-06,5
8,NORTHERN EUROPE,108000000.0,191,1.76852e-06,6
13,SOUTHERN AFRICA,70000000.0,123,1.75714e-06,7
9,OCEANIA,45000000.0,72,1.6e-06,8
4,EASTERN AFRICA,483000000.0,665,1.37681e-06,9
10,SOUTH AMERICA,426000000.0,569,1.33568e-06,10


### Table 6: Geographic regions by high quality coverage
A rank ordered list of geographic regions (desc) by high quality articles per capita


In [245]:
geo_regions_by_high_quality_coverage = high_quality_article_count_by_region_population_df
geo_regions_by_high_quality_coverage['dense_rank'] = geo_regions_by_high_quality_coverage['high_quality_articles_per_capita'].rank(method='dense', ascending=False).astype(int)
geo_regions_by_high_quality_coverage.sort_values('dense_rank')

Unnamed: 0,region,population,total_high_quality_article_count,high_quality_articles_per_capita,dense_rank
14,SOUTHERN EUROPE,152.0,52,0.342105,1
0,CARIBBEAN,44.0,9,0.204545,2
5,EASTERN EUROPE,285.0,38,0.133333,3
13,SOUTHERN AFRICA,70.0,8,0.114286,4
17,WESTERN EUROPE,199.0,21,0.105528,5
16,WESTERN ASIA,299.0,27,0.090301,6
8,NORTHERN EUROPE,108.0,9,0.0833333,7
7,NORTHERN AFRICA,256.0,17,0.0664062,8
2,CENTRAL ASIA,80.0,5,0.0625,9
1,CENTRAL AMERICA,182.0,10,0.0549451,10
