## Getting the data

In [4]:
import pandas  as pd
import numpy as np

In [5]:
# Load the two raw inputs: the page/country/rev_id listing and the
# per-geography population table.
page_csv, population_csv = 'raw_data/page_data.csv', 'raw_data/WPDS_2018_data.csv'
wiki_data = pd.read_csv(page_csv)
pop_data = pd.read_csv(population_csv)

In [6]:
wiki_data.head()

Unnamed: 0,page,country,rev_id
0,Template:ZambiaProvincialMinisters,Zambia,235107991
1,Bir I of Kanem,Chad,355319463
2,Template:Zimbabwe-politician-stub,Zimbabwe,391862046
3,Template:Uganda-politician-stub,Uganda,391862070
4,Template:Namibia-politician-stub,Namibia,391862409


In [7]:
pop_data.head()

Unnamed: 0,Geography,Population mid-2018 (millions)
0,AFRICA,1284.0
1,Algeria,42.7
2,Egypt,97.0
3,Libya,6.5
4,Morocco,35.2


Renaming the Population column

In [8]:
# Give the population column a short name that is easy to reference in code.
pop_data = pop_data.rename(columns={'Population mid-2018 (millions)': 'population'})

In [9]:
pop_data.head()

Unnamed: 0,Geography,population
0,AFRICA,1284.0
1,Algeria,42.7
2,Egypt,97.0
3,Libya,6.5
4,Morocco,35.2


## Combining the dataset

Doing an inner join to ensure that any non-matching rows are dropped.

In [10]:
# Inner-join pages to populations on the country name, so rows without a
# counterpart on the other side are dropped, then discard the join key that
# came from the population table (it duplicates 'country').
comb = (
    wiki_data
    .merge(pop_data, how='inner', left_on='country', right_on='Geography')
    .drop(columns='Geography')
)

In [11]:
comb.head()

Unnamed: 0,page,country,rev_id,population
0,Template:ZambiaProvincialMinisters,Zambia,235107991,17.7
1,Gladys Lundwe,Zambia,757566606,17.7
2,Mwamba Luchembe,Zambia,764848643,17.7
3,Thandiwe Banda,Zambia,768166426,17.7
4,Sylvester Chisembele,Zambia,776082926,17.7


In [12]:
comb.isnull().any()

page          False
country       False
rev_id        False
population    False
dtype: bool

## Getting Article Quality Predictions

This section of code is heavily borrowed from the example repository that explains how to use the Wikipedia ORES (Object Revision Evaluation Service) API to get ORES scores. Link: https://github.com/Ironholds/data-512-a2/blob/master/hcds-a2-bias_demo.ipynb

In [13]:
import requests
import json

# URL template for the ORES v3 scoring endpoint; the three placeholders are
# filled in with str.format when a request is built.
endpoint = 'https://ores.wikimedia.org/v3/scores/{project}/?models={model}&revids={revids}'

# Fixed request parameters: the wiki to score against and the model to use.
params = dict(project='enwiki', model='wp10')

# HTTP headers describing who is making the requests.
headers = {'User-Agent': 'https://github.com/whamsy', 'From': 'whamsy@uw.edu'}

def get_ores_data(revision_ids, headers):
    """Fetch ORES quality predictions for one batch of revision IDs.

    Parameters
    ----------
    revision_ids : iterable of int or str
        Revision IDs to score in a single API request.
    headers : dict or None
        HTTP headers sent with the request.

    Returns
    -------
    list of str
        Exactly one entry per input revision ID, in input order: the
        predicted class, or 'no_score' when the response carries no
        prediction for that revision.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    revision_ids = list(revision_ids)
    if not revision_ids:
        # Nothing to score; avoid sending a request with an empty revids list.
        return []

    # Build the query from a copy so the module-level `params` is not mutated.
    # The revision IDs are joined with '|' marks.
    query = dict(params, revids='|'.join(str(x) for x in revision_ids))
    api_call = requests.get(endpoint.format(**query), headers=headers)
    api_call.raise_for_status()
    scores = api_call.json()[params['project']]['scores']

    # Look each requested ID up explicitly instead of iterating the response
    # dict: this keeps the output aligned with the input order and length even
    # if the response is ordered differently or lacks an ID.
    final_scores = []
    for rev_id in revision_ids:
        try:
            final_scores.append(scores[str(rev_id)][params['model']]['score']['prediction'])
        except KeyError:
            final_scores.append('no_score')

    return final_scores

Now that we have a function that can take a list of revision IDs and return the ORES predictions, we can go ahead and apply it to our comb dataset.

In [14]:
# Split the revision IDs into consecutive batches of at most 140 IDs each,
# so that every batch can be sent in one request.
rev_ids = comb.rev_id.tolist()
chunks = [rev_ids[start:start + 140] for start in range(0, len(rev_ids), 140)]

In [15]:
from tqdm import tqdm

In [None]:
# One list of predictions per chunk, collected in chunk order; tqdm shows progress.
quality_predictions = []
for chunk in tqdm(chunks):
    quality_predictions.append(get_ores_data(chunk, headers))

  5%|▍         | 16/322 [00:05<01:39,  3.08it/s]

Now we have a list of lists, which we flatten to a single list of all preds. (Source: https://stackoverflow.com/questions/952914/making-a-flat-list-out-of-list-of-lists-in-python)

In [None]:
# Concatenate the per-chunk prediction lists into one flat list, keeping order.
final_preds = []
for chunk_preds in quality_predictions:
    final_preds.extend(chunk_preds)

In [None]:
# Sanity check: displays True when there is exactly one prediction per revision ID.
len(final_preds) == len(rev_ids)

In [None]:
# Add the predictions to the merged frame as a new column. The list is assigned
# positionally, so final_preds must be in the same row order as comb.
comb['article_quality'] = final_preds

In [None]:
comb.head()

In [None]:
# Switch to the column names used in the final dataset.
comb = comb.rename(columns={'page': 'article_name', 'rev_id': 'revision_id'})

In [None]:
# Put the columns into their final order.
cols2 = [
    'country',
    'article_name',
    'revision_id',
    'article_quality',
    'population',
]
comb = comb[cols2]
comb.head()

Belatedly, during the analysis below, I realized that the population column contains strings with ',' in them, so those are removed here and the column is converted to float.

In [None]:
# Strip ',' from the population values and convert them to float.
# The column is cast to str first so this also works when pandas has already
# parsed it as numeric (the .str accessor raises on non-string data) and so
# the cell can safely be re-run.
comb['population'] = (
    comb['population']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(float)
)

In [None]:
# Export to CSV. index=False leaves the auto-generated row index out of the
# file; otherwise it is written as an unnamed first column.
comb.to_csv('final_data.csv', index=False)

## Analysis

In order to produce the requisite tables, I am using the data to derive per-country statistics that can then be used directly.

In [None]:
# Work on an independent copy so the columns added during the analysis do not
# also appear on (or mutate) the exported `comb` frame.
final_raw = comb.copy()

In [None]:
final_raw.head()

In [None]:
# The chained-assignment warning is expected for these two column assignments,
# so it is silenced only inside this block instead of for the whole session.
with pd.option_context('mode.chained_assignment', None):
    # Number of articles for the row's country, repeated on every row of that country.
    final_raw['num_articles'] = final_raw.groupby('country')['article_name'].transform('count')
    # True when the predicted class is GA or FA.
    final_raw['high_quality'] = final_raw['article_quality'].isin(['GA', 'FA'])

In [None]:
final_raw.head()

We don't need the revision_id or article_name at this point.

In [None]:
# Per-article identifiers are not needed for the per-country aggregation.
final_raw = final_raw.drop(columns=['article_name', 'revision_id'])

In [None]:
# Collapse to one row per country. The mean of the boolean high_quality flag
# is the fraction of that country's articles that are high quality.
# high_quality is selected explicitly: the frame still contains the string
# column article_quality, and averaging it raises a TypeError on recent pandas
# versions (older versions silently dropped it).
final_raw = (
    final_raw
    .groupby(['country', 'population', 'num_articles'])['high_quality']
    .mean()
    .reset_index()
)
final_raw.head()

In [None]:
# The population column is expressed in millions; scale it to absolute numbers.
# Note: this overwrites the column, so running the cell twice scales it twice.
final_raw['population'] = final_raw['population'] * 1000000
final_raw.head()

In [None]:
# Articles per person, expressed as a percentage of the population.
final_raw['article_pop_ratio'] = 100 * final_raw['num_articles'] / final_raw['population']
# Share of high-quality articles, expressed as a percentage.
final_raw['article_quality_ratio'] = final_raw['high_quality'] * 100

In [None]:
final_raw.head()

In [None]:
# `plt` is the conventional alias for the pyplot interface, not for the
# top-level matplotlib package.
import matplotlib.pyplot as plt

### 10 highest-ranked countries in terms of number of politician articles as a proportion of country population

In [None]:
%matplotlib inline

# Ten countries with the largest articles-per-population ratio, largest first.
top_10_pop_art = (
    final_raw[['country', 'population', 'article_pop_ratio']]
    .sort_values('article_pop_ratio', ascending=False)
    .head(10)
)
top_10_pop_art

In [None]:
# Two bar charts for the same ten countries: the ratio, then the population.
shared_opts = {'subplots': True, 'layout': (2, 1), 'figsize': (12, 12)}
top_10_pop_art.plot.bar(x='country', y='article_pop_ratio', **shared_opts)
top_10_pop_art.plot.bar(x='country', y='population', **shared_opts)

### 10 lowest-ranked countries in terms of number of politician articles as a proportion of country population

In [None]:
# Ten countries with the smallest articles-per-population ratio, smallest first.
bottom_10_pop_art = (
    final_raw[['country', 'population', 'article_pop_ratio']]
    .sort_values('article_pop_ratio')
    .head(10)
)
bottom_10_pop_art

In [None]:
# Two bar charts for the same ten countries: the ratio, then the population.
shared_opts = {'subplots': True, 'layout': (2, 1), 'figsize': (12, 12)}
bottom_10_pop_art.plot.bar(x='country', y='article_pop_ratio', **shared_opts)
bottom_10_pop_art.plot.bar(x='country', y='population', **shared_opts)

### 10 highest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country

In [None]:
# Ten countries with the largest share of GA/FA articles, largest first.
top_10_art_quality = (
    final_raw[['country', 'population', 'article_quality_ratio']]
    .sort_values('article_quality_ratio', ascending=False)
    .head(10)
)
top_10_art_quality

In [None]:
# Two bar charts for the same ten countries: the quality ratio, then the population.
shared_opts = {'subplots': True, 'layout': (2, 1), 'figsize': (12, 12)}
top_10_art_quality.plot.bar(x='country', y='article_quality_ratio', **shared_opts)
top_10_art_quality.plot.bar(x='country', y='population', **shared_opts)

### 10 lowest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country

In [None]:
# Ten countries with the smallest share of GA/FA articles, smallest first.
bottom_10_art_quality = (
    final_raw[['country', 'population', 'article_quality_ratio']]
    .sort_values('article_quality_ratio')
    .head(10)
)
bottom_10_art_quality

In [None]:
# Two bar charts for the same ten countries: the quality ratio, then the population.
shared_opts = {'subplots': True, 'layout': (2, 1), 'figsize': (12, 12)}
bottom_10_art_quality.plot.bar(x='country', y='article_quality_ratio', **shared_opts)
bottom_10_art_quality.plot.bar(x='country', y='population', **shared_opts)