# Getting Quality Predictions for World Politicians' Wikipedia Articles

### Homework #2 – Data 512
### Daniel Vogler

# Combining Politician and Population Datasets

## Environment Setup

In [1]:
import pandas as pd
import json

## Data Import

In [7]:
# get politicians with predicted article quality...
politician_article_quality_filepath = "../output_data/quality_predictions.json"

with open(politician_article_quality_filepath, "r") as f:
    politician_article_quality_js = json.load(f)

politician_article_quality_df = pd.DataFrame(politician_article_quality_js)

print(len(politician_article_quality_df))

7155


In [11]:
politician_article_quality_df.head()

Unnamed: 0,title,revid,prediction
0,Manuel Flores (Salvadoran politician),1239190000.0,GA
1,Hugo Lindo,1236826000.0,C
2,Gustavo López Davidson,1231945000.0,Start
3,Román Mayorga Quirós,1171435000.0,Stub
4,José Antonio Morales Ehrlich,1231945000.0,Stub


In [16]:
politicians = pd.read_csv("../cleaned_data/politicians_by_country_AUG_2024_clean.csv")
politicians.drop(columns=["Unnamed: 0"], inplace=True)
politicians.head()

Unnamed: 0,name,url,country
0,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan
1,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan
2,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan
3,Khadija Zahra Ahmadi,https://en.wikipedia.org/wiki/Khadija_Zahra_Ah...,Afghanistan
4,Aziza Ahmadyar,https://en.wikipedia.org/wiki/Aziza_Ahmadyar,Afghanistan


In [20]:
populations = pd.read_csv("../cleaned_data/population_by_country_AUG_2024_clean.csv")
populations.drop(columns=["Unnamed: 0"], inplace=True)
populations.head()

Unnamed: 0,Geography,Population
0,WORLD,8009.0
1,AFRICA,1453.0
2,NORTHERN AFRICA,256.0
3,Algeria,46.8
4,Egypt,105.2


To record which countries have no match (no politicians in this dataset), I will:
1. get all of the geographies in the `population` dataset
2. filter out the geographies that are not countries, like `WORLD` and `AFRICA`
3. take the set difference between those geographies and the countries represented in the `politicians` dataset

In [27]:
geographies = set(populations["Geography"])

countries = {geo for geo in geographies if not geo.isupper()}

countries_not_represented = {c for c in countries if c not in set(politicians["country"])}

with open("../output_data/wp-countries_no-match.txt", "w") as f:
    for country in countries_not_represented:
        f.write(country + "\n")

In [42]:
df = pd.merge(politicians, 
         politician_article_quality_df,
         left_on = "name",
         right_on = "title")

df = pd.merge(df, populations, left_on = "country", right_on = "Geography")

df.rename(columns={
    "Population": "population",
    "title": "article_title",
    "revid": "revision_id",
    "prediction": "article_quality"
}, inplace=True)

df.drop(columns={"url", "Geography", "name"}, inplace=True)

In [43]:
df.head()

Unnamed: 0,country,article_title,revision_id,article_quality,population
0,Afghanistan,Majah Ha Adrif,1233203000.0,Start,42.4
1,Afghanistan,Haroon al-Afghani,1230460000.0,B,42.4
2,Afghanistan,Tayyab Agha,1225662000.0,Start,42.4
3,Afghanistan,Khadija Zahra Ahmadi,1234742000.0,Stub,42.4
4,Afghanistan,Aziza Ahmadyar,1195651000.0,Start,42.4


In [56]:
def parse_regional_hierarchy(geographies=populations["Geography"]):
    
    country_lookup = []

    last_upper = ""
    for g in geographies:
        if g.isupper():
            last_upper = g
            continue
        else:
            country_lookup.append(
                {
                    "country": g,
                    "region": last_upper.title()
                }
            )

    country_lookup_df = pd.DataFrame(country_lookup)
    return country_lookup_df

Unnamed: 0,country,region
0,Algeria,Northern Africa
1,Egypt,Northern Africa
2,Libya,Northern Africa
3,Morocco,Northern Africa
4,Sudan,Northern Africa
...,...,...
204,Samoa,Oceania
205,Solomon Islands,Oceania
206,Tonga,Oceania
207,Tuvalu,Oceania


In [58]:
country_lookup = parse_regional_hierarchy()

In [62]:
df = pd.merge(df, country_lookup, on="country")
df.rename(columns={
    "region_x": "region"
}, inplace=True)
df.drop(columns={"region_y"}, inplace=True)

# reorder per assignment spec:
df = df[["country", "region", "population", "article_title", "revision_id", "article_quality"]] 

df.head()

Unnamed: 0,country,region,region.1,population,article_title,revision_id,article_quality
0,Afghanistan,South Asia,South Asia,42.4,Majah Ha Adrif,1233203000.0,Start
1,Afghanistan,South Asia,South Asia,42.4,Haroon al-Afghani,1230460000.0,B
2,Afghanistan,South Asia,South Asia,42.4,Tayyab Agha,1225662000.0,Start
3,Afghanistan,South Asia,South Asia,42.4,Khadija Zahra Ahmadi,1234742000.0,Stub
4,Afghanistan,South Asia,South Asia,42.4,Aziza Ahmadyar,1195651000.0,Start


In [64]:
df.to_csv("../output_data/data_for_analysis.csv")