In [4]:
import pandas as pd

# Load the politicians data
politicians_df = pd.read_csv('../data/politicians_by_country_AUG.2024.csv')

# Load the population data
population_df = pd.read_csv('../data/population_by_country_AUG.2024.csv')

# Preview the first rows of each dataset together with its per-column non-null counts.
# Only the last expression of a cell is rendered automatically, so the politicians
# counts have to be printed explicitly; as a bare statement they were discarded.
print(politicians_df.head())
print(politicians_df.count())

print(population_df.head())
population_df.count()


                   name                                                url  \
0        Majah Ha Adrif       https://en.wikipedia.org/wiki/Majah_Ha_Adrif   
1     Haroon al-Afghani    https://en.wikipedia.org/wiki/Haroon_al-Afghani   
2           Tayyab Agha          https://en.wikipedia.org/wiki/Tayyab_Agha   
3  Khadija Zahra Ahmadi  https://en.wikipedia.org/wiki/Khadija_Zahra_Ah...   
4        Aziza Ahmadyar       https://en.wikipedia.org/wiki/Aziza_Ahmadyar   

       country  
0  Afghanistan  
1  Afghanistan  
2  Afghanistan  
3  Afghanistan  
4  Afghanistan  
         Geography  Population
0            WORLD      8009.0
1           AFRICA      1453.0
2  NORTHERN AFRICA       256.0
3          Algeria        46.8
4            Egypt       105.2


Geography     233
Population    233
dtype: int64

In [5]:
# Rows that repeat an earlier row in every column
duplicate_politicians_all = politicians_df.loc[politicians_df.duplicated()]
print(duplicate_politicians_all.shape[0])

# Rows whose name (and, checked separately, whose url) already occurred in an earlier row
duplicate_politicians_name = politicians_df.loc[politicians_df.duplicated(subset='name')]
duplicate_politicians_url = politicians_df.loc[politicians_df.duplicated(subset='url')]
print(duplicate_politicians_name.shape[0], duplicate_politicians_url.shape[0], sep='\n')

# Interpretation: there are no fully identical rows, yet names and urls do repeat, so the
# repeated entries are told apart by a different country for the same name and url.




0
44
44


In [6]:
# Count population rows that exactly repeat an earlier row
duplicate_population_all = population_df.loc[population_df.duplicated()]
print(duplicate_population_all.shape[0])

# Count rows whose Geography value already occurred in an earlier row
duplicate_population_geo = population_df.loc[population_df.duplicated(subset='Geography')]
print(duplicate_population_geo.shape[0])


0
0


In [7]:
# 44 politicians appear under multiple countries (two or more). Based on the Wikipedia API,
# this was decided either by their nationalities or by the next country they served, so it
# makes sense to keep them as part of every country their name appears in.

# A copy of the repeated politician rows is still written out for reference.
combined_duplicates = pd.concat(
    objs=[duplicate_politicians_name, duplicate_politicians_url],
).drop_duplicates()
combined_duplicates.to_csv(path_or_buf='combined_duplicates_politicians.csv', index=False)



In [8]:
import pandas as pd

# Helper flag: True when the Geography label is written entirely in capitals
population_df['is_region'] = population_df['Geography'].map(lambda label: label.isupper())

# Split on the flag; dropping the helper column yields new, independent frames
df_region = population_df.loc[population_df['is_region']].drop(columns=['is_region'])
df_country = population_df.loc[~population_df['is_region']].drop(columns=['is_region'])

# Per-column non-null counts of the country-level rows
df_country.count()
# Optional exports, left disabled:
# df_region.to_csv('population_by_region.csv', index=False)
# df_country.to_csv('population_by_country.csv', index=False)



Geography     209
Population    209
dtype: int64

In [9]:
# Number of missing entries in each column of the politicians table
missing_values_politicians = politicians_df.isna().sum()
print(missing_values_politicians)

# Same per-column tally for the population table
missing_values_population = population_df.isna().sum()
print(missing_values_population)



name       0
url        0
country    0
dtype: int64
Geography     0
Population    0
is_region     0
dtype: int64


In [10]:
import requests
import time
import json

# Wikipedia API request settings
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"
API_HEADER_AGENT = 'User-Agent'

# Pause applied before each request: 1/100 of a second minus the latency assumed
# for the API and network (roughly 2ms).
API_LATENCY_ASSUMED = 0.002
API_THROTTLE_WAIT = (1.0 / 100.0) - API_LATENCY_ASSUMED

# Headers sent with every page-info request
REQUEST_HEADERS = {
    API_HEADER_AGENT: '<tbaner@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2024',
}

# Query parameters for a page-info request; "titles" is filled in per article
PAGEINFO_PARAMS_TEMPLATE = dict(
    action="query",
    format="json",
    titles="",
    prop="info",
    inprop="url|talkid",
)

# Function to request page info and retrieve revision ID
def request_pageinfo_per_article(article_title, timeout=30.0):
    """Look up the latest revision ID of a Wikipedia article.

    Builds the query from ``PAGEINFO_PARAMS_TEMPLATE`` with ``titles`` set to
    ``article_title``, sends it to ``API_ENWIKIPEDIA_ENDPOINT`` and reads
    ``lastrevid`` from the first page entry of the response.

    Args:
        article_title: Value placed in the request's ``titles`` parameter.
        timeout: Seconds to wait for the HTTP request. Without a timeout a
            stalled connection would block the calling loop indefinitely.

    Returns:
        The ``lastrevid`` of the first returned page, or ``None`` when that
        page has no (truthy) ``lastrevid``, the response lists no pages, or
        the request or its parsing failed (the error is printed).
    """
    request_params = PAGEINFO_PARAMS_TEMPLATE.copy()
    request_params['titles'] = article_title

    if API_THROTTLE_WAIT > 0.0:
        time.sleep(API_THROTTLE_WAIT)  # Respect API rate limits

    try:
        response = requests.get(
            API_ENWIKIPEDIA_ENDPOINT,
            headers=REQUEST_HEADERS,
            params=request_params,
            timeout=timeout,
        )
        # Surface HTTP errors directly instead of as a confusing parse/KeyError later.
        response.raise_for_status()

        pages = response.json()["query"]["pages"]
        # Only one title is requested, so only the first page entry is relevant.
        first_page = next(iter(pages.values()), None)
        if first_page is None:
            return None
        return first_page.get("lastrevid") or None
    except Exception as e:
        # Best effort: report the problem and let the caller record the failure.
        print(f"Error fetching page info for {article_title}: {e}")
        return None


In [11]:
# ORES API settings
ORES_MODEL = "wp10"  # Name of the model queried for quality predictions
ORES_ENDPOINT = "https://ores.wikimedia.org/v3/scores/enwiki/"  # Base URL of the score requests

# Function to request article quality using ORES API
def get_ores_quality_prediction(article_title, revision_id, timeout=30.0):
    """Fetch the predicted quality class of one article revision from ORES.

    Args:
        article_title: Article title; used only in the printed messages.
        revision_id: Revision ID sent as the ``revids`` query parameter.
        timeout: Seconds to wait for the HTTP request. Without a timeout a
            stalled connection would block the calling loop indefinitely.

    Returns:
        The ``prediction`` value found in the response for this revision and
        ``ORES_MODEL``, or ``None`` when the response status is not 200 or
        the request or its parsing failed (a message is printed either way).
    """
    try:
        response = requests.get(
            ORES_ENDPOINT,
            params={"models": ORES_MODEL, "revids": revision_id},
            # Identify the client the same way the page-info requests do.
            headers=REQUEST_HEADERS,
            timeout=timeout,
        )

        if response.status_code != 200:
            print(f"Failed to get ORES score for {article_title} (Revision ID: {revision_id})")
            return None

        data = response.json()
        # Look the score up under the model that was requested, so the request
        # and the lookup cannot drift apart if ORES_MODEL changes.
        return data['enwiki']['scores'][str(revision_id)][ORES_MODEL]['score']['prediction']
    except Exception as e:
        # Best effort: report the problem and let the caller record the failure.
        print(f"Error getting ORES score for {article_title}: {e}")
        return None


In [12]:
from urllib.parse import unquote  # stdlib; imported here because only this cell uses it


def _article_title_from_url(url):
    """Return the percent-decoded article title taken from a Wikipedia article URL."""
    _, marker, remainder = url.partition('/wiki/')
    # Fall back to the last path segment when the URL has no '/wiki/' part.
    encoded_title = remainder if marker else url.rsplit('/', 1)[-1]
    # Decode percent-escapes so they are not escaped a second time when the
    # title is sent as a request parameter.
    return unquote(encoded_title)


# Titles of articles for which no quality score could be obtained, either because
# the revision ID lookup failed or because the ORES request returned no score
error_log = []

# Add columns to store revision ID and quality score
politicians_df['revision_id'] = None
politicians_df['quality_score'] = None

# Loop through each politician and get revision ID and quality score
for index, row in politicians_df.iterrows():
    article_title = _article_title_from_url(row['url'])
    revision_id = request_pageinfo_per_article(article_title)

    quality_score = None
    if revision_id:
        quality_score = get_ores_quality_prediction(article_title, revision_id)
        politicians_df.at[index, 'revision_id'] = revision_id
        politicians_df.at[index, 'quality_score'] = quality_score

    # An article counts as failed whenever it ends up without a score, not only
    # when its revision ID is missing.
    if quality_score is None:
        error_log.append(article_title)

# Save the error log for any articles without a score
# (explicit encoding so non-ASCII titles are written the same way on every platform)
with open('../data/ores_error_log.txt', 'w', encoding='utf-8') as log_file:
    log_file.write("\n".join(error_log))

politicians_df.to_csv('../data/politicians_with_quality.csv', index=False)

# Calculate error rate: share of rows that ended up without a quality score
error_rate = len(error_log) / len(politicians_df) if len(politicians_df) else 0.0
print(f"Error rate: {error_rate:.2%}")


Error rate: 0.11%
