In [79]:
import pandas as pd
import requests
import json
import time
import os
from tqdm import tqdm
import re

In [80]:
politicians_df = pd.read_csv('politicians_by_country_AUG.2024.csv')
population_df = pd.read_csv('population_by_country_AUG.2024.csv')

In [81]:
print(politicians_df.head())

                   name                                                url  \
0        Majah Ha Adrif       https://en.wikipedia.org/wiki/Majah_Ha_Adrif   
1     Haroon al-Afghani    https://en.wikipedia.org/wiki/Haroon_al-Afghani   
2           Tayyab Agha          https://en.wikipedia.org/wiki/Tayyab_Agha   
3  Khadija Zahra Ahmadi  https://en.wikipedia.org/wiki/Khadija_Zahra_Ah...   
4        Aziza Ahmadyar       https://en.wikipedia.org/wiki/Aziza_Ahmadyar   

       country  
0  Afghanistan  
1  Afghanistan  
2  Afghanistan  
3  Afghanistan  
4  Afghanistan  


In [82]:
print(population_df.head())

         Geography  Population
0            WORLD      8009.0
1           AFRICA      1453.0
2  NORTHERN AFRICA       256.0
3          Algeria        46.8
4            Egypt       105.2


In [83]:
# Function to extract article title from URL
def extract_title_from_url(url):
    # Check if the URL is a string
    if isinstance(url, str):
        # Extract the part after '/wiki/' in the URL
        match = re.search(r'/wiki/(.+)$', url)
        if match:
            # Get the article title
            title = match.group(1)
            # Replace underscores with spaces
            title = title.replace('_', ' ')
            # Decode URL-encoded characters
            title = requests.utils.unquote(title)
            return title
    return None

# Apply the function to create a new column 'article_title'
politicians_df['article_title'] = politicians_df['url'].apply(extract_title_from_url)

# Check the result
print(politicians_df[['url', 'article_title']].head())

                                                 url         article_title
0       https://en.wikipedia.org/wiki/Majah_Ha_Adrif        Majah Ha Adrif
1    https://en.wikipedia.org/wiki/Haroon_al-Afghani     Haroon al-Afghani
2          https://en.wikipedia.org/wiki/Tayyab_Agha           Tayyab Agha
3  https://en.wikipedia.org/wiki/Khadija_Zahra_Ah...  Khadija Zahra Ahmadi
4       https://en.wikipedia.org/wiki/Aziza_Ahmadyar        Aziza Ahmadyar


In [112]:
# Give your Token here
ORES_ACCESS_TOKEN=''
ACCESS_TOKEN = ORES_ACCESS_TOKEN


In [114]:
# Replace with your actual email address
EMAIL_ADDRESS = 'yasovar@uw.edu'

# Headers for ORES API requests
REQUEST_HEADERS = {
    'User-Agent': f"{EMAIL_ADDRESS}, University of Washington, MSDS DATA 512 - AUTUMN 2024",
    'Content-Type': 'application/json',
    'Authorization': f"Bearer {ACCESS_TOKEN}"
}

In [115]:
# ORES API endpoint and model
ORES_API_ENDPOINT = "https://api.wikimedia.org/service/lw/inference/v1/models/{model_name}:predict"
ORES_MODEL_NAME = "enwiki-articlequality"

# Wikipedia API endpoint
WIKIPEDIA_API_ENDPOINT = "https://en.wikipedia.org/w/api.php"

In [116]:
def get_latest_revision_id(article_title):
    params = {
        "action": "query",
        "format": "json",
        "titles": article_title,
        "prop": "revisions",
        "rvprop": "ids",
        "rvslots": "*"
    }
    
    # Throttle the request to respect rate limits
    time.sleep(0.2)
    
    response = requests.get(WIKIPEDIA_API_ENDPOINT, headers={'User-Agent': REQUEST_HEADERS['User-Agent']}, params=params)
    data = response.json()
    
    pages = data.get('query', {}).get('pages', {})
    for page_id in pages:
        revisions = pages[page_id].get('revisions', [])
        if revisions:
            return revisions[0]['revid']
    return None

In [117]:
def get_article_quality(revision_id):
    url = ORES_API_ENDPOINT.format(model_name=ORES_MODEL_NAME)
    data = {
        "rev_id": revision_id,
        "features": False
    }
    
    # Throttle the request to respect rate limits
    time.sleep(0.2)
    
    response = requests.post(url, headers=REQUEST_HEADERS, data=json.dumps(data))
    
    if response.status_code == 200:
        result = response.json()
        scores = result.get('enwiki', {}).get('scores', {})
        revision_scores = scores.get(str(revision_id), {})
        prediction = revision_scores.get('prediction', None)
        return prediction
    else:
        print(f"Error {response.status_code}: Unable to get quality prediction for revision ID {revision_id}")
        return None

In [118]:
politicians_df['revision_id'] = None
politicians_df['article_quality'] = None

# Initialize a list to keep track of articles with errors
error_articles = []
error_count = 0

# Iterate over the articles with a progress bar
for index, row in tqdm(politicians_df.iterrows(), total=politicians_df.shape[0], desc='Processing articles'):
    article_title = row['article_title']
    
    # Skip if article_title is None
    if not article_title:
        error_articles.append(row['url'])
        error_count += 1
        continue
    
    try:
        # Get the latest revision ID
        revision_id = get_latest_revision_id(article_title)
        
        if revision_id:
            # Get the article quality prediction
            quality = get_article_quality(revision_id)
            
            # Update the DataFrame
            politicians_df.at[index, 'revision_id'] = revision_id
            politicians_df.at[index, 'article_quality'] = quality
        else:
            error_articles.append(row['url'])
            error_count += 1
    except Exception as e:
        print(f"Error processing article '{article_title}': {e}")
        error_articles.append(row['url'])
        error_count += 1
    
    # Update progress bar postfix with error count
    tqdm.set_postfix(tqdm(), errors=error_count)

# After processing, calculate the error rate
total_articles = len(politicians_df)
error_rate = (error_count / total_articles) * 100

print(f"Total articles: {total_articles}")
print(f"Articles with errors: {error_count}")
print(f"Error rate: {error_rate:.2f}%")

if error_rate > 1:
    print("Error rate is higher than 1%. Please review your code.")

# Separate successful and failed articles
failed_articles_df = politicians_df[politicians_df['revision_id'].isnull()]
success_articles_df = politicians_df.dropna(subset=['revision_id', 'article_quality'])

# Load the population dataset
population_df = pd.read_csv('population_by_country_AUG.2024.csv')

# Convert all column names to lowercase
population_df.columns = population_df.columns.str.lower()
success_articles_df.columns = success_articles_df.columns.str.lower()

merged_df = pd.merge(
    success_articles_df,
    population_df[['geography', 'population', 'region']],  # Adjust column names as necessary
    left_on='country',
    right_on='geography',
    how='left'
)

merged_df = merged_df.drop(columns=['geography'])
output_columns = ['country', 'region', 'population', 'article_title', 'revision_id', 'article_quality']
merged_df = merged_df[output_columns]

merged_df.to_csv('wp_politicians_by_country.csv', index=False)
unmatched_countries = merged_success[merged_success['population'].isna()]['country'].unique()

with open('wp_countries-no_match.txt', 'w', encoding='utf-8') as f:
    for country in unmatched_countries:
        f.write(f"{country}\n")

Processing articles:   0%|                             | 0/7155 [00:00<?, ?it/s]

Error 401: Unable to get quality prediction for revision ID 1233202991



0it [00:00, ?it/s][A
0it [00:00, ?it/s, errors=0][A
Processing articles:   0%|                   | 1/7155 [00:01<3:36:30,  1.82s/it]


KeyboardInterrupt: 

## Table Visualization

In [110]:
# Ensure 'population' is numeric
merged_success['population'] = pd.to_numeric(merged_success['population'], errors='coerce')


high_quality_classes = ['FA', 'GA']
country_group = merged_success.groupby('country')

total_articles_per_country = country_group['article_title'].count().rename('total_articles')

# High-quality articles per country
high_quality_articles_per_country = country_group.apply(
    lambda x: x[x['article_quality'].isin(high_quality_classes)].shape[0]
).rename('high_quality_articles')

population_per_country = country_group['population'].first()

country_stats = pd.concat([
    total_articles_per_country,
    high_quality_articles_per_country,
    population_per_country
], axis=1)

# Filter out countries with zero or missing population
country_stats = country_stats[(country_stats['population'].notnull()) & (country_stats['population'] != 0)]

# Convert population to float (population is in millions)
country_stats['population'] = country_stats['population'].astype(float)

# Calculate articles per capita
country_stats['articles_per_capita'] = country_stats['total_articles'] / (country_stats['population'] * 1e6)
country_stats['articles_per_million'] = country_stats['articles_per_capita'] * 1e6

# Top 10 Countries by Coverage (Highest Articles per Capita)

In [111]:
top_10_coverage = country_stats.sort_values('articles_per_capita', ascending=False).head(10)

print("Top 10 Countries by Coverage (Highest Articles per Capita):")
print(top_10_coverage[['total_articles', 'population', 'articles_per_capita', 'articles_per_million']])

Top 10 Countries by Coverage (Highest Articles per Capita):
                                total_articles  population  \
country                                                      
Antigua and Barbuda                         33         0.1   
Federated States of Micronesia              14         0.1   
Marshall Islands                            13         0.1   
Tonga                                       10         0.1   
Barbados                                    25         0.3   
Montenegro                                  38         0.6   
Seychelles                                   6         0.1   
Bhutan                                      44         0.8   
Maldives                                    33         0.6   
St. Vincent and the Grenadines               4         0.1   

                                articles_per_capita  articles_per_million  
country                                                                    
Antigua and Barbuda                        

# Bottom 10 Countries by Coverage (Lowest Articles per Capita)

In [104]:
bottom_10_coverage = country_stats.sort_values('articles_per_capita', ascending=True).head(10)

print("\nBottom 10 Countries by Coverage (Lowest Articles per Capita):")
print(bottom_10_coverage[['total_articles', 'population', 'articles_per_capita', 'articles_per_million']])


Bottom 10 Countries by Coverage (Lowest Articles per Capita):
               total_articles  population  articles_per_capita  \
country                                                          
China                      16      1411.3         1.133707e-08   
Ghana                       3        34.1         8.797654e-08   
India                     151      1428.6         1.056979e-07   
Saudi Arabia                5        36.9         1.355014e-07   
Zambia                      3        20.2         1.485149e-07   
Norway                      1         5.5         1.818182e-07   
Israel                      2         9.8         2.040816e-07   
Egypt                      32       105.2         3.041825e-07   
Cote d'Ivoire              10        30.9         3.236246e-07   
Mozambique                 12        33.9         3.539823e-07   

               articles_per_million  
country                              
China                      0.011337  
Ghana                      0.0

# Top 10 Countries by High-Quality Articles per Capita

In [105]:
top_10_high_quality = country_stats.sort_values('high_quality_articles_per_capita', ascending=False).head(10)

print("\nTop 10 Countries by High-Quality Articles per Capita:")
print(top_10_high_quality[['high_quality_articles', 'population', 'high_quality_articles_per_capita', 'high_quality_articles_per_million']])


Top 10 Countries by High-Quality Articles per Capita:
                       high_quality_articles  population  \
country                                                    
Montenegro                                 3         0.6   
Luxembourg                                 2         0.7   
Albania                                    7         2.7   
Kosovo                                     4         1.7   
Lithuania                                  6         2.9   
Maldives                                   1         0.6   
Croatia                                    5         3.8   
Guyana                                     1         0.8   
Palestinian Territory                      6         5.5   
Slovenia                                   2         2.1   

                       high_quality_articles_per_capita  \
country                                                   
Montenegro                                 5.000000e-06   
Luxembourg                                 2.85

# Bottom 10 Countries by High-Quality Articles per Capita

In [106]:
bottom_10_high_quality = country_stats.sort_values('high_quality_articles_per_capita', ascending=True).head(10)

print("\nBottom 10 Countries by High-Quality Articles per Capita:")
print(bottom_10_high_quality[['high_quality_articles', 'population', 'high_quality_articles_per_capita', 'high_quality_articles_per_million']])


Bottom 10 Countries by High-Quality Articles per Capita:
                     high_quality_articles  population  \
country                                                  
Zimbabwe                                 0        16.7   
Congo                                    0         6.1   
Kuwait                                   0         4.4   
St. Lucia                                0         0.2   
Cote d'Ivoire                            0        30.9   
St. Kitts and Nevis                      0         0.1   
Solomon Islands                          0         0.8   
Cyprus                                   0         1.3   
Singapore                                0         5.8   
Djibouti                                 0         1.1   

                     high_quality_articles_per_capita  \
country                                                 
Zimbabwe                                          0.0   
Congo                                             0.0   
Kuwait           

In [107]:
region_stats = pd.concat([
    total_articles_per_region,
    high_quality_articles_per_region,
    population_per_region
], axis=1)

region_stats['articles_per_capita'] = region_stats['total_articles'] / (region_stats['population'] * 1e6)
region_stats['high_quality_articles_per_capita'] = region_stats['high_quality_articles'] / (region_stats['population'] * 1e6)

region_stats['articles_per_million'] = region_stats['articles_per_capita'] * 1e6
region_stats['high_quality_articles_per_million'] = region_stats['high_quality_articles_per_capita'] * 1e6

# Geographic Regions by Total Coverage (Articles per Capita)

In [108]:
regions_by_coverage = region_stats.sort_values('articles_per_capita', ascending=False)

print("\nGeographic Regions by Total Coverage (Articles per Capita):")
print(regions_by_coverage[['total_articles', 'population', 'articles_per_capita', 'articles_per_million']])


Geographic Regions by Total Coverage (Articles per Capita):
                  total_articles  population  articles_per_capita  \
region                                                              
Tuvalu                         1         0.0                  inf   
Monaco                        10         0.0                  inf   
Marshall Islands              13         1.3         1.000000e-05   
Grenada                        2         0.2         1.000000e-05   
Tonga                         10         1.0         1.000000e-05   
...                          ...         ...                  ...   
Nigeria                      245     54831.0         4.468275e-09   
Pakistan                      97     23328.5         4.158004e-09   
Indonesia                    114     31771.8         3.588088e-09   
China                         16     22580.8         7.085666e-10   
India                        151    215718.6         6.999860e-10   

                  articles_per_million  


# Geographic Regions by High-Quality Coverage (High-Quality Articles per Capita)

In [109]:
regions_by_high_quality = region_stats.sort_values('high_quality_articles_per_capita', ascending=False)

print("\nGeographic Regions by High-Quality Coverage (High-Quality Articles per Capita):")
print(regions_by_high_quality[['high_quality_articles', 'population', 'high_quality_articles_per_capita', 'high_quality_articles_per_million']])


Geographic Regions by High-Quality Coverage (High-Quality Articles per Capita):
            high_quality_articles  population  \
region                                          
Montenegro                      3        22.8   
Luxembourg                      2        18.9   
Gabon                           1        12.0   
Kosovo                          4        49.3   
Latvia                          1        13.3   
...                           ...         ...   
Honduras                        0       174.6   
Haiti                           0       394.4   
Zimbabwe                        0      1152.3   
Monaco                          0         0.0   
Tuvalu                          0         0.0   

            high_quality_articles_per_capita  \
region                                         
Montenegro                      1.315789e-07   
Luxembourg                      1.058201e-07   
Gabon                           8.333333e-08   
Kosovo                          8.113590e