In [41]:
import pandas as pd

# Load the politicians data (one row per politician: name, url, country).
politicians_df = pd.read_csv('../data/politicians_by_country_AUG.2024.csv')

# Load the population data (one row per geography: Geography, Population).
population_df = pd.read_csv('../data/population_by_country_AUG.2024.csv')

# Preview the first few rows and the per-column non-null counts of each dataset.
# Only the LAST bare expression of a cell is displayed by Jupyter, so a bare
# `politicians_df.count()` in the middle of the cell is silently discarded.
# Print it explicitly so both datasets report their counts.
print(politicians_df.head())
print(politicians_df.count())

print(population_df.head())
# Last expression of the cell: shown as the cell's output.
population_df.count()


                   name                                                url  \
0        Majah Ha Adrif       https://en.wikipedia.org/wiki/Majah_Ha_Adrif   
1     Haroon al-Afghani    https://en.wikipedia.org/wiki/Haroon_al-Afghani   
2           Tayyab Agha          https://en.wikipedia.org/wiki/Tayyab_Agha   
3  Khadija Zahra Ahmadi  https://en.wikipedia.org/wiki/Khadija_Zahra_Ah...   
4        Aziza Ahmadyar       https://en.wikipedia.org/wiki/Aziza_Ahmadyar   

       country  
0  Afghanistan  
1  Afghanistan  
2  Afghanistan  
3  Afghanistan  
4  Afghanistan  
         Geography  Population
0            WORLD      8009.0
1           AFRICA      1453.0
2  NORTHERN AFRICA       256.0
3          Algeria        46.8
4            Egypt       105.2


Geography     233
Population    233
dtype: int64

In [42]:
# Rows that repeat an earlier row in every column.
duplicate_politicians_all = politicians_df.loc[politicians_df.duplicated()]
print(duplicate_politicians_all.shape[0])

# Rows whose name repeats an earlier row's name, and (checked independently)
# rows whose url repeats an earlier row's url.
duplicate_politicians_name = politicians_df.loc[politicians_df.duplicated(subset='name')]
duplicate_politicians_url = politicians_df.loc[politicians_df.duplicated(subset='url')]
print(duplicate_politicians_name.shape[0])
print(duplicate_politicians_url.shape[0])

# Interpretation: no row is repeated in full, yet names and urls do repeat, so the
# repeated entries differ from their first occurrence only in the country column.




0
44
44


In [43]:
# Rows that repeat an earlier row in every column.
duplicate_population_all = population_df.loc[population_df.duplicated()]
print(duplicate_population_all.shape[0])

# Rows whose Geography value already appeared in an earlier row.
duplicate_population_geo = population_df.loc[population_df.duplicated(subset='Geography')]
print(duplicate_population_geo.shape[0])


0
0


In [46]:
# 44 politicians appear under multiple countries (two or more). According to the
# author's notes this assignment comes from the Wikipedia API, based either on
# their nationalities or on the next country they served, so it makes sense to
# count them toward every country their name appears under.

# Still, keep a copy of the rows flagged as duplicates: stack the name-based and
# url-based duplicate rows, collapse rows flagged by both checks, and save to CSV.
combined_duplicates = pd.concat(
    (duplicate_politicians_name, duplicate_politicians_url)
).drop_duplicates()
combined_duplicates.to_csv('combined_duplicates_politicians.csv', index=False)



In [49]:
import pandas as pd

# Region/aggregate rows (e.g. "WORLD", "AFRICA") are written in all caps in the
# Geography column, while country rows are not.
# Build the flag as a standalone boolean Series rather than adding a helper column
# to population_df: the shared frame stays unchanged, so later cells that inspect
# population_df (e.g. the missing-values check) see the original columns and this
# cell gives the same result every time it is re-run.
is_region_mask = population_df['Geography'].str.isupper()

# errors='ignore' makes the drop a no-op when population_df has no 'is_region'
# column, and removes it if another cell already added one.
df_region = population_df[is_region_mask].drop(columns=['is_region'], errors='ignore').copy()
df_country = population_df[~is_region_mask].drop(columns=['is_region'], errors='ignore').copy()

# Per-column non-null counts of the country-level rows.
df_country.count()
# Optional export of the two subsets:
# df_region.to_csv('population_by_region.csv', index=False)
# df_country.to_csv('population_by_country.csv', index=False)



Geography     209
Population    209
dtype: int64

In [9]:
# Number of missing (NaN) entries per column in the politicians data.
missing_values_politicians = politicians_df.isna().sum()
print(missing_values_politicians)

# Number of missing (NaN) entries per column in the population data.
missing_values_population = population_df.isna().sum()
print(missing_values_population)



name       0
url        0
country    0
dtype: int64
Geography     0
Population    0
dtype: int64


In [None]:
# Step 1: Clean the Politicians Data
# Drop only rows that are identical in every column. Rows that share a name and
# URL but list different countries are kept on purpose: this notebook's duplicate
# analysis found 44 such rows and concluded those politicians belong to every
# country they are listed under. Dropping on ['name', 'url'] would silently
# discard them (and their country assignment).
# Reassignment is used instead of inplace=True so the step is explicit and gives
# the same result when the cell is re-run.
politicians_df = politicians_df.drop_duplicates()

# Verify if there are any missing values in key columns (name, url, country)
missing_values = politicians_df.isnull().sum()
print("\nMissing Values in Politicians Dataset:")
print(missing_values)

# Step 2: Clean the Population Data
# All-caps Geography values indicate regions, so flag them and keep them separate
# from the country-level rows for now. (Vectorised .str.isupper() replaces the
# per-row lambda; assigning the column is idempotent across re-runs.)
population_df['is_region'] = population_df['Geography'].str.isupper()

# Separate country-level data and region-level data
country_population_df = population_df[~population_df['is_region']].copy()
region_population_df = population_df[population_df['is_region']].copy()

# Strip any leading/trailing spaces from the 'Geography' column so that names
# compare cleanly against the politicians' country names below
country_population_df['Geography'] = country_population_df['Geography'].str.strip()

# Convert 'Population' column to numeric; values that cannot be parsed become NaN
country_population_df['Population'] = pd.to_numeric(country_population_df['Population'], errors='coerce')

# Check for missing values in the population data
missing_population = country_population_df.isnull().sum()
print("\nMissing Values in Population Dataset (Country Level):")
print(missing_population)

# Document inconsistencies in both datasets
# Politicians dataset: rows with a missing country, URL, or name are removed
cleaned_politicians_df = politicians_df.dropna(subset=['country', 'name', 'url'])

# Population dataset: Missing population values can be investigated or flagged
# For now, we'll drop countries without population data for simplicity
cleaned_population_df = country_population_df.dropna(subset=['Population'])

# Step 3: Handle Data Inconsistencies
# There might be countries in the politicians dataset that do not appear in the
# population data and vice versa, so compare the two sets of country names.

politician_countries = cleaned_politicians_df['country'].unique()
population_countries = cleaned_population_df['Geography'].unique()

# Find missing countries in either dataset (set difference in each direction)
missing_in_population = set(politician_countries) - set(population_countries)
missing_in_politicians = set(population_countries) - set(politician_countries)

print("\nCountries in Politicians Data but Missing in Population Data:")
print(missing_in_population)

print("\nCountries in Population Data but Missing in Politicians Data:")
print(missing_in_politicians)

# Document how to handle these inconsistencies
# Option 1: Drop entries that don't match across datasets
# Option 2: Keep unmatched countries and investigate them separately
