## Notebook Description

This notebook leverages the data product `wp_politicians_by_country.csv` to create the six analysis products as described in [Step 5](https://docs.google.com/document/d/12Y4lPd5ORyK3s1vv-MQgF7-bYDpbpmKFKvSUgPmiLSs/edit?tab=t.0) of HW 2 for DATA 512.

Each code cell is dedicated to a single analysis product; read the docstring at the top of a cell to determine which product it belongs to.

### Notes & Assumptions
* The population of each country is expressed in millions. Some countries are listed with a population of 0.0. In reality their population is greater than zero, but because the value is expressed in millions it appears in the data as 0.0. For the purposes of all analyses below, any country with a population of 0.0 is excluded.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Constants used in Analytic 3 & 4 below.
# HIGH_QUALITY lists the `article_quality` labels that count as "high quality".
# NOTE(review): presumably FA = Featured Article and GA = Good Article -- confirm.
HIGH_QUALITY = ['FA', 'GA']

In [3]:
# Load data into Pandas DataFrame from csv (one row per article)
df = pd.read_csv('wp_politicians_by_country.csv')

# Get countries from data, in order of first appearance
countries = df['country'].unique()

# Compute the per-country lookups once, instead of re-counting and re-filtering
# the whole DataFrame for every country.
article_counts = df['country'].value_counts()
# Population taken from the first row seen for each country.
first_population = df.drop_duplicates(subset='country').set_index('country')['population']

# Create list of tuples, each tuple with schema (country, number of articles, country population).
country_population_list = [
    (country, article_counts.loc[country], first_population.loc[country])
    for country in countries
]

# Remove any countries with population 0.0 (population is expressed in millions,
# so very small countries appear as 0.0 and cannot be used as a denominator).
country_population_list = [t for t in country_population_list if t[2] > 0]

In [4]:
"""
Analytic 1: Top 10 countries by coverage
"""
articles_per_capita = np.array([t[1]/t[2] for t in country_population_list])
indices = np.argsort(articles_per_capita)
top_countries = [country_population_list[index] for index in indices[-10:]]
analytic_dict_top = {'country': [tc[0] for tc in top_countries], 'articles_per_captia': [articles_per_capita[index] for index in indices[-10:]]}
analytic_top_df = pd.DataFrame(analytic_dict_top)
analytic_top_df.sort_index(ascending=False)

Unnamed: 0,country,articles_per_captia
9,Antigua and Barbuda,330.0
8,Barbados,83.333333
7,Seychelles,60.0
6,Montenegro,60.0
5,Bhutan,55.0
4,Maldives,55.0
3,St. Vincent and the Grenadines,40.0
2,Luxembourg,38.571429
1,St. Kitts and Nevis,30.0
0,Bahrain,26.875


In [6]:
"""
Analytic 2: Bottom 10 countries by coverage
"""
bottom_countries = [country_population_list[index] for index in indices[:10]]
analytic_dict = {'country': [tc[0] for tc in bottom_countries], 'articles_per_captia': [articles_per_capita[index] for index in indices[:10]]}
analytic_df = pd.DataFrame(analytic_dict)
analytic_df.reset_index(drop=True).sort_index(ascending=True)

Unnamed: 0,country,articles_per_captia
0,China,0.011337
1,Ghana,0.087977
2,India,0.105698
3,Vanuatu,0.10989
4,Saudi Arabia,0.135501
5,Tuvalu,0.140845
6,Zambia,0.148515
7,Norway,0.181818
8,Israel,0.204082
9,Egypt,0.304183


In [7]:
"""
Analytic 3: Top 10 countries by high quality
"""
analytic_list = []
for t in country_population_list:
    country = t[0]
    population = t[2]
    article_quality = df[df['country']==country]['article_quality'].tolist()
    article_quality = [aq for aq in article_quality if aq in HIGH_QUALITY]
    #analytic_dict['country'] = country
    #analytic_dict['article_quality_per_capita'] = len(article_quality)/population
    analytic_list.append(len(article_quality)/population)

indices = np.argsort(analytic_list)
sorted_countries = [country_population_list[index] for index in indices]

analytic_dict = {'country': [sc[0] for sc in sorted_countries], 'high_quality_articles_per_capita': [analytic_list[index] for index in indices]}
analytic_df = pd.DataFrame(analytic_dict)
analytic_df[-10:].reset_index(drop=True).sort_index(ascending=False)

Unnamed: 0,country,high_quality_articles_per_capita
9,Montenegro,5.0
8,Luxembourg,2.857143
7,Albania,2.592593
6,Kosovo,2.352941
5,Maldives,1.666667
4,Lithuania,1.37931
3,Croatia,1.315789
2,Guyana,1.25
1,Palestinian Territory,1.090909
0,Slovenia,0.952381


In [8]:
"""
Analytic 4: Bottom 10 countries by high quality
NOTE: Any country with a `high_quality_articles_per_capita` value that equals 0 was removed.
"""
analytic_df = analytic_df[analytic_df['high_quality_articles_per_capita']!=0]
analytic_df[:10].reset_index(drop=True).sort_index(ascending=True)

Unnamed: 0,country,high_quality_articles_per_capita
0,Bangladesh,0.005764
1,Egypt,0.009506
2,Ethiopia,0.01581
3,Japan,0.016064
4,Pakistan,0.016632
5,Colombia,0.019157
6,Congo DR,0.01955
7,Vietnam,0.020222
8,Uganda,0.020576
9,Algeria,0.021368


In [9]:
"""
Analytic 5: Geographic regions by total coverage
"""
# Compute region populations
region_populations = df.groupby('region')['population'].sum().to_dict()

# Compute number of articles per region
region_articles = df.groupby('region').size().to_dict()

# Compute number of articles per capita for each region
region_articles_per_capita_dict = {key: region_articles[key]/region_populations[key] for key in region_articles.keys()}

# Create DataFrame and show in notebook
keys = region_articles_per_capita_dict.keys()
region_articles_per_capita_df = pd.DataFrame({'country': [key for key in keys], 
                                              'total_articles_per_capita': [region_articles_per_capita_dict[key] for key in keys]})

region_articles_per_capita_df.sort_values(by='total_articles_per_capita', ascending=False)

Unnamed: 0,country,total_articles_per_capita
8,NORTHERN EUROPE,0.164358
0,CARIBBEAN,0.155315
1,CENTRAL AMERICA,0.135974
2,CENTRAL ASIA,0.051959
13,SOUTHERN EUROPE,0.045693
15,WESTERN ASIA,0.045586
4,EASTERN AFRICA,0.026658
16,WESTERN EUROPE,0.026212
7,NORTHERN AFRICA,0.024807
5,EASTERN EUROPE,0.02441


In [10]:
"""
Analytic 6: Geographic regions by high quality coverage
"""
# Get articles grouped by region
region_articles = df.groupby('region')
region_articles

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x106f93990>