In [49]:
import sys
import warnings

# NOTE(review): machine-specific absolute path to a virtualenv; this cell will
# not find the third-party packages below on another machine unless they are
# installed in the active kernel. Guarded so that re-running the cell does not
# keep appending the same entry to sys.path.
_venv_site_packages = "/home/lsys/pwned_pols/venv/lib/python3.10/site-packages"
if _venv_site_packages not in sys.path:
    sys.path.append(_venv_site_packages)

# Third-party imports must come after the sys.path tweak above.
import pandas as pd
import pycountry

# Imported for its side effect only: the DataFrame methods used further down
# (clean_names, rename_column, select_columns, reorder_columns) are not part
# of pandas itself.
import janitor
from everypolitician import EveryPolitician
from tqdm.notebook import tqdm

# Lookup table: ISO 3166-1 alpha-2 code -> alpha-3 code.
iso2_iso3 = {country.alpha_2: country.alpha_3 for country in pycountry.countries}

# Entry point used below to enumerate countries and their legislatures.
ep = EveryPolitician()

# Silences every warning for the rest of the session, including library
# deprecation notices.
warnings.filterwarnings("ignore")

In [34]:
# Walk country -> legislature -> legislative period, download each period's
# CSV, and collect one summary row per period in `records`.
records = []
for country in tqdm(ep.countries()):
    country_data = country.country_data

    for legislature in country_data["legislatures"]:
        # Legislature-level fields are shared by all of its periods, so they
        # are read once here rather than inside the inner loop.
        legislature_name = legislature["name"]
        ltype = legislature["type"]
        person_count = legislature["person_count"]
        lastmod = legislature["lastmod"]

        for term in legislature["legislative_periods"]:
            term_name, csv_url, leg_start_date = (
                term["name"],
                term["csv_url"],
                term["start_date"],
            )

            # One HTTP fetch per period; only the row count and the number of
            # distinct e-mail addresses are kept from the downloaded table.
            term_data = pd.read_csv(csv_url)
            n_rows = len(term_data)
            n_unique_emails = term_data["email"].nunique()

            # Key order here determines the column order of the frame that is
            # later built from `records`.
            row = {
                "country": country.slug,
                "cc": country.code,
                "legislature": legislature_name,
                "term": term_name,
                "row_count": n_rows,
                "n_unique_emails": n_unique_emails,
                "leg_start_date": leg_start_date,
                "person_count_legistype": person_count,
                "ltype": ltype,
                "lastmod": lastmod,
                "url": csv_url,
            }
            records.append(row)

  0%|          | 0/233 [00:00<?, ?it/s]

In [95]:
# Build the per-term summary table from `records` (one row per country /
# legislature / term), attach the 2024 population, derive year columns,
# sort by population, then persist and display the result.
df = (
    pd.DataFrame(records)
    # ==================================================
    # Get popsize
    # Country codes that are missing from `iso2_iso3` map to NaN.
    .assign(cc3=lambda df_: df_["cc"].map(iso2_iso3))
    .merge(
        (
            pd.read_csv("../data/popsize.csv")
            .clean_names()
            # Rows without a country code cannot serve as a lookup key.
            .dropna(subset="cc3")
            .rename_column("2024_[yr2024]", "pop2024")
            # '..' rows are removed before the float cast below
            # (presumably the source file's missing-value marker).
            .query("pop2024!='..'")
            .select_columns(["cc3", "pop2024"])
            .astype({"pop2024": float})
        ),
        how="left",  # keep every term even when no population is found
        on="cc3",
        validate="m:1",  # fail loudly if the lookup has duplicate cc3 keys
    )
    # ==================================================
    .assign(
        # `lastmod` holds epoch seconds stored as strings. Cast to a numeric
        # dtype first: pandas deprecated parsing strings together with
        # `unit=` and will treat them as datetime strings in the future.
        lastmod_year=lambda df_: pd.to_datetime(
            pd.to_numeric(df_["lastmod"]), unit="s"
        ).dt.year,
        # Unparseable start dates become NaT instead of raising.
        leg_start_date=lambda df_: pd.to_datetime(
            df_["leg_start_date"], format="mixed", errors="coerce"
        ),
        # Relies on assign() evaluating keyword arguments in order, so this
        # sees the datetime column produced just above.
        leg_start_year=lambda df_: df_["leg_start_date"].dt.year,
    )
    # ==================================================
    .reorder_columns(
        [
            "country",
            "legislature",
            "term",
            "leg_start_year",
            "row_count",
            "n_unique_emails",
            "person_count_legistype",
            "ltype",
            "url",
        ]
    )
    # Most populous countries first; rows without a population sort last.
    .sort_values("pop2024", ascending=False, ignore_index=True)
)
df.to_csv("../data/everypol-walkthrough.csv", index=False)
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 757 entries, 0 to 756
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   country                 757 non-null    object        
 1   legislature             757 non-null    object        
 2   term                    757 non-null    object        
 3   leg_start_year          757 non-null    int32         
 4   row_count               757 non-null    int64         
 5   n_unique_emails         757 non-null    int64         
 6   person_count_legistype  757 non-null    int64         
 7   ltype                   757 non-null    object        
 8   url                     757 non-null    object        
 9   cc                      757 non-null    object        
 10  leg_start_date          757 non-null    datetime64[ns]
 11  lastmod                 757 non-null    object        
 12  cc3                     722 non-null    object    

Unnamed: 0,country,legislature,term,leg_start_year,row_count,n_unique_emails,person_count_legistype,ltype,url,cc,leg_start_date,lastmod,cc3,pop2024,lastmod_year
0,India,Lok Sabha,16th Lok Sabha,2014,541,511,541,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IN,2014-05-26,1557812040,IND,1.441720e+09,2019
1,China,National People’s Congress,12th National People’s Congress,2013,2987,0,2956,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,CN,2013-03-05,1554305411,CHN,1.410132e+09,2019
2,United-States-of-America,House of Representatives,113th Congress,2013,449,0,1739,lower house,https://cdn.rawgit.com/everypolitician/everypo...,US,2013-01-06,1557715874,USA,3.366107e+08,2019
3,United-States-of-America,House of Representatives,114th Congress,2015,447,0,1739,lower house,https://cdn.rawgit.com/everypolitician/everypo...,US,2015-01-06,1557715874,USA,3.366107e+08,2019
4,United-States-of-America,House of Representatives,108th Congress,2003,444,0,1739,lower house,https://cdn.rawgit.com/everypolitician/everypo...,US,2003-01-06,1557715874,USA,3.366107e+08,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
752,Wallis-and-Futuna,Territorial Assembly,2017–,2017,20,0,29,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,WF,2017-03-26,1553649605,WLF,,2019
753,Wallis-and-Futuna,Territorial Assembly,2012–2017,2012,20,0,29,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,WF,2012-03-25,1553649605,WLF,,2019
754,Aland,Lagting,2015–2019,2015,30,30,60,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,AX,2015-11-02,1554633432,ALA,,2019
755,Aland,Lagting,2011–2015,2011,30,14,60,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,AX,2011-01-01,1554633432,ALA,,2019


In [101]:
# Raise the display cap so the full 100-row selection below is rendered.
pd.set_option("display.max_rows", 300)

# First 100 terms, most populous countries first, that list more than
# 10 distinct e-mail addresses; re-indexed from 0 for display.
(
    df.sort_values(by="pop2024", ascending=False, ignore_index=True)
    .loc[lambda frame: frame["n_unique_emails"] > 10]
    .iloc[:100]
    .reset_index(drop=True)
)

Unnamed: 0,country,legislature,term,leg_start_year,row_count,n_unique_emails,person_count_legistype,ltype,url,cc,leg_start_date,lastmod,cc3,pop2024,lastmod_year
0,India,Lok Sabha,16th Lok Sabha,2014,541,511,541,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IN,2014-05-26,1557812040,IND,1441720000.0,2019
1,Nigeria,House of Representatives,8th National Assembly of Nigeria,2015,370,67,370,lower house,https://cdn.rawgit.com/everypolitician/everypo...,NG,2015-06-09,1557729811,NGA,229152200.0,2019
2,Nigeria,Senate,8th National Assembly of Nigeria,2015,115,73,115,upper house,https://cdn.rawgit.com/everypolitician/everypo...,NG,2015-06-09,1557247803,NGA,229152200.0,2019
3,Iran,Majles,10th Assembly,2016,290,139,508,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IR,2016-05-28,1554352744,IRN,89809780.0,2019
4,Iran,Majles,9th Assembly,2012,290,52,508,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IR,2012-05-27,1554352744,IRN,89809780.0,2019
5,Tanzania,National Assembly,8th Parliament of Tanzania,2000,282,29,906,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,TZ,2000-10-29,1557208331,TZA,69419070.0,2019
6,Tanzania,National Assembly,10th Parliament of Tanzania,2010,350,147,906,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,TZ,2010-10-31,1557208331,TZA,69419070.0,2019
7,Tanzania,National Assembly,11th Parliament of Tanzania,2015,410,405,906,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,TZ,2015-11-17,1557208331,TZA,69419070.0,2019
8,Tanzania,National Assembly,9th Parliament of Tanzania,2005,321,71,906,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,TZ,2005-10-30,1557208331,TZA,69419070.0,2019
9,UK,House of Commons,57th Parliament of the United Kingdom,2017,675,646,1437,lower house,https://cdn.rawgit.com/everypolitician/everypo...,GB,2017-06-09,1557814781,GBR,68556800.0,2019


In [97]:
# Number of distinct countries among the first 50 terms (in df's current row
# order) that list more than 10 distinct e-mail addresses.
df.loc[df["n_unique_emails"] > 10, "country"].head(50).nunique()

21

In [98]:
# The distinct countries, in order of first appearance, among the first 50
# terms (in df's current row order) with more than 10 distinct e-mail addresses.
df.loc[df["n_unique_emails"] > 10, "country"].head(50).unique()

array(['India', 'Nigeria', 'Iran', 'Tanzania', 'UK', 'South-Africa',
       'Italy', 'Kenya', 'Colombia', 'South-Korea', 'Uganda', 'Canada',
       'Ghana', 'Nepal', 'Cameroon', 'Australia', 'Burkina-Faso',
       'Sri-Lanka', 'Zambia', 'Romania', 'Netherlands'], dtype=object)