In [1]:
import pandas as pd
import janitor
import numpy as np
from IPython.display import display

import sys

sys.path.append("/home/lsys/pwned_pols/venv/lib/python3.10/site-packages")

from utilities import (
    clean_dedupe_email_column,
    pandas_to_tex,
    #     clean_email_column_no_dedupe,
    classify_comm_gov_email,
)
from utilities import LIST_SERIOUS_DATACLASSES, DELINQUENTS

pd.set_option("display.max_columns", 150)
pd.set_option("display.max_rows", 150)
pd.set_option("display.max_colwidth", None)

import warnings

warnings.filterwarnings("ignore")

## Prep EP data

In [2]:
df_ep_emails = (
    pd.read_csv(
        "../data/everypol/everypol_combined_legislature_data.csv", low_memory=False
    )
    .sort_values(["cc3", "leg_start_year", "email"])
    .pipe(lambda df_: clean_dedupe_email_column(df_))
    .drop_duplicates(subset=["email"], keep="first", ignore_index=True)
    # ================================================================
    # Fix missing cc3 for Wales/Scotland
    # Wales, Scotland = GBR
    .assign(
        cc3=lambda df_: np.where(
            df_["cc"] == "GB-SCT",
            "GBR",
            np.where(df_["cc"] == "GB-WLS", "GBR", df_["cc3"]),
        )
    )
    # ================================================================
    # Fix missing pop for Wales/Scotland
    .assign(
        pop2024=lambda df_: np.where(
            df_["cc"] == "GB-SCT",
            "GBR",
            np.where(df_["cc"] == "GB-WLS", 68556800, df_["pop2024"]),
        )
    )
    # ================================================================
    # Fix ltype for Namibia
    .assign(ltype=lambda df_: np.where((df_["cc3"]=="NAM") & (df_["legislature"]=="National Assembly"), "lower house", df_["ltype"]))
    # ================================================================
    # Fix ltype for India to bicameral
    .assign(
        ltype=lambda df_: np.where(
            df_["cc3"] == "IND", "lower house", df_["ltype"]
        )
    )
    # ================================================================
    # Get #emails per country (cc3)
    .assign(nemail_cc3=lambda df_: df_.groupby("cc3")["email"].transform("nunique"))
    # ================================================================
    .reorder_columns(
        [
            "email",
            "cc3",
            "country",
            "ltype",
            "legislature",
            "chamber",
            "leg_start_year",
            "nemail_cc3",
            "gender",
        ]
    )
)

assert (df_ep_emails["nemail_cc3"] >= 30).all()
display(df_ep_emails.head(3))
df_ep_emails.info()

Unnamed: 0,email,cc3,country,ltype,legislature,chamber,leg_start_year,nemail_cc3,gender,id,name,sort_name,twitter,facebook,group,group_id,area_id,area,term_x,start_date,end_date,image,wikidata,wikidata_group,wikidata_area,row_id,term_y,row_count,n_unique_emails,person_count_legistype,url,cc,leg_start_date,lastmod,pop2024,lastmod_year
0,albana.vokshi@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,female,c8bd71fc-4815-40b5-a5c4-bd359e3b0cef,Albana Vokshi,VOKSHI ALBANA,,,PD,c26946f7-8c5e-4474-a85e-ee5342cf4006,durrës_county,Durrës County,7,,,https://www.parlament.al/wp-content/uploads/2015/12/Albana-Vokshi-PD.jpg,Q4709025,Q845743,,490,VII Pluralist Legislature,140,31,292,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1557734928,2740502.0,2019
1,aldo.bumci@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,male,589ba883-b893-40ed-95f8-69b10f0497d9,Aldo Bumçi,BUMÇI ALDO,,,PD,c26946f7-8c5e-4474-a85e-ee5342cf4006,tirana_county,Tirana County,7,,,https://www.parlament.al/wp-content/uploads/2015/12/Aldo-Bumci-PD.jpg,Q2832310,Q845743,,490,VII Pluralist Legislature,140,31,292,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1557734928,2740502.0,2019
2,bashkim.fino@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,male,9d0181c7-7ebe-4b16-800a-6ea28baf22f5,Bashkim Fino,FINO BASHKIM,,,PS,0c83d4f1-9e86-48f2-95ee-9c3d158d1141,fier_county,Fier County,7,,,https://www.parlament.al/wp-content/uploads/2015/12/Bashkim-Fino-PS.jpg,Q809978,Q642882,,490,VII Pluralist Legislature,140,31,292,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1557734928,2740502.0,2019


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8372 entries, 0 to 8371
Data columns (total 36 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   email                   8372 non-null   object
 1   cc3                     8372 non-null   object
 2   country                 8372 non-null   object
 3   ltype                   8372 non-null   object
 4   legislature             8372 non-null   object
 5   chamber                 8372 non-null   object
 6   leg_start_year          8372 non-null   int64 
 7   nemail_cc3              8372 non-null   int64 
 8   gender                  7188 non-null   object
 9   id                      8372 non-null   object
 10  name                    8372 non-null   object
 11  sort_name               8372 non-null   object
 12  twitter                 2375 non-null   object
 13  facebook                1568 non-null   object
 14  group                   8371 non-null   object
 15  grou

In [3]:
# # countries with incomplete gender coverage
# for c in df_ep_emails["cc3"].unique().tolist():
#     _df = df_ep_emails.query(f"cc3=='{c}'")
#     n_gender = len(_df.dropna(subset=["gender"]))
#     n_country = len(_df)
#     if n_gender < n_country:
#         name = _df.reset_index().loc[0, "country"]
#         print(f"{name} ({c}):\n {n_gender}, {n_country}")

In [4]:
set_edomain = set()
for _, row in df_ep_emails.iterrows():
    try:
        edomain = row["email"].split("@")[1]
        set_edomain.add(edomain)
    except IndexError:
        print(_)
#         print(row["email"])

len(set_edomain)

354

In [5]:
# for cc in df_ep_emails["cc3"].unique().tolist():
#     _df = df_ep_emails.query(f"cc3=='{cc}'")
#     _set_edomain = set()
#     for _, row in _df.iterrows():
#         edomain = row["email"].split("@")[1]
#         _set_edomain.add(edomain)

#     print(cc)
#     print(len(_set_edomain))
#     print(_set_edomain)

### Basic numbers

In [6]:
df_ep_emails["nemail_cc3"].describe()

count    8372.000000
mean      283.002867
std       222.489537
min        31.000000
25%       136.000000
50%       208.000000
75%       403.000000
max       832.000000
Name: nemail_cc3, dtype: float64

In [7]:
df_ep_emails["ltype"].unique()

array(['unicameral legislature', 'lower house', 'upper house'],
      dtype=object)

In [8]:
df_ep_emails["legislature"].unique()

array(['Kuvendi', 'Consell General', 'National Assembly',
       'House of Representatives', 'Senate', 'Chamber of Representatives',
       'Parliament', 'House of Commons', 'Assemblée Nationale',
       'Cámara de Representantes', 'Folketing', 'Riigikogu', 'Eduskunta',
       'Parliament of Georgia', 'States', 'Hellenic Parliament',
       'Inatsisartut', 'Congress', 'Legislative Council', 'Országgyűlés',
       'Lok Sabha', 'Majles', 'Chamber of Deputies', 'Parlament',
       'Sobranie', 'National Council', 'Tweede Kamer',
       'Constituent Assembly', 'New Zealand Parliament',
       'National Parliament', 'Assembly', 'House of Assembly',
       'Scottish Parliament', 'National Assembly for Wales'], dtype=object)

In [9]:
df_ep_emails["legislature"].nunique()

34

In [10]:
df_ep_emails["email"].nunique()

8372

In [11]:
df_ep_emails["cc3"].nunique()

55

In [12]:
df_ep_emails.groupby(["cc3", "legislature"]).ngroups

61

## Prep scraped data

In [13]:
df_scraped_emails = (
    pd.read_csv("../data/scraped_pol_combined_legislature_data.csv")
    .sort_values(["cc3", "email"])
    .pipe(lambda df_: clean_dedupe_email_column(df_))
    .drop_duplicates(subset=["email"], keep="first", ignore_index=True)
    # ================================================================
    # Get popsize
    .merge(
        (
            pd.read_csv("../data/popsize.csv")
            .dropna(subset=["cc3"])
            .rename_column("2024 [YR2024]", "pop2024")
        ),
        how="left",
        on="cc3",
        validate="m:1",
    )
    # ================================================================
    # Get #emails per country (cc3)
    .assign(nemail_cc3=lambda df_: df_.groupby("cc3")["email"].transform("nunique"))
    # ================================================================
    .reorder_columns(
        [
            "email",
            "cc3",
            "country",
            "ltype",
            "legislature",
            "chamber",
            #             "year",
            "nemail_cc3",
        ]
    )
    #     # ================================================================
    #     # Renaming fields to harmonise with EP
    .rename_column("leg_start_year", "leg_start_year_sg")
    .assign(
        leg_start_year=lambda df_: np.where(
            df_["cc3"] == "SGP", df_["leg_start_year_sg"], 2025
        ).astype(int)
    )
)
assert (df_scraped_emails["nemail_cc3"] >= 30).all()
display(df_scraped_emails.head(3))
df_scraped_emails.info()

Unnamed: 0,email,cc3,country,ltype,legislature,chamber,nemail_cc3,name,party,title,address,private_phone,mobile_phone,work_phone,minister_phone,fax,mpsno,initial,firstname,lastname,gender,partyfname,partysname,statename,constname,profession,presentfaddr,presentladdr,delhiphone,permanentfaddr,permanentladdr,personalphone,lastloksabha,lsexpr,age,phone,noofterms,status,imageurl,profileurl,dob,numberofsons,numberofdaughters,qualification,freedom,profession2,categorycode,currentpagenumber,perpagesize,totalelements,totalpages,source_file,mplastfirstname,mpfirstlastname,maritalstatus,createdat,updatedat,email_fix,rank,constituency,leg_start_year_sg,bloque,apellido,nombre,provincia,partido_o_alianza_por_el_que_ingreso,designacion_legal,cese_legal,designacion_real,cese_real,telefono,facebook,twitter,instagram,youtube,nome_parlamentar,partido,uf,titularidade,mandato,telefones,dtnasc,chefe_gab,endereco,state,district,id,parliament_address,parliament_number,social_media,region_x,contact,sr_no,photo,serial_no_,tel_no_r_,constituency_name,permanent_address,mobile,tele_no_res_no_,ac_no,candidate,Country Name,Series Name,Series Code,pop2024,region_y,EU,politician,civil_servants,leg_start_year
0,alejandra.vigo@senado.gob.ar,ARG,Argentina,upper house,Parliament,,71,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,UNIDAD FEDERAL,VIGO,ALEJANDRA MARÍA,CÓRDOBA,HACEMOS POR CÓRDOBA,2021-12-10,2027-12-09,2021-12-10,Sin Datos,1441 / 1444 / 1456,https://www.facebook.com/VigoAlejandra,https://twitter.com/@alevigo,https://www.instagram.com/@vigo_alejandra,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Argentina,"Population, total",SP.POP.TOTL,46936024,,,,,2025
1,alfredo.deangeli@senado.gob.ar,ARG,Argentina,upper house,Parliament,,71,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,FRENTE PRO,DE ANGELI,ALFREDO LUIS,ENTRE RÍOS,JUNTOS POR EL CAMBIO,2019-12-10,2025-12-09,2019-12-10,Sin Datos,3580 / 81 / 82 / 84,,https://twitter.com/alfredodeangeli,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Argentina,"Population, total",SP.POP.TOTL,46936024,,,,,2025
2,alicia.kirchner@senado.gob.ar,ARG,Argentina,upper house,Parliament,,71,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,UNIDAD CIUDADANA,KIRCHNER,ALICIA MARGARITA ANTONIA,SANTA CRUZ,ALIANZA UNIÓN POR LA PATRIA,2023-12-10,2029-12-09,2023-12-10,Sin Datos,1389 / 1390,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Argentina,"Population, total",SP.POP.TOTL,46936024,,,,,2025


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4551 entries, 0 to 4550
Columns: 111 entries, email to leg_start_year
dtypes: float64(21), int64(2), object(88)
memory usage: 3.9+ MB


In [14]:
df_scraped_emails.groupby("cc3").size()

cc3
ARG      71
BRA      81
DNK     186
GRC     368
IND    3233
NGA      68
NOR     174
SGP     370
dtype: int64

### Basic numbers

In [15]:
df_scraped_emails["nemail_cc3"].describe()

count    4551.000000
mean     2374.359701
std      1346.464748
min        68.000000
25%       370.000000
50%      3233.000000
75%      3233.000000
max      3233.000000
Name: nemail_cc3, dtype: float64

In [16]:
df_scraped_emails["legislature"].unique()

array(['Parliament', 'Folketing', 'Lok Sabha', 'State Legislature',
       'Senate', 'Storting'], dtype=object)

In [17]:
df_scraped_emails["email"].nunique()

4551

In [18]:
df_scraped_emails["name"].nunique()

1494

In [19]:
df_scraped_emails["cc3"].nunique()

8

In [20]:
df_scraped_emails.groupby(["cc3", "legislature"]).ngroups

9

In [21]:
set_edomain = set()
for _, row in df_scraped_emails.iterrows():
    try:
        edomain = row["email"].split("@")[1]
        set_edomain.add(edomain)
    except IndexError:
        print(_)
#         print(row["email"])

len(set_edomain)

233

In [22]:
# for cc in df_scraped_emails["cc3"].unique().tolist():
#     _df = df_scraped_emails.query(f"cc3=='{cc}'")
#     _set_edomain = set()
#     for _, row in _df.iterrows():
#         edomain = row["email"].split("@")[1]
#         _set_edomain.add(edomain)

#     print(cc)
#     print(len(_set_edomain))
#     print(_set_edomain)

## Combine EP + Scraped

In [23]:
df_pol_emails = (
    pd.concat(
        [df_ep_emails.assign(source="ep"), df_scraped_emails.assign(source="scraped")],
        ignore_index=True,
    )
    .query("email not in @DELINQUENTS")
    .pipe(lambda df_: clean_dedupe_email_column(df_))
    .drop_duplicates(subset=["email"], keep="first", ignore_index=True)
    # ================================================================
    # Get indicator for years and chambers for table1
    .assign(
        years=lambda df_: df_.groupby("cc3")["leg_start_year"].transform(
            lambda x: ", ".join(map(str, sorted(x.unique())))
        ),
        chambers=lambda df_: df_.groupby("cc3")["ltype"]
        .transform(lambda x: ", ".join(map(str, sorted(x.unique()))))
        .replace("legislature", "", regex=True)
        .replace("house", "", regex=True)
        .str.strip()
        .str.title(),
        legislatures=lambda df_: df_.groupby("cc3")["legislature"].transform(
            lambda x: ", ".join(map(str, sorted(x.unique())))
        ),
    )
    # ================================================================
    # Get #emails per country (cc3)
    .assign(nemail_cc3=lambda df_: df_.groupby("cc3")["email"].transform("nunique"))
    .sort_values(["cc3", "leg_start_year", "email"], ignore_index=True)
)
# del df_ep_emails, df_scraped_emails

assert (df_pol_emails["nemail_cc3"] >= 30).all()
display(df_pol_emails.head(3))
df_pol_emails.info()

Unnamed: 0,email,cc3,country,ltype,legislature,chamber,leg_start_year,nemail_cc3,gender,id,name,sort_name,twitter,facebook,group,group_id,area_id,area,term_x,start_date,end_date,image,wikidata,wikidata_group,wikidata_area,row_id,term_y,row_count,n_unique_emails,person_count_legistype,url,cc,leg_start_date,lastmod,pop2024,lastmod_year,source,party,title,address,private_phone,mobile_phone,work_phone,minister_phone,fax,mpsno,initial,firstname,lastname,partyfname,partysname,statename,constname,profession,presentfaddr,presentladdr,delhiphone,permanentfaddr,permanentladdr,personalphone,lastloksabha,lsexpr,age,phone,noofterms,status,imageurl,profileurl,dob,numberofsons,numberofdaughters,qualification,freedom,profession2,categorycode,currentpagenumber,perpagesize,totalelements,totalpages,source_file,mplastfirstname,mpfirstlastname,maritalstatus,createdat,updatedat,email_fix,rank,constituency,leg_start_year_sg,bloque,apellido,nombre,provincia,partido_o_alianza_por_el_que_ingreso,designacion_legal,cese_legal,designacion_real,cese_real,telefono,instagram,youtube,nome_parlamentar,partido,uf,titularidade,mandato,telefones,dtnasc,chefe_gab,endereco,state,district,parliament_address,parliament_number,social_media,region_x,contact,sr_no,photo,serial_no_,tel_no_r_,constituency_name,permanent_address,mobile,tele_no_res_no_,ac_no,candidate,Country Name,Series Name,Series Code,region_y,EU,politician,civil_servants,years,chambers,legislatures
0,albana.vokshi@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,female,c8bd71fc-4815-40b5-a5c4-bd359e3b0cef,Albana Vokshi,VOKSHI ALBANA,,,PD,c26946f7-8c5e-4474-a85e-ee5342cf4006,durrës_county,Durrës County,7,,,https://www.parlament.al/wp-content/uploads/2015/12/Albana-Vokshi-PD.jpg,Q4709025,Q845743,,490.0,VII Pluralist Legislature,140.0,31.0,292.0,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1557735000.0,2740502.0,2019.0,ep,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"2009, 2013, 2017",Unicameral,Kuvendi
1,aldo.bumci@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,male,589ba883-b893-40ed-95f8-69b10f0497d9,Aldo Bumçi,BUMÇI ALDO,,,PD,c26946f7-8c5e-4474-a85e-ee5342cf4006,tirana_county,Tirana County,7,,,https://www.parlament.al/wp-content/uploads/2015/12/Aldo-Bumci-PD.jpg,Q2832310,Q845743,,490.0,VII Pluralist Legislature,140.0,31.0,292.0,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1557735000.0,2740502.0,2019.0,ep,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"2009, 2013, 2017",Unicameral,Kuvendi
2,bashkim.fino@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,male,9d0181c7-7ebe-4b16-800a-6ea28baf22f5,Bashkim Fino,FINO BASHKIM,,,PS,0c83d4f1-9e86-48f2-95ee-9c3d158d1141,fier_county,Fier County,7,,,https://www.parlament.al/wp-content/uploads/2015/12/Bashkim-Fino-PS.jpg,Q809978,Q642882,,490.0,VII Pluralist Legislature,140.0,31.0,292.0,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1557735000.0,2740502.0,2019.0,ep,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"2009, 2013, 2017",Unicameral,Kuvendi


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12384 entries, 0 to 12383
Columns: 137 entries, email to legislatures
dtypes: float64(26), int64(2), object(109)
memory usage: 12.9+ MB


In [24]:
df_pol_emails["nemail_cc3"].describe()

count    12384.000000
mean      1107.047965
std       1361.953022
min         31.000000
25%        158.000000
50%        368.000000
75%       3331.000000
max       3331.000000
Name: nemail_cc3, dtype: float64

In [25]:
df_pol_emails["email"].nunique()

12384

In [26]:
df_pol_emails["cc3"].nunique()

59

In [27]:
df_pol_emails.groupby(["cc3", "legislature"]).ngroups

67

In [28]:
set_edomains = set()
for cc in df_pol_emails["cc3"].unique().tolist():
    _df = df_pol_emails.query(f"cc3=='{cc}'")
    _set_edomain = set()
    for _, row in _df.iterrows():
        edomain = row["email"].split("@")[1]
        _set_edomain.add(edomain)
        set_edomains.add(edomain)

#     print(cc)
#     print(len(_set_edomain))
#     print(_set_edomain)

In [29]:
len(set_edomains)

530

## Table1

In [30]:
df_tab1 = (
    df_pol_emails.drop_duplicates(["cc3"], ignore_index=True)
    #     .filter(["cc3", "country", "nemail_cc3", "female_prop_cc3", "years", "chambers", "legislatures", "pop2024"])
    #     .assign(female_prop_cc3=lambda df_: (100 * df_["female_prop_cc3"]).round(1).astype(str) + "\\%")
    .filter(
        ["cc3", "country", "nemail_cc3", "years", "chambers", "legislatures", "pop2024"]
    )
    # =============================================================
    # Tidy pop
    .assign(
        pop2024=lambda df_: (df_["pop2024"].astype(float) / 1_000_000)
        .round(1)
        .apply(str)
    )
    # =============================================================
    # Tidy country names
    .replace("Bosnia-and-Herzegovina", "Bosnia")
    .assign(country=lambda df_: df_["country"].replace("-", " ", regex=True))
    # =============================================================
    # Tidy legislatures
    .replace(
        "House of Commons, National Assembly for Wales, Scottish Parliament",
        "Commons, Senedd, Scottish Parliament",
    )
    # Tidy year strings
    .replace("1997, 2001, 2005, 2007, 2010, 2011, 2015, 2016, 2017", "1997--2017")
    .replace("2004, 2007, 2010, 2013, 2016", "2004--2016")
    .replace("2001, 2005, 2007, 2011, 2015", "2001--2015")
    .replace("2004, 2007, 2009, 2012, 2015", "2004--2015")
    .replace("2001, 2005, 2007, 2011, 2015, 2025", "2001--2025")
    .replace("2008, 2011, 2014, 2017", "2008--2017")
    .replace("2006, 2010, 2012, 2016", "2006--2016")
    .replace("2001, 2006, 2011, 2015, 2021", "2001--2025")
    .replace("2004, 2007, 2009, 2012, 2015, 2025", "2004--2025")
    # =============================================================
    # Tidy chambers
    .replace("Lower ", "Lower", regex=True)
    .replace("Upper, Lower, Upper", "Lower, Upper", regex=True)
    # =============================================================
    .assign(ix=lambda df_: range(1, 1 + len(df_)))
    .reorder_columns(["ix"])
)
df_tab1

Unnamed: 0,ix,cc3,country,nemail_cc3,years,chambers,legislatures,pop2024
0,1,ALB,Albania,140,"2009, 2013, 2017",Unicameral,Kuvendi,2.7
1,2,AND,Andorra,31,2015,Unicameral,Consell General,0.1
2,3,ARG,Argentina,71,2025,Upper,Parliament,46.9
3,4,ARM,Armenia,119,2019,Unicameral,National Assembly,2.8
4,5,AUS,Australia,177,2004--2016,"Lower, Upper","House of Representatives, Senate",26.9
5,6,BEL,Belgium,149,2014,Lower,Chamber of Representatives,11.9
6,7,BGR,Bulgaria,205,"2013, 2014, 2017",Unicameral,National Assembly,6.4
7,8,BIH,Bosnia,42,2014,Lower,House of Representatives,3.2
8,9,BLR,Belarus,59,2016,Unicameral,House of Representatives,9.1
9,10,BMU,Bermuda,33,2017,Lower,Parliament,


In [31]:
df_tab1["nemail_cc3"].sum()

np.int64(12384)

In [32]:
df_tab1["pop2024"].astype(float).sum()

np.float64(2884.5)

In [33]:
# % coverage of global pop
100 * (df_tab1["pop2024"].astype(float).sum()) / 8000

np.float64(36.05625)

In [34]:
# % coverage of global pop in electoral regimes
# https://ourworldindata.org/less-democratic
100 * (df_tab1["pop2024"].astype(float).sum()) / (1050+1300+3500)

np.float64(49.30769230769231)

In [35]:
pandas_to_tex(
    df_tab1.replace("nan", "---"),
    "../tables/hibp_pooled_emailcoverage_summary.tex",
    index=False,
)

In [36]:
!cat ../tables/hibp_pooled_emailcoverage_summary.tex

\midrule
1 & ALB & Albania & 140 & 2009, 2013, 2017 & Unicameral & Kuvendi & 2.7 \\
2 & AND & Andorra & 31 & 2015 & Unicameral & Consell General & 0.1 \\
3 & ARG & Argentina & 71 & 2025 & Upper & Parliament & 46.9 \\
4 & ARM & Armenia & 119 & 2019 & Unicameral & National Assembly & 2.8 \\
5 & AUS & Australia & 177 & 2004--2016 & Lower, Upper & House of Representatives, Senate & 26.9 \\
6 & BEL & Belgium & 149 & 2014 & Lower & Chamber of Representatives & 11.9 \\
7 & BGR & Bulgaria & 205 & 2013, 2014, 2017 & Unicameral & National Assembly & 6.4 \\
8 & BIH & Bosnia & 42 & 2014 & Lower & House of Representatives & 3.2 \\
9 & BLR & Belarus & 59 & 2016 & Unicameral & House of Representatives & 9.1 \\
10 & BMU & Bermuda & 33 & 2017 & Lower & Parliament & --- \\
11 & BRA & Brazil & 81 & 2025 & Upper & Parliament & 217.6 \\
12 & BTN & Bhutan & 46 & 2013 & Lower & National Assembly & 0.8 \\
13 & CAN & Canada & 432 & 2011, 2015 & Lower, Upper & House of Commons, Senate & 40.4 \\
14

## Merge to HIBP

### EP data

In [37]:
df_ep_email_breach_expanded = (
    # All EP emails
    df_ep_emails.filter(
        [
            "email",
            "gender",
            "cc3",
            "country",
            "ltype",
            "legislature",
            "chamber",
            "leg_start_year",
            "nemail_cc3",
        ]
    )
    # ========================================================================
    # Merge to breached status
    .merge(
        (
            pd.read_csv("../data/everypol_hibp.csv")
            .clean_names()
            .rename_column("filename", "email")
            # needed to normalize strings (e.g., unicode/ASCII) -- do not remove
            .pipe(lambda df_: clean_dedupe_email_column(df_, dedup=False))
        ),
        how="left",
        on="email",
        validate="1:m",
    ).dropna(subset=["breach"])
    # ========================================================================
    # Merge to expanded breach info
    .merge(
        (
            pd.read_parquet("../data/breaches_01_2025_expanded.parquet")
            .remove_columns(
                [
                    "description",
                    "title",
                    "domain",
                    "breachdate",
                    "addeddate",
                    "modifieddate",
                    "logopath",
                    "timetopublic",
                    "dataclasses",
                ]
            )
            .rename_column("name", "breach")
        ),
        how="left",
        on="breach",
        validate="m:1",
    )
    # ========================================================================
    #     .query("present==True")
    .reset_index(drop=True)
)
df_ep_email_breach_expanded.head(3)

Unnamed: 0,email,gender,cc3,country,ltype,legislature,chamber,leg_start_year,nemail_cc3,breach,present,pwncount,isverified,isfabricated,issensitive,isretired,isspamlist,ismalware,issubscriptionfree,yearstopublic,n_dataclasses,Website activity,Sexual orientations,Eating habits,Ethnicities,Passport numbers,Buying preferences,Physical attributes,Living costs,Apps installed on devices,Browser user agent details,Delivery instructions,Nationalities,Social media profiles,Appointments,Deceased date,Places of birth,Education levels,School grades (class levels),Partial credit card data,Career levels,Purchases,Age groups,Payment methods,Cryptocurrency wallet addresses,Auth tokens,Spouses names,Avatars,Password hints,Salutations,Family members' names,Purchasing habits,Survey results,Ages,Device usage tracking data,Reward program balances,Time zones,Bank account numbers,Financial transactions,Fitness levels,Net worths,Company names,Payment histories,Partial phone numbers,Device serial numbers,Spoken languages,Drug habits,Recovery email addresses,Sexual fetishes,Government issued IDs,Vehicle identification numbers (VINs),Bios,Clothing sizes,Dates of birth,IMEI numbers,...,IMSI numbers,Profile photos,Astrological signs,Mnemonic phrases,Photos,HIV statuses,Private messages,Tattoo status,Social connections,Home ownership statuses,Homepage URLs,Charitable donations,Car ownership statuses,Partial dates of birth,Job titles,Login histories,Instant messenger identities,Employment statuses,Credit card CVV,Parenting plans,Employers,Smoking habits,Browsing histories,Loyalty program details,Security questions and answers,Credit cards,User statuses,Religions,Geographic locations,Telecommunications carrier,Account balances,Beauty ratings,Utility bills,Genders,Support tickets,Chat logs,Income levels,Biometric data,Taxation records,Occupations,Vehicle details,Professional skills,Address book contacts,Usernames,User website URLs,SMS messages,Customer interactions,Licence plates,Nicknames,Mothers maiden names,Job applications,Encrypted keys,Audio recordings,Warranty claims,Travel plans,Personal descriptions,IP addresses,Personal interests,Cellular network names,Work habits,Personal health data,Years of professional experience,Email addresses,Passwords,PINs,Travel habits,Email messages,MAC addresses,Comments,Physical addresses,Political donations,Driver's licenses,Family structure,Marital statuses,Credit status information
0,albana.vokshi@parlament.al,female,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,000webhost,False,14936670,True,False,False,False,False,False,False,0.654795,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0
1,albana.vokshi@parlament.al,female,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,123RF,False,8661578,True,False,False,False,False,False,False,0.652055,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0
2,albana.vokshi@parlament.al,female,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,2844Breaches,False,80115532,False,False,False,False,False,False,False,0.019178,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0


### Scraped data

In [38]:
df_scraped_email_breach_expanded = (
    df_scraped_emails
    .filter(["email", "cc3", "country", "leg_start_year", "nemail_cc3"])
    # ============================================================================
    # Merge to breached status
    .merge(
        (
            pd.read_csv("../data/scraped_pol_hibp.csv")
            .clean_names()
            .rename_column("filename", "email")
        ),
        how="left",
        on="email",
        validate="1:m",
    )
    #     .dropna(subset=["breach"])
    .assign(present=lambda df_: df_["present"].fillna(False))
    # ============================================================================
    # Merge to expanded breach info
    .merge(
        (
            pd.read_parquet("../data/breaches_01_2025_expanded.parquet")
            .remove_columns(
                [
                    "description",
                    "title",
                    "domain",
                    "breachdate",
                    "addeddate",
                    "modifieddate",
                    "logopath",
                    "timetopublic",
                    "dataclasses",
                ]
            )
            .rename_column("name", "breach")
        ),
        how="left",
        on="breach",
        validate="m:1",
    )
    # ============================================================================
    #     .query("present==True")
    .reset_index(drop=True)
)
df_scraped_email_breach_expanded.head()

Unnamed: 0,email,cc3,country,leg_start_year,nemail_cc3,breach,present,pwncount,isverified,isfabricated,issensitive,isretired,isspamlist,ismalware,issubscriptionfree,yearstopublic,n_dataclasses,Website activity,Sexual orientations,Eating habits,Ethnicities,Passport numbers,Buying preferences,Physical attributes,Living costs,Apps installed on devices,Browser user agent details,Delivery instructions,Nationalities,Social media profiles,Appointments,Deceased date,Places of birth,Education levels,School grades (class levels),Partial credit card data,Career levels,Purchases,Age groups,Payment methods,Cryptocurrency wallet addresses,Auth tokens,Spouses names,Avatars,Password hints,Salutations,Family members' names,Purchasing habits,Survey results,Ages,Device usage tracking data,Reward program balances,Time zones,Bank account numbers,Financial transactions,Fitness levels,Net worths,Company names,Payment histories,Partial phone numbers,Device serial numbers,Spoken languages,Drug habits,Recovery email addresses,Sexual fetishes,Government issued IDs,Vehicle identification numbers (VINs),Bios,Clothing sizes,Dates of birth,IMEI numbers,Device information,Historical passwords,Relationship statuses,Phone numbers,...,IMSI numbers,Profile photos,Astrological signs,Mnemonic phrases,Photos,HIV statuses,Private messages,Tattoo status,Social connections,Home ownership statuses,Homepage URLs,Charitable donations,Car ownership statuses,Partial dates of birth,Job titles,Login histories,Instant messenger identities,Employment statuses,Credit card CVV,Parenting plans,Employers,Smoking habits,Browsing histories,Loyalty program details,Security questions and answers,Credit cards,User statuses,Religions,Geographic locations,Telecommunications carrier,Account balances,Beauty ratings,Utility bills,Genders,Support tickets,Chat logs,Income levels,Biometric data,Taxation records,Occupations,Vehicle details,Professional skills,Address book contacts,Usernames,User website URLs,SMS messages,Customer interactions,Licence plates,Nicknames,Mothers maiden names,Job applications,Encrypted keys,Audio recordings,Warranty claims,Travel plans,Personal descriptions,IP addresses,Personal interests,Cellular network names,Work habits,Personal health data,Years of professional experience,Email addresses,Passwords,PINs,Travel habits,Email messages,MAC addresses,Comments,Physical addresses,Political donations,Driver's licenses,Family structure,Marital statuses,Credit status information
0,alejandra.vigo@senado.gob.ar,ARG,Argentina,2025,71,StealerLogsJan2025,False,71039833.0,True,False,False,False,False,False,False,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,alfredo.deangeli@senado.gob.ar,ARG,Argentina,2025,71,StealerLogsJan2025,False,71039833.0,True,False,False,False,False,False,False,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,alicia.kirchner@senado.gob.ar,ARG,Argentina,2025,71,StealerLogsJan2025,False,71039833.0,True,False,False,False,False,False,False,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,anabel.fernandezsagasti@senado.gob.ar,ARG,Argentina,2025,71,StealerLogsJan2025,False,71039833.0,True,False,False,False,False,False,False,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,andrea.cristina@senado.gob.ar,ARG,Argentina,2025,71,StealerLogsJan2025,False,71039833.0,True,False,False,False,False,False,False,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
df_scraped_email_breach_expanded.columns.tolist()

['email',
 'cc3',
 'country',
 'leg_start_year',
 'nemail_cc3',
 'breach',
 'present',
 'pwncount',
 'isverified',
 'isfabricated',
 'issensitive',
 'isretired',
 'isspamlist',
 'ismalware',
 'issubscriptionfree',
 'yearstopublic',
 'n_dataclasses',
 'Website activity',
 'Sexual orientations',
 'Eating habits',
 'Ethnicities',
 'Passport numbers',
 'Buying preferences',
 'Physical attributes',
 'Living costs',
 'Apps installed on devices',
 'Browser user agent details',
 'Delivery instructions',
 'Nationalities',
 'Social media profiles',
 'Appointments',
 'Deceased date',
 'Places of birth',
 'Education levels',
 'School grades (class levels)',
 'Partial credit card data',
 'Career levels',
 'Purchases',
 'Age groups',
 'Payment methods',
 'Cryptocurrency wallet addresses',
 'Auth tokens',
 'Spouses names',
 'Avatars',
 'Password hints',
 'Salutations',
 "Family members' names",
 'Purchasing habits',
 'Survey results',
 'Ages',
 'Device usage tracking data',
 'Reward program balances'

### Manually check

In [40]:
# non_breached_emails = []
# for pol in df["email"].unique():
#     if pol not in df_ep_email_breach_expanded["email"].unique().tolist():
#         non_breached_emails.append(pol)

In [41]:
# import random
# random.seed(42)

# # Sample 10 unique emails
# sampled_emails = random.sample(non_breached_emails, 20)
# sampled_emails

Paste (no breach) 

* g.akriotis@parliament.gr

### Combine

In [42]:
df_email_breach_expanded = (
    pd.concat(
        [
            df_ep_email_breach_expanded.remove_columns(
                ["chamber", "legislature"]
            ).assign(source="ep"),
            df_scraped_email_breach_expanded.assign(source="scraped"),
        ],
        ignore_index=True,
    )
    .dropna(subset=["breach"])
    # Dedup by email-breach
    .sort_values(["source", "email"])
    .drop_duplicates(subset=["email", "breach"], keep="first")
    .assign(present=lambda df_: df_["present"].astype(bool))
    # ========================================================================
    .assign(
        seriousbreach=lambda df_: df_[LIST_SERIOUS_DATACLASSES].max(axis=1).astype(int)
    )
    # ========================================================================
    .pipe(lambda df_: classify_comm_gov_email(df_))
)
df_email_breach_expanded

Unnamed: 0,email,gender,cc3,country,ltype,leg_start_year,nemail_cc3,breach,present,pwncount,isverified,isfabricated,issensitive,isretired,isspamlist,ismalware,issubscriptionfree,yearstopublic,n_dataclasses,Website activity,Sexual orientations,Eating habits,Ethnicities,Passport numbers,Buying preferences,Physical attributes,Living costs,Apps installed on devices,Browser user agent details,Delivery instructions,Nationalities,Social media profiles,Appointments,Deceased date,Places of birth,Education levels,School grades (class levels),Partial credit card data,Career levels,Purchases,Age groups,Payment methods,Cryptocurrency wallet addresses,Auth tokens,Spouses names,Avatars,Password hints,Salutations,Family members' names,Purchasing habits,Survey results,Ages,Device usage tracking data,Reward program balances,Time zones,Bank account numbers,Financial transactions,Fitness levels,Net worths,Company names,Payment histories,Partial phone numbers,Device serial numbers,Spoken languages,Drug habits,Recovery email addresses,Sexual fetishes,Government issued IDs,Vehicle identification numbers (VINs),Bios,Clothing sizes,Dates of birth,IMEI numbers,Device information,Historical passwords,...,Mnemonic phrases,Photos,HIV statuses,Private messages,Tattoo status,Social connections,Home ownership statuses,Homepage URLs,Charitable donations,Car ownership statuses,Partial dates of birth,Job titles,Login histories,Instant messenger identities,Employment statuses,Credit card CVV,Parenting plans,Employers,Smoking habits,Browsing histories,Loyalty program details,Security questions and answers,Credit cards,User statuses,Religions,Geographic locations,Telecommunications carrier,Account balances,Beauty ratings,Utility bills,Genders,Support tickets,Chat logs,Income levels,Biometric data,Taxation records,Occupations,Vehicle details,Professional skills,Address book contacts,Usernames,User website URLs,SMS messages,Customer interactions,Licence plates,Nicknames,Mothers maiden names,Job applications,Encrypted keys,Audio recordings,Warranty claims,Travel plans,Personal descriptions,IP addresses,Personal interests,Cellular network names,Work habits,Personal health data,Years of professional experience,Email addresses,Passwords,PINs,Travel habits,Email messages,MAC addresses,Comments,Physical addresses,Political donations,Driver's licenses,Family structure,Marital statuses,Credit status information,source,seriousbreach,ecategory
1104198,016kimkj@gmail.com,male,KOR,South-Korea,unicameral legislature,2016,251,000webhost,False,14936670.0,True,False,False,False,False,False,False,0.654795,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ep,1,Commercial
1104199,016kimkj@gmail.com,male,KOR,South-Korea,unicameral legislature,2016,251,123RF,False,8661578.0,True,False,False,False,False,False,False,0.652055,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,ep,1,Commercial
1104200,016kimkj@gmail.com,male,KOR,South-Korea,unicameral legislature,2016,251,2844Breaches,False,80115532.0,False,False,False,False,False,False,False,0.019178,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ep,1,Commercial
1104201,016kimkj@gmail.com,male,KOR,South-Korea,unicameral legislature,2016,251,500px,False,14867999.0,True,False,False,False,False,False,False,0.720548,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ep,1,Commercial
1104202,016kimkj@gmail.com,male,KOR,South-Korea,unicameral legislature,2016,251,8fit,False,15025407.0,True,False,False,False,False,False,False,0.720548,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ep,1,Commercial
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1861594,zoerapti@gmail.com,,GRC,Greece,,2025,368,Twitter200M,False,211524284.0,True,False,False,False,False,False,False,2.010959,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,scraped,0,Commercial
1861595,zoerapti@gmail.com,,GRC,Greece,,2025,368,VerificationsIO,False,763117241.0,True,False,False,False,False,False,False,0.032877,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,scraped,0,Commercial
1861596,zoerapti@gmail.com,,GRC,Greece,,2025,368,YouveBeenScraped,False,66147869.0,True,False,False,False,False,False,False,0.169863,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,scraped,0,Commercial
1861597,zoerapti@gmail.com,,GRC,Greece,,2025,368,db8151dd,False,22802117.0,True,False,False,False,False,False,False,0.232877,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,scraped,0,Commercial


In [43]:
# email-level breach num data w/ cov
df_email_lvl_cov = (
    df_email_breach_expanded
    .rename_column("present", "nbreach")
    .assign(nbreach_serious=lambda df_: df_["nbreach"] * df_["seriousbreach"])
    .groupby(["email", "source", "leg_start_year", "country", "cc3", "ecategory"])
#     .groupby(["email", "source"])
    [["nbreach", "nbreach_serious"]]
#     [["present",]]
    .sum()
    .reset_index()
    .sort_values(["source", "cc3", "leg_start_year", "email",], ignore_index=True)
    .merge(
        (
            pd.read_csv(
                "../data/everypol/everypol_combined_legislature_data.csv", low_memory=False
            )
            .dropna(subset=["email"])
            .filter(["email", "twitter", "facebook", "group_id", "gender", "ltype", "leg_start_year", "leg_start_date", "person_count_legistype"])
            .sort_values(["email", "leg_start_year"])
            .remove_columns("leg_start_year")
            .pipe(lambda df_: clean_dedupe_email_column(df_))
        ), how="left", on="email", validate="m:1"
    )
)
print(len(df_email_lvl_cov.query("nbreach>0")))
print(len(df_email_lvl_cov.query("nbreach_serious>0")))
assert (df_email_lvl_cov["nbreach"]>=df_email_lvl_cov["nbreach_serious"]).all()
df_email_lvl_cov.to_csv("../data/email_lvl_cov.csv", index=False)
df_email_lvl_cov

4091
2671


Unnamed: 0,email,source,leg_start_year,country,cc3,ecategory,nbreach,nbreach_serious,twitter,facebook,group_id,gender,ltype,leg_start_date,person_count_legistype
0,albana.vokshi@parlament.al,ep,2009,Albania,ALB,Official,0,0,,,c26946f7-8c5e-4474-a85e-ee5342cf4006,female,unicameral legislature,2009-09-08,292.0
1,aldo.bumci@parlament.al,ep,2009,Albania,ALB,Official,0,0,,,c26946f7-8c5e-4474-a85e-ee5342cf4006,male,unicameral legislature,2009-09-08,292.0
2,bashkim.fino@parlament.al,ep,2009,Albania,ALB,Official,0,0,,,0c83d4f1-9e86-48f2-95ee-9c3d158d1141,male,unicameral legislature,2009-09-08,292.0
3,besnik.baraj@parlament.al,ep,2009,Albania,ALB,Official,0,0,,,0c83d4f1-9e86-48f2-95ee-9c3d158d1141,male,unicameral legislature,2009-09-08,292.0
4,blendi.klosi@parlament.al,ep,2009,Albania,ALB,Official,0,0,,,0c83d4f1-9e86-48f2-95ee-9c3d158d1141,male,unicameral legislature,2009-09-08,292.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12910,yew.tee.mps@pap.org.sg,scraped,2021,Singapore,SGP,Official,0,0,,,,,,,
12911,yiphonweng@yck.sg,scraped,2021,Singapore,SGP,Commercial,0,0,,,,,,,
12912,zaqy_mohamad@mindef.gov.sg,scraped,2021,Singapore,SGP,Official,1,0,,,,,,,
12913,zaqy_mohamad@mom.gov.sg,scraped,2021,Singapore,SGP,Official,0,0,,,,,,,


In [44]:
(
    df_email_lvl_cov
    .sort_values("nbreach_serious", ascending=False, ignore_index=True)
    .head(30)
)

Unnamed: 0,email,source,leg_start_year,country,cc3,ecategory,nbreach,nbreach_serious,twitter,facebook,group_id,gender,ltype,leg_start_date,person_count_legistype
0,na@gmail.com,scraped,2025,India,IND,Commercial,256,189,,,,,,,
1,mun@yahoo.com,ep,2015,Nigeria,NGA,Commercial,50,37,,,APC,male,lower house,2015-06-09,370.0
2,rmail@gmail.com,ep,2013,Kenya,KEN,Commercial,38,23,,,8abae1f3-e11a-4635-a1cf-06c222bcc2a6,female,lower house,2013-03-28,355.0
3,dutt66@gmail.com,scraped,2025,India,IND,Commercial,24,17,,,,,,,
4,chrisdermen@hotmail.com,scraped,2025,Greece,GRC,Commercial,19,16,,,,,,,
5,antonio.misiani@gmail.com,ep,2018,Italy,ITA,Commercial,20,15,antoniomisiani,antonio.misiani,59b8ea06-546e-4044-8f83-587bb25ed753,male,upper house,2018-03-23,566.0
6,s.kanth79@gmail.com,scraped,2025,India,IND,Commercial,20,13,,,,,,,
7,divyaspandana@gmail.com,scraped,2025,India,IND,Commercial,21,13,,,,,,,
8,drsigas@yahoo.co.in,scraped,2025,India,IND,Commercial,17,12,,,,,,,
9,anchend@absamail.co.za,ep,2014,South-Africa,ZAF,Commercial,16,12,,,Q761877,female,lower house,2014-05-21,499.0


In [45]:
# (
#     df_email_breach_expanded
#     .query("ecategory=='Official'")
#     .drop_duplicates(["email"])
#     .groupby("domain")
#     .size()
#     .reset_index(name="count")
#     .sort_values("count", ascending=False, ignore_index=True)
#     .head(100)
# )

In [46]:
# (
#     df_email_breach_expanded
#     .query("ecategory=='Commercial'")
#     .drop_duplicates(["email"])
#     .groupby("domain")
#     .size()
#     .reset_index(name="count")
#     .sort_values("count", ascending=False, ignore_index=True)
#     .head(50)
# )

In [47]:
# Not found in hibp payloads
# sampleemails = df_email_breach_expanded["email"].unique().tolist()
# delinquents = []
# for email in df_pol_emails["email"].unique().tolist():
#     if email not in sampleemails:
#         delinquents.append(email)
# delinquents        

### Basic numbers

In [48]:
n_emails = df_email_breach_expanded["email"].nunique()
n_emails

12384

In [49]:
# pols involved in 562 data breaches
df_email_breach_expanded["breach"].nunique()

562

In [50]:
# 3750 pols have been breached
n_pwned_pols = df_email_breach_expanded.query("present==True")["email"].nunique()
n_pwned_pols

4091

In [51]:
100 * n_pwned_pols / df_email_breach_expanded["email"].nunique()

33.03456072351421

In [52]:
# 2620 pols have been breached multiple times
n_multi_pwned_pols = (
    df_email_breach_expanded.groupby(["email"])["present"]
    .sum()
    .reset_index()
    .query("present>1")["email"]
    .nunique()
)
n_multi_pwned_pols

2418

In [53]:
# Proportion of multiple breaches
100 * (n_multi_pwned_pols / n_pwned_pols)

59.10535321437301

In [54]:
# Proportion of total
100 * n_multi_pwned_pols / df_email_breach_expanded["email"].nunique()

19.525193798449614

In [55]:
# How many email-pw pairs
_n = (
    df_email_breach_expanded.query("present==True")
    .clean_names()
    .query("email_addresses==1")
    .query("passwords==1")["email"]
    .nunique()
)

In [56]:
100 * _n / n_pwned_pols

63.43192373502811

In [57]:
# How many had serious breaches
n_pwned_pols_sbreach = (
    df_email_breach_expanded.query("present==True")
    .query("seriousbreach==1")["email"]
    .nunique()
)
n_pwned_pols_sbreach

2671

In [58]:
# How many had multiple serious breaches
n_multi_pwned_pols_sbreach = (
    df_email_breach_expanded.query("present==True")
    .query("seriousbreach==1")
    .groupby(["email"])["seriousbreach"]
    .sum()
    .reset_index()
    .query("seriousbreach>1")["email"]
    .nunique()
)
n_multi_pwned_pols_sbreach

1318

### Basic numbers - Personal/Comm

In [59]:
n_comm_emails = df_email_breach_expanded.query("ecategory=='Commercial'")[
    "email"
].nunique()
n_comm_emails

3013

In [60]:
# pols involved in 562 data breaches
df_email_breach_expanded.query("ecategory=='Commercial'")["breach"].nunique()

562

In [61]:
# 1301 pols have been breached
n_pwned_pols_comm = (
    df_email_breach_expanded.query("ecategory=='Commercial'")
    .query("present==True")["email"]
    .nunique()
)
n_pwned_pols_comm

1426

In [62]:
100 * n_pwned_pols_comm / df_email_breach_expanded.query("ecategory=='Commercial'")[
    "email"
].nunique()

47.32824427480916

In [63]:
# 867 pols have been breached multiple times
n_multi_pwned_pols_comm = (
    df_email_breach_expanded.query("ecategory=='Commercial'")
    .groupby(["email"])["present"]
    .sum()
    .reset_index()
    .query("present>1")["email"]
    .nunique()
)
n_multi_pwned_pols_comm

913

In [64]:
# Proportion of multiple breaches
100 * (n_multi_pwned_pols_comm / n_pwned_pols_comm)

64.02524544179524

In [65]:
# How many email-pw pairs
_n = (
    df_email_breach_expanded.query("ecategory=='Commercial'")
    .query("present==True")
    .clean_names()
    .query("email_addresses==1")
    .query("passwords==1")["email"]
    .nunique()
)

In [66]:
# How many had serious breaches
n_pwned_pols_sbreach_comm = (
    df_email_breach_expanded.query("ecategory=='Commercial'")
    .query("present==True")
    .query("seriousbreach==1")["email"]
    .nunique()
)
n_pwned_pols_sbreach_comm

1061

In [67]:
# How many had multiple serious breaches
n_multi_pwned_pols_sbreach_comm = (
    df_email_breach_expanded.query("ecategory=='Commercial'")
    .query("present==True")
    .query("seriousbreach==1")
    .groupby("email")["seriousbreach"]
    .sum()
    .reset_index()
    .query("seriousbreach>1")["email"]
    .nunique()
)
n_multi_pwned_pols_sbreach_comm

603

In [68]:
# How many had multiple serious breaches
n_multi_pwned_pols_sbreach = (
    df_email_breach_expanded.query("present==True")
    .query("seriousbreach==1")
    .groupby("email")["seriousbreach"]
    .sum()
    .reset_index()
    .query("seriousbreach>1")["email"]
    .nunique()
)
n_multi_pwned_pols_sbreach

1318

### Basic numbers - Official

In [69]:
n_govt_emails = df_email_breach_expanded.query("ecategory=='Official'")[
    "email"
].nunique()
n_govt_emails

9371

In [70]:
# pols involved in 563 data breaches
df_email_breach_expanded.query("ecategory=='Official'")["breach"].nunique()

561

In [71]:
# 2791 pols have been breached
n_pwned_pols_govt = (
    df_email_breach_expanded.query("ecategory=='Official'")
    .query("present==True")["email"]
    .nunique()
)
n_pwned_pols_govt

2665

In [72]:
100 * n_pwned_pols_govt / n_govt_emails

28.438800554903427

In [73]:
# 867 pols have been breached multiple times
n_multi_pwned_pols_govt = (
    df_email_breach_expanded.query("ecategory=='Official'")
    .groupby(["email"])["present"]
    .sum()
    .reset_index()
    .query("present>1")["email"]
    .nunique()
)
n_multi_pwned_pols_comm

913

In [74]:
# Proportion of multiple breaches
100 * (n_multi_pwned_pols_govt / n_pwned_pols_govt)

56.47279549718574

In [75]:
# How many email-pw pairs
_n = (
    df_email_breach_expanded.query("ecategory=='Official'")
    .query("present==True")
    .clean_names()
    .query("email_addresses==1")
    .query("passwords==1")["email"]
    .nunique()
)

In [76]:
100 * _n / n_pwned_pols_govt

58.42401500938087

In [77]:
# How many had serious breaches
n_pwned_pols_sbreach_govt = (
    df_email_breach_expanded.query("ecategory=='Official'")
    .query("present==True")
    .query("seriousbreach==1")["email"]
    .nunique()
)
n_pwned_pols_sbreach_govt

1610

In [78]:
# How many had multiple serious breaches
n_multi_pwned_pols_sbreach_govt = (
    df_email_breach_expanded.query("ecategory=='Official'")
    .query("present==True")
    .query("seriousbreach==1")
    .groupby(["email"])["seriousbreach"]
    .sum()
    .reset_index()
    .query("seriousbreach>1")["email"]
    .nunique()
)
n_multi_pwned_pols_sbreach_govt

715

## Tabulate

### Breach summary

In [79]:
_all = (
    df_email_breach_expanded.groupby("email")["present"]
    .sum()
    .pipe(lambda s: s.describe().to_frame().T)
    .assign(perc_at_least_1=100 * n_pwned_pols / n_emails)
    .assign(perc_at_least_2=100 * n_multi_pwned_pols / n_emails)
)
_all

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,perc_at_least_1,perc_at_least_2
present,12384.0,1.010174,3.165044,0.0,0.0,0.0,1.0,256.0,33.034561,19.525194


In [80]:
_govt = (
    df_email_breach_expanded.query("ecategory=='Official'")
    .groupby("email")["present"]
    .sum()
    .pipe(lambda s: s.describe().to_frame().T)
    .astype(int)
    .assign(perc_at_least_1=100 * n_pwned_pols_govt / n_govt_emails)
    .assign(perc_at_least_2=100 * n_multi_pwned_pols_govt / n_govt_emails)
)
_govt

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,perc_at_least_1,perc_at_least_2
present,9371,0,1,0,0,0,1,18,28.438801,16.060186


In [81]:
_comm = (
    df_email_breach_expanded.query("ecategory=='Commercial'")
    .groupby("email")["present"]
    .sum()
    .pipe(lambda s: s.describe().to_frame().T)
    .astype(int)
    .assign(perc_at_least_1=100 * n_pwned_pols_comm / n_comm_emails)
    .assign(perc_at_least_2=100 * n_multi_pwned_pols_comm / n_comm_emails)
)
_comm

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,perc_at_least_1,perc_at_least_2
present,3013,1,5,0,0,0,2,256,47.328244,30.302025


In [82]:
_tab = (
    pd.concat([_all, _govt, _comm], ignore_index=True)
    .assign(
        **{
            col: lambda df_, col=col: df_[col].round(0).astype(int)
            for col in ["min", "25%", "50%", "75%", "max"]
        }
    )
    .round(1)
    .assign(perc_at_least_1=lambda df_: df_["perc_at_least_1"].astype(str) + "\\%")
    .assign(perc_at_least_2=lambda df_: df_["perc_at_least_2"].astype(str) + "\\%")
    .assign(
        count=lambda df_: df_["count"].round(0).astype(int).apply(lambda x: f"{x:,}")
    )
    .astype(str)
    .assign(cat=["All emails", "Government emails", "Personal emails"])
    .reorder_columns(["cat"])
)
pandas_to_tex(_tab, "../tables/pooled_pols_breach_number_summary", index=False)
display(_tab)
!cat ../tables/pooled_pols_breach_number_summary.tex

Unnamed: 0,cat,count,mean,std,min,25%,50%,75%,max,perc_at_least_1,perc_at_least_2
0,All emails,12384,1.0,3.2,0,0,0,1,256,33.0\%,19.5\%
1,Government emails,9371,0.0,1.0,0,0,0,1,18,28.4\%,16.1\%
2,Personal emails,3013,1.0,5.0,0,0,0,2,256,47.3\%,30.3\%


\midrule
All emails & 12,384 & 1.0 & 3.2 & 0 & 0 & 0 & 1 & 256 & 33.0\% & 19.5\% \\
Government emails & 9,371 & 0.0 & 1.0 & 0 & 0 & 0 & 1 & 18 & 28.4\% & 16.1\% \\
Personal emails & 3,013 & 1.0 & 5.0 & 0 & 0 & 0 & 2 & 256 & 47.3\% & 30.3\% \\

In [83]:
_all = (
    df_email_breach_expanded.query("seriousbreach==1")
    .groupby("email")["present"]
    .sum()
    .pipe(lambda s: s.describe().to_frame().T)
    .assign(perc_at_least_1=100 * n_pwned_pols_sbreach / n_emails)
    .assign(perc_at_least_2=100 * n_multi_pwned_pols_sbreach / n_emails)
)
_all

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,perc_at_least_1,perc_at_least_2
present,12384.0,0.482962,2.119967,0.0,0.0,0.0,0.0,189.0,21.568152,10.642765


In [84]:
_govt = (
    df_email_breach_expanded.query("ecategory=='Official'")
    .query("seriousbreach==1")
    .groupby("email")["present"]
    .sum()
    .pipe(lambda s: s.describe().to_frame().T)
    .astype(int)
    .assign(perc_at_least_1=100 * n_pwned_pols_sbreach_govt / n_govt_emails)
    .assign(perc_at_least_2=100 * n_multi_pwned_pols_sbreach_govt / n_govt_emails)
)
_govt

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,perc_at_least_1,perc_at_least_2
present,9371,0,0,0,0,0,0,9,17.180664,7.629922


In [85]:
_comm = (
    df_email_breach_expanded.query("ecategory=='Commercial'")
    .query("seriousbreach==1")
    .groupby("email")["present"]
    .sum()
    .pipe(lambda s: s.describe().to_frame().T)
    .astype(int)
    .assign(perc_at_least_1=100 * n_pwned_pols_sbreach_comm / n_comm_emails)
    .assign(perc_at_least_2=100 * n_multi_pwned_pols_sbreach_comm / n_comm_emails)
)
_comm

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,perc_at_least_1,perc_at_least_2
present,3013,1,3,0,0,0,1,189,35.214072,20.013276


In [86]:
_tab = (
    pd.concat([_all, _govt, _comm], ignore_index=True)
    .assign(
        **{
            col: lambda df_, col=col: df_[col].round(0).astype(int)
            for col in ["min", "25%", "50%", "75%", "max"]
        }
    )
    .round(1)
    .assign(perc_at_least_1=lambda df_: df_["perc_at_least_1"].astype(str) + "\\%")
    .assign(
        count=lambda df_: df_["count"].round(0).astype(int).apply(lambda x: f"{x:,}")
    )
    .astype(str)
    .assign(cat=["All emails", "Government emails", "Personal emails"])
    .reorder_columns(["cat"])
)
pandas_to_tex(_tab, "../tables/pooled_pols_seriousbreach_number_summary", index=False)
display(_tab)
!cat ../tables/pooled_pols_seriousbreach_number_summary.tex

Unnamed: 0,cat,count,mean,std,min,25%,50%,75%,max,perc_at_least_1,perc_at_least_2
0,All emails,12384,0.5,2.1,0,0,0,0,189,21.6\%,10.6
1,Government emails,9371,0.0,0.0,0,0,0,0,9,17.2\%,7.6
2,Personal emails,3013,1.0,3.0,0,0,0,1,189,35.2\%,20.0


\midrule
All emails & 12,384 & 0.5 & 2.1 & 0 & 0 & 0 & 0 & 189 & 21.6\% & 10.6 \\
Government emails & 9,371 & 0.0 & 0.0 & 0 & 0 & 0 & 0 & 9 & 17.2\% & 7.6 \\
Personal emails & 3,013 & 1.0 & 3.0 & 0 & 0 & 0 & 1 & 189 & 35.2\% & 20.0 \\

### Tabulate: Compromised data types

In [87]:
df_pwnpol_datatype = (
    df_email_breach_expanded.query("present==True")
    # ===================================================
    # Filter data types and remove duplicate breach info
    .set_index("email")
    .iloc[:, 18:-3]
    .reset_index()
    .drop_duplicates(ignore_index=True)
    # ===================================================
    # Long by email-datatype
    .melt(id_vars=["email"], var_name="datatype", value_name="present")
    .groupby(["email", "datatype"])["present"]
    .sum()
    .reset_index()
    .assign(present=lambda df_: np.where(df_["present"] > 0, 1, 0))
    # ===================================================
    # Back to wide by email
    .pivot(index="email", columns="datatype", values="present")
    # ===================================================
    .T.sum(axis=1)
    .reset_index(name="count")
    .sort_values(["count", "datatype"], ascending=[False, True], ignore_index=True)
    .assign(percent=lambda df_: 100 * df_["count"] / n_pwned_pols)
    .assign(percent=lambda df_: df_["percent"].apply(lambda x: f"{round(x, 1)}\\%"))
    # ===================================================
    .assign(
        seriousbreach=lambda df_: df_["datatype"]
        .isin(LIST_SERIOUS_DATACLASSES)
        .map({True: r"\checkmark", False: ""})
    )
)
df_pwnpol_datatype

Unnamed: 0,datatype,count,percent,seriousbreach
0,Email addresses,4090,100.0\%,
1,Names,3479,85.0\%,
2,Phone numbers,3200,78.2\%,
3,Job titles,2841,69.4\%,
4,Physical addresses,2701,66.0\%,
5,Social media profiles,2644,64.6\%,
6,Passwords,2596,63.5\%,\checkmark
7,Geographic locations,2090,51.1\%,
8,Employers,1868,45.7\%,
9,Genders,1714,41.9\%,


In [88]:
df_datatype1 = (
    df_pwnpol_datatype.iloc[:20]
    .reset_index(drop=True)
    .assign(ix=range(1, 21))
    .astype(str)
    .reorder_columns(["ix"])
)
df_datatype2 = (
    df_pwnpol_datatype.iloc[20:40]
    .reset_index(drop=True)
    .assign(ix=range(21, 41))
    .astype(str)
    .reorder_columns(["ix"])
)
df_datatype3 = (
    df_pwnpol_datatype.iloc[40:60]
    .reset_index(drop=True)
    .assign(ix=range(41, 61))
    .astype(str)
    .reorder_columns(["ix"])
)
df2tex = pd.concat([df_datatype1, df_datatype2, df_datatype3], axis=1)
# print(df2tex.to_latex(na_rep="", index=False))
df2tex

Unnamed: 0,ix,datatype,count,percent,seriousbreach,ix.1,datatype.1,count.1,percent.1,seriousbreach.1,ix.2,datatype.2,count.2,percent.2,seriousbreach.2
0,1,Email addresses,4090,100.0\%,,21,Marital statuses,99,2.4\%,,41,Payment histories,38,0.9\%,
1,2,Names,3479,85.0\%,,22,Religions,99,2.4\%,,42,Survey results,34,0.8\%,
2,3,Phone numbers,3200,78.2\%,,23,Email messages,96,2.3\%,\checkmark,43,User website URLs,31,0.8\%,
3,4,Job titles,2841,69.4\%,,24,Password hints,81,2.0\%,\checkmark,44,Telecommunications carrier,30,0.7\%,
4,5,Physical addresses,2701,66.0\%,,25,Ethnicities,77,1.9\%,,45,Nationalities,27,0.7\%,\checkmark
5,6,Social media profiles,2644,64.6\%,,26,Home ownership statuses,77,1.9\%,,46,Private messages,27,0.7\%,\checkmark
6,7,Passwords,2596,63.5\%,\checkmark,27,Auth tokens,75,1.8\%,\checkmark,47,Relationship statuses,21,0.5\%,
7,8,Geographic locations,2090,51.1\%,,28,Occupations,75,1.8\%,,48,Company names,20,0.5\%,
8,9,Employers,1868,45.7\%,,29,PINs,69,1.7\%,\checkmark,49,Deceased statuses,18,0.4\%,
9,10,Genders,1714,41.9\%,,30,Partial credit card data,67,1.6\%,\checkmark,50,Website activity,16,0.4\%,


In [89]:
pandas_to_tex(
    df2tex, "../tables/hibp_pwnpols_datatypes.tex", na_rep="", index=False, escape=False
)

!cat "../tables/hibp_pwnpols_datatypes.tex"

\midrule
1 & Email addresses & 4090 & 100.0\% &  & 21 & Marital statuses & 99 & 2.4\% &  & 41 & Payment histories & 38 & 0.9\% &  \\
2 & Names & 3479 & 85.0\% &  & 22 & Religions & 99 & 2.4\% &  & 42 & Survey results & 34 & 0.8\% &  \\
3 & Phone numbers & 3200 & 78.2\% &  & 23 & Email messages & 96 & 2.3\% & \checkmark & 43 & User website URLs & 31 & 0.8\% &  \\
4 & Job titles & 2841 & 69.4\% &  & 24 & Password hints & 81 & 2.0\% & \checkmark & 44 & Telecommunications carrier & 30 & 0.7\% &  \\
5 & Physical addresses & 2701 & 66.0\% &  & 25 & Ethnicities & 77 & 1.9\% &  & 45 & Nationalities & 27 & 0.7\% & \checkmark \\
6 & Social media profiles & 2644 & 64.6\% &  & 26 & Home ownership statuses & 77 & 1.9\% &  & 46 & Private messages & 27 & 0.7\% & \checkmark \\
7 & Passwords & 2596 & 63.5\% & \checkmark & 27 & Auth tokens & 75 & 1.8\% & \checkmark & 47 & Relationship statuses & 21 & 0.5\% &  \\
8 & Geographic locations & 2090 & 51.1\% &  & 28 & Occupations & 75 & 1.8\% &  & 48 

### Tabulate: Breaches

In [90]:
df_pwnpol_breach_incident = (
    df_email_breach_expanded.query("present==True")
    # ===================================================
    .groupby(["breach"])["email"]
    .size()
    .reset_index()
    .rename_column("email", "emails")
    .sort_values("emails", ascending=False, ignore_index=True)
    .head(50)
    .assign(percent=lambda df_: 100 * df_["emails"] / n_pwned_pols)
    .assign(percent=lambda df_: df_["percent"].apply(lambda x: f"{round(x, 1)}\%"))
    # ===================================================
    # Merge back to get breach characteristics
    .merge(
        (
            pd.read_parquet("../data/breaches_01_2025_expanded.parquet")
            .remove_columns(
                [
                    "description",
                    "title",
                    "modifieddate",
                    "logopath",
                    "timetopublic",
                    "dataclasses",
                ]
            )
            .rename_column("name", "breach")
            .assign(
                seriousbreach=lambda df_: df_[LIST_SERIOUS_DATACLASSES]
                .max(axis=1)
                .astype(int)
            )
            .assign(
                seriousbreach=lambda df_: np.where(
                    df_["seriousbreach"] == 1, r"\checkmark", ""
                )
            )
            #             .filter(["breach", "breachdate", "addeddate", "yearstopublic", "n_dataclasses", "seriousbreach", *LIST_ALL_DATACLASSES])
            .filter(
                [
                    "breach",
                    "domain",
                    "breachdate",
                    "addeddate",
                    "yearstopublic",
                    "pwncount",
                    "n_dataclasses",
                    "seriousbreach",
                ]
            )
        ),
        how="left",
        on="breach",
        validate="1:1",
    )
    # ===================================================
    # Remove timestamps
    .assign(
        breachdate=lambda df_: pd.to_datetime(df_["breachdate"]).dt.date,
        addeddate=lambda df_: pd.to_datetime(df_["addeddate"]).dt.date,
    )
    .assign(pwncount=lambda df_: df_["pwncount"] / 1_000_000)
    .round(1)
    .astype(str)
    .assign(pwncount=lambda df_: df_["pwncount"] + "M")
    .assign(yearstopublic=lambda df_: df_["yearstopublic"] + " years")
    # ===================================================
    .assign(ix=lambda df_: range(1, 1 + len(df_)))
    .reorder_columns(["ix"])
    .replace("None", "---")
)
df_pwnpol_breach_incident

Unnamed: 0,ix,breach,emails,percent,domain,breachdate,addeddate,yearstopublic,pwncount,n_dataclasses,seriousbreach
0,1,db8151dd,1440,35.2\%,covve.com,2020-02-20,2020-05-15,0.2 years,22.8M,6,
1,2,OnlinerSpambot,1230,30.1\%,---,2017-08-28,2017-08-29,0.0 years,711.5M,2,\checkmark
2,3,PDL,1128,27.6\%,---,2019-10-16,2019-11-22,0.1 years,622.2M,7,
3,4,VerificationsIO,1037,25.3\%,verifications.io,2019-02-25,2019-03-09,0.0 years,763.1M,10,
4,5,LinkedIn,489,12.0\%,linkedin.com,2012-05-05,2016-05-21,4.0 years,164.6M,2,\checkmark
5,6,LinkedInScrape,385,9.4\%,linkedin.com,2021-04-08,2021-10-02,0.5 years,125.7M,7,
6,7,Apollo,340,8.3\%,apollo.io,2018-07-23,2018-10-05,0.2 years,125.9M,8,
7,8,Intelimost,294,7.2\%,intelimost.com,2019-03-10,2019-04-02,0.1 years,3.1M,2,\checkmark
8,9,Cit0day,271,6.6\%,cit0day.in,2020-11-04,2020-11-19,0.0 years,226.9M,2,\checkmark
9,10,Twitter200M,269,6.6\%,twitter.com,2021-01-01,2023-01-05,2.0 years,211.5M,4,


In [91]:
pandas_to_tex(
    df_pwnpol_breach_incident.head(25),
    "../tables/hibp_pwnpols_breach_incidents.tex",
    escape=False,
)

In [92]:
!cat ../tables/hibp_pwnpols_breach_incidents.tex

\midrule
1 & db8151dd & 1440 & 35.2\% & covve.com & 2020-02-20 & 2020-05-15 & 0.2 years & 22.8M & 6 &  \\
2 & OnlinerSpambot & 1230 & 30.1\% & --- & 2017-08-28 & 2017-08-29 & 0.0 years & 711.5M & 2 & \checkmark \\
3 & PDL & 1128 & 27.6\% & --- & 2019-10-16 & 2019-11-22 & 0.1 years & 622.2M & 7 &  \\
4 & VerificationsIO & 1037 & 25.3\% & verifications.io & 2019-02-25 & 2019-03-09 & 0.0 years & 763.1M & 10 &  \\
5 & LinkedIn & 489 & 12.0\% & linkedin.com & 2012-05-05 & 2016-05-21 & 4.0 years & 164.6M & 2 & \checkmark \\
6 & LinkedInScrape & 385 & 9.4\% & linkedin.com & 2021-04-08 & 2021-10-02 & 0.5 years & 125.7M & 7 &  \\
7 & Apollo & 340 & 8.3\% & apollo.io & 2018-07-23 & 2018-10-05 & 0.2 years & 125.9M & 8 &  \\
8 & Intelimost & 294 & 7.2\% & intelimost.com & 2019-03-10 & 2019-04-02 & 0.1 years & 3.1M & 2 & \checkmark \\
9 & Cit0day & 271 & 6.6\% & cit0day.in & 2020-11-04 & 2020-11-19 & 0.0 years & 226.9M & 2 & \checkmark \\
10 & Twitter200M & 269 & 6.6\% & twitter.com & 202

In [93]:
(
    df_email_lvl_cov
    .query("ecategory=='Official'")
    .sort_values("nbreach_serious", ascending=False, ignore_index=True)
    .head(50)
)

Unnamed: 0,email,source,leg_start_year,country,cc3,ecategory,nbreach,nbreach_serious,twitter,facebook,group_id,gender,ltype,leg_start_date,person_count_legistype
0,benny.engelbrecht@ft.dk,ep,2007,Denmark,DNK,Official,18,9,BennyEngelbrech,,07b1647d-307e-478b-8dbb-324e7cbacf96,male,unicameral legislature,2007-11-13,612.0
1,lucasi@parliament.uk,ep,2001,UK,GBR,Official,13,9,IanCLucas,ian4wrexham,labour,male,lower house,2001-06-07,1437.0
2,vaizeye@parliament.uk,ep,2005,UK,GBR,Official,15,9,edvaizey,vaizeymp,conservative,male,lower house,2005-05-05,1437.0
3,bottomleyp@parliament.uk,ep,1997,UK,GBR,Official,15,9,PBottomleyMP,Sir-Peter-Bottomley-MP-293136030810246,conservative,male,lower house,1997-05-01,1437.0
4,bryantc@parliament.uk,ep,2001,UK,GBR,Official,17,8,RhonddaBryant,Chris4Rhondda,labour,male,lower house,2001-06-07,1437.0
5,senator.carol.brown@aph.gov.au,ep,2013,Australia,AUS,Official,12,8,SenCarolBrown,,australian_labor_party,female,upper house,2013-09-07,272.0
6,julie.bishop.mp@aph.gov.au,ep,2004,Australia,AUS,Official,13,8,JulieBishopMP,julie.bishop.mp,60892610-6d79-42cd-b573-1b5e21c71599,female,lower house,2004-10-09,515.0
7,byrnel@parliament.uk,ep,2001,UK,GBR,Official,13,8,LiamByrneMP,LiamByrneHodgeHill,labour,male,lower house,2001-06-07,1437.0
8,woodsr@parliament.uk,ep,2005,UK,GBR,Official,10,8,robertabwMP,robertablackmanwoodsmp,labour,female,lower house,2005-05-05,1437.0
9,mccabes@parliament.uk,ep,1997,UK,GBR,Official,15,8,steve_mccabe,stevemccabemp,labour,male,lower house,1997-05-01,1437.0


In [94]:
(
    df_email_lvl_cov
#     .query("ecategory=='Commercial'")
    .sort_values("nbreach_serious", ascending=False, ignore_index=True)
    .head(50)
)

Unnamed: 0,email,source,leg_start_year,country,cc3,ecategory,nbreach,nbreach_serious,twitter,facebook,group_id,gender,ltype,leg_start_date,person_count_legistype
0,na@gmail.com,scraped,2025,India,IND,Commercial,256,189,,,,,,,
1,mun@yahoo.com,ep,2015,Nigeria,NGA,Commercial,50,37,,,APC,male,lower house,2015-06-09,370.0
2,rmail@gmail.com,ep,2013,Kenya,KEN,Commercial,38,23,,,8abae1f3-e11a-4635-a1cf-06c222bcc2a6,female,lower house,2013-03-28,355.0
3,dutt66@gmail.com,scraped,2025,India,IND,Commercial,24,17,,,,,,,
4,chrisdermen@hotmail.com,scraped,2025,Greece,GRC,Commercial,19,16,,,,,,,
5,antonio.misiani@gmail.com,ep,2018,Italy,ITA,Commercial,20,15,antoniomisiani,antonio.misiani,59b8ea06-546e-4044-8f83-587bb25ed753,male,upper house,2018-03-23,566.0
6,s.kanth79@gmail.com,scraped,2025,India,IND,Commercial,20,13,,,,,,,
7,divyaspandana@gmail.com,scraped,2025,India,IND,Commercial,21,13,,,,,,,
8,drsigas@yahoo.co.in,scraped,2025,India,IND,Commercial,17,12,,,,,,,
9,anchend@absamail.co.za,ep,2014,South-Africa,ZAF,Commercial,16,12,,,Q761877,female,lower house,2014-05-21,499.0
