In [1]:
import pandas as pd
import janitor
import numpy as np
from IPython.display import display

import sys

sys.path.append("/home/lsys/pwned_pols/venv/lib/python3.10/site-packages")

from utilities import (
    clean_dedupe_email_column,
    pandas_to_tex,
    clean_email_column_no_dedupe,
    classify_comm_gov_email,
)
from utilities import LIST_SERIOUS_DATACLASSES

pd.set_option("display.max_columns", 150)
pd.set_option("display.max_rows", 150)
pd.set_option("display.max_colwidth", None)

import warnings

warnings.filterwarnings("ignore")

## Prep EP data

In [2]:
df_ep_emails = (
    pd.read_csv(
        "../data/everypol/everypol_combined_legislature_data.csv", low_memory=False
    )
    .sort_values(["cc3", "leg_start_year", "email"])
    .pipe(lambda df_: clean_email_column_no_dedupe(df_))
    .drop_duplicates(subset=["email"], keep="first", ignore_index=True)
    # ================================================================
    # Fix missing cc3 for Wales/Scotland
    # Wales, Scotland = GBR
    .assign(
        cc3=lambda df_: np.where(
            df_["cc"] == "GB-SCT",
            "GBR",
            np.where(df_["cc"] == "GB-WLS", "GBR", df_["cc3"]),
        )
    )
    # ================================================================
    # Fix missing pop for Wales/Scotland
    .assign(
        pop2024=lambda df_: np.where(
            df_["cc"] == "GB-SCT",
            "GBR",
            np.where(df_["cc"] == "GB-WLS", 68556800, df_["pop2024"]),
        )
    )
    # ================================================================
    # Fix ltype for India to bicameral
    .assign(
        ltype=lambda df_: np.where(
            df_["cc3"] == "IND", "bicameral legislature", df_["ltype"]
        )
    )
    # ================================================================
    # Get #emails per country (cc3)
    .assign(nemail_cc3=lambda df_: df_.groupby("cc3")["email"].transform("nunique"))
    # ================================================================
    .reorder_columns(
        [
            "email",
            "cc3",
            "country",
            "ltype",
            "legislature",
            "chamber",
            "leg_start_year",
            "nemail_cc3",
            "gender",
        ]
    )
)

# assert (df_ep_emails["nemail_cc3"] >= 30).all()
display(df_ep_emails.head(3))
df_ep_emails.info()

Unnamed: 0,email,cc3,country,ltype,legislature,chamber,leg_start_year,nemail_cc3,gender,id,name,sort_name,twitter,facebook,group,group_id,area_id,area,term_x,start_date,end_date,image,wikidata,wikidata_group,wikidata_area,row_id,term_y,row_count,n_unique_emails,person_count_legistype,url,cc,leg_start_date,lastmod,pop2024,lastmod_year
0,albana.vokshi@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,female,c8bd71fc-4815-40b5-a5c4-bd359e3b0cef,Albana Vokshi,VOKSHI ALBANA,,,PD,c26946f7-8c5e-4474-a85e-ee5342cf4006,durrës_county,Durrës County,7,,,https://www.parlament.al/wp-content/uploads/2015/12/Albana-Vokshi-PD.jpg,Q4709025,Q845743,,490,VII Pluralist Legislature,140,31,292,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1557734928,2740502.0,2019
1,aldo.bumci@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,male,589ba883-b893-40ed-95f8-69b10f0497d9,Aldo Bumçi,BUMÇI ALDO,,,PD,c26946f7-8c5e-4474-a85e-ee5342cf4006,tirana_county,Tirana County,7,,,https://www.parlament.al/wp-content/uploads/2015/12/Aldo-Bumci-PD.jpg,Q2832310,Q845743,,490,VII Pluralist Legislature,140,31,292,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1557734928,2740502.0,2019
2,bashkim.fino@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,male,9d0181c7-7ebe-4b16-800a-6ea28baf22f5,Bashkim Fino,FINO BASHKIM,,,PS,0c83d4f1-9e86-48f2-95ee-9c3d158d1141,fier_county,Fier County,7,,,https://www.parlament.al/wp-content/uploads/2015/12/Bashkim-Fino-PS.jpg,Q809978,Q642882,,490,VII Pluralist Legislature,140,31,292,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1557734928,2740502.0,2019


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8511 entries, 0 to 8510
Data columns (total 36 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   email                   8511 non-null   object
 1   cc3                     8511 non-null   object
 2   country                 8511 non-null   object
 3   ltype                   8511 non-null   object
 4   legislature             8511 non-null   object
 5   chamber                 8511 non-null   object
 6   leg_start_year          8511 non-null   int64 
 7   nemail_cc3              8511 non-null   int64 
 8   gender                  7299 non-null   object
 9   id                      8511 non-null   object
 10  name                    8511 non-null   object
 11  sort_name               8511 non-null   object
 12  twitter                 2393 non-null   object
 13  facebook                1597 non-null   object
 14  group                   8509 non-null   object
 15  grou

In [3]:
# # countries with incomplete gender coverage
# for c in df_ep_emails["cc3"].unique().tolist():
#     _df = df_ep_emails.query(f"cc3=='{c}'")
#     n_gender = len(_df.dropna(subset=["gender"]))
#     n_country = len(_df)
#     if n_gender < n_country:
#         name = _df.reset_index().loc[0, "country"]
#         print(f"{name} ({c}):\n {n_gender}, {n_country}")

In [4]:
set_edomain = set()
for _, row in df_ep_emails.iterrows():
    try:
        edomain = row["email"].split("@")[1]
        set_edomain.add(edomain)
    except IndexError:
        print(_)
#         print(row["email"])

len(set_edomain)

479

In [5]:
# for cc in df_ep_emails["cc3"].unique().tolist():
#     _df = df_ep_emails.query(f"cc3=='{cc}'")
#     _set_edomain = set()
#     for _, row in _df.iterrows():
#         edomain = row["email"].split("@")[1]
#         _set_edomain.add(edomain)

#     print(cc)
#     print(len(_set_edomain))
#     print(_set_edomain)

### Basic numbers

In [6]:
df_ep_emails["nemail_cc3"].describe()

count    8511.000000
mean      284.645165
std       222.039880
min        31.000000
25%       140.000000
50%       208.000000
75%       405.000000
max       835.000000
Name: nemail_cc3, dtype: float64

In [7]:
df_ep_emails["ltype"].unique()

array(['unicameral legislature', 'lower house', 'upper house',
       'bicameral legislature'], dtype=object)

In [8]:
df_ep_emails["legislature"].unique()

array(['Kuvendi', 'Consell General', 'National Assembly',
       'House of Representatives', 'Senate', 'Chamber of Representatives',
       'Parliament', 'House of Commons', 'Assemblée Nationale',
       'Cámara de Representantes', 'Folketing', 'Riigikogu', 'Eduskunta',
       'Parliament of Georgia', 'States', 'Hellenic Parliament',
       'Inatsisartut', 'Congress', 'Legislative Council', 'Országgyűlés',
       'Lok Sabha', 'Majles', 'Chamber of Deputies', 'Parlament',
       'Sobranie', 'National Council', 'Tweede Kamer',
       'Constituent Assembly', 'New Zealand Parliament',
       'National Parliament', 'Assembly', 'House of Assembly',
       'Scottish Parliament', 'National Assembly for Wales'], dtype=object)

In [9]:
df_ep_emails["legislature"].nunique()

34

In [10]:
df_ep_emails["email"].nunique()

8511

In [11]:
df_ep_emails["cc3"].nunique()

55

In [12]:
df_ep_emails.groupby(["cc3", "legislature"]).ngroups

61

## Prep scraped data

In [13]:
df_scraped_emails = (
    pd.read_csv("../data/scraped_pol_combined_legislature_data.csv")
    .sort_values(["cc3", "email"])
    .pipe(lambda df_: clean_email_column_no_dedupe(df_))
    .drop_duplicates(subset=["email"], keep="first", ignore_index=True)
    # ================================================================
    # Get popsize
    .merge(
        (
            pd.read_csv("../data/popsize.csv")
            .dropna(subset=["cc3"])
            .rename_column("2024 [YR2024]", "pop2024")
        ),
        how="left",
        on="cc3",
        validate="m:1",
    )
    # ================================================================
    # Get #emails per country (cc3)
    .assign(nemail_cc3=lambda df_: df_.groupby("cc3")["email"].transform("nunique"))
    # ================================================================
    .reorder_columns(
        [
            "email",
            "cc3",
            "country",
            "ltype",
            "legislature",
            "chamber",
            #             "year",
            "nemail_cc3",
        ]
    )
    #     # ================================================================
    #     # Renaming fields to harmonise with EP
    .rename_column("leg_start_year", "leg_start_year_sg")
    .assign(
        leg_start_year=lambda df_: np.where(
            df_["cc3"] == "SGP", df_["leg_start_year_sg"], 2025
        ).astype(int)
    )
)
assert (df_scraped_emails["nemail_cc3"] >= 30).all()
df_scraped_emails

Unnamed: 0,email,cc3,country,ltype,legislature,chamber,nemail_cc3,name,party,title,address,private_phone,mobile_phone,work_phone,minister_phone,fax,mpsno,initial,firstname,lastname,gender,partyfname,partysname,statename,constname,profession,presentfaddr,presentladdr,delhiphone,permanentfaddr,permanentladdr,personalphone,lastloksabha,lsexpr,age,phone,noofterms,status,imageurl,profileurl,dob,numberofsons,numberofdaughters,qualification,freedom,profession2,categorycode,currentpagenumber,perpagesize,totalelements,totalpages,source_file,mplastfirstname,mpfirstlastname,maritalstatus,createdat,updatedat,email_fix,rank,constituency,leg_start_year_sg,bloque,apellido,nombre,provincia,partido_o_alianza_por_el_que_ingreso,designacion_legal,cese_legal,designacion_real,cese_real,telefono,facebook,twitter,instagram,youtube,nome_parlamentar,partido,uf,titularidade,mandato,telefones,dtnasc,chefe_gab,endereco,state,district,id,parliament_address,parliament_number,social_media,region_x,contact,sr_no,photo,serial_no_,tel_no_r_,constituency_name,permanent_address,mobile,tele_no_res_no_,ac_no,candidate,Country Name,Series Name,Series Code,pop2024,region_y,EU,politician,civil_servants,leg_start_year
0,alejandra.vigo@senado.gob.ar,ARG,Argentina,bicameral legislature,Parliament,,71,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,UNIDAD FEDERAL,VIGO,ALEJANDRA MARÍA,CÓRDOBA,HACEMOS POR CÓRDOBA,2021-12-10,2027-12-09,2021-12-10,Sin Datos,1441 / 1444 / 1456,https://www.facebook.com/VigoAlejandra,https://twitter.com/@alevigo,https://www.instagram.com/@vigo_alejandra,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Argentina,"Population, total",SP.POP.TOTL,46936024,,,,,2025
1,alfredo.deangeli@senado.gob.ar,ARG,Argentina,bicameral legislature,Parliament,,71,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,FRENTE PRO,DE ANGELI,ALFREDO LUIS,ENTRE RÍOS,JUNTOS POR EL CAMBIO,2019-12-10,2025-12-09,2019-12-10,Sin Datos,3580 / 81 / 82 / 84,,https://twitter.com/alfredodeangeli,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Argentina,"Population, total",SP.POP.TOTL,46936024,,,,,2025
2,alicia.kirchner@senado.gob.ar,ARG,Argentina,bicameral legislature,Parliament,,71,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,UNIDAD CIUDADANA,KIRCHNER,ALICIA MARGARITA ANTONIA,SANTA CRUZ,ALIANZA UNIÓN POR LA PATRIA,2023-12-10,2029-12-09,2023-12-10,Sin Datos,1389 / 1390,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Argentina,"Population, total",SP.POP.TOTL,46936024,,,,,2025
3,anabel.fernandezsagasti@senado.gob.ar,ARG,Argentina,bicameral legislature,Parliament,,71,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,UNIDAD CIUDADANA,FERNÁNDEZ SAGASTI,ANABEL,MENDOZA,FRENTE DE TODOS,2021-12-10,2027-12-09,2021-12-10,Sin Datos,1535 / 1467,https://www.facebook.com/anabelfsagasti,https://twitter.com/anabelfsagasti,https://www.instagram.com/anabelfsagasti,https://www.youtube.com/anabelfsagasti,,,,,,,,,,,,,,,,,,,,,,,,,,,,Argentina,"Population, total",SP.POP.TOTL,46936024,,,,,2025
4,andrea.cristina@senado.gob.ar,ARG,Argentina,bicameral legislature,Parliament,,71,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,FRENTE PRO,CRISTINA,ANDREA MARCELA,CHUBUT,JUNTOS POR EL CAMBIO CHUBUT,2023-12-10,2027-12-09,2023-12-10,Sin Datos,3453,https://www.facebook.com/AndreaCristina.Chubut?mibextid=LQQJ4d,https://x.com/andycristina07?s=11,https://www.instagram.com/andreacristina.chubut?igsh=Y2hzbzE3MWEyaGxl,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Argentina,"Population, total",SP.POP.TOTL,46936024,,,,,2025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,zainul_abidin_rasheed@mfa.gov.sg,SGP,Singapore,unicameral legislature,Parliament,,400,Zainul Abidin Rasheed,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"Senior Minister of State, Ministry of Foreign Affairs Mayor, North East District",Aljunied*,2006.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Singapore,"Population, total",SP.POP.TOTL,5953785,,,https://www.pap.org.sg/,,2006
4596,zainul_abidin_rasheed@pa.gov.sg,SGP,Singapore,unicameral legislature,Parliament,,400,Zainul Abidin Rasheed,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"Mayor, North East CDC",Aljunied*,2001.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Singapore,"Population, total",SP.POP.TOTL,5953785,,,https://www.pap.org.sg/,,2001
4597,zaqy_mohamad@mindef.gov.sg,SGP,Singapore,unicameral legislature,Parliament,,400,ZAQY Mohamad,,,"Ministry of Defence 303 Gombak Drive Off Upper Bukit Timah Road MINDEF Building Singapore 669645 Ministry of Manpower 18 Havelock Road, #07-01Singapore 059764",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"Senior Minister of State, Ministry of Defence &Ministry of ManpowerDeputy Leader of the House(Marsiling-Yew Tee*)",Marsiling-Yew Tee,2021.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Singapore,"Population, total",SP.POP.TOTL,5953785,,,https://www.pap.org.sg/,,2021
4598,zaqy_mohamad@mom.gov.sg,SGP,Singapore,unicameral legislature,Parliament,,400,ZAQY Mohamad,,,"Ministry of Defence 303 Gombak Drive Off Upper Bukit Timah Road MINDEF Building Singapore 669645 Ministry of Manpower 18 Havelock Road, #07-01Singapore 059764",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"Senior Minister of State, Ministry of Defence &Ministry of ManpowerDeputy Leader of the House(Marsiling-Yew Tee*)",Marsiling-Yew Tee,2021.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Singapore,"Population, total",SP.POP.TOTL,5953785,,,https://www.pap.org.sg/,,2021


In [14]:
df_scraped_emails.groupby("cc3").size()

cc3
ARG      71
BRA      81
DNK     186
GRC     368
IND    3251
NGA      69
NOR     174
SGP     400
dtype: int64

### Basic numbers

In [15]:
df_scraped_emails["nemail_cc3"].describe()

count    4600.000000
mean     2379.491304
std      1354.807201
min        69.000000
25%       400.000000
50%      3251.000000
75%      3251.000000
max      3251.000000
Name: nemail_cc3, dtype: float64

In [16]:
df_scraped_emails["legislature"].unique()

array(['Parliament', 'Folketing', 'Lok Sabha', 'State Legislature',
       'Senate', 'Storting'], dtype=object)

In [17]:
df_scraped_emails["email"].nunique()

4600

In [18]:
df_scraped_emails["name"].nunique()

1507

In [19]:
df_scraped_emails["cc3"].nunique()

8

In [20]:
df_scraped_emails.groupby(["cc3", "legislature"]).ngroups

9

In [21]:
set_edomain = set()
for _, row in df_scraped_emails.iterrows():
    try:
        edomain = row["email"].split("@")[1]
        set_edomain.add(edomain)
    except IndexError:
        print(_)
#         print(row["email"])

len(set_edomain)

265

In [22]:
# for cc in df_scraped_emails["cc3"].unique().tolist():
#     _df = df_scraped_emails.query(f"cc3=='{cc}'")
#     _set_edomain = set()
#     for _, row in _df.iterrows():
#         edomain = row["email"].split("@")[1]
#         _set_edomain.add(edomain)

#     print(cc)
#     print(len(_set_edomain))
#     print(_set_edomain)

## Combine EP + Scraped

In [23]:
df_pol_emails = (
    pd.concat(
        [df_ep_emails.assign(source="ep"), df_scraped_emails.assign(source="scraped")],
        ignore_index=True,
    )
    .pipe(lambda df_: clean_email_column_no_dedupe(df_))
    .drop_duplicates(subset=["email"], keep="first", ignore_index=True)
    # ================================================================
    # Get indicator for years and chambers for table1
    .assign(
        years=lambda df_: df_.groupby("cc3")["leg_start_year"].transform(
            lambda x: ", ".join(map(str, sorted(x.unique())))
        ),
        chambers=lambda df_: df_.groupby("cc3")["ltype"]
        .transform(lambda x: ", ".join(map(str, sorted(x.unique()))))
        .replace("legislature", "", regex=True)
        .replace("house", "", regex=True)
        .str.strip()
        .str.title(),
        legislatures=lambda df_: df_.groupby("cc3")["legislature"].transform(
            lambda x: ", ".join(map(str, sorted(x.unique())))
        ),
    )
    # ================================================================
    # Get #emails per country (cc3)
    .assign(nemail_cc3=lambda df_: df_.groupby("cc3")["email"].transform("nunique"))
    .sort_values(["cc3", "leg_start_year", "email"])
)
# del df_ep_emails, df_scraped_emails
df_pol_emails

Unnamed: 0,email,cc3,country,ltype,legislature,chamber,leg_start_year,nemail_cc3,gender,id,name,sort_name,twitter,facebook,group,group_id,area_id,area,term_x,start_date,end_date,image,wikidata,wikidata_group,wikidata_area,row_id,term_y,row_count,n_unique_emails,person_count_legistype,url,cc,leg_start_date,lastmod,pop2024,lastmod_year,source,party,title,address,private_phone,mobile_phone,work_phone,minister_phone,fax,mpsno,initial,firstname,lastname,partyfname,partysname,statename,constname,profession,presentfaddr,presentladdr,delhiphone,permanentfaddr,permanentladdr,personalphone,lastloksabha,lsexpr,age,phone,noofterms,status,imageurl,profileurl,dob,numberofsons,numberofdaughters,qualification,freedom,profession2,categorycode,currentpagenumber,perpagesize,totalelements,totalpages,source_file,mplastfirstname,mpfirstlastname,maritalstatus,createdat,updatedat,email_fix,rank,constituency,leg_start_year_sg,bloque,apellido,nombre,provincia,partido_o_alianza_por_el_que_ingreso,designacion_legal,cese_legal,designacion_real,cese_real,telefono,instagram,youtube,nome_parlamentar,partido,uf,titularidade,mandato,telefones,dtnasc,chefe_gab,endereco,state,district,parliament_address,parliament_number,social_media,region_x,contact,sr_no,photo,serial_no_,tel_no_r_,constituency_name,permanent_address,mobile,tele_no_res_no_,ac_no,candidate,Country Name,Series Name,Series Code,region_y,EU,politician,civil_servants,years,chambers,legislatures
0,albana.vokshi@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,female,c8bd71fc-4815-40b5-a5c4-bd359e3b0cef,Albana Vokshi,VOKSHI ALBANA,,,PD,c26946f7-8c5e-4474-a85e-ee5342cf4006,durrës_county,Durrës County,7,,,https://www.parlament.al/wp-content/uploads/2015/12/Albana-Vokshi-PD.jpg,Q4709025,Q845743,,490.0,VII Pluralist Legislature,140.0,31.0,292.0,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1.557735e+09,2740502.0,2019.0,ep,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"2009, 2013, 2017",Unicameral,Kuvendi
1,aldo.bumci@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,male,589ba883-b893-40ed-95f8-69b10f0497d9,Aldo Bumçi,BUMÇI ALDO,,,PD,c26946f7-8c5e-4474-a85e-ee5342cf4006,tirana_county,Tirana County,7,,,https://www.parlament.al/wp-content/uploads/2015/12/Aldo-Bumci-PD.jpg,Q2832310,Q845743,,490.0,VII Pluralist Legislature,140.0,31.0,292.0,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1.557735e+09,2740502.0,2019.0,ep,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"2009, 2013, 2017",Unicameral,Kuvendi
2,bashkim.fino@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,male,9d0181c7-7ebe-4b16-800a-6ea28baf22f5,Bashkim Fino,FINO BASHKIM,,,PS,0c83d4f1-9e86-48f2-95ee-9c3d158d1141,fier_county,Fier County,7,,,https://www.parlament.al/wp-content/uploads/2015/12/Bashkim-Fino-PS.jpg,Q809978,Q642882,,490.0,VII Pluralist Legislature,140.0,31.0,292.0,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1.557735e+09,2740502.0,2019.0,ep,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"2009, 2013, 2017",Unicameral,Kuvendi
3,besnik.baraj@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,male,d37e9f6d-9308-4017-bfed-70d081dcab83,Besnik Baraj,BARAJ BESNIK,,,PS,0c83d4f1-9e86-48f2-95ee-9c3d158d1141,tirana_county,Tirana County,7,,,https://www.parlament.al/wp-content/uploads/2015/12/Besnik-Baraj-PS.jpg,,Q642882,,490.0,VII Pluralist Legislature,140.0,31.0,292.0,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1.557735e+09,2740502.0,2019.0,ep,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"2009, 2013, 2017",Unicameral,Kuvendi
4,blendi.klosi@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,male,e2516f6c-a1a2-45d6-a23a-b73dd7742829,Blendi Klosi,KLOSI BLENDI,,,PS,0c83d4f1-9e86-48f2-95ee-9c3d158d1141,durrës_county,Durrës County,7,,,https://www.parlament.al/wp-content/uploads/2016/01/Blendi-Klosi-PS.jpg,Q13037656,Q642882,,490.0,VII Pluralist Legislature,140.0,31.0,292.0,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1.557735e+09,2740502.0,2019.0,ep,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"2009, 2013, 2017",Unicameral,Kuvendi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8317,udtarusenga@gmail.com,ZWE,Zimbabwe,lower house,House of Assembly,House of Assembly,2013,37,male,7bd3e99f-5023-4191-8eee-fecb519650a6,Unganai Tarusenga,"Tarusenga, Unganai",,,MDC-T,mdc-t,st_mary's,St Mary's,8,,,http://www.parlzim.gov.zw/media/k2/items/cache/f4db67ec2dbaadd122c4c0b528856935_XL.jpg,,Q1146616,,253.0,8th Parliament,229.0,37.0,229.0,https://cdn.rawgit.com/everypolitician/everypolitician-data/f74be3db4c76d1b42155ad56fa63cad6b3ed2bdd/data/Zimbabwe/Assembly/term-8.csv,ZW,2013-09-17,1.556117e+09,17020321.0,2019.0,ep,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2013,Lower,House of Assembly
8318,user@parlzim.gov,ZWE,Zimbabwe,lower house,House of Assembly,House of Assembly,2013,37,female,33a8a270-99c9-435a-b3d8-66486e865558,Mable M. Chinomona,"Chinomona, Mable M.",,,ZANU PF,zanu_pf,mutoko_north,Mutoko North,8,,,http://www.parlzim.gov.zw/media/k2/items/cache/9267284e7733f4bec00d2e114d3f3ba1_XL.jpg,,Q1910161,,253.0,8th Parliament,229.0,37.0,229.0,https://cdn.rawgit.com/everypolitician/everypolitician-data/f74be3db4c76d1b42155ad56fa63cad6b3ed2bdd/data/Zimbabwe/Assembly/term-8.csv,ZW,2013-09-17,1.556117e+09,17020321.0,2019.0,ep,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2013,Lower,House of Assembly
8319,wmaondera@gmail.com,ZWE,Zimbabwe,lower house,House of Assembly,House of Assembly,2013,37,male,970274bf-f4a8-4154-aa1d-bbee67b5960a,Webster Maondera,"Maondera, Webster",,,MDC-T,mdc-t,glen_norah,Glen Norah,8,,,http://www.kuvakazim.com/media_root/images/Webster_Maondera.jpg,,Q1146616,,253.0,8th Parliament,229.0,37.0,229.0,https://cdn.rawgit.com/everypolitician/everypolitician-data/f74be3db4c76d1b42155ad56fa63cad6b3ed2bdd/data/Zimbabwe/Assembly/term-8.csv,ZW,2013-09-17,1.556117e+09,17020321.0,2019.0,ep,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2013,Lower,House of Assembly
8320,wmutomba@gmail.com,ZWE,Zimbabwe,lower house,House of Assembly,House of Assembly,2013,37,male,44bfd6ef-ad19-41fc-9383-59f254bee19a,William Mutomba,"Mutomba, William",,,ZANU PF,zanu_pf,buhera_north,Buhera North,8,,,http://www.kuvakazim.com/media_root/images/mutombawilliam.jpg,,Q1910161,,253.0,8th Parliament,229.0,37.0,229.0,https://cdn.rawgit.com/everypolitician/everypolitician-data/f74be3db4c76d1b42155ad56fa63cad6b3ed2bdd/data/Zimbabwe/Assembly/term-8.csv,ZW,2013-09-17,1.556117e+09,17020321.0,2019.0,ep,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2013,Lower,House of Assembly


In [24]:
df_pol_emails["nemail_cc3"].describe()

count    12577.000000
mean      1108.044764
std       1366.455739
min         31.000000
25%        162.000000
50%        385.000000
75%       3353.000000
max       3353.000000
Name: nemail_cc3, dtype: float64

In [25]:
df_pol_emails["email"].nunique()

12577

In [26]:
df_pol_emails["cc3"].nunique()

59

In [27]:
df_pol_emails.groupby(["cc3", "legislature"]).ngroups

67

In [28]:
set_edomains = set()
for cc in df_pol_emails["cc3"].unique().tolist():
    _df = df_pol_emails.query(f"cc3=='{cc}'")
    _set_edomain = set()
    for _, row in _df.iterrows():
        edomain = row["email"].split("@")[1]
        _set_edomain.add(edomain)
        set_edomains.add(edomain)

#     print(cc)
#     print(len(_set_edomain))
#     print(_set_edomain)

In [29]:
len(set_edomains)

685

## Table1

In [30]:
df_tab1 = (
    df_pol_emails.drop_duplicates(["cc3"], ignore_index=True)
    #     .filter(["cc3", "country", "nemail_cc3", "female_prop_cc3", "years", "chambers", "legislatures", "pop2024"])
    #     .assign(female_prop_cc3=lambda df_: (100 * df_["female_prop_cc3"]).round(1).astype(str) + "\\%")
    .filter(
        ["cc3", "country", "nemail_cc3", "years", "chambers", "legislatures", "pop2024"]
    )
    .assign(
        pop2024=lambda df_: (df_["pop2024"].astype(float) / 1_000_000)
        .round(1)
        .apply(str)
    )
    .replace("Bosnia-and-Herzegovina", "Bosnia")
    .replace(
        "House of Commons, National Assembly for Wales, Scottish Parliament",
        "Commons, Senedd, Scottish Parliament",
    )
    # Tidy year strings
    .replace("1997, 2001, 2005, 2007, 2010, 2011, 2015, 2016, 2017", "1997--2017")
    .replace("2004, 2007, 2010, 2013, 2016", "2004--2016")
    .replace("2001, 2005, 2007, 2011, 2015", "2001--2015")
    .replace("2004, 2007, 2009, 2012, 2015", "2004--2015")
    .replace("2001, 2005, 2007, 2011, 2015, 2025", "2001--2025")
    .replace("2008, 2011, 2014, 2017", "2008--2017")
    .replace("2006, 2010, 2012, 2016", "2006--2016")
    .replace("2001, 2006, 2011, 2015, 2021", "2001--2025")
    .replace("2004, 2007, 2009, 2012, 2015, 2025", "2004--2025")
    .replace("Lower ", "Lower", regex=True)
    .replace("Upper, Lower, Upper", "Lower, Upper", regex=True)
    .assign(ix=lambda df_: range(1, 1 + len(df_)))
    .reorder_columns(["ix"])
)
df_tab1

Unnamed: 0,ix,cc3,country,nemail_cc3,years,chambers,legislatures,pop2024
0,1,ALB,Albania,140,"2009, 2013, 2017",Unicameral,Kuvendi,2.7
1,2,AND,Andorra,31,2015,Unicameral,Consell General,0.1
2,3,ARG,Argentina,71,2025,Bicameral,Parliament,46.9
3,4,ARM,Armenia,119,2019,Unicameral,National Assembly,2.8
4,5,AUS,Australia,177,2004--2016,"Lower, Upper","House of Representatives, Senate",26.9
5,6,BEL,Belgium,149,2014,Lower,Chamber of Representatives,11.9
6,7,BGR,Bulgaria,206,"2013, 2014, 2017",Unicameral,National Assembly,6.4
7,8,BIH,Bosnia,42,2014,Lower,House of Representatives,3.2
8,9,BLR,Belarus,59,2016,Unicameral,House of Representatives,9.1
9,10,BMU,Bermuda,33,2017,Lower,Parliament,


In [31]:
df_tab1["nemail_cc3"].sum()

np.int64(12577)

In [32]:
df_tab1["pop2024"].astype(float).sum()

np.float64(2884.5)

In [33]:
# % coverage of global pop
100 * (df_tab1["pop2024"].astype(float).sum()) / 8000

np.float64(36.05625)

In [34]:
pandas_to_tex(
    df_tab1.replace("nan", "---"),
    "../tables/hipb_pooled_emailcoverage_summary.tex",
    index=False,
)

In [35]:
!cat ../tables/hipb_pooled_emailcoverage_summary.tex

\midrule
1 & ALB & Albania & 140 & 2009, 2013, 2017 & Unicameral & Kuvendi & 2.7 \\
2 & AND & Andorra & 31 & 2015 & Unicameral & Consell General & 0.1 \\
3 & ARG & Argentina & 71 & 2025 & Bicameral & Parliament & 46.9 \\
4 & ARM & Armenia & 119 & 2019 & Unicameral & National Assembly & 2.8 \\
5 & AUS & Australia & 177 & 2004--2016 & Lower, Upper & House of Representatives, Senate & 26.9 \\
6 & BEL & Belgium & 149 & 2014 & Lower & Chamber of Representatives & 11.9 \\
7 & BGR & Bulgaria & 206 & 2013, 2014, 2017 & Unicameral & National Assembly & 6.4 \\
8 & BIH & Bosnia & 42 & 2014 & Lower & House of Representatives & 3.2 \\
9 & BLR & Belarus & 59 & 2016 & Unicameral & House of Representatives & 9.1 \\
10 & BMU & Bermuda & 33 & 2017 & Lower & Parliament & --- \\
11 & BRA & Brazil & 81 & 2025 & Bicameral & Parliament & 217.6 \\
12 & BTN & Bhutan & 48 & 2013 & Lower & National Assembly & 0.8 \\
13 & CAN & Canada & 432 & 2011, 2015 & Lower, Upper & House of Commons, Senate & 40.

## Merge to HIBP

### EP data

In [36]:
df_ep_email_breach_expanded = (
    # All EP emails
    df_ep_emails.filter(
        [
            "email",
            "gender",
            "cc3",
            "country",
            "ltype",
            "legislature",
            "chamber",
            "leg_start_year",
            "nemail_cc3",
        ]
    )
    # ========================================================================
    # Merge to breached status
    .merge(
        (
            pd.read_csv("../data/everypol_hibp.csv")
            .clean_names()
            .rename_column("filename", "email")
        ),
        how="left",
        on="email",
        validate="1:m",
    ).dropna(subset=["breach"])
    # ========================================================================
    # Merge to expanded breach info
    .merge(
        (
            pd.read_parquet("../data/breaches_01_2025_expanded.parquet")
            .remove_columns(
                [
                    "description",
                    "title",
                    "domain",
                    "breachdate",
                    "addeddate",
                    "modifieddate",
                    "logopath",
                    "timetopublic",
                    "dataclasses",
                ]
            )
            .rename_column("name", "breach")
        ),
        how="left",
        on="breach",
        validate="m:1",
    )
    # ========================================================================
    #     .query("present==True")
    .reset_index(drop=True)
)
df_ep_email_breach_expanded.head(3)

Unnamed: 0,email,gender,cc3,country,ltype,legislature,chamber,leg_start_year,nemail_cc3,breach,present,pwncount,isverified,isfabricated,issensitive,isretired,isspamlist,ismalware,issubscriptionfree,yearstopublic,n_dataclasses,Education levels,Private messages,Personal health data,Licence plates,Apps installed on devices,Financial transactions,Family structure,Support tickets,Eating habits,Religions,Vehicle identification numbers (VINs),Sexual orientations,Instant messenger identities,Deceased date,Website activity,Security questions and answers,Passwords,Account balances,Customer feedback,Home ownership statuses,Net worths,Tattoo status,Loan information,User statuses,Health insurance information,Car ownership statuses,Telecommunications carrier,Income levels,Career levels,Mothers maiden names,Bank account numbers,Password strengths,Work habits,Historical passwords,Physical attributes,Parenting plans,Time zones,Email messages,Living costs,Sexual fetishes,Family members' names,Geographic locations,Political views,Government issued IDs,Personal interests,MAC addresses,Browsing histories,Device information,Places of birth,Partial credit card data,Years of professional experience,Dates of birth,Utility bills,Deceased statuses,...,HIV statuses,Company names,Beauty ratings,Political donations,Races,Survey results,Flights taken,Credit card CVV,Drug habits,Usernames,Spoken languages,Social security numbers,Ages,Chat logs,Comments,Cellular network names,Social media profiles,Browser user agent details,Citizenship statuses,Travel plans,Purchasing habits,Device usage tracking data,Credit status information,Spouses names,Relationship statuses,Auth tokens,Charitable donations,Nationalities,Biometric data,Recovery email addresses,Taxation records,Encrypted keys,Delivery instructions,Payment methods,Salutations,Fitness levels,Email addresses,Audio recordings,Job titles,User website URLs,Reward program balances,Age groups,Physical addresses,Employment statuses,Login histories,Driver's licenses,IMSI numbers,Names,Avatars,Employers,Appointments,Travel habits,Purchases,Customer interactions,Buying preferences,Warranty claims,Partial dates of birth,Ethnicities,Mnemonic phrases,Marital statuses,Payment histories,Social connections,IMEI numbers,Password hints,Vehicle details,IP addresses,Financial investments,PINs,Occupations,Cryptocurrency wallet addresses,SMS messages,Astrological signs,Genders,Credit cards,Personal descriptions
0,albana.vokshi@parlament.al,female,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,000webhost,False,14936670,True,False,False,False,False,False,False,0.654795,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,albana.vokshi@parlament.al,female,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,123RF,False,8661578,True,False,False,False,False,False,False,0.652055,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,albana.vokshi@parlament.al,female,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,2844Breaches,False,80115532,False,False,False,False,False,False,False,0.019178,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [37]:
df_ep_email_breach_expanded["email"].nunique()

8129

In [38]:
df_ep_emails["email"].nunique()

8511

### Scraped data

In [39]:
df_scraped_email_breach_expanded = (
    df_scraped_emails.filter(["email", "cc3", "country", "year", "nemail_cc3"])
    # ============================================================================
    # Merge to breached status
    .merge(
        (
            pd.read_csv("../data/scraped_pol_hibp.csv")
            .clean_names()
            .rename_column("filename", "email")
        ),
        how="left",
        on="email",
        validate="1:m",
    )
    #     .dropna(subset=["breach"])
    .assign(present=lambda df_: df_["present"].fillna(False))
    # ============================================================================
    # Merge to expanded breach info
    .merge(
        (
            pd.read_parquet("../data/breaches_01_2025_expanded.parquet")
            .remove_columns(
                [
                    "description",
                    "title",
                    "domain",
                    "breachdate",
                    "addeddate",
                    "modifieddate",
                    "logopath",
                    "timetopublic",
                    "dataclasses",
                ]
            )
            .rename_column("name", "breach")
        ),
        how="left",
        on="breach",
        validate="m:1",
    )
    # ============================================================================
    #     .query("present==True")
    .reset_index(drop=True)
)
df_scraped_email_breach_expanded.head()

Unnamed: 0,email,cc3,country,nemail_cc3,breach,present,pwncount,isverified,isfabricated,issensitive,isretired,isspamlist,ismalware,issubscriptionfree,yearstopublic,n_dataclasses,Education levels,Private messages,Personal health data,Licence plates,Apps installed on devices,Financial transactions,Family structure,Support tickets,Eating habits,Religions,Vehicle identification numbers (VINs),Sexual orientations,Instant messenger identities,Deceased date,Website activity,Security questions and answers,Passwords,Account balances,Customer feedback,Home ownership statuses,Net worths,Tattoo status,Loan information,User statuses,Health insurance information,Car ownership statuses,Telecommunications carrier,Income levels,Career levels,Mothers maiden names,Bank account numbers,Password strengths,Work habits,Historical passwords,Physical attributes,Parenting plans,Time zones,Email messages,Living costs,Sexual fetishes,Family members' names,Geographic locations,Political views,Government issued IDs,Personal interests,MAC addresses,Browsing histories,Device information,Places of birth,Partial credit card data,Years of professional experience,Dates of birth,Utility bills,Deceased statuses,Job applications,Drinking habits,Nicknames,Passport numbers,Smoking habits,...,HIV statuses,Company names,Beauty ratings,Political donations,Races,Survey results,Flights taken,Credit card CVV,Drug habits,Usernames,Spoken languages,Social security numbers,Ages,Chat logs,Comments,Cellular network names,Social media profiles,Browser user agent details,Citizenship statuses,Travel plans,Purchasing habits,Device usage tracking data,Credit status information,Spouses names,Relationship statuses,Auth tokens,Charitable donations,Nationalities,Biometric data,Recovery email addresses,Taxation records,Encrypted keys,Delivery instructions,Payment methods,Salutations,Fitness levels,Email addresses,Audio recordings,Job titles,User website URLs,Reward program balances,Age groups,Physical addresses,Employment statuses,Login histories,Driver's licenses,IMSI numbers,Names,Avatars,Employers,Appointments,Travel habits,Purchases,Customer interactions,Buying preferences,Warranty claims,Partial dates of birth,Ethnicities,Mnemonic phrases,Marital statuses,Payment histories,Social connections,IMEI numbers,Password hints,Vehicle details,IP addresses,Financial investments,PINs,Occupations,Cryptocurrency wallet addresses,SMS messages,Astrological signs,Genders,Credit cards,Personal descriptions
0,alejandra.vigo@senado.gob.ar,ARG,Argentina,71,StealerLogsJan2025,False,71039833.0,True,False,False,False,False,False,False,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,alfredo.deangeli@senado.gob.ar,ARG,Argentina,71,StealerLogsJan2025,False,71039833.0,True,False,False,False,False,False,False,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,alicia.kirchner@senado.gob.ar,ARG,Argentina,71,StealerLogsJan2025,False,71039833.0,True,False,False,False,False,False,False,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,anabel.fernandezsagasti@senado.gob.ar,ARG,Argentina,71,StealerLogsJan2025,False,71039833.0,True,False,False,False,False,False,False,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,andrea.cristina@senado.gob.ar,ARG,Argentina,71,StealerLogsJan2025,False,71039833.0,True,False,False,False,False,False,False,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Manually check

In [40]:
# non_breached_emails = []
# for pol in df["email"].unique():
#     if pol not in df_ep_email_breach_expanded["email"].unique().tolist():
#         non_breached_emails.append(pol)

In [41]:
# import random
# random.seed(42)

# # Sample 10 unique emails
# sampled_emails = random.sample(non_breached_emails, 20)
# sampled_emails

Paste (no breach) 

* g.akriotis@parliament.gr

### Combine

In [42]:
df_email_breach_expanded = (
    pd.concat(
        [
            df_ep_email_breach_expanded.remove_columns(
                ["gender", "country", "ltype", "chamber", "legislature"]
            ).rename_column("leg_start_year", "year"),
            df_scraped_email_breach_expanded.remove_columns(["country"]),
        ],
        ignore_index=True,
    ).fillna(0)
    # ========================================================================
    .assign(
        seriousbreach=lambda df_: df_[LIST_SERIOUS_DATACLASSES].max(axis=1).astype(int)
    )
    # ========================================================================
    .pipe(lambda df_: classify_comm_gov_email(df_))
)
df_email_breach_expanded

Unnamed: 0,email,cc3,year,nemail_cc3,breach,present,pwncount,isverified,isfabricated,issensitive,isretired,isspamlist,ismalware,issubscriptionfree,yearstopublic,n_dataclasses,Education levels,Private messages,Personal health data,Licence plates,Apps installed on devices,Financial transactions,Family structure,Support tickets,Eating habits,Religions,Vehicle identification numbers (VINs),Sexual orientations,Instant messenger identities,Deceased date,Website activity,Security questions and answers,Passwords,Account balances,Customer feedback,Home ownership statuses,Net worths,Tattoo status,Loan information,User statuses,Health insurance information,Car ownership statuses,Telecommunications carrier,Income levels,Career levels,Mothers maiden names,Bank account numbers,Password strengths,Work habits,Historical passwords,Physical attributes,Parenting plans,Time zones,Email messages,Living costs,Sexual fetishes,Family members' names,Geographic locations,Political views,Government issued IDs,Personal interests,MAC addresses,Browsing histories,Device information,Places of birth,Partial credit card data,Years of professional experience,Dates of birth,Utility bills,Deceased statuses,Job applications,Drinking habits,Nicknames,Passport numbers,Smoking habits,...,Political donations,Races,Survey results,Flights taken,Credit card CVV,Drug habits,Usernames,Spoken languages,Social security numbers,Ages,Chat logs,Comments,Cellular network names,Social media profiles,Browser user agent details,Citizenship statuses,Travel plans,Purchasing habits,Device usage tracking data,Credit status information,Spouses names,Relationship statuses,Auth tokens,Charitable donations,Nationalities,Biometric data,Recovery email addresses,Taxation records,Encrypted keys,Delivery instructions,Payment methods,Salutations,Fitness levels,Email addresses,Audio recordings,Job titles,User website URLs,Reward program balances,Age groups,Physical addresses,Employment statuses,Login histories,Driver's licenses,IMSI numbers,Names,Avatars,Employers,Appointments,Travel habits,Purchases,Customer interactions,Buying preferences,Warranty claims,Partial dates of birth,Ethnicities,Mnemonic phrases,Marital statuses,Payment histories,Social connections,IMEI numbers,Password hints,Vehicle details,IP addresses,Financial investments,PINs,Occupations,Cryptocurrency wallet addresses,SMS messages,Astrological signs,Genders,Credit cards,Personal descriptions,seriousbreach,domain,ecategory
0,albana.vokshi@parlament.al,ALB,2009.0,140,000webhost,False,14936670.0,True,False,False,False,False,False,False,0.654795,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,parlament.al,Official
1,albana.vokshi@parlament.al,ALB,2009.0,140,123RF,False,8661578.0,True,False,False,False,False,False,False,0.652055,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,parlament.al,Official
2,albana.vokshi@parlament.al,ALB,2009.0,140,2844Breaches,False,80115532.0,False,False,False,False,False,False,False,0.019178,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,parlament.al,Official
3,albana.vokshi@parlament.al,ALB,2009.0,140,500px,False,14867999.0,True,False,False,False,False,False,False,0.720548,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,parlament.al,Official
4,albana.vokshi@parlament.al,ALB,2009.0,140,8fit,False,15025407.0,True,False,False,False,False,False,False,0.720548,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1,parlament.al,Official
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3208634,zhul.rahim@pap.org.sg,SGP,0.0,400,Yatra,False,5033997.0,True,False,False,False,False,False,False,4.841096,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,pap.org.sg,Official
3208635,zhul.rahim@pap.org.sg,SGP,0.0,400,YouveBeenScraped,False,66147869.0,True,False,False,False,False,False,False,0.169863,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,pap.org.sg,Official
3208636,zhul.rahim@pap.org.sg,SGP,0.0,400,Zacks,False,8929503.0,True,False,False,False,False,False,False,3.084932,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,pap.org.sg,Official
3208637,zhul.rahim@pap.org.sg,SGP,0.0,400,Zynga,False,172869660.0,True,False,False,False,False,False,False,0.298630,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,pap.org.sg,Official


In [103]:
# (
#     df_email_breach_expanded
#     .query("ecategory=='Official'")
#     .drop_duplicates(["email"])
#     .groupby("domain")
#     .size()
#     .reset_index(name="count")
#     .sort_values("count", ascending=False, ignore_index=True)
#     .head(100)
# )

In [102]:
# (
#     df_email_breach_expanded
#     .query("ecategory=='Commercial'")
#     .drop_duplicates(["email"])
#     .groupby("domain")
#     .size()
#     .reset_index(name="count")
#     .sort_values("count", ascending=False, ignore_index=True)
#     .head(50)
# )

### Basic numbers

In [45]:
# pols involved in 563 data breaches
df_email_breach_expanded["breach"].nunique()

563

In [46]:
# 3750 pols have been breached
n_pwned_pols = df_email_breach_expanded.query("present==True")["email"].nunique()
n_pwned_pols

3750

In [47]:
100*n_pwned_pols / df_email_breach_expanded["email"].nunique()

30.747786159396522

In [48]:
# 2415 pols have been breached multiple times
n_multi_pwned_pols = (
    df_email_breach_expanded
    .groupby(["email"])["present"]
    .sum()
    .reset_index()
    .query("present>1")["email"]
    .nunique()
)
n_multi_pwned_pols

2415

In [49]:
# Proportion of multiple breaches
100 * (n_multi_pwned_pols / n_pwned_pols)

64.4

In [50]:
# Proportion of total
100 * n_multi_pwned_pols / df_email_breach_expanded["email"].nunique()

19.801574286651363

In [51]:
# How many email-pw pairs
_n = (
    df_email_breach_expanded
    .query("present==True")
    .clean_names()
    .query("email_addresses==1")
    .query("passwords==1")["email"]
    .nunique()
)

In [52]:
100 * _n / n_pwned_pols

65.97333333333333

In [53]:
# How many had serious breaches
(
    df_email_breach_expanded
    .query("present==True")
    .query("seriousbreach==1")
    ["email"]
    .nunique()
)

2545

In [54]:
# How many had multiple serious breaches
(
    df_email_breach_expanded.query("present==True")
    .query("seriousbreach==1")
    .groupby(["email"])["seriousbreach"]
    .sum()
    .reset_index()
    .query("seriousbreach>1")["email"]
    .nunique()
)

1347

### Basic numbers - Personal/Comm

In [55]:
df_email_breach_expanded.query("ecategory=='Commercial'")["email"].nunique()

2830

In [56]:
# pols involved in 563 data breaches
df_email_breach_expanded.query("ecategory=='Commercial'")["breach"].nunique()

563

In [57]:
# 1311 pols have been breached
n_pwned_pols_comm = (
    df_email_breach_expanded.query("ecategory=='Commercial'")
    .query("present==True")["email"]
    .nunique()
)
n_pwned_pols_comm

1311

In [58]:
100 * n_pwned_pols_comm / df_email_breach_expanded.query("ecategory=='Commercial'")[
    "email"
].nunique()

46.32508833922262

In [59]:
# 873 pols have been breached multiple times
n_multi_pwned_pols_comm = (
    df_email_breach_expanded.query("ecategory=='Commercial'")
    .groupby(["email"])["present"]
    .sum()
    .reset_index()
    .query("present>1")["email"]
    .nunique()
)
n_multi_pwned_pols_comm

873

In [60]:
# Proportion of multiple breaches
100 * (n_multi_pwned_pols_comm / n_pwned_pols_comm)

66.5903890160183

In [61]:
# How many email-pw pairs
_n = (
    df_email_breach_expanded.query("ecategory=='Commercial'")
    .query("present==True")
    .clean_names()
    .query("email_addresses==1")
    .query("passwords==1")["email"]
    .nunique()
)

In [62]:
100 * _n / n_pwned_pols_comm

72.61632341723875

In [63]:
# How many had serious breaches
df_email_breach_expanded.query("ecategory=='Commercial'").query("present==True").query(
    "seriousbreach==1"
)["email"].nunique()

973

In [64]:
# How many had multiple serious breaches
(
    df_email_breach_expanded.query("ecategory=='Commercial'")
    .query("present==True")
    .query("seriousbreach==1")
    .groupby(["email"])["seriousbreach"]
    .sum()
    .reset_index()
    .query("seriousbreach>1")["email"]
    .nunique()
)

586

### Basic numbers - Official

In [65]:
df_email_breach_expanded.query("ecategory=='Official'")["email"].nunique()

9366

In [66]:
# pols involved in 563 data breaches
df_email_breach_expanded.query("ecategory=='Official'")["breach"].nunique()

563

In [67]:
# 2439 pols have been breached
n_pwned_pols_govt = (
    df_email_breach_expanded
    .query("ecategory=='Official'")
    .query("present==True")["email"]
    .nunique()
)
n_pwned_pols_comm

1311

In [68]:
100 * n_pwned_pols_govt / df_email_breach_expanded.query("ecategory=='Official'")[
    "email"
].nunique()

26.040999359385008

In [69]:
# 1542 pols have been breached multiple times
n_multi_pwned_pols_govt = (
    df_email_breach_expanded.query("ecategory=='Official'")
    .groupby(["email"])["present"]
    .sum()
    .reset_index()
    .query("present>1")["email"]
    .nunique()
)
n_multi_pwned_pols_comm

873

In [70]:
# Proportion of multiple breaches
100 * (n_multi_pwned_pols_govt / n_pwned_pols_comm)

117.62013729977116

In [71]:
# How many email-pw pairs
_n = (
    df_email_breach_expanded
    .query("ecategory=='Official'")
    .query("present==True")
    .clean_names()
    .query("email_addresses==1")
    .query("passwords==1")["email"]
    .nunique()
)

In [72]:
100 * _n / n_pwned_pols_comm

116.09458428680396

In [73]:
# How many had serious breaches
df_email_breach_expanded.query("ecategory=='Official'").query("present==True").query(
    "seriousbreach==1"
)["email"].nunique()

1572

In [74]:
# How many had multiple serious breaches
(
    df_email_breach_expanded
    .query("ecategory=='Official'")
    .query("present==True")
    .query("seriousbreach==1")
    .groupby(["email"])["seriousbreach"]
    .sum()
    .reset_index()
    .query("seriousbreach>1")["email"]
    .nunique()
)

761

## Tabulate

### Breach summary

In [75]:
_all = (
    df_email_breach_expanded.groupby("email")["present"]
    .sum()
    .pipe(lambda s: s.describe().to_frame().T)
    .assign(perc_at_least_1=100*n_pwned_pols/df_email_breach_expanded["email"].nunique())
)
_all

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,perc_at_least_1
present,12196.0,1.070023,5.910541,0.0,0.0,0.0,1.0,484.0,30.747786


In [76]:
_govt = (
    df_email_breach_expanded
    .query("ecategory=='Official'")
    .groupby("email")["present"]
    .sum()
    .pipe(lambda s: s.describe().to_frame().T)
    .astype(int)
    .assign(perc_at_least_1=100*n_pwned_pols_govt/df_email_breach_expanded.query("ecategory=='Official'")["email"].nunique())
)
_govt

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,perc_at_least_1
present,9366,0,1,0,0,0,1,36,26.040999


In [77]:
_comm = (
    df_email_breach_expanded.query("ecategory=='Commercial'")
    .groupby("email")["present"]
    .sum()
    .pipe(lambda s: s.describe().to_frame().T)
    .astype(int)
    .assign(perc_at_least_1=100*n_pwned_pols_comm/df_email_breach_expanded.query("ecategory=='Commercial'")["email"].nunique())
)
_comm

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,perc_at_least_1
present,2830,2,11,0,0,0,2,484,46.325088


In [78]:
_tab = (
    pd.concat([_all, _govt, _comm], ignore_index=True)
    .assign(**{col: lambda df_, col=col: df_[col].round(0).astype(int) for col in ["min", "25%", "50%", "75%", "max"]})
    .round(1)
    .assign(perc_at_least_1=lambda df_: df_["perc_at_least_1"].astype(str) +"\\%")
    .assign(count=lambda df_: df_["count"].round(0).astype(int).apply(lambda x: f"{x:,}"))
    .astype(str)
)
pandas_to_tex(_tab, "../tables/pooled_pols_breach_number_summary", index=False)
display(_tab)
!cat ../tables/pooled_pols_breach_number_summary.tex

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,perc_at_least_1
0,12196,1.1,5.9,0,0,0,1,484,30.7\%
1,9366,0.0,1.0,0,0,0,1,36,26.0\%
2,2830,2.0,11.0,0,0,0,2,484,46.3\%


\midrule
12,196 & 1.1 & 5.9 & 0 & 0 & 0 & 1 & 484 & 30.7\% \\
9,366 & 0.0 & 1.0 & 0 & 0 & 0 & 1 & 36 & 26.0\% \\
2,830 & 2.0 & 11.0 & 0 & 0 & 0 & 2 & 484 & 46.3\% \\

In [79]:
_all = (
    df_email_breach_expanded
    .query("seriousbreach==1")
    .groupby("email")["present"]
    .sum()
    .pipe(lambda s: s.describe().to_frame().T)
    .assign(perc_at_least_1=100*df_email_breach_expanded.query("present==True").query(
    "seriousbreach==1"
)["email"].nunique()/df_email_breach_expanded["email"].nunique())
)
_all

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,perc_at_least_1
present,12163.0,0.544849,4.577156,0.0,0.0,0.0,0.0,393.0,20.867498


In [80]:
_govt = (
    df_email_breach_expanded
    .query("ecategory=='Official'")
    .query("seriousbreach==1")
    .groupby("email")["present"]
    .sum()
    .pipe(lambda s: s.describe().to_frame().T)
    .astype(int)
    .assign(perc_at_least_1=100*df_email_breach_expanded.query("ecategory=='Official'").query("present==True").query(
    "seriousbreach==1"
)["email"].nunique()/df_email_breach_expanded.query("ecategory=='Official'")["email"].nunique())
)
_govt

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,perc_at_least_1
present,9348,0,0,0,0,0,0,18,16.784113


In [81]:
_comm = (
    df_email_breach_expanded
    .query("ecategory=='Commercial'")
    .query("seriousbreach==1")
    .groupby("email")["present"]
    .sum()
    .pipe(lambda s: s.describe().to_frame().T)
    .astype(int)
    .assign(perc_at_least_1=100*df_email_breach_expanded.query("ecategory=='Commercial'").query("present==True").query(
    "seriousbreach==1"
)["email"].nunique()/df_email_breach_expanded.query("ecategory=='Commercial'")["email"].nunique())
)
_comm

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,perc_at_least_1
present,2815,1,9,0,0,0,1,393,34.381625


In [82]:
_tab = (
    pd.concat([_all, _govt, _comm], ignore_index=True)
    .assign(**{col: lambda df_, col=col: df_[col].round(0).astype(int) for col in ["min", "25%", "50%", "75%", "max"]})
    .round(1)
    .assign(perc_at_least_1=lambda df_: df_["perc_at_least_1"].astype(str) +"\\%")
    .assign(count=lambda df_: df_["count"].round(0).astype(int).apply(lambda x: f"{x:,}"))
    .astype(str)
)
pandas_to_tex(_tab, "../tables/pooled_pols_seriousbreach_number_summary", index=False)
display(_tab)
!cat ../tables/pooled_pols_breach_number_summary.tex

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,perc_at_least_1
0,12163,0.5,4.6,0,0,0,0,393,20.9\%
1,9348,0.0,0.0,0,0,0,0,18,16.8\%
2,2815,1.0,9.0,0,0,0,1,393,34.4\%


\midrule
12,196 & 1.1 & 5.9 & 0 & 0 & 0 & 1 & 484 & 30.7\% \\
9,366 & 0.0 & 1.0 & 0 & 0 & 0 & 1 & 36 & 26.0\% \\
2,830 & 2.0 & 11.0 & 0 & 0 & 0 & 2 & 484 & 46.3\% \\

### Tabulate: Compromised data types

In [83]:
df_pwnpol_datatype = (
    df_email_breach_expanded
    .query("present==True")
    # ===================================================
    # Filter data types and remove duplicate breach info
    .set_index("email")
    .iloc[:, 15:-3]
    .reset_index()
    .drop_duplicates(ignore_index=True)
    # ===================================================
    # Long by email-datatype
    .melt(id_vars=["email"], var_name="datatype", value_name="present")
    .groupby(["email", "datatype"])["present"]
    .sum()
    .reset_index()
    .assign(present=lambda df_: np.where(df_["present"] > 0, 1, 0))
    # ===================================================
    # Back to wide by email
    .pivot(index="email", columns="datatype", values="present")
    # ===================================================
    .T.sum(axis=1)
    .reset_index(name="count")
    .sort_values(["count", "datatype"], ascending=[False, True], ignore_index=True)
    .assign(percent=lambda df_: 100 * df_["count"] / n_pwned_pols)
    .assign(percent=lambda df_: df_["percent"].apply(lambda x: f"{round(x, 1)}\\%"))
    # ===================================================
    .assign(
        seriousbreach=lambda df_: df_["datatype"].isin(LIST_SERIOUS_DATACLASSES).map({True: r"\checkmark", False: ""})
    )
)
df_pwnpol_datatype

Unnamed: 0,datatype,count,percent,seriousbreach
0,Email addresses,3749,100.0\%,
1,Names,3154,84.1\%,
2,Phone numbers,2897,77.3\%,
3,Job titles,2613,69.7\%,
4,Passwords,2475,66.0\%,\checkmark
5,Physical addresses,2440,65.1\%,
6,Social media profiles,2430,64.8\%,
7,Geographic locations,1891,50.4\%,
8,Employers,1690,45.1\%,
9,Genders,1597,42.6\%,


In [84]:
df_datatype1 = (
    df_pwnpol_datatype.iloc[:20]
    .reset_index(drop=True)
    .assign(ix=range(1, 21))
    .astype(str)
    .reorder_columns(["ix"])
)
df_datatype2 = (
    df_pwnpol_datatype.iloc[20:40]
    .reset_index(drop=True)
    .assign(ix=range(21, 41))
    .astype(str)
    .reorder_columns(["ix"])
)
df_datatype3 = (
    df_pwnpol_datatype.iloc[40:60]
    .reset_index(drop=True)
    .assign(ix=range(41, 61))
    .astype(str)
    .reorder_columns(["ix"])
)
df2tex = pd.concat([df_datatype1, df_datatype2, df_datatype3], axis=1)
# print(df2tex.to_latex(na_rep="", index=False))
df2tex

Unnamed: 0,ix,datatype,count,percent,seriousbreach,ix.1,datatype.1,count.1,percent.1,seriousbreach.1,ix.2,datatype.2,count.2,percent.2,seriousbreach.2
0,1,Email addresses,3749,100.0\%,,21,Religions,98,2.6\%,,41,Bank account numbers,29,0.8\%,\checkmark
1,2,Names,3154,84.1\%,,22,Email messages,95,2.5\%,\checkmark,42,Nationalities,29,0.8\%,\checkmark
2,3,Phone numbers,2897,77.3\%,,23,Password hints,82,2.2\%,\checkmark,43,Payment histories,28,0.7\%,
3,4,Job titles,2613,69.7\%,,24,Auth tokens,78,2.1\%,\checkmark,44,Telecommunications carrier,25,0.7\%,
4,5,Passwords,2475,66.0\%,\checkmark,25,Ethnicities,75,2.0\%,,45,Company names,22,0.6\%,
5,6,Physical addresses,2440,65.1\%,,26,Home ownership statuses,74,2.0\%,,46,Relationship statuses,20,0.5\%,
6,7,Social media profiles,2430,64.8\%,,27,Occupations,74,2.0\%,,47,Deceased statuses,18,0.5\%,
7,8,Geographic locations,1891,50.4\%,,28,PINs,71,1.9\%,\checkmark,48,Private messages,18,0.5\%,\checkmark
8,9,Employers,1690,45.1\%,,29,Credit status information,63,1.7\%,\checkmark,49,Website activity,18,0.5\%,
9,10,Genders,1597,42.6\%,,30,Family structure,59,1.6\%,,50,Credit cards,14,0.4\%,\checkmark


In [85]:
pandas_to_tex(
    df2tex, "../tables/hipb_pwnpols_datatypes.tex", na_rep="", index=False, escape=True
)

!cat "../tables/hipb_pwnpols_datatypes.tex"

\midrule
1 & Email addresses & 3749 & 100.0\textbackslash \% &  & 21 & Religions & 98 & 2.6\textbackslash \% &  & 41 & Bank account numbers & 29 & 0.8\textbackslash \% & \textbackslash checkmark \\
2 & Names & 3154 & 84.1\textbackslash \% &  & 22 & Email messages & 95 & 2.5\textbackslash \% & \textbackslash checkmark & 42 & Nationalities & 29 & 0.8\textbackslash \% & \textbackslash checkmark \\
3 & Phone numbers & 2897 & 77.3\textbackslash \% &  & 23 & Password hints & 82 & 2.2\textbackslash \% & \textbackslash checkmark & 43 & Payment histories & 28 & 0.7\textbackslash \% &  \\
4 & Job titles & 2613 & 69.7\textbackslash \% &  & 24 & Auth tokens & 78 & 2.1\textbackslash \% & \textbackslash checkmark & 44 & Telecommunications carrier & 25 & 0.7\textbackslash \% &  \\
5 & Passwords & 2475 & 66.0\textbackslash \% & \textbackslash checkmark & 25 & Ethnicities & 75 & 2.0\textbackslash \% &  & 45 & Company names & 22 & 0.6\textbackslash \% &  \\
6 & Physical addresses & 2440 & 65.1\tex

### Tabulate: Breaches

In [86]:
df_pwnpol_breach_incident = (
    df_email_breach_expanded
    # ===================================================
    .groupby(["breach"])["email"]
    .size()
    .reset_index()
    .rename_column("email", "emails")
    .sort_values("emails", ascending=False, ignore_index=True)
    .head(50)
    .assign(percent=lambda df_: 100 * df_["emails"] / n_pwned_pols)
    .assign(percent=lambda df_: df_["percent"].apply(lambda x: f"{round(x, 1)}\%"))
    # ===================================================
    # Merge back to get breach characteristics
    .merge(
        (
            pd.read_parquet("../data/breaches_01_2025_expanded.parquet")
            .remove_columns(
                [
                    "description",
                    "title",
                    "modifieddate",
                    "logopath",
                    "timetopublic",
                    "dataclasses",
                ]
            )
            .rename_column("name", "breach")
            .assign(
                seriousbreach=lambda df_: df_[LIST_SERIOUS_DATACLASSES]
                .max(axis=1)
                .astype(int)
            )
            .assign(
                seriousbreach=lambda df_: np.where(
                    df_["seriousbreach"] == 1, r"\checkmark", ""
                )
            )
            #             .filter(["breach", "breachdate", "addeddate", "yearstopublic", "n_dataclasses", "seriousbreach", *LIST_ALL_DATACLASSES])
            .filter(
                [
                    "breach",
                    "domain",
                    "breachdate",
                    "addeddate",
                    "yearstopublic",
                    "pwncount",
                    "n_dataclasses",
                    "seriousbreach",
                ]
            )
        ),
        how="left",
        on="breach",
        validate="1:1",
    )
    # ===================================================
    # Remove timestamps
    .assign(
        breachdate=lambda df_: pd.to_datetime(df_["breachdate"]).dt.date,
        addeddate=lambda df_: pd.to_datetime(df_["addeddate"]).dt.date,
    )
    .assign(pwncount=lambda df_: df_["pwncount"] / 1_000_000)
    .round(1)
    .astype(str)
    .assign(pwncount=lambda df_: df_["pwncount"] + "M")
    .assign(yearstopublic=lambda df_: df_["yearstopublic"] + " years")
    # ===================================================
    .assign(ix=lambda df_: range(1, 1 + len(df_)))
    .reorder_columns(["ix"])
)
df_pwnpol_breach_incident

Unnamed: 0,ix,breach,emails,percent,domain,breachdate,addeddate,yearstopublic,pwncount,n_dataclasses,seriousbreach
0,1,db8151dd,12235,326.3\%,covve.com,2020-02-20,2020-05-15,0.2 years,22.8M,6,
1,2,TelegramCombolists,12138,323.7\%,,2024-05-28,2024-06-03,0.0 years,361.5M,3,\checkmark
2,3,Cit0day,12071,321.9\%,cit0day.in,2020-11-04,2020-11-19,0.0 years,226.9M,2,\checkmark
3,4,Twitter200M,12071,321.9\%,twitter.com,2021-01-01,2023-01-05,2.0 years,211.5M,4,
4,5,PDL,12071,321.9\%,,2019-10-16,2019-11-22,0.1 years,622.2M,7,
5,6,LinkedInScrape,12071,321.9\%,linkedin.com,2021-04-08,2021-10-02,0.5 years,125.7M,7,
6,7,LinkedIn,12071,321.9\%,linkedin.com,2012-05-05,2016-05-21,4.0 years,164.6M,2,\checkmark
7,8,YouveBeenScraped,12003,320.1\%,,2018-10-05,2018-12-06,0.2 years,66.1M,6,
8,9,OnlinerSpambot,11965,319.1\%,,2017-08-28,2017-08-29,0.0 years,711.5M,2,\checkmark
9,10,MyFitnessPal,11960,318.9\%,myfitnesspal.com,2018-02-01,2019-02-21,1.1 years,143.6M,4,\checkmark


In [87]:
pandas_to_tex(
    df_pwnpol_breach_incident.head(25),
    "../tables/hipb_pwnpols_breach_incidents.tex",
    escape=False,
)

In [88]:
!cat ../tables/hipb_pwnpols_breach_incidents.tex

\midrule
1 & db8151dd & 12235 & 326.3\% & covve.com & 2020-02-20 & 2020-05-15 & 0.2 years & 22.8M & 6 &  \\
2 & TelegramCombolists & 12138 & 323.7\% & None & 2024-05-28 & 2024-06-03 & 0.0 years & 361.5M & 3 & \checkmark \\
3 & Cit0day & 12071 & 321.9\% & cit0day.in & 2020-11-04 & 2020-11-19 & 0.0 years & 226.9M & 2 & \checkmark \\
4 & Twitter200M & 12071 & 321.9\% & twitter.com & 2021-01-01 & 2023-01-05 & 2.0 years & 211.5M & 4 &  \\
5 & PDL & 12071 & 321.9\% & None & 2019-10-16 & 2019-11-22 & 0.1 years & 622.2M & 7 &  \\
6 & LinkedInScrape & 12071 & 321.9\% & linkedin.com & 2021-04-08 & 2021-10-02 & 0.5 years & 125.7M & 7 &  \\
7 & LinkedIn & 12071 & 321.9\% & linkedin.com & 2012-05-05 & 2016-05-21 & 4.0 years & 164.6M & 2 & \checkmark \\
8 & YouveBeenScraped & 12003 & 320.1\% & None & 2018-10-05 & 2018-12-06 & 0.2 years & 66.1M & 6 &  \\
9 & OnlinerSpambot & 11965 & 319.1\% & None & 2017-08-28 & 2017-08-29 & 0.0 years & 711.5M & 2 & \checkmark \\
10 & MyFitnessPal & 11960 &