In [1]:
import pandas as pd
import janitor
import numpy as np
from IPython.display import display
from utilities import clean_email_column_no_dedupe, pandas_to_tex

pd.set_option("display.max_columns", 150)
pd.set_option("display.max_rows", 150)
pd.set_option("display.max_colwidth", None)

import sys

sys.path.append("/home/lsys/pwned_pols//venv/lib/python3.10/site-packages")
import warnings

warnings.filterwarnings("ignore")

## Prep EP data

In [2]:
df_ep_emails = (
    pd.read_csv(
        "../data/everypol/everypol_combined_legislature_data.csv", low_memory=False
    )
    .sort_values(["cc3", "leg_start_year", "email"])
    .pipe(lambda df_: clean_email_column_no_dedupe(df_))
    # ================================================================
    # Fix missing cc3 for Wales/Scotland
    # Wales, Scotland = GBR
    .assign(
        cc3=lambda df_: np.where(
            df_["cc"] == "GB-SCT",
            "GBR",
            np.where(df_["cc"] == "GB-WLS", "GBR", df_["cc3"]),
        )
    )
    # ================================================================
    # Fix missing pop for Wales/Scotland
    .assign(
        pop2024=lambda df_: np.where(
            df_["cc"] == "GB-SCT",
            "GBR",
            np.where(df_["cc"] == "GB-WLS", 68556800, df_["pop2024"]),
        )
    )
    # ================================================================
    # Fix ltype for India to bicameral
    .assign(ltype=lambda df_: np.where(df_["cc3"]=="IND", "bicameral legislature", df_["ltype"]))
    # ================================================================
    # Get #emails per country (cc3)
    .assign(nemail_cc3=lambda df_: df_.groupby("cc3")["email"].transform("nunique"))
    # ================================================================
    # Perc female
    .assign(
        female_count_cc3=lambda df_: df_.groupby("cc3")["gender"].transform(
            lambda g: (g == "female").sum()
        ),
        female_prop_cc3=lambda df_: df_["female_count_cc3"] / df_["nemail_cc3"],
    )
    # ================================================================
    .reorder_columns(
        [
            "email",
            "cc3",
            "country",
            "ltype",
            "legislature",
            "chamber",
            "leg_start_year",
            "nemail_cc3",
            "gender",
        ]
    )
)

assert (df_ep_emails["female_count_cc3"] <= df_ep_emails["nemail_cc3"]).all()
assert (df_ep_emails["female_prop_cc3"] <= 1).all()
assert (df_ep_emails["nemail_cc3"] >= 30).all()
display(df_ep_emails.head(3))
df_ep_emails.info()

Unnamed: 0,email,cc3,country,ltype,legislature,chamber,leg_start_year,nemail_cc3,gender,id,name,sort_name,twitter,facebook,group,group_id,area_id,area,term_x,start_date,end_date,image,wikidata,wikidata_group,wikidata_area,row_id,term_y,row_count,n_unique_emails,person_count_legistype,url,cc,leg_start_date,lastmod,pop2024,lastmod_year,female_count_cc3,female_prop_cc3
0,albana.vokshi@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,female,c8bd71fc-4815-40b5-a5c4-bd359e3b0cef,Albana Vokshi,VOKSHI ALBANA,,,PD,c26946f7-8c5e-4474-a85e-ee5342cf4006,durrës_county,Durrës County,7,,,https://www.parlament.al/wp-content/uploads/2015/12/Albana-Vokshi-PD.jpg,Q4709025,Q845743,,490,VII Pluralist Legislature,140,31,292,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1557734928,2740502.0,2019,12,0.085714
1,aldo.bumci@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,male,589ba883-b893-40ed-95f8-69b10f0497d9,Aldo Bumçi,BUMÇI ALDO,,,PD,c26946f7-8c5e-4474-a85e-ee5342cf4006,tirana_county,Tirana County,7,,,https://www.parlament.al/wp-content/uploads/2015/12/Aldo-Bumci-PD.jpg,Q2832310,Q845743,,490,VII Pluralist Legislature,140,31,292,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1557734928,2740502.0,2019,12,0.085714
2,bashkim.fino@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,male,9d0181c7-7ebe-4b16-800a-6ea28baf22f5,Bashkim Fino,FINO BASHKIM,,,PS,0c83d4f1-9e86-48f2-95ee-9c3d158d1141,fier_county,Fier County,7,,,https://www.parlament.al/wp-content/uploads/2015/12/Bashkim-Fino-PS.jpg,Q809978,Q642882,,490,VII Pluralist Legislature,140,31,292,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1557734928,2740502.0,2019,12,0.085714


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8512 entries, 0 to 8511
Data columns (total 38 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   email                   8512 non-null   object 
 1   cc3                     8512 non-null   object 
 2   country                 8512 non-null   object 
 3   ltype                   8512 non-null   object 
 4   legislature             8512 non-null   object 
 5   chamber                 8512 non-null   object 
 6   leg_start_year          8512 non-null   int64  
 7   nemail_cc3              8512 non-null   int64  
 8   gender                  7300 non-null   object 
 9   id                      8512 non-null   object 
 10  name                    8512 non-null   object 
 11  sort_name               8512 non-null   object 
 12  twitter                 2393 non-null   object 
 13  facebook                1597 non-null   object 
 14  group                   8510 non-null   

In [3]:
# countries with incomplete gender coverage
for c in df_ep_emails["cc3"].unique().tolist():
    _df = df_ep_emails.query(f"cc3=='{c}'")
    n_gender = len(_df.dropna(subset=["gender"]))
    n_country = len(_df)
    if n_gender < n_country:
        name = _df.reset_index().loc[0, "country"]
        print(f"{name} ({c}):\n {n_gender}, {n_country}")

Albania (ALB):
 61, 140
Armenia (ARM):
 32, 119
Bulgaria (BGR):
 186, 206
Cameroon (CMR):
 81, 109
Colombia (COL):
 55, 169
Estonia (EST):
 95, 101
Georgia (GEO):
 49, 145
Guernsey (GGY):
 17, 39
Guatemala (GTM):
 143, 152
Hungary (HUN):
 171, 184
Iran (IRN):
 51, 138
Italy (ITA):
 284, 314
South-Korea (KOR):
 238, 253
Moldova (MDA):
 43, 44
Macedonia (MKD):
 95, 101
Namibia (NAM):
 12, 72
Nigeria (NGA):
 107, 140
Nicaragua (NIC):
 2, 85
Nepal (NPL):
 47, 268
Papua-New-Guinea (PNG):
 55, 56
Rwanda (RWA):
 1, 76
Slovakia (SVK):
 116, 164
Seychelles (SYC):
 0, 32
Tanzania (TZA):
 384, 405
Uruguay (URY):
 108, 119
South-Africa (ZAF):
 371, 385


In [4]:
set_edomain = set()
for _, row in df_ep_emails.iterrows():
    try:
        edomain = row["email"].split("@")[1]
        set_edomain.add(edomain)
    except IndexError:
        print(_)
#         print(row["email"])

len(set_edomain)

481

In [5]:
for cc in df_ep_emails["cc3"].unique().tolist():
    _df = df_ep_emails.query(f"cc3=='{cc}'")
    _set_edomain = set()
    for _, row in _df.iterrows():
        edomain = row["email"].split("@")[1]
        _set_edomain.add(edomain)
        
    print(cc)
    print(len(_set_edomain))
    print(_set_edomain)

ALB
1
{'parlament.al'}
AND
3
{'parlament.ad', 'gmail.com', 'andorra.ad'}
ARM
1
{'parliament.am'}
AUS
5
{'ia.pm.gov.au', 'dfat.gov.au', 'defence.gov.au', 'jobs.gov.au', 'aph.gov.au'}
BEL
5
{'mac.com', 'gmail.com', 'dekamer.be', 'lachambre.be', 'ecolo.be'}
BGR
3
{'abv.bg', 'vmro-bg.org', 'parliament.bg'}
BIH
1
{'parlament.ba'}
BLR
3
{'mail.ru', 'house.gov.by.', 'house.gov.by'}
BMU
2
{'gov.bm', 'parliament.bm'}
BTN
12
{'nab.gov.bt', 'mowhs.gov.bt', 'health.gov.bt', 'mof.gov.bt', 'education.gov.bt', 'moea.gov.bt', 'moaf.gov.bt', 'mohca.gov.bt', 'moic.gov.bt', 'cabinet.gov.bt', 'molhr.gov.bt', 'mfa.gov.bt'}
CAN
2
{'sen.parl.gc.ca', 'parl.gc.ca'}
CMR
5
{'asnnat.cm', 'assnatt.cm', 'asstnat.cm', 'assnat.cm', 'assnat.com'}
COL
1
{'camara.gov.co'}
CYP
1
{'parliament.cy'}
DNK
19
{'ufm.dk', 'stm.dk', 'fm.dk', 'fmn.dk', 'evm.dk', 'uvm.dk', 'kum.dk', 'bm.dk', 'sm.dk', 'mfvm.dk', 'efkm.dk', 'uim.dk', 'trm.dk', 'oim.dk', 'sum.dk', 'um.dk', 'jm.dk', 'ft.dk', 'skm.dk'}
EST
1
{'riigikogu.ee'}
FIN
1
{'edu

### Basic numbers

In [6]:
df_ep_emails["nemail_cc3"].describe()

count    8512.000000
mean      284.731203
std       222.100304
min        31.000000
25%       140.000000
50%       208.000000
75%       405.000000
max       835.000000
Name: nemail_cc3, dtype: float64

In [7]:
df_ep_emails["ltype"].unique()

array(['unicameral legislature', 'lower house', 'upper house',
       'bicameral legislature'], dtype=object)

In [8]:
df_ep_emails["legislature"].unique()

array(['Kuvendi', 'Consell General', 'National Assembly',
       'House of Representatives', 'Senate', 'Chamber of Representatives',
       'Parliament', 'House of Commons', 'Assemblée Nationale',
       'Cámara de Representantes', 'Folketing', 'Riigikogu', 'Eduskunta',
       'Parliament of Georgia', 'States', 'Hellenic Parliament',
       'Inatsisartut', 'Congress', 'Legislative Council', 'Országgyűlés',
       'Lok Sabha', 'Majles', 'Chamber of Deputies', 'Parlament',
       'Sobranie', 'National Council', 'Tweede Kamer',
       'Constituent Assembly', 'New Zealand Parliament',
       'National Parliament', 'Assembly', 'House of Assembly',
       'Scottish Parliament', 'National Assembly for Wales'], dtype=object)

In [9]:
df_ep_emails["legislature"].nunique()

34

In [10]:
df_ep_emails["email"].nunique()

8512

In [11]:
df_ep_emails["cc3"].nunique()

55

In [12]:
df_ep_emails.groupby(["cc3", "legislature"]).ngroups

61

In [13]:
df_ep_emails["female_prop_cc3"].describe()

count    8512.000000
mean        0.228148
std         0.127742
min         0.000000
25%         0.117647
50%         0.250000
75%         0.337725
max         0.543860
Name: female_prop_cc3, dtype: float64

## Prep scraped data

In [14]:
df_scraped_emails = (
    pd.read_csv("../data/scraped_pol_combined_legislature_data.csv")
    .sort_values(["cc3", "leg_start_year", "email"])
    .pipe(lambda df_: clean_email_column_no_dedupe(df_))
    # ================================================================
    # Get popsize
    .merge(
        (
            pd.read_csv("../data/popsize.csv")
            .dropna(subset=["cc3"])
            .rename_column("2024 [YR2024]", "pop2024")
        ), how="left", on="cc3", validate="m:1"
    )
    # ================================================================
    # Get #emails per country (cc3)
    .assign(nemail_cc3=lambda df_: df_.groupby("cc3")["email"].transform("nunique"))
    # ================================================================
    .reorder_columns(
        [
            "email",
            "cc3",
            "country",
            "ltype",
            "legislature",
            "chamber",
            "year",
            "nemail_cc3",
        ]
    )
#     # ================================================================
#     # Renaming fields to harmonise with EP
    .rename_column("leg_start_year", "leg_start_year_sg")
    .assign(
        leg_start_year=lambda df_: np.where(df_["cc3"] == "SGP", df_["leg_start_year_sg"], 2025).astype(int)
    )
)
assert (df_scraped_emails["nemail_cc3"] >= 30).all()
df_scraped_emails.head()

Unnamed: 0,email,cc3,country,ltype,legislature,chamber,year,nemail_cc3,name,party,title,address,private_phone,mobile_phone,work_phone,minister_phone,fax,mpsno,initial,firstname,lastname,gender,partyfname,partysname,statename,constname,profession,presentfaddr,presentladdr,delhiphone,permanentfaddr,permanentladdr,personalphone,lastloksabha,lsexpr,age,phone,noofterms,status,imageurl,profileurl,dob,numberofsons,numberofdaughters,qualification,freedom,profession2,categorycode,currentpagenumber,perpagesize,totalelements,totalpages,source_file,mplastfirstname,mpfirstlastname,maritalstatus,createdat,updatedat,email_fix,rank,constituency,leg_start_year_sg,sr_no,photo,contact,serial_no_,tel_no_r_,constituency_name,permanent_address,mobile,tele_no_res_no_,ac_no,candidate,Country Name,Series Name,Series Code,pop2024,region,EU,politician,civil_servants,leg_start_year
0,anna.falkenberg@ft.dk,DNK,Denmark,unicameral legislature,Folketing,,2025,186,Anna Falkenberg,Sambandsflokkurin,Mf,"Folketinget, Christiansborg 1218, København K",,+45 6162 4253,+45 3337 4532,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Denmark,"Population, total",SP.POP.TOTL,5972149,,1.0,https://github.com/everypolitician/everypolitician-data/tree/master/data/Denmark/Folketing,,2025
1,henrik.rejnholt.andersen@ft.dk,DNK,Denmark,unicameral legislature,Folketing,,2025,186,Henrik Rejnholt Andersen,Moderaterne,MF,"Folketinget, Christiansborg 1218, København K",,,+ 45 3337 5846,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Denmark,"Population, total",SP.POP.TOTL,5972149,,1.0,https://github.com/everypolitician/everypolitician-data/tree/master/data/Denmark/Folketing,,2025
2,mohammad.rona@ft.dk,DNK,Denmark,unicameral legislature,Folketing,,2025,186,Mohammad Rona,Moderaterne,MF,"Folketinget, Christiansborg 1218, København K",,,+45 3337 5866,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Denmark,"Population, total",SP.POP.TOTL,5972149,,1.0,https://github.com/everypolitician/everypolitician-data/tree/master/data/Denmark/Folketing,,2025
3,kim.valentin@ft.dk,DNK,Denmark,unicameral legislature,Folketing,,2025,186,Kim Valentin,Venstre,Cand.polit.,"Folketinget, Christiansborg 1218, København K",,,+45 3337 4527,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Denmark,"Population, total",SP.POP.TOTL,5972149,,1.0,https://github.com/everypolitician/everypolitician-data/tree/master/data/Denmark/Folketing,,2025
4,pia.kjaersgaard.sekretaer@ft.dk,DNK,Denmark,unicameral legislature,Folketing,,2025,186,Pia Kjærsgaard,Dansk Folkeparti,Fhv. formand for Folketinget,"Folketinget, Christiansborg 1218, København K",,,+45 3337 5500,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Denmark,"Population, total",SP.POP.TOTL,5972149,,1.0,https://github.com/everypolitician/everypolitician-data/tree/master/data/Denmark/Folketing,,2025


### Basic numbers

In [15]:
df_scraped_emails["nemail_cc3"].describe()

count    4011.000000
mean     2691.067814
std      1159.206982
min       174.000000
25%      3251.000000
50%      3251.000000
75%      3251.000000
max      3251.000000
Name: nemail_cc3, dtype: float64

In [16]:
df_scraped_emails["legislature"].unique()

array(['Folketing', 'Lok Sabha', 'State Legislature', 'Storting',
       'Parliament'], dtype=object)

In [17]:
df_scraped_emails["email"].nunique()

4011

In [18]:
df_scraped_emails["name"].nunique()

1183

In [19]:
df_scraped_emails["cc3"].nunique()

4

In [20]:
df_scraped_emails.groupby(["cc3", "legislature"]).ngroups

5

In [21]:
set_edomain = set()
for _, row in df_scraped_emails.iterrows():
    try:
        edomain = row["email"].split("@")[1]
        set_edomain.add(edomain)
    except IndexError:
        print(_)
#         print(row["email"])

len(set_edomain)

203

In [22]:
for cc in df_scraped_emails["cc3"].unique().tolist():
    _df = df_scraped_emails.query(f"cc3=='{cc}'")
    _set_edomain = set()
    for _, row in _df.iterrows():
        edomain = row["email"].split("@")[1]
        _set_edomain.add(edomain)
        
    print(cc)
    print(len(_set_edomain))
    print(_set_edomain)

DNK
20
{'stm.dk', 'fm.dk', 'fmn.dk', 'uvm.dk', 'kum.dk', 'bm.dk', 'mgtp.dk', 'em.dk', 'mim.dk', 'mssb.dk', 'fvm.dk', 'km.dk', 'uim.dk', 'trm.dk', 'sum.dk', 'aeldremin.dk', 'um.dk', 'jm.dk', 'ft.dk', 'skm.dk'}
IND
76
{'ymail.com', 'kalrajmishra.com', 'jayaprakashhegde.com', 'sansad.nic.', 'gmai.com', 'tejasvisurya.in', 'sansad.nic', 'gmail.comshasankshekharverma', 'sify.com', 'sansadnic.in', 'yahoo.in', '67gmail.com', 'chaitanyasharma.co.in', 'riteshpandey.in', 'plrprojects.com', 'icloud.com', 'gajendrapatel.com', 'sameerbhujbal.com', 'karti.com', 'visaka.in', 'somnathbharti.com', 'tharoor.in', 'ithamizhachi.com', 'jhr.nic.in', 'deepender.in', 'abc.com', 'gmail.comn', 'gmail.com', 'rb.railnet.gov.in', 'ramniwasgoel.com', 'gmil.com', 'inc.in', 'jswamy.com', 'gov.in', 'tn.gov.in', 'pcmohan.com', 'naveenjindal.com', 'yahoo.com', 'vsnl.com', 'poonammahajan.in', 'rahulgandhi.in', 'gmaail.com', 'prabhatsinh.com', 'citadelh.com', 'darshanajardosh.in', 'rediffmail.com', 'yahoo.co.in', 'bjpanda.

## Combine EP + Scraped

In [23]:
df_pol_emails = (
    pd.concat([df_ep_emails.assign(source="ep"), df_scraped_emails.assign(source="scraped")])
    .pipe(lambda df_: clean_email_column_no_dedupe(df_))
    # ================================================================
    # Get indicator for years and chambers for table1
    .assign(
        years=lambda df_: df_.groupby("cc3")["leg_start_year"].transform(
            lambda x: ", ".join(map(str, sorted(x.unique())))
        ),
        chambers=lambda df_: df_.groupby("cc3")["ltype"]
        .transform(lambda x: ", ".join(map(str, sorted(x.unique()))))
        .replace("legislature", "", regex=True)
        .replace("house", "", regex=True)
        .str.strip()
        .str.title(),
        legislatures=lambda df_: df_.groupby("cc3")["legislature"].transform(
            lambda x: ", ".join(map(str, sorted(x.unique())))
        ),
    )
    .sort_values(["cc3", "leg_start_year", "email"])
)
# del df_ep_emails, df_scraped_emails
df_pol_emails

Unnamed: 0,email,cc3,country,ltype,legislature,chamber,leg_start_year,nemail_cc3,gender,id,name,sort_name,twitter,facebook,group,group_id,area_id,area,term_x,start_date,end_date,image,wikidata,wikidata_group,wikidata_area,row_id,term_y,row_count,n_unique_emails,person_count_legistype,url,cc,leg_start_date,lastmod,pop2024,lastmod_year,female_count_cc3,female_prop_cc3,source,year,party,title,address,private_phone,mobile_phone,work_phone,minister_phone,fax,mpsno,initial,firstname,lastname,partyfname,partysname,statename,constname,profession,presentfaddr,presentladdr,delhiphone,permanentfaddr,permanentladdr,personalphone,lastloksabha,lsexpr,age,phone,noofterms,status,imageurl,profileurl,dob,numberofsons,numberofdaughters,qualification,freedom,profession2,categorycode,currentpagenumber,perpagesize,totalelements,totalpages,source_file,mplastfirstname,mpfirstlastname,maritalstatus,createdat,updatedat,email_fix,rank,constituency,leg_start_year_sg,sr_no,photo,contact,serial_no_,tel_no_r_,constituency_name,permanent_address,mobile,tele_no_res_no_,ac_no,candidate,Country Name,Series Name,Series Code,region,EU,politician,civil_servants,years,chambers,legislatures
0,albana.vokshi@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,female,c8bd71fc-4815-40b5-a5c4-bd359e3b0cef,Albana Vokshi,VOKSHI ALBANA,,,PD,c26946f7-8c5e-4474-a85e-ee5342cf4006,durrës_county,Durrës County,7,,,https://www.parlament.al/wp-content/uploads/2015/12/Albana-Vokshi-PD.jpg,Q4709025,Q845743,,490.0,VII Pluralist Legislature,140.0,31.0,292.0,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1.557735e+09,2740502.0,2019.0,12.0,0.085714,ep,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"2009, 2013, 2017",Unicameral,Kuvendi
1,aldo.bumci@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,male,589ba883-b893-40ed-95f8-69b10f0497d9,Aldo Bumçi,BUMÇI ALDO,,,PD,c26946f7-8c5e-4474-a85e-ee5342cf4006,tirana_county,Tirana County,7,,,https://www.parlament.al/wp-content/uploads/2015/12/Aldo-Bumci-PD.jpg,Q2832310,Q845743,,490.0,VII Pluralist Legislature,140.0,31.0,292.0,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1.557735e+09,2740502.0,2019.0,12.0,0.085714,ep,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"2009, 2013, 2017",Unicameral,Kuvendi
2,bashkim.fino@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,male,9d0181c7-7ebe-4b16-800a-6ea28baf22f5,Bashkim Fino,FINO BASHKIM,,,PS,0c83d4f1-9e86-48f2-95ee-9c3d158d1141,fier_county,Fier County,7,,,https://www.parlament.al/wp-content/uploads/2015/12/Bashkim-Fino-PS.jpg,Q809978,Q642882,,490.0,VII Pluralist Legislature,140.0,31.0,292.0,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1.557735e+09,2740502.0,2019.0,12.0,0.085714,ep,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"2009, 2013, 2017",Unicameral,Kuvendi
3,besnik.baraj@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,male,d37e9f6d-9308-4017-bfed-70d081dcab83,Besnik Baraj,BARAJ BESNIK,,,PS,0c83d4f1-9e86-48f2-95ee-9c3d158d1141,tirana_county,Tirana County,7,,,https://www.parlament.al/wp-content/uploads/2015/12/Besnik-Baraj-PS.jpg,,Q642882,,490.0,VII Pluralist Legislature,140.0,31.0,292.0,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1.557735e+09,2740502.0,2019.0,12.0,0.085714,ep,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"2009, 2013, 2017",Unicameral,Kuvendi
4,blendi.klosi@parlament.al,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,male,e2516f6c-a1a2-45d6-a23a-b73dd7742829,Blendi Klosi,KLOSI BLENDI,,,PS,0c83d4f1-9e86-48f2-95ee-9c3d158d1141,durrës_county,Durrës County,7,,,https://www.parlament.al/wp-content/uploads/2016/01/Blendi-Klosi-PS.jpg,Q13037656,Q642882,,490.0,VII Pluralist Legislature,140.0,31.0,292.0,https://cdn.rawgit.com/everypolitician/everypolitician-data/4ace3b1548b95eb669deab4d99bbc894639269f2/data/Albania/Assembly/term-7.csv,AL,2009-09-08,1.557735e+09,2740502.0,2019.0,12.0,0.085714,ep,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"2009, 2013, 2017",Unicameral,Kuvendi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8318,udtarusenga@gmail.com,ZWE,Zimbabwe,lower house,House of Assembly,House of Assembly,2013,37,male,7bd3e99f-5023-4191-8eee-fecb519650a6,Unganai Tarusenga,"Tarusenga, Unganai",,,MDC-T,mdc-t,st_mary's,St Mary's,8,,,http://www.parlzim.gov.zw/media/k2/items/cache/f4db67ec2dbaadd122c4c0b528856935_XL.jpg,,Q1146616,,253.0,8th Parliament,229.0,37.0,229.0,https://cdn.rawgit.com/everypolitician/everypolitician-data/f74be3db4c76d1b42155ad56fa63cad6b3ed2bdd/data/Zimbabwe/Assembly/term-8.csv,ZW,2013-09-17,1.556117e+09,17020321.0,2019.0,9.0,0.243243,ep,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2013,Lower,House of Assembly
8319,user@parlzim.gov,ZWE,Zimbabwe,lower house,House of Assembly,House of Assembly,2013,37,female,33a8a270-99c9-435a-b3d8-66486e865558,Mable M. Chinomona,"Chinomona, Mable M.",,,ZANU PF,zanu_pf,mutoko_north,Mutoko North,8,,,http://www.parlzim.gov.zw/media/k2/items/cache/9267284e7733f4bec00d2e114d3f3ba1_XL.jpg,,Q1910161,,253.0,8th Parliament,229.0,37.0,229.0,https://cdn.rawgit.com/everypolitician/everypolitician-data/f74be3db4c76d1b42155ad56fa63cad6b3ed2bdd/data/Zimbabwe/Assembly/term-8.csv,ZW,2013-09-17,1.556117e+09,17020321.0,2019.0,9.0,0.243243,ep,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2013,Lower,House of Assembly
8320,wmaondera@gmail.com,ZWE,Zimbabwe,lower house,House of Assembly,House of Assembly,2013,37,male,970274bf-f4a8-4154-aa1d-bbee67b5960a,Webster Maondera,"Maondera, Webster",,,MDC-T,mdc-t,glen_norah,Glen Norah,8,,,http://www.kuvakazim.com/media_root/images/Webster_Maondera.jpg,,Q1146616,,253.0,8th Parliament,229.0,37.0,229.0,https://cdn.rawgit.com/everypolitician/everypolitician-data/f74be3db4c76d1b42155ad56fa63cad6b3ed2bdd/data/Zimbabwe/Assembly/term-8.csv,ZW,2013-09-17,1.556117e+09,17020321.0,2019.0,9.0,0.243243,ep,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2013,Lower,House of Assembly
8321,wmutomba@gmail.com,ZWE,Zimbabwe,lower house,House of Assembly,House of Assembly,2013,37,male,44bfd6ef-ad19-41fc-9383-59f254bee19a,William Mutomba,"Mutomba, William",,,ZANU PF,zanu_pf,buhera_north,Buhera North,8,,,http://www.kuvakazim.com/media_root/images/mutombawilliam.jpg,,Q1910161,,253.0,8th Parliament,229.0,37.0,229.0,https://cdn.rawgit.com/everypolitician/everypolitician-data/f74be3db4c76d1b42155ad56fa63cad6b3ed2bdd/data/Zimbabwe/Assembly/term-8.csv,ZW,2013-09-17,1.556117e+09,17020321.0,2019.0,9.0,0.243243,ep,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2013,Lower,House of Assembly


In [24]:
df_pol_emails["nemail_cc3"].describe()

count    12055.000000
mean       985.986479
std       1272.965029
min         31.000000
25%        152.000000
50%        314.000000
75%        835.000000
max       3251.000000
Name: nemail_cc3, dtype: float64

In [25]:
df_pol_emails["email"].nunique()

12055

In [26]:
df_pol_emails["cc3"].nunique()

57

In [27]:
df_pol_emails.groupby(["cc3", "legislature"]).ngroups

64

## Table1

In [28]:
df_tab1 = (
    df_pol_emails.drop_duplicates(["cc3"], ignore_index=True)
    #     .filter(["cc3", "country", "nemail_cc3", "female_prop_cc3", "years", "chambers", "legislatures", "pop2024"])
    #     .assign(female_prop_cc3=lambda df_: (100 * df_["female_prop_cc3"]).round(1).astype(str) + "\\%")
    .filter(
        ["cc3", "country", "nemail_cc3", "years", "chambers", "legislatures", "pop2024"]
    )
    .assign(
        pop2024=lambda df_: (df_["pop2024"].astype(float) / 1_000_000)
        .round(1)
        .apply(str)
    )
    .replace("Bosnia-and-Herzegovina", "Bosnia")
    .replace(
        "House of Commons, National Assembly for Wales, Scottish Parliament",
        "Commons, Senedd, Scottish Parliament",
    )
    # Tidy year strings
    .replace("1997, 2001, 2005, 2007, 2010, 2011, 2015, 2016, 2017", "1997--2017")
    .replace("2004, 2007, 2010, 2013, 2016", "2004--2016")
    .replace("2001, 2005, 2007, 2011, 2015", "2001--2015")
    .replace("2004, 2007, 2009, 2012, 2015", "2004--2015")
    .replace("2001, 2005, 2007, 2011, 2015, 2025", "2001--2025")
    .replace("2008, 2011, 2014, 2017", "2008--2017")
    .replace("2006, 2010, 2012, 2016", "2006--2016")
    .replace("2001, 2006, 2011, 2015, 2021", "2001--2025")
    .replace("Lower ", "Lower", regex=True)
    .assign(ix=lambda df_: range(1, 1 + len(df_)))
    .reorder_columns(["ix"])
)
df_tab1

Unnamed: 0,ix,cc3,country,nemail_cc3,years,chambers,legislatures,pop2024
0,1,ALB,Albania,140,"2009, 2013, 2017",Unicameral,Kuvendi,2.7
1,2,AND,Andorra,31,2015,Unicameral,Consell General,0.1
2,3,ARM,Armenia,119,2019,Unicameral,National Assembly,2.8
3,4,AUS,Australia,177,2004--2016,"Lower, Upper","House of Representatives, Senate",26.9
4,5,BEL,Belgium,149,2014,Lower,Chamber of Representatives,11.9
5,6,BGR,Bulgaria,206,"2013, 2014, 2017",Unicameral,National Assembly,6.4
6,7,BIH,Bosnia,42,2014,Lower,House of Representatives,3.2
7,8,BLR,Belarus,59,2016,Unicameral,House of Representatives,9.1
8,9,BMU,Bermuda,33,2017,Lower,Parliament,
9,10,BTN,Bhutan,48,2013,Lower,National Assembly,0.8


In [29]:
df_tab1["pop2024"].astype(float).sum()

np.float64(2620.0)

In [30]:
# % coverage of global pop
100 * (df_tab1["pop2024"].astype(float).sum()) / 8000

np.float64(32.75)

In [31]:
pandas_to_tex(
    df_tab1.replace("nan", "---"), "../tables/hipb_pooled_emailcoverage_summary.tex", index=False,
)

In [32]:
!cat ../tables/hipb_pooled_emailcoverage_summary.tex

\midrule
1 & ALB & Albania & 140 & 2009, 2013, 2017 & Unicameral & Kuvendi & 2.7 \\
2 & AND & Andorra & 31 & 2015 & Unicameral & Consell General & 0.1 \\
3 & ARM & Armenia & 119 & 2019 & Unicameral & National Assembly & 2.8 \\
4 & AUS & Australia & 177 & 2004--2016 & Lower, Upper & House of Representatives, Senate & 26.9 \\
5 & BEL & Belgium & 149 & 2014 & Lower & Chamber of Representatives & 11.9 \\
6 & BGR & Bulgaria & 206 & 2013, 2014, 2017 & Unicameral & National Assembly & 6.4 \\
7 & BIH & Bosnia & 42 & 2014 & Lower & House of Representatives & 3.2 \\
8 & BLR & Belarus & 59 & 2016 & Unicameral & House of Representatives & 9.1 \\
9 & BMU & Bermuda & 33 & 2017 & Lower & Parliament & --- \\
10 & BTN & Bhutan & 48 & 2013 & Lower & National Assembly & 0.8 \\
11 & CAN & Canada & 432 & 2011, 2015 & Lower, Upper & House of Commons, Senate & 40.4 \\
12 & CMR & Cameroon & 109 & 2013 & Lower & Assemblée Nationale & 29.4 \\
13 & COL & Colombia & 169 & 2014, 2018 & Lower & Cámara 

## Merge to HIBP

### Scraped data

In [33]:
df_scraped_email_breach_expanded = (
    df_scraped_emails.filter(["email", "cc3", "country", "year", "nemail_cc3"])
    .pipe(lambda df_: clean_email_column_no_dedupe(df_))
    # ============================================================================
    # Merge to breached status
    .merge(
        (
            pd.read_csv("../data/scraped_pol_hibp.csv")
            .clean_names()
            .rename_column("filename", "email")
        ),
        how="left",
        on="email",
        validate="1:m",
    )
    .dropna(subset=["breach"])
    # ============================================================================
    # Merge to expanded breach info
    .merge(
        (
            pd.read_parquet("../data/breaches_01_2025_expanded.parquet")
            .remove_columns(
                [
                    "description",
                    "title",
                    "domain",
                    "breachdate",
                    "addeddate",
                    "modifieddate",
                    "logopath",
                    "timetopublic",
                    "dataclasses",
                ]
            )
            .rename_column("name", "breach")
        ),
        how="left",
        on="breach",
        validate="m:1",
    )
    # ============================================================================
    .query("present==True")
    .reset_index(drop=True)
)
df_scraped_email_breach_expanded.head()

Unnamed: 0,email,cc3,country,year,nemail_cc3,breach,present,pwncount,isverified,isfabricated,issensitive,isretired,isspamlist,ismalware,issubscriptionfree,yearstopublic,n_dataclasses,Education levels,Private messages,Personal health data,Licence plates,Apps installed on devices,Financial transactions,Family structure,Support tickets,Eating habits,Religions,Vehicle identification numbers (VINs),Sexual orientations,Instant messenger identities,Deceased date,Website activity,Security questions and answers,Passwords,Account balances,Customer feedback,Home ownership statuses,Net worths,Tattoo status,Loan information,User statuses,Health insurance information,Car ownership statuses,Telecommunications carrier,Income levels,Career levels,Mothers maiden names,Bank account numbers,Password strengths,Work habits,Historical passwords,Physical attributes,Parenting plans,Time zones,Email messages,Living costs,Sexual fetishes,Family members' names,Geographic locations,Political views,Government issued IDs,Personal interests,MAC addresses,Browsing histories,Device information,Places of birth,Partial credit card data,Years of professional experience,Dates of birth,Utility bills,Deceased statuses,Job applications,Drinking habits,Nicknames,Passport numbers,...,HIV statuses,Company names,Beauty ratings,Political donations,Races,Survey results,Flights taken,Credit card CVV,Drug habits,Usernames,Spoken languages,Social security numbers,Ages,Chat logs,Comments,Cellular network names,Social media profiles,Browser user agent details,Citizenship statuses,Travel plans,Purchasing habits,Device usage tracking data,Credit status information,Spouses names,Relationship statuses,Auth tokens,Charitable donations,Nationalities,Biometric data,Recovery email addresses,Taxation records,Encrypted keys,Delivery instructions,Payment methods,Salutations,Fitness levels,Email addresses,Audio recordings,Job titles,User website URLs,Reward program balances,Age groups,Physical addresses,Employment statuses,Login histories,Driver's licenses,IMSI numbers,Names,Avatars,Employers,Appointments,Travel habits,Purchases,Customer interactions,Buying preferences,Warranty claims,Partial dates of birth,Ethnicities,Mnemonic phrases,Marital statuses,Payment histories,Social connections,IMEI numbers,Password hints,Vehicle details,IP addresses,Financial investments,PINs,Occupations,Cryptocurrency wallet addresses,SMS messages,Astrological signs,Genders,Credit cards,Personal descriptions
0,kim.valentin@ft.dk,DNK,Denmark,2025,186,LinkedInScrape2023,True,19788753,True,False,False,False,True,False,False,0.008219,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,alex.ahrendtsen@ft.dk,DNK,Denmark,2025,186,DemandScience,True,121796165,True,False,False,False,False,False,False,0.709589,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,anne.paulin@ft.dk,DNK,Denmark,2025,186,DemandScience,True,121796165,True,False,False,False,False,False,False,0.709589,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,anni.matthiesen@ft.dk,DNK,Denmark,2025,186,Cit0day,True,226883414,False,False,False,False,False,False,False,0.041096,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,anni.matthiesen@ft.dk,DNK,Denmark,2025,186,MyHeritage,True,91991358,True,False,False,False,False,False,False,1.320548,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### EP data

In [34]:
df_ep_email_breach_expanded = (
    # All EP emails
    df_pol_emails.filter(
        [
            "email",
            "gender",
            "cc3",
            "country",
            "ltype",
            "legislature",
            "chamber",
            "leg_start_year",
            "nemail_cc3",
        ]
    )
    # ============================================================================
    # Merge to breached status
    .merge(
        (
            pd.read_csv("../data/everypol_hibp.csv")
            .clean_names()
            .rename_column("filename", "email")
        ),
        how="left",
        on="email",
        validate="1:m",
    )
    .dropna(subset=["breach"])
    # ============================================================================
    # Merge to expanded breach info
    .merge(
        (
            pd.read_parquet("../data/breaches_01_2025_expanded.parquet")
            .remove_columns(
                [
                    "description",
                    "title",
                    "domain",
                    "breachdate",
                    "addeddate",
                    "modifieddate",
                    "logopath",
                    "timetopublic",
                    "dataclasses",
                ]
            )
            .rename_column("name", "breach")
        ),
        how="left",
        on="breach",
        validate="m:1",
    )
    # ============================================================================
    .query("present==True")
    .reset_index(drop=True)
)
df_ep_email_breach_expanded.head()

Unnamed: 0,email,gender,cc3,country,ltype,legislature,chamber,leg_start_year,nemail_cc3,breach,present,pwncount,isverified,isfabricated,issensitive,isretired,isspamlist,ismalware,issubscriptionfree,yearstopublic,n_dataclasses,Education levels,Private messages,Personal health data,Licence plates,Apps installed on devices,Financial transactions,Family structure,Support tickets,Eating habits,Religions,Vehicle identification numbers (VINs),Sexual orientations,Instant messenger identities,Deceased date,Website activity,Security questions and answers,Passwords,Account balances,Customer feedback,Home ownership statuses,Net worths,Tattoo status,Loan information,User statuses,Health insurance information,Car ownership statuses,Telecommunications carrier,Income levels,Career levels,Mothers maiden names,Bank account numbers,Password strengths,Work habits,Historical passwords,Physical attributes,Parenting plans,Time zones,Email messages,Living costs,Sexual fetishes,Family members' names,Geographic locations,Political views,Government issued IDs,Personal interests,MAC addresses,Browsing histories,Device information,Places of birth,Partial credit card data,Years of professional experience,Dates of birth,Utility bills,Deceased statuses,...,HIV statuses,Company names,Beauty ratings,Political donations,Races,Survey results,Flights taken,Credit card CVV,Drug habits,Usernames,Spoken languages,Social security numbers,Ages,Chat logs,Comments,Cellular network names,Social media profiles,Browser user agent details,Citizenship statuses,Travel plans,Purchasing habits,Device usage tracking data,Credit status information,Spouses names,Relationship statuses,Auth tokens,Charitable donations,Nationalities,Biometric data,Recovery email addresses,Taxation records,Encrypted keys,Delivery instructions,Payment methods,Salutations,Fitness levels,Email addresses,Audio recordings,Job titles,User website URLs,Reward program balances,Age groups,Physical addresses,Employment statuses,Login histories,Driver's licenses,IMSI numbers,Names,Avatars,Employers,Appointments,Travel habits,Purchases,Customer interactions,Buying preferences,Warranty claims,Partial dates of birth,Ethnicities,Mnemonic phrases,Marital statuses,Payment histories,Social connections,IMEI numbers,Password hints,Vehicle details,IP addresses,Financial investments,PINs,Occupations,Cryptocurrency wallet addresses,SMS messages,Astrological signs,Genders,Credit cards,Personal descriptions
0,edmond.spaho@parlament.al,male,ALB,Albania,unicameral legislature,Kuvendi,Kuvendi,2009,140,Nitro,True,77159696,True,False,False,False,False,False,False,0.309589,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,pepbardina@gmail.com,male,AND,Andorra,unicameral legislature,Consell General,Consell General,2015,31,MyHeritage,True,91991358,True,False,False,False,False,False,False,1.320548,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,rgilicasals@gmail.com,female,AND,Andorra,unicameral legislature,Consell General,Consell General,2015,31,Dailymotion,True,85176234,True,False,False,False,False,False,False,0.79726,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,rgilicasals@gmail.com,female,AND,Andorra,unicameral legislature,Consell General,Consell General,2015,31,Dropbox,True,68648009,True,False,False,False,False,False,False,4.169863,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,rgilicasals@gmail.com,female,AND,Andorra,unicameral legislature,Consell General,Consell General,2015,31,Evite,True,100985047,True,False,False,False,False,False,False,5.926027,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


### Manually check

In [35]:
# non_breached_emails = []
# for pol in df["email"].unique():
#     if pol not in df_ep_email_breach_expanded["email"].unique().tolist():
#         non_breached_emails.append(pol)

In [36]:
# import random
# random.seed(42)

# # Sample 10 unique emails
# sampled_emails = random.sample(non_breached_emails, 20)
# sampled_emails

Paste (no breach) 

* g.akriotis@parliament.gr

### Combine

In [37]:
LIST_SERIOUS_DATACLASSES = [
    "Audio recordings",
    "Auth tokens",
    "Bank account numbers",
    "Biometric data",
    "Browsing histories",
    "Chat logs",
    "Credit card CVV",
    "Credit cards",
    "Credit status information",
    "Drinking habits",
    "Driver's licenses",
    "Drug habits",
    "Email messages",
    "Encrypted keys",
    "Government issued IDs",
    "Health insurance information",
    "Historical passwords",
    "HIV statuses",
    "Login histories",
    "MAC addresses",
    "Mothers maiden names",
    "Nationalities",
    "Partial credit card data",
    "Partial dates of birth",
    "Passport numbers",
    "Password hints",
    "Passwords",
    "Personal health data",
    "Photos",
    "PINs",
    "Places of birth",
    "Private messages",
    "Security questions and answers",
    "Sexual fetishes",
    "Sexual orientations",
    "SMS messages",
    "Social security numbers",
    "Taxation records",
]

In [38]:
df_email_breach_expanded = (
    pd.concat(
        [
            df_ep_email_breach_expanded.remove_columns(
                ["gender", "country", "ltype", "chamber", "legislature"]
            ).rename_column("leg_start_year", "year"),
            df_scraped_email_breach_expanded.remove_columns(["country"]),
        ],
        ignore_index=True,
    )
    .fillna(0)
    .assign(
        seriousbreach=lambda df_: df_[LIST_SERIOUS_DATACLASSES].max(axis=1).astype(int)
    )
)
df_email_breach_expanded

Unnamed: 0,email,cc3,year,nemail_cc3,breach,present,pwncount,isverified,isfabricated,issensitive,isretired,isspamlist,ismalware,issubscriptionfree,yearstopublic,n_dataclasses,Education levels,Private messages,Personal health data,Licence plates,Apps installed on devices,Financial transactions,Family structure,Support tickets,Eating habits,Religions,Vehicle identification numbers (VINs),Sexual orientations,Instant messenger identities,Deceased date,Website activity,Security questions and answers,Passwords,Account balances,Customer feedback,Home ownership statuses,Net worths,Tattoo status,Loan information,User statuses,Health insurance information,Car ownership statuses,Telecommunications carrier,Income levels,Career levels,Mothers maiden names,Bank account numbers,Password strengths,Work habits,Historical passwords,Physical attributes,Parenting plans,Time zones,Email messages,Living costs,Sexual fetishes,Family members' names,Geographic locations,Political views,Government issued IDs,Personal interests,MAC addresses,Browsing histories,Device information,Places of birth,Partial credit card data,Years of professional experience,Dates of birth,Utility bills,Deceased statuses,Job applications,Drinking habits,Nicknames,Passport numbers,Smoking habits,...,Company names,Beauty ratings,Political donations,Races,Survey results,Flights taken,Credit card CVV,Drug habits,Usernames,Spoken languages,Social security numbers,Ages,Chat logs,Comments,Cellular network names,Social media profiles,Browser user agent details,Citizenship statuses,Travel plans,Purchasing habits,Device usage tracking data,Credit status information,Spouses names,Relationship statuses,Auth tokens,Charitable donations,Nationalities,Biometric data,Recovery email addresses,Taxation records,Encrypted keys,Delivery instructions,Payment methods,Salutations,Fitness levels,Email addresses,Audio recordings,Job titles,User website URLs,Reward program balances,Age groups,Physical addresses,Employment statuses,Login histories,Driver's licenses,IMSI numbers,Names,Avatars,Employers,Appointments,Travel habits,Purchases,Customer interactions,Buying preferences,Warranty claims,Partial dates of birth,Ethnicities,Mnemonic phrases,Marital statuses,Payment histories,Social connections,IMEI numbers,Password hints,Vehicle details,IP addresses,Financial investments,PINs,Occupations,Cryptocurrency wallet addresses,SMS messages,Astrological signs,Genders,Credit cards,Personal descriptions,seriousbreach
0,edmond.spaho@parlament.al,ALB,2009,140,Nitro,True,77159696,True,False,False,False,False,False,False,0.309589,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,pepbardina@gmail.com,AND,2015,31,MyHeritage,True,91991358,True,False,False,False,False,False,False,1.320548,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,rgilicasals@gmail.com,AND,2015,31,Dailymotion,True,85176234,True,False,False,False,False,False,False,0.797260,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,rgilicasals@gmail.com,AND,2015,31,Dropbox,True,68648009,True,False,False,False,False,False,False,4.169863,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,rgilicasals@gmail.com,AND,2015,31,Evite,True,100985047,True,False,False,False,False,False,False,5.926027,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12116,wan.rizal@gmail.com,SGP,2025,400,Dropbox,True,68648009,True,False,False,False,False,False,False,4.169863,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
12117,wan.rizal@gmail.com,SGP,2025,400,Dubsmash,True,161749950,True,False,False,False,False,False,False,0.235616,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
12118,wan.rizal@gmail.com,SGP,2025,400,Eatigo,True,2789609,True,False,False,False,False,False,False,2.860274,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
12119,wan.rizal@gmail.com,SGP,2025,400,Edmodo,True,43423561,True,False,False,False,False,False,False,0.057534,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


### Basic numbers

In [39]:
# pols involved in 559 data breaches
df_email_breach_expanded["breach"].nunique()

559

In [40]:
# 3501 pols have been breached
n_pwned_pols = df_email_breach_expanded["email"].nunique()
n_pwned_pols

3501

In [41]:
100 * n_pwned_pols / (
    df_pol_emails["email"].nunique() + df_scraped_emails["email"].nunique()
)

21.791360637370847

In [42]:
# 2244 pols have been breached multiple times
n_multi_pwned_pols = (
    df_email_breach_expanded.groupby(["email"])["present"]
    .sum()
    .reset_index()
    .query("present>1")["email"]
    .nunique()
)
n_multi_pwned_pols

2244

In [43]:
# Proportion of multiple breaches
100 * (n_multi_pwned_pols / n_pwned_pols)

64.09597257926308

In [44]:
# Proportion of total
100 * n_multi_pwned_pols / (
    df_pol_emails["email"].nunique() + df_scraped_emails["email"].nunique()
)

13.967384538777543

In [45]:
# How many email-pw pairs
_n = (
    df_email_breach_expanded.query("present==True")
    .clean_names()
    .query("email_addresses==1")
    .query("passwords==1")["email"]
    .nunique()
)

In [46]:
100 * _n / n_pwned_pols

66.09540131391032

In [47]:
# How many had serious breaches
df_email_breach_expanded.query("seriousbreach==1")["email"].nunique()

2380

In [48]:
# How many had multiple serious breaches
(
    df_email_breach_expanded.query("seriousbreach==1")
    .groupby(["email"])["seriousbreach"]
    .sum()
    .reset_index()
    .query("seriousbreach>1")["email"]
    .nunique()
)

1250

### Tabulate: Compromised data types

In [49]:
df_pwnpol_datatype = (
    df_email_breach_expanded.query("present==True")
    # ===================================================
    # Filter data types and remove duplicate breach info
    .set_index("email")
    .iloc[:, 20:]
    .reset_index()
    .drop_duplicates(ignore_index=True)
    # ===================================================
    # Long by email-datatype
    .melt(id_vars=["email"], var_name="datatype", value_name="present")
    .groupby(["email", "datatype"])["present"]
    .sum()
    .reset_index()
    .assign(present=lambda df_: np.where(df_["present"] > 0, 1, 0))
    # ===================================================
    # Back to wide by email
    .pivot(index="email", columns="datatype", values="present")
    # ===================================================
    .T.sum(axis=1)
    .reset_index(name="count")
    .sort_values(["count", "datatype"], ascending=[False, True], ignore_index=True)
    .assign(percent=lambda df_: 100 * df_["count"] / n_pwned_pols)
    .assign(percent=lambda df_: df_["percent"].apply(lambda x: f"{round(x, 1)}%"))
)
df_pwnpol_datatype

Unnamed: 0,datatype,count,percent
0,Email addresses,3500,100.0%
1,Names,2962,84.6%
2,Phone numbers,2711,77.4%
3,Job titles,2460,70.3%
4,seriousbreach,2380,68.0%
5,Passwords,2315,66.1%
6,Social media profiles,2289,65.4%
7,Physical addresses,2268,64.8%
8,Geographic locations,1776,50.7%
9,Employers,1582,45.2%


In [50]:
df_datatype1 = (
    df_pwnpol_datatype.iloc[:40]
    .reset_index(drop=True)
    .assign(ix=range(1, 41))
    .astype(str)
    .reorder_columns(["ix"])
)
df_datatype2 = (
    df_pwnpol_datatype.iloc[40:74]
    .reset_index(drop=True)
    .assign(ix=range(41, 75))
    .astype(str)
    .reorder_columns(["ix"])
)
df2tex = pd.concat([df_datatype1, df_datatype2], axis=1)
# print(df2tex.to_latex(na_rep="", index=False))
df2tex

Unnamed: 0,ix,datatype,count,percent,ix.1,datatype.1,count.1,percent.1
0,1,Email addresses,3500,100.0%,41.0,Bank account numbers,29.0,0.8%
1,2,Names,2962,84.6%,42.0,Payment histories,28.0,0.8%
2,3,Phone numbers,2711,77.4%,43.0,Nationalities,27.0,0.8%
3,4,Job titles,2460,70.3%,44.0,Telecommunications carrier,25.0,0.7%
4,5,seriousbreach,2380,68.0%,45.0,Deceased statuses,18.0,0.5%
5,6,Passwords,2315,66.1%,46.0,Relationship statuses,17.0,0.5%
6,7,Social media profiles,2289,65.4%,47.0,Professional skills,14.0,0.4%
7,8,Physical addresses,2268,64.8%,48.0,Website activity,14.0,0.4%
8,9,Geographic locations,1776,50.7%,49.0,Credit cards,13.0,0.4%
9,10,Employers,1582,45.2%,50.0,Support tickets,10.0,0.3%


In [51]:
pandas_to_tex(
    df2tex, "../tables/hipb_pwnpols_datatypes.tex", na_rep="", index=False, escape=True
)

!cat "../tables/hipb_pwnpols_datatypes.tex"

\midrule
1 & Email addresses & 3500 & 100.0\% & 41 & Bank account numbers & 29 & 0.8\% \\
2 & Names & 2962 & 84.6\% & 42 & Payment histories & 28 & 0.8\% \\
3 & Phone numbers & 2711 & 77.4\% & 43 & Nationalities & 27 & 0.8\% \\
4 & Job titles & 2460 & 70.3\% & 44 & Telecommunications carrier & 25 & 0.7\% \\
5 & seriousbreach & 2380 & 68.0\% & 45 & Deceased statuses & 18 & 0.5\% \\
6 & Passwords & 2315 & 66.1\% & 46 & Relationship statuses & 17 & 0.5\% \\
7 & Social media profiles & 2289 & 65.4\% & 47 & Professional skills & 14 & 0.4\% \\
8 & Physical addresses & 2268 & 64.8\% & 48 & Website activity & 14 & 0.4\% \\
9 & Geographic locations & 1776 & 50.7\% & 49 & Credit cards & 13 & 0.4\% \\
10 & Employers & 1582 & 45.2\% & 50 & Support tickets & 10 & 0.3\% \\
11 & Genders & 1478 & 42.2\% & 51 & Passport numbers & 9 & 0.3\% \\
12 & Dates of birth & 1398 & 39.9\% & 52 & Profile photos & 9 & 0.3\% \\
13 & IP addresses & 1379 & 39.4\% & 53 & Political donations & 8 & 0.2\% \\

### Tabulate: Breaches

In [52]:
LIST_ALL_DATACLASSES = df_email_breach_expanded.iloc[:, 16:-1].columns.tolist()

In [53]:
df_pwnpol_breach_incident = (
    df_email_breach_expanded
    # ===================================================
    .groupby(["breach"])["email"]
    .size()
    .reset_index()
    .rename_column("email", "emails")
    .sort_values("emails", ascending=False, ignore_index=True)
    .head(50)
    .assign(percent=lambda df_: 100 * df_["emails"] / n_pwned_pols)
    .assign(percent=lambda df_: df_["percent"].apply(lambda x: f"{round(x, 1)}\%"))
    # ===================================================
    # Merge back to get breach characteristics
    .merge(
        (
            pd.read_parquet("../data/breaches_01_2025_expanded.parquet")
            .remove_columns(
                [
                    "description",
                    "title",
                    "domain",
                    "modifieddate",
                    "logopath",
                    "timetopublic",
                    "dataclasses",
                ]
            )
            .rename_column("name", "breach")
            .assign(
                seriousbreach=lambda df_: df_[LIST_SERIOUS_DATACLASSES]
                .max(axis=1)
                .astype(int)
            )
            .assign(
                seriousbreach=lambda df_: np.where(
                    df_["seriousbreach"] == 1, r"\checkmark", ""
                )
            )
            #             .filter(["breach", "breachdate", "addeddate", "yearstopublic", "n_dataclasses", "seriousbreach", *LIST_ALL_DATACLASSES])
            .filter(
                [
                    "breach",
                    "breachdate",
                    "addeddate",
                    "yearstopublic",
                    "pwncount",
                    "n_dataclasses",
                    "seriousbreach",
                ]
            )
        ),
        how="left",
        on="breach",
        validate="1:1",
    )
    # ===================================================
    # Remove timestamps
    .assign(
        breachdate=lambda df_: pd.to_datetime(df_["breachdate"]).dt.date,
        addeddate=lambda df_: pd.to_datetime(df_["addeddate"]).dt.date,
    )
    .assign(pwncount=lambda df_: df_["pwncount"] / 1_000_000)
    .round(1)
    .astype(str)
    .assign(pwncount=lambda df_: df_["pwncount"] + "M")
    .assign(yearstopublic=lambda df_: df_["yearstopublic"] + " years")
    # ===================================================
    .assign(ix=lambda df_: range(1, 1 + len(df_)))
    .reorder_columns(["ix"])
)
df_pwnpol_breach_incident

Unnamed: 0,ix,breach,emails,percent,breachdate,addeddate,yearstopublic,pwncount,n_dataclasses,seriousbreach
0,1,db8151dd,1471,42.0\%,2020-02-20,2020-05-15,0.2 years,22.8M,6,
1,2,OnlinerSpambot,1157,33.0\%,2017-08-28,2017-08-29,0.0 years,711.5M,2,\checkmark
2,3,PDL,954,27.2\%,2019-10-16,2019-11-22,0.1 years,622.2M,7,
3,4,VerificationsIO,927,26.5\%,2019-02-25,2019-03-09,0.0 years,763.1M,10,
4,5,LinkedIn,447,12.8\%,2012-05-05,2016-05-21,4.0 years,164.6M,2,\checkmark
5,6,LinkedInScrape,328,9.4\%,2021-04-08,2021-10-02,0.5 years,125.7M,7,
6,7,Apollo,298,8.5\%,2018-07-23,2018-10-05,0.2 years,125.9M,8,
7,8,Intelimost,279,8.0\%,2019-03-10,2019-04-02,0.1 years,3.1M,2,\checkmark
8,9,Twitter200M,262,7.5\%,2021-01-01,2023-01-05,2.0 years,211.5M,4,
9,10,Cit0day,247,7.1\%,2020-11-04,2020-11-19,0.0 years,226.9M,2,\checkmark


In [54]:
pandas_to_tex(
    df_pwnpol_breach_incident.head(25),
    "../tables/hipb_pwnpols_breach_incidents.tex",
    escape=False,
)

In [55]:
!cat ../tables/hipb_pwnpols_breach_incidents.tex

\midrule
1 & db8151dd & 1471 & 42.0\% & 2020-02-20 & 2020-05-15 & 0.2 years & 22.8M & 6 &  \\
2 & OnlinerSpambot & 1157 & 33.0\% & 2017-08-28 & 2017-08-29 & 0.0 years & 711.5M & 2 & \checkmark \\
3 & PDL & 954 & 27.2\% & 2019-10-16 & 2019-11-22 & 0.1 years & 622.2M & 7 &  \\
4 & VerificationsIO & 927 & 26.5\% & 2019-02-25 & 2019-03-09 & 0.0 years & 763.1M & 10 &  \\
5 & LinkedIn & 447 & 12.8\% & 2012-05-05 & 2016-05-21 & 4.0 years & 164.6M & 2 & \checkmark \\
6 & LinkedInScrape & 328 & 9.4\% & 2021-04-08 & 2021-10-02 & 0.5 years & 125.7M & 7 &  \\
7 & Apollo & 298 & 8.5\% & 2018-07-23 & 2018-10-05 & 0.2 years & 125.9M & 8 &  \\
8 & Intelimost & 279 & 8.0\% & 2019-03-10 & 2019-04-02 & 0.1 years & 3.1M & 2 & \checkmark \\
9 & Twitter200M & 262 & 7.5\% & 2021-01-01 & 2023-01-05 & 2.0 years & 211.5M & 4 &  \\
10 & Cit0day & 247 & 7.1\% & 2020-11-04 & 2020-11-19 & 0.0 years & 226.9M & 2 & \checkmark \\
11 & Collection1 & 241 & 6.9\% & 2019-01-07 & 2019-01-16 & 0.0 years & 772.9M 