### Analysis of Everypol (after JSON to pd)

In [41]:
import os
import pandas as pd
import json
import ast
import openpyxl
import janitor

from utilities import process_json_files_to_matrix, clean_email_column_no_dedupe

### Assume MCAR

Where we don't have pol. emails etc., let's just assume that the data are missing at random. 

In [42]:
everypol_hibp = process_json_files_to_matrix("../data/everypol/everypol_hibp")
everypol_hibp.shape

(8536, 220)

In [43]:
everypol_hibp.head()

Unnamed: 0,Filename,000webhost,123RF,2844Breaches,500px,8fit,ABFRL,AKP,ActMobile,Acuity,...,Zacks,Zomato,Zynga,bigbasket,db8151dd,digiDirect,eThekwiniMunicipality,iMesh,ixigo,piZap
0,wouter.raskin@dekamer.be,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,k.verhoeven@tweedekamer.nl,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
2,lcoffice@dphk.org,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,pk.sreemathi@sansad.nic.in,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,edurubio@parlamento.gub.uy,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [44]:
long_everypol_hibp = pd.melt(
    everypol_hibp,
    id_vars=['Filename'],
    var_name='Breach',
    value_name='Present'
)
long_everypol_hibp.shape

(1869384, 3)

In [45]:
long_everypol_hibp.to_csv("../data/everypol_hibp.csv", index=False)

In [46]:
breaches = pd.read_csv("../data/breaches_01_2025.csv")
breaches.head()

Unnamed: 0,Name,Title,Domain,BreachDate,AddedDate,ModifiedDate,PwnCount,Description,LogoPath,DataClasses,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree
0,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,"In approximately March 2015, the free web host...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'IP addresses', 'Names', '...",True,False,False,False,False,False,False
1,123RF,123RF,123rf.com,2020-03-22,2020-11-15T00:59:50Z,2020-11-15T01:07:10Z,8661578,"In March 2020, the stock photo site <a href=""h...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'IP addresses', 'Names', '...",True,False,False,False,False,False,False
2,126,126,126.com,2012-01-01,2016-10-08T07:46:05Z,2016-10-08T07:46:05Z,6414191,"In approximately 2012, it's alleged that the C...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'Passwords']",False,False,False,False,False,False,False
3,17Media,17,17app.co,2016-04-19,2016-07-08T01:55:03Z,2016-07-08T01:55:03Z,4009640,"In April 2016, customer data obtained from the...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Device information', 'Email addresses', 'IP ...",True,False,False,False,False,False,False
4,17173,17173,17173.com,2011-12-28,2018-04-28T04:53:15Z,2018-04-28T04:53:15Z,7485802,"In late 2011, <a href=""https://news.softpedia....",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'Passwords', 'Usernames']",False,False,False,False,False,False,False


In [47]:
breaches["DataClasses"] = breaches["DataClasses"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
all_categories = set(item for sublist in breaches["DataClasses"] for item in sublist)
expanded_df = pd.DataFrame(
    {category: breaches["DataClasses"].apply(lambda x: category in x) for category in all_categories}
)
breaches = pd.concat([breaches.drop(columns=["DataClasses"]), expanded_df], axis=1)

In [48]:
breaches_everypol_hibp = long_everypol_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")
breaches_everypol_hibp.head()

Unnamed: 0,Filename,Breach,Present,Name,Title,Domain,BreachDate,AddedDate,ModifiedDate,PwnCount,...,Mothers maiden names,Names,Apps installed on devices,Driver's licenses,Personal descriptions,Buying preferences,Historical passwords,IMSI numbers,Living costs,Ethnicities
0,wouter.raskin@dekamer.be,000webhost,False,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,...,False,True,False,False,False,False,False,False,False,False
1,k.verhoeven@tweedekamer.nl,000webhost,False,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,...,False,True,False,False,False,False,False,False,False,False
2,lcoffice@dphk.org,000webhost,False,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,...,False,True,False,False,False,False,False,False,False,False
3,pk.sreemathi@sansad.nic.in,000webhost,False,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,...,False,True,False,False,False,False,False,False,False,False
4,edurubio@parlamento.gub.uy,000webhost,False,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,...,False,True,False,False,False,False,False,False,False,False


In [49]:
boolean_columns = breaches_everypol_hibp.select_dtypes(include=['bool'])
grouped_bool_counts = boolean_columns.groupby(breaches_everypol_hibp["Filename"]).sum().reset_index()
grouped_bool_counts.describe()

Unnamed: 0,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Parenting plans,Job titles,Income levels,...,Mothers maiden names,Names,Apps installed on devices,Driver's licenses,Personal descriptions,Buying preferences,Historical passwords,IMSI numbers,Living costs,Ethnicities
count,8536.0,8536.0,8536.0,8536.0,8536.0,8536.0,8536.0,8536.0,8536.0,8536.0,...,8536.0,8536.0,8536.0,8536.0,8536.0,8536.0,8536.0,8536.0,8536.0,8536.0
mean,207.0,1.0,0.0,0.0,12.0,2.0,1.0,0.0,16.0,5.0,...,1.0,150.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,207.0,1.0,0.0,0.0,12.0,2.0,1.0,0.0,16.0,5.0,...,1.0,150.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0
25%,207.0,1.0,0.0,0.0,12.0,2.0,1.0,0.0,16.0,5.0,...,1.0,150.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0
50%,207.0,1.0,0.0,0.0,12.0,2.0,1.0,0.0,16.0,5.0,...,1.0,150.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0
75%,207.0,1.0,0.0,0.0,12.0,2.0,1.0,0.0,16.0,5.0,...,1.0,150.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0
max,207.0,1.0,0.0,0.0,12.0,2.0,1.0,0.0,16.0,5.0,...,1.0,150.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0


In [50]:
grouped_bool_counts.head()

Unnamed: 0,Filename,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Parenting plans,Job titles,...,Mothers maiden names,Names,Apps installed on devices,Driver's licenses,Personal descriptions,Buying preferences,Historical passwords,IMSI numbers,Living costs,Ethnicities
0,(i)joice.george@sansad.nic.in,207,1,0,0,12,2,1,0,16,...,1,150,0,1,0,0,0,0,1,2
1,1.office@bjpanda.org,207,1,0,0,12,2,1,0,16,...,1,150,0,1,0,0,0,0,1,2
2,101bbb@naver.com,207,1,0,0,12,2,1,0,16,...,1,150,0,1,0,0,0,0,1,2
3,2016kimkj@gmail.com,207,1,0,0,12,2,1,0,16,...,1,150,0,1,0,0,0,0,1,2
4,2016kimss@gmail.com,207,1,0,0,12,2,1,0,16,...,1,150,0,1,0,0,0,0,1,2


In [51]:
pol_dat = pd.read_csv("../data/everypol/everypol_combined_legislature_data.csv", low_memory=False)
pol_dat.head()

Unnamed: 0,id,name,sort_name,email,twitter,facebook,group,group_id,area_id,area,...,n_unique_emails,person_count_legistype,ltype,url,cc,leg_start_date,lastmod,cc3,pop2024,lastmod_year
0,35ad9676-8485-4137-9a16-50f2844f3ab2,"Adhalrao Patil, Shri Shivaji","Adhalrao Patil, Shri Shivaji",shivajirao@sansad.nic.in,,,Shiv Sena,SS,shirur,Shirur,...,511,541,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IN,2014-05-26,1557812040,IND,1441720000.0,2019
1,f1f0f31d-ddb8-4681-82c0-4484d0d26ee3,"Adhikari, Shri Deepak (Dev)","Adhikari, Shri Deepak (Dev)",adhikari.deepak@sansad.nic.in,idevadhikari,IamTheDev,All India Trinamool Congress,AITC,ghatal,Ghatal,...,511,541,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IN,2014-05-26,1557812040,IND,1441720000.0,2019
2,bb68e3c6-de79-4d07-ad1f-90bb9a61ded0,"Adhikari, Shri Sisir Kumar","Adhikari, Shri Sisir Kumar",sisiradhikari76@yahoo.com,,,All India Trinamool Congress,AITC,kanthi,Kanthi,...,511,541,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IN,2014-05-26,1557812040,IND,1441720000.0,2019
3,c9a891c3-adb6-4581-b93b-72f9c00beacc,"Adhikari, Shri Suvendu","Adhikari, Shri Suvendu",adhikari.suvendu@sansad.nic.in,,,All India Trinamool Congress,AITC,tamluk,Tamluk,...,511,541,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IN,2014-05-26,1557812040,IND,1441720000.0,2019
4,5c22c70d-8317-4f94-97ec-2ccadd1acdf3,"Adityanath , Shri Yogi","Adityanath , Shri Yogi",yogi.adityanath@sansad.nic.in,,,Bharatiya Janata Party,BJP,gorakhpur,Gorakhpur,...,511,541,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IN,2014-05-26,1557812040,IND,1441720000.0,2019


In [52]:
pol_dat.shape

(25087, 35)

In [53]:
pol_dat = clean_email_column_no_dedupe(pol_dat)
pol_dat.shape

(8512, 35)

In [54]:
pol_hibp = pol_dat.merge(grouped_bool_counts, left_on="email", right_on="Filename", how="left")
pol_hibp.head()

Unnamed: 0,id,name,sort_name,email,twitter,facebook,group,group_id,area_id,area,...,Mothers maiden names,Names,Apps installed on devices,Driver's licenses,Personal descriptions,Buying preferences,Historical passwords,IMSI numbers,Living costs,Ethnicities
0,35ad9676-8485-4137-9a16-50f2844f3ab2,"Adhalrao Patil, Shri Shivaji","Adhalrao Patil, Shri Shivaji",shivajirao@sansad.nic.in,,,Shiv Sena,SS,shirur,Shirur,...,1.0,150.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0
1,f1f0f31d-ddb8-4681-82c0-4484d0d26ee3,"Adhikari, Shri Deepak (Dev)","Adhikari, Shri Deepak (Dev)",adhikari.deepak@sansad.nic.in,idevadhikari,IamTheDev,All India Trinamool Congress,AITC,ghatal,Ghatal,...,1.0,150.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0
2,bb68e3c6-de79-4d07-ad1f-90bb9a61ded0,"Adhikari, Shri Sisir Kumar","Adhikari, Shri Sisir Kumar",sisiradhikari76@yahoo.com,,,All India Trinamool Congress,AITC,kanthi,Kanthi,...,1.0,150.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0
3,c9a891c3-adb6-4581-b93b-72f9c00beacc,"Adhikari, Shri Suvendu","Adhikari, Shri Suvendu",adhikari.suvendu@sansad.nic.in,,,All India Trinamool Congress,AITC,tamluk,Tamluk,...,1.0,150.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0
4,5c22c70d-8317-4f94-97ec-2ccadd1acdf3,"Adityanath , Shri Yogi","Adityanath , Shri Yogi",yogi.adityanath@sansad.nic.in,,,Bharatiya Janata Party,BJP,gorakhpur,Gorakhpur,...,1.0,150.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0


In [55]:
pol_hibp.describe().round(2)

Unnamed: 0,row_id,leg_start_year,row_count,n_unique_emails,person_count_legistype,lastmod,pop2024,lastmod_year,IsVerified,IsFabricated,...,Mothers maiden names,Names,Apps installed on devices,Driver's licenses,Personal descriptions,Buying preferences,Historical passwords,IMSI numbers,Living costs,Ethnicities
count,8512.0,8512.0,8512.0,8512.0,8512.0,8512.0,8153.0,8512.0,8131.0,8131.0,...,8131.0,8131.0,8131.0,8131.0,8131.0,8131.0,8131.0,8131.0,8131.0,8131.0
mean,260.17,2014.98,293.1,226.26,533.32,1556591000.0,123066800.0,2018.99,207.0,1.0,...,1.0,150.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0
std,181.72,1.93,181.07,160.52,418.01,2085169.0,342178200.0,0.09,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,1.0,1997.0,31.0,31.0,31.0,1542919000.0,80341.0,2018.0,207.0,1.0,...,1.0,150.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0
25%,124.0,2014.0,134.0,101.0,249.0,1555982000.0,6378654.0,2019.0,207.0,1.0,...,1.0,150.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0
50%,201.0,2015.0,290.0,169.0,499.0,1557279000.0,31240320.0,2019.0,207.0,1.0,...,1.0,150.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0
75%,385.0,2016.0,410.0,327.0,606.0,1557730000.0,61020220.0,2019.0,207.0,1.0,...,1.0,150.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0
max,748.0,2019.0,686.0,646.0,1783.0,1557904000.0,1441720000.0,2019.0,207.0,1.0,...,1.0,150.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0


In [56]:
columns_to_fill = list(all_categories) + ['IsVerified', 'IsFabricated', 'IsSensitive', 
                   'IsRetired', 'IsSpamList', 'IsMalware', 'IsSubscriptionFree']
pol_hibp[columns_to_fill] = pol_hibp[columns_to_fill].fillna(0)

In [57]:
pol_hibp.groupby(['country', 'legislature', 'leg_start_year']).mean(numeric_only=True).reset_index().describe().round(2)

Unnamed: 0,leg_start_year,row_id,row_count,n_unique_emails,person_count_legistype,lastmod,pop2024,lastmod_year,IsVerified,IsFabricated,...,Mothers maiden names,Names,Apps installed on devices,Driver's licenses,Personal descriptions,Buying preferences,Historical passwords,IMSI numbers,Living costs,Ethnicities
count,77.0,77.0,77.0,77.0,77.0,77.0,71.0,77.0,77.0,77.0,...,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0
mean,2014.31,346.4,207.78,134.99,416.82,1556156000.0,47440660.0,2018.97,199.5,0.96,...,0.96,144.57,0.0,0.96,0.0,0.0,0.0,0.0,0.96,1.93
std,3.26,193.07,165.17,125.88,416.12,2896721.0,173094100.0,0.16,26.08,0.13,...,0.13,18.9,0.0,0.13,0.0,0.0,0.0,0.0,0.13,0.25
min,1997.0,1.0,31.0,31.0,31.0,1542919000.0,80341.0,2018.0,32.14,0.16,...,0.16,23.29,0.0,0.16,0.0,0.0,0.0,0.0,0.16,0.31
25%,2013.0,150.0,90.0,54.0,115.0,1555055000.0,3308847.0,2019.0,204.91,0.99,...,0.99,148.48,0.0,0.99,0.0,0.0,0.0,0.0,0.99,1.98
50%,2015.0,365.0,158.0,99.0,267.0,1557278000.0,10319100.0,2019.0,207.0,1.0,...,1.0,150.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0
75%,2016.0,503.0,291.0,152.0,554.0,1557708000.0,40409180.0,2019.0,207.0,1.0,...,1.0,150.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0
max,2019.0,748.0,686.0,646.0,1783.0,1557904000.0,1441720000.0,2019.0,207.0,1.0,...,1.0,150.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0


### Let's merge the rest (India, Eur.)

In [58]:
no_hibp = process_json_files_to_matrix("../data/no_hibp")
dk_hibp = process_json_files_to_matrix("../data/dk_hibp")
in_hibp = process_json_files_to_matrix("../data/india/ls_hibp")
sg_hibp = process_json_files_to_matrix("../data/sg_hibp")

gr_hibp = process_json_files_to_matrix("../data/ar_hibp/")
ng_hibp = process_json_files_to_matrix("../data/ng_hibp/")
br_hibp = process_json_files_to_matrix("../data/ar_hibp/")
ar_hibp = process_json_files_to_matrix("../data/ar_hibp/")

bihar_hibp = process_json_files_to_matrix("../data/india/bihar_hibp")
tn_hibp = process_json_files_to_matrix("../data/india/tn_hibp")
up_hibp = process_json_files_to_matrix("../data/india/up_hibp")
hp_hibp = process_json_files_to_matrix("../data/india/hp_hibp")
del_hibp = process_json_files_to_matrix("../data/india/delhi_hibp")

In [59]:
no_hibp

Unnamed: 0,Filename,Adapt,Adobe,Apollo,BVD,Bitly,Cit0day,DemandScience,Disqus,Dodonew,...,NetEase,OnlinerSpambot,PDL,ShareThis,TAPAirPortugal,TrikSpamBotnet,Twitter200M,VerificationsIO,YouveBeenScraped,db8151dd
0,sandra.bruflot@stortinget.no,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,torbjorn.vereide@stortinget.no,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,marius.arion.nilsen@stortinget.no,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,torstein.solberg@stortinget.no,False,False,False,False,False,False,True,False,False,...,False,False,True,False,False,False,False,True,False,False
4,aslaug.sem-jacobsen@stortinget.no,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169,tellef.inge.morland@stortinget.no,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
170,ase.kristin.ask.bakke@stortinget.no,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
171,sveinung.rotevatn@stortinget.no,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
172,mona.nilsen.ap@stortinget.no,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [61]:
# Make it long
long_no_hibp    = pd.melt(no_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
long_dk_hibp    = pd.melt(dk_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
long_in_hibp    = pd.melt(in_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
long_sg_hibp    = pd.melt(sg_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')

long_gr_hibp  = pd.melt(gr_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
long_ng_hibp  = pd.melt(ng_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
long_br_hibp  = pd.melt(br_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
long_ar_hibp  = pd.melt(ar_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')

long_bihar_hibp = pd.melt(bihar_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
long_tn_hibp    = pd.melt(tn_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
long_up_hibp    = pd.melt(up_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
long_hp_hibp    = pd.melt(hp_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
long_del_hibp   = pd.melt(del_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')

In [62]:
(
    pd.concat([long_no_hibp, long_dk_hibp, long_in_hibp, long_sg_hibp, 
               long_gr_hibp, long_ng_hibp, long_br_hibp, long_ar_hibp, 
               long_bihar_hibp, long_tn_hibp, long_up_hibp, long_hp_hibp, long_del_hibp])
    .to_csv("../data/scraped_pol_hibp.csv", index=False)
)

In [63]:
# Join to breaches
breaches_no = long_no_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")
breaches_dk = long_dk_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")
breaches_in = long_in_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")
breaches_sg = long_sg_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")

breaches_gr = long_no_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")
breaches_ng = long_dk_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")
breaches_br = long_in_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")
breaches_ar = long_sg_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")

breaches_bihar = long_bihar_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")
breaches_tn    = long_tn_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")
breaches_up    = long_up_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")
breaches_hp    = long_hp_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")
breaches_del   = long_del_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")

In [64]:
# Group by email and sum the bool cols
no_breach_count = breaches_no.select_dtypes(include=['bool']).groupby(breaches_no["Filename"]).sum().reset_index()
dk_breach_count = breaches_dk.select_dtypes(include=['bool']).groupby(breaches_dk["Filename"]).sum().reset_index()
in_breach_count = breaches_in.select_dtypes(include=['bool']).groupby(breaches_in["Filename"]).sum().reset_index()
sg_breach_count = breaches_sg.select_dtypes(include=['bool']).groupby(breaches_sg["Filename"]).sum().reset_index()

gr_breach_count = breaches_gr.select_dtypes(include=['bool']).groupby(breaches_no["Filename"]).sum().reset_index()
ng_breach_count = breaches_ng.select_dtypes(include=['bool']).groupby(breaches_dk["Filename"]).sum().reset_index()
br_breach_count = breaches_br.select_dtypes(include=['bool']).groupby(breaches_in["Filename"]).sum().reset_index()
ar_breach_count = breaches_ar.select_dtypes(include=['bool']).groupby(breaches_sg["Filename"]).sum().reset_index()

bihar_breach_count = breaches_bihar.select_dtypes(include=['bool']).groupby(breaches_bihar["Filename"]).sum().reset_index()
tn_breach_count = breaches_tn.select_dtypes(include=['bool']).groupby(breaches_tn["Filename"]).sum().reset_index()
up_breach_count = breaches_up.select_dtypes(include=['bool']).groupby(breaches_up["Filename"]).sum().reset_index()
hp_breach_count = breaches_hp.select_dtypes(include=['bool']).groupby(breaches_hp["Filename"]).sum().reset_index()
del_breach_count = breaches_del.select_dtypes(include=['bool']).groupby(breaches_del["Filename"]).sum().reset_index()

In [65]:
# Norway
no_parl = pd.read_csv("../data/no/no_parliament.csv")
no_parl = clean_email_column_no_dedupe(no_parl)
print(no_parl.shape)

# Denmark
wb = openpyxl.load_workbook("../data/danish_parliament_1_2025.xlsx")
ws = wb.active  
data = list(ws.values)
dk_parl = pd.DataFrame(data)
dk_parl.columns = dk_parl.iloc[0]
dk_parl = dk_parl[1:].reset_index(drop=True)
dk_parl = clean_email_column_no_dedupe(dk_parl)
print(dk_parl.shape)

# India
in_df = pd.read_csv("../data/india/ls_long.csv")
in_df = clean_email_column_no_dedupe(in_df)
print(in_df.shape)

# Singapore
sg_df = pd.read_csv("../data/sg/sg_mp.csv")
sg_df = clean_email_column_no_dedupe(sg_df)
print(sg_df.shape)

# Bihar
with open('../data/india/bihar/bihar.txt', 'r', encoding='utf-8') as file:
    lines = file.read().splitlines()
    lines = [line.split('\t') for line in lines]

bihar_df = pd.DataFrame(lines, columns=['sr_no', 'photo', 'constituency', 'name', 'gender', 'party', 'contact', 'email'])
bihar_df = clean_email_column_no_dedupe(bihar_df)
print(bihar_df.shape)

# UP
up_df = pd.read_csv("../data/india/up/up_18_mlas.csv", usecols=['email'], encoding="utf-8", quotechar='"', sep=",", engine="python")
up_df = clean_email_column_no_dedupe(up_df)
print(up_df.shape)

# HP
hp_df = pd.read_csv("../data/india/hp_14.csv")
hp_df.columns = hp_df.columns.str.lower()
hp_df = clean_email_column_no_dedupe(hp_df)
print(hp_df.shape)

# TN
tn_df = pd.read_csv("../data/india/tn/tn.csv")
tn_df.rename(columns={"Email Address": "email"}, inplace=True)
tn_df = clean_email_column_no_dedupe(tn_df)
print(tn_df.shape)

# Delhi
del_df = pd.read_csv("../data/india/delhi/delhi_7th_assembly.csv")
del_df.rename(columns={"Email": "email"}, inplace=True)
del_df = clean_email_column_no_dedupe(del_df)
print(del_df.shape)

(174, 3)
(186, 10)
(2562, 43)
(400, 7)
(241, 8)
(106, 1)
(68, 7)
(233, 6)
(43, 8)


  warn("Workbook contains no default style, apply openpyxl's default")


In [26]:
# Add metadata (to be consistent with EP)
no_parl = no_parl.assign(
    country="Norway",
    cc3="NOR",
    year=2025,
    legislature="Storting",
    ltype="unicameral legislature",
).clean_names()
dk_parl = dk_parl.assign(
    country="Denmark",
    cc3="DNK",
    year=2025,
    legislature="Folketing",
    ltype="unicameral legislature",
).clean_names()
in_df = in_df.assign(
    country="India",
    cc3="IND",
    year=2025,
    legislature="Lok Sabha",
    ltype="bicameral legislature",
).clean_names()
sg_df = sg_df.assign(
    country="Singapore",
    cc3="SGP",
    year=2025,
    legislature="Parliament",
    ltype="unicameral legislature",
).clean_names()

bihar_df = bihar_df.assign(
    country="India",
    cc3="IND",
    year=2025,
    legislature="State Legislature",
    chamber="Bihar Legislature",
    ltype="bicameral legislature",
).clean_names()
tn_df = tn_df.assign(
    country="India",
    cc3="IND",
    year=2025,
    legislature="State Legislature",
    chamber="Tamil Nadu State Legislature",
    ltype="bicameral legislature",
).clean_names()
up_df = up_df.assign(
    country="India",
    cc3="IND",
    year=2025,
    legislature="State Legislature",
    chamber="UP State Legislature",
    ltype="bicameral legislature",
).clean_names()
hp_df = hp_df.assign(
    country="India",
    cc3="IND",
    year=2025,
    legislature="State Legislature",
    ltype="bicameral legislature",
    chamber="HP Legislature",
).clean_names()
del_df = del_df.assign(
    country="India",
    cc3="IND",
    year=2025,
    legislature="State Legislature",
    chamber="Delhi Legislature",
    ltype="bicameral legislature",
).clean_names()

# Renaming col
dk_parl = dk_parl.rename_column("full_name", "name")

  return method(self._obj, *args, **kwargs)


In [27]:
(
    pd.concat([no_parl, dk_parl, in_df, sg_df, bihar_df, tn_df, up_df, hp_df, del_df])
    .to_csv("../data/scraped_pol_combined_legislature_data.csv", index=False)
)

In [28]:
in_df

Unnamed: 0,mpsno,initial,firstname,lastname,gender,partyfname,partysname,statename,constname,profession,...,mpfirstlastname,maritalstatus,createdat,updatedat,email_fix,country,cc3,year,legislature,ltype
0,344,Shri,,A. Raja,Male,Dravida Munnetra Kazhagam,DMK,Tamil Nadu ...,Nilgiris,Advocate ...,...,,,,,raja.andimuthu@gmail.com,India,IND,2025,Lok Sabha,bicameral legislature
1,344,Shri,,A. Raja,Male,Dravida Munnetra Kazhagam,DMK,Tamil Nadu ...,Nilgiris,Advocate ...,...,,,,,a.raja@sansad.nic.in,India,IND,2025,Lok Sabha,bicameral legislature
2,5175,Shri,Narayana Swamy,Abbaiah,Male,Bharatiya Janata Party,BJP,Karnataka ...,Chitradurga,Businessperson ...,...,,,,,anarayanaswamyanekal5@gmail.com,India,IND,2025,Lok Sabha,bicameral legislature
3,5175,Shri,Narayana Swamy,Abbaiah,Male,Bharatiya Janata Party,BJP,Karnataka ...,Chitradurga,Businessperson ...,...,,,,,a.narayanswamy@sansad.nic.in,India,IND,2025,Lok Sabha,bicameral legislature
4,2654,Dr.,Farooq,Abdullah,Male,Jammu and Kashmir National Conference,J&KNC,Jammu and Kashmir ...,Srinagar,Social Worker ...,...,,,,,iamfarooq80@hotmail.com,India,IND,2025,Lok Sabha,bicameral legislature
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2557,512,Shri,V.,Vetriselvan,Male,Dravida Munnetra Kazhagam,DMK,Tamil Nadu ...,Krishnagiri,Advocate ...,...,,,,,vetriselvan@sansad.nic.in,India,IND,2025,Lok Sabha,bicameral legislature
2558,513,Smt.,Dumpa Mary,Vijayakumari,Female,Telugu Desam Party,TDP,Andhra Pradesh ...,Bhadrachalam-ST ...,Social Worker ...,...,,,,,vijayak@sansad.nic.in,India,IND,2025,Lok Sabha,bicameral legislature
2559,518,Shri,Srikanta Datta Narasimharaja,Wadiyar,Male,Indian National Congress,INC,Karnataka ...,Mysore ...,Businessperson ...,...,,,,,wadiyar@sansad.nic.in,India,IND,2025,Lok Sabha,bicameral legislature
2560,521,Dr.(Smt.),Sudha,Yadav,Female,Bharatiya Janata Party,BJP,Haryana ...,Mahendergarh ...,Educationist ...,...,,,,,sudhayadav@sansad.nic.in,India,IND,2025,Lok Sabha,bicameral legislature


### Join

In [29]:
no_all = no_parl.merge(no_breach_count, how = "left", left_on = "email", right_on = "Filename").fillna(0)
dk_all = dk_parl.merge(dk_breach_count, how = "left", left_on = "email", right_on = "Filename").fillna(0)
in_all = in_df.merge(in_breach_count, how = "left", left_on = "email", right_on = "Filename").fillna(0)
sg_all = sg_df.merge(sg_breach_count, how = "left", left_on = "email", right_on = "Filename").fillna(0)

bihar_all = bihar_df.merge(bihar_breach_count, how = "left", left_on = "email", right_on = "Filename").fillna(0)
tn_all = tn_df.merge(tn_breach_count, how = "left", left_on = "email", right_on = "Filename").fillna(0)
up_all = up_df.merge(up_breach_count, how = "left", left_on = "email", right_on = "Filename").fillna(0)
hp_all = hp_df.merge(hp_breach_count, how = "left", left_on = "email", right_on = "Filename").fillna(0)
del_all = del_df.merge(del_breach_count, how = "left", left_on = "email", right_on = "Filename").fillna(0)

### India is in long. Get it back to MP level

In [30]:
in_grouped = (
    in_all.groupby(['mpsno', 'source_file'], as_index=False)
    .sum(numeric_only=True)
)
in_grouped.head()

Unnamed: 0,mpsno,source_file,lastloksabha,age,noofterms,numberofsons,numberofdaughters,currentpagenumber,perpagesize,totalelements,...,Mothers maiden names,Names,Apps installed on devices,Driver's licenses,Personal descriptions,Buying preferences,Historical passwords,IMSI numbers,Living costs,Ethnicities
0,1,ls_14,14,77.0,2.0,2.0,1.0,1,554,554,...,0.0,283.0,1.0,2.0,1.0,0.0,3.0,1.0,0.0,4.0
1,2,ls_14,14,53.0,3.0,2.0,0.0,1,554,554,...,0.0,283.0,1.0,2.0,1.0,0.0,3.0,1.0,0.0,4.0
2,3,ls_14,14,56.0,2.0,1.0,1.0,1,554,554,...,0.0,283.0,1.0,2.0,1.0,0.0,3.0,1.0,0.0,4.0
3,4,ls_15,15,81.0,9.0,1.0,2.0,1,511,511,...,0.0,283.0,1.0,2.0,1.0,0.0,3.0,1.0,0.0,4.0
4,5,ls_14,14,74.0,3.0,0.0,2.0,1,554,554,...,0.0,283.0,1.0,2.0,1.0,0.0,3.0,1.0,0.0,4.0


In [31]:
in_grouped.shape

(1610, 165)

In [32]:
no_all.describe().round(2)

Unnamed: 0,year,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Parenting plans,Job titles,...,Mothers maiden names,Names,Apps installed on devices,Driver's licenses,Personal descriptions,Buying preferences,Historical passwords,IMSI numbers,Living costs,Ethnicities
count,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,...,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0
mean,2025.0,26.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,9.0,...,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,2025.0,26.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,9.0,...,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,2025.0,26.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,9.0,...,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,2025.0,26.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,9.0,...,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,2025.0,26.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,9.0,...,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,2025.0,26.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,9.0,...,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [33]:
dk_all.describe().round(2)

Unnamed: 0,year,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Parenting plans,Job titles,...,Mothers maiden names,Names,Apps installed on devices,Driver's licenses,Personal descriptions,Buying preferences,Historical passwords,IMSI numbers,Living costs,Ethnicities
count,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0,...,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0
mean,2025.0,36.77,0.0,0.0,0.0,3.87,0.97,0.0,0.0,8.71,...,0.0,23.23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.97
std,0.0,6.73,0.0,0.0,0.0,0.71,0.18,0.0,0.0,1.59,...,0.0,4.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18
min,2025.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2025.0,38.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,9.0,...,0.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,2025.0,38.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,9.0,...,0.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,2025.0,38.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,9.0,...,0.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,2025.0,38.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,9.0,...,0.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [34]:
in_grouped.groupby(['source_file']).mean()

Unnamed: 0_level_0,mpsno,lastloksabha,age,noofterms,numberofsons,numberofdaughters,currentpagenumber,perpagesize,totalelements,totalpages,...,Mothers maiden names,Names,Apps installed on devices,Driver's licenses,Personal descriptions,Buying preferences,Historical passwords,IMSI numbers,Living costs,Ethnicities
source_file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ls_13,280.705128,13.0,76.705128,2.294872,1.467949,1.352564,1.0,551.0,551.0,1.0,...,0.0,283.0,1.0,2.0,1.0,0.0,3.0,1.0,0.0,4.0
ls_14,1461.060606,14.636364,77.386364,3.151515,1.590909,1.287879,1.045455,579.181818,579.181818,1.045455,...,0.0,295.863636,1.045455,2.090909,1.045455,0.0,3.136364,1.045455,0.0,4.181818
ls_15,3475.84127,16.666667,74.232804,2.169312,1.391534,1.502646,1.111111,567.777778,567.777778,1.111111,...,0.0,269.52381,0.952381,1.904762,0.952381,0.0,2.857143,0.952381,0.0,3.809524
ls_16,4006.098485,24.787879,99.57197,2.852273,1.837121,1.700758,1.549242,793.212121,793.212121,1.549242,...,0.0,438.435606,1.549242,3.098485,1.549242,0.0,4.647727,1.549242,0.0,6.19697
ls_17,4500.073022,33.103448,113.336714,3.819473,2.206897,1.957404,1.947262,1049.574037,1049.574037,1.947262,...,0.0,551.075051,1.947262,3.894523,1.947262,0.0,5.841785,1.947262,0.0,7.789047
ls_18,5268.476064,32.984043,102.242021,2.555851,1.098404,1.119681,1.832447,996.851064,996.851064,1.832447,...,0.0,518.582447,1.832447,3.664894,1.832447,0.0,5.49734,1.832447,0.0,7.329787


In [35]:
sg_all.describe().round(2)

Unnamed: 0,leg_start_year,year,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Parenting plans,...,Mothers maiden names,Names,Apps installed on devices,Driver's licenses,Personal descriptions,Buying preferences,Historical passwords,IMSI numbers,Living costs,Ethnicities
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,...,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,2011.08,2025.0,69.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,...,0.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
std,7.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,2001.0,2025.0,69.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,...,0.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,2006.0,2025.0,69.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,...,0.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,2011.0,2025.0,69.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,...,0.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,2015.0,2025.0,69.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,...,0.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,2021.0,2025.0,69.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,...,0.0,46.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [36]:
bihar_all.describe().round(2)

Unnamed: 0,year,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Parenting plans,Job titles,...,Mothers maiden names,Names,Apps installed on devices,Driver's licenses,Personal descriptions,Buying preferences,Historical passwords,IMSI numbers,Living costs,Ethnicities
count,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,...,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0
mean,2025.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,2025.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2025.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2025.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2025.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2025.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
tn_all.describe().round(2)

Unnamed: 0,photo,year,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Parenting plans,...,Mothers maiden names,Names,Apps installed on devices,Driver's licenses,Personal descriptions,Buying preferences,Historical passwords,IMSI numbers,Living costs,Ethnicities
count,233.0,233.0,233.0,233.0,233.0,233.0,233.0,233.0,233.0,233.0,...,233.0,233.0,233.0,233.0,233.0,233.0,233.0,233.0,233.0,233.0
mean,0.0,2025.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,0.0,2025.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2025.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,2025.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,2025.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,2025.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
hp_all.describe().round(2)

Unnamed: 0,photo,mobile,tele_no_res_no_,year,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,...,Mothers maiden names,Names,Apps installed on devices,Driver's licenses,Personal descriptions,Buying preferences,Historical passwords,IMSI numbers,Living costs,Ethnicities
count,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,...,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0
mean,0.0,9416969000.0,0.0,2025.0,37.0,0.0,0.0,0.0,4.0,0.0,...,0.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,0.0,510635200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,0.0,7018000000.0,0.0,2025.0,37.0,0.0,0.0,0.0,4.0,0.0,...,0.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,9418022000.0,0.0,2025.0,37.0,0.0,0.0,0.0,4.0,0.0,...,0.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,9418154000.0,0.0,2025.0,37.0,0.0,0.0,0.0,4.0,0.0,...,0.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,9805725000.0,0.0,2025.0,37.0,0.0,0.0,0.0,4.0,0.0,...,0.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,9882812000.0,0.0,2025.0,37.0,0.0,0.0,0.0,4.0,0.0,...,0.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
up_all.describe().round(2)

Unnamed: 0,year,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Parenting plans,Job titles,...,Mothers maiden names,Names,Apps installed on devices,Driver's licenses,Personal descriptions,Buying preferences,Historical passwords,IMSI numbers,Living costs,Ethnicities
count,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,...,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0
mean,2025.0,30.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,7.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,2025.0,30.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,7.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,2025.0,30.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,7.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,2025.0,30.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,7.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,2025.0,30.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,7.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
max,2025.0,30.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,7.0,...,0.0,20.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [40]:
del_all.describe().round(2)

Unnamed: 0,ac_no,year,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Parenting plans,...,Mothers maiden names,Names,Apps installed on devices,Driver's licenses,Personal descriptions,Buying preferences,Historical passwords,IMSI numbers,Living costs,Ethnicities
count,43.0,43.0,43.0,43.0,43.0,43.0,43.0,43.0,43.0,43.0,...,43.0,43.0,43.0,43.0,43.0,43.0,43.0,43.0,43.0,43.0
mean,44.47,2025.0,32.23,0.98,0.0,0.0,2.93,0.98,0.0,0.0,...,0.0,25.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,16.26,0.0,5.03,0.15,0.0,0.0,0.46,0.15,0.0,0.0,...,0.0,3.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,18.0,2025.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,29.5,2025.0,33.0,1.0,0.0,0.0,3.0,1.0,0.0,0.0,...,0.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,45.0,2025.0,33.0,1.0,0.0,0.0,3.0,1.0,0.0,0.0,...,0.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,58.5,2025.0,33.0,1.0,0.0,0.0,3.0,1.0,0.0,0.0,...,0.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,70.0,2025.0,33.0,1.0,0.0,0.0,3.0,1.0,0.0,0.0,...,0.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
