### Analysis of Everypol (after JSON to pd)

In [1]:
import os
import pandas as pd
import json
import ast
import openpyxl
import janitor
import sys

sys.path.append("/home/lsys/pwned_pols/venv/lib/python3.10/site-packages")
from utilities import process_json_files_to_matrix, clean_email_column_no_dedupe, clean_contact_column

### Assume MCAR

Where we don't have pol. emails etc., let's just assume that the data are missing at random. 

In [2]:
everypol_hibp = process_json_files_to_matrix("../data/everypol/everypol_hibp")
everypol_hibp.shape

(8536, 220)

In [3]:
everypol_hibp.head()

Unnamed: 0,Filename,000webhost,123RF,2844Breaches,500px,8fit,ABFRL,AKP,ActMobile,Acuity,...,Zacks,Zomato,Zynga,bigbasket,db8151dd,digiDirect,eThekwiniMunicipality,iMesh,ixigo,piZap
0,wouter.raskin@dekamer.be,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,k.verhoeven@tweedekamer.nl,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
2,lcoffice@dphk.org,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,pk.sreemathi@sansad.nic.in,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,edurubio@parlamento.gub.uy,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [4]:
long_everypol_hibp = pd.melt(
    everypol_hibp,
    id_vars=['Filename'],
    var_name='Breach',
    value_name='Present'
)
long_everypol_hibp.shape

(1869384, 3)

In [5]:
long_everypol_hibp.to_csv("../data/everypol_hibp.csv", index=False)

In [6]:
breaches = pd.read_csv("../data/breaches_01_2025.csv")
breaches.head()

Unnamed: 0,Name,Title,Domain,BreachDate,AddedDate,ModifiedDate,PwnCount,Description,LogoPath,DataClasses,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree
0,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,"In approximately March 2015, the free web host...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'IP addresses', 'Names', '...",True,False,False,False,False,False,False
1,123RF,123RF,123rf.com,2020-03-22,2020-11-15T00:59:50Z,2020-11-15T01:07:10Z,8661578,"In March 2020, the stock photo site <a href=""h...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'IP addresses', 'Names', '...",True,False,False,False,False,False,False
2,126,126,126.com,2012-01-01,2016-10-08T07:46:05Z,2016-10-08T07:46:05Z,6414191,"In approximately 2012, it's alleged that the C...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'Passwords']",False,False,False,False,False,False,False
3,17Media,17,17app.co,2016-04-19,2016-07-08T01:55:03Z,2016-07-08T01:55:03Z,4009640,"In April 2016, customer data obtained from the...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Device information', 'Email addresses', 'IP ...",True,False,False,False,False,False,False
4,17173,17173,17173.com,2011-12-28,2018-04-28T04:53:15Z,2018-04-28T04:53:15Z,7485802,"In late 2011, <a href=""https://news.softpedia....",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'Passwords', 'Usernames']",False,False,False,False,False,False,False


In [7]:
breaches["DataClasses"] = breaches["DataClasses"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
all_categories = set(item for sublist in breaches["DataClasses"] for item in sublist)
expanded_df = pd.DataFrame(
    {category: breaches["DataClasses"].apply(lambda x: category in x) for category in all_categories}
)
breaches = pd.concat([breaches.drop(columns=["DataClasses"]), expanded_df], axis=1)

In [8]:
breaches_everypol_hibp = long_everypol_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")
breaches_everypol_hibp.head()

Unnamed: 0,Filename,Breach,Present,Name,Title,Domain,BreachDate,AddedDate,ModifiedDate,PwnCount,...,Vehicle identification numbers (VINs),PINs,Financial investments,Government issued IDs,Net worths,Cryptocurrency wallet addresses,Driver's licenses,Travel plans,Avatars,Places of birth
0,wouter.raskin@dekamer.be,000webhost,False,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,...,False,False,False,False,False,False,False,False,False,False
1,k.verhoeven@tweedekamer.nl,000webhost,False,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,...,False,False,False,False,False,False,False,False,False,False
2,lcoffice@dphk.org,000webhost,False,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,...,False,False,False,False,False,False,False,False,False,False
3,pk.sreemathi@sansad.nic.in,000webhost,False,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,...,False,False,False,False,False,False,False,False,False,False
4,edurubio@parlamento.gub.uy,000webhost,False,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,...,False,False,False,False,False,False,False,False,False,False


In [9]:
boolean_columns = breaches_everypol_hibp.select_dtypes(include=['bool'])
grouped_bool_counts = boolean_columns.groupby(breaches_everypol_hibp["Filename"]).sum().reset_index()
grouped_bool_counts.describe()

Unnamed: 0,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Credit status information,Telecommunications carrier,Work habits,...,Vehicle identification numbers (VINs),PINs,Financial investments,Government issued IDs,Net worths,Cryptocurrency wallet addresses,Driver's licenses,Travel plans,Avatars,Places of birth
count,8536.0,8536.0,8536.0,8536.0,8536.0,8536.0,8536.0,8536.0,8536.0,8536.0,...,8536.0,8536.0,8536.0,8536.0,8536.0,8536.0,8536.0,8536.0,8536.0,8536.0
mean,207.0,1.0,0.0,0.0,12.0,2.0,1.0,2.0,1.0,0.0,...,0.0,1.0,1.0,6.0,1.0,1.0,1.0,0.0,1.0,1.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,207.0,1.0,0.0,0.0,12.0,2.0,1.0,2.0,1.0,0.0,...,0.0,1.0,1.0,6.0,1.0,1.0,1.0,0.0,1.0,1.0
25%,207.0,1.0,0.0,0.0,12.0,2.0,1.0,2.0,1.0,0.0,...,0.0,1.0,1.0,6.0,1.0,1.0,1.0,0.0,1.0,1.0
50%,207.0,1.0,0.0,0.0,12.0,2.0,1.0,2.0,1.0,0.0,...,0.0,1.0,1.0,6.0,1.0,1.0,1.0,0.0,1.0,1.0
75%,207.0,1.0,0.0,0.0,12.0,2.0,1.0,2.0,1.0,0.0,...,0.0,1.0,1.0,6.0,1.0,1.0,1.0,0.0,1.0,1.0
max,207.0,1.0,0.0,0.0,12.0,2.0,1.0,2.0,1.0,0.0,...,0.0,1.0,1.0,6.0,1.0,1.0,1.0,0.0,1.0,1.0


In [10]:
grouped_bool_counts.head()

Unnamed: 0,Filename,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Credit status information,Telecommunications carrier,...,Vehicle identification numbers (VINs),PINs,Financial investments,Government issued IDs,Net worths,Cryptocurrency wallet addresses,Driver's licenses,Travel plans,Avatars,Places of birth
0,(i)joice.george@sansad.nic.in,207,1,0,0,12,2,1,2,1,...,0,1,1,6,1,1,1,0,1,1
1,1.office@bjpanda.org,207,1,0,0,12,2,1,2,1,...,0,1,1,6,1,1,1,0,1,1
2,101bbb@naver.com,207,1,0,0,12,2,1,2,1,...,0,1,1,6,1,1,1,0,1,1
3,2016kimkj@gmail.com,207,1,0,0,12,2,1,2,1,...,0,1,1,6,1,1,1,0,1,1
4,2016kimss@gmail.com,207,1,0,0,12,2,1,2,1,...,0,1,1,6,1,1,1,0,1,1


In [11]:
pol_dat = pd.read_csv("../data/everypol/everypol_combined_legislature_data.csv", low_memory=False)
pol_dat.head()

Unnamed: 0,id,name,sort_name,email,twitter,facebook,group,group_id,area_id,area,...,n_unique_emails,person_count_legistype,ltype,url,cc,leg_start_date,lastmod,cc3,pop2024,lastmod_year
0,35ad9676-8485-4137-9a16-50f2844f3ab2,"Adhalrao Patil, Shri Shivaji","Adhalrao Patil, Shri Shivaji",shivajirao@sansad.nic.in,,,Shiv Sena,SS,shirur,Shirur,...,511,541,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IN,2014-05-26,1557812040,IND,1441720000.0,2019
1,f1f0f31d-ddb8-4681-82c0-4484d0d26ee3,"Adhikari, Shri Deepak (Dev)","Adhikari, Shri Deepak (Dev)",adhikari.deepak@sansad.nic.in,idevadhikari,IamTheDev,All India Trinamool Congress,AITC,ghatal,Ghatal,...,511,541,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IN,2014-05-26,1557812040,IND,1441720000.0,2019
2,bb68e3c6-de79-4d07-ad1f-90bb9a61ded0,"Adhikari, Shri Sisir Kumar","Adhikari, Shri Sisir Kumar",sisiradhikari76@yahoo.com,,,All India Trinamool Congress,AITC,kanthi,Kanthi,...,511,541,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IN,2014-05-26,1557812040,IND,1441720000.0,2019
3,c9a891c3-adb6-4581-b93b-72f9c00beacc,"Adhikari, Shri Suvendu","Adhikari, Shri Suvendu",adhikari.suvendu@sansad.nic.in,,,All India Trinamool Congress,AITC,tamluk,Tamluk,...,511,541,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IN,2014-05-26,1557812040,IND,1441720000.0,2019
4,5c22c70d-8317-4f94-97ec-2ccadd1acdf3,"Adityanath , Shri Yogi","Adityanath , Shri Yogi",yogi.adityanath@sansad.nic.in,,,Bharatiya Janata Party,BJP,gorakhpur,Gorakhpur,...,511,541,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IN,2014-05-26,1557812040,IND,1441720000.0,2019


In [12]:
pol_dat.shape

(25087, 35)

In [13]:
pol_dat = clean_email_column_no_dedupe(pol_dat)
pol_dat.shape

(13798, 35)

In [14]:
pol_hibp = pol_dat.merge(grouped_bool_counts, left_on="email", right_on="Filename", how="left")
pol_hibp.head()

Unnamed: 0,id,name,sort_name,email,twitter,facebook,group,group_id,area_id,area,...,Vehicle identification numbers (VINs),PINs,Financial investments,Government issued IDs,Net worths,Cryptocurrency wallet addresses,Driver's licenses,Travel plans,Avatars,Places of birth
0,35ad9676-8485-4137-9a16-50f2844f3ab2,"Adhalrao Patil, Shri Shivaji","Adhalrao Patil, Shri Shivaji",shivajirao@sansad.nic.in,,,Shiv Sena,SS,shirur,Shirur,...,0.0,1.0,1.0,6.0,1.0,1.0,1.0,0.0,1.0,1.0
1,f1f0f31d-ddb8-4681-82c0-4484d0d26ee3,"Adhikari, Shri Deepak (Dev)","Adhikari, Shri Deepak (Dev)",adhikari.deepak@sansad.nic.in,idevadhikari,IamTheDev,All India Trinamool Congress,AITC,ghatal,Ghatal,...,0.0,1.0,1.0,6.0,1.0,1.0,1.0,0.0,1.0,1.0
2,bb68e3c6-de79-4d07-ad1f-90bb9a61ded0,"Adhikari, Shri Sisir Kumar","Adhikari, Shri Sisir Kumar",sisiradhikari76@yahoo.com,,,All India Trinamool Congress,AITC,kanthi,Kanthi,...,0.0,1.0,1.0,6.0,1.0,1.0,1.0,0.0,1.0,1.0
3,c9a891c3-adb6-4581-b93b-72f9c00beacc,"Adhikari, Shri Suvendu","Adhikari, Shri Suvendu",adhikari.suvendu@sansad.nic.in,,,All India Trinamool Congress,AITC,tamluk,Tamluk,...,0.0,1.0,1.0,6.0,1.0,1.0,1.0,0.0,1.0,1.0
4,5c22c70d-8317-4f94-97ec-2ccadd1acdf3,"Adityanath , Shri Yogi","Adityanath , Shri Yogi",yogi.adityanath@sansad.nic.in,,,Bharatiya Janata Party,BJP,gorakhpur,Gorakhpur,...,0.0,1.0,1.0,6.0,1.0,1.0,1.0,0.0,1.0,1.0


In [15]:
columns_to_fill = list(all_categories) + ['IsVerified', 'IsFabricated', 'IsSensitive', 
                   'IsRetired', 'IsSpamList', 'IsMalware', 'IsSubscriptionFree']
pol_hibp[columns_to_fill] = pol_hibp[columns_to_fill].fillna(0)

### Let's merge the rest (India, Eur.)

In [16]:
no_hibp = process_json_files_to_matrix("../data/no_hibp")
dk_hibp = process_json_files_to_matrix("../data/dk_hibp")
in_hibp = process_json_files_to_matrix("../data/india/ls_hibp")
sg_hibp = process_json_files_to_matrix("../data/sg_hibp")

gr_hibp = process_json_files_to_matrix("../data/gr_hibp/")
ng_hibp = process_json_files_to_matrix("../data/ng_hibp/")
br_hibp = process_json_files_to_matrix("../data/br_hibp/")
ar_hibp = process_json_files_to_matrix("../data/ar_hibp/")

bihar_hibp = process_json_files_to_matrix("../data/india/bihar_hibp")
tn_hibp = process_json_files_to_matrix("../data/india/tn_hibp")
up_hibp = process_json_files_to_matrix("../data/india/up_hibp")
hp_hibp = process_json_files_to_matrix("../data/india/hp_hibp")
del_hibp = process_json_files_to_matrix("../data/india/delhi_hibp")

In [17]:
# Make it long
long_no_hibp    = pd.melt(no_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
long_dk_hibp    = pd.melt(dk_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
long_in_hibp    = pd.melt(in_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
long_sg_hibp    = pd.melt(sg_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')

long_gr_hibp  = pd.melt(gr_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
long_ng_hibp  = pd.melt(ng_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
long_br_hibp  = pd.melt(br_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
long_ar_hibp  = pd.melt(ar_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')

long_bihar_hibp = pd.melt(bihar_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
long_tn_hibp    = pd.melt(tn_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
long_up_hibp    = pd.melt(up_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
long_hp_hibp    = pd.melt(hp_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
long_del_hibp   = pd.melt(del_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')

In [18]:
(
    pd.concat([long_no_hibp, long_dk_hibp, long_in_hibp, long_sg_hibp, 
               long_gr_hibp, long_ng_hibp, long_br_hibp, long_ar_hibp, 
               long_bihar_hibp, long_tn_hibp, long_up_hibp, long_hp_hibp, long_del_hibp])
    .to_csv("../data/scraped_pol_hibp.csv", index=False)
)

In [19]:
# Join to breaches
breaches_no = long_no_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")
breaches_dk = long_dk_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")
breaches_in = long_in_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")
breaches_sg = long_sg_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")

breaches_gr = long_gr_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")
breaches_ng = long_ng_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")
breaches_br = long_br_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")
breaches_ar = long_ar_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")

breaches_bihar = long_bihar_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")
breaches_tn    = long_tn_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")
breaches_up    = long_up_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")
breaches_hp    = long_hp_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")
breaches_del   = long_del_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")

In [20]:
breach_dfs = {
    "no": breaches_no,
    "dk": breaches_dk,
    "in": breaches_in,
    "sg": breaches_sg,
    "gr": breaches_gr,
    "ng": breaches_ng,
    "br": breaches_br,
    "ar": breaches_ar,
    "bihar": breaches_bihar,
    "tn": breaches_tn,
    "up": breaches_up,
    "hp": breaches_hp,
    "del": breaches_del,
}

breach_counts = {}

for key, df in breach_dfs.items():
    breach_counts[key] = (
        df.select_dtypes(include=["bool"])
        .groupby(df["Filename"])
        .sum()
        .reset_index()
    )

for key in breach_counts:
    globals()[f"{key}_breach_count"] = breach_counts[key]

In [23]:
# Norway
no_parl = pd.read_csv("../data/no/no_parliament.csv")
no_parl = clean_email_column_no_dedupe(no_parl)
print(f'Norway: {no_parl.shape}')

# Denmark
wb = openpyxl.load_workbook("../data/danish_parliament_1_2025.xlsx")
ws = wb.active  
data = list(ws.values)
dk_parl = pd.DataFrame(data)
dk_parl.columns = dk_parl.iloc[0]
dk_parl = dk_parl[1:].reset_index(drop=True)
dk_parl = clean_email_column_no_dedupe(dk_parl.clean_names())
dk_parl.rename(columns={"full_name": "name"}, inplace=True)
print(f'Denmark: {dk_parl.shape}')

# India
in_df = pd.read_csv("../data/india/ls_long.csv")
in_df = clean_email_column_no_dedupe(in_df)
print(f'India: {in_df.shape}')

# Singapore
sg_df = pd.read_csv("../data/sg/sg_mp.csv")
sg_df = clean_email_column_no_dedupe(sg_df)
print(f'Singapore: {sg_df.shape}')

# Argentina
ar_df = pd.read_csv("../data/ar/ListadoDeSenadores_2025.csv")
ar_df.rename(columns=str.lower, inplace=True)
ar_df = clean_email_column_no_dedupe(ar_df)
print(f'Argentina: {ar_df.shape}')

# Greece
gr_df = pd.read_csv("../data/gr/greece_parliament_members.csv", encoding='utf-8-sig')
gr_df_long = clean_email_column_no_dedupe(clean_contact_column(gr_df))
print(f'Greece: {gr_df_long.shape}')

# Nigeria
ng_df = pd.read_csv("../data/ng/nigeria_senators.csv")
ng_df.rename(columns=str.lower, inplace=True)
ng_df = clean_email_column_no_dedupe(ng_df)
print(f'Nigeria: {ng_df.shape}')

# Brazil
br_df = pd.read_csv("../data/br/brazil_senator_2025.csv", encoding='latin1', sep=';')
br_df.rename(columns=str.lower, inplace=True)
br_df = clean_email_column_no_dedupe(br_df)
print(f'Brazil: {br_df.shape}')

# Bihar
with open('../data/india/bihar/bihar.txt', 'r', encoding='utf-8') as file:
    lines = file.read().splitlines()
    lines = [line.split('\t') for line in lines]

bihar_df = pd.DataFrame(lines, columns=['sr_no', 'photo', 'constituency', 'name', 'gender', 'party', 'contact', 'email'])
bihar_df = clean_email_column_no_dedupe(bihar_df)
print(f'Bihar: {bihar_df.shape}')

# UP
up_df = pd.read_csv("../data/india/up/up_18_mlas.csv", usecols=['email'], encoding="utf-8", quotechar='"', sep=",", engine="python")
up_df = clean_email_column_no_dedupe(up_df)
print(f'UP: {up_df.shape}')

# HP
hp_df = pd.read_csv("../data/india/hp_14.csv")
hp_df.columns = hp_df.columns.str.lower()
hp_df = clean_email_column_no_dedupe(hp_df)
print(f'HP: {hp_df.shape}')

# TN
tn_df = pd.read_csv("../data/india/tn/tn.csv")
tn_df.rename(columns={"Email Address": "email"}, inplace=True)
tn_df = clean_email_column_no_dedupe(tn_df)
print(f'TN: {tn_df.shape}')

# Delhi
del_df = pd.read_csv("../data/india/delhi/delhi_7th_assembly.csv")
del_df.rename(columns={"Email": "email"}, inplace=True)
del_df = clean_email_column_no_dedupe(del_df)
print(f'Delhi: {del_df.shape}')

Norway: (174, 3)
Denmark: (186, 10)
India: (4403, 43)
Singapore: (603, 7)
Argentina: (71, 15)
Greece: (382, 5)
Nigeria: (69, 11)
Brazil: (81, 10)
Bihar: (241, 8)
UP: (106, 1)
HP: (68, 7)
TN: (233, 6)
Delhi: (43, 8)


  warn("Workbook contains no default style, apply openpyxl's default")


In [24]:
# Define metadata for each DataFrame
metadata = {
    "no_parl":  {"country": "Norway", "cc3": "NOR", "legislature": "Storting", "ltype": "unicameral legislature"},
    "dk_parl":  {"country": "Denmark", "cc3": "DNK", "legislature": "Folketing", "ltype": "unicameral legislature"},
    "in_df":    {"country": "India", "cc3": "IND", "legislature": "Lok Sabha", "ltype": "lower house"},
    "sg_df":    {"country": "Singapore", "cc3": "SGP", "legislature": "Parliament", "ltype": "unicameral legislature"},
    "ar_df":    {"country": "Argentina", "cc3": "ARG", "legislature": "Parliament", "ltype": "upper house"},
    "gr_df_long": {"country": "Greece", "cc3": "GRC", "legislature": "Parliament", "ltype": "unicameral legislature"},
    "ng_df":    {"country": "Nigeria", "cc3": "NGA", "legislature": "Senate", "ltype": "upper house"},
    "br_df":    {"country": "Brazil", "cc3": "BRA", "legislature": "Parliament", "ltype": "upper house"},
    "bihar_df": {"country": "India", "cc3": "IND", "legislature": "State Legislature", "chamber": "Bihar Legislature", "ltype": "state"},
    "tn_df":    {"country": "India", "cc3": "IND", "legislature": "State Legislature", "chamber": "Tamil Nadu State Legislature", "ltype": "state"},
    "up_df":    {"country": "India", "cc3": "IND", "legislature": "State Legislature", "chamber": "UP State Legislature", "ltype": "state"},
    "hp_df":    {"country": "India", "cc3": "IND", "legislature": "State Legislature", "chamber": "HP Legislature", "ltype": "state"},
    "del_df":   {"country": "India", "cc3": "IND", "legislature": "State Legislature", "chamber": "Delhi Legislature", "ltype": "state"},
}

for df_name, meta in metadata.items():
    if df_name in globals():
        globals()[df_name] = globals()[df_name].assign(**meta).clean_names()

In [25]:
(
    pd.concat([no_parl, dk_parl, in_df, sg_df, 
               ar_df, br_df, ng_df, gr_df_long, 
               bihar_df, tn_df, up_df, hp_df, del_df])
    .to_csv("../data/scraped_pol_combined_legislature_data.csv", index=False)
)

In [27]:
no_parl.head()

Unnamed: 0,name,party,email,country,cc3,legislature,ltype
0,"Abusland, Anja Ninasdotter",Senterpartiet,anja.ninasdotter.abusland@stortinget.no,Norway,NOR,Storting,unicameral legislature
1,"Almeland, Grunde",Venstre,grunde.kreken.almeland@stortinget.no,Norway,NOR,Storting,unicameral legislature
2,"Amundsen, Per-Willy",Fremskrittspartiet,per-willy.amundsen@stortinget.no,Norway,NOR,Storting,unicameral legislature
3,"Arnstad, Marit",Senterpartiet,marit.arnstad@stortinget.no,Norway,NOR,Storting,unicameral legislature
4,"Asheim, Henrik",Høyre,henrik.asheim@stortinget.no,Norway,NOR,Storting,unicameral legislature


### Join

In [29]:
no_all = no_parl.merge(no_breach_count, how = "left", left_on = "email", right_on = "Filename").fillna(0).infer_objects(copy=False)
dk_all = dk_parl.merge(dk_breach_count, how = "left", left_on = "email", right_on = "Filename").fillna(0).infer_objects(copy=False)
in_all = in_df.merge(in_breach_count, how = "left", left_on = "email", right_on = "Filename").fillna(0).infer_objects(copy=False)
sg_all = sg_df.merge(sg_breach_count, how = "left", left_on = "email", right_on = "Filename").fillna(0).infer_objects(copy=False)

gr_all = (gr_df_long
          .merge(gr_breach_count, how="left", left_on="email", right_on="Filename")
          .fillna(0)
          .infer_objects(copy=False))
ng_all = (ng_df
          .merge(ng_breach_count, how = "left", left_on = "email", right_on = "Filename")
          .fillna(0)
          .infer_objects(copy=False))
br_all = (br_df
          .merge(br_breach_count, how = "left", left_on = "email", right_on = "Filename")
          .fillna(0)
          .infer_objects(copy=False))
ar_all = (ar_df
          .merge(ar_breach_count, how = "left", left_on = "email", right_on = "Filename")
          .fillna(0)
          .infer_objects(copy=False))
bihar_all = bihar_df.merge(bihar_breach_count, how = "left", left_on = "email", right_on = "Filename").fillna(0).infer_objects(copy=False)
tn_all = tn_df.merge(tn_breach_count, how = "left", left_on = "email", right_on = "Filename").fillna(0).infer_objects(copy=False)
up_all = up_df.merge(up_breach_count, how = "left", left_on = "email", right_on = "Filename").fillna(0).infer_objects(copy=False)
hp_all = hp_df.merge(hp_breach_count, how = "left", left_on = "email", right_on = "Filename").fillna(0).infer_objects(copy=False)
del_all = del_df.merge(del_breach_count, how = "left", left_on = "email", right_on = "Filename").fillna(0).infer_objects(copy=False)

In [30]:
ng_breach_count

Unnamed: 0,Filename,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Sexual orientations,Device information,...,Family members' names,Names,Email addresses,Relationship statuses,Cryptocurrency wallet addresses,Taxation records,Dates of birth,Utility bills,Tattoo status,Partial dates of birth
0,aannadori@gmail.com,37,0,0,0,3,1,0,0,0,...,0,24,45,0,0,0,10,0,0,0
1,abahmoro@yahoo.com,37,0,0,0,3,1,0,0,0,...,0,24,45,0,0,0,10,0,0,0
2,abdulahmed.ningi@outlook.com,37,0,0,0,3,1,0,0,0,...,0,24,45,0,0,0,10,0,0,0
3,abusani1@gmail.com,37,0,0,0,3,1,0,0,0,...,0,24,45,0,0,0,10,0,0,0
4,adefadahunsi19@gmail.com,37,0,0,0,3,1,0,0,0,...,0,24,45,0,0,0,10,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,sunnyside1914@gmail.com,37,0,0,0,3,1,0,0,0,...,0,24,45,0,0,0,10,0,0,0
65,victorimeh1962@yahoo.com,37,0,0,0,3,1,0,0,0,...,0,24,45,0,0,0,10,0,0,0
66,yahaya.abdullahi@nass.gov.ng,37,0,0,0,3,1,0,0,0,...,0,24,45,0,0,0,10,0,0,0
67,yariabdulazeez@gmail.com,37,0,0,0,3,1,0,0,0,...,0,24,45,0,0,0,10,0,0,0


### India is in long. Get it back to MP level

In [31]:
in_grouped = (
    in_all.groupby(['mpsno', 'source_file'], as_index=False)
    .sum(numeric_only=True)
)
in_grouped.head()

Unnamed: 0,mpsno,source_file,lastloksabha,age,noofterms,numberofsons,numberofdaughters,currentpagenumber,perpagesize,totalelements,...,Family members' names,Names,Email addresses,Relationship statuses,Cryptocurrency wallet addresses,Taxation records,Dates of birth,Utility bills,Tattoo status,Partial dates of birth
0,1,ls_13,14,77.0,2.0,2.0,1.0,1,551,551,...,1.0,283.0,533.0,0.0,2.0,0.0,139.0,0.0,0.0,2.0
1,1,ls_14,14,77.0,2.0,2.0,1.0,1,554,554,...,1.0,283.0,533.0,0.0,2.0,0.0,139.0,0.0,0.0,2.0
2,2,ls_12,14,53.0,3.0,2.0,0.0,1,529,529,...,1.0,283.0,533.0,0.0,2.0,0.0,139.0,0.0,0.0,2.0
3,2,ls_13,14,53.0,3.0,2.0,0.0,1,551,551,...,1.0,283.0,533.0,0.0,2.0,0.0,139.0,0.0,0.0,2.0
4,2,ls_14,14,53.0,3.0,2.0,0.0,1,554,554,...,1.0,283.0,533.0,0.0,2.0,0.0,139.0,0.0,0.0,2.0


In [32]:
in_grouped.shape

(2684, 164)

In [33]:
no_all.describe().round(2)

Unnamed: 0,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Sexual orientations,Device information,Photos,...,Family members' names,Names,Email addresses,Relationship statuses,Cryptocurrency wallet addresses,Taxation records,Dates of birth,Utility bills,Tattoo status,Partial dates of birth
count,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,...,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0,174.0
mean,26.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,...,0.0,17.0,30.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,26.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,...,0.0,17.0,30.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0
25%,26.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,...,0.0,17.0,30.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0
50%,26.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,...,0.0,17.0,30.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0
75%,26.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,...,0.0,17.0,30.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0
max,26.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,...,0.0,17.0,30.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0


In [34]:
dk_all.describe().round(2)

Unnamed: 0,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Sexual orientations,Device information,Photos,...,Family members' names,Names,Email addresses,Relationship statuses,Cryptocurrency wallet addresses,Taxation records,Dates of birth,Utility bills,Tattoo status,Partial dates of birth
count,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0,...,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0,186.0
mean,38.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,...,0.0,24.0,42.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,38.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,...,0.0,24.0,42.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
25%,38.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,...,0.0,24.0,42.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
50%,38.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,...,0.0,24.0,42.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
75%,38.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,...,0.0,24.0,42.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
max,38.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,...,0.0,24.0,42.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0


In [35]:
in_grouped.groupby(['source_file']).mean()

Unnamed: 0_level_0,mpsno,lastloksabha,age,noofterms,numberofsons,numberofdaughters,currentpagenumber,perpagesize,totalelements,totalpages,...,Family members' names,Names,Email addresses,Relationship statuses,Cryptocurrency wallet addresses,Taxation records,Dates of birth,Utility bills,Tattoo status,Partial dates of birth
source_file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ls_12,793.064356,17.693069,90.173267,5.435644,1.70297,1.59901,1.19802,633.752475,633.752475,1.19802,...,1.19802,339.039604,638.544554,0.0,2.39604,0.0,166.524752,0.0,0.0,2.39604
ls_13,310.280112,16.943978,86.89916,4.221289,1.627451,1.526611,1.168067,643.605042,643.605042,1.168067,...,1.168067,330.563025,622.579832,0.0,2.336134,0.0,162.361345,0.0,0.0,2.336134
ls_14,1960.614286,19.660714,89.392857,4.757143,1.664286,1.525,1.282143,710.307143,710.307143,1.282143,...,1.282143,362.846429,683.382143,0.0,2.564286,0.0,178.217857,0.0,0.0,2.564286
ls_15,3373.632768,22.887006,95.223164,4.220339,1.864407,1.740113,1.435028,733.299435,733.299435,1.435028,...,1.435028,406.112994,764.870056,0.0,2.870056,0.0,199.468927,0.0,0.0,2.870056
ls_16,4048.672377,28.788009,108.526767,4.115632,2.027837,1.873662,1.745182,893.533191,893.533191,1.745182,...,1.745182,493.88651,930.182013,0.0,3.490364,0.0,242.5803,0.0,0.0,3.490364
ls_17,4500.073022,33.103448,113.336714,3.819473,2.206897,1.957404,1.947262,1049.574037,1049.574037,1.947262,...,1.947262,551.075051,1037.890467,0.0,3.894523,0.0,270.669371,0.0,0.0,3.894523
ls_18,4998.207156,37.355932,118.583804,4.256121,1.664783,1.52919,2.07533,1128.979284,1128.979284,2.07533,...,2.071563,586.252354,1104.143126,0.0,4.143126,0.0,287.947269,0.0,0.0,4.143126


In [36]:
sg_all.describe().round(2)

Unnamed: 0,leg_start_year,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Sexual orientations,Device information,...,Family members' names,Names,Email addresses,Relationship statuses,Cryptocurrency wallet addresses,Taxation records,Dates of birth,Utility bills,Tattoo status,Partial dates of birth
count,603.0,603.0,603.0,603.0,603.0,603.0,603.0,603.0,603.0,603.0,...,603.0,603.0,603.0,603.0,603.0,603.0,603.0,603.0,603.0,603.0
mean,2013.21,68.66,0.0,0.0,0.0,3.98,1.0,0.0,0.0,0.0,...,0.0,45.77,77.61,1.0,0.0,0.0,15.92,0.0,0.0,0.0
std,7.32,4.86,0.0,0.0,0.0,0.28,0.07,0.0,0.0,0.0,...,0.0,3.24,5.49,0.07,0.0,0.0,1.13,0.0,0.0,0.0
min,2001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2006.0,69.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,...,0.0,46.0,78.0,1.0,0.0,0.0,16.0,0.0,0.0,0.0
50%,2015.0,69.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,...,0.0,46.0,78.0,1.0,0.0,0.0,16.0,0.0,0.0,0.0
75%,2021.0,69.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,...,0.0,46.0,78.0,1.0,0.0,0.0,16.0,0.0,0.0,0.0
max,2021.0,69.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,...,0.0,46.0,78.0,1.0,0.0,0.0,16.0,0.0,0.0,0.0


In [37]:
bihar_all.describe().round(2)

Unnamed: 0,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Sexual orientations,Device information,Photos,...,Family members' names,Names,Email addresses,Relationship statuses,Cryptocurrency wallet addresses,Taxation records,Dates of birth,Utility bills,Tattoo status,Partial dates of birth
count,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,...,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0,241.0
mean,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [38]:
tn_all.describe().round(2)

Unnamed: 0,photo,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Sexual orientations,Device information,...,Family members' names,Names,Email addresses,Relationship statuses,Cryptocurrency wallet addresses,Taxation records,Dates of birth,Utility bills,Tattoo status,Partial dates of birth
count,233.0,233.0,233.0,233.0,233.0,233.0,233.0,233.0,233.0,233.0,...,233.0,233.0,233.0,233.0,233.0,233.0,233.0,233.0,233.0,233.0
mean,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,4.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,4.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
25%,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,4.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
50%,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,4.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
75%,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,4.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
max,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,4.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0


In [39]:
hp_all.describe().round(2)

Unnamed: 0,photo,mobile,tele_no_res_no_,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,...,Family members' names,Names,Email addresses,Relationship statuses,Cryptocurrency wallet addresses,Taxation records,Dates of birth,Utility bills,Tattoo status,Partial dates of birth
count,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,...,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0,68.0
mean,0.0,9416969000.0,0.0,37.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,27.0,40.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0
std,0.0,510635200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,0.0,7018000000.0,0.0,37.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,27.0,40.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0
25%,0.0,9418022000.0,0.0,37.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,27.0,40.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0
50%,0.0,9418154000.0,0.0,37.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,27.0,40.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0
75%,0.0,9805725000.0,0.0,37.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,27.0,40.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0
max,0.0,9882812000.0,0.0,37.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,27.0,40.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0


In [40]:
up_all.describe().round(2)

Unnamed: 0,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Sexual orientations,Device information,Photos,...,Family members' names,Names,Email addresses,Relationship statuses,Cryptocurrency wallet addresses,Taxation records,Dates of birth,Utility bills,Tattoo status,Partial dates of birth
count,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,...,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0,106.0
mean,30.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,...,0.0,20.0,32.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,30.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,...,0.0,20.0,32.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
25%,30.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,...,0.0,20.0,32.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
50%,30.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,...,0.0,20.0,32.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
75%,30.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,...,0.0,20.0,32.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
max,30.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,...,0.0,20.0,32.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0


In [41]:
del_all.describe().round(2)

Unnamed: 0,ac_no,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Sexual orientations,Device information,...,Family members' names,Names,Email addresses,Relationship statuses,Cryptocurrency wallet addresses,Taxation records,Dates of birth,Utility bills,Tattoo status,Partial dates of birth
count,43.0,43.0,43.0,43.0,43.0,43.0,43.0,43.0,43.0,43.0,...,43.0,43.0,43.0,43.0,43.0,43.0,43.0,43.0,43.0,43.0
mean,44.47,32.23,0.98,0.0,0.0,2.93,0.98,0.0,0.0,0.98,...,0.0,25.4,38.09,0.98,0.0,0.0,6.84,0.0,0.0,0.0
std,16.26,5.03,0.15,0.0,0.0,0.46,0.15,0.0,0.0,0.15,...,0.0,3.96,5.95,0.15,0.0,0.0,1.07,0.0,0.0,0.0
min,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,29.5,33.0,1.0,0.0,0.0,3.0,1.0,0.0,0.0,1.0,...,0.0,26.0,39.0,1.0,0.0,0.0,7.0,0.0,0.0,0.0
50%,45.0,33.0,1.0,0.0,0.0,3.0,1.0,0.0,0.0,1.0,...,0.0,26.0,39.0,1.0,0.0,0.0,7.0,0.0,0.0,0.0
75%,58.5,33.0,1.0,0.0,0.0,3.0,1.0,0.0,0.0,1.0,...,0.0,26.0,39.0,1.0,0.0,0.0,7.0,0.0,0.0,0.0
max,70.0,33.0,1.0,0.0,0.0,3.0,1.0,0.0,0.0,1.0,...,0.0,26.0,39.0,1.0,0.0,0.0,7.0,0.0,0.0,0.0


In [42]:
ng_all.describe().round(2)

Unnamed: 0,id,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Sexual orientations,Device information,...,Family members' names,Names,Email addresses,Relationship statuses,Cryptocurrency wallet addresses,Taxation records,Dates of birth,Utility bills,Tattoo status,Partial dates of birth
count,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,...,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0,69.0
mean,493.04,37.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,...,0.0,24.0,45.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
std,162.86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,87.0,37.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,...,0.0,24.0,45.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
25%,408.0,37.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,...,0.0,24.0,45.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
50%,533.0,37.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,...,0.0,24.0,45.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
75%,623.0,37.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,...,0.0,24.0,45.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
max,694.0,37.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,...,0.0,24.0,45.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0


In [43]:
gr_all.describe().round(2)

Unnamed: 0,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Sexual orientations,Device information,Photos,...,Family members' names,Names,Email addresses,Relationship statuses,Cryptocurrency wallet addresses,Taxation records,Dates of birth,Utility bills,Tattoo status,Partial dates of birth
count,382.0,382.0,382.0,382.0,382.0,382.0,382.0,382.0,382.0,382.0,...,382.0,382.0,382.0,382.0,382.0,382.0,382.0,382.0,382.0,382.0
mean,46.63,0.0,0.0,0.0,3.97,0.99,0.0,0.0,0.0,0.0,...,0.0,26.79,52.58,0.99,0.0,0.0,9.92,0.0,0.0,0.0
std,4.15,0.0,0.0,0.0,0.35,0.09,0.0,0.0,0.0,0.0,...,0.0,2.39,4.68,0.09,0.0,0.0,0.88,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,...,0.0,27.0,53.0,1.0,0.0,0.0,10.0,0.0,0.0,0.0
50%,47.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,...,0.0,27.0,53.0,1.0,0.0,0.0,10.0,0.0,0.0,0.0
75%,47.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,...,0.0,27.0,53.0,1.0,0.0,0.0,10.0,0.0,0.0,0.0
max,47.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,...,0.0,27.0,53.0,1.0,0.0,0.0,10.0,0.0,0.0,0.0


In [44]:
ar_all.describe().round(2)

Unnamed: 0,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Sexual orientations,Device information,Photos,...,Family members' names,Names,Email addresses,Relationship statuses,Cryptocurrency wallet addresses,Taxation records,Dates of birth,Utility bills,Tattoo status,Partial dates of birth
count,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,...,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0
mean,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
br_all.describe().round(2)

Unnamed: 0,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Sexual orientations,Device information,Photos,...,Family members' names,Names,Email addresses,Relationship statuses,Cryptocurrency wallet addresses,Taxation records,Dates of birth,Utility bills,Tattoo status,Partial dates of birth
count,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,...,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0,81.0
mean,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
75%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
