### Analysis of Everypol (after JSON to pd)

In [1]:
import os
import pandas as pd
import json
import ast
import openpyxl

In [2]:
def process_json_files_to_matrix(json_folder):
    """
    Process JSON files and create a matrix of filenames vs names present in files.
    
    Args:
        json_folder (str): Path to folder containing JSON files
        
    Returns:
        pd.DataFrame: Matrix with filenames as rows and all unique names as columns.
                     Values are boolean indicating if name is present in file.
    """
    
    all_names = set()
    file_list = [f for f in os.listdir(json_folder) if f.endswith(".json")]

    for filename in file_list:
        file_path = os.path.join(json_folder, filename)
        with open(file_path, "r") as file:
            try:
                data = json.load(file)
                
                if isinstance(data, dict):
                    data = [data]
                elif not isinstance(data, list):
                    data = []
                
                # Extract names
                all_names.update(entry["Name"] for entry in data if isinstance(entry, dict) and "Name" in entry)
            except (json.JSONDecodeError, TypeError):
                pass
    
    all_names = sorted(all_names)

    df = pd.DataFrame(columns=["Filename"] + all_names)

    for filename in file_list:
        file_path = os.path.join(json_folder, filename)
        with open(file_path, "r") as file:
            try:
                data = json.load(file)
                
                # Ensure data is a list
                if isinstance(data, dict):
                    data = [data]
                elif not isinstance(data, list):
                    data = []

                present_names = {entry["Name"] for entry in data if isinstance(entry, dict) and "Name" in entry}
            except (json.JSONDecodeError, TypeError):
                present_names = set()
        
        row = {"Filename": filename.replace(".json", "")}
        row.update({name: name in present_names for name in all_names})
        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)

    return df

In [3]:
def clean_email_column_no_dedupe(df, column_name="email"):
    """
    Cleans the specified email column in a DataFrame by:
    1. Stripping whitespace, converting to lowercase, and removing commas.
    2. Dropping rows where the email contains only a single letter or symbol.
    3. Dropping rows where the email is NaN.
    4. Valid email

    Args:
        df (pd.DataFrame): The DataFrame to clean.
        column_name (str): The column to process (default: "email").

    Returns:
        pd.DataFrame: Cleaned DataFrame (modification done safely).
    """
    if column_name in df.columns:
        df = df.copy()
        df[column_name] = df[column_name].str.strip().str.lower().str.replace(",", "", regex=True).str.replace(" ", "")
        df = df[~df[column_name].str.match(r"^[A-Za-z,_-]$", na=False)]
        df = df.dropna(subset=[column_name])
        
        email_regex = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'

        df = df[df[column_name].str.match(email_regex, na=False)]

    return df

In [4]:
everypol_hibp = process_json_files_to_matrix("../data/everypol/everypol_hibp")
everypol_hibp.shape

(8535, 220)

In [5]:
everypol_hibp.head()

Unnamed: 0,Filename,000webhost,123RF,2844Breaches,500px,8fit,ABFRL,AKP,ActMobile,Acuity,...,Zacks,Zomato,Zynga,bigbasket,db8151dd,digiDirect,eThekwiniMunicipality,iMesh,ixigo,piZap
0,wouter.raskin@dekamer.be,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,k.verhoeven@tweedekamer.nl,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
2,lcoffice@dphk.org,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,pk.sreemathi@sansad.nic.in,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,edurubio@parlamento.gub.uy,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [6]:
long_everypol_hibp = pd.melt(
    everypol_hibp,
    id_vars=['Filename'],
    var_name='Breach',
    value_name='Present'
)
long_everypol_hibp.shape

(1869165, 3)

In [7]:
long_everypol_hibp.to_csv("../data/everypol_hibp.csv", index=False)

In [8]:
breaches = pd.read_csv("../data/hipb_01_2025_breaches_data.csv")
breaches.head()

Unnamed: 0,Name,Title,Domain,BreachDate,AddedDate,ModifiedDate,PwnCount,Description,LogoPath,DataClasses,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree
0,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,"In approximately March 2015, the free web host...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'IP addresses', 'Names', '...",True,False,False,False,False,False,False
1,123RF,123RF,123rf.com,2020-03-22,2020-11-15T00:59:50Z,2020-11-15T01:07:10Z,8661578,"In March 2020, the stock photo site <a href=""h...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'IP addresses', 'Names', '...",True,False,False,False,False,False,False
2,126,126,126.com,2012-01-01,2016-10-08T07:46:05Z,2016-10-08T07:46:05Z,6414191,"In approximately 2012, it's alleged that the C...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'Passwords']",False,False,False,False,False,False,False
3,17Media,17,17app.co,2016-04-19,2016-07-08T01:55:03Z,2016-07-08T01:55:03Z,4009640,"In April 2016, customer data obtained from the...",https://haveibeenpwned.com/Content/Images/Pwne...,"['Device information', 'Email addresses', 'IP ...",True,False,False,False,False,False,False
4,17173,17173,17173.com,2011-12-28,2018-04-28T04:53:15Z,2018-04-28T04:53:15Z,7485802,"In late 2011, <a href=""https://news.softpedia....",https://haveibeenpwned.com/Content/Images/Pwne...,"['Email addresses', 'Passwords', 'Usernames']",False,False,False,False,False,False,False


In [9]:
breaches["DataClasses"] = breaches["DataClasses"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
all_categories = set(item for sublist in breaches["DataClasses"] for item in sublist)
expanded_df = pd.DataFrame(
    {category: breaches["DataClasses"].apply(lambda x: category in x) for category in all_categories}
)
breaches = pd.concat([breaches.drop(columns=["DataClasses"]), expanded_df], axis=1)

In [10]:
breaches_everypol_hibp = long_everypol_hibp.merge(breaches, left_on="Breach", right_on="Name", how="left")
breaches_everypol_hibp.head()

Unnamed: 0,Filename,Breach,Present,Name,Title,Domain,BreachDate,AddedDate,ModifiedDate,PwnCount,...,Employers,Credit card CVV,Security questions and answers,Drug habits,Education levels,Travel plans,Ages,Loan information,Company names,Nicknames
0,wouter.raskin@dekamer.be,000webhost,False,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,...,False,False,False,False,False,False,False,False,False,False
1,k.verhoeven@tweedekamer.nl,000webhost,False,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,...,False,False,False,False,False,False,False,False,False,False
2,lcoffice@dphk.org,000webhost,False,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,...,False,False,False,False,False,False,False,False,False,False
3,pk.sreemathi@sansad.nic.in,000webhost,False,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,...,False,False,False,False,False,False,False,False,False,False
4,edurubio@parlamento.gub.uy,000webhost,False,000webhost,000webhost,000webhost.com,2015-03-01,2015-10-26T23:35:45Z,2017-12-10T21:44:27Z,14936670,...,False,False,False,False,False,False,False,False,False,False


In [11]:
boolean_columns = breaches_everypol_hibp.select_dtypes(include=['bool'])
grouped_bool_counts = boolean_columns.groupby(breaches_everypol_hibp["Filename"]).sum().reset_index()
grouped_bool_counts.describe()

Unnamed: 0,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Email messages,Payment histories,Political donations,...,Employers,Credit card CVV,Security questions and answers,Drug habits,Education levels,Travel plans,Ages,Loan information,Company names,Nicknames
count,8535.0,8535.0,8535.0,8535.0,8535.0,8535.0,8535.0,8535.0,8535.0,8535.0,...,8535.0,8535.0,8535.0,8535.0,8535.0,8535.0,8535.0,8535.0,8535.0,8535.0
mean,207.0,1.0,0.0,0.0,12.0,2.0,1.0,5.0,3.0,1.0,...,16.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,207.0,1.0,0.0,0.0,12.0,2.0,1.0,5.0,3.0,1.0,...,16.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
25%,207.0,1.0,0.0,0.0,12.0,2.0,1.0,5.0,3.0,1.0,...,16.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
50%,207.0,1.0,0.0,0.0,12.0,2.0,1.0,5.0,3.0,1.0,...,16.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
75%,207.0,1.0,0.0,0.0,12.0,2.0,1.0,5.0,3.0,1.0,...,16.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
max,207.0,1.0,0.0,0.0,12.0,2.0,1.0,5.0,3.0,1.0,...,16.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0


In [12]:
grouped_bool_counts.head()

Unnamed: 0,Filename,IsVerified,IsFabricated,IsSensitive,IsRetired,IsSpamList,IsMalware,IsSubscriptionFree,Email messages,Payment histories,...,Employers,Credit card CVV,Security questions and answers,Drug habits,Education levels,Travel plans,Ages,Loan information,Company names,Nicknames
0,(i)joice.george@sansad.nic.in,207,1,0,0,12,2,1,5,3,...,16,0,1,0,2,0,0,0,0,0
1,1.office@bjpanda.org,207,1,0,0,12,2,1,5,3,...,16,0,1,0,2,0,0,0,0,0
2,101bbb@naver.com,207,1,0,0,12,2,1,5,3,...,16,0,1,0,2,0,0,0,0,0
3,2016kimkj@gmail.com,207,1,0,0,12,2,1,5,3,...,16,0,1,0,2,0,0,0,0,0
4,2016kimss@gmail.com,207,1,0,0,12,2,1,5,3,...,16,0,1,0,2,0,0,0,0,0


In [13]:
pol_dat = pd.read_csv("../data/everypol/everypol_combined_legislature_data.csv", low_memory=False)
pol_dat.head()

Unnamed: 0,id,name,sort_name,email,twitter,facebook,group,group_id,area_id,area,...,n_unique_emails,person_count_legistype,ltype,url,cc,leg_start_date,lastmod,cc3,pop2024,lastmod_year
0,35ad9676-8485-4137-9a16-50f2844f3ab2,"Adhalrao Patil, Shri Shivaji","Adhalrao Patil, Shri Shivaji",shivajirao@sansad.nic.in,,,Shiv Sena,SS,shirur,Shirur,...,511,541,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IN,2014-05-26,1557812040,IND,1441720000.0,2019
1,f1f0f31d-ddb8-4681-82c0-4484d0d26ee3,"Adhikari, Shri Deepak (Dev)","Adhikari, Shri Deepak (Dev)",adhikari.deepak@sansad.nic.in,idevadhikari,IamTheDev,All India Trinamool Congress,AITC,ghatal,Ghatal,...,511,541,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IN,2014-05-26,1557812040,IND,1441720000.0,2019
2,bb68e3c6-de79-4d07-ad1f-90bb9a61ded0,"Adhikari, Shri Sisir Kumar","Adhikari, Shri Sisir Kumar",sisiradhikari76@yahoo.com,,,All India Trinamool Congress,AITC,kanthi,Kanthi,...,511,541,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IN,2014-05-26,1557812040,IND,1441720000.0,2019
3,c9a891c3-adb6-4581-b93b-72f9c00beacc,"Adhikari, Shri Suvendu","Adhikari, Shri Suvendu",adhikari.suvendu@sansad.nic.in,,,All India Trinamool Congress,AITC,tamluk,Tamluk,...,511,541,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IN,2014-05-26,1557812040,IND,1441720000.0,2019
4,5c22c70d-8317-4f94-97ec-2ccadd1acdf3,"Adityanath , Shri Yogi","Adityanath , Shri Yogi",yogi.adityanath@sansad.nic.in,,,Bharatiya Janata Party,BJP,gorakhpur,Gorakhpur,...,511,541,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IN,2014-05-26,1557812040,IND,1441720000.0,2019


In [14]:
pol_dat.shape

(25087, 35)

In [15]:
pol_dat = clean_email_column_no_dedupe(pol_dat)
pol_dat.shape

(13798, 35)

In [16]:
pol_hibp = pol_dat.merge(grouped_bool_counts, left_on="email", right_on="Filename", how="left")
pol_hibp.head()

Unnamed: 0,id,name,sort_name,email,twitter,facebook,group,group_id,area_id,area,...,Employers,Credit card CVV,Security questions and answers,Drug habits,Education levels,Travel plans,Ages,Loan information,Company names,Nicknames
0,35ad9676-8485-4137-9a16-50f2844f3ab2,"Adhalrao Patil, Shri Shivaji","Adhalrao Patil, Shri Shivaji",shivajirao@sansad.nic.in,,,Shiv Sena,SS,shirur,Shirur,...,16.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
1,f1f0f31d-ddb8-4681-82c0-4484d0d26ee3,"Adhikari, Shri Deepak (Dev)","Adhikari, Shri Deepak (Dev)",adhikari.deepak@sansad.nic.in,idevadhikari,IamTheDev,All India Trinamool Congress,AITC,ghatal,Ghatal,...,16.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
2,bb68e3c6-de79-4d07-ad1f-90bb9a61ded0,"Adhikari, Shri Sisir Kumar","Adhikari, Shri Sisir Kumar",sisiradhikari76@yahoo.com,,,All India Trinamool Congress,AITC,kanthi,Kanthi,...,16.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
3,c9a891c3-adb6-4581-b93b-72f9c00beacc,"Adhikari, Shri Suvendu","Adhikari, Shri Suvendu",adhikari.suvendu@sansad.nic.in,,,All India Trinamool Congress,AITC,tamluk,Tamluk,...,16.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
4,5c22c70d-8317-4f94-97ec-2ccadd1acdf3,"Adityanath , Shri Yogi","Adityanath , Shri Yogi",yogi.adityanath@sansad.nic.in,,,Bharatiya Janata Party,BJP,gorakhpur,Gorakhpur,...,16.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0


In [17]:
pol_hibp.describe().round(2)

Unnamed: 0,row_id,leg_start_year,row_count,n_unique_emails,person_count_legistype,lastmod,pop2024,lastmod_year,IsVerified,IsFabricated,...,Employers,Credit card CVV,Security questions and answers,Drug habits,Education levels,Travel plans,Ages,Loan information,Company names,Nicknames
count,13798.0,13798.0,13798.0,13798.0,13798.0,13798.0,13266.0,13798.0,13128.0,13128.0,...,13128.0,13128.0,13128.0,13128.0,13128.0,13128.0,13128.0,13128.0,13128.0,13128.0
mean,263.89,2013.62,318.33,216.76,675.62,1556704000.0,88893240.0,2018.99,207.0,1.0,...,16.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
std,174.66,3.72,196.91,170.04,511.54,1800954.0,272269400.0,0.07,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,1.0,1997.0,31.0,31.0,31.0,1542919000.0,80341.0,2018.0,207.0,1.0,...,16.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
25%,119.0,2012.0,158.0,78.0,267.0,1556117000.0,6378654.0,2019.0,207.0,1.0,...,16.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
50%,215.0,2015.0,291.0,149.0,541.0,1557305000.0,26889130.0,2019.0,207.0,1.0,...,16.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
75%,390.0,2016.0,432.0,314.0,964.0,1557812000.0,68556800.0,2019.0,207.0,1.0,...,16.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
max,749.0,2019.0,686.0,646.0,1783.0,1557904000.0,1441720000.0,2019.0,207.0,1.0,...,16.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0


In [18]:
columns_to_fill = list(all_categories) + ['IsVerified', 'IsFabricated', 'IsSensitive', 
                   'IsRetired', 'IsSpamList', 'IsMalware', 'IsSubscriptionFree']
pol_hibp[columns_to_fill] = pol_hibp[columns_to_fill].fillna(0)

In [19]:
pol_hibp.groupby(['country', 'legislature', 'leg_start_year']).mean(numeric_only=True).reset_index().describe().round(2)

Unnamed: 0,leg_start_year,row_id,row_count,n_unique_emails,person_count_legistype,lastmod,pop2024,lastmod_year,IsVerified,IsFabricated,...,Employers,Credit card CVV,Security questions and answers,Drug habits,Education levels,Travel plans,Ages,Loan information,Company names,Nicknames
count,114.0,114.0,114.0,114.0,114.0,114.0,105.0,114.0,114.0,114.0,...,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0
mean,2012.75,348.47,214.21,116.27,478.28,1556414000.0,38874960.0,2018.98,195.42,0.94,...,15.1,0.0,0.94,0.0,1.89,0.0,0.0,0.0,0.0,0.0
std,4.08,187.53,157.16,109.53,442.56,2479202.0,143128200.0,0.13,30.43,0.15,...,2.35,0.0,0.15,0.0,0.29,0.0,0.0,0.0,0.0,0.0
min,1997.0,1.0,31.0,31.0,31.0,1542919000.0,80341.0,2018.0,59.14,0.29,...,4.57,0.0,0.29,0.0,0.57,0.0,0.0,0.0,0.0,0.0
25%,2011.0,196.5,111.0,51.25,171.75,1555979000.0,5262079.0,2019.0,202.41,0.98,...,15.65,0.0,0.98,0.0,1.96,0.0,0.0,0.0,0.0,0.0
50%,2014.0,365.5,170.0,73.0,292.0,1557280000.0,10319100.0,2019.0,207.0,1.0,...,16.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
75%,2016.0,488.75,297.0,142.25,606.0,1557718000.0,40409180.0,2019.0,207.0,1.0,...,16.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
max,2019.0,749.0,686.0,646.0,1783.0,1557904000.0,1441720000.0,2019.0,207.0,1.0,...,16.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0


### Let's merge the rest (India, Eur.)

In [20]:
no_hibp = process_json_files_to_matrix("../data/no_hibp")
dk_hibp = process_json_files_to_matrix("../data/dk_hibp")
in_hibp = process_json_files_to_matrix("../data/india/ls_hibp")
bihar_hibp = process_json_files_to_matrix("../data/india/bihar_hibp")
tn_hibp = process_json_files_to_matrix("../data/india/tn_hibp")
up_hibp = process_json_files_to_matrix("../data/india/up_hibp")
hp_hibp = process_json_files_to_matrix("../data/india/hp_hibp")
del_hibp = process_json_files_to_matrix("../data/india/delhi_hibp")

In [21]:
long_no_hibp  = pd.melt(no_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
dk_no_hibp    = pd.melt(dk_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
in_no_hibp    = pd.melt(in_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
bihar_no_hibp = pd.melt(bihar_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
tn_no_hibp    = pd.melt(tn_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
up_no_hibp    = pd.melt(up_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
hp_no_hibp    = pd.melt(hp_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')
del_no_hibp   = pd.melt(del_hibp, id_vars=['Filename'], var_name='Breach', value_name='Present')

In [22]:
# Norway
no_parl = pd.read_csv("../data/no_parliament.csv")
no_parl = clean_email_column_no_dedupe(no_parl)
print(no_parl.shape)

# Denmark
wb = openpyxl.load_workbook("../data/danish_parliament_1_2025.xlsx")
ws = wb.active  
data = list(ws.values)
dk_parl = pd.DataFrame(data)
dk_parl.columns = dk_parl.iloc[0]
dk_parl = dk_parl[1:].reset_index(drop=True)
dk_parl = clean_email_column_no_dedupe(dk_parl)
print(dk_parl.shape)

# Bihar
with open('../data/india/bihar/bihar.txt', 'r', encoding='utf-8') as file:
    lines = file.read().splitlines()
    lines = [line.split('\t') for line in lines]

bihar_df = pd.DataFrame(lines, columns=['sr_no', 'photo', 'constituency', 'name', 'gender', 'party', 'contact', 'email'])
bihar_df = clean_email_column_no_dedupe(bihar_df)
print(bihar_df.shape)

# UP
up_df = pd.read_csv("../data/india/up/up_18_mlas.csv", usecols=['email'], encoding="utf-8", quotechar='"', sep=",", engine="python")
up_df = clean_email_column_no_dedupe(up_df)
print(up_df.shape)

# HP
hp_df = pd.read_csv("../data/india/hp_14.csv")
hp_df.columns = hp_df.columns.str.lower()
hp_df = clean_email_column_no_dedupe(hp_df)
print(hp_df.shape)

# TN
tn_df = pd.read_csv("../data/india/tn/tn.csv")
tn_df.rename(columns={"Email Address": "email"}, inplace=True)
tn_df = clean_email_column_no_dedupe(tn_df)
print(tn_df.shape)

# Delhi
del_df = pd.read_csv("../data/india/delhi/delhi_7th_assembly.csv")
del_df.rename(columns={"Email": "email"}, inplace=True)
del_df = clean_email_column_no_dedupe(del_df)
print(del_df.shape)

# India
in_df = pd.read_csv("../data/india/ls_long.csv")
in_df = clean_email_column_no_dedupe(del_df)
print(in_df.shape)

FileNotFoundError: [Errno 2] No such file or directory: '../data/no_parliament.csv'

In [None]:
# Add metadata
no_parl = no_parl.assign(country='Norway', year=2025, legislature='National Legislature')
dk_parl = dk_parl.assign(country='Denmark', year=2025, legislature='National Legislature')
in_hibp = in_df.assign(country='India', year=2025, legislature='National Legislature')

bihar_hibp = bihar_df.assign(country='India', year=2025, legislature='Bihar Legislature')
tn_df = tn_df.assign(country='India', year=2025, legislature='Tamil Nadu State Legislature')
up_df = up_df.assign(country='India', year=2025, legislature='UP State Legislature')
hp_df = hp_df.assign(country='India', year=2025, legislature='HP Legislature')
del_df = del_df.assign(country='India', year=2025, legislature='Delhi Legislature')