### Download and Merge EveryPol Data

In [1]:
import os
import pandas as pd
import requests
from io import StringIO

In [2]:
# Load the legislature index and tag every row with a 1-based `row_id`
# (index position + 1); later cells use it in file names and as the merge key.
data = (
    pd.read_csv("../data/everypol-walkthrough.csv")
    .assign(row_id=lambda frame: frame.index + 1)
)
data.head()

Unnamed: 0,country,legislature,term,leg_start_year,row_count,n_unique_emails,person_count_legistype,ltype,url,cc,leg_start_date,lastmod,cc3,pop2024,lastmod_year,row_id
0,India,Lok Sabha,16th Lok Sabha,2014,541,511,541,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,IN,2014-05-26,1557812040,IND,1441720000.0,2019,1
1,China,National People’s Congress,12th National People’s Congress,2013,2987,0,2956,unicameral legislature,https://cdn.rawgit.com/everypolitician/everypo...,CN,2013-03-05,1554305411,CHN,1410132000.0,2019,2
2,United-States-of-America,House of Representatives,113th Congress,2013,449,0,1739,lower house,https://cdn.rawgit.com/everypolitician/everypo...,US,2013-01-06,1557715874,USA,336610700.0,2019,3
3,United-States-of-America,House of Representatives,114th Congress,2015,447,0,1739,lower house,https://cdn.rawgit.com/everypolitician/everypo...,US,2015-01-06,1557715874,USA,336610700.0,2019,4
4,United-States-of-America,House of Representatives,108th Congress,2003,444,0,1739,lower house,https://cdn.rawgit.com/everypolitician/everypo...,US,2003-01-06,1557715874,USA,336610700.0,2019,5


In [3]:
# Keep only the legislatures whose CSV has more than 30 unique e-mail addresses.
# The shape is shown before and after so the size of the cut is visible.
print(data.shape)
has_enough_emails = data["n_unique_emails"] > 30
sm_data = data.loc[has_enough_emails]
sm_data.shape

(757, 16)


(116, 16)

In [4]:
# Distribution of unique e-mail counts across the retained legislatures.
email_counts = sm_data["n_unique_emails"]
email_counts.describe()

count    116.000000
mean     116.948276
std      108.956142
min       31.000000
25%       51.750000
50%       73.000000
75%      143.250000
max      646.000000
Name: n_unique_emails, dtype: float64

In [5]:
# Download the member-level CSV of every legislature in `sm_data`, keep a copy
# on disk, and collect the parsed frames in `dfs` for concatenation below.
output_folder = "../data/legislature_data"
os.makedirs(output_folder, exist_ok=True)

# Seconds to wait on the server for one file. Without a timeout,
# requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT_S = 30

dfs = []     # one DataFrame per successfully fetched legislature
failed = []  # (row_id, label, error message) for every legislature that was skipped

for _, row in sm_data.iterrows():
    label = f"{row['country']} - {row['legislature']} - {row['leg_start_year']}"
    try:
        print(f"Fetching data for {label}")
        response = requests.get(row['url'], timeout=REQUEST_TIMEOUT_S)
        response.raise_for_status()

        # The name parts come straight from the data; neutralise path separators
        # so a value containing "/" or "\" cannot point outside `output_folder`.
        file_stem = f"{row['country']}_{row['legislature']}_{row['leg_start_year']}_{row['row_id']}"
        file_stem = file_stem.replace("/", "-").replace("\\", "-")
        csv_filename = os.path.join(output_folder, f"{file_stem}.csv")
        with open(csv_filename, "wb") as file:
            file.write(response.content)
        print(f"Saved {csv_filename}")

        # Parse the bytes that were just saved rather than `response.text`:
        # `.text` is decoded using the charset from the HTTP headers (ISO-8859-1
        # when a text/* response declares none), which can garble non-ASCII names
        # and make the frame disagree with the file on disk.
        # NOTE(review): assumes the source CSVs are UTF-8 (pandas' default) — confirm.
        df = pd.read_csv(csv_filename)

        # Add key: links each member row back to its legislature row in `sm_data`.
        df["row_id"] = row["row_id"]

        dfs.append(df)
    except Exception as e:
        # Best effort: one bad legislature must not abort the whole run, but the
        # failure is recorded so it can be inspected or retried afterwards.
        failed.append((row["row_id"], label, str(e)))
        print(f"Failed to process {label}: {e}")

print(f"Fetched {len(dfs)} of {len(sm_data)} legislatures; {len(failed)} failed")

Fetching data for India - Lok Sabha - 2014
Saved ../data/legislature_data/India_Lok Sabha_2014_1.csv
Fetching data for Nigeria - Senate - 2015
Saved ../data/legislature_data/Nigeria_Senate_2015_45.csv
Fetching data for Nigeria - House of Representatives - 2015
Saved ../data/legislature_data/Nigeria_House of Representatives_2015_46.csv
Fetching data for Iran - Majles - 2012
Saved ../data/legislature_data/Iran_Majles_2012_62.csv
Fetching data for Iran - Majles - 2016
Saved ../data/legislature_data/Iran_Majles_2016_63.csv
Fetching data for Tanzania - National Assembly - 2015
Saved ../data/legislature_data/Tanzania_National Assembly_2015_110.csv
Fetching data for Tanzania - National Assembly - 2005
Saved ../data/legislature_data/Tanzania_National Assembly_2005_111.csv
Fetching data for Tanzania - National Assembly - 2010
Saved ../data/legislature_data/Tanzania_National Assembly_2010_113.csv
Fetching data for UK - House of Commons - 2015
Saved ../data/legislature_data/UK_House of Commons_20

Saved ../data/legislature_data/Denmark_Folketing_2011_392.csv
Fetching data for Finland - Eduskunta - 2003
Saved ../data/legislature_data/Finland_Eduskunta_2003_408.csv
Fetching data for Finland - Eduskunta - 2015
Saved ../data/legislature_data/Finland_Eduskunta_2015_409.csv
Fetching data for Finland - Eduskunta - 2011
Saved ../data/legislature_data/Finland_Eduskunta_2011_410.csv
Fetching data for Finland - Eduskunta - 2007
Saved ../data/legislature_data/Finland_Eduskunta_2007_417.csv
Fetching data for Slovakia - National Council - 2012
Saved ../data/legislature_data/Slovakia_National Council_2012_441.csv
Fetching data for Slovakia - National Council - 2006
Saved ../data/legislature_data/Slovakia_National Council_2006_442.csv
Fetching data for Slovakia - National Council - 2016
Saved ../data/legislature_data/Slovakia_National Council_2016_443.csv
Fetching data for Slovakia - National Council - 2010
Saved ../data/legislature_data/Slovakia_National Council_2010_444.csv
Fetching data for 

In [6]:
# Stack the per-legislature frames row-wise into one table with a fresh 0..n-1 index.
combined_df = pd.concat(dfs, axis=0, ignore_index=True)
combined_df.shape

(25087, 20)

In [7]:
# Attach the legislature-level metadata to every member row.
# `validate="many_to_one"` makes pandas raise a MergeError if `row_id` is ever
# duplicated in `sm_data`, instead of silently multiplying member rows.
# Both frames carry a `term` column, so the result has `term_x` (from the member
# CSV) and `term_y` (from the legislature index).
final_df = pd.merge(
    combined_df,
    sm_data,
    on="row_id",
    how="left",
    validate="many_to_one",
)
list(final_df)

['id',
 'name',
 'sort_name',
 'email',
 'twitter',
 'facebook',
 'group',
 'group_id',
 'area_id',
 'area',
 'chamber',
 'term_x',
 'start_date',
 'end_date',
 'image',
 'gender',
 'wikidata',
 'wikidata_group',
 'wikidata_area',
 'row_id',
 'country',
 'legislature',
 'term_y',
 'leg_start_year',
 'row_count',
 'n_unique_emails',
 'person_count_legistype',
 'ltype',
 'url',
 'cc',
 'leg_start_date',
 'lastmod',
 'cc3',
 'pop2024',
 'lastmod_year']

In [10]:
# Persist the merged member + legislature table next to the per-legislature files.
combined_csv_path = os.path.join(output_folder, "everypol_combined_legislature_data.csv")
final_df.to_csv(combined_csv_path, index=False)

In [14]:
# Get unique emails for running through HIBP.
# `Series.unique()` keeps NaN as a value, so any member without an address would
# end up as an empty row in the export. Drop missing and blank entries and strip
# surrounding whitespace before de-duplicating (first-seen order is preserved).
unique_emails = final_df["email"].dropna().astype(str).str.strip()
unique_emails = unique_emails[unique_emails != ""].drop_duplicates()
unique_emails.to_frame(name="email").to_csv("../data/everypol_unique_emails.csv", index=False)