# Preprocessing House of Representatives Data

The member data was downloaded from https://voteview.com/data and saved directly to `input_path`.

Congress 112-118 (2011-2024)

In [None]:
from google.colab import drive

# Project folders on Google Drive: the per-Congress member CSVs are read from
# input_path, and the processed table is written under output_path.
input_path = '/content/drive/MyDrive/ANLP Project/Final_Version/Data/House_Members/'
output_path = '/content/drive/MyDrive/ANLP Project/Final_Version/Data/'

# Mount Drive so the folders above are reachable from this runtime.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# imports
import pandas as pd

We only need the `congress`, `district`, `state`, `party`, and `bioname` columns (a `year` column is added in the next step) for the 50 states, covering 2012 to 2023.

In [None]:
# Full names keyed by postal abbreviation: the 50 states first, then DC.
state_map = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona',
    'AR': 'Arkansas', 'CA': 'California', 'CO': 'Colorado',
    'CT': 'Connecticut', 'DE': 'Delaware', 'FL': 'Florida',
    'GA': 'Georgia', 'HI': 'Hawaii', 'ID': 'Idaho',
    'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa',
    'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisiana',
    'ME': 'Maine', 'MD': 'Maryland', 'MA': 'Massachusetts',
    'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi',
    'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska',
    'NV': 'Nevada', 'NH': 'New Hampshire', 'NJ': 'New Jersey',
    'NM': 'New Mexico', 'NY': 'New York', 'NC': 'North Carolina',
    'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma',
    'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island',
    'SC': 'South Carolina', 'SD': 'South Dakota', 'TN': 'Tennessee',
    'TX': 'Texas', 'UT': 'Utah', 'VT': 'Vermont',
    'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia',
    'WI': 'Wisconsin', 'WY': 'Wyoming',
    'DC': 'District of Columbia',
}

# 50 US States: every abbreviation in state_map except DC, in the same order.
US_STATES = [abbrev for abbrev in state_map if abbrev != 'DC']

# Numeric party codes that get a readable label; other codes are not listed.
party_map = {
    100: 'Democrat',
    200: 'Republican',
}

# Load the member file for each Congress (112th-118th) and keep only the rows
# whose chamber is 'House' and whose state is one of the 50 states.
houses = []
for congress_id in range(112, 118 + 1):
    members = pd.read_csv(input_path + f'H{congress_id}_members.csv')
    is_house_row = (members['chamber'] == 'House') & members['state_abbrev'].isin(US_STATES)
    # .copy() makes `house` an independent frame, so the column assignments
    # below never act on a filtered slice (no SettingWithCopyWarning).
    house = members.loc[is_house_row].copy()
    house['state'] = house['state_abbrev'].map(state_map)
    # Nullable integer dtype keeps district numbers integral even if some are missing.
    house['district'] = pd.to_numeric(house['district_code']).astype('Int64')
    # party_code values other than 100/200 are not in party_map and become NaN.
    house['party'] = house['party_code'].map(party_map)
    houses.append(house[['congress', 'district', 'state', 'party', 'bioname']])

# ignore_index gives every row a unique label instead of repeating each
# file's own row numbers in the combined frame.
houserep = pd.concat(houses, ignore_index=True)
len(houserep)

3119

Add year information: expand the rows so that each House member has one row for every year they served in 2012-2023 (2011 and 2024 are excluded).

In [None]:
# Calendar years attributed to each Congress. Only 2012-2023 are kept.
congress_to_years = {
    112: [2012],  # 2011 is intentionally left out
    113: [2013, 2014],
    114: [2015, 2016],
    115: [2017, 2018],
    116: [2019, 2020],
    117: [2021, 2022],
    118: [2023],  # 2024 is intentionally left out
}

# Attach each row's list of years, then explode to one row per year.
# This replaces a row-by-row iterrows() loop, which is slow and passes every
# row through an object-dtype Series.
houserep = (
    houserep
    .assign(year=houserep['congress'].map(congress_to_years))
    # A congress missing from the mapping has no years, so it contributes no rows.
    .dropna(subset=['year'])
    .explode('year')
    .astype({'year': 'int64'})
    .sort_values(by=['year', 'state', 'district'])
    # Reset after sorting so the index follows the final row order.
    .reset_index(drop=True)
)

len(houserep)

5348

In [None]:
# Save the processed table. `index` is left at its default, so the DataFrame
# index is written as the first column of the CSV.
house_csv_path = output_path + 'house_of_reps.csv'
houserep.to_csv(house_csv_path)