# Get Congressional Activity
***
**Project:** Congressional Activity  
**Author:** Tami McManus  
**Last Updated:** July 6, 2023

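This notebook reads raw congressional activity data from an Excel workbook (one worksheet per congressional session) and reshapes it into three dataframes: Senate activity, House activity, and Senate confirmations. The dataframes are then saved to a single Excel workbook, one worksheet each, for consumption by Tableau.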
  
Congressional Activity Resumes were originally scraped from:  
https://www.senate.gov/    

***
# Notebook Setup
***

Library versions used in original notebook:  
requests -- 2.27.1  
pandas -- 1.4.2  

In [1]:
# Import libraries
import requests
import pandas as pd

In [2]:
# Report the name and version of every imported library
for library in (requests, pd):
    print(f'Import Complete: {library.__name__} {library.__version__}')

Import Complete: requests 2.27.1
Import Complete: pandas 1.4.2


In [3]:
# Cap the displayed width of each dataframe column at 60 characters
pd.set_option('display.max_colwidth', 60)

In [4]:
%%html
<!-- Prevent text wrapping in dataframe displays for a cleaner print -->
<style> .dataframe td {white-space: nowrap;}</style>

***  
# Read Raw Activity Data
***

In [5]:
# Read in all worksheets
# sheet_name=None makes read_excel load every worksheet and return a dict of
# dataframes keyed by worksheet name ('<year> - <congress>.<session>').
file_name = '../Data/Std Data - Resumes.xlsx'
raw_data_dict = pd.read_excel(file_name, sheet_name=None)
# Display the worksheet names to confirm which sessions were loaded
raw_data_dict.keys()

dict_keys(['1983 - 98.1', '1984 - 98.2', '1985 - 99.1', '1986 - 99.2', '1987 - 100.1', '1988 - 100.2', '1989 - 101.1', '1990 - 101.2', '1991 - 102.1', '1992 - 102.2', '1993 - 103.1', '1994 - 103.2', '1995 - 104.1', '1996 - 104.2', '1997 - 105.1', '1998 - 105.2', '1999 - 106.1', '2000 - 106.2', '2001 - 107.1', '2002 - 107.2', '2003 - 108.1', '2004 - 108.2', '2005 - 109.1', '2006 - 109.2', '2007 - 110.1', '2008 - 110.2', '2009 - 111.1', '2010 - 111.2', '2011 - 112.1', '2012 - 112.2', '2013 - 113.1', '2014 - 113.2', '2015 - 114.1', '2016 - 114.2', '2017 - 115.1', '2018 - 115.2', '2019 - 116.1', '2020 - 116.2', '2021 - 117.1', '2022 - 117.2'])

***
# Process Data
***

In [6]:
# Start with three separate empty frames: Senate activity, House activity,
# and Senate confirmations
senate_df, house_df, senate_conf_df = (pd.DataFrame() for _ in range(3))

In [7]:
# Columns identifying a session, and the confirmation count columns
index_cols = ['Year', 'Congress', 'Session']
int_cols = ['Civilian', 'Army', 'Navy', 'Air Force', 'Marine Corps', 'Space Force']


def _split_sheet_name(sheet_name):
    """Split a worksheet name such as '1983 - 98.1' into (year, congress, session).

    All three parts are returned as strings; the session is the single
    character that follows the '.'.

    Raises:
        ValueError: if the name is not in '<year> - <congress>.<session>' form.
    """
    year, dash, remainder = sheet_name.partition(' - ')
    congress, dot, session = remainder.partition('.')
    if not (dash and dot and session):
        raise ValueError(
            f"Worksheet name {sheet_name!r} does not match '<year> - <congress>.<session>'"
        )
    return year, congress, session[0]


def _with_session_columns(df, year, congress, session):
    """Return a copy of df with Year, Congress and Session as its first three columns."""
    labelled = df.copy()
    labelled.insert(loc=0, column='Year', value=year)
    labelled.insert(loc=1, column='Congress', value=congress)
    labelled.insert(loc=2, column='Session', value=session)
    return labelled


# Collect one small frame per worksheet and concatenate once after the loop.
# Building from fresh lists (instead of appending to existing dataframes)
# avoids re-copying the accumulated data on every iteration and keeps this
# cell idempotent: re-running it does not duplicate rows.
senate_frames, house_frames, senate_conf_frames = [], [], []

for key, sheet_df in raw_data_dict.items():
    year, congress, session = _split_sheet_name(key)

    # Index by the first worksheet column ('Unnamed: 0') so that transposing a
    # chamber column yields a single row whose columns are those index labels.
    # set_index returns a new frame, so raw_data_dict itself is left untouched.
    raw_df = sheet_df.set_index('Unnamed: 0')

    # The Senate and House columns are cleaned and transposed identically
    for chamber, frames in (('Senate', senate_frames), ('House', house_frames)):
        chamber_row = _with_session_columns(
            raw_df[[chamber]].transpose(), year, congress, session
        )
        # transpose() carries the old index name over as the name of the
        # columns axis; clear it so it does not show up in the output
        frames.append(chamber_row.rename_axis(None, axis=1))

    # Senate confirmation data: descriptions are in 'Unnamed: 4' and the counts
    # in the int_cols columns. Rows in which all of these are empty are dropped.
    # This table is kept in long form (one row per description); it is not
    # transposed.
    raw_senate_conf_df = (
        raw_df[['Unnamed: 4'] + int_cols]
        .dropna(how='all')
        .rename(columns={'Unnamed: 4': 'Description'})
        .reset_index(drop=True)
    )
    senate_conf_frames.append(
        _with_session_columns(raw_senate_conf_df, year, congress, session)
    )

# Combine the per-worksheet frames and convert to int once, at the end
senate_df = pd.concat(senate_frames, ignore_index=True).astype(int)
house_df = pd.concat(house_frames, ignore_index=True).astype(int)

# Description stays text; every other confirmation column becomes int
senate_conf_df = pd.concat(senate_conf_frames, ignore_index=True)
senate_conf_df = senate_conf_df.astype({col: int for col in index_cols + int_cols})
senate_conf_df = senate_conf_df[index_cols + ['Description'] + int_cols]

***
# Preview Data
***

In [8]:
# Preview the first rows of the Senate activity data
senate_df.head()

Unnamed: 0,Year,Congress,Session,Days in Session,Hours in Session,Laws Enacted (Public),Laws Enacted (Private),Bills in Conference,Senate Bills Passed,House Bills Passed,...,Measures Pending on Calendar,Bills Introduced,Joint Resolutions Introduced,Concurrent Resolutions Introduced,Simple Resolutions Introduced,Quorum Calls,Yea-and-Nay Votes,Recorded Votes,Bills Vetoed,Vetoes Overridden
0,1983,98,1,150,1010,80,0,3,170,99,...,152,2196,209,86,302,18,381,0,3,1
1,1984,98,2,131,940,112,14,5,159,239,...,221,897,150,69,186,19,292,0,2,1
2,1985,99,1,170,1252,110,0,4,106,93,...,94,2000,255,102,294,20,381,0,0,1
3,1986,99,2,143,1278,114,3,1,164,209,...,180,954,177,73,225,16,359,0,1,1
4,1987,100,1,170,1217,93,2,6,110,135,...,76,1998,239,95,353,36,420,0,1,2


In [9]:
# Check the column names, non-null counts and dtypes of the Senate activity data
senate_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 34 columns):
 #   Column                                  Non-Null Count  Dtype
---  ------                                  --------------  -----
 0   Year                                    40 non-null     int32
 1   Congress                                40 non-null     int32
 2   Session                                 40 non-null     int32
 3   Days in Session                         40 non-null     int32
 4   Hours in Session                        40 non-null     int32
 5   Laws Enacted (Public)                   40 non-null     int32
 6   Laws Enacted (Private)                  40 non-null     int32
 7   Bills in Conference                     40 non-null     int32
 8   Senate Bills Passed                     40 non-null     int32
 9   House Bills Passed                      40 non-null     int32
 10  Senate Joint Resolutions Passed         40 non-null     int32
 11  House Joint Resolutio

In [10]:
# Preview the first rows of the House activity data
house_df.head()

Unnamed: 0,Year,Congress,Session,Days in Session,Hours in Session,Laws Enacted (Public),Laws Enacted (Private),Bills in Conference,Senate Bills Passed,House Bills Passed,...,Measures Pending on Calendar,Bills Introduced,Joint Resolutions Introduced,Concurrent Resolutions Introduced,Simple Resolutions Introduced,Quorum Calls,Yea-and-Nay Votes,Recorded Votes,Bills Vetoed,Vetoes Overridden
0,1983,98,1,146,851,75,4,2,70,234,...,102,4580,440,237,385,35,297,201,3,1
1,1984,98,2,120,852,133,11,6,128,323,...,120,1862,223,142,235,55,227,181,1,1
2,1985,99,1,152,965,130,0,8,42,214,...,56,4011,496,267,352,43,255,184,6,1
3,1986,99,2,129,829,112,13,16,103,289,...,97,1732,260,152,252,37,224,227,6,1
4,1987,100,1,169,909,104,1,15,44,272,...,20,3830,437,235,345,23,234,254,2,2


In [11]:
# Check the column names, non-null counts and dtypes of the House activity data
house_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 34 columns):
 #   Column                                  Non-Null Count  Dtype
---  ------                                  --------------  -----
 0   Year                                    40 non-null     int32
 1   Congress                                40 non-null     int32
 2   Session                                 40 non-null     int32
 3   Days in Session                         40 non-null     int32
 4   Hours in Session                        40 non-null     int32
 5   Laws Enacted (Public)                   40 non-null     int32
 6   Laws Enacted (Private)                  40 non-null     int32
 7   Bills in Conference                     40 non-null     int32
 8   Senate Bills Passed                     40 non-null     int32
 9   House Bills Passed                      40 non-null     int32
 10  Senate Joint Resolutions Passed         40 non-null     int32
 11  House Joint Resolutio

In [12]:
# Preview the first rows of the Senate confirmation data (one row per description)
senate_conf_df.head()

Unnamed: 0,Year,Congress,Session,Description,Civilian,Army,Navy,Air Force,Marine Corps,Space Force
0,1983,98,1,Nominations (Carry Over),0,0,0,0,0,0
1,1983,98,1,Nominations (New),3454,14782,21944,12819,2990,0
2,1983,98,1,Confirmed,2978,14782,21944,12792,2990,0
3,1983,98,1,Unconfirmed,2,0,0,26,0,0
4,1983,98,1,Withdrawn,0,0,0,0,0,0


In [13]:
# Check the column names, non-null counts and dtypes of the Senate confirmation data
senate_conf_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 281 entries, 0 to 280
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Year          281 non-null    int32 
 1   Congress      281 non-null    int32 
 2   Session       281 non-null    int32 
 3   Description   281 non-null    object
 4   Civilian      281 non-null    int32 
 5   Army          281 non-null    int32 
 6   Navy          281 non-null    int32 
 7   Air Force     281 non-null    int32 
 8   Marine Corps  281 non-null    int32 
 9   Space Force   281 non-null    int32 
dtypes: int32(9), object(1)
memory usage: 12.2+ KB


***
# Write to Excel
***

In [14]:
# Worksheet name -> dataframe, in the order the worksheets are written
output_sheets = {
    'Senate': senate_df,
    'House': house_df,
    'Confirmations': senate_conf_df,
}

# Write every dataframe to its own worksheet of one workbook, without the row index
with pd.ExcelWriter('../Data/Scrubbed Data - Resumes.xlsx') as writer:
    for sheet_name, frame in output_sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

***