# Get Congressional Activity
***
**Project:** Congressional Activity  
**Author:** Tami McManus  
**Last Updated:** August 1, 2023

This notebook reads in raw congressional activity data from an Excel file and transforms it into dataframes. The data is then saved as Excel files for consumption by Tableau.
  
Congressional Activity Resumes were originally scraped from:  
https://www.senate.gov/    

***
# Notebook Setup
***

Library versions used in original notebook:  
requests -- 2.27.1  
pandas -- 1.4.2  
numpy -- 1.21.5 

In [1]:
# Import libraries
import requests
import pandas as pd
import numpy as np

In [2]:
# Confirm the imports succeeded and report the version of each library
print(*(f'Import Complete: {lib.__name__} {lib.__version__}' for lib in (requests, pd, np)), sep='\n')

Import Complete: requests 2.27.1
Import Complete: pandas 1.4.2
Import Complete: numpy 1.21.5


In [3]:
# Dataframe display defaults: limit the displayed width of each column to 60 characters
pd.set_option('display.max_colwidth', 60)

In [4]:
%%html
<!-- Prevent text wrapping in dataframe displays for a cleaner print -->
<style> .dataframe td {white-space: nowrap;}</style>

***  
# Read Raw Activity Data
***

In [5]:
# Load every worksheet of the raw workbook into a dict of dataframes keyed by worksheet name
file_name = '../Data/Raw Data - Resumes.xlsx'
raw_data_dict = pd.read_excel(
    file_name,
    sheet_name=None,  # None -> read all worksheets
    header=None,      # no header row; columns are labelled 0, 1, 2
    skiprows=1,       # skip the first row of each worksheet
    usecols='A:C',    # only the first three spreadsheet columns
)

***
# Process Data
***

In [6]:
# Create the (initially empty) dataframe that will hold the final general activity data
gen_activity_df = pd.DataFrame()
# TODO: the measures and confirmations dataframes are parked until their processing is written
#measures_df = pd.DataFrame()
#confirm_df = pd.DataFrame()

In [7]:
# Define column labels by group for easier processing later on

# Columns that identify a row: one row per year / congress / session / chamber
key_cols = ['Year', 'Congress', 'Session', 'Chamber']

# General activity labels
gen_cols = [
    'Days in session', 'Time in session',
    '...Pages of proceedings', '...Extension of Remarks',
    'Public bills enacted into law', 'Private bills enacted into law',
    'Bills in conference', 'Bills through conference',
    'Special reports', 'Conference reports',
    'Measures pending on calendar',
    'Quorum calls', 'Yea-and-nay votes', 'Recorded votes',
    'Bills vetoed', 'Vetoes overridden',
]

# Measure labels. The same breakdown labels appear under both the "passed" and the
# "reported" totals, so this list intentionally contains repeated entries.
measure_cols = [
    # Measures passed
    'Measures passed, total',
    '...Senate bills', '...House bills',
    '...Senate joint resolutions', '...House joint resolutions',
    '...Senate concurrent resolutions', '...House concurrent resolutions',
    '...Simple resolutions',
    # Measures reported
    'Measures reported, total',
    '...Senate bills', '...House bills',
    '...Senate joint resolutions', '...House joint resolutions',
    '...Senate concurrent resolutions', '...House concurrent resolutions',
    '...Simple resolutions',
    # Measures introduced
    'Measures introduced, total',
    '...Bills', '...Joint resolutions', '...Concurrent resolutions', '...Simple resolutions',
]

In [8]:
# Collect one single-row dataframe per worksheet and chamber, then combine them with a single
# concat after the loop. The result is built from scratch (rather than appended to the existing
# gen_activity_df), so re-running this cell does not duplicate rows.
chamber_frames = []

for key, sheet_df in raw_data_dict.items():
    # Copy the dataframe from the current worksheet so the raw data is left untouched
    raw_df = sheet_df.copy()

    # Split the dates string into start and end dates for the session.
    # When the start date contains no comma, the last six characters of the full string are
    # appended to it (presumably the ', YYYY' of the end date -- TODO confirm against the data).
    # NOTE: start_date and end_date are parsed here but not yet added to the output.
    sess_date = raw_df.at[0, 0]
    dates = sess_date.split(' through ')
    if ',' not in dates[0]:
        start_date = dates[0] + sess_date[-6:]
    else:
        start_date = dates[0]
    end_date = dates[1]

    # Remove the informational cells at the top of the worksheet and index rows by their label
    raw_df = raw_df.drop([0, 1, 2]).set_index(0)
    # Clear the index name so it does not turn into a column-axis name after transposing
    raw_df.index.name = None

    # Split the worksheet name ('<year> - <congress>.<session>') into year, congress, and session
    i = key.find(' - ')
    j = key.find('.')
    if i == -1 or j == -1:
        # Fail loudly instead of slicing with -1 and silently producing wrong key values
        raise ValueError(f"Worksheet name {key!r} does not match '<year> - <congress>.<session>'")
    year = key[:i]
    congress = key[i+3:j]
    session = key[j+1]  # the single character following the '.'

    # Add the key rows to the dataframe (one value per chamber column)
    raw_df.loc['Year'] = [year, year]
    raw_df.loc['Congress'] = [congress, congress]
    raw_df.loc['Session'] = [session, session]
    raw_df.loc['Chamber'] = ['Senate', 'House']

    # Label the Senate and House data columns, remove the Congressional Record row, and any empty rows
    raw_df = raw_df.rename({1: 'Senate', 2: 'House'}, axis=1).drop(index='Congressional Record:')
    raw_df = raw_df[raw_df.index.notnull()]

    # Iterate through the data for the senate, then the house
    for chamber in ['Senate', 'House']:
        # Filter the data for the chamber in question, transpose it into a single row,
        # and drop the measure columns (labels that are not present are ignored)
        raw_chamber_df = (
            raw_df[[chamber]]
            .transpose()
            .drop(columns=measure_cols, errors='ignore')
        )

        # Keep the cleaned, transposed chamber row for the final concat
        chamber_frames.append(raw_chamber_df)

# Combine all chamber rows at once. Values keep the dtypes they were read with;
# no numeric conversion is applied here.
if chamber_frames:
    gen_activity_df = pd.concat(chamber_frames, ignore_index=True)
else:
    gen_activity_df = pd.DataFrame()
    

***
# Cleanup Data
***

**Pick up Here**

# Change the order of columns, putting the key columns first.
# temp_cols holds every non-key column in its current order. list.remove() returns None,
# so its result must not be assigned back; the list is built with a comprehension instead.
temp_cols = [col for col in gen_activity_df.columns if col not in key_cols]
gen_activity_df = gen_activity_df[key_cols + temp_cols]

print(gen_activity_df.columns.tolist())


***
# Preview Data
***

In [15]:
# Summarize the columns: non-null counts and dtypes
gen_activity_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82 entries, 0 to 81
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   Days in session                 82 non-null     object
 1   Time in session                 81 non-null     object
 2   ...Pages of proceedings         72 non-null     object
 3   ...Extension of Remarks         31 non-null     object
 4   Public bills enacted into law   82 non-null     object
 5   Private bills enacted into law  35 non-null     object
 6   Bills in conference             66 non-null     object
 7   Bills through conference        8 non-null      object
 8   Special reports                 82 non-null     object
 9   Conference reports              59 non-null     object
 10  Measures pending on calendar    80 non-null     object
 11  Quorum calls                    79 non-null     object
 12  Yea-and-nay votes               82 non-null     obje

In [9]:
# Preview the first five rows
gen_activity_df.head()

Unnamed: 0,Days in session,Time in session,...Pages of proceedings,...Extension of Remarks,Public bills enacted into law,Private bills enacted into law,Bills in conference,Bills through conference,Special reports,Conference reports,...,Quorum calls,Yea-and-nay votes,Recorded votes,Bills vetoed,Vetoes overridden,Year,Congress,Session,Chamber,Bills not signed
0,150,"1,010 hrs., 47'",,,80,,3,4,25,4.0,...,18,381,,3.0,1,1983,98,1,Senate,
1,146,"851 hrs., 45'",,,75,4.0,2,29,45,33.0,...,35,297,201.0,3.0,1,1983,98,1,House,
2,131,"940 hrs., 28'",14612.0,,112,14.0,5,21,11,,...,19,292,,2.0,1,1984,99,2,Senate,
3,120,"852 hrs., 59'",12284.0,,133,11.0,6,27,40,53.0,...,55,227,181.0,1.0,1,1984,99,2,House,
4,170,"1,252 hrs., 31'",18418.0,,110,,4,8,18,2.0,...,20,381,,,1,1985,99,1,Senate,


In [10]:
# Preview the last five rows
gen_activity_df.tail()

Unnamed: 0,Days in session,Time in session,...Pages of proceedings,...Extension of Remarks,Public bills enacted into law,Private bills enacted into law,Bills in conference,Bills through conference,Special reports,Conference reports,...,Quorum calls,Yea-and-nay votes,Recorded votes,Bills vetoed,Vetoes overridden,Year,Congress,Session,Chamber,Bills not signed
77,145,"477 hrs, 44′",6025,1075.0,41,,2.0,,11,,...,1.0,192,34.0,1.0,,2020,116,2,House,
78,176,"976 hrs, 31′",8836,,30,,,,8,,...,4.0,473,,,,2021,117,1,Senate,
79,152,"642 hrs, 16′",6713,1289.0,39,,,,10,,...,1.0,387,,,,2021,117,1,House,
80,178,"958 hrs., 32′",10113,,101,,1.0,,4,,...,,421,,,,2022,117,2,Senate,
81,164,"621 hrs., 48′",10550,1368.0,146,2.0,1.0,,42,,...,1.0,532,16.0,,,2022,117,2,House,


***
# Write to Excel
***

**Rewrite this section**

In [None]:
# Write one worksheet per chamber, filtered from the combined general activity dataframe.
# The senate_df / house_df / senate_conf_df frames this cell used to write are not created
# anywhere in the visible notebook, so the cell could not run from a fresh kernel.
# TODO: add the 'Confirmations' worksheet once a confirmations dataframe is built.
with pd.ExcelWriter('../Data/Scrubbed Data - Resumes.xlsx') as writer:
    for chamber in ['Senate', 'House']:
        chamber_rows = gen_activity_df[gen_activity_df['Chamber'] == chamber]
        chamber_rows.to_excel(writer, sheet_name=chamber, index=False)

In [None]:
# Save the combined general activity data to its own workbook, without the row index
gen_activity_df.to_excel(excel_writer='../Data/Scrubbed - Gen.xlsx', index=False)

In [None]:
# Display the final general activity dataframe
gen_activity_df

***