In [None]:
import os
import datetime
import pandas as pd
import json
import time

from collections import OrderedDict

def remove_extra_spaces(s):
    """Collapse every run of space characters in *s* to a single space.

    Only the literal space character is collapsed; tabs and newlines inside
    the string are left alone.  Leading and trailing whitespace of any kind
    is then removed from the result.
    """
    # Splitting on a single space yields an empty piece for each extra space
    # in a run; dropping those pieces and re-joining collapses the run.
    pieces = [piece for piece in s.split(' ') if piece]
    collapsed = ' '.join(pieces)
    return collapsed.strip()

# When True, the assembled DataFrame is written to 'crec2015to2021.hdf' once
# loading finishes; when False the data is only kept in memory.
DO_SAVE = False

In [None]:
# Build one row per speech from the per-day JSON directories under
# 'output/<year>/CREC-<date>/json', then assemble the rows into a DataFrame
# indexed by speech_id (and optionally save it, see DO_SAVE).
data = []

start_date = '2015-01-01'
end_date   = '2022-01-01'

# Both bounds are inclusive: the loop below also visits end_date itself.
day        = datetime.datetime.strptime(start_date, '%Y-%m-%d')
end_day    = datetime.datetime.strptime(end_date, '%Y-%m-%d')

# Column name -> dtype of the final DataFrame, in column order.
# 'datetime64[ns]' is spelled with an explicit unit because pandas 2.x
# rejects the unit-less 'datetime64' in astype().
# NOTE: casting to str turns a missing value (None) into the text 'None'.
column_dtypes = OrderedDict([
    ('speech_id', str),
    ('speech', str),
    ('chamber', str),
    ('date', 'datetime64[ns]'),
    ('speaker', str),
    ('speaker_bioguide', str),
    ('vol', int),
    ('num', int),
    ('pages', str),
    ('doc_title', str),
    ('title', str),
])

while day <= end_day:
    cur_date_str = datetime.datetime.strftime(day, '%Y-%m-%d')
    base_dir = 'output/%d/CREC-%s/json' % (day.year, cur_date_str)

    # Only read days whose directory carries the JSON_PARSING_DONE marker
    # file; days without it (or without a directory at all) are skipped.
    if os.path.exists(os.path.join(base_dir, 'JSON_PARSING_DONE')):
        start_time = time.time()

        num_speeches = 0
        # Sorted so that the row order does not depend on the arbitrary
        # order in which the filesystem lists the directory.
        for fname in sorted(os.listdir(base_dir)):
            if not fname.endswith('.json'):
                continue

            # Context manager closes the handle right after parsing instead
            # of leaking one open file per document.
            with open(os.path.join(base_dir, fname), 'r') as json_file:
                o = json.load(json_file)

            h = o['header']
            # Documents flagged as 'extension' in their header are left out
            # (presumably the Extensions of Remarks section -- confirm).
            if h['extension']:
                continue

            if h['chamber'] == 'House':
                chamber = 'H'
            elif h['chamber'] == 'Senate':
                chamber = 'S'
            else:
                raise Exception('Unknown chamber %s' % h['chamber'])

            # ndx is the position within the whole content list (non-speech
            # items included), so speech ids stay stable per document.
            for ndx, c in enumerate(o['content']):
                if c['kind'] != 'speech':
                    continue

                speaker = c['speaker'].strip()
                if 'clerk' in speaker:
                    raise Exception('Found a speech by the clerk')

                text = c['text'].strip()

                # Remove speaker from beginning of speech
                if text.startswith(speaker + '.'):
                    text = text[len(speaker + '.'):]
                elif text.startswith(speaker + ' .'):
                    text = text[len(speaker + ' .'):]
                else:
                    # Show the offending speaker/text before aborting.
                    print(speaker + '/' + text[:100])
                    raise Exception('Text doesn\'t start with speaker name')

                # Flatten the speech to a single line with single spaces.
                text = remove_extra_spaces(text.replace('\n', ' '))

                speech_id = '%s-%d' % (o['id'], ndx)

                row = OrderedDict(
                    speech_id  = speech_id,
                    speech     = text,
                    chamber    = chamber,
                    date       = cur_date_str,
                    speaker    = speaker,
                    speaker_bioguide = c['speaker_bioguide'],
                    vol        = h['vol'],
                    num        = h['num'],
                    pages      = h['pages'],
                    doc_title  = o['doc_title'],
                    title      = o['title'])
                data.append(row)
                # Count actual speeches (not documents) so the figures in
                # the progress line below mean what they say.
                num_speeches += 1

        load_time = time.time() - start_time
        print("Loaded %4d speeches from %s (time: %0.2f s tot, %4.1f ms/speech)" %
              (num_speeches, cur_date_str, load_time, 1000*load_time/num_speeches if num_speeches > 0 else 0) )

    day += datetime.timedelta(days=1)

# Passing the column list explicitly keeps the frame well-formed (and the
# astype() below valid) even when no speech was found at all.
df = pd.DataFrame(data, columns=list(column_dtypes)).astype(column_dtypes)
del data  # the row list is no longer needed once the frame exists
df = df.set_index('speech_id')

if DO_SAVE:
    df.to_hdf('crec2015to2021.hdf', key='RecentCRECData', format='fixed', mode='w', complevel=5)
