# Compare speeches from old dataset and new CREC dataset

This script iterates over each date in the old dataset (114th Congress, beginning in 2015) and tries to match that day's speeches between the old dataset and the new CREC dataset.

In [None]:
import os, datetime, json, pprint, re
import pandas as pd
from collections import OrderedDict
from config import DATA_DIR

# Define some utilities for cleaning text
def remove_extra_spaces(s):
    """Collapse runs of spaces into one space and trim the ends.

    Only the space character is collapsed; tabs and newlines inside the
    string are left as they are. Leading and trailing whitespace of any
    kind is removed by the final ``strip()``.
    """
    collapsed = re.sub(r' {2,}', ' ', s)
    return collapsed.strip()

# Punctuation handling used by remove_punct: characters mapped to ' ' are
# replaced by a space, characters mapped to '' are deleted outright.
_PUNCT_TABLE = str.maketrans({
    '.': ' ',
    ',': ' ',
    "'": '',
    '"': ' ',
    '-': '',
    '`': '',
    ';': ' ',
    '*': ' ',
    '~': ' ',
    '_': ' ',
    '\\': '',
    '•': ' ',
})


def remove_punct(s):
    """Remove punctuation from ``s`` and trim surrounding whitespace.

    Apostrophes, hyphens, backticks and backslashes are deleted; the other
    handled punctuation marks are each replaced by a single space.
    """
    return s.translate(_PUNCT_TABLE).strip()

def stripspeech(s):
    """Reduce a speech to a compact, lowercase form for fuzzy comparison.

    The section sign is spelled out as 'sec', punctuation is removed, the
    text is lowercased, every space is dropped, and the variants '9111' and
    '9/11' are both rewritten to '911'.
    """
    text = s.replace('§', 'sec')
    text = remove_punct(text).lower()
    text = remove_extra_spaces(text)
    # Drop every remaining space so the comparison ignores spacing differences.
    text = text.replace(' ', '')
    # NOTE(review): '9111' is presumably a transcription artifact of "9/11"
    # in one of the datasets -- confirm against the data.
    for variant in ('9111', '9/11'):
        text = text.replace(variant, '911')
    return text


# We will compare speeches in the old dataset and the new dataset using
# "edlib" (an edit distance library)
import edlib

def match_speeches(a, b, verbose=False, max_distance=20):
    """Match speeches in ``a`` (old dataset) to speeches in ``b`` (new dataset).

    Two speeches match when the edit distance between their normalized texts
    (see ``stripspeech``) is strictly less than ``max_distance``. Each speech
    in ``a`` is matched to the first speech in ``b`` (in row order) that is
    close enough. A speech in ``b`` may be matched by several speeches in
    ``a``; in that case ``match2`` records the last one.

    Parameters
    ----------
    a, b : DataFrame
        Frames whose rows expose a ``speech`` attribute holding the text.
    verbose : bool
        When true, print every row of ``a`` and of ``b`` left unmatched.
    max_distance : int
        Exclusive upper bound on the edit distance for a match.

    Returns
    -------
    (match1, match2) : tuple of dict
        ``match1`` maps each index label of ``a`` to the matched index label
        of ``b`` (or None); ``match2`` is the reverse mapping for ``b``.
    """
    # Normalize each speech once up front. Doing it inside the pairwise loop
    # would redo the same string cleaning for every (old, new) pair.
    stripped_a = [(rndx1, stripspeech(r1.speech)) for rndx1, r1 in a.iterrows()]
    stripped_b = [(rndx2, stripspeech(r2.speech)) for rndx2, r2 in b.iterrows()]

    match1 = {rndx1: None for rndx1, _ in stripped_a}
    match2 = {rndx2: None for rndx2, _ in stripped_b}

    for rndx1, text1 in stripped_a:
        for rndx2, text2 in stripped_b:
            # The global edit distance is at least the difference in length,
            # so pairs whose lengths differ too much can never match and the
            # (expensive) alignment can be skipped.
            if abs(len(text1) - len(text2)) >= max_distance:
                continue
            d = edlib.align(text1, text2)['editDistance']
            if d < max_distance:
                match1[rndx1] = rndx2
                match2[rndx2] = rndx1
                break

    if verbose:
        for rndx1, r1 in a.iterrows():
            if match1[rndx1] is None:
                print('No match to speech in old dataset')
                print(r1)
                print()

        for rndx2, r2 in b.iterrows():
            if match2[rndx2] is None:
                print('No match to speech in new dataset')
                print(r2)
                print()
    return match1, match2

In [None]:
# Load the new speeches data (CREC)
df = pd.read_hdf(DATA_DIR + '/recent-data/crec2015to2021.hdf')

# Load the old speeches data (114th Congress), indexed by speech_id
df_old = pd.read_csv(
    DATA_DIR + '/created_data/daily_114.csv',
    sep='\t',
    index_col='speech_id',
)

# Keep only House and Senate speeches
df_old = df_old[df_old.chamber.isin(['H', 'S'])]

# Keep only speeches whose speaker starts with a personal title; entries such
# as "The speaker" or "The clerk" are very inconsistent and hard to compare.
# The comparison uses the first four characters of the lowercased speaker.
valid_speakers = ['mr. ','ms. ','mrs.', 'dr. ','miss']
df_old = df_old[df_old.speaker.str.lower().str[:4].isin(valid_speakers)]

# Check that we are not missing speeches in new data

def short_filter(s, min_length=1500):
    """Return True when speech ``s`` is long enough to be compared.

    Very short speeches are inconsistent between the datasets, so they are
    filtered out. Length is measured on the normalized text produced by
    ``stripspeech``, and a speech is kept when that length is at least
    ``min_length`` characters.
    """
    return len(stripspeech(s)) >= min_length

# For every date in the old dataset, match that day's long speeches against
# the new dataset and report the share of old speeches that found a match.
for unique_date in sorted(set(df_old.date)):
    # astype(bool) keeps each mask a boolean row filter even when a frame has
    # no rows for this date (an empty apply() result can have object dtype,
    # which pandas would not treat as a boolean mask).
    a = df_old[df_old.date == unique_date]
    a = a[a.speech.apply(short_filter).astype(bool)]
    b = df[df.date == unique_date]
    b = b[b.speech.apply(short_filter).astype(bool)]

    if a.empty:
        # Nothing to match on this date; computing a percentage below would
        # divide by zero.
        print(unique_date, 'no old-dataset speeches long enough to compare')
        continue

    if len(a) > len(b):  # on this day, there are more speeches in old dataset than new dataset
        print(unique_date, 'old dataset has more speeches: %d vs %d' % (len(a), len(b)))

    match1, match2 = match_speeches(a, b)
    # Calculate percentage of speeches in old dataset that match to a new speech
    matched = sum(1 for v in match1.values() if v is not None)
    p1 = matched / len(match1)
    print(unique_date, 'matched percentage: %3d%% out of %d speeches' % (int(100*p1), len(match1)))
    
            