# Compare speeches from old dataset and new CREC dataset

This script iterates over the days in 2015 and tries to match speeches between the old dataset and the new dataset.

Matching proceeds as follows. We first preprocess the speeches:
* We only keep speeches from House and Senate
* We only keep speeches where the speaker name starts with "Dr.", "Mr.", "Ms.", "Mrs.", or "Miss". This eliminates speeches from "The speaker", "The clerk", etc., which are mostly procedural in nature and very inconsistent / hard to compare
* We throw out speeches that are very short (these also tend to be procedural), i.e. fewer than 1500 characters after cleaning
* From the remaining speeches, we remove all punctuation.

Of the remaining, cleaned speeches, we say that an old and a new speech match if their "edit distance" (also called Levenshtein distance) is less than 20.


In [None]:
import os, datetime, json, pprint, re
import pandas as pd
from collections import OrderedDict
from config import DATA_DIR, DATA_DIR_DOWNLOADS

# Define some utilities for cleaning text
def remove_extra_spaces(s):
    """Collapse every run of spaces in `s` to one space and trim the ends.

    Only the space character is collapsed; tabs and newlines inside the
    string are left alone. Leading and trailing whitespace of any kind is
    stripped from the result.
    """
    # Splitting on a single space turns each extra space into an empty
    # piece; dropping those pieces and re-joining leaves single spaces only.
    pieces = [piece for piece in s.split(' ') if piece]
    return ' '.join(pieces).strip()

# Punctuation that becomes a space, and punctuation that is deleted outright.
_PUNCT_TO_SPACE = '.,";*~_•'
_PUNCT_TO_DROP = "'-`\\"

# Single translation table covering both groups (None means "delete").
_PUNCT_TABLE = {ord(ch): ' ' for ch in _PUNCT_TO_SPACE}
_PUNCT_TABLE.update({ord(ch): None for ch in _PUNCT_TO_DROP})


def remove_punct(s):
    """Remove punctuation from `s` and strip surrounding whitespace.

    The characters . , " ; * ~ _ and the bullet are each replaced by a
    space; apostrophes, hyphens, backticks and backslashes are deleted.
    Any other character (for example '/', '!' or '?') is left untouched.
    """
    return s.translate(_PUNCT_TABLE).strip()

def stripspeech(s):
    """Normalise a speech into a compact form suitable for comparison.

    The section sign is spelled out as 'sec', punctuation is removed via
    remove_punct, the text is lower-cased, every space is deleted, and the
    spellings '9111' and '9/11' are both rewritten to '911'.
    """
    text = remove_punct(s.replace('§', 'sec')).lower()
    # remove_punct has already trimmed the ends and every space is deleted
    # here, so collapsing repeated spaces beforehand would change nothing.
    text = text.replace(' ', '')
    # Order matters: '9111' is rewritten before '9/11'.
    for variant in ('9111', '9/11'):
        text = text.replace(variant, '911')
    return text


# We will compare speeches in the old dataset and the new dataset using
# "edlib" (an edit distance library)
import edlib

def match_speeches(a,b, verbose=False, threshold=20):
    """Match the speeches of DataFrame `a` against those of DataFrame `b`.

    Two speeches match when the edit distance between their cleaned texts
    (see stripspeech) is strictly less than `threshold`. Each row of `a` is
    paired with the first row of `b` that matches it; a row of `b` that
    matches several rows of `a` remembers the last of them.

    Parameters:
        a, b: DataFrames whose rows expose the text as the `speech` attribute.
        verbose: when True, print every row of either frame left unmatched.
        threshold: integer; distances below this value count as a match.

    Returns:
        (match1, match2): match1 maps each index label of `a` to the matched
        label of `b` (or None), and match2 maps each index label of `b` to
        the matched label of `a` (or None).
    """
    match1 = dict.fromkeys(a.index)
    match2 = dict.fromkeys(b.index)

    # Clean every speech exactly once instead of once per compared pair.
    # Rows are read through iterrows so that a frame with no rows is never
    # asked for its `speech` column.
    cleaned_a = [(rndx1, stripspeech(r1.speech)) for rndx1, r1 in a.iterrows()]
    cleaned_b = [(rndx2, stripspeech(r2.speech)) for rndx2, r2 in b.iterrows()]

    if threshold > 0:  # no distance can be below a non-positive threshold
        # Only distances up to threshold-1 matter, so let edlib give up early
        # (it reports editDistance == -1 when the distance exceeds k).
        max_distance = threshold - 1
        for rndx1, text1 in cleaned_a:
            for rndx2, text2 in cleaned_b:
                d = edlib.align(text1, text2, k=max_distance)['editDistance']
                if 0 <= d < threshold:
                    match1[rndx1]=rndx2
                    match2[rndx2]=rndx1
                    break

    if verbose:
        for rndx1, r1 in a.iterrows():
            if match1[rndx1] is None:
                print('No match to speech in old dataset')
                print(r1)
                print()

        for rndx2, r2 in b.iterrows():
            if match2[rndx2] is None:
                print('No match to speech in new dataset')
                print(r2)
                print()
    return match1, match2

In [None]:
# New speeches: the CREC download
df_new = pd.read_hdf(DATA_DIR_DOWNLOADS+'/crec2015to2021.hdf')

# Old speeches: 114th congress, tab-separated, indexed by speech_id
df_old = pd.read_csv(DATA_DIR +'/created_data/daily_114.csv', sep='\t', index_col='speech_id')

# Keep only House ('H') and Senate ('S') speeches
in_house_or_senate = df_old.chamber.isin(['H','S'])
df_old = df_old.loc[in_house_or_senate]

# Keep only speakers whose name opens with a personal title. This drops
# "The speaker", "The clerk", etc., which are very inconsistent and hard to
# compare. Only the first four characters of the lower-cased name are tested.
# NOTE(review): only df_old is restricted by chamber and speaker in this cell;
# df_new is used as loaded - confirm that is intended.
valid_speakers = ['mr. ','ms. ','mrs.', 'dr. ','miss']
speaker_prefix = df_old.speaker.str.lower().str[0:4]
df_old = df_old[speaker_prefix.isin(valid_speakers)]

In [None]:
# Check that we are not missing speeches in new data
def short_filter(s):
    """Return True when speech `s` is long enough to keep.

    Very short speeches are inconsistent between datasets, so a speech is
    kept only if its cleaned text (see stripspeech) has at least 1500
    characters.
    """
    cleaned_length = len(stripspeech(s))
    return not cleaned_length < 1500

def _pct(successes, total):
    """Format successes/total as a 3-wide whole percentage, or '---' if total is 0."""
    if not total:
        return '---'
    return "%3d" % (100*successes/total)


# Running totals over all processed days
tot_days = 0
tot_old  = 0
tot_new  = 0
tot_old_success = 0
tot_new_success = 0

# One iteration per distinct date present in the old dataset
for unique_date in sorted(set(df_old.date)):
    # Speeches of that day in each dataset, with very short ones removed
    c_old = df_old[df_old.date==unique_date]
    c_old = c_old[c_old.speech.apply(short_filter)]
    c_new = df_new[df_new.date==unique_date]
    c_new = c_new[c_new.speech.apply(short_filter)]

    num_old = len(c_old)
    num_new = len(c_new)

    other_msg = ""
    if num_old > num_new: # on this day, more speeches in old data than new data
        other_msg = '(old data has more speeches: %d vs %d)' % (num_old, num_new)

    match_old, match_new = match_speeches(c_old, c_new)

    # Count the speeches on each side that found a partner on the other side
    success_old = sum(v is not None for v in match_old.values())
    success_new = sum(v is not None for v in match_new.values())

    p_old, p_new = _pct(success_old, num_old), _pct(success_new, num_new)
    print(unique_date, 'old->new: %s%% (%4d/%4d), new->old: %s%% (%4d/%4d) %s' %
          (p_old, success_old, num_old, p_new, success_new, num_new, other_msg))

    tot_days += 1
    tot_old  += num_old
    tot_new  += num_new
    tot_old_success += success_old
    tot_new_success += success_new


# Overall match rates across every processed day
p_old, p_new = _pct(tot_old_success, tot_old), _pct(tot_new_success, tot_new)

print('----------')
print('TOTAL %3dd old->new: %s%% (%4d/%4d), new->old: %s%% (%4d/%4d)' %
      (tot_days, p_old, tot_old_success, tot_old, p_new, tot_new_success, tot_new))
