In [None]:
import os
#import setup_utils as utils
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from collections import defaultdict

In [None]:
#Unabridged source code originally available at: https://github.com/keyonvafa/tbip
# Difference from the original code: the speech files are read with quoting=3 (csv.QUOTE_NONE).

In [None]:
# Data source: https://data.stanford.edu/congress_text#download-data
# Please download and unzip hein-daily.zip.
# Data directory where the Hein-Daily database is saved (change as needed).
data_dir = '/Users/paulhofmarcher/Documents/svn/baR/Projects/Congress_Speeches/hein-daily'
# Save directory (change as needed).
save_dir = '/Users/paulhofmarcher/Documents/svn/baR/Projects/Congress_Speeches/data_180322'


# Predefined set of stopwords, one entry per line of <data_dir>/stopwords.txt
# (change the path appropriately wherever the stopwords are stored).
# The file is read with plain Python instead of np.loadtxt(..., delimiter="\n"):
# NumPy >= 1.23 rejects a newline as delimiter. As with np.loadtxt's defaults,
# anything after a '#' is treated as a comment and empty lines are skipped.
with open(os.path.join(data_dir, "stopwords.txt"),
          encoding="ISO-8859-1") as stopwords_file:
    stopwords = set()
    for raw_line in stopwords_file:
        entry = raw_line.split("#", 1)[0].strip()
        if entry:
            stopwords.add(entry)

# Stopwords available at: https://github.com/keyonvafa/tbip/blob/master/setup/stopwords/senate_speeches.txt
# To be downloaded and saved to data_dir as defined above.

# Parameters

# Minimum number of speeches given by a senator.
# Default value: 24
min_speeches = 24
# Minimum number of senators using a bigram.
# Default value: 10
min_authors_per_word = 10

# Parameters for the count vectorizer
min_df = 0.001  # minimum document frequency
max_df = 0.3  # maximum document frequency
stop_words = stopwords
ngram_range = (2, 2)  # bigrams only
token_pattern = "[a-zA-Z]+"



In [None]:
#Helper function
#source code originally available at: https://github.com/keyonvafa/tbip
#Count number of occurrences of each value in array of non-negative integers
#documentation: https://numpy.org/doc/stable/reference/generated/numpy.bincount.html

def bincount_2d(x, weights):
    """Sum the rows of `weights` within the groups given by `x`, column by column.

    Args:
        x: 1-D array of non-negative integers; `x[r]` is the group of row `r`
            of `weights`.
        weights: 2-D array with one row per entry of `x`.

    Returns:
        Array of shape `(max(x) + 1, weights.shape[1])` whose entry `[g, c]` is
        the sum of `weights[r, c]` over all rows `r` with `x[r] == g`.
    """
    # Tuple unpacking doubles as a check that `weights` is two-dimensional.
    _, n_columns = weights.shape
    n_groups = np.max(x) + 1
    per_column = []
    for column in range(n_columns):
        # One weighted bincount per column; minlength keeps every result the
        # same length so they stack into a rectangular array.
        per_column.append(
            np.bincount(x, weights=weights[:, column], minlength=n_groups))
    # Rows of the stacked array are columns of `weights`; flip to group-major.
    return np.transpose(np.array(per_column))

In [None]:
# Build one vocabulary file per session (sessions 97 to 114). Each session's
# bigram vocabulary is written to save_dir as vocabulary_<session>.txt.

# np.savetxt does not create missing directories; make sure the target exists
# before the expensive per-session work starts.
os.makedirs(save_dir, exist_ok=True)

# scikit-learn >= 1.2 validates constructor parameters and accepts only
# 'english', a list or None for stop_words, so a set has to be converted.
if isinstance(stop_words, (set, frozenset)):
    vectorizer_stop_words = sorted(stop_words)
else:
    vectorizer_stop_words = stop_words

for i in range(97, 115):
    # File names carry a zero-padded, three-digit session number
    # (e.g. speeches_097.txt, 097_SpeakerMap.txt, speeches_100.txt).
    session = str(i).zfill(3)
    # quoting=3 (csv.QUOTE_NONE) keeps quote characters in the speech text as
    # ordinary characters; malformed lines are skipped with a warning.
    speeches = pd.read_csv(os.path.join(data_dir, 'speeches_' + session + '.txt'),
                           encoding="ISO-8859-1",
                           sep="|", quoting=3,
                           on_bad_lines='warn')
    description = pd.read_csv(os.path.join(data_dir, 'descr_' + session + '.txt'),
                              encoding="ISO-8859-1",
                              sep="|")
    speaker_map = pd.read_csv(os.path.join(data_dir, session + '_SpeakerMap.txt'),
                              encoding="ISO-8859-1",
                              sep="|")

    merged_df = speeches.merge(description,
                               left_on='speech_id',
                               right_on='speech_id')
    df = merged_df.merge(speaker_map, left_on='speech_id', right_on='speech_id')

    # Only look at senate speeches.
    # 'chamber_x' is the chamber column from the left side of the last merge
    # (speeches + description), 'chamber_y' the one from the speaker map.
    # To select speakers with speeches in the senate (includes Senators and House Reps):
    senate_df = df[df['chamber_x'] == 'S']
    # To select ONLY Senators uncomment the next line.
    #senate_df = df[df['chamber_y'] == 'S'] ##  here 7.2
    speaker = np.array(
        [' '.join([first, last]) for first, last in
         list(zip(np.array(senate_df['firstname']),
                  np.array(senate_df['lastname'])))])
    speeches = np.array(senate_df['speech'])
    party = np.array(senate_df['party'])

    # Remove speakers who make fewer than min_speeches speeches.
    unique_speaker, speaker_counts = np.unique(speaker, return_counts=True)
    absent_speakers = unique_speaker[np.where(speaker_counts < min_speeches)]
    # Vectorized membership test instead of one array scan per speech.
    absent_speaker_inds = np.flatnonzero(np.isin(speaker, absent_speakers))
    speaker = np.delete(speaker, absent_speaker_inds)
    speeches = np.delete(speeches, absent_speaker_inds)
    party = np.delete(party, absent_speaker_inds)
    # Label of the form "First Last (Party)". The comprehension no longer
    # reuses the name `i`, which is the session index of the outer loop.
    speaker_party = np.array(
        [name + " (" + affiliation + ")"
         for name, affiliation in zip(speaker, party)])

    # Create mapping between names and IDs.
    speaker_to_speaker_id = dict(
        [(y.title(), x) for x, y in enumerate(sorted(set(speaker_party)))])
    author_indices = np.array(
        [speaker_to_speaker_id[s.title()] for s in speaker_party])
    author_map = np.array(list(speaker_to_speaker_id.keys()))

    count_vectorizer = CountVectorizer(min_df=min_df,
                                       max_df=max_df,
                                       stop_words=vectorizer_stop_words,
                                       ngram_range=ngram_range,
                                       token_pattern=token_pattern)

    # Learn initial document term matrix. This is only initial because we use it to
    # identify words to exclude based on author counts.
    counts = count_vectorizer.fit_transform(speeches.astype(str))
    vocabulary = np.array(
        [k for (k, v) in sorted(count_vectorizer.vocabulary_.items(),
                                key=lambda kv: kv[1])])

    # Remove bigrams spoken by fewer than min_authors_per_word speakers.
    # NOTE: counts.toarray() materializes the full dense document-term matrix,
    # which can be memory-hungry for large sessions.
    counts_per_author = bincount_2d(author_indices, counts.toarray())
    author_counts_per_word = np.sum(counts_per_author > 0, axis=0)
    acceptable_words = np.where(
        author_counts_per_word >= min_authors_per_word)[0]

    # Fit final document-term matrix with modified vocabulary.
    # The configured ngram_range is used instead of a hard-coded (2, 2) so both
    # passes stay in sync when the configuration changes.
    # NOTE(review): unlike the first pass, this vectorizer receives neither
    # stop_words nor token_pattern, so `counts` below is computed with
    # scikit-learn's default tokenization. The vocabulary itself is fixed by
    # the `vocabulary` argument -- confirm whether the counts are needed.
    count_vectorizer = CountVectorizer(ngram_range=ngram_range,
                                       vocabulary=vocabulary[acceptable_words])
    counts = count_vectorizer.fit_transform(speeches.astype(str))
    vocabulary = np.array(
        [k for (k, v) in sorted(count_vectorizer.vocabulary_.items(),
                                key=lambda kv: kv[1])])

    #counts_dense = remove_cooccurring_ngrams(counts, vocabulary) #not required since only bigrams are being considered
    # Remove speeches with no words.
    existing_speeches = np.where(np.sum(counts, axis=1) > 0)[0]
    counts = counts[existing_speeches]
    author_indices = author_indices[existing_speeches]
    # Session-specific vocabulary saved to save_dir.
    np.savetxt(os.path.join(save_dir, 'vocabulary_' + str(i) + '.txt'), vocabulary, fmt="%s")
    print("vocabulary saved for session "+str(i))



  exec(code_obj, self.user_global_ns, self.user_ns)
b'Skipping line 4272: expected 2 fields, saw 3\nSkipping line 20062: expected 2 fields, saw 3\nSkipping line 42459: expected 2 fields, saw 3\nSkipping line 128479: expected 2 fields, saw 3\nSkipping line 184232: expected 2 fields, saw 3\nSkipping line 198989: expected 2 fields, saw 3\n'


vocabulary saved for session 97


b'Skipping line 38606: expected 2 fields, saw 3\nSkipping line 121720: expected 2 fields, saw 44\nSkipping line 150102: expected 2 fields, saw 3\nSkipping line 208534: expected 2 fields, saw 3\nSkipping line 230417: expected 2 fields, saw 3\nSkipping line 231036: expected 2 fields, saw 3\nSkipping line 231523: expected 2 fields, saw 3\n'


vocabulary saved for session 98


b'Skipping line 130853: expected 2 fields, saw 3\nSkipping line 133567: expected 2 fields, saw 3\nSkipping line 254360: expected 2 fields, saw 3\nSkipping line 259692: expected 2 fields, saw 3\n'
b'Skipping line 274300: expected 2 fields, saw 15\nSkipping line 278643: expected 2 fields, saw 3\nSkipping line 280818: expected 2 fields, saw 3\n'


vocabulary saved for session 99


b'Skipping line 8628: expected 2 fields, saw 6\nSkipping line 50417: expected 2 fields, saw 3\nSkipping line 85318: expected 2 fields, saw 4\nSkipping line 91950: expected 2 fields, saw 3\nSkipping line 99609: expected 2 fields, saw 4\nSkipping line 120098: expected 2 fields, saw 3\nSkipping line 120710: expected 2 fields, saw 4\nSkipping line 130940: expected 2 fields, saw 3\nSkipping line 149048: expected 2 fields, saw 3\nSkipping line 157111: expected 2 fields, saw 3\nSkipping line 173213: expected 2 fields, saw 3\nSkipping line 177119: expected 2 fields, saw 3\nSkipping line 179075: expected 2 fields, saw 5\nSkipping line 179463: expected 2 fields, saw 3\nSkipping line 200426: expected 2 fields, saw 3\nSkipping line 201310: expected 2 fields, saw 4\nSkipping line 209273: expected 2 fields, saw 3\nSkipping line 210370: expected 2 fields, saw 4\nSkipping line 211440: expected 2 fields, saw 3\nSkipping line 213738: expected 2 fields, saw 3\nSkipping line 215061: expected 2 fields, saw

vocabulary saved for session 100


b'Skipping line 704: expected 2 fields, saw 3\nSkipping line 1286: expected 2 fields, saw 3\nSkipping line 2715: expected 2 fields, saw 3\nSkipping line 2887: expected 2 fields, saw 3\nSkipping line 3234: expected 2 fields, saw 3\nSkipping line 3238: expected 2 fields, saw 3\nSkipping line 3283: expected 2 fields, saw 3\nSkipping line 3384: expected 2 fields, saw 3\nSkipping line 3408: expected 2 fields, saw 4\nSkipping line 3421: expected 2 fields, saw 4\nSkipping line 3524: expected 2 fields, saw 3\nSkipping line 3552: expected 2 fields, saw 3\nSkipping line 3884: expected 2 fields, saw 3\nSkipping line 4124: expected 2 fields, saw 3\nSkipping line 4567: expected 2 fields, saw 3\nSkipping line 4962: expected 2 fields, saw 3\nSkipping line 5060: expected 2 fields, saw 3\nSkipping line 5673: expected 2 fields, saw 3\nSkipping line 7621: expected 2 fields, saw 3\nSkipping line 9163: expected 2 fields, saw 3\nSkipping line 11912: expected 2 fields, saw 3\nSkipping line 14248: expected 2 

vocabulary saved for session 101


b'Skipping line 221: expected 2 fields, saw 4\nSkipping line 8905: expected 2 fields, saw 3\nSkipping line 12004: expected 2 fields, saw 3\nSkipping line 14029: expected 2 fields, saw 4\nSkipping line 40831: expected 2 fields, saw 3\nSkipping line 40890: expected 2 fields, saw 3\nSkipping line 65448: expected 2 fields, saw 3\nSkipping line 71982: expected 2 fields, saw 3\nSkipping line 76038: expected 2 fields, saw 3\nSkipping line 76645: expected 2 fields, saw 4\nSkipping line 80990: expected 2 fields, saw 3\nSkipping line 84991: expected 2 fields, saw 3\nSkipping line 87692: expected 2 fields, saw 3\nSkipping line 109950: expected 2 fields, saw 3\nSkipping line 117402: expected 2 fields, saw 3\nSkipping line 135443: expected 2 fields, saw 3\nSkipping line 148357: expected 2 fields, saw 3\nSkipping line 149057: expected 2 fields, saw 3\nSkipping line 166972: expected 2 fields, saw 3\nSkipping line 190165: expected 2 fields, saw 3\nSkipping line 212084: expected 2 fields, saw 3\nSkippi

vocabulary saved for session 102


b'Skipping line 2854: expected 2 fields, saw 3\nSkipping line 46799: expected 2 fields, saw 3\nSkipping line 58266: expected 2 fields, saw 3\nSkipping line 75384: expected 2 fields, saw 3\nSkipping line 96214: expected 2 fields, saw 3\nSkipping line 111944: expected 2 fields, saw 3\nSkipping line 112547: expected 2 fields, saw 4\nSkipping line 112548: expected 2 fields, saw 3\nSkipping line 125773: expected 2 fields, saw 3\nSkipping line 126979: expected 2 fields, saw 3\nSkipping line 163074: expected 2 fields, saw 3\nSkipping line 163168: expected 2 fields, saw 4\nSkipping line 163169: expected 2 fields, saw 3\nSkipping line 201479: expected 2 fields, saw 3\nSkipping line 216049: expected 2 fields, saw 4\nSkipping line 216050: expected 2 fields, saw 3\nSkipping line 227539: expected 2 fields, saw 3\n'


vocabulary saved for session 103


b'Skipping line 38128: expected 2 fields, saw 3\nSkipping line 60948: expected 2 fields, saw 4\n'


vocabulary saved for session 104
vocabulary saved for session 105


b'Skipping line 680: expected 2 fields, saw 3\n'


vocabulary saved for session 106


b'Skipping line 46543: expected 2 fields, saw 3\nSkipping line 47646: expected 2 fields, saw 3\n'


vocabulary saved for session 107


b'Skipping line 315: expected 2 fields, saw 3\nSkipping line 1177: expected 2 fields, saw 4\nSkipping line 1180: expected 2 fields, saw 3\nSkipping line 1602: expected 2 fields, saw 3\nSkipping line 1706: expected 2 fields, saw 3\nSkipping line 3800: expected 2 fields, saw 3\nSkipping line 4085: expected 2 fields, saw 4\nSkipping line 5539: expected 2 fields, saw 3\nSkipping line 23353: expected 2 fields, saw 3\nSkipping line 39940: expected 2 fields, saw 3\nSkipping line 41117: expected 2 fields, saw 3\nSkipping line 56606: expected 2 fields, saw 3\nSkipping line 64448: expected 2 fields, saw 3\nSkipping line 67966: expected 2 fields, saw 3\nSkipping line 72444: expected 2 fields, saw 3\nSkipping line 74706: expected 2 fields, saw 3\nSkipping line 82804: expected 2 fields, saw 3\nSkipping line 86682: expected 2 fields, saw 3\nSkipping line 94020: expected 2 fields, saw 3\nSkipping line 97727: expected 2 fields, saw 3\nSkipping line 98896: expected 2 fields, saw 3\nSkipping line 102205

vocabulary saved for session 108


b'Skipping line 7151: expected 2 fields, saw 3\nSkipping line 26149: expected 2 fields, saw 3\nSkipping line 32866: expected 2 fields, saw 3\n'


vocabulary saved for session 109


b'Skipping line 5787: expected 2 fields, saw 3\nSkipping line 29405: expected 2 fields, saw 3\nSkipping line 70043: expected 2 fields, saw 3\nSkipping line 194087: expected 2 fields, saw 4\n'


vocabulary saved for session 110


b'Skipping line 42302: expected 2 fields, saw 3\n'


vocabulary saved for session 111
vocabulary saved for session 112


b'Skipping line 90057: expected 2 fields, saw 3\n'


vocabulary saved for session 113


b'Skipping line 81801: expected 2 fields, saw 3\n'


vocabulary saved for session 114


In [None]:
#pip install session_info
#import session_info
#session_info.show()

In [None]:
# Create a combined vocabulary for all the sessions: the union of the
# session-specific vocabulary files, sorted alphabetically.

super_vocab = []  # one list of vocabulary entries per session

for i in range(97, 115):
    # dtype=str / na_filter=False: every line is a vocabulary entry and must
    # stay a string. With pandas' default NA parsing, entries such as "null" or
    # "nan" (possible once unigrams are allowed) would become float NaN and
    # break the sort below.
    v = pd.read_csv(os.path.join(save_dir, 'vocabulary_' + str(i) + '.txt'),
                    header=None,
                    dtype=str,
                    na_filter=False)
    v = v[0].tolist()
    super_vocab.append(v)  # append session-specific vocabulary

results_list = super_vocab  # list of lists
results_union = set().union(*results_list)  # set union of all session vocabularies
vocab_full = sorted(results_union)  # list, sorted alphabetically
# Complete vocabulary saved to save_dir.
np.savetxt(os.path.join(save_dir, 'vocabulary.txt'), vocab_full, fmt="%s")