# Preprocessing Pipeline
- select congressional speech sentences that contain certain objects
- sentences include human or nonhuman objects, but do not necessarily contain management expressions
- a similar preprocessing pipeline should be applied to other datasets

In [1]:
import pickle
import numpy as np
import random
import os
import re
import os
import re
import pandas as pd
import nltk
from nltk.corpus import words
from tqdm import tqdm
tqdm.pandas()
from multiprocessing import Pool, cpu_count


In [2]:
import spacy
# Small English spaCy pipeline; the lemma, part-of-speech and dependency
# attributes of its tokens are used by the filtering helpers in this notebook.
nlp = spacy.load("en_core_web_sm")
# NLTK English word list, stored as a set for fast membership tests in the
# OCR quality check.
# NOTE(review): presumably requires the NLTK "words" corpus to be downloaded
# beforehand — confirm on a fresh environment.
english_words = set(words.words())

## Convert individual file to CSV
- copied from the txt_to_csv notebook (ipynb)
- note that we did not use those filtering criteria to select sentences containing objects

In [3]:
def txt_to_dataframe(filepath):
    """Parse a pipe-delimited speech file into a two-column DataFrame.

    Every usable line has the form ``speech_id|speech``. The header line
    (whose first field is ``speech_id``) and lines without any ``|`` are
    skipped.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A DataFrame with string columns ``speech_id`` and ``speech``; it is
        empty (but keeps both columns) when no line qualifies.
    """
    rows = []

    # Undecodable bytes are dropped rather than aborting the whole file.
    with open(filepath, 'r', errors='ignore') as file:
        for line in file:
            # Split on the FIRST '|' only: a speech whose text itself
            # contains '|' would otherwise produce more than two parts and
            # be silently discarded.
            parts = line.strip().split('|', 1)
            if len(parts) == 2 and parts[0] != 'speech_id':
                rows.append(parts)

    return pd.DataFrame(rows, columns=['speech_id', 'speech'])

# Initial Filtering

In [4]:
# extract sentences containing given objects, this would be a coarse filtering
# there might be chances that a given word is not in the sentence (eg, want parent but only get apparent)
def sentence_extract(text, object_ls):
    """Return ``(word, sentence)`` pairs for sentences mentioning any object word.

    Matching is a case-insensitive substring test, so it is deliberately
    coarse (e.g. "parent" also matches "apparent"); a stricter check is
    expected to run afterwards.

    Args:
        text: Speech text. Anything that is not a non-empty string (e.g. a
            float NaN or None standing in for a missing value) yields ``[]``.
        object_ls: Lower-case words to look for.

    Returns:
        A list of ``(word, sentence)`` tuples, one per matching word per
        sentence, with runs of spaces in the sentence collapsed to one.
    """
    # Missing values cannot be tokenized; treat them as "no sentences"
    # instead of letting the tokenizer raise.
    if not isinstance(text, str) or not text:
        return []

    relevant_sents = []
    for sent in nltk.sent_tokenize(text):
        sent_lower = sent.lower()
        matched = [word for word in object_ls if word in sent_lower]
        if matched:
            # Normalise spacing once per sentence rather than once per match.
            sent = re.sub(' +', ' ', sent)
            relevant_sents.extend((word, sent) for word in matched)
    return relevant_sents

# Further Filtering

In [5]:
# helper functions
def lemmatize_words(words):
    """Map each input word to the lemma spaCy assigns to it.

    Each word is run through the module-level ``nlp`` pipeline on its own.
    If spaCy splits a word into several tokens, the lemma of the last token
    is the one kept; a word that yields no tokens gets no entry.

    Args:
        words: Iterable of word strings.

    Returns:
        dict of ``{word: lemma}``.
    """
    word_to_lemma = {}
    for entry in words:
        parsed = nlp(entry)
        # Repeated keys in the comprehension leave the last token's lemma.
        word_to_lemma.update({entry: tok.lemma_ for tok in parsed})
    return word_to_lemma

In [6]:
def mask_token_in_text(doc, token_id):
    """Rebuild the text of ``doc`` with one token replaced by ``[MASK]``.

    Args:
        doc: Iterable of tokens exposing ``.i`` (position) and ``.text``.
        token_id: Position of the token to mask.

    Returns:
        The token texts joined by single spaces, with the token whose ``.i``
        equals ``token_id`` swapped for the literal ``[MASK]``.
    """
    pieces = ['[MASK]' if tok.i == token_id else tok.text for tok in doc]
    return ' '.join(pieces)

# Further filtering: drop sentences that do not really contain the target word,
# and require the detected occurrence to be a noun
def sentence_structure_check(word, text, lemma_dict):
    """Locate the first noun occurrence of ``word`` in ``text`` and describe it.

    Args:
        word: Object word; must be a key of ``lemma_dict``.
        text: Sentence to parse with the module-level ``nlp`` pipeline.
        lemma_dict: Mapping from object word to its lemma.

    Returns:
        A 3-tuple ``(found, object_info, structure_info)``:
        * ``found`` - True when a token with the target lemma and POS
          ``NOUN`` exists.
        * ``object_info`` - ``(lemma, token_index, masked_sentence)``, or
          ``("NA", "NA", "NA")`` when nothing was found.
        * ``structure_info`` - ``(if_VO, if_SV, head_verb)``; ``head_verb``
          is the lemma of the governing verb, or ``"NA"`` when neither
          structure applies.

    Raises:
        KeyError: if ``word`` is missing from ``lemma_dict``.
    """
    target_lemma = lemma_dict[word]
    doc = nlp(text)

    # Only the first matching noun is considered.
    hit = next(
        (tok for tok in doc if tok.lemma_ == target_lemma and tok.pos_ == "NOUN"),
        None,
    )
    if hit is None:
        return False, ("NA", "NA", "NA"), (False, False, "NA")

    headed_by_verb = hit.head.pos_ == "VERB"
    # verb-object: the noun is an object whose head is a verb
    if_VO = headed_by_verb and hit.dep_ in ['dobj', 'pobj', 'iobj']
    # subject-verb: the noun is some kind of subject whose head is a verb
    if_SV = headed_by_verb and 'subj' in hit.dep_
    head_verb = hit.head.lemma_ if (if_VO or if_SV) else "NA"

    object_masked_sent = mask_token_in_text(doc, hit.i)
    return True, (hit.lemma_, hit.i, object_masked_sent), (if_VO, if_SV, head_verb)

In [7]:
# worker-side wrapper: runs the structure check on every (word, sent) pair of a chunk
def apply_parallel(df_group):
    """Add a ``result`` column holding the structure check of each row.

    Args:
        df_group: DataFrame chunk with a ``relevant_sent`` column of
            ``(word, sentence)`` pairs.

    Returns:
        The same DataFrame object, with ``result`` set to the output of
        ``sentence_structure_check`` (looked up against the module-level
        ``total_lemma_dict``).
    """
    def check_pair(pair):
        matched_word, sentence = pair[0], pair[1]
        return sentence_structure_check(matched_word, sentence, total_lemma_dict)

    df_group['result'] = df_group['relevant_sent'].apply(check_pair)
    return df_group

def parallelize_dataframe(df, func, n_chunks, ncores):
    """Apply ``func`` to row-wise chunks of ``df`` in a process pool.

    Args:
        df: DataFrame to process.
        func: Picklable callable taking a DataFrame chunk and returning a
            DataFrame.
        n_chunks: Number of chunks to split ``df`` into (chunks may be empty
            when ``n_chunks`` exceeds the row count).
        ncores: Number of worker processes.

    Returns:
        The concatenation of the per-chunk results, in the original chunk
        order.
    """
    # Split by row position instead of calling np.array_split on the
    # DataFrame itself, which goes through the deprecated
    # DataFrame.swapaxes. Chunk sizes are the same as before.
    row_groups = np.array_split(np.arange(len(df)), n_chunks)
    df_split = [df.iloc[rows] for rows in row_groups]
    results = []

    # The context manager terminates the workers if anything below raises,
    # so a failing chunk no longer leaves a live pool behind.
    with Pool(ncores) as pool:
        with tqdm(total=len(df_split)) as pbar:
            # imap (ordered) keeps the output row order reproducible while
            # still yielding results one at a time for the progress bar.
            for result in pool.imap(func, df_split):
                results.append(result)
                pbar.update(1)
        pool.close()
        pool.join()

    return pd.concat(results)

In [None]:
# sample sentences that have at least 10 tokens
def sent_length(sent):
    """Return how many pieces ``sent`` has when split on single spaces.

    Consecutive spaces produce empty pieces that are counted too, and an
    empty string counts as one piece.
    """
    return sent.count(" ") + 1

# check ocr quality
def ocr_quality_check(text):
    """Return the share of tokens in ``text`` that are not in ``english_words``.

    Tokens are separated by any run of whitespace, so leading, trailing or
    repeated whitespace does not create empty tokens that would be counted
    as non-English.

    Args:
        text: Sentence to score.

    Returns:
        A ratio between 0 and 1; ``1`` when ``text`` has no tokens at all.
    """
    tokens = text.split()
    # No tokens means nothing recognisable: score it as the worst quality.
    if not tokens:
        return 1
    # NOTE(review): the lookup is case- and punctuation-sensitive, so tokens
    # such as "pain." are counted as non-English — confirm this is intended.
    non_english_count = sum(1 for token in tokens if token not in english_words)
    return non_english_count / len(tokens)

# remove special tokens/punctuations except for comma, period, and question mark
def remove_special_tokens(text):
    """Strip every symbol other than word characters, whitespace and ``, . ?``.

    Removed characters are deleted (not replaced by a space), so "don't"
    becomes "dont"; runs of spaces left behind by removed stand-alone
    symbols are collapsed to a single space.

    Args:
        text: Sentence to clean.

    Returns:
        The cleaned sentence.
    """
    # The underscore is a word character for the regex engine but is not a
    # letter or digit, so it is removed explicitly.
    cleaned = re.sub(r"[^\w\s,.?]|_", "", text)
    return re.sub(r" +", " ", cleaned)

In [None]:
def df_formatting(df):
    """Flatten the nested ``result`` and ``relevant_sent`` tuples into columns.

    Args:
        df: DataFrame with a ``result`` column of
            ``(selected, (object, id, masked), (vo, sv, head_verb))`` tuples
            and a ``relevant_sent`` column of ``(word, sentence)`` pairs.

    Returns:
        The same DataFrame object with the columns ``if_selected``,
        ``object``, ``object_mask``, ``sent_unmask``, ``if_vo``, ``if_sv``
        and ``head_verb`` added.
    """
    # (new column, source column, extractor), listed in creation order
    layout = [
        ('if_selected', 'result', lambda res: res[0]),
        ('object', 'result', lambda res: res[1][0]),
        ('object_mask', 'result', lambda res: res[1][2]),
        ('sent_unmask', 'relevant_sent', lambda pair: pair[1]),
        ('if_vo', 'result', lambda res: res[2][0]),
        ('if_sv', 'result', lambda res: res[2][1]),
        ('head_verb', 'result', lambda res: res[2][2]),
    ]
    for target, source, pick in layout:
        df[target] = df[source].apply(pick)
    return df

# Running with data

In [None]:
# Directory holding the raw speech text files. The trailing slash matters:
# file paths are built by plain string concatenation.
data_path = "/zfs/projects/faculty/amirgo-management/congress/speeches/"
# Directory the processed pickles are written to (same trailing-slash convention).
processed_path = "/zfs/projects/faculty/amirgo-management/congress/speeches_processed/"

In [None]:
# Words used for the coarse substring match on sentences. The lists may be
# incomplete with respect to plural forms, so some irregular plurals are
# listed explicitly.
mind_list=['anger','stress','pain', 'emotion','expectation','anxiety','anxieties','trust','feeling','grief',
           'happiness', 'sadness', 'fear', 'disgust', 'surprise', 'shame', 'guilt','love','joy', 
           'despair','disappointment','excitement']
# 'medication' previously carried a stray leading comma (',medication'), so it
# only matched text where a comma directly preceded the word.
body_list=['weight','health','care','disease','illness','diabetes','medication','nutrition','addiction']
relation_list = ['jealousy', 'envy', 'compassion', 'empathy', 'relationship','friendship', 'leadership','hostility', 'rejection', 'recognition',
                 'rivalry', 'conformity', 'conflict', 'status', 'authority', 'legitimacy', 'popularity', 'disagreement', 'dissent',
                 'interaction', 'communication', 'collaboration', 'coordination', 'cooperation', 'competition', 'conversation',
                 'intimacy','responsibility']
# change to human list not conducted in the current version of code
human_list = ['parent', 'child', 'kid','sibling','brother','sister','mother','father',
              'mom','dad','uncle','aunt','husband','wife','wives','spouse','fiance','fiancee','lover','friend', 'enemy','enemies',
              'son','daughter','nephew','niece','cousin','neighbour','colleague','classmate','roommate']

human_pub_list = ['professor','teacher','student','doctor','nurse','patient','priest',
                  'rabbi','pastor','lawyer','officer','prisoner','inmate']

# Full search vocabulary, plus the word -> lemma lookup used by the stricter
# noun check.
total_ls = mind_list+body_list+relation_list+human_list+human_pub_list
total_lemma_dict = lemmatize_words(total_ls)

In [None]:
# apply to all years
# Keep the files whose name has the shape "<prefix>_<number>.<ext>" with a
# number of at least 81.
# NOTE(review): the number is presumably the congress session — confirm.
files = os.listdir(data_path)
selected_files = []
for f in files:
    try:
        file_number = int(f.split("_")[1].split(".")[0])
    except (IndexError, ValueError):
        # Stray entries (hidden files, differently named files) are skipped
        # instead of aborting the whole scan.
        continue
    if file_number >= 81:
        selected_files.append(f)
# Sort so the files are processed in a stable, name-based order.
selected_files.sort()

In [None]:
def single_year_processing_pipeline(filename, n_chunks=1000, ncores=8):
    """Run the full sentence-selection pipeline on one raw speech file.

    Steps: load the file, keep sentences that coarsely match any word in
    ``total_ls``, run the noun/structure check in parallel, keep the rows
    that pass it, add two quality measures, and pickle the result to
    ``processed_path`` as ``<file_id>_obj_sents.pkl``.

    Args:
        filename: Name of the file inside ``data_path``.
        n_chunks: Number of chunks handed to the worker pool.
        ncores: Number of worker processes.

    Returns:
        None. The output is written to disk.
    """
    file_id = filename.split(".")[0]
    df = txt_to_dataframe(data_path+filename)

    # initial (coarse) filtering: one row per (word, sentence) match
    df['relevant_sent'] = df['speech'].apply(lambda x: sentence_extract(x,total_ls))
    df = df[df['relevant_sent'].astype(bool)]  # drop speeches without any match
    df = df.explode('relevant_sent')
    df.reset_index(inplace=True, drop=True)
    # TODO: add formatting: remove special punctuations other than normal ones

    # further filtering: the target word must occur as a noun
    df = parallelize_dataframe(df, apply_parallel, n_chunks, ncores)
    df = df_formatting(df)
    # Explicit copy so the column changes below act on an independent frame
    # rather than on a slice of the unfiltered one.
    df = df[df['if_selected']==True].copy()
    df.drop(columns=['relevant_sent', 'result','speech'], inplace=True)
    df.reset_index(inplace=True, drop=True)

    # measures used later to select sentences with higher quality
    df['sent_length'] = df['sent_unmask'].apply(sent_length)
    df['noneng_ratio'] = df['sent_unmask'].apply(ocr_quality_check)
    df.to_pickle(processed_path+file_id+"_obj_sents.pkl")

In [None]:
# Process every selected file. The first file is included as well: the
# concatenation step reads one pickle per entry of selected_files, so
# skipping it here would make a fresh run fail on the missing output.
for f in selected_files:
    single_year_processing_pipeline(f)

In [None]:
# concat all dfs
# Load the per-file outputs and stack them into one frame with a fresh
# 0..n-1 index, then store the combined frame.
all_dfs = [
    pd.read_pickle(processed_path + f.split(".")[0] + "_obj_sents.pkl")
    for f in selected_files
]
all_df = pd.concat(all_dfs, ignore_index=True)
all_df.to_pickle(processed_path + "congress_total_obj_sents.pkl")


In [None]:
# save object list for fill mask prediction
# Sorted so the saved list has the same order on every run; iterating a set
# of strings directly gives an order that can differ between Python processes.
all_objects = sorted(set(all_df['object']))
# save
with open(processed_path + "human_nonhuman_masked_objects.pkl", 'wb') as f:
    pickle.dump(all_objects, f)