# Preprocessing Pipeline Shared by other datasets
- similar to those in NYT, GoogleBooks, and COHA

In [2]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import os
import re
from tqdm import tqdm
tqdm.pandas()

In [3]:
import nltk
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

import spacy
nlp = spacy.load("en_core_web_sm")

In [4]:
data_path = '/zfs/projects/faculty/amirgo-management/congress/speeches_processed/'
df = pd.read_pickle(data_path + "total_mgmt_sents.pkl")

In [9]:
# explode the df
df_flat = df.explode('mgmt_sents').copy()
df_flat.reset_index(drop=True, inplace=True)
df_flat.to_pickle(data_path+"mgmt_sents_tagged_checkpoint.pkl")

## Unified Preprocessing Steps

In [29]:
# step 1:extract mgmt related sentences and tag them with usage
def extract_mgmt_sentences(sent):
    sent_lower = sent.lower()
    if "manage" in sent_lower or "managing" in sent_lower:
        return True
    else:
        return False

def classify_manage_forms_with_pos(sentence):
    verb_forms = {'manage', 'manages', 'managed','managing'}
    noun_forms = {'management', 'manager'}
    adj_forms = {'managerial', 'manageable','managing','managed'}

    # Tokenize and get POS tags
    words = word_tokenize(sentence)
    pos_tags = pos_tag(words)

    has_verb = False
    has_noun = False
    has_adj = False

    for word, tag in pos_tags:
        word_lower = word.lower()
        if word_lower in verb_forms and tag.startswith('VB'):
            has_verb = True
        elif word_lower in noun_forms and tag.startswith('NN'):
            has_noun = True
        elif word_lower in adj_forms and tag.startswith('JJ'):
            has_adj = True

    return has_verb, has_noun, has_adj
        
def mgmt_tagging(df):
    df['has_mgmt'] = df['mgmt_sents'].apply(extract_mgmt_sentences)
    df_selected = df[df['has_mgmt']==True].copy()
    df_selected[['has_verb', 'has_noun', 'has_adj']] = df_selected.progress_apply(lambda row: classify_manage_forms_with_pos(row['mgmt_sents']), axis=1, result_type='expand')
    print("Total Sentences: ", df.shape[0])
    print("Total MGMT Sentences:", df_selected.shape[0])
    return df_selected    


In [30]:
# step 2: basic filtering based on grammar information
def if_intransitive_verb_weaker(row):
    if_intransitive = False
    if row['has_verb']==True:
        sent = row['mgmt_sents']
        phrase_ls = ['manage to', 'manages to', 'managing to', 'managed to']
        for p in phrase_ls:
            if p in sent:
                if_intransitive = True
    return if_intransitive

# optional: noun phrases with mgmt conpounds
def if_comp_phrase(row):
    if_comp = False
    if row['has_verb']==True:
        sent = row['mgmt_sents']
        doc = nlp(sent)
        for token in doc:
            if "manag" in token.text and token.dep_ in ['amod', 'compound']:
                if_comp = True
    return if_comp
    
def basic_filtering(df):
    df['if_intransitive_verb'] = df.apply(lambda row: if_intransitive_verb_weaker(row), axis=1)
    df['if_comp_phrase'] = df.progress_apply(lambda row: if_comp_phrase(row), axis=1)
    # ratio of intransive verb
    print('Ratio of Intransitive verbs among all verbs:', df['if_intransitive_verb'].sum()/df['has_verb'].sum())
    print('Ratio of compound verbs among all verbs:', df['if_comp_phrase'].sum()/df['has_verb'].sum())
    return df

In [31]:
# step 3: dependency parsing
def extraction_management_noun(doc):
    modifier_dict = {}
    modifier_idx_dict = {}
    modifier_np_dict = {}
    for token in doc:
        if ("management" in token.text)| ("manager" in token.text):
            if token.text not in modifier_dict:
                modifier_dict[token.text] = []
                modifier_idx_dict[token.text] = []
                modifier_np_dict[token.text] = []
            children_text = []
            children_ids = []
            children = [child for child in token.children]
            children_text.extend([child.text for child in children if child.pos_ =="NOUN"])
            children_ids.extend([child.i for child in children if child.pos_ =="NOUN"])
            for child in token.children:
                if child.dep_ == "prep":
                    grand_children = [grand_child for grand_child in child.children]
                    children_text.extend([grand_child.text for grand_child in grand_children if grand_child.pos_ =="NOUN"])
                    children_ids.extend([grand_child.i for grand_child in grand_children if grand_child.pos_ =="NOUN"])
            modifier_dict[token.text].extend(children_text)
            modifier_idx_dict[token.text].extend(children_ids)

    # matching with noun phrase
    chunk_ids = {}
    for chunk in doc.noun_chunks:
        chunk_ids[chunk.text]=chunk.start
        
    for key in modifier_idx_dict:
        modifier_np_dict[key] = []
        for i in range(len(modifier_idx_dict[key])):
            item = modifier_idx_dict[key][i]
            original_text = modifier_dict[key][i]
            for chunk in chunk_ids:
                chunk_len = len(str(chunk).split())
                if item >= chunk_ids[chunk] and item<=(chunk_ids[chunk]+chunk_len):
                    modifier_np_dict[key].append(str(chunk))
            if len(modifier_np_dict[key])==0:
                modifier_np_dict[key].append(original_text)

    for key in modifier_np_dict:
        modifier_np_dict[key]=list(set(modifier_np_dict[key]))

    return modifier_dict, modifier_np_dict

def extract_subject_object_v2(doc):
    subj_obj_dict = {}
    for token in doc:
        if (token.text in ["managing", "manage","managed", "manages"]) and (token.pos_=="VERB"):
            if token.text not in subj_obj_dict:
                subj_obj_dict[token.text] = {"subj":[], "obj":[]}
            subjects = []
            objects = []
            subject_text = "NA"
            object_text = "NA"
            for child in token.children:
                if child.dep_  == "nsubj":
                    subjects.append(child)
                elif child.dep_  in ["nsubjpass","dobj","pobj"]:
                    objects.append(child)
                elif child.dep_ == "agent":
                    subjects.extend([grandchild for grandchild in child.children])
                    
            if subjects:
                subj = subjects[0]
                if subj.pos_ == "PRON":
                    subject_text = subj.text
                else:
                    subject_text = " ".join([t.text for t in subj.lefts] + [subj.text] + [t.text for t in subj.rights])
            if objects:
                obj = objects[0]
                if obj.pos_ == "PRON":
                    object_text = obj.text
                else:
                    object_text = " ".join([t.text for t in obj.lefts] + [obj.text] + [t.text for t in obj.rights])
            subj_obj_dict[token.text]["subj"].append(subject_text)
            subj_obj_dict[token.text]["obj"].append(object_text)
    return subj_obj_dict

In [32]:
# step 3.1 noun parsing pipeline
def noun_extraction_pipeline(df):
    noun_sent = list(df[(df['has_noun']==True)]['mgmt_sents'])
    noun_sent_idx = list(df[(df['has_noun']==True)].index)
    
    modifier_dict_ls = []
    modifier_np_dict_ls = []
    for sent in tqdm(noun_sent):
        sent = sent.lower()
        doc = nlp(sent)
        modifier_dict, modifier_np_dict = extraction_management_noun(doc)
        modifier_dict_ls.append(modifier_dict)
        modifier_np_dict_ls.append(modifier_np_dict)

    df['noun_has_modifier']=False
    for i in range(len(noun_sent_idx)):
        idx = noun_sent_idx[i]
        mdict = modifier_dict_ls[i]
        mlen = 0
        for key in mdict:
            mlen+=len(mdict[key])
        if mlen>0:
            df.loc[idx, 'noun_has_modifier'] = True

    # append modifiers as column
    df['modifiers']=[{} for _ in range(len(df))]
    df['modifiers_np']=[{} for _ in range(len(df))]
    for i in range(len(noun_sent_idx)):
        idx = noun_sent_idx[i]
        mdict = modifier_dict_ls[i]
        mnp_dict = modifier_np_dict_ls[i]
        df.at[idx, 'modifiers'] = mdict
        df.at[idx, 'modifiers_np'] = mnp_dict

    print('Ratio of nouns with modifiers among all nouns:', df['noun_has_modifier'].sum()/df['has_noun'].sum())
    return df

In [38]:
# step 3.2 verb parsing pipeline
def verb_extraction_pipeline(df):
    verb_sent = list(df[(df['has_verb']==True) & (df['if_intransitive_verb']==False)]['mgmt_sents'])
    verb_sent_idx = list(df[(df['has_verb']==True) & (df['if_intransitive_verb']==False)].index)

    #  extract sentences for WSD
    with open(data_path+f"verb_sent_wo_WSD.pkl","wb")as f:
        pickle.dump(verb_sent, f)
        
    subj_obj_dict_ls = []
    for sent in tqdm(verb_sent):
        doc = nlp(sent)
        subj_obj_dict = extract_subject_object_v2(doc)
        subj_obj_dict_ls.append(subj_obj_dict)
        
    # append modifiers as column
    df['verb_subj_obj']=[{} for _ in range(len(df))]
    for i in range(len(verb_sent_idx)):
        idx = verb_sent_idx[i]
        so_dict = subj_obj_dict_ls[i]
        df.at[idx, 'verb_subj_obj'] = so_dict

    # save the final outcome
    df.to_pickle(data_path+f"mgmt_sents_tagged_checkpoint.pkl")
    return df

## Main code

In [39]:
df = pd.read_pickle(data_path+"mgmt_sents_tagged_checkpoint.pkl")
df = mgmt_tagging(df)
df = basic_filtering(df)
df = noun_extraction_pipeline(df)
df = verb_extraction_pipeline(df)

100%|██████████| 60010/60010 [07:40<00:00, 130.38it/s]


In [45]:
df['mgmt_sents'].at[2]

'and bring better understanding of the management of illness in the family.'

## WSD

In [49]:
# last step: extract sentences for WSD
def wsd_tag_pipeline(output_path):
    # read from WSD results
    with open(output_path+f"predictions_wsd.pkl","rb")as f:
        prediction = pickle.load(f)
    with open(output_path+f"confidence_scores_wsd.pkl","rb")as f:
        confidence = pickle.load(f)

    df = pd.read_pickle(output_path+f"mgmt_sents_tagged_checkpoint.pkl")

    verb_sent = list(df[(df['has_verb']==True) & (df['if_intransitive_verb']==False)]['mgmt_sents'])
    verb_sent_idx = list(df[(df['has_verb']==True) & (df['if_intransitive_verb']==False)].index)
    
    # ensure the mapping
    assert len(verb_sent)==len(prediction)
        
    df['WSD_pred'] = 999
    df['WSD_conf'] = 999

    # Create a temporary DataFrame with the new values
    updates = pd.DataFrame({'WSD_pred': prediction, 'WSD_conf': confidence}, index=verb_sent_idx)

    # Update the main DataFrame using the 'update' method
    df.update(updates)
    df.to_pickle(output_path+f"total_mgmt_sent_tagged.pkl")
    return 

In [50]:
wsd_tag_pipeline(data_path)

  df.update(updates)


In [51]:
test = pd.read_pickle(data_path+"total_mgmt_sent_tagged.pkl")

In [52]:
test

Unnamed: 0,speech_id,speech,mgmt_sents,has_mgmt,has_verb,has_noun,has_adj,if_intransitive_verb,if_comp_phrase,noun_has_modifier,modifiers,modifiers_np,verb_subj_obj,WSD_pred,WSD_conf
0,940000353,Mr. Speaker. congressional reform suffered a c...,These actionsparticularly the return of the fa...,True,False,True,False,False,False,True,{'management': ['business']},{'management': ['the public business']},{},999.0,999.000000
1,940000387,Mr. Speaker. like other Members I have spent m...,Nearly half of .the firms belonging to the Nat...,True,False,True,False,False,False,False,{'management': []},{'management': []},{},999.0,999.000000
2,940000429,Mr. Speaker. I am submitting to the Congress t...,and bring better understanding of the manageme...,True,False,True,False,False,False,True,{'management': ['illness']},{'management': ['illness']},{},999.0,999.000000
3,940000441,Mr. Speaker. I have today introduced legislati...,the subcommittee discovered that Federal manag...,True,False,False,True,False,False,False,{},{},{},999.0,999.000000
4,940000467,Mr. Speaker. it is with much pleasure that I t...,managed to raise enough money to donate 28 Sun...,True,True,False,False,True,False,False,{},{},{},999.0,999.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
405110,920321197,Mr. President. the Kansas City Federal Regiona...,The integrated grant administration program is...,True,False,True,False,False,False,True,{'management': ['budget']},{'management': ['budget']},{},999.0,999.000000
405111,920321198,Mr. President. among those present to comment ...,Both have been selected to furnish technical m...,True,False,True,False,False,False,False,{'management': []},{'management': []},{},999.0,999.000000
405112,920321229,Mr. President. one of the most active and succ...,one of the most active and successful construc...,True,True,False,False,False,False,False,{},{},"{'managed': {'subj': ['one family'], 'obj': ['...",1.0,0.999845
405113,920321231,Mr. President. the Subcommittee on Air and Wat...,solid waste management and resource recovery.,True,False,True,False,False,False,True,"{'management': ['waste', 'resource']}",{'management': ['solid waste management and re...,{},999.0,999.000000
