# Fillmask Result Analysis (Old version)
- after inference via fillmask_fix_target.py, analyse the results
- calculate metaphor via log ratio
- longitudinal trend plot
- (not sure) Document level analysis to avoid file overweight

In [1]:
import pandas as pd
import numpy as np
import pickle
import os
import time
import math
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()

In [23]:
import statsmodels.api as sm
from statsmodels.formula.api import glm
from statsmodels.formula.api import ols

In [28]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
model_name = "MacBERTh"
model_path = f"/zfs/projects/faculty/amirgo-management/BERT/{model_name}"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load Dataset

In [30]:
data_path = "/zfs/projects/faculty/amirgo-management/congress/speeches_processed/"
df = pd.read_pickle(data_path + f"object_193_target_fillmask_{model_name}_0728.pkl")

In [31]:
# before calculating result, remove object_target_pred_prob is an empty list
df = df[df['object_target_pred_prob'].astype(bool)]
df.reset_index(inplace=True, drop=True)

In [32]:
def congress_to_year(speech_id):
    speech_id = str(speech_id)
    if len(speech_id)==9:
        congress_number = speech_id[:2]
    elif len(speech_id)==10:
        congress_number = speech_id[:3]
    # convert sequence to year
    year = 1789 + ((int(congress_number) - 1) * 2)
    return year
    
df['year'] = df['speech_id'].apply(congress_to_year)

In [33]:
# load entity lists; convert to ids
def gen_token_idx(total_objects, tokenizer = tokenizer):
    filtered_objects = []
    bert_vocab = set(tokenizer.get_vocab().keys())
    for word in total_objects:
        if word in bert_vocab:
            filtered_objects.append(word)
    filtered_objects_idx = [tokenizer.encode(token, add_special_tokens=False)[0] for token in filtered_objects]
    return filtered_objects, filtered_objects_idx

with open(data_path + "human_nonhuman_masked_objects.pkl", 'rb') as f:
    original_objects = pickle.load(f)
# a more ambiguous list
managing_objects = [
    'manager', 'employer', 'director', 'executive', 'chairman', 'leader','controller',  
    'investment', 'capital', 'budget', 'money', 'finance', 'fund', 'estate', 'property', 'equity', 'profit', 'margin',
    'revenue', 'income', 'salary', 'wage', 'pay', 'compensation', 'expense', 'cost', 'price', 'fee', 'charge','payment', 
    'bill', 'account', 'balance', 'credit', 'loan', 'mortgage', 'interest', 'tax', 'taxation', 'liability',
    'worker', 'labor', 'staff', 'personnel', 'subordinate', 'intern', 
    'business', 'commerce', 'corporation', 'firm', 'company', 'industry', 'market', 'economy', 'enterprise', 'trade', 'organization',
    'project', 'task', 'initiative', 'campaign', 'program', 
    'product', 'merchandise', 'commodity', 'goods', 'service', 'offering','brand','franchise', 'patent',
    'client', 'consumer', 'buyer', 'seller', 'consumption',
    'contract', 'agreement', 'deal', 'transaction', 'sale', 'purchase',
    'profession', 'job', 'occupation', 'career', 'position', 'duty', 'responsibility', 'obligation',
    'plan', 'solution', 'innovation', 'acquisition','negotiation', 'operation', 'production'
]

random_objects = ['adoption', 'aerial', 'agricultural', 'amtrak', 'announcements', 'antenna', 'brave', 'cadet', 'captures', 'carroll',
                   'champaign', 'charley', 'ecosystem', 'excuses', 'exit', 'french', 'freshman', 'goal', 'headache', 'inter', 'knock',
                     'liberty', 'lifeboat', 'london', 'manifest', 'mrs', 'multimedia', 'narcotics', 'nitrate', 'orr', 'ow', 'parliamentary', 
                     'plantation', 'proof', 'protect', 'provider', 'ready', 'reese', 'revolutionaries', 'ribbons', 'san', 'sanders', 
                     'satisfaction', 'scope', 'series', 'sucker', 'superstructure', 'whig', 'whiskey']

total_objects = managing_objects + random_objects + original_objects
total_objects, total_object_idx = gen_token_idx(total_objects) # filter out non-bert vocab

def token_to_idx(token, total_objects = total_objects):
    return total_objects.index(token)

def idx_to_token(idx, total_objects = total_objects):
    return total_objects[idx]

In [34]:
# set up managing object groups
asset_words = ['investment', 'capital', 'budget', 'money', 'finance', 'fund', 'estate', 'property', 'equity', 'profit', 'margin',
    'revenue', 'income', 'salary', 'wage', 'pay', 'compensation', 'expense', 'cost', 'price', 'fee', 'charge','payment', 
    'bill', 'account', 'balance', 'credit', 'loan', 'mortgage', 'interest', 'tax', 'taxation', 'liability']
person_words = ['worker', 'labor', 'staff', 'personnel', 'subordinate', 'intern',  'client', 'consumer', 'buyer', 'seller', 'patron']
business_words = ['business', 'commerce', 'corporation', 'firm', 'company', 'industry', 'market', 'economy', 'enterprise', 'trade', 'organization',
    'project', 'task', 'initiative', 'campaign', 'program']
product_words = ['product', 'merchandise', 'commodity', 'goods', 'service', 'offering','brand','franchise', 'patent']
manager_words = ['manager', 'employer', 'director', 'executive', 'chairman', 'leader','controller']
operation_words = ['contract', 'agreement', 'deal', 'transaction', 'sale', 'purchase',
    'profession', 'job', 'occupation', 'career', 'position', 'duty', 'responsibility', 'obligation',
    'plan', 'solution', 'innovation', 'acquisition','negotiation', 'operation', 'production']

# set up private categories
mind_list=['anger','stress','pain', 'emotion','expectation','anxiety','trust','feeling','grief',
           'happiness', 'sadness', 'fear', 'disgust', 'surprise', 'shame', 'guilt','love','joy', 'passion', 'zeal',
           'despair','disappointment','excitement']
body_list=['weight','health','care','disease','illness','diabetes',',medication','nutrition','addiction']
relation_list =  ['jealousy', 'envy', 'compassion', 'empathy', 'affection', 'appreciation', 'desire','hope','relationship','friendship',
'support','confidence', 'enthusiasm','sympathy','respect','admiration','integrity','loyalty','devotion','commitment','solidarity',
'rejection', 'recognition', 'rivalry',  'conflict', 'status', 'disagreement', 'dissent','intimacy','responsibility']
human_list = ['parent', 'child', 'kid','sibling','brother','sister','mother','father',
              'mom','dad','uncle','aunt','husband','wife','spouse','partner','fiance','fiancee','lover','friend', 
              'son','daughter','nephew','niece','cousin','neighbour']

# Calculate Probability
- overall, this measurement is accurate but not precise; meaning that while it captures the pattern we want, it also introduces noise

In [35]:
# get the probability of original object
def get_orig_object_prob(row):
    obj = row['object']
    prob = row['object_target_pred_prob']
    return prob[token_to_idx(obj)]

# Optional: compare with random scores
def get_group_prob(prob, words):
    group_prob = []
    for word in words:
        if word not in total_objects:
            continue
        idx = token_to_idx(word)
        group_prob.append(prob[idx])
    return np.mean(group_prob)

def gen_orig_group_prob(row):
    obj = row['object']
    if obj in human_list:
        return row['human_group_prob']
    elif obj in mind_list:
        return row['mind_group_prob']
    elif obj in body_list:
        return row['body_group_prob']
    elif obj in relation_list:
        return row['relation_group_prob']
    else:
        return 0

# measurement of metaphority with all sentences (including literal and lack of context)
def overall_metaphority_score_all(row, method='orig_ratio'):
    metaphor_score = []
    for obj in managing_objects:
        metaphor_score.append(row[f'{obj}_{method}'])

    # get the max score
    max_score = max(metaphor_score)
    max_obj = managing_objects[metaphor_score.index(max_score)]
    return (max_score, max_obj)

# measurement of metaphority with all sentences (including literal and lack of context)
def overall_metaphority_score_group_group(row, method='orig_group_ratio'):
    metaphor_score = []
    managing_objects_group = ['asset_group', 'person_group', 'business_group', 'product_group', 'manager_group', 'operation_group']
    for obj in managing_objects_group:
        metaphor_score.append(row[f'{obj}_{method}'])

    # get the max score
    max_score = max(metaphor_score)
    max_obj = managing_objects_group[metaphor_score.index(max_score)]
    return (max_score, max_obj)

# a pipeline for measurement
def gen_individual_ratio(df):
    # predicted object probability
    for word in tqdm(managing_objects):
        idx = token_to_idx(word)
        df[f'{word}_prob'] = df.apply(lambda x: x['object_target_pred_prob'][idx], axis=1)

    # compared to original object
    df['orig_prob'] = df.apply(lambda row: get_orig_object_prob(row), axis=1)

    # individual ratio
    for word in tqdm(managing_objects):
        df[f'{word}_orig_ratio'] = np.log(df[f'{word}_prob']/df['orig_prob'])

    # overall metaphority score
    df[['overall_metaphor_score','metaphor_obj_max']] = df.apply(overall_metaphority_score_all, axis=1, result_type='expand')
    return df

def gen_subgroup_ratio(df):
    df['asset_group_prob'] = df['object_target_pred_prob'].apply(lambda x: get_group_prob(x, asset_words))
    df['person_group_prob'] = df['object_target_pred_prob'].apply(lambda x: get_group_prob(x, person_words))
    df['business_group_prob'] = df['object_target_pred_prob'].apply(lambda x: get_group_prob(x, business_words))
    df['product_group_prob'] = df['object_target_pred_prob'].apply(lambda x: get_group_prob(x, product_words))
    df['manager_group_prob'] = df['object_target_pred_prob'].apply(lambda x: get_group_prob(x, manager_words))
    df['operation_group_prob'] = df['object_target_pred_prob'].apply(lambda x: get_group_prob(x, operation_words))

    df['human_group_prob'] = df['object_target_pred_prob'].apply(lambda x: get_group_prob(x, human_list))
    df['mind_group_prob'] = df['object_target_pred_prob'].apply(lambda x: get_group_prob(x, mind_list))
    df['body_group_prob'] = df['object_target_pred_prob'].apply(lambda x: get_group_prob(x, body_list))
    df['relation_group_prob'] = df['object_target_pred_prob'].apply(lambda x: get_group_prob(x, relation_list))
    df['orig_group_prob'] = df.apply(gen_orig_group_prob, axis=1)
    df = df[df['orig_group_prob']!=0]

    df['asset_group_orig_group_ratio'] = np.log(df['asset_group_prob']/df['orig_group_prob'])
    df['person_group_orig_group_ratio'] = np.log(df['person_group_prob']/df['orig_group_prob'])
    df['business_group_orig_group_ratio'] = np.log(df['business_group_prob']/df['orig_group_prob'])
    df['product_group_orig_group_ratio'] = np.log(df['product_group_prob']/df['orig_group_prob'])
    df['manager_group_orig_group_ratio'] = np.log(df['manager_group_prob']/df['orig_group_prob'])
    df['operation_group_orig_group_ratio'] = np.log(df['operation_group_prob']/df['orig_group_prob'])

    df[['overall_metaphor_score_group_group','metaphor_obj_group_group_max']] = df.apply(overall_metaphority_score_group_group, axis=1, result_type='expand', method='orig_group_ratio')
    return df


def gen_allgroup_ratio(df):
    df['private_group_prob'] = df['object_target_pred_prob'].apply(lambda x: get_group_prob(x, mind_list+body_list+relation_list+human_list))
    df['all_mgmt_group_prob'] = df['object_target_pred_prob'].apply(lambda x: get_group_prob(x, managing_objects))
    df['all_mgmt_group_private_group_ratio'] = np.log(df['all_mgmt_group_prob']/df['private_group_prob'])
    return df


In [36]:
# run the pipeline
print("Start generating ratios")
start = time.time()

# select only when original object is in list
df = df[df['object'].isin(total_objects)].reset_index(drop=True)
df = gen_individual_ratio(df)
df = gen_subgroup_ratio(df)
df = gen_allgroup_ratio(df)
print("Finish generating ratios")
print(f"Time spent: {time.time()-start}")

# save the result
df.to_pickle(data_path + f"object_193_target_fillmask_{model_name}_0728_w_measure.pkl")

Start generating ratios


 58%|█████▊    | 56/97 [20:40<21:41, 31.74s/it]

In [2]:
data_path = "/zfs/projects/faculty/amirgo-management/congress/speeches_processed/"
model_name = "MacBERTh"
df = pd.read_pickle(data_path + f"object_193_target_fillmask_{model_name}_0728_w_measure.pkl")

In [3]:
df.head()

Unnamed: 0,speech_id,if_selected,object,object_mask,sent_unmask,if_vo,if_sv,head_verb,sent_length,noneng_ratio,...,person_group_orig_group_ratio,business_group_orig_group_ratio,product_group_orig_group_ratio,manager_group_orig_group_ratio,operation_group_orig_group_ratio,overall_metaphor_score_group_group,metaphor_obj_group_group_max,private_group_prob,all_mgmt_group_prob,all_mgmt_group_private_group_ratio
0,810000004,True,recognition,The Chair notes that the senior Senator from K...,The Chair notes that the senior Senator from K...,False,False,,14,0.428571,...,-3.967595,-0.431037,-1.187556,0.360611,-1.107556,1.298948,asset_group,2.8e-05,0.000175,1.839491
1,810000004,True,recognition,would like to say to the Senator from Kentucky...,would like to say to the Senator from Kentucky...,False,False,,28,0.214286,...,-6.14979,-1.129848,-1.83941,-1.503196,0.244393,0.244393,operation_group,7e-06,8e-06,0.097796
2,810000004,True,recognition,There will always be the [MASK] of the fact th...,There will always be the recognition of the fa...,False,False,,22,0.136364,...,-12.25199,-11.045845,-9.408712,-10.600413,-7.904203,-7.904203,operation_group,0.014982,1.3e-05,-7.017233
3,810000004,True,recognition,There will always be [MASK] of Integrity witho...,There will always be recognition of Integrity ...,False,False,,27,0.148148,...,-1.636365,-0.189509,0.870945,-1.061061,0.987846,0.987846,operation_group,4.1e-05,1.5e-05,-0.98026
4,810000004,True,recognition,There will always be [MASK] of difficult achie...,There will always be recognition of difficult ...,False,False,,44,0.090909,...,-5.47211,-2.852627,-1.80435,-3.454914,-3.601016,-1.80435,product_group,1.1e-05,2e-06,-1.482826


In [13]:
df.to_pickle(data_path + f"object_234_target_fillmask_{model_name}_0728_w_measure.pkl")

# Show Examples

In [13]:
def top_k_frames(prob, k):
    top_frame_dict = {}
    top_k_idx = np.argsort(prob)[::-1][:k]
    for idx in top_k_idx:
        top_frame_dict[idx_to_token(idx)] = prob[idx]
    return top_frame_dict

def show_examples(original_object=None, df=df):
    if original_object!=None:
        df = df[df['object']==original_object]
    df = df.sort_values(by=f'overall_metaphor_score', ascending=False)
    top_examples = df.head(10)

    # print out examples
    for idx, row in top_examples.iterrows():
        top_frame_dict = top_k_frames(row['object_target_pred_prob'], 5)
        print('Speech: ', row['speech_id'])
        print('Sentence: ', row['object_mask'])
        print('Original object:', row['object'])
        print('Metaphor object:', row['metaphor_obj_max'])
        
        print('Metaphor score:', row['overall_metaphor_score'])
        print('Other top frames :', top_frame_dict)
        print("----------------------------------")
    return 

In [14]:
# human_list = ['child', 'kid','parent']
# df_selected = df[df['object'].isin(human_list)]
# df_selected = df_selected[(df_selected['managed_agg_random_ratio']>0) & (df_selected['orig_random_ratio']>0)]
show_examples('child', df)

Speech:  1050067469
Sentence:  a State covering [MASK] under the new title XXI must offer at least the coverage listed under the options specified in section 2103(a ) .
Original object: child
Metaphor object: company
Metaphor score: 13.069551573156595
Other top frames : {'company': 0.49854132533073425, 'authority': 0.06390116363763809, 'corporation': 0.019285229966044426, 'firm': 0.008926511742174625, 'project': 0.003077300963923335}
----------------------------------
Speech:  1070054813
Sentence:  Childrens rights groups estimate that the United States imports more than $ 100 million in goods each year which are produced by bonded and indentured [MASK] .
Original object: child
Metaphor object: labor
Metaphor score: 12.727279619453942
Other top frames : {'labor': 0.6261781454086304, 'capital': 0.0012631171848624945, 'worker': 0.0005102544673718512, 'investment': 0.0002930847695097327, 'business': 7.733350503258407e-05}
----------------------------------
Speech:  1050090997
Sentence:  W