# Make my annotations (narrations.csv and groups.csv) from EgoClip

In [95]:
import os
import pandas as pd
import json
from dataset import SRLPredictor
import numpy as np
import ast


In [96]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## srl

In [None]:
import pandas as pd
# Load the EgoClip annotation table. The file is tab-separated; with
# on_bad_lines='warn' pandas skips malformed lines and emits a warning instead of failing.
all_narration_df = pd.read_csv("../dataset/egoclip.csv", sep='\t', on_bad_lines='warn')
all_narration_df

In [20]:
# use a subset as example
# .copy() detaches the subset from the full frame, so assigning columns on it
# later does not trigger pandas' SettingWithCopyWarning.
all_narration_df = all_narration_df.iloc[:1000, :].copy()

In [None]:
all_narration_df.head()

In [22]:
# count the distinct videos present in the current narration table
all_narration_df['video_uid'].nunique(dropna=False)

4

In [23]:
# narration column name
# Column of all_narration_df that holds the narration text used by every cell below.
narration_column_name = 'clip_text' # alternative column name kept for reference: 'narration_text'

In [24]:
# process narration text
# 1) Strip the leading annotator tag (#C / #O). Only rows whose text still starts
#    with '#' are stripped, so re-running this cell does not eat the first real word.
narration_text = all_narration_df[narration_column_name]
has_leading_tag = narration_text.str.startswith('#', na=False)
stripped_text = narration_text.str.split(' ', n=1).str[1]
all_narration_df = all_narration_df.assign(
    **{narration_column_name: narration_text.where(~has_leading_tag, stripped_text)}
)

# 2) Drop rows with no text left (missing, or nothing after the tag) and rows marked
#    '#unsure'. na=False keeps the mask strictly boolean; `~` is the boolean NOT.
has_text = all_narration_df[narration_column_name].notna()
is_unsure = all_narration_df[narration_column_name].str.contains('#unsure', na=False, regex=False)

# 3) Reset the index. The SRL results are keyed by the position of each sentence in
#    the list passed to the predictor, and are later concatenated column-wise with
#    this frame, which aligns on index labels; labels must therefore equal positions.
all_narration_df = all_narration_df[has_text & ~is_unsure].reset_index(drop=True)

In [None]:
all_narration_df

In [26]:
# srl_results = srl(['a cat eats food', 'a dog eats food'])
# Run semantic role labeling on every remaining narration.
# NOTE(review): 'cpu' is presumably the device the model runs on — confirm in dataset.SRLPredictor.
slrpredictor = SRLPredictor('cpu')
# The texts are passed in dataframe row order; the parsing loop further down assumes
# predict() returns one result per input, in the same order — TODO confirm.
srl_results = slrpredictor.predict(all_narration_df[narration_column_name].tolist())

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# list of dict
len(srl_results), srl_results

In [32]:
# parse the SRL results into a dataframe: one column per SRL tag, one row per sentence,
# each cell holding the list of words that carry that tag
srl_df = {}
dropped_sentence_no = []
for sentence_no, srl_result in enumerate(srl_results):
    verb_frames = srl_result['verbs']
    if verb_frames == []:
        # no verb found: remember the sentence position and move on
        dropped_sentence_no.append(sentence_no)
        continue

    # only the first verb frame is used; its tags run parallel to the sentence words,
    # e.g. tags B-ARG0, B-V, B-ARG1, I-ARG1 for a four-word sentence
    first_frame_tags = verb_frames[0]['tags']
    for srl_full_tag, srl_word in zip(first_frame_tags, srl_result['words']):
        # keep only the part after the last '-' (B-ARG0 -> ARG0, O -> O)
        srl_tag = srl_full_tag.split('-')[-1]
        srl_df.setdefault(srl_tag, {}).setdefault(sentence_no, []).append(srl_word)

srl_df = pd.DataFrame(srl_df)


In [None]:
# dataframe
srl_df

In [45]:
# fraction of narrations left after applying SRL. Some narrations are not parsed because the SRL method returns no verb for them
len(srl_df) / len(all_narration_df)

0.985

In [37]:
# share of narrations for which SRL returned no verb frame
len(dropped_sentence_no) / len(all_narration_df)

0.015

In [None]:
# non-SRL-ed narrations
all_narration_df.iloc[dropped_sentence_no, :]

In [None]:
# concat narration and SRL
# NOTE(review): concat(axis=1) aligns rows by index label, while srl_df is indexed by the
# position of each sentence in the list given to the SRL predictor. The two only line up
# when all_narration_df carries a 0..n-1 index at this point — confirm, since rows were
# filtered out of it earlier.
complete_srl_df = pd.concat([all_narration_df, srl_df], axis=1)
complete_srl_df

In [None]:
# Keep only columns that are filled for at least `threshold` of the SRL-parsed narrations,
# then keep only rows that have a value in every remaining column.
threshold = 0.9
# dropna(thresh=...) is documented to take an int (minimum number of non-NA values).
# Rounding up keeps exactly the same columns as comparing against the float product.
min_non_na = int(np.ceil(len(srl_df) * threshold))
clean_srl_df = complete_srl_df.dropna(axis=1, thresh=min_non_na).dropna(axis=0)
clean_srl_df

In [44]:
# see how many narrations are left who have at least SVO format
len(clean_srl_df) / len(all_narration_df)

0.893

In [None]:
# reset index
save_srl_df = clean_srl_df.reset_index(drop=True)
save_srl_df

In [48]:
# Persist the cleaned narration + SRL table.
# NOTE: the SRL columns hold Python lists at this point; CSV stores them as their text form.
srl_path = '../dataset/egoclip_srl.csv'
save_srl_df.to_csv(srl_path)

## group narrations based on taxonomy

In [49]:
# load the verb and noun taxonomies
def _read_taxonomy(csv_path):
    """Read one taxonomy CSV and parse its 'group' column with ast.literal_eval."""
    taxonomy = pd.read_csv(csv_path)
    taxonomy['group'] = taxonomy['group'].apply(ast.literal_eval)
    return taxonomy

taxonomy_verb_path = '/z/dat/Ego4D/raw/v2/annotations/narration_verb_taxonomy.csv'
taxonomy_verb_df = _read_taxonomy(taxonomy_verb_path)

taxonomy_noun_path = "/z/dat/Ego4D/raw/v2/annotations/narration_noun_taxonomy.csv"
taxonomy_noun_df = _read_taxonomy(taxonomy_noun_path)

In [None]:
taxonomy_verb_df

In [None]:
save_srl_df

In [70]:
# make tagged verb and noun into list, and SRL chunks into one string
# Both helpers leave already-converted values untouched, so this cell can be re-run
# safely. Without the guards a second run raises in ast.literal_eval, and ' '.join over
# an already joined string would insert a space between every character.
def _parse_list_literal(value):
    """Parse `value` with ast.literal_eval if it is still a string; otherwise return it unchanged."""
    return ast.literal_eval(value) if isinstance(value, str) else value

def _join_tokens(tokens):
    """Join a list of word tokens with single spaces; pass an already joined string through."""
    return tokens if isinstance(tokens, str) else ' '.join(tokens)

for list_column in ('tag_verb', 'tag_noun'):
    save_srl_df[list_column] = save_srl_df[list_column].apply(_parse_list_literal)

# make SRL chunks into one string
for srl_column in ('ARG0', 'V', 'ARG1'):
    save_srl_df[srl_column] = save_srl_df[srl_column].apply(_join_tokens)

In [None]:
save_srl_df

In [None]:
# look at two verb taxonomy rows, selected by position
taxonomy_verb_df.iloc[[17, 68]]

In [None]:
# Apply dataset.grouping to every narration row, handing it both taxonomies;
# the per-row result is stored in a one-column frame named 'valid_tag_noun'
from dataset import grouping
grouping_results = save_srl_df.apply(
    grouping,
    axis=1,
    args=(taxonomy_verb_df, taxonomy_noun_df),
)
grouping_results_df = grouping_results.to_frame(name='valid_tag_noun')
grouping_results_df

In [None]:
# concatenate the grouping results with the other columns (column-wise, aligned on the row index)
grouped_narration_df = pd.concat([save_srl_df, grouping_results_df], axis=1)
grouped_narration_df

In [131]:
# post process: drop every row that still contains a missing value
clean_narration_df = grouped_narration_df.dropna()
# fraction kept, relative to (grouped narrations, all narrations)
n_clean = len(clean_narration_df)
n_clean / len(grouped_narration_df), n_clean / len(all_narration_df)

(1.0, 0.893)

In [132]:
# save as narrations.csv (the row index is written as the first, unnamed column)
clean_narration_df.to_csv('../dataset/egoclip_narrations.csv')

## make exploded narrations.csv

In [None]:
# Reload the saved narrations; index_col=0 uses the first CSV column as the row index.
# Columns read back from CSV hold text, which is why the tag columns are parsed again below.
clean_narration_df = pd.read_csv('../dataset/egoclip_narrations.csv', index_col=0)
clean_narration_df

In [21]:
# convert the tag columns from their text form back into Python lists
for list_column in ('valid_tag_noun', 'tag_verb'):
    clean_narration_df[list_column] = clean_narration_df[list_column].apply(ast.literal_eval)

In [22]:
# one row per (narration, verb tag) and one per (narration, noun tag);
# reset_index() moves the narration's row label into a regular 'index' column
expanded_verb = clean_narration_df[['tag_verb']].explode('tag_verb').reset_index()
expanded_noun = clean_narration_df[['valid_tag_noun']].explode('valid_tag_noun').reset_index()

In [None]:
expanded_noun

In [None]:
expanded_verb

In [None]:
# pair every verb tag of a narration with every noun tag of the same narration
vn_merged = expanded_verb.merge(expanded_noun, on='index', how='outer')
vn_merged

In [None]:
# attach each (verb, noun) pair back to its narration row, then drop the helper key column
exploed_narration_df = (
    clean_narration_df
    .merge(vn_merged, left_index=True, right_on='index')
    .drop(columns='index')
)
exploed_narration_df

In [None]:
# explode the verb and noun elements so that each row belongs to one group:
# the *_x columns (left side of the merge) keep the original lists, the *_y columns
# (right side) hold the single exploded verb / noun of the row
column_renames = {
    "tag_verb_x": "tag_verb_original",
    "valid_tag_noun_x": "valid_tag_noun_original",
    "tag_verb_y": "tag_verb_single",
    "valid_tag_noun_y": "valid_tag_noun_single",
}
exploed_narration_df = exploed_narration_df.rename(columns=column_renames)
exploed_narration_df

In [28]:
# save exploded narrations (the row index is written as the first, unnamed column)
exploed_narration_df.to_csv('../dataset/egoclip_narrations_exploed.csv')

## make groups.csv

In [None]:
# make groups.csv
# One row per (verb tag, noun tag) pair; the third column lists the index labels of the
# exploded narration rows that fall into that pair.
group_df = exploed_narration_df.groupby(['tag_verb_single', 'valid_tag_noun_single']).apply(lambda x: x.index.tolist()).reset_index()
# rename the three columns produced above (the two group keys and the collected index list)
group_df.columns = ['tag_verb', 'tag_noun', 'narration_indices']
group_df

In [None]:
exploed_narration_df

In [None]:
def get_indices_for_diff_noun(row, narration_df):
    """Return index labels of narrations that share the group's verb but have a different noun.

    row: one group row; row['tag_verb'] and row['tag_noun'] are read.
    narration_df: exploded narration table with the columns
        'tag_verb_single' and 'valid_tag_noun_single'.
    Returns a list of index labels of `narration_df`, in table order.
    """
    group_verb, group_noun = row['tag_verb'], row['tag_noun']

    has_group_verb = narration_df['tag_verb_single'] == group_verb
    has_other_noun = narration_df['valid_tag_noun_single'] != group_noun

    return narration_df.loc[has_group_verb & has_other_noun].index.tolist()

# For every group row, store the index labels of exploded narrations with the same verb tag but a different noun tag.
group_df['mismatch_noun'] = group_df.apply(get_indices_for_diff_noun, args=(exploed_narration_df,), axis=1)

group_df

In [None]:
def get_indices_for_diff_verb(row, narration_df):
    """Return index labels of narrations that share the group's noun but have a different verb.

    row: one group row; row['tag_noun'] and row['tag_verb'] are read.
    narration_df: exploded narration table with the columns
        'valid_tag_noun_single' and 'tag_verb_single'.
    Returns a list of index labels of `narration_df`, in table order.
    """
    group_noun, group_verb = row['tag_noun'], row['tag_verb']

    has_group_noun = narration_df['valid_tag_noun_single'] == group_noun
    has_other_verb = narration_df['tag_verb_single'] != group_verb

    return narration_df.loc[has_group_noun & has_other_verb].index.tolist()

# For every group row, store the index labels of exploded narrations with the same noun tag but a different verb tag.
group_df['mismatch_verb'] = group_df.apply(get_indices_for_diff_verb, args=(exploed_narration_df,), axis=1)

group_df

In [None]:
# make mismatch_verb_noun column
def find_complement_set(sublist, corpora):
    """Return the elements of `corpora` that do not occur in `sublist`.

    sublist: collection of elements to exclude.
    corpora: iterable of candidate elements.
    Returns a list that keeps the order (and any duplicates) of `corpora`.
    """
    try:
        # Set membership is O(1), so the whole pass is linear instead of
        # len(corpora) * len(sublist) comparisons.
        excluded = set(sublist)
        return [el for el in corpora if el not in excluded]
    except TypeError:
        # Unhashable elements cannot go through a set: use plain membership tests.
        return [el for el in corpora if el not in sublist]
# For every group, store all exploded-narration index labels that are not in the group's own narration_indices.
# NOTE(review): this is every row outside the group, so it also contains the rows already
# listed in mismatch_noun / mismatch_verb — confirm that this is intended.
group_df['mismatch_verb_noun'] = group_df['narration_indices'].apply(find_complement_set, args=(exploed_narration_df.index.tolist(),))
group_df

In [180]:
# save groups.csv. NOTE: the index-list columns are written in their text form.
group_df.to_csv('../dataset/egoclip_groups.csv')

## substantiate

In [29]:
# read narrations.csv and groups.csv
# index_col=0 uses the first CSV column (the saved row index) as the index again.
# The list columns of group_df come back as text; they are parsed with ast.literal_eval before use below.
narration_df = pd.read_csv('../dataset/egoclip_narrations_exploed.csv', index_col=0)
group_df = pd.read_csv('../dataset/egoclip_groups.csv', index_col=0)

In [None]:
group_df

In [None]:
narration_df

In [67]:
# random sample n groups
# No random_state is passed, so a different group is drawn on every run.
sampled_groups = group_df.sample(n=1)
sampled_groups

Unnamed: 0,tag_verb,tag_noun,narration_indices,mismatch_noun,mismatch_verb,mismatch_verb_noun
34,11,49,"[947, 1024, 1040]","[17, 18, 33, 35, 36, 38, 39, 40, 57, 61, 64, 6...","[205, 206, 208, 209, 210, 211, 232, 233, 236, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."


In [None]:

# make a dataframe to show group verb and noun labels (canonical), and show k narrations, k verb-mismatched, k noun-mismatched and k vn-mismatched narration rows in the group
k = 2

def _sample_group_members(column_name, n_samples):
    """Sample up to `n_samples` entries from a list column of `sampled_groups`.

    The column holds lists in text form; they are parsed, exploded to one entry per
    row (the group's row label ends up in the 'index' column) and then sampled.
    """
    members = sampled_groups[column_name].apply(ast.literal_eval).explode().reset_index()
    # A group may have fewer than n_samples entries; sample(n=...) raises when asked for
    # more rows than exist, so the request is capped at the number available.
    return members.sample(n=min(n_samples, len(members)))

expanded_narr = _sample_group_members('narration_indices', k)
expanded_mis_v = _sample_group_members('mismatch_verb', k)
expanded_mis_n = _sample_group_members('mismatch_noun', k)
expanded_mis_vn = _sample_group_members('mismatch_verb_noun', k)

from functools import reduce
# NOTE(review): every frame repeats the same group label in 'index', so the outer merges
# yield all combinations of the sampled entries (up to k**4 rows) — confirm this is wanted.
indices_samples_df = reduce(
    lambda left, right: pd.merge(left, right, on='index', how='outer'),
    [expanded_narr, expanded_mis_v, expanded_mis_n, expanded_mis_vn],
)
indices_samples_df


In [None]:
# join the sampled entries onto their group row; the group's list columns get the
# suffix _x and the sampled single entries the suffix _y
sampled_groups_info_df = sampled_groups.merge(indices_samples_df, left_index=True, right_on='index')
sampled_groups_info_df

In [None]:
# get narration text: for each sampled index column, look up the narration's clip_text
# and store it in a sibling column named '<column>_info'
for sampled_column in ('narration_indices_y', 'mismatch_verb_y', 'mismatch_noun_y', 'mismatch_verb_noun_y'):
    sampled_groups_info_df[sampled_column + '_info'] = sampled_groups_info_df[sampled_column].apply(
        lambda idx: narration_df.loc[idx, 'clip_text']
    )
sampled_groups_info_df

In [None]:
# group text: translate the verb / noun tags of each group into their taxonomy labels
taxonomy_verb_df = pd.read_csv("/z/dat/Ego4D/raw/v2/annotations/narration_verb_taxonomy.csv")
taxonomy_noun_df = pd.read_csv("/z/dat/Ego4D/raw/v2/annotations/narration_noun_taxonomy.csv")

# the tag value is used as a row position in the taxonomy table
verb_labels = taxonomy_verb_df['label']
noun_labels = taxonomy_noun_df['label']
sampled_groups_info_df['tag_verb_info'] = sampled_groups_info_df['tag_verb'].apply(lambda tag: verb_labels.iloc[tag])
sampled_groups_info_df['tag_noun_info'] = sampled_groups_info_df['tag_noun'].apply(lambda tag: noun_labels.iloc[tag])
sampled_groups_info_df

In [None]:
# clean up
# Show the readable label / text columns first, followed by the raw tags and indices.
# max_colwidth=None only applies inside the `with` block, so the narration text is not truncated here.
with pd.option_context('display.max_colwidth', None): # show full text
    clean_sampled_groups_info_df = sampled_groups_info_df[['tag_verb_info', 'tag_noun_info', 'narration_indices_y_info', 'mismatch_verb_y_info', 'mismatch_noun_y_info', 'mismatch_verb_noun_y_info', 
                                                           'tag_verb', 'tag_noun', 'index', 'narration_indices_y', 'mismatch_verb_y', 'mismatch_noun_y', 'mismatch_verb_noun_y']]
    display(clean_sampled_groups_info_df)