# Merging Checkpoints

As you can see from the scripts included in this project, we ended up batching the comparisons between our keyword utterances ($k \in K$) and our context utterances ($c \in C$). Partially, this was to decrease the noise in the office where the tower is stored while running our tests.

The following scripts are designed to stitch those pieces back together again, largely using the CEDA object/framework to do so.

In [None]:
from CEDA import ceda_model
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import os

In [None]:
# Relative locations of the checkpoint folder, the raw corpus folder and the
# folder that receives the outputs.
CKPT_PATH, RAW_PATH, OUT_PATH = 'data/ckpts', 'data/raw', 'data/results'
# File name of the merged results table written inside OUT_PATH.
OUT_NAME = 'ceda-results.csv'

In [None]:
# Empty accumulator; one frame per checkpoint is collected into it.
df = list()

In [None]:
# A single model object is reused to read every checkpoint in turn.
mod = ceda_model()

# os.listdir() returns entries in arbitrary order; sorting the names makes the
# row order of the merged table reproducible from run to run.
files = [os.path.join(CKPT_PATH, name) for name in sorted(os.listdir(CKPT_PATH))]
for f in tqdm(files):
    mod.load_from_checkpoint(f)
    # Collect this checkpoint's frame (built with residualize=False).
    df.append(mod.graph_df(residualize=False))

In [None]:
# Stack the collected per-checkpoint frames row-wise and renumber the index.
df = pd.concat(objs=df, axis=0, ignore_index=True)
# Preview the first five rows.
df.head(n=5)

Stupidly, I left out some crucial information for ascertaining whether $x$ and $y$ (i.e. $k$ and $c$) are in the same context: the parent comments for $x$ and $y$. To get those, I'm adding in the following script.

In [None]:
# Load the raw corpus; the lookup tables below are all derived from it.
dfc = pd.read_csv(os.path.join(RAW_PATH, 'corpus-localcontext.csv'))
# Keep only the part of each parent id after the last underscore.
dfc['parent_id_'] = [pid.split('_')[-1] for pid in tqdm(dfc['parent_id'].values)]

# conversion to get parent ids from the line number (the corpus row index)
conversion = dfc['parent_id'].to_dict()

# conversion to get when the comment was created from parent comment ids
#  used to get created at time for parent comments.
#  The first row seen for a comment id supplies its timestamp; building the
#  table from de-duplicated rows avoids re-scanning the corpus once per id.
first_rows = dfc.drop_duplicates(subset='comment_id', keep='first')
parent_created_at_conversion = dict(
    zip(first_rows['comment_id'], first_rows['comment_created_at'])
)

# conversion to get comment ups from comment id
#  (if a comment id repeats, its last row wins)
comment_ups_conversion = dict(zip(dfc['comment_id'], dfc['comment_ups']))

# conversion to get all tags associated with a parent_id.
#  Rows without a tag are ignored; the remaining tags are joined with '|' in
#  corpus order, grouped in a single pass instead of one scan per parent id.
tagged_rows = dfc.loc[dfc['tag'].notna()]
parent_tags = (
    tagged_rows.groupby('parent_id_', sort=False)['tag']
    .agg('|'.join)
    .to_dict()
)

In [None]:
# Parent id of every x utterance, looked up by its line number.
df['x_parent_id'] = list(map(conversion.__getitem__, tqdm(df['x_line_no'].values)))

In [None]:
# Parent id of every y utterance, looked up by its line number.
df['y_parent_id'] = list(map(conversion.__getitem__, tqdm(df['y_line_no'].values)))

In [None]:
# Upvote count of every x comment, looked up by comment id.
df['x_comment_ups'] = list(map(comment_ups_conversion.__getitem__, tqdm(df['x_comment_id'].values)))

In [None]:
# Upvote count of every y comment, looked up by comment id.
df['y_comment_ups'] = list(map(comment_ups_conversion.__getitem__, tqdm(df['y_comment_id'].values)))

I also want to create a context label and select a timestamp marking the beginning of each context.

In [None]:
# Keep only the part of each parent id after the last underscore.
df['x_parent_id_'] = [pid.split('_')[-1] for pid in tqdm(df['x_parent_id'].values)]
df['y_parent_id_'] = [pid.split('_')[-1] for pid in tqdm(df['y_parent_id'].values)]

# Rows whose shortened parent id is 'ROOT' fall back to their own comment id.
# The write goes through a single df.loc[rows, column] call: the chained form
# df[column].loc[rows] = ... may update a temporary copy and leaves df
# unchanged when pandas Copy-on-Write is active.
x_is_root = df['x_parent_id_'] == 'ROOT'
df.loc[x_is_root, 'x_parent_id_'] = df.loc[x_is_root, 'x_comment_id']
y_is_root = df['y_parent_id_'] == 'ROOT'
df.loc[y_is_root, 'y_parent_id_'] = df.loc[y_is_root, 'y_comment_id']

# The x context is its (normalised) parent; the y context and the
# same-context flag start out empty/False and are filled in by later cells.
df['x_context_id'] = df['x_parent_id_'].values
df['y_context_id'] = None
df['same_context'] = False

In [None]:
# get children and label context
sel = df['x_comment_id'] == df['y_parent_id_']
df['cc_is_child'] = sel
df['y_context_id'].loc[sel] = df['x_context_id'].loc[sel]
# df['y_tag'].loc[sel] = df['x_tag'].loc[sel]
df['same_context'].loc[sel] = True

In [None]:
# get siblings and label context
sel = df['x_parent_id'] == df['y_parent_id']
df['cc_is_sibling'] = sel
df['y_context_id'].loc[sel] = df['x_context_id'].loc[sel]
# df['y_tag'].loc[sel] = df['x_tag'].loc[sel]
df['same_context'].loc[sel] = True

In [None]:
# get parents and label context
sel = df['y_comment_id'] == df['x_parent_id_']
df['cc_is_parent'] = sel
df['y_context_id'].loc[sel] = df['x_context_id'].loc[sel]
# df['y_tag'].loc[sel] = df['x_tag'].loc[sel]
df['same_context'].loc[sel] = True

In [None]:
# df['x_context_time'] = [parent_created_at_conversion[cid] 
#                         if cid in parent_created_at_conversion.keys() else None 
#                         for cid in tqdm(df['x_context_id'].values)
#                         ] #df['x_context_id'].replace(parent_created_at_conversion)
# 
# df['y_context_time'] = [parent_created_at_conversion[cid] 
#                         if cid in parent_created_at_conversion.keys() else None 
#                         for cid in tqdm(df['y_context_id'].values)
#                         ] #df['x_context_id'].replace(parent_created_at_conversion)

In [None]:
# df['y_context_id'].loc[(~df['y_context_id'].isna() & df['y_context_time'].isna())].value_counts()

In [None]:
# y comments that occur as a child of some x ...
s1 = df['y_comment_id'].loc[df['cc_is_child']].unique()
# ... and y comments that occur as a parent or a sibling of some x.
s2 = df['y_comment_id'].loc[df['cc_is_parent'] | df['cc_is_sibling']].unique()

# Comments that only ever show up as a child.
only_as_child_comments = list(set(s1).difference(s2))

# For each of them, take x_parent_id_ from its first same-context row.
# Filtering once and dropping duplicates replaces a full scan of df per
# comment while picking exactly the same (first) row.
first_hits = df.loc[
    df['same_context'] & df['y_comment_id'].isin(only_as_child_comments),
    ['y_comment_id', 'x_parent_id_'],
].drop_duplicates(subset='y_comment_id', keep='first')
only_as_child_context_ids = dict(
    zip(first_hits['y_comment_id'], first_hits['x_parent_id_'])
)

In [None]:
# Creation time of each context's head comment; None when the context id is
# not a key of the created-at table.
df['x_context_time'] = [
    parent_created_at_conversion.get(context_id)
    for context_id in tqdm(df['x_context_id'].values)
]

df['y_context_time'] = [
    parent_created_at_conversion.get(context_id)
    for context_id in tqdm(df['y_context_id'].values)
]

In [None]:
# Rows whose y comment only ever appears as a child are handled separately.
sel = df['y_comment_id'].isin(only_as_child_comments)

# For every other y comment that has same-context rows, pick the context id
# of the row with the smallest context time. Grouping the same-context rows
# once replaces a full scan of df per comment.
# NOTE(review): argmin does not skip missing times — confirm every
# same-context row has a known y_context_time.
same_rows = df.loc[
    df['same_context'] & ~sel,
    ['y_comment_id', 'y_context_id', 'y_context_time'],
]
all_other_y_contexts = dict()
for comment, rows in same_rows.groupby('y_comment_id', sort=False):
    responses = rows[['y_context_id', 'y_context_time']].values
    all_other_y_contexts[comment] = responses[:, 0][responses[:, 1].argmin()]

# Spread that context id to the comment's rows that are not same-context.
# Writes use one df.loc[rows, column] call; the chained df[column].loc[rows]
# form is not guaranteed to modify df (no-op under pandas Copy-on-Write).
rest = ~sel & ~df['same_context']
df.loc[rest, 'y_context_id'] = [
    all_other_y_contexts[comment]
    for comment in tqdm(df.loc[rest, 'y_comment_id'].values)
]

# Only-as-child comments take the context id recorded for them earlier.
df.loc[sel, 'y_context_id'] = [
    only_as_child_context_ids[comment]
    for comment in tqdm(df.loc[sel, 'y_comment_id'].values)
]

In [None]:
# sel = df.loc[df['same_context']]
# context_conversion = {yid: sel[['y_context_id', 'y_context_time', 'x_tag']].loc[sel['y_parent_id_'].isin([yid])].values for yid in sel['y_parent_id_'].unique()}

In [None]:
# # comparisons across contexts
# sel = ~df['y_context_id'].isna()
# for cid in tqdm(df['y_comment_id'].loc[sel].unique()):
#     sub = df.loc[sel & df['y_comment_id'].isin([cid])]
#     min_ = sub['y_context_time'].min()
#     earliest_head = sub['y_context_id'].loc[sub['y_context_time']==min_].values
#     df['y_context_id'].loc[~sel & df['y_comment_id'].isin([cid])] = earliest_head[0]

In [None]:
# Fraction of rows whose y context id also occurs as an x context id.
df['y_context_id'].isin(df['x_context_id']).mean()

In [None]:
# everything else:
sel = df['y_context_id'].isna()
print(sel.sum())
# df['y_context_id'].loc[sel] = df['y_parent_id_'].loc[sel]

In [None]:
# Recompute both context-time columns from the current context ids; ids that
# are missing from the created-at table map to None.
lookup_created_at = parent_created_at_conversion.get
df['x_context_time'] = list(map(lookup_created_at, tqdm(df['x_context_id'].values)))

df['y_context_time'] = list(map(lookup_created_at, tqdm(df['y_context_id'].values)))

Adding the context time for all the y_contexts, one last time . . . 

In [None]:
# sel = df['y_context_time'].isna()
# df['y_context_time'].loc[sel] = [parent_created_at_conversion[cid] 
#                         if cid in parent_created_at_conversion.keys() else None 
#                         for cid in tqdm(df['y_context_id'].loc[sel].values)
#                         ] #df['x_context_id'].replace(parent_created_at_conversion)

In [None]:
# For every x context id, the distinct x tags seen with it, joined by '|' in
# order of first appearance. One groupby pass replaces a full scan of df per
# context id.
possible_y_tags = (
    df.groupby('x_context_id', sort=False)['x_tag']
    .agg(lambda tags: '|'.join(tags.unique()))
    .to_dict()
)

# A y row inherits the tag string of its context; None when its context id
# never occurs as an x context id.
df['y_tag'] = [
    possible_y_tags.get(ycid)
    for ycid in tqdm(df['y_context_id'].values)
]

And some last checks.

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Among rows without a y tag, tally the same-context flag.
df.loc[df['y_tag'].isna(), 'same_context'].value_counts()

Just in case, I also want to note when the $x$ and $y$ authors are the same.

In [None]:
# Remove the two underscore-suffixed parent-id helper columns in place.
df.drop(columns=['x_parent_id_', 'y_parent_id_'], inplace=True)

In [None]:
# Flag rows where the x and y utterances share the same user.
df['same_author'] = df['x_user'].eq(df['y_user'])

In [None]:
# Tally of same-author vs. different-author rows, most frequent first.
df['same_author'].value_counts(sort=True, ascending=False)

Let's also take a moment now to anonymize some of the data (and save our anonymization key locally).

In [None]:
# Each inner list is a pair of columns that shares one anonymization key.
# NOTE(review): x_parent_id, y_parent_id, x_context_id and y_context_id are
# derived from comment ids and are not remapped here — confirm whether they
# should be anonymized as well.
anonymize_columns = [['x_user', 'y_user'], ['x_comment_id', 'y_comment_id'], ['x_submission_id', 'y_submission_id']]

# Make sure the output folder exists before the key files are written.
os.makedirs(OUT_PATH, exist_ok=True)

for cols in anonymize_columns:
    # Distinct values across both columns, shuffled so that the assigned
    # numbers do not reflect the sorted order of the originals.
    values = np.unique(df[cols].values)
    values = np.random.choice(values, size=(len(values),), replace=False)

    # Original value -> 1-based integer code. Kept under its own name so the
    # line-number lookup called `conversion` is not overwritten.
    anon_key = {val: i + 1 for i, val in enumerate(values)}

    # save conversion dictionary (named after the column without its x_/y_
    # prefix); the context manager closes the file even if writing fails
    key_name = cols[0].replace('x_', '').replace('y_', '') + '.json'
    with open(os.path.join(OUT_PATH, key_name), 'w') as key_file:
        key_file.write(json.dumps(anon_key, indent=4))

    # anonymize the column
    for col in cols:
        print(col)
        df[col] = [anon_key[val] for val in tqdm(df[col].values)]

Finishing this, let's save the data.

In [None]:
# Write the final table as UTF-8 CSV, without the row index.
df.to_csv(
    path_or_buf=os.path.join(OUT_PATH, OUT_NAME),
    encoding='utf-8',
    index=False,
)

In [None]:
# (number of rows, number of columns) of the final table.
(len(df.index), len(df.columns))

In [None]:
# How often each y tag string occurs (missing tags are not counted).
df['y_tag'].value_counts(dropna=True)