# Merging Checkpoints

As you can see from the scripts included in this project, we ended up batching the comparisons between our keyword utterances ($k \in K$) and our context utterances ($c \in C$). In part, this was to reduce the noise in the office where the tower is kept while our tests were running.

The following scripts are designed to stitch those pieces back together again, largely using the CEDA object/framework to do so.

In [None]:
from shared.CEDA import ceda_model
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import os

In [None]:
# Input locations.
CKPT_PATH = "data/ckpts"  # directory scanned for the batched `.pt` checkpoints
RAW_PATH = "data/data"  # presumably the raw data directory; unused in the cells shown here
META_DATA_PATH = "data/meta_data"  # directory scanned for the metadata `.csv` files

# Output location: directory for the results and anonymization keys, plus the
# file name of the merged results table.
OUT_PATH = "data/results"
OUT_NAME = "ceda-results.csv"

In [None]:
# Accumulator for the per-checkpoint frames; the loading loop appends to it and
# the list is later concatenated into a single DataFrame under the same name.
df = []

In [None]:
mod = ceda_model()

# os.listdir returns entries in arbitrary order, so the checkpoint paths are
# sorted to keep the row order of the merged table reproducible between runs.
# Names starting with '._' are skipped (presumably macOS sidecar files).
files = sorted(
    os.path.join(CKPT_PATH, f)
    for f in os.listdir(CKPT_PATH)
    if not f.startswith('._') and f.endswith('.pt')
)
for f in tqdm(files):
    # the same model object is reused; each checkpoint replaces its state
    mod.load_from_checkpoint(f)
    frame = mod.graph_df(residualize=False)

    # dyad id = base name of the first row's source file, minus '.docx'
    # (assumes every row of one checkpoint comes from the same file -- TODO confirm)
    frame['dyad'] = frame['file'].values[0].split('/')[-1].replace('.docx', '')

    # label transition number in conversation
    # (1-based; assumes graph_df returns a 0-based integer index -- TODO confirm)
    frame['transition'] = frame.index.values + 1

    # label speaker per conversation (for enumerated labeling/further anonymization)
    frame['speaker'] = frame['dyad'] + '-' + frame['speaker']
    frame['speaker2'] = frame['dyad'] + '-' + frame['speaker2']

    df.append(frame)

In [None]:
# Stack the per-checkpoint frames row-wise and renumber the rows 0..n-1.
df = pd.concat(objs=df, axis=0, ignore_index=True)
df.head()

Stupidly, I left out some crucial information for ascertaining whether $x$ and $y$ (i.e. $k$ and $c$) are in the same context: the parent comments for $x$ and $y$. To get those, I'm adding in the following script.

In [None]:
# Read every metadata CSV in META_DATA_PATH and stack them into one table.
# os.listdir returns names in arbitrary order, so they are sorted to keep the
# row order of `dfc` reproducible between runs.
# Names starting with '._' are skipped (presumably macOS sidecar files).
meta_files = sorted(
    f for f in os.listdir(META_DATA_PATH)
    if not f.startswith('._') and f.endswith('.csv')
)
dfc = pd.concat(
    [pd.read_csv(os.path.join(META_DATA_PATH, f)) for f in meta_files],
    ignore_index=True,
)
dfc.head()

Now let's merge the data with the metadata.

In [None]:
# Cast the metadata key to str so it can be matched against the string-valued
# `dyad` column when the two tables are merged.
dyad_keys = dfc['Dyads'].astype(str)
dfc['Dyads'] = dyad_keys

In [None]:
# Left join: keep every row of `df` and attach the metadata row whose 'Dyads'
# value equals the row's 'dyad'.
df = df.merge(dfc, how='left', left_on='dyad', right_on='Dyads')

# 'Dyads' was only needed as the join key, so drop it from the result.
df = df.drop(columns='Dyads')

In [None]:
# Preview the first rows of the merged table.
df.head()

Let's also take a moment now to anonymize some of the data (and save our anonymization key locally).

In [None]:
# Each inner list is a group of columns sharing one id space, so a label gets
# the same number whether it appears in 'speaker' or in 'speaker2'.
anonymize_columns = [['speaker', 'speaker2'], ['dyad']]

# Make sure the output directory exists before the keys are written into it;
# otherwise open() below fails with FileNotFoundError.
os.makedirs(OUT_PATH, exist_ok=True)

for cols in anonymize_columns:
    # np.unique returns the sorted distinct labels; drawing all of them without
    # replacement shuffles them, so the ids do not follow the sorted order.
    values = np.unique(df[cols].values)
    values = np.random.choice(values, size=(len(values),), replace=False)

    # ids are 1-based positions in the shuffled order
    conversion = {val: i + 1 for i, val in enumerate(values)}

    # save conversion dictionary, named after the first column of the group;
    # `with` closes the file even if serialising or writing raises
    key_path = os.path.join(
        OUT_PATH,
        cols[0].replace('x_', '').replace('y_', '') + '.json',
    )
    with open(key_path, 'w') as key_file:
        key_file.write(json.dumps(conversion, indent=4))

    # anonymize the column
    for col in cols:
        print(col)
        df[col] = [conversion[val] for val in tqdm(df[col].values)]

Finishing this, let's save the data.

In [None]:
# Show the (rows, columns) size of the table about to be saved.
df.shape

In [None]:
# Write the merged, anonymized table as a UTF-8 CSV, leaving out the row index.
out_file = os.path.join(OUT_PATH, OUT_NAME)
df.to_csv(out_file, index=False, encoding='utf-8')