**See `workdir/experiment_configs` for the sample configuration files used in the experiments.**

In [None]:
import os
import json


def generate_from_template(template, result_dir, version=1):
    """Build a named experiment configuration from a template dict.

    The experiment name is the ``-``-joined sequence of: the initials of
    the dash-separated parts of ``template['model_type']``, ``ntok<n_tokens>``,
    ``nunits<n_units>``, ``mask``/``nomask`` (from ``mask_token``),
    ``k<n_top>`` and, only when the template has a ``max_multiunit`` key,
    ``mm<max_multiunit>``.

    Args:
        template: Mapping with the keys ``model_type``, ``mask_token``,
            ``n_tokens``, ``n_units`` and ``n_top``; ``max_multiunit`` is
            optional. The mapping is not modified.
        result_dir: Directory under which the experiment's own result
            directory (named after the experiment) is placed.
        version: Currently unused; it is accepted for interface
            compatibility and does not appear in the generated name.

    Returns:
        A dict ``{'name': name, 'args': args}`` where ``args`` is a shallow
        copy of ``template`` with ``result_dir`` set to
        ``os.path.join(result_dir, name)``.

    Raises:
        KeyError: If a required key is missing from ``template``.
    """
    model_type = ''.join(part[0] for part in template['model_type'].split('-'))
    mask_type = 'mask' if template['mask_token'] else 'nomask'

    name_parts = [
        model_type,
        f'ntok{template["n_tokens"]}',
        f'nunits{template["n_units"]}',
        mask_type,
        f'k{template["n_top"]}',
    ]
    if 'max_multiunit' in template:
        name_parts.append(f'mm{template["max_multiunit"]}')
    name = '-'.join(name_parts)

    # Copy so that the caller's template is left untouched and can be reused.
    args = dict(template)
    args['result_dir'] = os.path.join(result_dir, name)
    return {'name': name, 'args': args}

### BERT 
with and without patterns/masking

In [None]:
# Every generated experiment gets its own sub-directory below this folder.
result_dir = 'workdir/results/'

# The four BERT runs differ only in the dataset and in whether the mask
# token is used; all remaining settings are shared.
experiment_templates = [
    {
        "model_type": "bert-large-cased",
        "dataset": dataset_path,
        "proc_column": "masked_sent",
        "n_tokens": 1,
        "n_units": 1,
        "mask_token": use_mask,
        "n_top": 200,
        "batch_size": 12,
    }
    for dataset_path, use_mask in (
        ("workdir/data/swv_T.pkl", False),
        ("workdir/data/swv_T.pkl", True),
        ("workdir/data/swv_TandT.pkl", True),
        ("workdir/data/swv_TandT.pkl", False),
    )
]

# Turn each template into a named experiment configuration.
experiments = []
for template in experiment_templates:
    experiments.append(generate_from_template(template, result_dir, version=1))

# Serialise once, then both store the configuration and show it.
serialized = json.dumps(experiments, indent=4)
with open('workdir/experiment_configs/sample_verbs_st.json', 'w') as f:
    f.write(serialized)

print(serialized)

## dsm baselines

- Experiment configurations can also be defined without using `generate_from_template`, as follows. This is easier for distributional thesaurus (dt) and static embedding (dsm) experiments.

In [None]:
# Baseline configurations: each model is run once with and once without
# lemmatisation; everything else is identical across the runs.
experiments = [
    {
        "name": f"{prefix}_{suffix}",
        "args": {
            "model_type": model_spec,
            "dataset": "workdir/data/swv_T.pkl",
            "proc_column": "masked_sent",
            "n_jobs": 1,
            "n_top": 200,
            "do_lemmatize": lemmatize,
            "batch_size": 100000000000000,
            "result_dir": f"workdir/results/paper_verbs_st/{prefix}_{suffix}",
        },
    }
    for prefix, model_spec in (
        ("dt_wiki", "dt+workdir/dt_wiki.csv.gz"),
        ("fasttext_cc", "fasttext+workdir/dsm/fasttext_cc/cc.en.300.bin"),
        (
            "word2vec_googlenews",
            "dsm+workdir/dsm/word2vec/GoogleNews-vectors-negative300.bin.gz",
        ),
    )
    for suffix, lemmatize in (("lem", True), ("nolem", False))
]


In [None]:
# Serialise the baseline configurations once, then store and display them.
serialized = json.dumps(experiments, indent=4)
with open('workdir/experiment_configs/sample_baselines_verbs_st.json', 'w') as f:
    f.write(serialized)

print(serialized)