In [1]:
import nlcc
import glob
import time
import pandas as pd
import yaml
import seaborn as sns
import matplotlib.pyplot as plt
# Hex colour codes (without the leading '#') and their matplotlib-ready form.
base_colors = ["e63946","f1faee","a8dadc","457b9d","1d3557"]
colors = ['#' + c for c in base_colors]
# Configure seaborn in a single call. `sns.set` re-applies a complete theme,
# so a style chosen earlier with `sns.set_style` would be overwritten by the
# default style; the "ticks" style is therefore requested here directly and
# the `rc` entries below are applied on top of it.
sns.set(style="ticks",
        rc={'axes.facecolor':'#f5f4e9',
            'grid.color' : '#AAAAAA',
            'axes.edgecolor':'#333333',
            'figure.facecolor':'#FFFFFF',
            'axes.grid': False,
            'axes.prop_cycle':   plt.cycler('color', plt.cm.Dark2.colors),
            'font.family': 'monospace'
           })

## Load Prompts

In [2]:
# Tags looked for in each prompt file's `categories` entry. A prompt yields
# one table row per matching (topic, code, language) combination.
topic_categories = ['md', 'spectroscopy', 'bio', 'qm', 'sim', 'cheminf', 'genchem', 'thermo', 'stats', 'plot']
code_categories = ['code', 'human']
# Natural-language tags; prompts matching none of these are labelled 'en'.
lang_categories = ['ch']
# Fields of the prompt file that are kept in the table.
keys = ['name', 'language', 'context']

In [3]:
# Column-oriented table (column name -> list of values); stays None until the
# first row has been inserted.
data = None
# Prompt files that matched at least one topic category.
valid_files = list()


def insert_row(r, data):
    """Return a new column-oriented table with row ``r`` appended.

    Parameters
    ----------
    r : dict
        Mapping of column name to the value for the new row.
    data : dict or None
        Existing table (column name -> list of values), or ``None`` to
        start a fresh table whose columns are the keys of ``r``.

    Returns
    -------
    dict
        A new dict of new lists; ``data`` itself is not modified. Only the
        columns already present in ``data`` are kept.
    """
    if data is not None:
        return {column: values + [r[column]] for column, values in data.items()}
    return {column: [value] for column, value in r.items()}
def _category_tags(raw):
    """Split a prompt's ``categories`` entry into a set of individual tags.

    Accepts a string of tags separated by commas and/or whitespace, an
    already-parsed sequence of tags, or ``None`` (treated as no tags).
    """
    if raw is None:
        return set()
    if isinstance(raw, str):
        raw = raw.replace(',', ' ').split()
    return {str(tag).strip() for tag in raw}


# Sorted so the row order and the benchmark file order are reproducible.
for fn in sorted(glob.glob('../data/**/*.yml')):
    with open(fn, 'r', encoding='utf-8') as f:
        d = yaml.safe_load(f)
    # Match whole tags only. A plain `in` test on a categories *string* is a
    # substring test, so the language tag 'ch' would also match 'cheminf'
    # and 'genchem'.
    tags = _category_tags(d['categories'])
    # slice
    base = {k: d[k] for k in keys}
    # duplicate for categories
    tcs = [c for c in topic_categories if c in tags]
    ccs = [c for c in code_categories if c in tags]
    lcs = [c for c in lang_categories if c in tags]
    if len(lcs) == 0:
        lcs = ['en']
    if len(tcs) > 0:
        valid_files.append(fn)
    for tc in tcs:
        for cc in ccs:
            for lc in lcs:
                row = {**base, 'topic': tc, 'type': cc, 'natlang': lc}
                data = insert_row(row, data)

df = pd.DataFrame.from_dict(data)

In [4]:
# Row counts per topic; a prompt contributes one row for every
# (topic, type, natlang) combination it was tagged with.
df.groupby('topic').count()

Unnamed: 0_level_0,name,language,context,type,natlang
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bio,11,11,11,11,11
cheminf,10,10,10,10,10
genchem,11,11,11,11,11
md,10,10,10,10,10
plot,10,10,10,10,10
qm,6,6,6,6,6
sim,4,4,4,4,4
spectroscopy,4,4,4,4,4
stats,10,10,10,10,10
thermo,2,2,2,2,2


## Programmatic Prompts

In [None]:
# Run nlcc-bench over every valid prompt file, once per temperature and per
# prompt variant. Each run writes its own CSV, named by variant and by the
# temperature's position in the list (not its value).
paths = ' '.join(valid_files)
temperatures = [0.05, 0.2, 0.5]
# Forwarded to nlcc-bench as --n (presumably completions per prompt; confirm
# against the nlcc-bench CLI).
k = 5
for i,t in enumerate(temperatures):    
    # Variant: --prompt python
    out = f'override_bench_{i}.csv'
    !nlcc-bench $paths $out --n $k --prompt python --temperature $t
    # Variant: no --prompt flag
    out = f'bench_{i}.csv'
    !nlcc-bench $paths $out --n $k --temperature $t        
    # Variant: --prompt "insert"
    out = f'insert_bench_{i}.csv'
    !nlcc-bench $paths $out --n $k --prompt "insert" --temperature $t
    # Variant: licence/copyright header passed as the prompt
    out = f'cheader_bench_{i}.csv'
    header = 'header:# MIT License\n\n# Copyright (c) 2022 University of Rochester\n\n'
    !nlcc-bench $paths $out --n $k --prompt "$header" --temperature $t
    # Variant: "expert programmer" header passed as the prompt
    out = f'lheader_bench_{i}.csv'
    header = 'header:# This is written by an expert Python programmer\n\n'
    !nlcc-bench $paths $out --n $k --prompt "$header" --temperature $t

../data/aa_polarity/aa_polarity.yml
../data/alignment/alignment.yml
'Trajectory' object has no attribute 'copy'
'int' object is not subscriptable
'int' object is not subscriptable
'Trajectory' object has no attribute 'copy'
'Trajectory' object has no attribute 'copy'
../data/aromatic_aa/aromatic_aa.yml
../data/arrhenius/arrhenius.yml
../data/bimolecular_rate/bimolecular.yml
../data/blast/blast.yml
No module named 'urllib2'
EOF while scanning triple-quoted string literal (<string>, line 37)
No module named 'urllib2'
No module named 'urllib2'
No module named 'urllib2'
../data/bravais/bravais.yml
invalid syntax (<string>, line 17)
invalid syntax (<string>, line 17)
invalid syntax (<string>, line 17)
invalid syntax (<string>, line 17)
invalid syntax (<string>, line 17)
../data/canonicalize/canonicalize.yml
name 'Chem' is not defined
name 'Chem' is not defined
name 'Chem' is not defined
name 'Chem' is not defined
name 'Chem' is not defined
../data/compare_electronegativity/compare_electrone

In [None]:
# One entry per benchmark variant, in the order the results are stacked:
# (CSV file prefix, function rewriting the prompt's `context` label or None
# to keep it unchanged).
bench_variants = [
    ('override_bench', lambda c: 'none'),
    ('bench', None),
    ('insert_bench', lambda c: c + '-insert'),
    ('cheader_bench', lambda c: c + '-copyright'),
    ('lheader_bench', lambda c: c + '-authority'),
]

# Collect every per-file frame and concatenate once at the end instead of
# growing `merged` inside the loop.
frames = []
for i, t in enumerate(temperatures):
    # Result files are named by the temperature's position, not its value.
    for prefix, relabel in bench_variants:
        # sep=r'\s+' is the supported spelling of the deprecated
        # delim_whitespace=True.
        bf = pd.read_csv(f'{prefix}_{i}.csv', sep=r'\s+')
        # Attach prompt metadata; benchmark rows without a matching prompt
        # name are dropped by the inner join.
        bf = pd.merge(bf, df, how='inner', on='name')
        if relabel is not None:
            # Build a new column rather than writing through `.values`,
            # which only works while pandas hands back a writable view.
            bf = bf.assign(context=bf['context'].map(relabel))
        frames.append(bf)
merged = pd.concat(frames)
def simple_context(c):
    """Collapse a detailed context label into one of five coarse groups.

    Returns ``'none'`` when ``c`` is exactly ``'none'``; otherwise the first
    of ``'copyright'``, ``'authority'``, ``'insert'`` that occurs as a
    substring of ``c``; otherwise ``'custom'``.
    """
    if c == 'none':
        return c
    # Checked in this order, so a label containing several markers resolves
    # to the earliest one listed here.
    for label in ('copyright', 'authority', 'insert'):
        if label in c:
            return label
    return 'custom'
    
# Add the coarse context group as its own column.
merged = merged.assign(
    used_context=lambda frame: frame['context'].apply(simple_context)
)
# Persist the combined results, then preview the last rows.
merged.to_pickle('promp_results.pkl')
merged.tail()

In [None]:
# Accuracy per context, one facet per topic, coloured by temperature.
# FacetGrid creates its own figure, so no separate plt.figure() is opened
# (it would only leave an empty figure behind).
# Category orders are fixed once from the full table: when a categorical plot
# is mapped without explicit orders, each facet derives its own from its
# subset, so x positions and hue colours can disagree between facets.
context_order = list(merged['used_context'].unique())
temperature_order = sorted(merged['temperature'].unique())
g = sns.FacetGrid(merged, col='topic', col_wrap=5, height=2.5, aspect=2)
g.map(sns.pointplot, 'used_context', 'result', 'temperature',
      order=context_order, hue_order=temperature_order,
      palette='Set2', dodge=True)
g.add_legend(title='Temperature')
g.set_axis_labels('Context', 'Accuracy')
g.set_titles('{col_name}')
g.savefig('accuracy.pdf')

In [None]:
# Accuracy per context pooled over temperatures, one facet per topic.
# FacetGrid creates its own figure, so no separate plt.figure() is opened.
# A single explicit category order keeps the x positions consistent across
# facets, including facets where some context is absent.
context_order = list(merged['used_context'].unique())
g = sns.FacetGrid(merged, col='topic', col_wrap=5, height=2)
g.map(sns.pointplot, 'used_context', 'result',
      order=context_order, dodge=True, color='#333')
g.set_axis_labels('Context', 'Accuracy')
g.set_titles('{col_name}')
g.savefig('marginal_accuracy.pdf')

In [None]:
# Print the per-topic row counts (first count column only) as LaTeX source.
print(df.groupby('topic').count().iloc[:,0].to_latex())