# Code to Process Data from Marc

In [9]:
# Convert the raw CSV export into the two-column training file and remember
# every condition string so the candidate-pool cell can exclude them.
lab_pool = set()
with open("paper/data/Tungsten carbide data update.csv") as src, open(
    "paper/data/co_train.csv", "w"
) as dst:
    for raw_line in src:
        # Lines shorter than five characters are skipped.
        if len(raw_line) >= 5:
            # Drop the first character, then keep the text before the first comma.
            conditions = raw_line[1:].partition(",")[0]
            # Segment after the first "of", with its last four characters removed.
            # NOTE(review): relies on the exact layout of the raw export - confirm.
            co_yield = raw_line.split("of")[1][:-4]
            dst.write(f'"{conditions}",{co_yield}\n')
            lab_pool.add(conditions)

In [10]:
from itertools import product

# Parameter grids for the candidate pool; wtr, ctempr, tempr and dopantr are
# consumed by the cell that writes co_pool.txt.
wtr = range(4, 30)  # catalyst loading, wt%: 4 .. 29
ctempr = range(600, 1050, 50)  # carburization temperature, °C: 600 .. 1000
tempr = range(250, 375, 25)  # reaction temperature, °C: 250 .. 350
dpwt = ["0.5 wt%", "5 wt%"]
dpotanti = [f"with {metal} dopant metal" for metal in ("Co", "Fe", "Cu", "Pt", "Ni")]
# Undoped baseline first, then every metal at each loading (loading varies slowest).
dopantr = ["without any dopant metal"] + [
    f"{d} at {w}" for w in dpwt for d in dpotanti
]

# Write every grid combination that is not already in lab_pool, one per line.
with open("paper/data/co_pool.txt", "w") as pool_file:
    for wt, ctemp, temp, dopant in product(wtr, ctempr, tempr, dopantr):
        preparation = f"A {wt} wt% tungsten carbide catalyst was prepared {dopant} and carburized at {ctemp} °C. "
        reaction = f"The reaction was run at {temp} °C"
        prompt = preparation + reaction
        if prompt not in lab_pool:
            # Only the text before the first comma (if any) is written.
            pool_file.write(prompt.partition(",")[0] + "\n")
        else:
            print("skipping found one")

# Mult Prediction

In [14]:
import bolift
import numpy as np
import matplotlib.pyplot as plt
import json
import pandas as pd
from langchain.prompts.prompt import PromptTemplate


# Training file written by the data-processing cell at the top of the notebook.
data_path = "paper/data/co_train.csv"
# Seed NumPy's global random state so repeated runs start from the same state.
np.random.seed(0)

In [20]:
# Load the training table from data_path; later cells read its `conditions`
# and `coyield` columns.
raw_data = pd.read_csv(data_path)
# Bare expression so the notebook renders the table.
raw_data

Unnamed: 0,conditions,coyield
0,A 15 wt% tungsten carbide catalyst was prepare...,1.66
1,A 15 wt% tungsten carbide catalyst was prepare...,3.03
2,A 15 wt% tungsten carbide catalyst was prepare...,1.61
3,A 15 wt% tungsten carbide catalyst was prepare...,4.12
4,A 15 wt% tungsten carbide catalyst was prepare...,0.52
5,A 15 wt% tungsten carbide catalyst was prepare...,3.36
6,A 15 wt% tungsten carbide catalyst was prepare...,9.8
7,A 15 wt% tungsten carbide catalyst was prepare...,18.98
8,A 15 wt% tungsten carbide catalyst was prepare...,6.21
9,A 15 wt% tungsten carbide catalyst was prepare...,16.35


In [21]:
# Select the row(s) whose CO yield equals the maximum, then print them followed
# by the full (untruncated) condition strings.
is_best = raw_data.coyield == raw_data.coyield.max()
r = raw_data[is_best]
print(r)
print(r.conditions.values)

                                          conditions  coyield
7  A 15 wt% tungsten carbide catalyst was prepare...    18.98
['A 15 wt% tungsten carbide catalyst was prepared with Cu dopant metal at 5 wt% and carburized at 835 °C. The reaction was run at 350 °C']


In [22]:
# Build the few-shot (top-k) ask/tell model used in this section.
# NOTE(review): the meaning of each keyword is defined by bolift - confirm
# against its documentation before changing values.
asktell = bolift.AskTellFewShotTopk(
    # Wraps the condition text in quotes after a fixed label.
    x_formatter=lambda x: f'the synthesis procedure:"{x}"',
    y_name="the CO yield",
    # Renders the yield with two decimals and a trailing percent sign.
    y_formatter=lambda y: f"{y:.2f}%",
    model="gpt-4",
    temperature=1,
    # presumably the number of examples selected per prompt - TODO confirm
    selector_k=5,
)

In [23]:
# Feed every row except the last to the model. After the loop, c and y still
# hold the last row's values, which the following cells use as a hold-out.
held_out_label = len(raw_data) - 1
for i, (c, y) in raw_data.iterrows():
    if i >= held_out_label:
        continue
    asktell.tell(c, y)

ValidationError: 1 validation error for OpenAIEmbeddings
__root__
  Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass  `openai_api_key` as a named parameter. (type=value_error)

In [None]:
# Predict for the row left out of the tell loop (c, y are the loop's final
# values) and display the prediction beside the recorded yield.
asktell.predict(c), y

In [None]:
# Now add the last row (still held in c, y) so the model has seen every
# training row before inverse prediction and pool selection.
asktell.tell(c, y)

In [None]:
# Inverse prediction with a target value of 20.
# NOTE(review): presumably returns a procedure text expected to reach that
# CO yield - confirm against bolift's inv_predict documentation.
s = asktell.inv_predict(20)
s

In [None]:
# Load the candidate procedures produced by the pool-generation cell.
# The path matches where that cell writes the file and the "paper/data/"
# prefix used by data_path. Each line is stripped of its newline (and blank
# lines are dropped) so pool entries have the same form as the training
# conditions passed to tell().
with open("paper/data/co_pool.txt") as f:
    pool = bolift.Pool([line.rstrip("\n") for line in f if line.strip()])
print(len(pool))

In [None]:
# Ask the model to pick from the pool using the "greedy" strategy.
# NOTE(review): inv_filter=50 presumably limits the candidates considered
# via inverse prediction - confirm against bolift's ask() documentation.
result = asktell.ask(pool, "greedy", inv_filter=50)

In [None]:
# Take the first element of the first item returned by ask().
# NOTE(review): assumed to be the top-ranked candidate text - confirm.
x = result[0][0]
print(x)

In [None]:
# Predict for the selected candidate and print the returned object together
# with its mean() and std().
dist = asktell.predict(x)
print(dist, dist.mean(), dist.std())

# Gaussian Process Regression (GPR)

In [None]:
# Rebind asktell to the GPR-based model; the cells below repeat the same
# tell / predict / ask workflow with it.
asktell = bolift.AskTellGPR(
    # Wraps the condition text in quotes after a fixed label.
    x_formatter=lambda x: f'synthesis procedure:"{x}"',
    y_name="CO yield",
    # Renders the yield with two decimals (no percent sign here).
    y_formatter=lambda y: f"{y:.2f}",
)

In [None]:
# Feed every row except the last to the GPR model. c and y keep the last
# row's values after the loop and serve as the hold-out in the next cells.
held_out_label = len(raw_data) - 1
for i, (c, y) in raw_data.iterrows():
    if i >= held_out_label:
        continue
    asktell.tell(c, y)

In [None]:
# Predict for the row left out of the tell loop (c, y are the loop's final
# values) and display the prediction beside the recorded yield.
asktell.predict(c), y

In [None]:
# Now add the last row (still held in c, y) so the model has seen every
# training row before pool selection.
asktell.tell(c, y)

In [None]:
# Load the candidate procedures produced by the pool-generation cell.
# The path matches where that cell writes the file and the "paper/data/"
# prefix used by data_path. Each line is stripped of its newline (and blank
# lines are dropped) so pool entries have the same form as the training
# conditions passed to tell().
with open("paper/data/co_pool.txt") as f:
    pool = bolift.Pool([line.rstrip("\n") for line in f if line.strip()])

In [None]:
# Ask the GPR model to pick from the pool using the "greedy" strategy.
result = asktell.ask(pool, "greedy")

In [None]:
# Take the first element of the first item returned by ask().
# NOTE(review): assumed to be the top-ranked candidate text - confirm.
x = result[0][0]
print(x)

In [None]:
# Predict for the selected candidate and print the returned object together
# with its mean() and std().
dist = asktell.predict(x)
print(dist, dist.mean(), dist.std())