In [6]:
from langchain_community.llms import Ollama
# Shared Ollama client used by every prompt in this notebook. The options are
# gathered in one mapping so they are easy to read and tweak.
ollama_options = dict(model="mixtral", num_gpu=4, keep_alive='1h', format='json')
llm = Ollama(**ollama_options)

# One throwaway request; the reply is discarded. Presumably a smoke test /
# warm-up so the model is loaded before the real prompts are sent.
llm.invoke("Why is the sky blue?")

# Default number of synthetic grants requested per category.
n = 10


def llama_prompt(text, n_grants=None):
    """Ask the shared ``llm`` client to draft synthetic grants for one category.

    Parameters
    ----------
    text : str
        Grant selection criteria; interpolated verbatim at the end of the prompt.
    n_grants : int, optional
        Number of grants to request. When omitted, the module-level ``n`` is
        used (looked up at call time, as before).

    Returns
    -------
    Whatever ``llm.invoke`` returns for the prompt; the caller in this
    notebook passes it to ``json.loads``.
    """
    # `llm` is the module-level client created at the top of this cell; it is
    # not re-created here.
    # NOTE(review): the prompt asks for JSONL while the client is built with
    # format='json' and the caller decodes with a single json.loads() — confirm
    # the model reliably returns one JSON document.
    count = n if n_grants is None else n_grants
    return llm.invoke(f"""
        please create {count} separate grants from the following selection criteria. 
        Do not include any explanations or apologies in your responses. 
        For each grant, I want a title and abstract, as a dictionary ('title' and 'abstract' as keys) which I can parse using json.loads() in Python.
        So the output should look like a jsonl of length {count} with each line being a dictionary with 'title' and 'abstract' keys.

        Please make sure to return {count} grants.
        Grant Selection Criteria: {text}
    """)


In [7]:
# Selection-criteria text per category. Each string is passed to
# llama_prompt(), which interpolates it verbatim into the LLM prompt, so the
# text must not carry stray quote characters (three of these previously opened
# with four double quotes and ended with an extra one, which leaked literal
# '"' characters into the prompt).
cat_6_8 = """
All aspects of testing, evaluation and provision of complementary approaches to conventional medicine in humans in a clinical, community or applied setting including:

hypnotherapy, massage, acupuncture and homeopathy
issues relating to health and social services and health care delivery
attitudes and beliefs of patients and health care professionals.

The focus of this criteria is the testing, evaluation and provision.
"""

cat_5_8 = """
Discovery and development of complementary approaches to conventional medical therapies including:

hypnotherapy, meditation, massage, acupuncture and homeopathy
mechanisms of action
testing in model systems

The focus of this criteria is the development of complementary approaches.
"""

cat_7_4 = """
development and/or distribution of resources and equipment for use by the community including informatics systems
infrastructure support for trials, networks, consortia and centres
"""

cat_5_7 = """
Development of physical interventions including:

physical therapies, physiotherapy, occupational therapy, speech therapy, dietetics, exercise and osteopathy
mechanisms of action
testing in model systems
"""

cat_1_4 = """
Development of novel underpinning research measures and analytical methodologies including

development of statistical methods and algorithms for genomic analysis
development of mapping methodologies and novel data comparison methods
development of biological, psychological and socioeconomic research measures
"""

In [8]:
# Criteria text keyed by variable name. The generation loop derives the
# category code from each key by splitting on '_' and joining the trailing
# parts with '.'. Insertion order matches the original literal.
cat_dict = dict(
    cat_6_8=cat_6_8,
    cat_5_8=cat_5_8,
    cat_7_4=cat_7_4,
    cat_5_7=cat_5_7,
    cat_1_4=cat_1_4,
)

In [3]:
import pandas as pd
# Preprocessed training split; its columns define the layout of the synthetic
# rows generated later in the notebook.
train = pd.read_parquet(
    path='../data/preprocessed/ra/train.parquet',
)

In [9]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

def _extract_grants(parsed):
    """Flatten decoded model output into a list of grant dictionaries.

    The model does not return a fixed shape, so this accepts any of:
    a single grant dict, a list of grant dicts, a dict whose values are grant
    dicts, or a dict/list wrapping a list of grant dicts. A "grant dict" is a
    dict that has both 'title' and 'abstract' keys. Anything else (strings,
    numbers, dicts without those keys and without nested grants) contributes
    nothing.

    Parameters
    ----------
    parsed : object
        Result of ``json.loads`` on the model response.

    Returns
    -------
    list of dict
        Grant dicts in the order encountered; empty if none were found.
    """
    if isinstance(parsed, dict):
        if 'title' in parsed and 'abstract' in parsed:
            return [parsed]
        # Wrapper object: the grants (or a list of them) are among the values.
        parsed = list(parsed.values())
    if not isinstance(parsed, list):
        return []
    grants = []
    for item in parsed:
        grants.extend(_extract_grants(item))
    return grants


# Same positional assumption as before: the last column of `train` receives the
# grant text and every other column receives a 0/1 label.
# NOTE(review): confirm the text column really is the last one in `train`.
label_columns = list(train.columns[:-1])
text_column = train.columns[-1]

# Newlines, carriage returns and tabs are each replaced by a single space.
whitespace_to_space = str.maketrans('\n\r\t', '   ')

# Rows are collected in a plain list and turned into a DataFrame once at the
# end; concatenating inside the loop is quadratic and left object-dtype labels.
synthetic_rows = []

for keys, cat_description in tqdm(cat_dict.items()):
    # "cat_6_8" -> "6.8"
    category = '.'.join(keys.split('_')[1:])
    # Fail before the slow model call instead of silently creating a brand-new
    # column for an unknown category.
    if category not in label_columns:
        raise KeyError(
            f"category {category!r} (from {keys!r}) is not a label column of train"
        )

    result = llama_prompt(cat_description)
    grants = _extract_grants(json.loads(result))
    if not grants:
        raise ValueError(
            f"no grants with 'title' and 'abstract' found in model output for {keys!r}"
        )

    for grant in grants:
        grant_text = grant['title']+' '+grant['abstract']
        grant_text = grant_text.translate(whitespace_to_space).lower()

        # All labels 0 except the column matching this category.
        row = dict.fromkeys(label_columns, 0)
        row[text_column] = grant_text
        row[category] = 1
        synthetic_rows.append(row)

pd_synthetic = pd.DataFrame(synthetic_rows, columns=train.columns)




100%|██████████| 5/5 [06:25<00:00, 77.01s/it]


In [16]:
# Stack the synthetic rows underneath the real training rows and renumber the
# index from 0.
train_enhanced = pd.concat(
    (train, pd_synthetic),
    axis=0,
    ignore_index=True,
)

In [20]:
# Shuffle the rows so the synthetic samples are not all at the end of the frame.
# A fixed random_state makes the saved dataset reproducible when the notebook
# is re-run from a fresh kernel; the old index is dropped.
train_enhanced = train_enhanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [22]:
# Write the augmented training set to a new parquet file under the same
# preprocessed directory the original split was read from.
train_enhanced.to_parquet(
    path='../data/preprocessed/ra/train_enhanced.parquet',
)