In [None]:
import ast
import hashlib
import json
import os
import random

import numpy as np
import pandas as pd
import requests
import yaml

from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer, GPT2TokenizerFast

In [None]:
def normalize_json_string(json_str):
    """Return a canonical serialization of a JSON document.

    The text is parsed and re-serialized with sorted keys, so two documents
    that differ only in key order or whitespace produce the same string.

    Args:
        json_str: Text expected to contain a JSON document.

    Returns:
        The canonical JSON text, or ``json_str`` unchanged when it cannot be
        parsed (invalid JSON, or a non-string value such as NaN).
    """
    try:
        return json.dumps(json.loads(json_str), sort_keys=True)
    except (TypeError, ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; TypeError is raised for
        # non-string inputs. Unparseable values are passed through as-is.
        return json_str


def hash_json_string(json_str):
    """Return the MD5 hex digest of the canonical form of ``json_str``.

    Used as a deduplication key, not for security.

    Args:
        json_str: Text expected to contain a JSON document.

    Returns:
        A 32-character hexadecimal digest.
    """
    normalized_str = normalize_json_string(json_str)
    if not isinstance(normalized_str, str):
        # normalize_json_string hands back non-string inputs untouched;
        # stringify them so hashing does not fail on `.encode`.
        normalized_str = str(normalized_str)
    return hashlib.md5(normalized_str.encode('utf-8')).hexdigest()

# Deduplicating filters

In [None]:
# Load the 100k naive-sampler output and report its raw row count.
df = pd.read_csv(
    "/opt/gpudata/gdc-eval/results/datasets/naive_sampler_100k_v2.tsv",
    sep="\t",
)
print(len(df))

# Fingerprint every filter with hash_json_string.
df['json_hash'] = df['filters'].map(hash_json_string)

# Keep only the first row seen for each fingerprint.
is_repeat = df['json_hash'].duplicated()
df_dedup_100k = df.loc[~is_repeat].copy()

print(len(df_dedup_100k))

In [None]:
print(f"deficit in 100k dataset: {100_000 - len(df_dedup_100k)}")

In [None]:
# Load the 1M naive-sampler output and report its raw row count.
df = pd.read_csv(
    "/opt/gpudata/gdc-eval/results/datasets/naive_sampler_1M_v1.tsv",
    sep="\t",
)
print(len(df))

# Fingerprint every filter with hash_json_string.
df['json_hash'] = df['filters'].map(hash_json_string)

# Keep only the first row seen for each fingerprint.
is_repeat = df['json_hash'].duplicated()
df_dedup_1M = df.loc[~is_repeat].copy()
print(len(df_dedup_1M))

In [None]:
print(f"deficit in 1M dataset: {1_000_000 - len(df_dedup_1M)}")

In [None]:
# check the overlap in the two datasets

# Hashes present in both deduplicated datasets.
hashes_100k = set(df_dedup_100k["json_hash"])
hashes_1M = set(df_dedup_1M["json_hash"])
intersection = hashes_100k & hashes_1M
print(len(intersection))

In [None]:
# Rows of the 100k set whose hash also occurs in the 1M set.
in_both_datasets = df_dedup_100k["json_hash"].isin(intersection)
df_cross_dataset_dupes = df_dedup_100k[in_both_datasets]
df_cross_dataset_dupes

In [None]:
i = 70
json.loads(df_cross_dataset_dupes.iloc[i]["filters"])

In [None]:
# deficit

df_deficit = pd.read_csv(
    "/opt/gpudata/gdc-eval/results/datasets/deficit_samples_v1.tsv",
    sep="\t",
)
print(len(df_deficit))

# Fingerprint each filter, then keep the first row per fingerprint.
df_deficit['json_hash'] = df_deficit['filters'].map(hash_json_string)
df_deficit_dedup = df_deficit.loc[~df_deficit['json_hash'].duplicated()].copy()

# Persist at most the first 30,000 unique rows as the v2 deficit file.
df_deficit_dedup.iloc[:30_000].to_csv(
    "/opt/gpudata/gdc-eval/results/datasets/deficit_samples_v2.tsv",
    sep="\t",
    index=False,
)

In [None]:
df = pd.read_csv("/opt/gpudata/gdc-eval/results/datasets/deficit_samples_v2.tsv", sep="\t")
df

In [None]:
df_dedup_100k

# Naive sampling

In [None]:
with open(
    "/opt/gpudata/anirudh/git-repos/gdc-eval/ref-data/fields_short_v2.yaml", "r"
) as f:
    cbf_mappings = yaml.safe_load(f)

In [None]:
len(cbf_mappings.keys())

In [None]:
len(cbf_mappings.values())

In [None]:
# Named `chi2_dof` rather than `df` so the draw does not rebind the name this
# notebook otherwise uses for DataFrames.
chi2_dof = 6  # degrees of freedom, also the mean of the chi-squared distribution
num_samples = 1_000_000
margin = 0.15  # draw 15% extra; a later cell discards the zero-valued draws
CHI2_SEED = 0  # fixed seed so Restart & Run All reproduces the same draws

rng = np.random.default_rng(CHI2_SEED)
samples_margin = np.round(
    rng.chisquare(df=chi2_dof, size=int(num_samples + num_samples*margin))
).astype(int)  # sample from chi-squared and round to nearest integer

In [None]:
print(len(samples_margin))

In [None]:
# Discard zero-valued draws, keeping the remaining values in their original order.
samples = list(samples_margin[samples_margin != 0])

In [None]:
print(len(samples))

In [None]:
print(min(samples))

In [None]:
print(max(samples))

In [None]:
df_samples_100k = pd.read_csv("/opt/gpudata/gdc-eval/results/datasets/naive_sampler_100k_v2.tsv", sep="\t")
df_samples_100k

In [None]:
100000 / 16

In [None]:
df_samples_100k_duplicates = df_samples_100k[df_samples_100k["filters"].duplicated(keep=False)]
df_samples_100k_duplicates

In [None]:
df_samples_100k_duplicates.iloc[0]["filters"]

In [None]:
df_samples_100k_duplicates.iloc[1]["filters"]

In [None]:
df_samples_100k_duplicates.iloc[2]["filters"]

In [None]:
filter_col = "filters"

# Start from an explicit copy so later column assignments never write to a
# slice of df_samples_100k (avoids SettingWithCopyWarning).
df = df_samples_100k[df_samples_100k[filter_col] != "{}"].copy()
print(len(df))
df = df.drop_duplicates(subset=[filter_col])
print(len(df))

# Remove examples whose filter text mentions any of these identifier fields,
# printing the surviving row count after each one.
for id_field in ("set_id", "case_id", "gene_id", "ssm_id"):
    # na=True drops rows with a missing filter, which is what the previous
    # `== False` comparison did implicitly.
    mentions_id = df[filter_col].str.contains(id_field, regex=False, na=True)
    # The flag column (all False after filtering) is kept for compatibility.
    df = df[~mentions_id].assign(**{id_field: False})
    print(len(df))

In [None]:
def _prepare_filter_dataset(df, filter_col):
    df = df[df[filter_col] != "{}"]
    df = df.drop_duplicates(subset=[filter_col])
    # remove examples containing set_id
    df["set_id"] = df[filter_col].str.contains("set_id")
    df = df[df["set_id"] == False]
    # remove examples containing case_id
    df["case_id"] = df[filter_col].str.contains("case_id")
    df = df[df["case_id"] == False]
    # remove examples containing gene_id
    df["gene_id"] = df[filter_col].str.contains("gene_id")
    df = df[df["gene_id"] == False]
    # remove examples containing ssm_id
    df["ssm_id"] = df[filter_col].str.contains("ssm_id")
    df = df[df["ssm_id"] == False]

    def extract_logged(example):
        d = json.loads(example)
        if "isLoggedIn" in d:
            d.pop("isLoggedIn")
        return str(d)

    # remove 'isLoggedIn' key from filter dict
    df["filters_cleaned"] = df["filters"].apply(extract_logged)

    return df

df_filter_100k = _prepare_filter_dataset(df_samples_100k, "filters")
df_filter_100k

In [None]:
# Index labels present in the raw sample but absent after filtering.
missed_indices = set(df_samples_100k.index) - set(df_filter_100k.index)

In [None]:
# `missed_indices` holds index labels, so select by label with .loc rather
# than by position with .iloc.
missed_df_100k = df_samples_100k.loc[sorted(missed_indices)]
missed_df_100k

In [None]:
# Print the parsed filter of the first filtered-out row, if there is one.
for i, row in missed_df_100k.head(1).iterrows():
    print(json.loads(row["filters"]))

# generated queries exploration

In [None]:
path = "/opt/gpudata/gdc-eval/results/datasets/Mistral-7B-Instruct-v0.3_generated_queries_100k_naive_v2.csv"

In [None]:
df = pd.read_csv(path)
df.head()

In [None]:
i = 5

In [None]:
df.iloc[i]["filters"]

In [None]:
type(df.iloc[i]["filters"])

In [None]:
type(ast.literal_eval(df.iloc[i]["filters"]))

In [None]:
json.loads(json.dumps(ast.literal_eval(df.iloc[i]["filters"])))

In [None]:
filter = ast.literal_eval(df.iloc[i]["filters"])

In [None]:
# Send the parsed filter to the GDC cases endpoint and print the JSON reply.
cases_endpt = 'https://api.gdc.cancer.gov/cases'
params = {
    "filters": json.dumps(filter),
    "pretty": "false",
}
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
response = requests.get(cases_endpt, params=params, timeout=60)
print(response.json())

In [None]:
df_1 = pd.read_csv("/opt/gpudata/gdc-eval/results/datasets/naive_sampler_1M_v1_part1.tsv", sep="\t")
len(df_1)

In [None]:
df_1.head()

In [None]:
example1 = df_1.iloc[0]["filters"]
print(example1)
print(type(example1))
print(json.loads(example1))

In [None]:
cases_endpt = ""

In [None]:
tok = GPT2TokenizerFast.from_pretrained("gpt2")

In [None]:
df_2 = pd.read_csv("/opt/gpudata/gdc-eval/results/datasets/naive_sampler_1M_v1_part2.tsv", sep="\t")
len(df_2)

In [None]:
df_3 = pd.read_csv("/opt/gpudata/gdc-eval/results/datasets/naive_sampler_1M_v1_part3.tsv", sep="\t")
len(df_3)

# Train Test set exploration

In [None]:
train_df = pd.read_csv("/opt/gpudata/steven/gdc-cohort-pilot/data/train.csv")
test_df = pd.read_csv("/opt/gpudata/steven/gdc-cohort-pilot/data/test.csv")

In [None]:
print(len(train_df))
print(len(test_df))

In [None]:
train_df.head()

In [None]:
train_df.iloc[97]["filters"]

In [None]:
train_df.iloc[0]["filters"]

In [None]:
json.loads(train_df.iloc[0]["filters"])

In [None]:
# Query the GDC cases endpoint for the CPTAC program using the field name
# without a "cases." prefix. `json` and `requests` are already imported in the
# notebook's import cell.
example = {'op': 'and',
 'content': [{'op': 'in',
   'content': {'field': 'project.program.name', 'value': ['CPTAC']}}]}
projects_endpt = 'https://api.gdc.cancer.gov/cases'
params = {
    'filters': json.dumps(example),
    # NOTE(review): presumably the expected number of matching cases — confirm.
    'size' : 1687,
    }
# Time out rather than hang, and fail on an HTTP error instead of letting a
# later cell hit a missing "data" key.
response = requests.get(projects_endpt, params=params, timeout=60)
response.raise_for_status()

In [None]:
# Unique ids of the hits returned by the most recent request.
id_set_ = {hit.get("id") for hit in response.json()["data"]["hits"]}

In [None]:
# Query the GDC cases endpoint for the CPTAC program, this time spelling the
# field with an explicit "cases." prefix.
projects_endpt = 'https://api.gdc.cancer.gov/cases'
example = {'op': 'and',
 'content': [{'op': 'in',
   'content': {'field': 'cases.project.program.name', 'value': ['CPTAC']}}]}
params = {
    'filters': json.dumps(example),
    # NOTE(review): presumably the expected number of matching cases — confirm.
    'size': 1687,
        }
# Time out rather than hang, and fail on an HTTP error instead of letting a
# later cell hit a missing "data" key.
response = requests.get(projects_endpt, params=params, timeout=60)
response.raise_for_status()

In [None]:
# Unique ids of the hits returned by the most recent request.
id_set_cases = {hit.get("id") for hit in response.json()["data"]["hits"]}

In [None]:
id_set_ - id_set_cases

In [None]:
def prepend_to_fields(json_str, prefix):
    """Parse a filter string and prepend ``prefix`` to every "field" value.

    The text is parsed as JSON first; if that fails it is parsed as a Python
    literal (single quotes, ``True``/``False``/``None``), so both JSON dumps
    and ``str(dict)`` dumps are accepted.

    Args:
        json_str: Filter serialized as JSON or as a Python literal.
        prefix: Text placed in front of each string stored under a "field" key.

    Returns:
        The parsed object with the prefixes applied. The prefix is added
        unconditionally, including to values that already start with it.

    Raises:
        json.JSONDecodeError: If the text parses as neither JSON nor a Python
            literal.
        TypeError: If ``json_str`` is not a string, bytes or bytearray.
    """
    def recurse(obj):
        if isinstance(obj, dict):
            for key, value in obj.items():
                if key == "field" and isinstance(value, str):
                    # Replacing the value of an existing key does not resize
                    # the dict, so it is safe during iteration.
                    obj[key] = prefix + value
                else:
                    recurse(value)
        elif isinstance(obj, list):
            for item in obj:
                recurse(item)
        # Ignore primitives like str/int/None/etc.

    try:
        data = json.loads(json_str)
    except ValueError as json_error:
        try:
            data = ast.literal_eval(json_str)
        except (ValueError, SyntaxError):
            # Neither parser accepts it: surface the original JSON error.
            raise json_error
    recurse(data)
    return data

def modify_json_string(json_str):
    """Return the filter re-serialized as JSON with "cases." prepended to its fields.

    Returns None when prefixing or serialization raises any Exception.
    """
    try:
        prefixed = prepend_to_fields(json_str, "cases.")
        return json.dumps(prefixed)
    except Exception:
        return None  # or log the error

In [None]:
df_100k = pd.read_csv("/opt/gpudata/gdc-eval/results/datasets/naive_sampler_100k_v2.tsv", sep="\t")
df_100k.head()

In [None]:
df_100k['corrected_filters'] = df_100k['filters'].apply(modify_json_string)
df_100k.head()

In [None]:
# Show the original and corrected filter side by side for the first 26 rows.
# (The random indices previously drawn here were never used.)
preview = df_100k.head(26)
for original, corrected in zip(preview["filters"], preview["corrected_filters"]):
    print(json.loads(original))
    # modify_json_string returns None when a correction fails; show that
    # value as-is instead of crashing in json.loads.
    print(json.loads(corrected) if isinstance(corrected, str) else corrected)
    print("\n")

In [None]:
df_queries = pd.read_csv("/opt/gpudata/gdc-eval/results/datasets/Mistral-7B-Instruct-v0.3_generated_queries_100k_naive_v2.csv")
df_queries.head()

In [None]:
print(len(df_queries))

In [None]:
# Compare up to 25 randomly chosen row positions across the sampler output and
# the generated-queries file. Positions are drawn from the rows both frames
# actually have, so `.iloc` cannot run past the end of either one.
n_common = min(len(df_100k), len(df_queries))
rand_indices = random.sample(range(n_common), min(25, n_common))

for i in rand_indices:
    print(i)
    print(json.loads(df_100k.iloc[i]["filters"]))
    # NOTE(review): an earlier cell parses this file's filters with
    # ast.literal_eval — confirm json.loads is the right parser here.
    print(json.loads(df_queries.iloc[i]["filters"]))
    print("\n")

In [None]:
# check old mistral rewrites 

rew_df = pd.read_csv("/opt/gpudata/gdc-eval/results/datasets/Mistral-7B-Instruct-v0.3_generated_queries_v2.csv")
rew_df.head()

# Check dataset generation

In [None]:
data = load_from_disk("/opt/gpudata/gdc-eval/results/datasets/gdc_eval_train_tokenized.hf")

In [None]:
print(data)

In [None]:
print(data["raw"])

In [None]:
next(iter(data["raw"]))

In [None]:
from datasets import load_dataset

In [None]:
dataset = load_dataset("csv", data_files="/opt/gpudata/gdc-cohort-data/train.csv")

In [None]:
dataset

In [None]:
ds = dataset.map(lambda example: {"prompt":example["queries"].strip(),"completion":example["filters"]}, remove_columns=["filters", "queries"])

In [None]:
def tokenize_function(examples):
    """Tokenize the "text" entries of a batch (max_length=1024, truncation and padding on)."""
    return tokenizer(examples["text"], max_length=1024, truncation=True, padding=True)


tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "left"  # needed for flash attention

# Concatenate prompt and completion into a single "text" column.
ds = ds.map(lambda example: {"text": f"{example['prompt']}{example['completion']}"})

# Tokenize only the first 64 training rows.
first_rows = ds["train"].select(range(64))
tok_dataset = first_rows.map(
    tokenize_function,
    batched=True,
    remove_columns=["prompt", "completion"],
)
tok_dataset

In [None]:
type(ds)

In [None]:
rewrites_df =pd.read_csv("/opt/gpudata/gdc-cohort-data/user_cohort_rewrites_v1.csv")

In [None]:
rewrites_df

# Dataset corrections: prepending the endpoint prefix to fields

In [None]:
def prepend_to_fields(json_str, prefix):
    """Parse a filter string and prepend ``prefix`` to every "field" value.

    The text is parsed as a Python literal first; if that fails it is parsed
    as JSON, so filters containing ``true``/``false``/``null`` are accepted as
    well as ``str(dict)`` dumps.

    Args:
        json_str: Filter serialized as a Python literal or as JSON.
        prefix: Text placed in front of each string stored under a "field" key.

    Returns:
        The parsed object with the prefixes applied. The prefix is added
        unconditionally, including to values that already start with it.

    Raises:
        ValueError, SyntaxError: If the text parses as neither a Python
            literal nor JSON (the ``ast.literal_eval`` error is re-raised).
    """
    def recurse(obj):
        if isinstance(obj, dict):
            for key, value in obj.items():
                if key == "field" and isinstance(value, str):
                    # Replacing the value of an existing key does not resize
                    # the dict, so it is safe during iteration.
                    obj[key] = prefix + value
                else:
                    recurse(value)
        elif isinstance(obj, list):
            for item in obj:
                recurse(item)
        # Ignore primitives like str/int/None/etc.

    try:
        data = ast.literal_eval(json_str)
    except (ValueError, SyntaxError) as literal_error:
        try:
            data = json.loads(json_str)
        except (TypeError, ValueError):
            # Neither parser accepts it: surface the original literal error.
            raise literal_error
    recurse(data)
    return data

def extract_fields(json_str):
    """Return every string stored under a "field" key in a JSON document.

    Values are listed in the order a depth-first walk of the parsed document
    meets them. A "field" key whose value is not a string is descended into
    like any other key.
    """
    def walk(node):
        if isinstance(node, dict):
            for name, child in node.items():
                if name == "field" and isinstance(child, str):
                    yield child
                else:
                    yield from walk(child)
        elif isinstance(node, list):
            for element in node:
                yield from walk(element)

    return list(walk(json.loads(json_str)))


def modify_json_string(json_str):
    """Return the filter re-serialized as JSON with "cases." prepended to its fields.

    Returns None when prefixing or serialization raises any Exception.
    """
    try:
        prefixed = prepend_to_fields(json_str, "cases.")
        return json.dumps(prefixed)
    except Exception:
        return None  # or log the error

def all_fields_prefixed(obj, prefix):
    """Report whether every string under a "field" key starts with ``prefix``.

    Dicts and lists are searched recursively; other values are ignored. A
    "field" key whose value is not a string is descended into like any other
    key. Returns True when no "field" strings are found at all.
    """
    def check(node):
        if isinstance(node, dict):
            return all(
                child.startswith(prefix)
                if key == "field" and isinstance(child, str)
                else check(child)
                for key, child in node.items()
            )
        if isinstance(node, list):
            return all(check(element) for element in node)
        # Primitives carry no fields, so they cannot fail the check.
        return True

    return check(obj)

In [None]:
# Load the deficit samples together with their generated queries.
# NOTE(review): the file has a .tsv extension but is read with the default
# comma separator — confirm the delimiter of this file.
df_deficit = pd.read_csv("/opt/gpudata/gdc-eval/results/datasets/deficit_samples_v2_queries.tsv")
df_deficit

In [None]:
# 100k synthetic correction: load the 100k sampler output with its generated queries
df_100k = pd.read_csv("/opt/gpudata/gdc-eval/results/datasets/naive_sampler_100k_v2_queries.csv")
df_100k

In [None]:
# Keep the original filter as `filters_og` and store the "cases."-prefixed
# version produced by modify_json_string under `filters`.
df_corrected_100k = (
    df_100k[["filters", "queries"]]
    .rename(columns={"filters": "filters_og"})
    .assign(filters=lambda frame: frame["filters_og"].apply(modify_json_string))
)
df_corrected_100k

In [None]:
# The corrected filters are JSON text written by json.dumps, so parse them
# with json.loads (ast.literal_eval rejects true/false/null). Rows whose
# correction failed hold None and are flagged invalid instead of raising.
df_corrected_100k["is_valid"] = df_corrected_100k["filters"].apply(
    lambda s: isinstance(s, str) and all_fields_prefixed(json.loads(s), 'cases.')
)
df_corrected_100k[~df_corrected_100k["is_valid"]]

In [None]:
# 100k merging: top the corrected set up to 100,000 rows with deficit samples.
# Clamp at zero: with a negative count, `[:n1]` would trim rows from the end
# of df_deficit instead of taking none.
n1 = max(100_000 - len(df_corrected_100k), 0)
df1_slice = df_corrected_100k[["filters", "queries"]]
df2_slice = df_deficit.iloc[:n1]
df_100k_full = pd.concat([df1_slice, df2_slice], axis=0, ignore_index=True).drop(columns="prompts")
df_100k_full.to_csv("/opt/gpudata/gdc-cohort-data/train_synthetic_100k.csv", index=False)

In [None]:
# Users + 100k merging: load the user training set ...
df_users = pd.read_csv("/opt/gpudata/gdc-cohort-data/train.csv")
# ... append the 100k synthetic rows after it and write the combined file.
df_u_100k = pd.concat([df_users, df_100k_full], axis=0, ignore_index=True)
df_u_100k.to_csv("/opt/gpudata/gdc-cohort-data/train_synthetic_users+100k.csv", index=False)

In [None]:
# 1M synthetic: load the three query files of the 1M sampler output
df_1M_part1 = pd.read_csv("/opt/gpudata/gdc-eval/results/datasets/naive_sampler_1M_v1_part1_queries.csv")
df_1M_part2 = pd.read_csv("/opt/gpudata/gdc-eval/results/datasets/naive_sampler_1M_v1_part2_queries.csv")
df_1M_part3 = pd.read_csv("/opt/gpudata/gdc-eval/results/datasets/naive_sampler_1M_v1_part3_queries.csv")

# Stack the parts in order under a fresh 0..n-1 index.
df_1M = pd.concat([df_1M_part1, df_1M_part2, df_1M_part3], axis=0, ignore_index=True)
df_1M

In [None]:
# Keep the original filter as `filters_og` and store the "cases."-prefixed
# version produced by modify_json_string under `filters`.
df_corrected_1M = (
    df_1M[["filters", "queries"]]
    .rename(columns={"filters": "filters_og"})
    .assign(filters=lambda frame: frame["filters_og"].apply(modify_json_string))
)
df_corrected_1M

In [None]:
# The corrected filters are JSON text written by json.dumps, so parse them
# with json.loads (ast.literal_eval rejects true/false/null). Rows whose
# correction failed hold None and are flagged invalid instead of raising.
df_corrected_1M["is_valid"] = df_corrected_1M["filters"].apply(
    lambda s: isinstance(s, str) and all_fields_prefixed(json.loads(s), 'cases.')
)
df_corrected_1M[~df_corrected_1M["is_valid"]]

In [None]:
# 1M merging: top the corrected set up to 1,000,000 rows with deficit samples
# taken after the ones already used for the 100k set.
# Clamp both bounds at zero so a negative count cannot turn the slice into
# "count from the end of df_deficit".
n2 = max(1000_000 - len(df_corrected_1M), 0)
deficit_start = max(n1, 0)  # n1 is set by the 100k merging cell
df1_slice = df_corrected_1M
df2_slice = df_deficit.iloc[deficit_start:(deficit_start + n2)]
df_1M_full = pd.concat([df1_slice, df2_slice], axis=0, ignore_index=True)
# Write without the helper columns so the file matches the 100k synthetic CSV.
# df_1M_full itself keeps them because a later cell drops them by name.
df_1M_full.drop(
    columns=["filters_og", "is_valid", "prompts"], errors="ignore"
).to_csv("/opt/gpudata/gdc-cohort-data/train_synthetic_1M.csv", index=False)

In [None]:
df_1M_full

In [None]:
# Users + 1M merging: load the user training set ...
df_users = pd.read_csv("/opt/gpudata/gdc-cohort-data/train.csv")
# ... append the 1M synthetic rows, drop the helper columns, and write the combined file.
df_u_1M = pd.concat([df_users, df_1M_full], axis=0, ignore_index=True).drop(columns=["filters_og", "is_valid", "prompts"])
df_u_1M.to_csv("/opt/gpudata/gdc-cohort-data/train_synthetic_users+1M.csv", index=False)

# training script updates

In [None]:
dataset = load_dataset("csv", data_files={"train":"/opt/gpudata/gdc-eval/results/datasets/naive_sampler_100k_v2_queries.csv"})
print(dataset)

In [None]:
def _to_prompt_completion(example):
    """Build the prompt (stripped query) and completion (filter) for one row."""
    return {
        "prompt": example["queries"].strip(),
        "completion": example["filters"],
    }


dataset = dataset.map(
    _to_prompt_completion,
    remove_columns=["filters", "prompts", "queries"],
)

In [None]:
dataset = dataset.map(lambda example: {"text":f"{example['prompt']}{example['completion']}"})
dataset