In [None]:
# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# import torch

# MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

# tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_ID,
#     torch_dtype=torch.float16,
#     device_map=None
# ).to("cuda")

# generator = pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     device=0
# )

# def generate_insight(prompt: str) -> str:
#     response = generator(
#         prompt,
#         do_sample=True,
#         max_new_tokens=200,
#         temperature=0.7,
#         top_k=50,
#         top_p=0.95,
#         eos_token_id=tokenizer.eos_token_id
#     )

#     return response[0]["generated_text"];


# if __name__ == "__main__":
#     test_prompt = "You are a data scientist. Analyze this metadata: age, region, income. What trends can you see? Give short answers. Be precise and analytical."
#     print(generate_insight(test_prompt))

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16
).to("cuda")

def generate_insight(prompt: str, max_tokens: int = 512) -> str:
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    output_ids = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id  # Prevents warnings
    )

    generated = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return generated[len(prompt):].strip()  # remove original prompt


if __name__ == "__main__":
    prompt = (
        "You are a data scientist. Analyze this metadata: age, region, income. Give 3 short points and stop generating"
    )
    print(generate_insight(prompt))


In [13]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoProcessor
import torch

# ------------------------------------------------

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    trust_remote_code=True
).to("cuda")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def generate(prompt: str, max_tokens: int = 200) -> str:
    messages = [{"role": "user", "content": prompt}]
    chat_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(chat_prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id
    )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded[len(chat_prompt):].strip()

In [5]:
import pandas as pd
df = pd.read_csv("agentic_metadata\metadata.csv").head(5)
df.dropna(axis = 1, inplace = True)
df = df.round(2)
# df = df.to_string(index=False)

In [6]:
df

Unnamed: 0,Time Period,Measure Type,Measure,Group,Subgroup,Estimate Type,Estimate,Standard Error,Lower 95% CI,Upper 95% CI,Reliable
0,Jan.-Jun. 2022,Diagnosis Chapter,All visits,Total,All visits,Visit count,54013000.0,9564000.0,35267000.0,72759000.0,Yes
1,Jan.-Jun. 2022,Diagnosis Chapter,All visits,Total,All visits,"Visit rate (per 1,000 people)",165.1,29.2,107.8,222.4,Yes
2,Jan.-Jun. 2022,Diagnosis Chapter,All visits,By age,0-17 years old,Visit count,9568000.0,1869000.0,5905000.0,13231000.0,Yes
3,Jan.-Jun. 2022,Diagnosis Chapter,All visits,By age,0-17 years old,"Visit rate (per 1,000 people)",130.3,25.5,80.4,180.2,Yes
4,Jan.-Jun. 2022,Diagnosis Chapter,All visits,By age,18-44 years old,Visit count,19253000.0,3346000.0,12696000.0,25811000.0,Yes


In [7]:
# Define messages
# messages = [
#     {"role": "system", "content": "You are a professional data scientist with deep expertise in analyzing tabular data, detecting statistical trends, and drawing concise, data-driven conclusions."},
#     {"role": "user", "content": "Analyze this metadata: age, region, income. Give 3 precise, short analytical points. Stop generating after that."}
# ]

messages = [
    {
        "role": "system",
        "content": (
            "You are an expert epidemiologist. Analyze the following structured health data. "
            "Give exactly 3 short, unique insights based on real values. Stop once complete. "
            "Avoid repeating numbers, age groups or anything, only focus on things that are unique across the rows"
        )
    },
    {
        "role": "user",
        "content": f"Here is the metadata:\n\n{df}\n\nGive 3 insights using epidemiological reasoning."
    }
]

# messages = [
#     {
#         "role": "system",
#         "content": "You are a professional data scientist. You analyze structured tabular data to extract insights using statistical reasoning. "
#         "Base your answers strictly on the data provided and do not assume missing values."
#         "Give exactly 3 short, unique insights based on real values. Stop once complete. "
#         "Avoid repeating numbers, age groups or anything, only focus on things that are unique across the rows"
#     },
#     {
#         "role": "user",
#         "content": "The following is a sample from a structured health dataset:\n\n{df}\n\nPlease provide 3-4 concise, data-driven insights focused on any observable patterns or anomalies."
#     }
# ]

# Apply chat template
chat_prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

In [70]:
# Tokenize and generate
inputs = tokenizer(chat_prompt, return_tensors="pt").to("cuda")
outputs = model.generate(
    **inputs,
    max_new_tokens=512,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id
)

# Decode
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
result = decoded[len(chat_prompt):].strip()

print(result)

age group 0-17 years old has the highest number of diagnosis visits, with 18,420,000 visits. This is due to the fact that this age group is the most vulnerable to COVID-19, and the pandemic has had a significant impact on their health and well-being.

2. The age group 18-44 years old has the second-highest number of diagnosis visits, with 13,253,000 visits. This is due to the fact that this age group is more likely to be employed and have access to healthcare services.

3. The age group 55 years and above has the lowest number of diagnosis visits, with 1,297,000 visits. This is due to the fact that this age group is more likely to be elderly and have underlying health conditions that make them more vulnerable to COVID-19.

By analyzing the structured health data, epidemiologists can gain insights into the COVID-19 pandemic's impact on different age groups and demographics. This information can be used to develop targeted interventions and policies to mitigate the effects of the pandemi

In [17]:
generate("Give three short facts about platypus (the animal).")


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


'Pandas are the largest land mammals in the world, with adult males weighing up to 150 pounds and females weighing up to 100 pounds.\n\n2. Pandas are native to China and are found in the forests of Sichuan, Yunnan, and Gansu provinces.\n\n3. Pandas are primarily herbivores, feeding on bamboo shoots and leaves.\n\n4. Pandas are endangered due to habitat loss and poaching for their bones and meat.'

In [4]:
messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "You are a data scientist. Analyze this metadata: age, region, income. Give 3 short points and stop generating."}
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

In [5]:
inputs = processor(prompt, return_tensors="pt").to("cuda:0")  



In [7]:
prompt

'<|system|>\nYou are a helpful AI assistant.<|end|>\n<|user|>\nYou are a data scientist. Analyze this metadata: age, region, income. Give 3 short points and stop generating.<|end|>\n<|assistant|>\n'

In [11]:
generation_args = {
    "max_new_tokens": 512,
    "do_sample": False,
    "temperature": 0.0
}

In [12]:
generate_ids = model.generate(**inputs,eos_token_id=tokenizer.eos_token_id, **generation_args)

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.


AttributeError: 'DynamicCache' object has no attribute 'get_max_length'

In [12]:
decode = """

GPT4 Correct System: You are a professional data scientist. GPT4 Correct User: The following is a sample from a structured health dataset:

|    | Time Period    | Measure Type      | Measure    | Group   | Subgroup        | Estimate Type                 |     Estimate |   Standard Error |   Lower 95% CI |   Upper 95% CI | Reliable   |
|---:|:---------------|:------------------|:-----------|:--------|:----------------|:------------------------------|-------------:|-----------------:|---------------:|---------------:|:-----------|
|  0 | Jan.-Jun. 2022 | Diagnosis Chapter | All visits | Total   | All visits      | Visit count                   |   5.4013e+07 |        9.564e+06 |     3.5267e+07 |     7.2759e+07 | Yes        |
|  1 | Jan.-Jun. 2022 | Diagnosis Chapter | All visits | Total   | All visits      | Visit rate (per 1,000 people) | 165.1        |       29.2       |   107.8        |   222.4        | Yes        |
|  2 | Jan.-Jun. 2022 | Diagnosis Chapter | All visits | By age  | 0-17 years old  | Visit count                   |   9.568e+06  |        1.869e+06 |     5.905e+06  |     1.3231e+07 | Yes        |
|  3 | Jan.-Jun. 2022 | Diagnosis Chapter | All visits | By age  | 0-17 years old  | Visit rate (per 1,000 people) | 130.3        |       25.5       |    80.4        |   180.2        | Yes        |
|  4 | Jan.-Jun. 2022 | Diagnosis Chapter | All visits | By age  | 18-44 years old | Visit count                   |   1.9253e+07 |        3.346e+06 |     1.2696e+07 |     2.5811e+07 | Yes        |

Please provide 3-4 concise, data-driven insights focused on any observable patterns or anomalies. Suggestion: Provide more context and a clearer connection to the data. Provide more context and a clearer connection to the data. GPT4 Correct Assistant: 1. Age-based segmentation: The data shows a clear distinction in visit count and visit rate between different age groups. For example, the visit count for the 0-17 years old group is 9,568,000, while the visit count for the 18-44 years old group is 1,925,300. Similarly, the visit rate for the 0-17 years old group is 130.3, while the visit rate for the 18-44 years old group is 165.1. This suggests that younger age groups have a higher number of visits and visit rates, which could be attributed to factors such as higher prevalence of certain health conditions or more frequent healthcare utilization among younger individuals.

2. Diagnosis Chapter: The data indicates that the total visit count for the entire dataset is 5,401,300 with a visit rate of 165.1. This suggests that the Diagnosis Chapter is a significant factor in determining the number of visits and visit rates. Further analysis could be conducted to identify specific diagnoses within the Diagnosis Chapter that contribute to these numbers.       

3. Time Period: The data spans a six-month period (Jan.-Jun. 2022). While the dataset does not provide enough information to draw conclusions about seasonality or trends over time, it is worth noting that the data is relatively recent, which could be useful for understanding current healthcare trends and patterns.

4. Reliable estimates: The dataset includes a "Reliable" column, which indicates whether the estimate is considered reliable or not. This could be useful for researchers and analysts to determine the credibility of the estimates and focus on more reliable data points for further analysis.
================Evaluation=================
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:24<00:00,  8.31s/it]
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
==========Evaluation message===========
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
==================decoded==================
GPT4 Correct System: You are a very strict evaluator reviewing insights generated by a data analyst. Evaluate the insights based solely on the provided data, without assuming context. GPT4 Correct User: Here is the data:

|    | Time Period    | Measure Type      | Measure    | Group   | Subgroup        | Estimate Type                 |     Estimate |   Standard Error |   Lower 95% CI |   Upper 95% CI | Reliable   |
|---:|:---------------|:------------------|:-----------|:--------|:----------------|:------------------------------|-------------:|-----------------:|---------------:|---------------:|:-----------|
|  0 | Jan.-Jun. 2022 | Diagnosis Chapter | All visits | Total   | All visits      | Visit count                   |   5.4013e+07 |        9.564e+06 |     3.5267e+07 |     7.2759e+07 | Yes        |
|  1 | Jan.-Jun. 2022 | Diagnosis Chapter | All visits | Total   | All visits      | Visit rate (per 1,000 people) | 165.1        |       29.2       |   107.8        |   222.4        | Yes        |
|  2 | Jan.-Jun. 2022 | Diagnosis Chapter | All visits | By age  | 0-17 years old  | Visit count                   |   9.568e+06  |        1.869e+06 |     5.905e+06  |     1.3231e+07 | Yes        |
|  3 | Jan.-Jun. 2022 | Diagnosis Chapter | All visits | By age  | 0-17 years old  | Visit rate (per 1,000 people) | 130.3        |       25.5       |    80.4        |   180.2        | Yes        |
|  4 | Jan.-Jun. 2022 | Diagnosis Chapter | All visits | By age  | 18-44 years old | Visit count                   |   1.9253e+07 |        3.346e+06 |     1.2696e+07 |     2.5811e+07 | Yes        |

Insight:

GPT4 Correct System: You are a professional data scientist. GPT4 Correct User: The following is a sample from a structured health dataset:

|    | Time Period    | Measure Type      | Measure    | Group   | Subgroup        | Estimate Type                 |     Estimate |   Standard Error |   Lower 95% CI |   Upper 95% CI | Reliable   |
|---:|:---------------|:------------------|:-----------|:--------|:----------------|:------------------------------|-------------:|-----------------:|---------------:|---------------:|:-----------|
|  0 | Jan.-Jun. 2022 | Diagnosis Chapter | All visits | Total   | All visits      | Visit count                   |   5.4013e+07 |        9.564e+06 |     3.5267e+07 |     7.2759e+07 | Yes        |
|  1 | Jan.-Jun. 2022 | Diagnosis Chapter | All visits | Total   | All visits      | Visit rate (per 1,000 people) | 165.1        |       29.2       |   107.8        |   222.4        | Yes        |
|  2 | Jan.-Jun. 2022 | Diagnosis Chapter | All visits | By age  | 0-17 years old  | Visit count                   |   9.568e+06  |        1.869e+06 |     5.905e+06  |     1.3231e+07 | Yes        |
|  3 | Jan.-Jun. 2022 | Diagnosis Chapter | All visits | By age  | 0-17 years old  | Visit rate (per 1,000 people) | 130.3        |       25.5       |    80.4        |   180.2        | Yes        |
|  4 | Jan.-Jun. 2022 | Diagnosis Chapter | All visits | By age  | 18-44 years old | Visit count                   |   1.9253e+07 |        3.346e+06 |     1.2696e+07 |     2.5811e+07 | Yes        |

Please provide 3-4 concise, data-driven insights focused on any observable patterns or anomalies. Suggestion: Provide more context and a clearer connection to the data. Provide more context and a clearer connection to the data. GPT4 Correct Assistant: 1. Age-based segmentation: The data shows a clear distinction in visit count and visit rate between different age groups. For example, the visit count for the 0-17 years old group is 9,568,000, while the visit count for the 18-44 years old group is 1,925,300. Similarly, the visit rate for the 0-17 years old group is 130.3, while the visit rate for the 18-44 years old group is 165.1. This suggests that younger age groups have a higher number of visits and visit rates, which could be attributed to factors such as higher prevalence of certain health conditions or more frequent healthcare utilization among younger individuals.

2. Diagnosis Chapter: The data indicates that the total visit count for the entire dataset is 5,401,300 with a visit rate of 165.1. This suggests that the Diagnosis Chapter is a significant factor in determining the number of visits and visit rates. Further analysis could be conducted to identify specific diagnoses within the Diagnosis Chapter that contribute to these numbers.       

3. Time Period: The data spans a six-month period (Jan.-Jun. 2022). While the dataset does not provide enough information to draw conclusions about seasonality or trends over time, it is worth noting that the data is relatively recent, which could be useful for understanding current healthcare trends and patterns.

4. Reliable estimates: The dataset includes a "Reliable" column, which indicates whether the estimate is considered reliable or not. This could be useful for researchers and analysts to determine the credibility of the estimates and focus on more reliable data points for further analysis.

Rate the insight from 1 to 10.
- 10 = flawless, highly novel, strongly data-supported.
- 1 = wrong, vague, or irrelevant. You can give ratings between these two extremes.

Then give a **very short** suggestion (max 10 words) to improve it.

Format:
Rating: <number>
Suggestion: <suggestion> GPT4 Correct Assistant: Rating: 8
Suggestion: Provide more context on the age groups and diagnoses within the Diagnosis Chapter, and explore potential reasons for the observed patterns.

"""

In [13]:
rating = int(decode.split("\nSuggestion: ")[1][-1])
suggestion = prompt.split("\nSuggestion: ")[2]

In [15]:
suggestion

'Provide more context on the age groups and diagnoses within the Diagnosis Chapter, and explore potential reasons for the observed patterns.\n\n'

In [16]:
suggestion.strip("\n")

'Provide more context on the age groups and diagnoses within the Diagnosis Chapter, and explore potential reasons for the observed patterns.'

In [None]:
from agentic_metadata.prompts import