In [2]:
import pandas as pd

In [3]:
# Load the CARDIOTHORACIC service notes from CSV
df = pd.read_csv('./cardiothoracic_texts.csv')

In [4]:
# Draw 150 rows at random; the fixed random_state makes the draw repeatable.
# NOTE(review): this rebinds `df`, so re-running the cell re-samples the sample.
df = df.sample(n=150, random_state=42)

In [5]:
# Preview the first rows of the sampled frame
df.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,...,category_3,category_4,category_5,section_name,category_1_name,category_2_name,category_3_name,category_4_name,chapter_name,super_section_name
294,10224362,25595893,2157-04-12 00:00:00,2157-04-14 14:49:00,,SURGICAL SAME DAY ADMISSION,P16QV8,PHYSICIAN REFERRAL,HOME,Medicare,...,R91.1,R91.1,R91.1,Abnormal findings on diagnostic imaging of lung,Solitary pulmonary nodule,Solitary pulmonary nodule,Solitary pulmonary nodule,Solitary pulmonary nodule,"Symptoms, signs and abnormal clinical and labo...",Abnormal findings on diagnostic imaging and in...
545,10417511,28293295,2147-11-02 16:02:00,2147-11-07 13:00:00,,URGENT,P33O7Z,TRANSFER FROM HOSPITAL,HOME HEALTH CARE,Private,...,I25.10,I25.10,I25.10,Chronic ischemic heart disease,Atherosclerotic heart disease of native corona...,Atherosclerotic heart disease of native corona...,Atherosclerotic heart disease of native corona...,Atherosclerotic heart disease of native corona...,Diseases of the circulatory system,Ischemic heart diseases
213,10163774,25837438,2127-10-11 17:00:00,2127-10-13 10:10:00,,OBSERVATION ADMIT,P95BQY,EMERGENCY ROOM,HOME,Medicare,...,R50.9,R50.9,R50.9,Fever of other and unknown origin,"Fever, unspecified","Fever, unspecified","Fever, unspecified","Fever, unspecified","Symptoms, signs and abnormal clinical and labo...",General symptoms and signs
328,10248673,28164505,2177-06-18 18:20:00,2177-06-25 16:00:00,,EW EMER.,P6512E,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,...,I25.10,I25.10,I25.10,Chronic ischemic heart disease,Atherosclerotic heart disease of native corona...,Atherosclerotic heart disease of native corona...,Atherosclerotic heart disease of native corona...,Atherosclerotic heart disease of native corona...,Diseases of the circulatory system,Ischemic heart diseases
164,10125966,29680994,2191-11-01 13:30:00,2191-11-05 12:00:00,,SURGICAL SAME DAY ADMISSION,P15314,PHYSICIAN REFERRAL,HOME HEALTH CARE,Medicare,...,C34.30,C34.30,C34.30,Malignant neoplasm of bronchus and lung,"Malignant neoplasm of lower lobe, bronchus or ...","Malignant neoplasm of lower lobe, unspecified ...","Malignant neoplasm of lower lobe, unspecified ...","Malignant neoplasm of lower lobe, unspecified ...",Neoplasms,Malignant neoplasms of respiratory and intrath...


# Modelo gemma-2-2B

In [6]:
import transformers
import torch

# Hugging Face token added from the terminal

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Check whether a GPU is available (disabled: the model-loading cells pass
# device_map instead of using an explicit device variable)
#device = "cuda" if torch.cuda.is_available() else "cpu"

# Select the 'high' float32 matmul precision mode in PyTorch
torch.set_float32_matmul_precision('high')

In [8]:
# Collect the 'text' column of the sampled rows into a plain Python list;
# each element is later interpolated into a prompt as `context`.
contexts = df['text'].tolist()

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint to load and the dtype requested for its weights
model_id = "google/gemma-2-2b-it"
dtype = torch.bfloat16

# Tokenizer and causal-LM weights are loaded from the same checkpoint id;
# device_map="auto" leaves device placement to the library.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=dtype,)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.24it/s]


## Preguntas sin rol a temperatura por defecto (1.0)

In [9]:
def pregunta_sin_rol(context, df, max_new_tokens=150, **generate_kwargs):
  """Ask the loaded model for a diagnostic summary of one note (no persona).

  Args:
    context: Clinical note text interpolated into the user prompt.
    df: DataFrame with 'input' and 'output' columns accumulating results.
    max_new_tokens: Upper bound on generated tokens (150, as before).
    **generate_kwargs: Extra keyword arguments forwarded verbatim to
      ``model.generate`` (e.g. ``do_sample=True, temperature=0.5``). With
      none given, the call is the same as before and decoding is governed
      by the model's own generation config.

  Returns:
    A new DataFrame: ``df`` plus one row holding the rendered prompt and
    the decoded continuation.
  """
  chat = [
    { "role": "user", "content": f"Based on {context}. Give me a summary of the diagnostic for the patient" },
  ]

  prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

  # The rendered prompt is encoded as-is; no special tokens are added again.
  inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
  outputs = model.generate(
    input_ids=inputs.to(model.device),
    max_new_tokens=max_new_tokens,
    **generate_kwargs,
  )

  # The returned sequence starts with the prompt ids; keep only what follows.
  # (The original also decoded the full sequence first and threw it away.)
  generated_tokens = outputs[0][inputs.shape[-1]:]
  decoded_output = tokenizer.decode(generated_tokens, skip_special_tokens=True)

  # Append the input/output pair to the results table
  new_row = {'input': prompt, 'output': decoded_output}
  return pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)


In [6]:
# Collect the 'text' column of the sampled rows into a plain Python list.
# NOTE(review): same statement as an earlier cell; the execution counts
# suggest it was re-run after a kernel restart — confirm and deduplicate.
contexts = df['text'].tolist()

In [21]:
# Empty results table: one row per context with the prompt and the answer
df_gemma_sin_rol_1 = pd.DataFrame(columns=['input', 'output'])

In [None]:
import os

out_path = './analysis/gemma_sin_rol_1.csv'
# Create the output folder up front so the first checkpoint cannot fail
# after ten generations have already been computed.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Get the model's answer for every context, checkpointing every 10 rows
for context in contexts:
    df_gemma_sin_rol_1 = pregunta_sin_rol(context, df_gemma_sin_rol_1)
    print(f"Procesadas {len(df_gemma_sin_rol_1)} respuestas")
    if len(df_gemma_sin_rol_1) % 10 == 0:
        df_gemma_sin_rol_1.to_csv(out_path)
        print(f"Guardadas {len(df_gemma_sin_rol_1)} respuestas")

# Persist a trailing partial batch (row count not a multiple of 10)
if len(df_gemma_sin_rol_1) % 10 != 0:
    df_gemma_sin_rol_1.to_csv(out_path)

In [23]:
# Peek at the first generated answers
df_gemma_sin_rol_1.head()

Unnamed: 0,input,output
0,<bos><start_of_turn>user\nBased on \nName: _...,## Summary of Diagnostic Findings:\n\nThis pat...
1,<bos><start_of_turn>user\nBased on \nName: _...,## Summary of Diagnostic Findings for Mr. ___\...
2,<bos><start_of_turn>user\nBased on \nName: _...,This patient is a 68-year-old female with a hi...
3,<bos><start_of_turn>user\nBased on \nName: _...,This patient is a 70-year-old male with a hist...
4,<bos><start_of_turn>user\nBased on \nName: _...,## Summary of Diagnostic Findings:\n\nThis pat...


In [None]:
# Write the results table to the same CSV path the loop checkpoints to
df_gemma_sin_rol_1.to_csv('./analysis/gemma_sin_rol_1.csv')

In [26]:
# Sanity check: (rows, columns) of the results table
df_gemma_sin_rol_1.shape

(150, 2)

## Preguntas con rol a temperatura por defecto (1.0)

In [12]:
def pregunta_con_rol(context, df, max_new_tokens=150, **generate_kwargs):
  """Ask the loaded model for a diagnostic summary, with a doctor persona.

  The original sent the persona as a second message with role "expert"
  placed after the user request. "expert" is not a chat role, so the
  persona is now prepended to the single user turn.
  NOTE(review): Gemma's prompt format is documented as having user/model
  turns only (no system role) — confirm for the tokenizer version in use.

  Args:
    context: Clinical note text interpolated into the user prompt.
    df: DataFrame with 'input' and 'output' columns accumulating results.
    max_new_tokens: Upper bound on generated tokens (150, as before).
    **generate_kwargs: Extra keyword arguments forwarded verbatim to
      ``model.generate`` (e.g. ``do_sample=True, temperature=0.5``).

  Returns:
    A new DataFrame: ``df`` plus one row holding the rendered prompt and
    the decoded continuation.
  """
  persona = "You are a professional medical doctor. You provide clear and concise summaries based on patient's clinical information"
  question = f"Based on {context}. Give me a summary of the diagnostic for the patient"

  chat = [
    { "role": "user", "content": f"{persona}\n\n{question}" },
  ]

  prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

  # The rendered prompt is encoded as-is; no special tokens are added again.
  inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
  outputs = model.generate(
    input_ids=inputs.to(model.device),
    max_new_tokens=max_new_tokens,
    **generate_kwargs,
  )

  # The returned sequence starts with the prompt ids; keep only what follows.
  generated_tokens = outputs[0][inputs.shape[-1]:]
  decoded_output = tokenizer.decode(generated_tokens, skip_special_tokens=True)

  # Append the input/output pair to the results table
  new_row = {'input': prompt, 'output': decoded_output}
  return pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)


In [None]:
import os

df_gemma_con_rol_1 = pd.DataFrame(columns=['input', 'output'])
out_path = './analysis/gemma_con_rol_1.csv'
# Create the output folder up front so the first checkpoint cannot fail
# after ten generations have already been computed.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Get the model's answer for every context, checkpointing every 10 rows
for context in contexts:
    df_gemma_con_rol_1 = pregunta_con_rol(context, df_gemma_con_rol_1)
    print(f"Procesadas {len(df_gemma_con_rol_1)} respuestas")
    if len(df_gemma_con_rol_1) % 10 == 0:
        df_gemma_con_rol_1.to_csv(out_path)
        print(f"Guardadas {len(df_gemma_con_rol_1)} respuestas")

# Persist a trailing partial batch (row count not a multiple of 10)
if len(df_gemma_con_rol_1) % 10 != 0:
    df_gemma_con_rol_1.to_csv(out_path)

## Preguntas sin rol a temperatura 0.5

In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch

model_id = "google/gemma-2-2b-it"
dtype = torch.bfloat16

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=dtype,
)

# Sampling settings are set on the generation config, which generate() reads
# when the caller passes no explicit arguments. The original passed them to
# from_pretrained, where whether they reach generate() depends on the
# installed transformers version.
model.generation_config.do_sample = True
model.generation_config.temperature = 0.5


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.15it/s]


In [None]:
import os

df_gemma_sin_rol_0_5 = pd.DataFrame(columns=['input', 'output'])
out_path = './analysis/gemma_sin_rol_0_5.csv'
# Create the output folder up front so the first checkpoint cannot fail
# after ten generations have already been computed.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Get the model's answer for every context, checkpointing every 10 rows
for context in contexts:
    df_gemma_sin_rol_0_5 = pregunta_sin_rol(context, df_gemma_sin_rol_0_5)
    print(f"Procesadas {len(df_gemma_sin_rol_0_5)} respuestas")
    if len(df_gemma_sin_rol_0_5) % 10 == 0:
        df_gemma_sin_rol_0_5.to_csv(out_path)
        print(f"Guardadas {len(df_gemma_sin_rol_0_5)} respuestas")

# Persist a trailing partial batch (row count not a multiple of 10)
if len(df_gemma_sin_rol_0_5) % 10 != 0:
    df_gemma_sin_rol_0_5.to_csv(out_path)

In [14]:
# Sanity check: (rows, columns) of the results table
df_gemma_sin_rol_0_5.shape

(150, 2)

## Preguntas con rol a temperatura 0.5

In [None]:
import os

df_gemma_con_rol_0_5 = pd.DataFrame(columns=['input', 'output'])
out_path = './analysis/gemma_con_rol_0_5.csv'
# Create the output folder up front so the first checkpoint cannot fail
# after ten generations have already been computed.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Get the model's answer for every context, checkpointing every 10 rows
for context in contexts:
    df_gemma_con_rol_0_5 = pregunta_con_rol(context, df_gemma_con_rol_0_5)
    print(f"Procesadas {len(df_gemma_con_rol_0_5)} respuestas")
    if len(df_gemma_con_rol_0_5) % 10 == 0:
        df_gemma_con_rol_0_5.to_csv(out_path)
        print(f"Guardadas {len(df_gemma_con_rol_0_5)} respuestas")

# Persist a trailing partial batch (row count not a multiple of 10)
if len(df_gemma_con_rol_0_5) % 10 != 0:
    df_gemma_con_rol_0_5.to_csv(out_path)

## Preguntas sin rol a temperatura 0.1

In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch

model_id = "google/gemma-2-2b-it"
dtype = torch.bfloat16

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=dtype,
)

# Sampling settings are set on the generation config, which generate() reads
# when the caller passes no explicit arguments. The original passed them to
# from_pretrained, where whether they reach generate() depends on the
# installed transformers version.
model.generation_config.do_sample = True
model.generation_config.temperature = 0.1


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.19it/s]


In [None]:
import os

df_gemma_sin_rol_0_1 = pd.DataFrame(columns=['input', 'output'])
out_path = './analysis/gemma_sin_rol_0_1.csv'
# Create the output folder up front so the first checkpoint cannot fail
# after ten generations have already been computed.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Get the model's answer for every context, checkpointing every 10 rows
for context in contexts:
    df_gemma_sin_rol_0_1 = pregunta_sin_rol(context, df_gemma_sin_rol_0_1)
    print(f"Procesadas {len(df_gemma_sin_rol_0_1)} respuestas")
    if len(df_gemma_sin_rol_0_1) % 10 == 0:
        df_gemma_sin_rol_0_1.to_csv(out_path)
        print(f"Guardadas {len(df_gemma_sin_rol_0_1)} respuestas")

# Persist a trailing partial batch (row count not a multiple of 10)
if len(df_gemma_sin_rol_0_1) % 10 != 0:
    df_gemma_sin_rol_0_1.to_csv(out_path)

## Preguntas con rol a temperatura 0.1

In [None]:
import os

df_gemma_con_rol_0_1 = pd.DataFrame(columns=['input', 'output'])
out_path = './analysis/gemma_con_rol_0_1.csv'
# Create the output folder up front so the first checkpoint cannot fail
# after ten generations have already been computed.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Get the model's answer for every context, checkpointing every 10 rows
for context in contexts:
    df_gemma_con_rol_0_1 = pregunta_con_rol(context, df_gemma_con_rol_0_1)
    print(f"Procesadas {len(df_gemma_con_rol_0_1)} respuestas")
    if len(df_gemma_con_rol_0_1) % 10 == 0:
        df_gemma_con_rol_0_1.to_csv(out_path)
        print(f"Guardadas {len(df_gemma_con_rol_0_1)} respuestas")

# Persist a trailing partial batch (row count not a multiple of 10)
if len(df_gemma_con_rol_0_1) % 10 != 0:
    df_gemma_con_rol_0_1.to_csv(out_path)

# Modelo Llama-3.2-1B-Instruct

In [8]:
from transformers import pipeline
import torch

# Build the text-generation pipeline together with its sampling parameters.
# NOTE(review): do_sample/temperature are given to the pipeline constructor;
# presumably they are forwarded to generation on every call — confirm.
pipe = pipeline(
    "text-generation",
    model="meta-llama/Llama-3.2-1B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    do_sample=True,
    temperature=1,
    #top_p=0.9,
)

Device set to use cuda:0


## Preguntas sin rol a temperatura 1.0

In [7]:
def pregunta_sin_rol (context, df):
  """Ask the pipeline for a diagnostic summary of one note (no persona).

  Args:
    context: Clinical note text interpolated into the user prompt.
    df: DataFrame with 'input' and 'output' columns accumulating results.

  Returns:
    A new DataFrame: ``df`` plus one row with the user prompt and the reply.
  """
  user_turn = {
      "role": "user",
      "content": f"Based on {context}. Give me a summary of the diagnostic for the patient",
  }

  # Generation is capped at 150 new tokens
  result = pipe([user_turn], max_new_tokens=150)

  # 'generated_text' is indexed as a list of messages; its last entry is
  # taken as the model's reply.
  answer = result[0]['generated_text'][-1]['content']

  row = pd.DataFrame([{'input': user_turn["content"], 'output': answer}])
  return pd.concat([df, row], ignore_index=True)

In [None]:
import os

df_llama_sin_rol_1 = pd.DataFrame(columns=['input', 'output'])
out_path = './analysis/llama_sin_rol_1.csv'
# Create the output folder up front so the first checkpoint cannot fail
# after ten generations have already been computed.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Get the model's answer for every context, checkpointing every 10 rows
for context in contexts:
    df_llama_sin_rol_1 = pregunta_sin_rol(context, df_llama_sin_rol_1)
    print(f"Procesadas {len(df_llama_sin_rol_1)} respuestas")
    if len(df_llama_sin_rol_1) % 10 == 0:
        df_llama_sin_rol_1.to_csv(out_path)
        print(f"Guardadas {len(df_llama_sin_rol_1)} respuestas")

# Persist a trailing partial batch (row count not a multiple of 10)
if len(df_llama_sin_rol_1) % 10 != 0:
    df_llama_sin_rol_1.to_csv(out_path)

## Preguntas con rol a temperatura 1.0

In [9]:
def pregunta_con_rol (context, df):
  """Ask the pipeline for a diagnostic summary, with a doctor persona.

  The original sent the persona as a trailing message with role "expert",
  which is not a chat role. It is now a leading "system" message so the
  persona conditions the answer to the user request.

  Args:
    context: Clinical note text interpolated into the user prompt.
    df: DataFrame with 'input' and 'output' columns accumulating results.

  Returns:
    A new DataFrame: ``df`` plus one row with the user prompt and the reply.
  """
  system_turn = { "role": "system", "content": "You are a professional medical doctor. You provide clear and concise summaries based on patient's clinical information"}
  user_turn = { "role": "user", "content": f"Based on {context}. Give me a summary of the diagnostic for the patient" }
  messages = [system_turn, user_turn]

  # Generation is capped at 150 new tokens
  outputs = pipe(
      messages,
      max_new_tokens=150,
  )

  # 'generated_text' is indexed as a list of messages; its last entry is
  # taken as the model's reply.
  response = outputs[0]['generated_text'][-1]['content']

  # Keep storing the user prompt (not the system text) as the input
  new_row = {'input': user_turn["content"], 'output': response}
  return pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

In [None]:
import os

df_llama_con_rol_1 = pd.DataFrame(columns=['input', 'output'])
out_path = './analysis/llama_con_rol_1.csv'
# Create the output folder up front so the first checkpoint cannot fail
# after ten generations have already been computed.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Get the model's answer for every context, checkpointing every 10 rows
for context in contexts:
    df_llama_con_rol_1 = pregunta_con_rol(context, df_llama_con_rol_1)
    print(f"Procesadas {len(df_llama_con_rol_1)} respuestas")
    if len(df_llama_con_rol_1) % 10 == 0:
        df_llama_con_rol_1.to_csv(out_path)
        print(f"Guardadas {len(df_llama_con_rol_1)} respuestas")

# Persist a trailing partial batch (row count not a multiple of 10)
if len(df_llama_con_rol_1) % 10 != 0:
    df_llama_con_rol_1.to_csv(out_path)

## Preguntas sin rol a temperatura 0.5

In [9]:
from transformers import pipeline
import torch

# Build the text-generation pipeline together with its sampling parameters.
# NOTE(review): do_sample/temperature are given to the pipeline constructor;
# presumably they are forwarded to generation on every call — confirm.
pipe = pipeline(
    "text-generation",
    model="meta-llama/Llama-3.2-1B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    do_sample=True,
    temperature=0.5,
    #top_p=0.9,
)

Device set to use cuda:0


In [None]:
import os

df_llama_sin_rol_0_5 = pd.DataFrame(columns=['input', 'output'])
out_path = './analysis/llama_sin_rol_0_5.csv'
# Create the output folder up front so the first checkpoint cannot fail
# after ten generations have already been computed.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Get the model's answer for every context, checkpointing every 10 rows
for context in contexts:
    df_llama_sin_rol_0_5 = pregunta_sin_rol(context, df_llama_sin_rol_0_5)
    print(f"Procesadas {len(df_llama_sin_rol_0_5)} respuestas")
    if len(df_llama_sin_rol_0_5) % 10 == 0:
        df_llama_sin_rol_0_5.to_csv(out_path)
        print(f"Guardadas {len(df_llama_sin_rol_0_5)} respuestas")

# Persist a trailing partial batch (row count not a multiple of 10)
if len(df_llama_sin_rol_0_5) % 10 != 0:
    df_llama_sin_rol_0_5.to_csv(out_path)

## Preguntas con rol a temperatura 0.5

In [None]:
import os

df_llama_con_rol_0_5 = pd.DataFrame(columns=['input', 'output'])
out_path = './analysis/llama_con_rol_0_5.csv'
# Create the output folder up front so the first checkpoint cannot fail
# after ten generations have already been computed.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Get the model's answer for every context, checkpointing every 10 rows
for context in contexts:
    df_llama_con_rol_0_5 = pregunta_con_rol(context, df_llama_con_rol_0_5)
    print(f"Procesadas {len(df_llama_con_rol_0_5)} respuestas")
    if len(df_llama_con_rol_0_5) % 10 == 0:
        df_llama_con_rol_0_5.to_csv(out_path)
        print(f"Guardadas {len(df_llama_con_rol_0_5)} respuestas")

# Persist a trailing partial batch (row count not a multiple of 10)
if len(df_llama_con_rol_0_5) % 10 != 0:
    df_llama_con_rol_0_5.to_csv(out_path)

## Preguntas sin rol a temperatura 0.1

In [12]:
from transformers import pipeline
import torch

# Build the text-generation pipeline together with its sampling parameters.
# NOTE(review): do_sample/temperature are given to the pipeline constructor;
# presumably they are forwarded to generation on every call — confirm.
pipe = pipeline(
    "text-generation",
    model="meta-llama/Llama-3.2-1B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    do_sample=True,
    temperature=0.1,
    #top_p=0.9,
)

Device set to use cuda:0


In [None]:
import os

df_llama_sin_rol_0_1 = pd.DataFrame(columns=['input', 'output'])
out_path = './analysis/llama_sin_rol_0_1.csv'
# Create the output folder up front so the first checkpoint cannot fail
# after ten generations have already been computed.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Get the model's answer for every context, checkpointing every 10 rows
for context in contexts:
    df_llama_sin_rol_0_1 = pregunta_sin_rol(context, df_llama_sin_rol_0_1)
    print(f"Procesadas {len(df_llama_sin_rol_0_1)} respuestas")
    if len(df_llama_sin_rol_0_1) % 10 == 0:
        df_llama_sin_rol_0_1.to_csv(out_path)
        print(f"Guardadas {len(df_llama_sin_rol_0_1)} respuestas")

# Persist a trailing partial batch (row count not a multiple of 10)
if len(df_llama_sin_rol_0_1) % 10 != 0:
    df_llama_sin_rol_0_1.to_csv(out_path)

## Preguntas con rol a temperatura 0.1

In [None]:
import os

df_llama_con_rol_0_1 = pd.DataFrame(columns=['input', 'output'])
out_path = './analysis/llama_con_rol_0_1.csv'
# Create the output folder up front so the first checkpoint cannot fail
# after ten generations have already been computed.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Get the model's answer for every context, checkpointing every 10 rows
for context in contexts:
    df_llama_con_rol_0_1 = pregunta_con_rol(context, df_llama_con_rol_0_1)
    print(f"Procesadas {len(df_llama_con_rol_0_1)} respuestas")
    if len(df_llama_con_rol_0_1) % 10 == 0:
        df_llama_con_rol_0_1.to_csv(out_path)
        print(f"Guardadas {len(df_llama_con_rol_0_1)} respuestas")

# Persist a trailing partial batch (row count not a multiple of 10)
if len(df_llama_con_rol_0_1) % 10 != 0:
    df_llama_con_rol_0_1.to_csv(out_path)

# Modelo Qwen3-4B

In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint id shared by the tokenizer and the model
model_name = "Qwen/Qwen3-4B"

# Load the tokenizer and the model; the dtype is left to the library
# ("auto") and the weights are placed on the CUDA device.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="cuda"
)

Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.07it/s]


## Preguntas sin rol a temperatura 1.0

In [13]:
def pregunta_sin_rol(context, df, temperature=1.0, max_new_tokens=32768):
  """Ask the loaded model (thinking disabled) for a diagnostic summary.

  Args:
    context: Clinical note text interpolated into the user prompt.
    df: DataFrame with 'input' and 'output' columns accumulating results.
    temperature: Sampling temperature handed to ``model.generate``.
    max_new_tokens: Generation budget (default unchanged from the original).

  Returns:
    A new DataFrame: ``df`` plus one row with the user prompt and the answer.
  """
  think_end_id = 151668  # token id of </think>

  messages = [
    { "role": "user", "content": f"Based on {context}. Give me a summary of the diagnostic for the patient" },
  ]

  # Only template options belong here. The original also passed temperature
  # to apply_chat_template, where it is at most a template variable and
  # does not influence sampling.
  text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,  # switch between thinking and non-thinking modes
  )
  model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

  # Sampling parameters take effect only when handed to generate()
  generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=max_new_tokens,
    do_sample=True,
    temperature=temperature,
  )
  # Drop the prompt ids; keep the continuation as a plain list
  output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

  # Skip everything up to and including the last </think>, if present
  try:
    index = len(output_ids) - output_ids[::-1].index(think_end_id)
  except ValueError:
    index = 0

  content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

  new_row = {'input': messages[0]["content"], 'output': content}
  return pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

In [None]:
import os

df_qwen_sin_rol_1 = pd.DataFrame(columns=['input', 'output'])
out_path = './analysis/qwen_sin_rol_1.csv'
# Create the output folder up front so the first checkpoint cannot fail
# after ten generations have already been computed.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Get the model's answer for every context, checkpointing every 10 rows
for context in contexts:
    df_qwen_sin_rol_1 = pregunta_sin_rol(context, df_qwen_sin_rol_1)
    print(f"Procesadas {len(df_qwen_sin_rol_1)} respuestas")
    if len(df_qwen_sin_rol_1) % 10 == 0:
        df_qwen_sin_rol_1.to_csv(out_path)
        print(f"Guardadas {len(df_qwen_sin_rol_1)} respuestas")

# Persist a trailing partial batch (row count not a multiple of 10)
if len(df_qwen_sin_rol_1) % 10 != 0:
    df_qwen_sin_rol_1.to_csv(out_path)

## Preguntas con rol a temperatura 1.0

In [10]:
def pregunta_con_rol(context, df, temperature=1.0, max_new_tokens=32768):
  """Ask the loaded model (thinking disabled) for a summary, with a persona.

  The persona is sent as a leading "system" message. The original used a
  trailing message with role "expert", which is not a chat role.
  NOTE(review): the Qwen chat template is believed to render only
  system/user/assistant/tool turns, so the "expert" message likely never
  reached the model — confirm against the tokenizer's template.

  Args:
    context: Clinical note text interpolated into the user prompt.
    df: DataFrame with 'input' and 'output' columns accumulating results.
    temperature: Sampling temperature handed to ``model.generate``.
    max_new_tokens: Generation budget (default unchanged from the original).

  Returns:
    A new DataFrame: ``df`` plus one row with the user prompt and the answer.
  """
  think_end_id = 151668  # token id of </think>

  user_prompt = f"Based on {context}. Give me a summary of the diagnostic for the patient"
  messages = [
    { "role": "system", "content": "You are a professional medical doctor. You provide clear and concise summaries based on patient's clinical information"},
    { "role": "user", "content": user_prompt },
  ]

  # Only template options belong here. The original also passed
  # temperature/do_sample to apply_chat_template, where they do not
  # influence sampling.
  text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,  # switch between thinking and non-thinking modes
  )
  model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

  # Sampling parameters take effect only when handed to generate()
  generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=max_new_tokens,
    do_sample=True,
    temperature=temperature,
  )
  # Drop the prompt ids; keep the continuation as a plain list
  output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

  # Skip everything up to and including the last </think>, if present
  try:
    index = len(output_ids) - output_ids[::-1].index(think_end_id)
  except ValueError:
    index = 0

  content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

  # Keep storing the user prompt (not the system text) as the input
  new_row = {'input': user_prompt, 'output': content}
  return pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

In [None]:
import os

df_qwen_con_rol_1 = pd.DataFrame(columns=['input', 'output'])
out_path = './analysis/qwen_con_rol_1.csv'
# Create the output folder up front so the first checkpoint cannot fail
# after ten generations have already been computed.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Get the model's answer for every context, checkpointing every 10 rows
for context in contexts:
    df_qwen_con_rol_1 = pregunta_con_rol(context, df_qwen_con_rol_1)
    print(f"Procesadas {len(df_qwen_con_rol_1)} respuestas")
    if len(df_qwen_con_rol_1) % 10 == 0:
        df_qwen_con_rol_1.to_csv(out_path)
        print(f"Guardadas {len(df_qwen_con_rol_1)} respuestas")

# Persist a trailing partial batch (row count not a multiple of 10)
if len(df_qwen_con_rol_1) % 10 != 0:
    df_qwen_con_rol_1.to_csv(out_path)

## Preguntas sin rol a temperatura 0.5

In [14]:
def pregunta_sin_rol(context, df, temperature=0.5, max_new_tokens=32768):
  """Ask the loaded model (thinking disabled) for a diagnostic summary.

  Args:
    context: Clinical note text interpolated into the user prompt.
    df: DataFrame with 'input' and 'output' columns accumulating results.
    temperature: Sampling temperature handed to ``model.generate``.
    max_new_tokens: Generation budget (default unchanged from the original).

  Returns:
    A new DataFrame: ``df`` plus one row with the user prompt and the answer.
  """
  think_end_id = 151668  # token id of </think>

  messages = [
    { "role": "user", "content": f"Based on {context}. Give me a summary of the diagnostic for the patient" },
  ]

  # Only template options belong here. The original also passed temperature
  # to apply_chat_template, where it is at most a template variable and
  # does not influence sampling.
  text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,  # switch between thinking and non-thinking modes
  )
  model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

  # Sampling parameters take effect only when handed to generate()
  generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=max_new_tokens,
    do_sample=True,
    temperature=temperature,
  )
  # Drop the prompt ids; keep the continuation as a plain list
  output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

  # Skip everything up to and including the last </think>, if present
  try:
    index = len(output_ids) - output_ids[::-1].index(think_end_id)
  except ValueError:
    index = 0

  content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

  new_row = {'input': messages[0]["content"], 'output': content}
  return pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

In [None]:
import os

df_qwen_sin_rol_0_5 = pd.DataFrame(columns=['input', 'output'])
out_path = './analysis/qwen_sin_rol_0_5.csv'
# Create the output folder up front so the first checkpoint cannot fail
# after ten generations have already been computed.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Get the model's answer for every context, checkpointing every 10 rows
for context in contexts:
    df_qwen_sin_rol_0_5 = pregunta_sin_rol(context, df_qwen_sin_rol_0_5)
    print(f"Procesadas {len(df_qwen_sin_rol_0_5)} respuestas")
    if len(df_qwen_sin_rol_0_5) % 10 == 0:
        df_qwen_sin_rol_0_5.to_csv(out_path)
        print(f"Guardadas {len(df_qwen_sin_rol_0_5)} respuestas")

# Persist a trailing partial batch (row count not a multiple of 10)
if len(df_qwen_sin_rol_0_5) % 10 != 0:
    df_qwen_sin_rol_0_5.to_csv(out_path)

## Preguntas con rol a temperatura 0.5

In [12]:
def pregunta_con_rol(context, df, temperature=0.5, max_new_tokens=32768):
  """Ask the loaded model (thinking disabled) for a summary, with a persona.

  The persona is sent as a leading "system" message. The original used a
  trailing message with role "expert", which is not a chat role.
  NOTE(review): the Qwen chat template is believed to render only
  system/user/assistant/tool turns, so the "expert" message likely never
  reached the model — confirm against the tokenizer's template.

  Args:
    context: Clinical note text interpolated into the user prompt.
    df: DataFrame with 'input' and 'output' columns accumulating results.
    temperature: Sampling temperature handed to ``model.generate``.
    max_new_tokens: Generation budget (default unchanged from the original).

  Returns:
    A new DataFrame: ``df`` plus one row with the user prompt and the answer.
  """
  think_end_id = 151668  # token id of </think>

  user_prompt = f"Based on {context}. Give me a summary of the diagnostic for the patient"
  messages = [
    { "role": "system", "content": "You are a professional medical doctor. You provide clear and concise summaries based on patient's clinical information"},
    { "role": "user", "content": user_prompt },
  ]

  # Only template options belong here. The original also passed
  # temperature/do_sample to apply_chat_template, where they do not
  # influence sampling.
  text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,  # switch between thinking and non-thinking modes
  )
  model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

  # Sampling parameters take effect only when handed to generate()
  generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=max_new_tokens,
    do_sample=True,
    temperature=temperature,
  )
  # Drop the prompt ids; keep the continuation as a plain list
  output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

  # Skip everything up to and including the last </think>, if present
  try:
    index = len(output_ids) - output_ids[::-1].index(think_end_id)
  except ValueError:
    index = 0

  content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

  # Keep storing the user prompt (not the system text) as the input
  new_row = {'input': user_prompt, 'output': content}
  return pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

In [None]:
import os

df_qwen_con_rol_0_5 = pd.DataFrame(columns=['input', 'output'])
out_path = './analysis/qwen_con_rol_0_5.csv'
# Create the output folder up front so the first checkpoint cannot fail
# after ten generations have already been computed.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Get the model's answer for every context, checkpointing every 10 rows
for context in contexts:
    df_qwen_con_rol_0_5 = pregunta_con_rol(context, df_qwen_con_rol_0_5)
    print(f"Procesadas {len(df_qwen_con_rol_0_5)} respuestas")
    if len(df_qwen_con_rol_0_5) % 10 == 0:
        df_qwen_con_rol_0_5.to_csv(out_path)
        print(f"Guardadas {len(df_qwen_con_rol_0_5)} respuestas")

# Persist a trailing partial batch (row count not a multiple of 10)
if len(df_qwen_con_rol_0_5) % 10 != 0:
    df_qwen_con_rol_0_5.to_csv(out_path)

## Preguntas sin rol a temperatura 0.1

In [20]:
def pregunta_sin_rol(context, df, temperature=0.1, max_new_tokens=32768):
  """Ask the loaded model (thinking disabled) for a diagnostic summary.

  Args:
    context: Clinical note text interpolated into the user prompt.
    df: DataFrame with 'input' and 'output' columns accumulating results.
    temperature: Sampling temperature handed to ``model.generate``.
    max_new_tokens: Generation budget (default unchanged from the original).

  Returns:
    A new DataFrame: ``df`` plus one row with the user prompt and the answer.
  """
  think_end_id = 151668  # token id of </think>

  messages = [
    { "role": "user", "content": f"Based on {context}. Give me a summary of the diagnostic for the patient" },
  ]

  # Only template options belong here. The original also passed temperature
  # to apply_chat_template, where it is at most a template variable and
  # does not influence sampling.
  text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,  # switch between thinking and non-thinking modes
  )
  model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

  # Sampling parameters take effect only when handed to generate()
  generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=max_new_tokens,
    do_sample=True,
    temperature=temperature,
  )
  # Drop the prompt ids; keep the continuation as a plain list
  output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

  # Skip everything up to and including the last </think>, if present
  try:
    index = len(output_ids) - output_ids[::-1].index(think_end_id)
  except ValueError:
    index = 0

  content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

  new_row = {'input': messages[0]["content"], 'output': content}
  return pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

In [None]:
import os

df_qwen_sin_rol_0_1 = pd.DataFrame(columns=['input', 'output'])
out_path = './analysis/qwen_sin_rol_0_1.csv'
# Create the output folder up front so the first checkpoint cannot fail
# after ten generations have already been computed.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Get the model's answer for every context, checkpointing every 10 rows
for context in contexts:
    df_qwen_sin_rol_0_1 = pregunta_sin_rol(context, df_qwen_sin_rol_0_1)
    print(f"Procesadas {len(df_qwen_sin_rol_0_1)} respuestas")
    if len(df_qwen_sin_rol_0_1) % 10 == 0:
        df_qwen_sin_rol_0_1.to_csv(out_path)
        print(f"Guardadas {len(df_qwen_sin_rol_0_1)} respuestas")

# Persist a trailing partial batch (row count not a multiple of 10)
if len(df_qwen_sin_rol_0_1) % 10 != 0:
    df_qwen_sin_rol_0_1.to_csv(out_path)

## Preguntas con rol a temperatura 0.1

In [22]:
def pregunta_con_rol(context, df, temperature=0.1, max_new_tokens=32768):
  """Ask the loaded model (thinking disabled) for a summary, with a persona.

  The persona is sent as a leading "system" message. The original used a
  trailing message with role "expert", which is not a chat role.
  NOTE(review): the Qwen chat template is believed to render only
  system/user/assistant/tool turns, so the "expert" message likely never
  reached the model — confirm against the tokenizer's template.

  Args:
    context: Clinical note text interpolated into the user prompt.
    df: DataFrame with 'input' and 'output' columns accumulating results.
    temperature: Sampling temperature handed to ``model.generate``.
    max_new_tokens: Generation budget (default unchanged from the original).

  Returns:
    A new DataFrame: ``df`` plus one row with the user prompt and the answer.
  """
  think_end_id = 151668  # token id of </think>

  user_prompt = f"Based on {context}. Give me a summary of the diagnostic for the patient"
  messages = [
    { "role": "system", "content": "You are a professional medical doctor. You provide clear and concise summaries based on patient's clinical information"},
    { "role": "user", "content": user_prompt },
  ]

  # Only template options belong here. The original also passed
  # temperature/do_sample to apply_chat_template, where they do not
  # influence sampling.
  text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,  # switch between thinking and non-thinking modes
  )
  model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

  # Sampling parameters take effect only when handed to generate()
  generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=max_new_tokens,
    do_sample=True,
    temperature=temperature,
  )
  # Drop the prompt ids; keep the continuation as a plain list
  output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

  # Skip everything up to and including the last </think>, if present
  try:
    index = len(output_ids) - output_ids[::-1].index(think_end_id)
  except ValueError:
    index = 0

  content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

  # Keep storing the user prompt (not the system text) as the input
  new_row = {'input': user_prompt, 'output': content}
  return pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

In [None]:
import os

df_qwen_con_rol_0_1 = pd.DataFrame(columns=['input', 'output'])
out_path = './analysis/qwen_con_rol_0_1.csv'
# Create the output folder up front so the first checkpoint cannot fail
# after ten generations have already been computed.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Get the model's answer for every context, checkpointing every 10 rows
for context in contexts:
    df_qwen_con_rol_0_1 = pregunta_con_rol(context, df_qwen_con_rol_0_1)
    print(f"Procesadas {len(df_qwen_con_rol_0_1)} respuestas")
    if len(df_qwen_con_rol_0_1) % 10 == 0:
        df_qwen_con_rol_0_1.to_csv(out_path)
        print(f"Guardadas {len(df_qwen_con_rol_0_1)} respuestas")

# Persist a trailing partial batch (row count not a multiple of 10)
if len(df_qwen_con_rol_0_1) % 10 != 0:
    df_qwen_con_rol_0_1.to_csv(out_path)

In [2]:
import os
import pandas as pd

# Report the shape of every result CSV written by the generation loops.
folder_path = './analysis'
prefixes = ('gemma', 'llama', 'qwen')

if not os.path.isdir(folder_path):
    print(f"No existe la carpeta {folder_path}")
else:
    # sorted() gives a stable report order; os.listdir order is arbitrary
    for filename in sorted(os.listdir(folder_path)):
        if not filename.startswith(prefixes):
            continue
        file_path = os.path.join(folder_path, filename)
        try:
            # Dedicated name: the original rebound the global `df`, which
            # earlier cells use for the sampled notes.
            df_result = pd.read_csv(file_path)
            print(f"{filename}: {df_result.shape} (filas, columnas)")
        except Exception as e:
            # Best-effort report: one unreadable file must not stop the rest
            print(f"Error leyendo {filename}: {e}")


qwen_con_rol_0_1.csv: (150, 3) (filas, columnas)
llama_con_rol_0_1.csv: (150, 3) (filas, columnas)
gemma_sin_rol_0_1.csv: (150, 3) (filas, columnas)
llama_sin_rol_0_1.csv: (150, 3) (filas, columnas)
qwen_con_rol_0_5.csv: (150, 3) (filas, columnas)
gemma_sin_rol_0_5.csv: (150, 3) (filas, columnas)
llama_con_rol_1.csv: (150, 3) (filas, columnas)
gemma_con_rol_0_1.csv: (150, 3) (filas, columnas)
llama_sin_rol_1.csv: (150, 3) (filas, columnas)
llama_sin_rol_0_5.csv: (150, 3) (filas, columnas)
gemma_sin_rol_1.csv: (150, 3) (filas, columnas)
llama_con_rol_0_5.csv: (150, 3) (filas, columnas)
qwen_sin_rol_0_5.csv: (150, 3) (filas, columnas)
qwen_sin_rol_1.csv: (150, 3) (filas, columnas)
gemma_con_rol_0_5.csv: (150, 3) (filas, columnas)
qwen_con_rol_1.csv: (150, 4) (filas, columnas)
gemma_con_rol_1.csv: (150, 3) (filas, columnas)
qwen_sin_rol_0_1.csv: (150, 4) (filas, columnas)
