In [1]:
!pip install sentencepiece protobuf datasets transformers trl textstat peft bitsandbytes nltk --quiet
!pip install -U bitsandbytes accelerate --quiet

In [2]:
# Read the Hugging Face access token from the local file "hf.token" instead of
# hardcoding it in the notebook.
with open("hf.token", "r") as f:
    hftoken = f.read().strip()  # strip() drops the trailing newline / surrounding whitespace

import os
# Expose the cache location to Hugging Face tooling through the HF_HOME environment variable.
# NOTE(review): presumably this has to run before the Hugging Face libraries are imported
# for the setting to take effect — confirm.
cache_dir = "/mnt/c/Users/yc/.cache/huggingface"
os.environ['HF_HOME'] = cache_dir

## import

In [3]:
# Standard library imports
import csv
import json
import random
import re
import sys
import time
from collections import Counter
from collections import defaultdict
from collections import defaultdict

# Third-party data and ML libraries
import pandas as pd
import torch
import nltk
from nltk.tokenize import word_tokenize
import re
import torch
# Hugging Face ecosystem
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    pipeline,
    logging,
)

# Fine-tuning and optimization
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# Text analysis and readability metrics
from textstat import (
    flesch_kincaid_grade, 
    flesch_reading_ease,
    smog_index, 
    gunning_fog, 
    dale_chall_readability_score,
    text_standard, 
    syllable_count
)

# Authenticate against the Hugging Face Hub with the token held in `hftoken`.
# TODO: move the token to an environment variable.
from huggingface_hub import login
login(token=hftoken)

  from .autonotebook import tqdm as notebook_tqdm


## data

In [4]:

def txt_to_dict(file_path):
    """Build a dictionary from a text file laid out as alternating key/value lines.

    Lines 1, 3, 5, ... are keys and lines 2, 4, 6, ... are the matching values.
    Surrounding whitespace is stripped from both. A trailing key line without a
    value line is ignored, and a repeated key keeps the value seen last.

    Args:
        file_path: Path of the text file to read.

    Returns:
        dict mapping each stripped key line to its stripped value line.
    """
    with open(file_path, 'r') as handle:
        rows = handle.readlines()

    key_rows = rows[0::2]
    value_rows = rows[1::2]
    # zip() stops at the shorter sequence, which drops an unpaired final key line.
    return {
        key_row.strip(): value_row.strip()
        for key_row, value_row in zip(key_rows, value_rows)
    }

# Load the glossary file (alternating term / definition lines) into a dict.
txt_file_path = 'formaldef.txt'
formaldic = txt_to_dict(txt_file_path)
len(formaldic)

# Re-key the glossary: keep only the part of each term that precedes the
# 'Listen to pronunciation' marker and any opening parenthesis.
meddict = {}
for k, v in formaldic.items():
    term = k.partition('Listen to pronunciation')[0].partition('(')[0]
    meddict[term] = v


In [6]:
# Read the unannotated notes CSV and keep only its first 10 rows.
df = pd.read_csv(
    '/mnt/c/Users/yc/Downloads/coral/unannotated/data/breastca_unannotated.csv'
).head(10)
df

Unnamed: 0,coral_idx,Sex,UCSFDerivedRaceEthnicity_X,BirthDate,note_text
0,140,Female,Native Hawaiian or Other Pacific Islander,1964-03-25,Medical Oncology Consult Note Patient Name:...
1,141,Female,Native Hawaiian or Other Pacific Islander,1975-03-29,This is a shared visit for services provided b...
2,142,Female,Native Hawaiian or Other Pacific Islander,1967-10-06,Medical Oncology Consult Note Video Consult ...
3,143,Female,Native Hawaiian or Other Pacific Islander,1943-12-23,This is an independent visit SUBJECTIVE ...
4,144,Female,Native Hawaiian or Other Pacific Islander,1987-06-23,***** ***** Note Patient Name: ***** ***** ...
5,145,Female,Native American or Alaska Native,1985-02-26,HPI: ***** ***** is a 34 y.o. female with E...
6,146,Female,Native Hawaiian or Other Pacific Islander,1964-03-15,***** ***** with MBC ***** 2008 CC 2nd op...
7,147,Female,Native Hawaiian or Other Pacific Islander,1990-03-26,ID: ***** ***** ***** is a 29 y.o. premenopaus...
8,148,Female,Native American or Alaska Native,1957-05-23,***** ***** Note Patient Name: ***** ***** **...
9,149,Female,Native American or Alaska Native,1954-08-11,***** ***** Note Patient Name: ***** ***** ...


In [13]:
import pandas as pd
import os
import re
import textwrap # Import the textwrap module

def create_inline_annotations(text_content, annotation_content):
    """Insert annotation type labels inline into a text.

    Every annotation line that starts with 'T' is read as
    ``<id>\\t<type> <offsets>\\t...``, where the offsets are integers separated
    by spaces and/or ';' (discontinuous spans). A ``[<type>]`` label is inserted
    at the largest offset listed for that annotation.
    Example: "a right breast mass" becomes "a right breast mass[PROBLEM]".

    Args:
        text_content: The text the offsets refer to.
        annotation_content: The annotation lines, separated by newlines.

    Returns:
        The text with one ``[<type>]`` label inserted per 'T' annotation line.
    """
    markers = []  # (insertion offset, annotation type)
    for record in annotation_content.strip().split('\n'):
        if not record.startswith('T'):
            continue

        tokens = record.split('\t')[1].split()
        kind = tokens[0]

        # Offsets may be glued together with ';' for discontinuous spans.
        offsets = [
            int(number)
            for token in tokens[1:]
            for number in token.replace(';', ' ').split()
        ]
        markers.append((max(offsets), kind))

    # Insert from the back of the text to the front so that earlier offsets
    # are not shifted by labels that have already been inserted.
    for position, kind in sorted(markers, key=lambda marker: marker[0], reverse=True):
        text_content = f"{text_content[:position]}[{kind}]{text_content[position:]}"

    return text_content

# --- Main script to run the process ---
# Reads one note and its annotation file, inlines the annotation labels into
# the note text and prints the result wrapped to a fixed width.

path_to_annotated_data = '/mnt/c/Users/yc/Downloads/coral/annotated/breastca/'
idx_to_check = 21

text_file_path = os.path.join(path_to_annotated_data, f'{idx_to_check}.txt')
ann_file_path = os.path.join(path_to_annotated_data, f'{idx_to_check}.ann')

try:
    # newline='' turns off newline translation so the text keeps exactly the
    # characters stored in the file. The offsets in the .ann file are used as
    # indices into this string; translating '\r\n' to '\n' would shift every
    # label that follows a line break.
    # NOTE(review): assumes the .ann offsets index the raw file text (as in
    # brat standoff) — confirm for this dataset.
    with open(text_file_path, 'r', encoding='utf-8', newline='') as f:
        original_text = f.read()

    with open(ann_file_path, 'r', encoding='utf-8') as f:
        raw_annotations = f.read()

    inlined_text = create_inline_annotations(original_text, raw_annotations)

    # --- UPDATED PRINTING LOGIC ---
    # Define how many characters wide you want each line to be
    wrap_width = 100

    # The whole annotated text is printed; slice here (e.g. inlined_text[:1500])
    # to preview only the beginning.
    text_to_print = inlined_text

    # Use textwrap.fill() to format the text with line breaks
    wrapped_text = textwrap.fill(text_to_print, width=wrap_width)

    # Print the final, wrapped result
    print("-" * 50)
    print(f"Inline Annotation Result for coral_idx: {idx_to_check} (wrapped at {wrap_width} chars)")
    print("-" * 50)
    print(wrapped_text)

except FileNotFoundError:
    print(f"Error: Could not find files for coral_idx {idx_to_check}.")
except Exception as e:
    # Best-effort reporting: any other failure is printed instead of raised.
    print(f"An unexpected error occurred: {e}")

--------------------------------------------------
Inline Annotation Result for coral_idx: 21 (wrapped at 100 chars)
--------------------------------------------------
                        UCSF ***** ***** FAMILY                        COMPREHENSIVE CANCER CENTER
***** ***** ***** ***** Care Center                             ***** ***** *****
Second *****, ***** *****                    ***** *****, ***** *****-*****               Phone:
(*****) *****-*****          Fax: (*****) *****-*****    01/24/2009    ***** *****, M.D.  *****
***** *****, ***** *****  *****, *****  *****    ***** *****, M.D.  ***** ***** ***** #*****  *****,
*****  *****    ***** *****, M.D.  ***** ***** *****  *****, *****  *****    ***** *****, M.D.
***** ***** *****, ***** *****  *****, *****  *****    RE: *****, ***** *****  U#: *****
DATE[hpi_start] OF SERVICE: 01/24/09    Dear *****:    I had the pleasure of seeing this patient in
consultation regarding the  treatment of her locally recurrent[DiseaseSta