# Data Preprocessing

#### Import Libraries

In [1]:
import json
import os
import re
from typing import Any, Dict, List, Set, Tuple

import pandas as pd
from pandas import DataFrame

import numpy as np
np.int = np.int32
np.float = np.float64
np.bool = np.bool_

import nltk as nltk_lib
nltk_lib.download('punkt')

import tokenizations
from textspan import get_original_spans
from clinitokenizer.tokenize import clini_tokenize

from deid_utils import assign_sentence_id, satisfies_regex_rule, REGEX_PATTERNS

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2024-03-14 16:28:11.309239: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


#### Load the Data

In [2]:
# Alias for the parsed content of one annotation file: a list of JSON objects.
JsonType = List[Dict[str, Any]]

# Directory that is scanned for the *.json annotation files.
path = "../sc/NER_data/"

In [3]:
def get_data(path: str) -> Dict[str, JsonType]:
    """
    Load every ``*.json`` file located directly inside ``path``.

    Returns a dict that maps each file name (not the full path) to the
    parsed JSON content of that file.
    """
    json_data: Dict[str, JsonType] = {}

    json_names = (name for name in os.listdir(path) if name.endswith('.json'))
    for name in json_names:
        with open(os.path.join(path, name), 'r', encoding="utf-8") as handle:
            json_data[name] = json.load(handle)

    return json_data

# Parse all annotation files once; keys are file names, values their parsed content.
json_data = get_data(path)

#### Join files, filter and harmonize annotations

In [4]:
def simplify_json_structure(file: JsonType) -> JsonType:
    """
    Return a simplified version of the records in ``file``.

    Only the following fields of every record are kept:
        - "id"
        - "data"
        - "annotations": the raw "value" of every result found in the
          record's first annotation entry

    A record whose "annotations" list is empty gets an empty list of
    annotations instead of raising an IndexError.
    """
    simplified: JsonType = []

    for record in file:
        entries = record["annotations"]
        # A record without any annotation entry has no results to read.
        results = entries[0]["result"] if entries else []
        annotations: List[Dict[str, Any]] = [result["value"] for result in results]

        simplified.append({
            "id": record["id"],
            "data": record["data"],
            "annotations": annotations,
        })
    return simplified

In [5]:
def filter_anonymisation_anno(file: JsonType) -> None:
    """
    Prune, in place, the annotations of every record in ``file`` so that
    only those whose first label is "anonymizovat" remain.
    """
    target_label = "anonymizovat"

    for record in file:
        record["annotations"] = [
            anno for anno in record["annotations"]
            if anno["labels"][0] == target_label
        ]

In [6]:
# Replace the content of each file with its simplified form, then prune it
# in place to the annotations labeled for anonymization.
for filename, file in json_data.items():
    json_data[filename] = simplify_json_structure(file)
    filter_anonymisation_anno(json_data[filename])

In [7]:
#print(json.dumps(json_data["fin_ann1.json"], ensure_ascii = False, indent=2))

In [8]:
# Write the current content of every file in json_data to /workspace/home/.
# A dedicated name is used for the output path so that the module-level
# `path` (the input directory) is not overwritten by this cell.
for filename, file in json_data.items():
    out_path = os.path.join("/workspace/home/", filename)
    with open(out_path, "w", encoding="utf-8") as json_file:
        json.dump(file, json_file)

In [9]:
def print_anno_statistics(file: JsonType) -> None:
    """
    Print how many records in ``file`` carry at least one annotation,
    followed by the ids of those records (in input order).
    """
    ids: List[int] = [record["id"] for record in file if record["annotations"]]

    print(f'\t-number of annotated records = {len(ids)}')
    print(f'\t-ids of the annotated records = {ids}')

In [10]:
# Report, per file, the number of records and which of them are annotated.
for filename, file in json_data.items():
    print(f'{filename} ({len(file)} items):')
    print_anno_statistics(file)

fin_ann1.json (40 items):
	-number of annotated records = 0
	-ids of the annotated records = []
fin_ann3.json (40 items):
	-number of annotated records = 11
	-ids of the annotated records = [42, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54]
fin_ann2.json (40 items):
	-number of annotated records = 1
	-ids of the annotated records = [82]


In [11]:
def join_files(json_data: Dict[str, JsonType]) -> JsonType:
    """
    Join the unique records of all files into one list sorted by record id.

    Annotated files include text duplicates. A text is identified by the
    ("i", "pid", "rord") triple stored in the record's "data"; only the
    first record seen for each triple is kept. Files are visited in reverse
    alphabetical order of their names, so on a clash the record from the
    later-named file is the one that is kept.

    Records that lack any of the three values are skipped and counted
    together with the duplicates. Summary counts are printed.
    """
    joined_json_data: JsonType = []
    texts_metadata: Set[Tuple[Any, Any, Any]] = set()
    duplicates_count = 0

    for filename in sorted(json_data, reverse=True):  # all annotations are in the last file "fin_ann3.json"
        for record in json_data[filename]:
            fields = record["data"]
            # (i, pid, rord) are unique for each text
            metadata = (fields.get("i"), fields.get("pid"), fields.get("rord"))

            # edge case (e.g. holds for record with id=82): a record with
            # incomplete metadata is dropped and counted as a duplicate
            incomplete = any(value is None for value in metadata)
            if incomplete or metadata in texts_metadata:
                duplicates_count += 1
                continue

            joined_json_data.append(record)
            texts_metadata.add(metadata)

    unique_count = len(joined_json_data)

    print(f'number of unique records: {unique_count}')
    print(f'number of duplicated records: {duplicates_count}')
    print("all the records == duplicated + unique", end=' ')
    print(f'{sum(len(x) for x in json_data.values()) == unique_count + duplicates_count}\n')

    joined_json_data.sort(key=lambda x: x["id"])
    return joined_json_data

In [12]:
# Merge all files into one de-duplicated list and report how many of its
# records are annotated.
joined_json_data: JsonType = join_files(json_data)

print(f'joined_json_data ({len(joined_json_data)} items):')
print_anno_statistics(joined_json_data)

number of unique records: 80
number of duplicated records: 40
all the records == duplicated + unique True

joined_json_data (80 items):
	-number of annotated records = 11
	-ids of the annotated records = [42, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54]


In [13]:
# Save the joined records; the encoding is stated explicitly instead of
# depending on the platform's default.
with open("joined_data.json", "w", encoding="utf-8") as file:
    json.dump(joined_json_data, file)

#### Annotation of the rest of the dataset (done outside this notebook; the result is loaded below)

In [14]:
# Load "joined_data_annotated.json" (presumably the joined records after the
# remaining annotations were added - confirm).
# NOTE: this rebinds `json_data`; the code below iterates it as a sequence of records.
with open("joined_data_annotated.json", 'r', encoding="utf-8") as file:
    json_data = json.load(file)

#### JSON to CSV with Ground-Truth and Regex Labels

In [15]:
def label_words(record: Dict[str, Any], temp: DataFrame) -> None:
    """
    Label the words of one record according to the record's annotations.

    ``temp`` holds one row per word with its "start" and "end" offsets.
    A word gets the label "A" when its span lies entirely inside the span
    of at least one annotation; every word that still has no label
    afterwards gets "O". The labels are written to ``temp["true_label"]``
    in place; nothing is returned.
    """
    # Work on an object-dtype copy of the column (or a fresh one when it is
    # missing) so that writing strings never has to upcast a numeric column.
    if "true_label" in temp.columns:
        labels = temp["true_label"].astype(object)
    else:
        labels = pd.Series(index=temp.index, dtype=object)

    for annotation in record['annotations']:
        start, end = annotation["start"], annotation["end"]
        inside = (temp["start"] >= start) & (temp["end"] <= end)
        labels.loc[inside] = "A"

    # Assign the whole column back: an in-place fillna on a column selection
    # does not reliably update the parent frame.
    temp["true_label"] = labels.fillna("O")

In [16]:
COLUMNS = ["id", "sentence", "word", "start", "end", "regex_rule", "true_label"]

data = pd.DataFrame(columns=COLUMNS)

# The per-record frames are collected here and concatenated once after the
# loop; concatenating inside the loop copies all accumulated rows each time.
# The empty template comes first so the concat also works with no records.
frames: List[DataFrame] = [data]

for record in json_data:
    # Quotes are replaced by spaces of the same length, so character offsets
    # are unchanged (presumably done because the tokenizer rewrites quotes).
    text = record["data"]["text"].replace('"', " ").replace("'", " ")

    tokens: List[str] = nltk_lib.tokenize.word_tokenize(text, language='czech', preserve_line=False)

    # get_original_spans yields one list of (start, end) spans per token.
    # That list may be empty or hold several pieces, so tokens are paired
    # with their own spans instead of with a flattened list of all spans.
    tokens_with_offsets: List[Dict[str, Any]] = [
        {"word": word, "start": spans[0][0], "end": spans[-1][1]}
        for word, spans in zip(tokens, get_original_spans(tokens, text))
        if spans  # a token that could not be located has no offsets
    ]

    temp = pd.DataFrame(tokens_with_offsets, columns=COLUMNS)
    temp["id"] = record["id"]
    label_words(record, temp)

    satisfies_regex_rule(text, temp)

    sentences = clini_tokenize(text)
    assign_sentence_id(temp, sentences)

    frames.append(temp)

data = pd.concat(frames, ignore_index=True)

                                                                                                            

In [17]:
# Cast the sentence ids to an integer dtype.
data['sentence'] = data['sentence'].astype(int)

In [18]:
# Sanity check: column dtypes and non-null counts of the assembled table.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53767 entries, 0 to 53766
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          53767 non-null  object
 1   sentence    53767 non-null  int64 
 2   word        53767 non-null  object
 3   start       53767 non-null  object
 4   end         53767 non-null  object
 5   regex_rule  53767 non-null  object
 6   true_label  53767 non-null  object
dtypes: int64(1), object(6)
memory usage: 2.9+ MB


In [19]:
# Preview the first rows of the table.
data.head()

Unnamed: 0,id,sentence,word,start,end,regex_rule,true_label
0,3,0,Ke,0,2,O,O
1,3,0,kontrole,3,11,O,O
2,3,0,",",11,12,O,O
3,3,0,k,13,14,O,O
4,3,0,pokračování,15,26,O,O


In [20]:
# Flag every row that has a missing value in any column, then list the
# index labels of the flagged rows.
nan_mask = data.isnull().any(axis=1)
rows_with_NaN = data.index[nan_mask].tolist()
print("Rows with NaN values:", rows_with_NaN )

Rows with NaN values: []


In [21]:
# Copy the regex_rule column, together with the id, sentence and offset
# columns, into its own table.
regex_mapping = data[["id", "sentence", "start", "end", "regex_rule"]].copy()

In [23]:
# Remove regex_rule from the main table.
data.drop(columns=["regex_rule"], inplace=True)

In [25]:
# Write the regex table to CSV without the row index.
regex_mapping.to_csv("regex_mapping.csv", index=False)

In [26]:
# Write the token table to CSV without the row index.
data.to_csv("patient_records.csv", index=False)