# Problem Statement

In imaging studies, protected health information (PHI) is generally removed by applying anonymization rules to DICOM tags, combined with the CRPs' efforts to delete series that contain PHI or to redact information burned into the images.

However, in rare cases, PHI can still slip through in study or series descriptions. For example, a description might include the subject's name.

# Solution

Using the NLTK library, this notebook finds low-frequency words in the study and series descriptions of the studies uploaded to the specified Inteleshare project, so that they can be manually reviewed for PHI.

# Code
### 1. Setup Inteleshare project

In [None]:
import AMBRA_Utils
import itertools
import json
import pandas as pd
from datetime import datetime
from nltk import FreqDist, word_tokenize
from phi_data import non_phi, phi

In [2]:
# Select the account and the location (namespace) whose studies are scanned.
ambra_account_name = "MOST"
assigned_location_name = "3 - Assigned Studies"

ambra = AMBRA_Utils.utilities.get_api()
account = ambra.get_account_by_name(ambra_account_name)
namespace = account.get_location_by_name(assigned_location_name)

In [None]:
# Paths of the JSON files that store the reviewed PHI and non-PHI words.
phi_json, non_phi_json = (
    '../Files/json/phi.json',
    '../Files/json/non_phi.json',
)

# 2. Get the word frequency distribution of all study and series descriptions

In [25]:
def create_phi_excel(file_name: str, data: list):
    """
    Write a review sheet in which the user can flag PHI words.

    The sheet has one row per entry of `data` with the columns `word`,
    `frequency` and an empty `PHI` column to be filled in by hand.

    Inputs:
    --------
    file_name (str):
        Path of the Excel file to write.

    data (list):
        List of tuples (`word`, `frequency`).
    """
    review_sheet = pd.DataFrame.from_records(data, columns=["word", "frequency"])
    # The blank PHI column is appended last so it sits next to the frequency.
    review_sheet.assign(PHI="").to_excel(file_name)

In [3]:
# Collect the studies of the selected namespace into a list so the cells
# below can iterate over them.
studies = list(namespace.get_studies())

In [None]:
studies_desc_tokens = []
series_desc_tokens = []
# Maps each series-description token to the studies whose series contain it,
# so a word from the frequency table can be traced back to its studies.
series_study_map = dict()

# Get all words from all studies and series' descriptions
for study in studies:
    # Treat underscores as word separators before tokenizing.
    study_tokens = word_tokenize(
        " ".join(study.formatted_description.split("_")).lower()
    )
    studies_desc_tokens.append(study_tokens)

    series = study.get_series()
    for s in series:
        s_desc_split = [part.lower() for part in s.formatted_description.split("_")]
        s_tokens = word_tokenize(" ".join(s_desc_split))
        series_desc_tokens.append(s_tokens)

        # Key the lookup by the tokens themselves (not by the underscore-split
        # fragments): the tokens are what ends up in the frequency table, so
        # every reviewed word can be found in the map.
        for word in s_tokens:
            linked_studies = series_study_map.setdefault(word, [])
            # Studies are processed one at a time, so a repeated entry for the
            # current study can only be the last element of the list.
            if not linked_studies or linked_studies[-1] is not study:
                linked_studies.append(study)

In [None]:
# Process studies

# Flatten the per-study token lists, leaving out words already known to be
# non-PHI.
studies_desc_tokens_flat = [
    token
    for token_list in studies_desc_tokens
    for token in token_list
    if token not in non_phi
]

# Warn about every remaining token that is already known to be PHI
for token in studies_desc_tokens_flat:
    if token not in phi:
        continue
    print(f'{token} is PHI!')

studies_desc_freq = FreqDist(studies_desc_tokens_flat)

# Keep the `num` least frequent words (the tail of the frequency ranking)
num = 60
studies_ranked = studies_desc_freq.most_common()
studies_least_common = studies_ranked[-num:]

# Displayed so the study-description words can be checked manually for PHI
studies_least_common

[('mra', 47),
 ('rapid', 39),
 ('hrs', 31),
 ('72', 15),
 ('36', 15),
 ('spine', 9),
 ('c', 8),
 ('perfusion', 8),
 ('summary', 7),
 ('automated', 6),
 ('48', 5),
 ('hours', 4),
 ('viz', 4),
 ('no', 4),
 ('2', 3),
 ('time', 3),
 ('msu', 3),
 ('hemicrani', 2),
 ('braiin', 2),
 ('baseliine', 2),
 ('t', 2),
 ('event', 2),
 ('incomplete', 1),
 ('study', 1),
 ('mrp', 1),
 ('very', 1),
 ('12', 1),
 ('06pm', 1),
 ('vs', 1),
 ('am', 1),
 ('not', 1),
 ('included', 1),
 ('in', 1),
 ('dicom', 1),
 ('header', 1),
 ('correct', 1),
 ('date', 1),
 ('9', 1),
 ('25', 1),
 ('22', 1),
 ('at', 1),
 ('01', 1),
 ('45am', 1),
 ('72hrs', 1),
 ('xa', 1),
 ('reprocessed', 1),
 ('lica', 1),
 ('stenting', 1),
 ('unscheduled1', 1),
 ('l', 1),
 ('2nd', 1),
 ('enrolling', 1),
 ('basline', 1),
 ('baseine', 1),
 ('image', 1),
 ('vpct', 1),
 ('ncct', 1),
 ('processing', 1),
 ('failure', 1),
 ('read', 1)]

In [26]:
# Build the timestamp with strftime: str(datetime.now()) contains spaces and
# ':' characters, and ':' is not allowed in Windows file names.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
create_phi_excel(f"studies_phi_{timestamp}.xlsx", studies_least_common)

In [None]:
# Process series

series_desc_tokens_flat = list(itertools.chain.from_iterable(series_desc_tokens))

# Remove hash-like strings: tokens that are at least 10 characters long and
# contain at least 6 digits.
series_desc_tokens_flat_filtered = []
for series_token in series_desc_tokens_flat:
    num_count = sum(char.isdigit() for char in series_token)

    # The length test must apply to the token itself, not to the number of
    # series. Shorter digit-heavy tokens (e.g. 8-digit dates) are kept so
    # they remain visible for review.
    if not (num_count >= 6 and len(series_token) >= 10):
        series_desc_tokens_flat_filtered.append(series_token)

# Remove words that are non-PHI
series_desc_tokens_flat_filtered = [x for x in series_desc_tokens_flat_filtered if x not in non_phi]

# Warn if PHI found
for token in series_desc_tokens_flat_filtered:
    if token in phi:
        print(f'{token} is PHI!')

series_desc_freq = FreqDist(series_desc_tokens_flat_filtered)

# Get least frequent
num = 500
series_least_common = series_desc_freq.most_common()[-num:]

# NOTE(review): no visible cell writes `series_least_common` to Excel, although
# a `series_phi_*.xlsx` file is read later; confirm whether a
# create_phi_excel call for the series is missing.

# Manually check if series desc contains PHI
series_least_common

# 3. Mark PHI or not

There should be two new files generated, one for series description, and one for studies description. Mark PHI by following these steps:

1. Open the files in Excel.
2. Skim through each row, and for rows with PHI, mark 'x' in `PHI` column.
- When not obvious, check the entire description of the series/studies by looking it up.
    + Study lookup: Look up the word on Inteleshare in the specified namespace.
    + Series lookup: Run `series_study_map["<word_to_lookup>"]` in this notebook to list the studies whose series descriptions contain the word.
3. Save the files.

# 4. Import marked Excel into data for PHI and non-PHI files

In [39]:
# Get marked Excel files
# The file names below carry the timestamp of one specific run; change them to
# the names of the files you actually marked.

studies_phi_df = pd.read_excel('./studies_phi_2025-03-27 12:51:57.838670.xlsx')
series_phi_df = pd.read_excel('./series_phi_2025-03-27 15:32:37.548191.xlsx')

In [None]:
def import_phi_data(df: pd.DataFrame):
    """
    Import data from `df` to assign into PHI and non-PHI files.

    Each row of `df` is appended to the `data` list of either the PHI or the
    non-PHI JSON file (paths taken from the module-level `phi_json` and
    `non_phi_json`), and its word is added to the in-memory `phi` or
    `non_phi` collection. The current trial (`ambra_account_name`) is
    recorded once in the `trials` list of both files.

    Inputs:
    --------
    df (pd.DataFrame):
        A dataframe with words, its frequency and whether it's marked as PHI.
        It must have the columns `word` and `PHI`. A row counts as PHI when
        its `PHI` cell is `x`, ignoring case and surrounding whitespace;
        anything else (including an empty cell) counts as non-PHI.
    """
    # The loaded documents need their own names: assigning to `phi_json` /
    # `non_phi_json` here would make those names local to the function and
    # the path lookups below would raise UnboundLocalError.
    with open(phi_json, 'r', encoding='utf-8') as phi_f, \
            open(non_phi_json, 'r', encoding='utf-8') as non_phi_f:
        phi_doc = json.load(phi_f)
        non_phi_doc = json.load(non_phi_f)

    # Add PHI and non-PHI data into json
    for _, row in df.iterrows():
        row_data = {
            'word': row['word'],
            'trial': ambra_account_name
        }
        # Normalise the mark so that 'X' or ' x ' is not silently filed as
        # non-PHI.
        if str(row['PHI']).strip().lower() == 'x':
            phi_doc['data'].append(row_data)
            phi.add(row['word'])
        else:
            non_phi_doc['data'].append(row_data)
            non_phi.add(row['word'])

    # The function is called once per sheet, so only record the trial if it
    # is not listed yet.
    for doc in (phi_doc, non_phi_doc):
        if ambra_account_name not in doc['trials']:
            doc['trials'].append(ambra_account_name)

    # Write to JSON
    with open(phi_json, 'w', encoding='utf-8') as f:
        json.dump(phi_doc, f, ensure_ascii=False, indent=4)

    with open(non_phi_json, 'w', encoding='utf-8') as f:
        json.dump(non_phi_doc, f, ensure_ascii=False, indent=4)



In [52]:
# NOTE(review): only the series sheet is imported here; `studies_phi_df` is
# loaded above but never passed to import_phi_data in the visible cells.
# Confirm whether it should be imported as well.
import_phi_data(series_phi_df)