In [1]:
import pydicom
from pydicom import dcmread
from pydicom.dataset import Dataset
from pydicom.sequence import Sequence
from pydicom.data import get_testdata_file
import os
import re
import hashlib
import csv
import pandas as pd
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from shutil import copyfile
from pydicom.valuerep import IS

In [3]:
# Tags whose values should be removed entirely (this list does not include
# tags from which only part of the content has to be deleted).
# Each entry is a (group, element) pair; the list is passed to clear_tags().
# NOTE(review): clear_tags() recurses into Sequence-valued elements instead of
# blanking them, so any entry here that holds a sequence is not itself emptied
# -- confirm this is intended.
# NOTE(review): (0x0040, 0xA730), (0x0040, 0xA073), (0x0018, 0xa001),
# (0x0040, 0x0275) and (0x0040, 0x2001) also appear in other tag lists defined
# in this cell (tags_to_shift / tags_to_hash / sensitive_tags / tag_not_null).
tags_to_clear = [
    (0x0008, 0x0050),           #Accession Number
    (0x0008, 0x0054),           #Retrieve AE Title
    (0x0008, 0x0080),           #Institution Name
    (0x0008, 0x0081),           #Institution Address
    (0x0008, 0x0090),           #Referring Physician Name
    (0x0008, 0x0201),           #Timezone Offset From UTC
    (0x0008, 0x1010),           #Station Name
    (0x0008, 0x1048),           #Physician(s) of Record
    (0x0008, 0x1050),           #Performing Physician's Name
    (0x0008, 0x1070),           #Operators' Name
    (0x0008, 0x1120),           #Referenced Patient Sequence
    (0x0010, 0x0030),            #Patient Birth Date
    (0x0010, 0x1000),
    (0x0010, 0x1040),
    (0x0012, 0x0020),
    (0x0012, 0x0021),
    (0x0012, 0x0030),
    (0x0012, 0x0031),
    (0x0012, 0x0040),
    (0x0012, 0x0042),
    (0x0012, 0x0050),
    (0x0018, 0x1200),
    (0x0020, 0x0010),
    (0x0032, 0x1021),
    (0x0040, 0x0009),
    (0x0040, 0x0241),
    (0x0040, 0x0275),
    (0x0070, 0x0084),
    (0x0088, 0x0220),
    (0x0040, 0xA075),
    (0x0040, 0xA123),
    (0x0012, 0x0051),
    (0x0043, 0x1005),
    (0x0043, 0x1029),
    (0x0043, 0x1060),
    (0x0043, 0x1080),
    (0x0009, 0x1002),
    (0x0009, 0x1030),
    (0x0009, 0x1037),
    (0x0018, 0xa001),
    (0x0008, 0x1040),
    (0x0029, 0x1131),
    (0x0029, 0x1134),
    (0x0040, 0xA027),
    (0x0040, 0xA073),
    (0x0040, 0xA730),
    (0x0021, 0x1035),
    (0x0021, 0x1003),
    (0x0040, 0x1001),
    (0x0032, 0x1020),
    (0x0040, 0x0242),
    (0x0040, 0x0280),
    (0x0040, 0x2001),
]
# Tags (as 32-bit integers, group in the high 16 bits) whose date / datetime
# values are shifted by the per-patient offset; passed to shift_dates().
tags_to_shift = [0x00080012, 0x00080020, 0x00080021,0x00080022,0x00080023,0x00080024,0x00080025,
                 0x0008002A,0x001021D0,0x00181012,0x0018700C,0x00380020,0x00400002,0x00400004,
                 0x00400244,0x00400250,0x0040A032,
                 0x00091005,0x0009100D,0x0009100E,
                 0x00540016,0x00540410,0x00540414,0x0040a730,0x0040a073,0x0018a001,0x0018a002,0x0019109d,0x00091039,0x0009103b,0x0009103d,0x00091068,0x0009106c,0x0009107b,0x000910e9,0x00171004,#discrepancy,
                 0x00181078,0x00181079,0x0040a030,0x0040a120,0x0040a121,0x00540300,0x00540412,0x00080106,0x00191010,0x00321040,0x00321050
                ]
# Tags whose values are replaced by a hashed UID; passed to hash_sequence(),
# which recurses into any of these tags that hold a sequence.
tags_to_hash = [0x0020000D,0x0020000E,0x00200052,0x00200200,
                0x0040A124,0x00880140,0x00400555,0x0040A073,0x0040A730,
                0x00080014,0x00080018,0x00080118,0x00081155,0x00081120,0x00083010,
                0x00081110,0x00081111,0x00081140,0x00082112,0x00081250,0x00089121,
                0x00400513,0x00400562,0x00400610,0x00340001,0x00081084,0x0009100A,0x00091013,0x00091056,0x00091057,0x00091059,0x0009105C,0x0009105D,0x0009105E,0x00091098,
                0x00091097,0x000910AD,0x00091007,0x000910e3,0x00431088,0x00431098,0x00451050,0x00451051,0x0008010c,   #discrepancy
                0x00080110,0x0040a375,0x00081199,0x0040a504,0x00081115]     #discrepancy sequence
# Tags that add_missing_tags() creates (with an empty value) when absent.
tag_to_add = [0x00080068,0x00180060,0x00185101,0x00187004,0x00200012,0x00201040,0x20500020]
# Tags whose empty values set_empty_tags_to_add() replaces with "NA".
# Read as a module-level global inside deid_dicom_directory().
tag_not_null = [0x00080068,0x00181000,0x00200062,0x20500020,0x00402001]
# Free-text tags scrubbed by remove_sensitive_info().
# Read as a module-level global inside deid_dicom_directory().
sensitive_tags = [0x00081030,0x0008103e,0x001021b0,0x00102000,0x00104000,0x00181030,0x00184000,0x00204000,
                  0x00321060,0x00324000,0x011710c4,0x01171024,0x00400275,0x00400007,0x00401400,0x00400310,0x00200011,0x0040a730,0x0040a160]

In [5]:
# To satisfy the "text not null" requirement: when a tag listed in
# tag_not_null is present but has an empty value, set it to 'NA'.
def set_empty_tags_to_add(ds, tags):
    """Replace empty values of the given tags with the placeholder "NA".

    Args:
        ds: dataset supporting ``tag in ds`` and ``ds[tag].value``.
        tags: iterable of tag identifiers accepted by ``pydicom.tag.Tag``.

    Tags that are absent from ``ds`` are left alone. A value counts as
    empty when it is ``None``, a whitespace-only string, or an empty list.
    The current value of every present tag is printed for debugging.
    """
    def _is_blank(candidate):
        # Mirrors the three "truly empty" cases handled by this function.
        if candidate is None:
            return True
        if isinstance(candidate, str):
            return candidate.strip() == ''
        if isinstance(candidate, list):
            return len(candidate) == 0
        return False

    for raw_tag in tags:
        key = pydicom.tag.Tag(raw_tag)  # normalise to a pydicom Tag object
        if key not in ds:
            continue

        current = ds[key].value
        print(f"Original value for tag {key}: {current}")

        if _is_blank(current):
            ds[key].value = "NA"
            print(f"Modified value for tag {key} to 'NA'")

In [7]:
def apply_id_mapping(ds, id_lookup):
    """Replace ``ds.PatientID`` with its mapped value from ``id_lookup``.

    Args:
        ds: dataset supporting ``'PatientID' in ds`` and the ``PatientID``
            attribute.
        id_lookup: dict mapping original patient IDs to new ones.

    When the dataset has no PatientID nothing happens. When the ID has no
    entry in ``id_lookup`` the original ID is written back unchanged.
    Only PatientID is touched here; PatientName is not modified.
    """
    if 'PatientID' not in ds:
        return
    current_id = ds.PatientID
    # Fall back to the current ID when no mapping exists.
    ds.PatientID = id_lookup[current_id] if current_id in id_lookup else current_id

def modify_patient_name_to_id(ds):
    """Overwrite ``ds.PatientName`` with the value of ``ds.PatientID``.

    Does nothing unless both PatientID and PatientName are present in ``ds``.
    """
    if 'PatientID' not in ds or 'PatientName' not in ds:
        return
    # The name is replaced by the (already mapped) patient ID.
    ds.PatientName = ds.PatientID

In [9]:
def clear_tags(ds, tags_to_clear):
    """Blank the values of the listed tags throughout a dataset.

    Args:
        ds: the dataset to modify in place; anything that is not a
            ``Dataset`` is ignored.
        tags_to_clear: collection of tags tested with ``elem.tag in ...``.

    Elements holding a ``Sequence`` (or a nested ``Dataset``) are descended
    into rather than cleared, even when their own tag is listed. For leaf
    elements whose tag is listed, strings become ``''``, bytes become
    ``b''`` and every other value becomes ``None``. Errors while clearing a
    single element are printed and do not stop the traversal.
    """
    if not isinstance(ds, Dataset):
        return

    for elem in ds:
        payload = elem.value

        if isinstance(payload, Sequence):
            # Walk every item of the sequence.
            for child in payload:
                if isinstance(child, Dataset):
                    clear_tags(child, tags_to_clear)
            continue

        if isinstance(payload, Dataset):
            # Nested dataset: recurse.
            clear_tags(payload, tags_to_clear)
            continue

        if elem.tag not in tags_to_clear:
            continue

        # Leaf element that must be cleared; pick an empty value by type.
        try:
            current = ds[elem.tag].value
            if isinstance(current, str):
                blank = ''
            elif isinstance(current, bytes):
                blank = b''
            else:
                # Numbers, booleans and any other complex type.
                blank = None
            ds[elem.tag].value = blank
        except Exception as e:
            print(f"Error processing tag {elem.tag}: {e}")


In [11]:
def load_offsets_from_csv(shift_csv_path):
    """Load the per-patient date offsets from a two-column CSV file.

    The first row is treated as a header and skipped. Every following row
    must be ``patient_id,offset`` where ``offset`` is an integer number of
    days. Surrounding whitespace is stripped from both fields so that a
    row such as ``"P1 , 13"`` still matches the patient ID ``"P1"``.

    Args:
        shift_csv_path: path of the CSV file.

    Returns:
        dict mapping patient ID (str) to offset in days (int). Blank rows
        are ignored; rows with a different column count or a non-integer
        offset are skipped with a printed warning.
    """
    offsets = {}
    # newline='' is what the csv module expects so it can handle line
    # endings (and newlines embedded in quoted fields) itself.
    with open(shift_csv_path, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                continue  # blank line
            if len(row) != 2:
                # A dropped row means that patient gets no date shift, so
                # make it visible instead of skipping silently.
                print(f"Warning: Expected 2 columns on line {reader.line_num}, got {len(row)}. Skipping.")
                continue
            patient_id, offset_str = (field.strip() for field in row)
            try:
                offsets[patient_id] = int(offset_str)
            except ValueError:
                print(f"Warning: Invalid offset value '{offset_str}' for PatientID '{patient_id}'. Skipping.")
    return offsets

In [13]:
def add_missing_tags(ds, tags, vr='LO'):
    """Make sure every tag in ``tags`` exists in ``ds``.

    Args:
        ds: dataset supporting ``(group, element) in ds`` and ``add_new``.
        tags: iterable of 32-bit integer tags (group in the high 16 bits).
        vr: value representation used for every tag that has to be created.

    Missing tags are added with an empty string value; a message is printed
    for each tag either way.

    NOTE(review): the same ``vr`` is applied to all created tags regardless
    of which tag it is -- confirm 'LO' is appropriate for every entry the
    caller passes.
    """
    for packed in tags:
        # Split the packed integer into its (group, element) halves.
        group, element = divmod(packed, 0x10000)
        tag_tuple = (group, element)

        if tag_tuple in ds:
            print(f"Tag {tag_tuple} already exists in the dataset.")
            continue

        ds.add_new(tag_tuple, vr, '')
        print(f"Added missing tag {tag_tuple} with an empty value and VR '{vr}' to dataset.")

In [15]:
def shift_datetime(date_str, days):
    """Shift a date / datetime value back in time by ``days`` days.

    Supported inputs:
        * ``YYYYMMDD`` (8 characters)
        * ``YYYYMMDDHHMMSS`` (14 characters)
        * ``YYYYMMDDHHMMSS`` followed by a suffix that starts with ``.``,
          ``+`` or ``-`` (fractional seconds of any length and/or a UTC
          offset), or any 17-character value; the suffix is kept verbatim
        * a 10-digit string or an ``int``, interpreted as a Unix timestamp

    Args:
        date_str: the value to shift (``str`` or ``int``).
        days: number of days to subtract; must be an ``int``.

    Returns:
        The shifted value as a string in the same format as the input.
        The input is returned unchanged (after printing a message) when
        ``days`` is not an int, the format is not recognised, or parsing
        fails.
    """
    try:
        # Ensure 'days' is an integer
        if not isinstance(days, int):
            raise ValueError(f"Date increment should be an integer, got {type(days)}")

        delta = timedelta(days=days)
        # Unix timestamps are shifted with plain arithmetic. Converting
        # through a naive UTC datetime and back with .timestamp() treats the
        # naive value as local time and skews the result by the machine's
        # UTC offset.
        delta_seconds = days * 86400

        if isinstance(date_str, str):
            length = len(date_str)

            if length == 8:  # Format YYYYMMDD
                shifted = datetime.strptime(date_str, '%Y%m%d') - delta
                return shifted.strftime('%Y%m%d')

            if length == 14:  # Format YYYYMMDDHHMMSS
                shifted = datetime.strptime(date_str, '%Y%m%d%H%M%S') - delta
                return shifted.strftime('%Y%m%d%H%M%S')

            # YYYYMMDDHHMMSS plus a fraction and/or UTC-offset suffix. Only
            # the leading 14 characters are shifted; the suffix is preserved.
            has_suffix = length > 14 and date_str[14] in '.+-'
            if length == 17 or has_suffix:
                date_part, suffix = date_str[:14], date_str[14:]
                shifted = datetime.strptime(date_part, '%Y%m%d%H%M%S') - delta
                return shifted.strftime('%Y%m%d%H%M%S') + suffix

            if length == 10 and date_str.isdigit():  # Format Unix timestamp
                return str(int(date_str) - delta_seconds)

            print(f"Unknown date format: {date_str}")
            return date_str  # Return original if format is unknown

        if isinstance(date_str, int):  # Integer input (Unix timestamp)
            # Subtract, consistent with every string branch above.
            return str(date_str - delta_seconds)

        print(f"Unhandled date input type: {type(date_str)}")
        return date_str  # Return original if the type is not supported
    except ValueError as e:
        print(f"ValueError processing date string '{date_str}': {e}")
        return date_str  # Return original if date parsing fails
    except Exception as e:
        print(f"Error processing date string '{date_str}': {e}")
        return date_str  # Return original if an unexpected error occurs

def update_tag(ds, tag, date_increment):
    """Shift the date value stored under ``tag`` in ``ds`` in place.

    Args:
        ds: dataset supporting ``tag in ds`` and ``ds[tag].value``.
        tag: the tag to update.
        date_increment: integer number of days handed to ``shift_datetime``.

    Nothing happens when the tag is absent, when ``date_increment`` is not
    an int (a message is printed), or when the stored value is neither a
    string nor an integer. The value before and after shifting is printed.
    String values are written back as strings; integer values are written
    back as integers when the shifted text is all digits.
    """
    if tag not in ds:
        return

    stored = ds[tag].value

    # Ensure date_increment is an integer
    if not isinstance(date_increment, int):
        print(f"Invalid date increment: {date_increment}")
        return

    # shift_datetime is always given text here; other types are skipped.
    if isinstance(stored, int):
        as_text = str(stored)
    elif isinstance(stored, str):
        as_text = stored
    else:
        return

    print(f"Original value for tag {tag}: {as_text}")
    shifted = shift_datetime(as_text, date_increment)
    print(f"New value for tag {tag}: {shifted}")

    # Write the result back using the stored value's own type.
    if isinstance(stored, str):
        ds[tag].value = shifted
    elif isinstance(stored, int) and shifted.isdigit():
        ds[tag].value = int(shifted)
    else:
        print(f"Cannot set new value for tag {tag}: {shifted}")

def process_item(item, tags_to_shift, date_increment):
    """Recursively shift dates inside a nested dataset or sequence.

    Args:
        item: a ``Dataset`` or ``Sequence``; any other type is ignored.
        tags_to_shift: tags handed to ``update_tag`` for every dataset.
        date_increment: number of days to shift by.

    For a dataset, every tag in ``tags_to_shift`` is updated first and then
    each element holding a dataset or sequence is descended into. For a
    sequence, each of its items is processed in turn.
    """
    if isinstance(item, Sequence):
        for child in item:
            process_item(child, tags_to_shift, date_increment)
        return

    if not isinstance(item, Dataset):
        return

    for tag in tags_to_shift:
        update_tag(item, tag, date_increment)

    for element in item:
        nested = element.value
        if isinstance(nested, (Dataset, Sequence)):
            process_item(nested, tags_to_shift, date_increment)

def shift_dates(ds, tags_to_shift, patient_offsets):
    """Shift every listed date tag of ``ds`` by the patient's offset.

    Args:
        ds: dataset supporting ``tag in ds`` and ``ds[tag].value``.
        tags_to_shift: tags to process.
        patient_offsets: dict mapping patient ID to offset in days.

    The offset is looked up with the value of tag (0010,0020); a patient
    with no entry gets an offset of 0. When tag (0010,0020) is absent
    nothing is shifted. Listed tags holding a dataset or sequence are
    processed recursively via ``process_item``; all others go through
    ``update_tag``. Any exception is printed, not raised.

    NOTE(review): a missing offset silently leaves the dates as they are --
    confirm that is acceptable for de-identification.
    """
    patient_id_tag = (0x0010, 0x0020)
    try:
        if patient_id_tag not in ds:
            return

        increment = patient_offsets.get(ds[patient_id_tag].value, 0)

        for tag in tags_to_shift:
            if tag not in ds:
                continue
            target = ds[tag].value
            if isinstance(target, (Dataset, Sequence)):
                process_item(target, tags_to_shift, increment)
            else:
                update_tag(ds, tag, increment)
    except Exception as e:
        print(f"Error shifting dates: {e}")


In [17]:
def hash_uid(uid):
    """Derive a 19-digit decimal string from the SHA-256 hash of ``uid``.

    Args:
        uid: the text to hash (must support ``.encode()``).

    Returns:
        The first 19 characters of the digest's decimal representation,
        zero-padded on the left to at least 19 digits first.
    """
    digest = hashlib.sha256(uid.encode()).digest()
    # Big-endian interpretation of the digest equals int(hexdigest, 16).
    as_integer = int.from_bytes(digest, byteorder='big')
    return f"{as_integer:019d}"[:19]

def hash_sequence(ds, tags_to_hash, uid_root, uid_mapping, patient_id=None):
    """Replace the values of the listed tags with hashed UIDs, in place.

    Args:
        ds: the dataset to modify; anything that is not a
            ``pydicom.Dataset`` is ignored.
        tags_to_hash: tags whose values are replaced.
        uid_root: prefix for generated UIDs.
        uid_mapping: dict of original value -> new UID; it is both
            consulted and extended, so a value seen before always maps to
            the same new UID.
        patient_id: inserted into the prefix of generated UIDs.

    Listed tags that are absent or have a falsy value are skipped. Listed
    tags holding a sequence are descended into item by item. New UIDs have
    the form ``{uid_root}{patient_id}.8.117.{hash_uid(original)}``.
    """
    if not isinstance(ds, pydicom.Dataset):
        return

    # Prefix shared by every UID generated for this patient.
    prefix = f"{uid_root}{patient_id}.8.117."

    for tag in tags_to_hash:
        if tag not in ds or not ds[tag].value:
            continue

        current = ds[tag].value

        if isinstance(current, pydicom.sequence.Sequence):
            for item in current:
                hash_sequence(item, tags_to_hash, uid_root, uid_mapping, patient_id)
            continue

        # Reuse an earlier mapping when there is one; otherwise create and
        # record a new UID for this original value.
        if current not in uid_mapping:
            uid_mapping[current] = f"{prefix}{hash_uid(current)}"
        ds[tag].value = uid_mapping[current]


In [19]:
def save_uid_mapping(uid_mapping, file_path):
    """Write the UID mapping to a CSV file with header ``id_old,id_new``.

    Args:
        uid_mapping: dict of original UID -> new UID.
        file_path: destination path; the file is overwritten.

    A pair is written only when its new UID is not itself a key of
    ``uid_mapping``; otherwise a warning is printed and the pair is left
    out of the file.
    """
    with open(file_path, mode='w', newline='') as handle:
        out = csv.writer(handle)
        out.writerow(['id_old', 'id_new'])
        for source_uid in uid_mapping:
            target_uid = uid_mapping[source_uid]
            if target_uid not in uid_mapping:
                out.writerow([source_uid, target_uid])
                continue
            # The new UID collides with an original UID in the mapping.
            print(f"Warning: {target_uid} already exists in uid_mapping.")

In [21]:
# Regular expressions used to recognise sensitive information in free text.
# They are applied, in a fixed order, by remove_sensitive_info_from_value().
NUMBER1_PATTERN = re.compile(r'\b\d{3}-\d{2}-\d{4}\b')
date_pattern = re.compile(r'(19|20)\d{6}')        # Date format (e.g. 20160730); the first two digits are fixed to 19/20
NAME1_PATTERN = re.compile(r'\bfor [A-Z][a-z]+\s[A-Z][a-z]+\b')   # Matches a person name of two capitalised words; restricted to initial capital only / (two all-uppercase names joined by ^)
NAME2_PATTERN = re.compile(r'\b[A-Z][a-z]+\s[A-Z][a-z]+ :')
#NAME2_PATTERN = re.compile(r'\b[A-Z]+\^[A-Z]+\b')
#UPPERCASE_WORD_PATTERN = re.compile(r'\b[A-Z]+\b')  # Matches all-uppercase words; write the specific strings instead
#PID_PATTERN = re.compile(r'\b\d{9}\b|\b\d{10}\b')  # Matches nine- and ten-digit numbers; extract the current pid

# NOTE(review): the '.' separators in PHONE4_PATTERN and PHONE7_PATTERN are
# unescaped, so they match any character (including a digit), not only a
# literal dot -- confirm this breadth is intended.
PHONE1_PATTERN = re.compile(r'\d{1}-\d{3}-\d{3}-\d{4}x\d{1}')
PHONE2_PATTERN = re.compile(r'\(\d{3}\)\d{3}-\d{4}x\d{2}')           # (area code)exchange number-extension
PHONE3_PATTERN = re.compile(r'\(\d{3}\)\d{3}-\d{4}')
PHONE4_PATTERN = re.compile(r'\d{3}.\d{3}.\d{4}x\d{3}')              # (area code)exchange number-extension
PHONE5_PATTERN = re.compile(r'\+\d{1}-\d{3}-\d{3}-\d{4}')             # Original note: "recognise before 5"
PHONE6_PATTERN = re.compile(r'\b\d{1}-\d{3}-\d{3}-\d{4}\b')
PHONE7_PATTERN = re.compile(r'\d{3}.\d{3}.\d{4}')
PHONE8_PATTERN = re.compile(r'\d{3}-\d{3}-\d{4}')
PHONE9_PATTERN = re.compile(r'call \d{10}')

ABS1_PATTERN = re.compile(r'\bat\s[A-Z]+\b')                             # e.g. "by AH" is replaced with "by" / "at"
ABS2_PATTERN = re.compile(r'\bby\s[A-Z]+\b')

ADDRESS_PATTERN1 = re.compile(r'\d+ [A-Za-z0-9\s]+, [A-Z]{2} \d{5}')
ADDRESS_PATTERN2 = re.compile(r'[A-Za-z0-9\s]+, [A-Z]{2} \d{5}')
# Final version:
HISTORY1_PATTERN = re.compile(r'\b\w+,\s\w+\sand\s\w+\sMedical Center\b')     #Aa, Bb and Cc Medical Center
HISTORY2_PATTERN = re.compile(r'\b\w+\sand\s\w+\sMedical Center\b')           #Aa and Bb Medical Center
HISTORY3_PATTERN = re.compile(r'\b[A-Z][a-z]+-[A-Z][a-z]+ Medical Center\b')  #Aa-Bb Medical Center
HISTORY4_PATTERN = re.compile(r'\b[A-Z][a-z]+ Medical Center\b')              #Aa Medical Center
HISTORY5_PATTERN = re.compile(r'\b\w+,\s\w+\sand\s\w+\sMedical Clinic\b')    #Aa, Bb and Cc Medical Clinic
HISTORY6_PATTERN = re.compile(r'\b\w+,\s\w+\sand\s\w+\sGeneral Hospital\b')     #Aa, Bb and Cc General Hospital
HISTORY7_PATTERN = re.compile(r'\b\w+\sand\s\w+\sGeneral Hospital\b')           #Aa and Bb General Hospital
HISTORY8_PATTERN = re.compile(r'\b[A-Z][a-z]+-[A-Z][a-z]+ General Hospital\b')  #Aa-Bb General Hospital
HISTORY9_PATTERN = re.compile(r'\b[A-Z][a-z]+ General Hospital\b')              #Aa General Hospital
HISTORY10_PATTERN = re.compile(r'\b\w+,\s\w+\sand\s\w+\sCommunity Clinic\b')       #Aa, Bb and Cc Community Clinic
HISTORY11_PATTERN = re.compile(r'\b\w+\sand\s\w+\sCommunity Clinic\b')             #Aa and Bb Community Clinic
HISTORY12_PATTERN = re.compile(r'\b[A-Z][a-z]+-[A-Z][a-z]+ Community Clinic\b')   #Aa-Bb Community Clinic
HISTORY13_PATTERN = re.compile(r'\b\w+,\s\w+\sand\s\w+\sCommunity Hospital\b')      #Aa, Bb and Cc Community Hospital
HISTORY14_PATTERN = re.compile(r'\b\w+\sand\s\w+\sCommunity Hospital\b')             #Aa and Bb Community Hospital
HISTORY15_PATTERN = re.compile(r'\b[A-Z][a-z]+-[A-Z][a-z]+ Community Hospital\b')   #Aa-Bb Community Hospital
HISTORY16_PATTERN = re.compile(r'\b\w+,\s\w+\sand\s\w+\sMemorial\b')    #Aa, Bb and Cc Memorial
HISTORY17_PATTERN = re.compile(r'\b\w+\sand\s\w+\sMemorial\b')          #Aa and Bb Memorial
HISTORY18_PATTERN = re.compile(r'\b[A-Z][a-z]+-[A-Z][a-z]+ Memorial\b')  #Aa-Bb Memorial
HISTORY19_PATTERN = re.compile(r'\b[A-Z][a-z]+\s[A-Z][a-z]+ Memorial\b') #Aa Bb Memorial
HISTORY20_PATTERN = re.compile(r'\b[A-Z][a-z]+ Memorial\b')             #Aa Memorial
HISTORY21_PATTERN = re.compile(r'\b\w+,\s\w+\sand\s\w+\sGeneral\b')    #Aa, Bb and Cc General
HISTORY22_PATTERN = re.compile(r'\b\w+\sand\s\w+\sGeneral\b')          #Aa and Bb General
HISTORY23_PATTERN = re.compile(r'\b[A-Z][a-z]+-[A-Z][a-z]+ General\b') #Aa-Bb General
HISTORY24_PATTERN = re.compile(r'\b[A-Z][a-z]+ General\b')             #Aa General
HISTORY25_PATTERN = re.compile(r'\b\w+,\s\w+\sand\s\w+\sClinic\b')       #Aa, Bb and Cc Clinic
HISTORY26_PATTERN = re.compile(r'\b[A-Z][a-z]+\s[A-Z][a-z]+ Clinic\b')   #Aa Bb Clinic
HISTORY27_PATTERN = re.compile(r'\b[A-Z][a-z]+-[A-Z][a-z]+ Clinic\b')    #Aa-Bb Clinic
HISTORY28_PATTERN = re.compile(r'\b[A-Z][a-z]+ Clinic\b')               #Aa Clinic
HISTORY29_PATTERN = re.compile(r'\b\w+,\s\w+\sand\s\w+\sHospital\b')    #Aa, Bb and Cc Hospital
HISTORY30_PATTERN = re.compile(r'\b\w+\sand\s\w+\sHospital\b')          #Aa and Bb Hospital
HISTORY31_PATTERN = re.compile(r'\b[A-Z][a-z]+\s[A-Z][a-z]+ Hospital\b')   #Aa Bb Hospital
HISTORY32_PATTERN = re.compile(r'\b[A-Z][a-z]+-[A-Z][a-z]+ Hospital\b')    #Aa-Bb Hospital
HISTORY33_PATTERN = re.compile(r'\b[A-Z][a-z]+ Hospital\b')               #Aa Hospital

DOCTOR1_PATTERN = re.compile(r'\bDr\.\s[A-Z][a-z]+\b')
DOCTOR2_PATTERN = re.compile(r'\bDR\.[A-Z]+\b')
DOCTOR3_PATTERN = re.compile(r'DR_[A-Z]+')

# Literal substrings removed verbatim by remove_sensitive_info_from_value().
# NOTE(review): str3-str5 are hard-coded 10-digit values -- presumably
# identifiers specific to one data set; confirm before reusing elsewhere.
str1 = "% green/purple high"
str2 = "% white high"
str3 = "3792105090"
str4 = "2929551111"
str5 = "5932656543"

In [23]:
def remove_sensitive_info_from_value(value, patient_id):
    """Scrub sensitive fragments out of a single text value.

    Args:
        value: the value to clean. Strings are processed; ``IS`` values are
            converted to ``str`` and processed; anything else is returned
            untouched.
        patient_id: text removed from the value before any pattern runs, so
            that it cannot later be mistaken for a date.

    Returns:
        The cleaned string, or the original object for unsupported types.

    The substitutions run in a fixed order: exact two-letter codes, the
    patient ID, addresses, institution names, numbers and dates, person
    names, phone numbers, doctor names, "at"/"by" initials, and finally the
    literal strings ``str1`` .. ``str5``.
    """
    if not isinstance(value, str):
        if isinstance(value, IS):
            # Handle IS values through their string form.
            return remove_sensitive_info_from_value(str(value), patient_id)
        return value

    # A value consisting solely of one of these codes is blanked.
    if value in ("AL","CW","CH","RD","RS","DL","KM"):
        value = ''

    # Done first so the ID is not partially removed as a date later on.
    value = value.replace(patient_id, "")

    # (pattern, replacement) pairs; the order is significant.
    substitutions = (
        (ADDRESS_PATTERN1, ''),
        (ADDRESS_PATTERN2, ''),
        (HISTORY1_PATTERN, ''),
        (HISTORY2_PATTERN, ''),
        (HISTORY3_PATTERN, ''),
        (HISTORY4_PATTERN, ''),
        (HISTORY5_PATTERN, ''),
        (HISTORY6_PATTERN, ''),
        (HISTORY7_PATTERN, ''),
        (HISTORY8_PATTERN, ''),
        (HISTORY9_PATTERN, ''),
        (HISTORY10_PATTERN, ''),
        (HISTORY11_PATTERN, ''),
        (HISTORY12_PATTERN, ''),
        (HISTORY13_PATTERN, ''),
        (HISTORY14_PATTERN, ''),
        (HISTORY15_PATTERN, ''),
        (HISTORY16_PATTERN, ''),
        (HISTORY17_PATTERN, ''),
        (HISTORY18_PATTERN, ''),
        (HISTORY19_PATTERN, ''),
        (HISTORY20_PATTERN, ''),
        (HISTORY21_PATTERN, ''),
        (HISTORY22_PATTERN, ''),
        (HISTORY23_PATTERN, ''),
        (HISTORY24_PATTERN, ''),
        (HISTORY25_PATTERN, ''),
        (HISTORY26_PATTERN, ''),
        (HISTORY27_PATTERN, ''),
        (HISTORY28_PATTERN, ''),
        (HISTORY29_PATTERN, ''),
        (HISTORY30_PATTERN, ''),
        (HISTORY31_PATTERN, ''),
        (HISTORY32_PATTERN, ''),
        (HISTORY33_PATTERN, ''),
        (NUMBER1_PATTERN, ''),
        (date_pattern, ''),
        (NAME1_PATTERN, 'for'),
        (NAME2_PATTERN, ':'),
        (PHONE1_PATTERN, ''),
        (PHONE2_PATTERN, ''),
        (PHONE3_PATTERN, ''),
        (PHONE4_PATTERN, ''),
        (PHONE5_PATTERN, ''),
        (PHONE6_PATTERN, ''),
        (PHONE7_PATTERN, ''),
        (PHONE8_PATTERN, ''),
        (PHONE9_PATTERN, 'call'),
        (DOCTOR1_PATTERN, ''),
        (DOCTOR2_PATTERN, ''),
        (DOCTOR3_PATTERN, ''),
        (ABS1_PATTERN, 'at'),
        (ABS2_PATTERN, 'by'),
    )
    for pattern, replacement in substitutions:
        value = pattern.sub(replacement, value)

    # Finally drop the known literal strings.
    for literal in (str1, str2, str3, str4, str5):
        value = value.replace(literal, "")

    return value

def remove_sensitive_info(ds, sensitive_tags, patient_id):
    """Scrub the free-text values of the listed tags in ``ds`` in place.

    Args:
        ds: dataset supporting ``tag in ds`` and ``ds[tag].value``.
        sensitive_tags: tag identifiers accepted by ``pydicom.tag.Tag``.
        patient_id: forwarded to ``remove_sensitive_info_from_value``.

    A listed tag holding a sequence or dataset is processed recursively
    with the same tag list and its container is written back unchanged.
    ``str``, ``IS`` and ``list`` values are passed through
    ``remove_sensitive_info_from_value``; other values are left as they are.
    """
    def _scrub(payload):
        if isinstance(payload, Sequence):
            for entry in payload:
                if isinstance(entry, Dataset):
                    remove_sensitive_info(entry, sensitive_tags, patient_id)
            return payload
        if isinstance(payload, Dataset):
            remove_sensitive_info(payload, sensitive_tags, patient_id)
            return payload
        if isinstance(payload, (str, IS, list)):
            return remove_sensitive_info_from_value(payload, patient_id)
        return payload

    for raw_tag in sensitive_tags:
        key = pydicom.tag.Tag(raw_tag)
        if key not in ds:
            continue
        ds[key].value = _scrub(ds[key].value)

In [25]:
def deid_dicom_directory(input_directory, output_directory, tags_to_clear, id_lookup_file, shift_csv_path, tags_to_shift, tags_to_hash, uid_root, tags_to_add, uid_mapping_path=None):
    """De-identify every ``.dcm`` file under ``input_directory``.

    The directory tree is mirrored below ``output_directory``. For each file
    the steps run in this order: scrub free text, map PatientID and copy it
    into PatientName, clear tags, shift dates, hash UIDs, add missing tags,
    and fill empty required tags with "NA". The collected UID mapping is
    printed and written to a CSV file once all files are processed.

    Args:
        input_directory: root directory searched recursively for ``.dcm``
            files (case-insensitive extension match).
        output_directory: root directory for the processed files; created
            when missing.
        tags_to_clear: tags passed to ``clear_tags``.
        id_lookup_file: text file with one ``original_id,new_id`` pair per
            line.
        shift_csv_path: CSV file read by ``load_offsets_from_csv``.
        tags_to_shift: tags passed to ``shift_dates``.
        tags_to_hash: tags passed to ``hash_sequence``.
        uid_root: prefix of every generated UID.
        tags_to_add: tags passed to ``add_missing_tags``.
        uid_mapping_path: destination of the UID mapping CSV. When omitted,
            the module-level name ``file_path`` is used, as in earlier
            versions of this function.

    Raises:
        NameError: if ``uid_mapping_path`` is omitted and no module-level
            ``file_path`` exists.
        ValueError: if a non-blank line of ``id_lookup_file`` does not hold
            exactly two comma-separated fields.

    Note:
        ``sensitive_tags`` and ``tag_not_null`` are read from module level.
        A file without PatientID stops the run, because ``ds.PatientID`` is
        read unconditionally before the text scrubbing step.
    """
    # Resolve the mapping destination up front so a missing global fails
    # before any file is processed rather than after the whole run.
    if uid_mapping_path is None:
        uid_mapping_path = file_path

    os.makedirs(output_directory, exist_ok=True)

    uid_mapping = {}

    # Load ID mapping and date offsets
    id_lookup = {}
    with open(id_lookup_file, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                continue  # tolerate blank lines
            fields = stripped.split(',')
            if len(fields) != 2:
                # Stop here: skipping the line would leave that patient's
                # original ID in the output.
                raise ValueError(
                    f"Line {line_number} of {id_lookup_file} must contain exactly "
                    f"two comma-separated values, found {len(fields)}"
                )
            original_id, new_id = fields
            id_lookup[original_id] = new_id

    patient_offsets = load_offsets_from_csv(shift_csv_path)

    for root, _dirs, files in os.walk(input_directory):
        for file in files:
            if not file.lower().endswith('.dcm'):
                continue

            input_path = os.path.join(root, file)
            relative_path = os.path.relpath(input_path, input_directory)
            output_path = os.path.join(output_directory, relative_path)

            # Create output directory if it does not exist
            os.makedirs(os.path.dirname(output_path), exist_ok=True)

            ds = pydicom.dcmread(input_path, force=True)

            # Partial removal: scrub free text using the ORIGINAL patient ID,
            # before the ID is remapped below.
            patient_id = ds.PatientID
            remove_sensitive_info(ds, sensitive_tags, patient_id)

            # Apply ID mapping
            apply_id_mapping(ds, id_lookup)
            modify_patient_name_to_id(ds)

            # Patient ID (after mapping) used when building new UIDs.
            # uid_root is taken from the caller; it used to be overwritten
            # here with a hard-coded value, which ignored the argument.
            patient_id = ds.PatientID if 'PatientID' in ds else 'UnknownPatientID'

            # Clear specified tags
            clear_tags(ds, tags_to_clear)

            # Shift dates based on patient offsets
            shift_dates(ds, tags_to_shift, patient_offsets)

            # Update (0002,0003) in the file meta information when present.
            if ds.file_meta and 'MediaStorageSOPInstanceUID' in ds.file_meta:
                original_uid = ds.file_meta.MediaStorageSOPInstanceUID
                new_uid = hash_uid(original_uid)
                ds.file_meta.MediaStorageSOPInstanceUID = f"{uid_root}{patient_id}.8.117.{new_uid}"
                # Recorded so hash_sequence maps the same original UID to
                # the same new UID inside the dataset.
                uid_mapping[original_uid] = ds.file_meta.MediaStorageSOPInstanceUID

            # Hash other specified tags in the dataset and recursively process sequences
            hash_sequence(ds, tags_to_hash, uid_root, uid_mapping, patient_id)

            # Add the tag if it does not exist
            add_missing_tags(ds, tags_to_add)

            # Replace empty values of the tags in `tag_not_null` with "NA"
            set_empty_tags_to_add(ds, tag_not_null)

            # Save modified DICOM file
            ds.save_as(output_path)
            print(f"Processed file saved: {output_path}")

    print("UID Mapping Contents:")
    for original_uid, new_uid in uid_mapping.items():
        print(f"Original UID: {original_uid}, New UID: {new_uid}")
    # Save the UID mapping to a CSV file
    save_uid_mapping(uid_mapping, uid_mapping_path)

In [1]:
# Run configuration: input / output locations and the UID prefix.
# NOTE(review): these are absolute paths on one user's machine; adjust them
# before running elsewhere.
directory = "C:/Users/珊珊/Desktop/test/input"  # root folder searched for .dcm files
id_lookup_file =  "C:/Users/珊珊/Desktop/test/patient_id_mapping.csv"  # original_id,new_id pairs
output_dir = "C:/Users/珊珊/Desktop/test/output"  # where processed files are written
csv_path = 'C:/Users/珊珊/Desktop/test/shift.csv'  # per-patient date offsets
uid_root="1.2.375.4.5."  # passed to deid_dicom_directory as its uid_root argument
file_path='C:/Users/珊珊/Desktop/test/uid_mapping.csv'  # read as a global by deid_dicom_directory when saving the UID mapping

In [31]:
# Run the de-identification over the configured input directory.
deid_dicom_directory(directory, output_dir, tags_to_clear, id_lookup_file, csv_path, tags_to_shift, tags_to_hash, uid_root,tag_to_add)

Original value for tag 524306: 20161013
New value for tag 524306: 20160930
Original value for tag 524320: 20161013
New value for tag 524320: 20160930
Original value for tag 524321: 20161013
New value for tag 524321: 20160930
Original value for tag 524322: 20161013
New value for tag 524322: 20160930
Original value for tag 524323: 
Unknown date format: 
New value for tag 524323: 
Original value for tag 524330: 20161013125441
New value for tag 524330: 20160930125441
Original value for tag 4194884: 20161013
New value for tag 4194884: 20160930
Added missing tag (8, 104) with an empty value and VR 'LO' to dataset.
Added missing tag (24, 96) with an empty value and VR 'LO' to dataset.
Tag (24, 20737) already exists in the dataset.
Added missing tag (24, 28676) with an empty value and VR 'LO' to dataset.
Added missing tag (32, 18) with an empty value and VR 'LO' to dataset.
Added missing tag (32, 4160) with an empty value and VR 'LO' to dataset.
Tag (8272, 32) already exists in the dataset.
Or

