<a href="https://colab.research.google.com/github/ykitaguchi77/manipulate_CSV/blob/master/Extract_specific_disease_files_local.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 特定の疾患の患者の画像ファイルを、眼位写真Newから抜き出す

In [None]:
import pandas as pd

# shift_jis での読み込みを試みる
try:
    df = pd.read_csv("F:\●DiseaseInfo_all_connected_20230907.csv", encoding="shift_jis", header=None)
except UnicodeDecodeError:
    # cp932 での読み込みを試みる
    df = pd.read_csv("F:\●DiseaseInfo_all_connected_20230907.csv", encoding="cp932", header=None)

# 「眼窩腫瘍」あるいは「涙腺腫瘍」が含まれている行を抜き出す
filtered_df = df[df.apply(lambda row: row.astype(str).str.contains('眼窩腫瘍|涙腺腫瘍').any(), axis=1)]

filtered_df

filtered_df.to_csv(r"D:\ダウンロード\filtered_diseases_眼窩腫瘍_涙腺腫瘍.csv", index=False, header=0, encoding="shift_jis")



In [None]:
# C列 (2番目のカラム) のデータをリストに変換
patient_number_list = filtered_df.iloc[:, 2].tolist()
patient_number_list


In [None]:
"""
特定の数字リストに基づいて眼位写真NEW内のファイルをフィルタリングし、マッチするファイルのフルパスをリストに保存

"""

import os
import re

# 1. Create an empty list
resulting_files = []

# 2. Iterate over all files in the specified directory
# Note: This code cannot be run in this environment, but is provided for reference.
for filename in os.listdir("F:\\眼位写真NEW"):
    # 3. Extract the number from the basename
    base_name = os.path.basename(filename)
    match = re.match(r"^\d+", base_name)
    if match:
        number = int(match.group(0))
        # 4. Check if the number is in the specified list
        if number in patient_number_list:
            resulting_files.append(os.path.join("F:\\眼位写真NEW", filename))

# Output the resulting_files list (commented out since we cannot run the os.listdir in this environment)
resulting_files


In [None]:
def extract_dates_and_years(file_paths):
    # 日付を抜き出す
    dates = []
    for path in file_paths:
        date_match = re.search(r'(\d{8})', path)
        if date_match:
            dates.append(date_match.group(1))

    # 日付から年を抜き出す
    years = [date[:4] for date in dates]

    return dates, years

def filter_files_by_year(file_paths, start_year):
    _, years = extract_dates_and_years(file_paths)

    # Filtering files based on the year
    filtered_files = [file for file, year in zip(file_paths, years) if int(year) >= start_year]

    return filtered_files

# Filtering the resulting_files list for files from 2017 onwards
filtered_files_2017_onwards = filter_files_by_year(resulting_files, 2017)
filtered_files_2017_onwards


In [None]:
len(filtered_files_2017_onwards)

1533

In [None]:
from tqdm import tqdm

destination_folder = "F:\\眼位写真\\眼位写真_眼窩涙腺腫瘍"

# If the folder already exists, delete it
if os.path.exists(destination_folder):
    shutil.rmtree(destination_folder)
os.makedirs(destination_folder)

# Copy files to the destination folder with tqdm progress bar
for file_path in tqdm(filtered_files_2017_onwards, desc="Copying files"):
    shutil.copy(file_path, destination_folder)

Copying files: 100%|███████████████████████████████████████████████████████████████| 1533/1533 [00:20<00:00, 74.83it/s]


In [None]:
data = []

for path in resulting_files:
    matches = re.findall(r'(\d+)-(\d+)-\d+-\d+_(\w+)\.', path)
    if matches:
        row = []
        for match in matches[0]:
            if match.isdigit():
                row.append(int(match))
            else:
                row.append(match)
        data.append(row)

df = pd.DataFrame(data, columns=['Patient_num', 'date', 'Hash'])
df[['Patient_id', 'Hertel_R', 'Hertel_L']] = None #空の行を追加
df = df[['Patient_id', 'Patient_num', 'date', 'Hash', 'Hertel_R', 'Hertel_L']] #並べ替え


df

In [None]:
filtered_df

In [69]:
# filtered_dfの2列目とdfのPatient_numが一致する場合、
# filtered_dfの0列目の内容をdfのPatient_id列に書き込む
for index, row in df.iterrows():
    matching_rows = filtered_df[filtered_df.iloc[:, 2] == row['Patient_num']]
    if not matching_rows.empty:
        df.at[index, 'Patient_id'] = "{:08}".format(int(matching_rows.iloc[0, 0]))

# Drop the 'Hash' column
try:
    df = df.drop(columns=['Hash'])
except:
    pass

# Drop duplicate rows based on 'Patient_num'
df.drop_duplicates(subset=['Patient_num', 'date'], inplace=True)

# Sort the dataframe by 'date'
df = df.sort_values(by='date')

# Sort the dataframe by 'Patient_num'
df = df.sort_values(by='Patient_num')



# Reset the index
df = df.reset_index(drop=True)



df


Unnamed: 0,Patient_id,Patient_num,date,Hertel_R,Hertel_L
0,11194169,14,20190604,,
1,11194169,14,20190717,,
2,11194169,14,20200311,,
3,11194169,14,20190613,,
4,07955398,55,20020123,,
...,...,...,...,...,...
1225,11798971,9767,20210203,,
1226,11798971,9767,20200108,,
1227,11798971,9767,20200527,,
1228,11798971,9767,20200701,,


In [68]:
df.to_csv(r"D:\ダウンロード\filtered_diseases_眼窩涙腺腫瘍_Hertel.csv", index=False, header=True, encoding="shift_jis")


In [49]:
df

Unnamed: 0,Patient_id,Patient_num,date,Hertel_R,Hertel_L
0,11194169,14,20200311,,
1,11194169,14,20190604,,
2,11194169,14,20190613,,
3,11194169,14,20190717,,
4,07955398,55,20020205,,
...,...,...,...,...,...
1225,11798971,9767,20201007,,
1226,11798971,9767,20200701,,
1227,11798971,9767,20200108,,
1228,11798971,9767,20191108,,
