<a href="https://colab.research.google.com/github/ykitaguchi77/Strabismus_AI_project/blob/main/DataSplit(stratified_one_subject_leave_out).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**Data_split for one-subject-leave-out stratified 5-fold crossvalidation**

In [None]:
"""
Leave one subject out cross validation + 5-fold stratified cross validation

・1症例を抜き出し、その症例のすべての画像をテスト画像とする
・残りの症例の内斜視、外斜視、斜視なし群を、同じ症例が群をまたがないように5分割する。
・5分割したデータセットのうち4つをtraining、1つをvalidationとして用いてトレーニングを行い、抜き出した1症例のそれぞれの画像のおける正解率を算出する。これを5回繰り返してcross validationとする。

"""

In [7]:
import codecs
import pandas as pd
import os
import numpy as np
import shutil
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import StratifiedGroupKFold
import pandas as pd
from PIL import Image
pd.set_option('display.max_rows', 500)

gla_ortho_path = r"F:\先天性緑内障\data_list\Disease_ortho.csv"
gla_eso_path = r"F:\先天性緑内障\data_list\Disease_ET.csv"
gla_exo_path = r"F:\先天性緑内障\data_list\Disease_XT.csv"
cont_ortho_path = r"F:\先天性緑内障\data_list\Disease_ortho.csv"
cont_eso_path = r"F:\先天性緑内障\data_list\Control_ET.csv"
cont_exo_path = r"F:\先天性緑内障\data_list\Control_XT.csv"
dst_path = r"F:\先天性緑内障\OneGroupLeaveOut"

def opencsv(path, classname):
    #with codecs.open(path, "r", "utf-8", "ignore") as file:
    with codecs.open(path, "r", "Shift-JIS", "ignore") as file:
        df = pd.read_csv(file, index_col=None, header=None)
        df.insert(0, 'ID', '')
        df.insert(0, 'classes', '')
        for row in range(len(df)):
            df.iloc[row,0] = classname #class
            df.iloc[row,1] = os.path.basename(df.iloc[row,2]).split("_")[0] #ID
    return df

def resize_and_save_img(in_path, out_path):
    img = Image.open(in_path)
    img_new = expand2square(img, (0, 0, 0)).resize((250, 250))
    img_new.save(out_path)


def expand2square(pil_img, background_color):
    width, height = pil_img.size
    if width == height:
        return pil_img
    elif width > height:
        result = Image.new(pil_img.mode, (width, width), background_color)
        result.paste(pil_img, (0, (width-height)//2))
        return result
    else:
        result = Image.new(pil_img.mode, (height, height), background_color)
        result.paste(pil_img, (0, (height - width) // 2))
        return result


#症例のリストをpandasで開く
df_gla_ortho = opencsv(gla_ortho_path, "gla_ortho") 
df_gla_eso = opencsv(gla_eso_path, "gla_eso")
df_gla_exo = opencsv(gla_exo_path, "gla_exo")
df_cont_ortho = opencsv(cont_ortho_path, "cont_ortho")
df_cont_eso = opencsv(cont_eso_path, "gla_eso")
df_cont_exo = opencsv(cont_exo_path, "gla_exo")

df_gla_all = pd.concat([df_gla_ortho, df_gla_eso, df_gla_exo], axis=0)
df_cont_all = pd.concat([df_cont_ortho, df_cont_eso, df_cont_exo], axis=0)
df_all = pd.concat([df_cont_ortho, df_cont_eso, df_cont_exo], axis=0)

#df_all.to_csv(r"F:\先天性緑内障\data_list\df_all.csv", encoding='utf-8-sig', index=0, header=None)

In [10]:
#それぞれの項目（path, classes, ID）をリスト化
gla_dataset_path = df_gla_all.iloc[:,2].transpose().values
gla_classes = df_gla_all.iloc[:,0].transpose().values
gla_id = df_gla_all.iloc[:,1].transpose().values
cont_dataset_path = df_cont_all.iloc[:,2].transpose().values
cont_classes = df_cont_all.iloc[:,0].transpose().values
cont_id = df_cont_all.iloc[:,1].transpose().values

#print(len(gla_dataset_path))



#保存先フォルダを作成
if os.path.exists(dst_path):
    pass
    #shutil.rmtree(dst_path)
os.makedirs(dst_path, exist_ok=True)


#まずglaのデータセットから1人分を抜き出す（LeaveOneGroupOut)
logo = LeaveOneGroupOut()
logo.get_n_splits(gla_dataset_path, gla_classes, gla_id)
logo.get_n_splits(groups=gla_id)  # 'groups' is always required

for remain_index, test_index in logo.split(gla_dataset_path, gla_classes, gla_id):
    #print("TRAIN:", train_index, "TEST:", test_index)
    gla_dataset_path_remain, gla_dataset_path_test = gla_dataset_path[remain_index], gla_dataset_path[test_index]
    gla_classes_remain, gla_classes_test = gla_classes[remain_index], gla_classes[test_index]
    gla_id_remain, gla_id_test = gla_id[remain_index], gla_id[test_index]
    #print(gla_dataset_path, gla_dataset_path_test, gla_id_train, gla_id_test)
    print(gla_id_test)
    #print(gla_id_train)

    #抜き出したデータのIDでフォルダを作成
    os.makedirs(os.path.join(dst_path, gla_id_test[0]), exist_ok=True)
    os.chdir(os.path.join(dst_path, gla_id_test[0]))
    for i in range(5):
        for j in ["train", "val"]:
            for k in ["gla", "cont"]:
                os.makedirs(os.path.join(str(i), j, k), exist_ok=True)
    os.makedirs("test", exist_ok=True) #判定のための画像
    
    #testフォルダにコピー
    for file in gla_dataset_path_test:
        #print(file)
        shutil.copyfile(file, "./test/"+ os.path.basename(file))
    

    #抜き出した残りのglaについてStratified group 5-foldをかける
    cv = StratifiedGroupKFold(n_splits=5)
    m=0
    for train_idxs, val_idxs in cv.split(gla_dataset_path_remain, gla_classes_remain, gla_id_remain):
        #print("TRAIN:", gla_classes_remain[train_idxs])
        #print("      ", gla_id_remain[train_idxs])
        #print("      ", gla_dataset_path_remain[train_idxs])
        #print(" TEST:", gla_classes_remain[val_idxs])
        #print("      ", gla_id_remain[val_idxs])
        #print("      ", gla_dataset_path_remain[val_idxs])
        for idx in train_idxs:
            #print(gla_dataset_path_remain[idx])
            #print("./"+str(m)+"/train/gla/"+os.path.basename(gla_dataset_path_remain[idx]))
            print("gla_train", str(idx), str(m))
            #250pxにリサイズして保存
            #resize_and_save_img(gla_dataset_path_remain[idx], "./"+str(m)+"/train/gla/"+os.path.basename(gla_dataset_path_remain[idx]))
            #shutil.copyfile(gla_dataset_path_remain[idx], "./"+str(m)+"/train/gla/"+os.path.basename(gla_dataset_path_remain[idx]))
        for idx in val_idxs:
            print("gla_val", str(idx), str(m))
            #resize_and_save_img(gla_dataset_path_remain[idx], "./"+str(m)+"/val/gla/"+os.path.basename(gla_dataset_path_remain[idx]))
            #shutil.copyfile(gla_dataset_path_remain[idx], "./"+str(m)+"/val/gla/"+os.path.basename(gla_dataset_path_remain[idx]))
        print("Making "+str(m+1)+"/5 crossvalidation folders")
        m+=1

    #cont全体についてもStratified group-foldをかける
    m=0
    for train_idxs, val_idxs in cv.split(cont_dataset_path, cont_classes, cont_id):
        #print("TRAIN:", cont_classes[train_idxs])
        #print("      ", cont_id[train_idxs])
        #print("      ", cont_dataset_path[train_idxs])
        #print(" TEST:", cont_classes[val_idxs])
        #print("      ", cont_id[val_idxs])
        #print("      ", cont_dataset_path[val_idxs])
        for idx in train_idxs:
            #print(cont_dataset_path[idx])
            #print("./"+str(m)+"/train/cont/"+os.path.basename(cont_dataset_path_remain[idx]))
            
            print("cont_train", str(idx), str(m))
            #resize_and_save_img(cont_dataset_path[idx], "./"+str(m)+"/train/cont/"+os.path.basename(cont_dataset_path[idx]))
            #shutil.copyfile(cont_dataset_path[idx], "./"+str(m)+"/train/cont/"+os.path.basename(cont_dataset_path[idx]))
        for idx in val_idxs:
            #resize_and_save_img(cont_dataset_path[idx], "./"+str(m)+"/val/cont/"+os.path.basename(cont_dataset_path[idx]))
            #shutil.copyfile(cont_dataset_path[idx], "./"+str(m)+"/val/cont/"+os.path.basename(cont_dataset_path[idx]))
            print("cont_val", str(idx), str(m))
        
        print("Making "+str(m+1)+"/5 crossvalidation folders")
        m+=1


#Contのデータセットでも同じことをやる
#まずcontのデータセットから1人分を抜き出す（LeaveOneGroupOut)
logo = LeaveOneGroupOut()
logo.get_n_splits(cont_dataset_path, cont_classes, cont_id)
logo.get_n_splits(groups=cont_id)  # 'groups' is always required

for remain_index, test_index in logo.split(cont_dataset_path, cont_classes, cont_id):
    #print("TRAIN:", train_index, "TEST:", test_index)
    cont_dataset_path_remain, cont_dataset_path_test = cont_dataset_path[remain_index], cont_dataset_path[test_index]
    cont_classes_remain, cont_classes_test = cont_classes[remain_index], cont_classes[test_index]
    cont_id_remain, cont_id_test = cont_id[remain_index], cont_id[test_index]
    #print(cont_dataset_path, cont_dataset_path_test, cont_id_train, cont_id_test)
    print(cont_id_test)
    #print(cont_id_train)

    #抜き出したデータのIDでフォルダを作成
    os.makedirs(os.path.join(dst_path, cont_id_test[0]), exist_ok=True)
    os.chdir(os.path.join(dst_path, cont_id_test[0]))
    for i in range(5):
        for j in ["train", "val"]:
            for k in ["cont", "cont"]:
                os.makedirs(os.path.join(str(i), j, k), exist_ok=True)
    os.makedirs("test", exist_ok=True) #判定のための画像
    
    """
    #testフォルダにコピー
    for file in cont_dataset_path_test:
        #print(file)
        shutil.copyfile(file, "./test/"+ os.path.basename(file))
    """

    #抜き出した残りのcontについてStratified group 5-foldをかける
    cv = StratifiedGroupKFold(n_splits=5)
    m=0
    for train_idxs, val_idxs in cv.split(cont_dataset_path_remain, cont_classes_remain, cont_id_remain):
        #print("TRAIN:", cont_classes_remain[train_idxs])
        #print("      ", cont_id_remain[train_idxs])
        #print("      ", cont_dataset_path_remain[train_idxs])
        #print(" TEST:", cont_classes_remain[val_idxs])
        #print("      ", cont_id_remain[val_idxs])
        #print("      ", cont_dataset_path_remain[val_idxs])
        for idx in train_idxs:
            #print(cont_dataset_path_remain[idx])
            #print("./"+str(m)+"/train/cont/"+os.path.basename(cont_dataset_path_remain[idx]))
            
            print("cont_train", str(idx), str(m))
            #resize_and_save_img(cont_dataset_path_remain[idx], "./"+str(m)+"/train/cont/"+os.path.basename(cont_dataset_path_remain[idx]))
            #shutil.copyfile(cont_dataset_path_remain[idx], "./"+str(m)+"/train/cont/"+os.path.basename(cont_dataset_path_remain[idx]))
        for idx in val_idxs:
            #resize_and_save_img(cont_dataset_path_remain[idx], "./"+str(m)+"/val/cont/"+os.path.basename(cont_dataset_path_remain[idx]))
            #shutil.copyfile(cont_dataset_path_remain[idx], "./"+str(m)+"/val/cont/"+os.path.basename(cont_dataset_path_remain[idx]))
            print("cont_val", str(idx), str(m))
        
        print("Making "+str(m+1)+"/5 crossvalidation folders")
        m+=1

    #gla全体についてもStratified group-foldをかける
    m=0
    for train_idxs, val_idxs in cv.split(gla_dataset_path, gla_classes, gla_id):
        #print("TRAIN:", gla_classes[train_idxs])
        #print("      ", gla_id[train_idxs])
        #print("      ", gla_dataset_path[train_idxs])
        #print(" TEST:", gla_classes[val_idxs])
        #print("      ", gla_id[val_idxs])
        #print("      ", gla_dataset_path[val_idxs])
        for idx in train_idxs:
            #print(gla_dataset_path[idx])
            #print("./"+str(m)+"/train/gla/"+os.path.basename(gla_dataset_path_remain[idx]))
            
            print("gla_train", str(idx), str(m))
            #resize_and_save_img(gla_dataset_path[idx], "./"+str(m)+"/train/gla/"+os.path.basename(gla_dataset_path[idx]))
            #shutil.copyfile(gla_dataset_path[idx], "./"+str(m)+"/train/gla/"+os.path.basename(gla_dataset_path[idx]))
        for idx in val_idxs:
            #resize_and_save_img(gla_dataset_path[idx], "./"+str(m)+"/val/gla/"+os.path.basename(gla_dataset_path[idx]))
            #shutil.copyfile(gla_dataset_path[idx], "./"+str(m)+"/val/gla/"+os.path.basename(gla_dataset_path[idx]))
            print("gla_val", str(idx), str(m))
        
        print("Making "+str(m+1)+"/5 crossvalidation folders")
        m+=1

[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
gla_val 165 4
gla_val 166 4
gla_val 167 4
gla_val 168 4
gla_val 201 4
gla_val 202 4
gla_val 203 4
Making 5/5 crossvalidation folders
['994']
cont_train 0 0
cont_train 1 0
cont_train 2 0
cont_train 3 0
cont_train 4 0
cont_train 5 0
cont_train 6 0
cont_train 7 0
cont_train 8 0
cont_train 9 0
cont_train 10 0
cont_train 11 0
cont_train 12 0
cont_train 13 0
cont_train 14 0
cont_train 17 0
cont_train 18 0
cont_train 19 0
cont_train 20 0
cont_train 21 0
cont_train 22 0
cont_train 23 0
cont_train 24 0
cont_train 25 0
cont_train 26 0
cont_train 27 0
cont_train 28 0
cont_train 29 0
cont_train 30 0
cont_train 31 0
cont_train 32 0
cont_train 33 0
cont_train 34 0
cont_train 35 0
cont_train 36 0
cont_train 37 0
cont_train 38 0
cont_train 39 0
cont_train 40 0
cont_train 41 0
cont_train 42 0
cont_train 43 0
cont_train 44 0
cont_train 45 0
cont_train 46 0
cont_train 47 0
cont_train 48 0
cont_train 49 0
cont_train 50 0
cont_train 51 0
cont_train 52 0
cont_tr

In [None]:
os.getcwd()
os.listdir()
shutil.copyfile(r"F:\先天性緑内障\データ引継ぎ\children_control\8_1.jpg", "./"+str(1)+"/train/gla/"+os.path.basename(r"F:\先天性緑内障\データ引継ぎ\children_control\8_1.jpg"))

'./1/train/gla/8_1.jpg'

In [None]:
os.getcwd()

path = "./test"
os.listdir(path)

['2004_12.jpg',
 '2004_28.jpg',
 '2004_33.jpg',
 '2004_34.jpg',
 '2004_14.jpg',
 '2004_17.jpg',
 '2004_19.jpg',
 '2004_22.jpg',
 '2004_24.jpg',
 '2004_30.jpg']

In [None]:
# one group leave out 見本
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.LeaveOneGroupOut.html#sklearn.model_selection.LeaveOneGroupOut
# 今回のケースでは、groupがIDに該当
import numpy as np
from sklearn.model_selection import LeaveOneGroupOut
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 2, 1, 2, 1, 1])
groups = np.array([1, 1, 2, 3, 3, 4])
logo = LeaveOneGroupOut()
logo.get_n_splits(X, y, groups)
logo.get_n_splits(groups=groups)  # 'groups' is always required
print(logo)
LeaveOneGroupOut()
for train_index, test_index in logo.split(X, y, groups):
     print("TRAIN:", train_index, "TEST:", test_index)
     X_train, X_test = X[train_index], X[test_index]
     y_train, y_test = y[train_index], y[test_index]
     print(X_train, X_test, y_train, y_test)

LeaveOneGroupOut()
TRAIN: [2 3 4 5] TEST: [0 1]
[[ 5  6]
 [ 7  8]
 [ 9 10]
 [11 12]] [[1 2]
 [3 4]] [1 2 1 1] [1 2]
TRAIN: [0 1 3 4 5] TEST: [2]
[[ 1  2]
 [ 3  4]
 [ 7  8]
 [ 9 10]
 [11 12]] [[5 6]] [1 2 2 1 1] [1]
TRAIN: [0 1 2 5] TEST: [3 4]
[[ 1  2]
 [ 3  4]
 [ 5  6]
 [11 12]] [[ 7  8]
 [ 9 10]] [1 2 1 1] [2 1]
TRAIN: [0 1 2 3 4] TEST: [5]
[[ 1  2]
 [ 3  4]
 [ 5  6]
 [ 7  8]
 [ 9 10]] [[11 12]] [1 2 1 2 1] [1]


In [None]:
# example of stratified group Kfold　見本
# 今回のケースでは、groupがID、yがclassesに該当
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
X = np.ones((17, 2))
y = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
groups = np.array([1, 1, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 8, 8])
cv = StratifiedGroupKFold(n_splits=3)

print(X)
print(y)
print(groups)

for train_idxs, test_idxs in cv.split(X, y, groups):
    print("TRAIN:", groups[train_idxs])
    print("      ", y[train_idxs])
    print(" TEST:", groups[test_idxs])
    print("      ", y[test_idxs])

In [None]:
# one group leave out 
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.LeaveOneGroupOut.html#sklearn.model_selection.LeaveOneGroupOut
# 今回のケースでは、groupがIDに該当
import numpy as np
from sklearn.model_selection import LeaveOneGroupOut
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
y = np.array([1, 2, 1, 2, 1, 1])
groups = np.array([1, 1, 2, 3, 3, 4])
logo = LeaveOneGroupOut()
logo.get_n_splits(X, y, groups)
logo.get_n_splits(groups=groups)  # 'groups' is always required
print(logo)
LeaveOneGroupOut()
for train_index, test_index in logo.split(X, y, groups):
     print("TRAIN:", train_index, "TEST:", test_index)
     X_train, X_test = X[train_index], X[test_index]
     y_train, y_test = y[train_index], y[test_index]
     print(X_train, X_test, y_train, y_test)

LeaveOneGroupOut()
TRAIN: [2 3 4 5] TEST: [0 1]
[[ 5  6]
 [ 7  8]
 [ 9 10]
 [11 12]] [[1 2]
 [3 4]] [1 2 1 1] [1 2]
TRAIN: [0 1 3 4 5] TEST: [2]
[[ 1  2]
 [ 3  4]
 [ 7  8]
 [ 9 10]
 [11 12]] [[5 6]] [1 2 2 1 1] [1]
TRAIN: [0 1 2 5] TEST: [3 4]
[[ 1  2]
 [ 3  4]
 [ 5  6]
 [11 12]] [[ 7  8]
 [ 9 10]] [1 2 1 1] [2 1]
TRAIN: [0 1 2 3 4] TEST: [5]
[[ 1  2]
 [ 3  4]
 [ 5  6]
 [ 7  8]
 [ 9 10]] [[11 12]] [1 2 1 2 1] [1]
