In [1]:
import glob
import numpy as np
import pandas as pd
import os
import shutil
from sklearn.model_selection import train_test_split

In [2]:
class Config:
    """Project paths for the dataset-preparation notebook.

    Constructing an instance has two side effects: it changes the process
    working directory to ``workdir`` and it creates the ``train`` / ``test`` /
    ``ind`` folders under ``data/available_data/`` if they are missing.

    Attributes:
        workdir: Project root, always ending with a path separator.
        original_data_folder: ``<workdir>data/original_data/``.
        available_data_folder: ``<workdir>data/available_data/``.
        folder_classes: Names of the split sub-folders.
    """

    # Project root used when no ``workdir`` argument is given.
    DEFAULT_WORKDIR = "D:/bioinformatics/python/bioinformation/机器学习/EsophagealCancerResnet/"

    def __init__(self, workdir=None):
        """Set up the paths and make sure the split folders exist.

        Args:
            workdir: Optional project root. Defaults to ``DEFAULT_WORKDIR`` so
                that ``Config()`` keeps behaving as before.
        """
        if workdir is None:
            workdir = self.DEFAULT_WORKDIR
        # The folder attributes are built by plain string concatenation, so
        # the root must end with a separator.
        if not workdir.endswith(("/", "\\")):
            workdir += "/"
        self.workdir = workdir
        os.chdir(self.workdir)
        self.original_data_folder = self.workdir + "data/" + "original_data/"
        self.available_data_folder = self.workdir + "data/" + "available_data/"
        self.folder_classes = ["train", "test", "ind"]
        # Create each split folder individually: checking only the parent
        # folder would skip creation when the parent exists but a sub-folder
        # is missing.
        for folder_class in self.folder_classes:
            os.makedirs(self.available_data_folder + folder_class, exist_ok=True)

In [3]:
# Build the shared path configuration used by the cells below.
# Note: constructing Config also changes the working directory to the project root.
config = Config()

### 划分训练集、测试集和独立检验集

In [4]:
# Load data: collect the image paths of every class.
def _list_images(sub_pattern):
    """Return the sorted image paths matching ``sub_pattern`` under the original-data folder.

    ``glob.glob`` returns matches in arbitrary, filesystem-dependent order;
    sorting gives every run the same sample order.
    """
    return sorted(glob.glob(config.original_data_folder + sub_pattern))


cancer_t1_data_file_paths = _list_images("cancer_t1/cut_1_*.jpg")
cancer_t2_data_file_paths = _list_images("cancer_t2/cut_1_*.jpg")
cancer_t3_data_file_paths = _list_images("cancer_t3/cut_1_*.jpg")
cancer_t4_data_file_paths = _list_images("cancer_t4/cut_1_*.jpg")
# NOTE(review): normal images use the "cut_3_" prefix while the cancer classes
# use "cut_1_" — confirm this difference is intentional.
normal_data_file_paths = _list_images("normal/cut_3_*.jpg")

t1_num = len(cancer_t1_data_file_paths)
t2_num = len(cancer_t2_data_file_paths)
t3_num = len(cancer_t3_data_file_paths)
t4_num = len(cancer_t4_data_file_paths)
normal_num = len(normal_data_file_paths)
print("癌症四个时期和正常数据图片的数量分别为：{}，{}，{}，{}, {}".format(
    t1_num, t2_num, t3_num, t4_num, normal_num))

癌症四个时期和正常数据图片的数量分别为：4383，243，3985，1102, 14302


In [5]:
# Merge data: one list with all image paths plus a parallel list of integer
# labels (0 = normal, 1-4 = cancer stages T1-T4). Despite its name,
# cancer_all_data_file_paths also contains the normal images.
cancer_all_data_file_paths = (
    normal_data_file_paths
    + cancer_t1_data_file_paths
    + cancer_t2_data_file_paths
    + cancer_t3_data_file_paths
    + cancer_t4_data_file_paths
)
label = [0] * normal_num + [1] * t1_num + [2] * t2_num + [3] * t3_num + [4] * t4_num

label_counts = pd.Series(label).value_counts()
print("所有样本数量统计：")
print(label_counts)

# Count by label value, not by position in value_counts(): that Series is
# ordered by frequency, so slicing it with [1:] only skips the normal class
# while normal happens to be the most frequent label.
negative_num = label.count(0)
total_num = len(label)
positive_num = total_num - negative_num
print("正样本数量：{}\n负样本数量：{}\n合计：{}".format(positive_num, negative_num, total_num))

所有样本数量统计：
0    14302
1     4383
3     3985
4     1102
2      243
dtype: int64
正样本数量：9713
负样本数量：14302
合计：24015


In [6]:
# Split into training, test and independent validation sets.
# Fixed seed so the same input list always yields the same split.
SPLIT_SEED = 42

# First hold out 1/6 of all samples as the independent set, then take 1/5 of
# the remaining 5/6 (= 1/6 of the total) as the test set, i.e. train:test:ind
# is about 4:1:1. Both splits are stratified to keep the class proportions.
train_and_test_x, ind_x, train_and_test_y, ind_y = train_test_split(
    cancer_all_data_file_paths, label,
    test_size=1/6, shuffle=True, stratify=label, random_state=SPLIT_SEED)
train_x, test_x, train_y, test_y = train_test_split(
    train_and_test_x, train_and_test_y,
    test_size=1/5, shuffle=True, stratify=train_and_test_y, random_state=SPLIT_SEED)
print("训练集样本数量：{}\t测试集样本数量：{}\t独立验证集样本数量：{}".format(
    len(train_x), len(test_x), len(ind_x)))

训练集样本数量：16009	测试集样本数量：4003	独立验证集样本数量：4003


In [7]:
# Copy the images of each split into its folder, appending the label to the file name.
def _copy_with_label(img_paths, labels, split_name):
    """Copy every image into ``<available_data_folder><split_name>/`` as ``<stem>_<label>.jpg``.

    Args:
        img_paths: Source image paths.
        labels: Integer labels, parallel to ``img_paths``.
        split_name: Name of the destination sub-folder ("train", "test" or "ind").
    """
    destination_folder = config.available_data_folder + split_name + "/"
    os.makedirs(destination_folder, exist_ok=True)
    for img_path, img_label in zip(img_paths, labels):
        # basename/splitext handle both "/" and "\\" separators on Windows
        # and "/" elsewhere; splitting on "\\" only works for Windows paths.
        stem = os.path.splitext(os.path.basename(img_path))[0]
        shutil.copy(img_path, destination_folder + stem + "_" + str(img_label) + ".jpg")


# NOTE: files copied by an earlier run are not removed. If the split differs
# between runs, the same image can end up in more than one split folder.
_copy_with_label(train_x, train_y, "train")
_copy_with_label(test_x, test_y, "test")
_copy_with_label(ind_x, ind_y, "ind")