In [1]:
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os

### Read and preprocess data
Filter out nodule-related samples by judging whether the characters in the original_image column (positions 6–7 are "NI").

In [2]:
def is_nodule(row):
    if row[5:7] == 'NI':
        return True
    else:
        return False

In [3]:
meta = pd.read_csv(r"D:\aMaster\github_code\VAE_lung_lesion_BMVC\Data\Meta\meta_info.csv")
meta = meta.drop("is_clean",axis=1)
meta['is_nodule'] = meta['original_image'].apply(lambda row: is_nodule(row))
meta = meta[meta['is_nodule']==True]
meta.reset_index(inplace=True)
print(meta["patient_id"].nunique())
meta

862


Unnamed: 0,index,patient_id,nodule_no,slice_no,original_image,mask_image,malignancy,is_cancer,is_nodule
0,0,1,0,0,0001_NI000_slice000,0001_MA000_slice000,5,True,True
1,1,1,0,1,0001_NI000_slice001,0001_MA000_slice001,5,True,True
2,2,1,0,2,0001_NI000_slice002,0001_MA000_slice002,5,True,True
3,3,1,0,3,0001_NI000_slice003,0001_MA000_slice003,5,True,True
4,4,1,0,4,0001_NI000_slice004,0001_MA000_slice004,5,True,True
...,...,...,...,...,...,...,...,...,...
13602,13602,1011,3,0,1011_NI003_slice000,1011_MA003_slice000,2,False,True
13603,13603,1011,3,1,1011_NI003_slice001,1011_MA003_slice001,2,False,True
13604,13604,1012,0,0,1012_NI000_slice000,1012_MA000_slice000,2,False,True
13605,13605,1012,0,1,1012_NI000_slice001,1012_MA000_slice001,2,False,True


### Data Filtering.
Removed samples with malignancy == 3 (Ambiguous).

In [4]:
print("Malignancy distributed:")
print(meta['malignancy'].value_counts())
print(meta['is_cancer'].value_counts())

Malignancy distributed:
malignancy
3    5307
5    2626
4    2448
2    1800
1    1426
Name: count, dtype: int64
is_cancer
Ambiguous    5307
True         5074
False        3226
Name: count, dtype: int64


In [5]:
meta = meta[meta['malignancy'] != 3]
meta

Unnamed: 0,index,patient_id,nodule_no,slice_no,original_image,mask_image,malignancy,is_cancer,is_nodule
0,0,1,0,0,0001_NI000_slice000,0001_MA000_slice000,5,True,True
1,1,1,0,1,0001_NI000_slice001,0001_MA000_slice001,5,True,True
2,2,1,0,2,0001_NI000_slice002,0001_MA000_slice002,5,True,True
3,3,1,0,3,0001_NI000_slice003,0001_MA000_slice003,5,True,True
4,4,1,0,4,0001_NI000_slice004,0001_MA000_slice004,5,True,True
...,...,...,...,...,...,...,...,...,...
13602,13602,1011,3,0,1011_NI003_slice000,1011_MA003_slice000,2,False,True
13603,13603,1011,3,1,1011_NI003_slice001,1011_MA003_slice001,2,False,True
13604,13604,1012,0,0,1012_NI000_slice000,1012_MA000_slice000,2,False,True
13605,13605,1012,0,1,1012_NI000_slice001,1012_MA000_slice001,2,False,True


In [6]:
print("Malignancy distributed:")
print(meta['malignancy'].value_counts())
print(meta['is_cancer'].value_counts())

Malignancy distributed:
malignancy
5    2626
4    2448
2    1800
1    1426
Name: count, dtype: int64
is_cancer
True     5074
False    3226
Name: count, dtype: int64


In [7]:
meta_patient_id = list(np.unique(meta['patient_id']))
len(meta_patient_id)

669

### Splitting the dataset
Extract unique patient IDs and convert into a list. Split the dataset based on patient ID.
Divide patients into 80% training + 20% testing;
Take 13% from the training set for validation;
Use the is_train function to assign labels (Train, Validation, Test) to each data sample;

In [8]:
def is_train(row,train,val,test):
    if row in train:
        return 'Train'
    elif row in val:
        return 'Validation'
    else:
        return 'Test'

In [9]:
def create_label_segmentation(meta):
    patient_id = list(np.unique(meta['patient_id']))
    train_patient , test_patient = train_test_split(patient_id,test_size= 0.2)
    train_patient, val_patient = train_test_split(train_patient,test_size= 0.13)
    print(len(train_patient),len(val_patient),len(test_patient))
    meta['data_split']= meta['patient_id'].apply(lambda row : is_train(row,train_patient,val_patient,test_patient))
    return meta

In [10]:
meta = create_label_segmentation(meta)
print('total_patients', len(meta_patient_id))
meta.to_csv(r"D:\aMaster\github_code\VAE_lung_lesion_BMVC\Data\Meta\meta.csv")

465 70 134
total_patients 669


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta['data_split']= meta['patient_id'].apply(lambda row : is_train(row,train_patient,val_patient,test_patient))


In [11]:
meta.head(15)

Unnamed: 0,index,patient_id,nodule_no,slice_no,original_image,mask_image,malignancy,is_cancer,is_nodule,data_split
0,0,1,0,0,0001_NI000_slice000,0001_MA000_slice000,5,True,True,Train
1,1,1,0,1,0001_NI000_slice001,0001_MA000_slice001,5,True,True,Train
2,2,1,0,2,0001_NI000_slice002,0001_MA000_slice002,5,True,True,Train
3,3,1,0,3,0001_NI000_slice003,0001_MA000_slice003,5,True,True,Train
4,4,1,0,4,0001_NI000_slice004,0001_MA000_slice004,5,True,True,Train
5,5,1,0,5,0001_NI000_slice005,0001_MA000_slice005,5,True,True,Train
6,6,1,0,6,0001_NI000_slice006,0001_MA000_slice006,5,True,True,Train
7,7,1,0,7,0001_NI000_slice007,0001_MA000_slice007,5,True,True,Train
8,8,2,0,0,0002_NI000_slice000,0002_MA000_slice000,5,True,True,Test
9,9,2,0,1,0002_NI000_slice001,0002_MA000_slice001,5,True,True,Test


In [12]:
print("Dataset distributed:")
print(meta['data_split'].value_counts())

Dataset distributed:
data_split
Train         5738
Test          1649
Validation     913
Name: count, dtype: int64


### Filter the data
Only keep the records of image files in the directory.

In [13]:
IMAGE_DIR = r"D:\aMaster\github_code\VAE_lung_lesion_BMVC\Data\Images"
all_files_list = [f[:-4] for f in os.listdir(IMAGE_DIR)]
all_files_list.sort()       
print(len(all_files_list))
print(all_files_list[0:10])
meta = meta[meta.original_image.isin(all_files_list)]
meta

13564
['0001_NI000_slice000', '0001_NI000_slice001', '0001_NI000_slice002', '0001_NI000_slice003', '0001_NI000_slice004', '0001_NI000_slice005', '0001_NI000_slice006', '0001_NI000_slice007', '0002_NI000_slice000', '0002_NI000_slice001']


Unnamed: 0,index,patient_id,nodule_no,slice_no,original_image,mask_image,malignancy,is_cancer,is_nodule,data_split
0,0,1,0,0,0001_NI000_slice000,0001_MA000_slice000,5,True,True,Train
1,1,1,0,1,0001_NI000_slice001,0001_MA000_slice001,5,True,True,Train
2,2,1,0,2,0001_NI000_slice002,0001_MA000_slice002,5,True,True,Train
3,3,1,0,3,0001_NI000_slice003,0001_MA000_slice003,5,True,True,Train
4,4,1,0,4,0001_NI000_slice004,0001_MA000_slice004,5,True,True,Train
...,...,...,...,...,...,...,...,...,...,...
13602,13602,1011,3,0,1011_NI003_slice000,1011_MA003_slice000,2,False,True,Test
13603,13603,1011,3,1,1011_NI003_slice001,1011_MA003_slice001,2,False,True,Test
13604,13604,1012,0,0,1012_NI000_slice000,1012_MA000_slice000,2,False,True,Train
13605,13605,1012,0,1,1012_NI000_slice001,1012_MA000_slice001,2,False,True,Train


In [14]:
meta.to_csv(r"D:\aMaster\github_code\VAE_lung_lesion_BMVC\Data\Meta\meta_mal_ben.csv")

### Save label data
Count the data of cancer and no cancer, and record the label of cancer as 1 and no cancer as 2.

In [15]:
meta_cancer = meta[meta['is_cancer']=='True']
len(meta_cancer)

5074

In [16]:
meta_noncancer = meta[meta['is_cancer']=='False'] 
len(meta_noncancer)

3216

In [17]:
patient_cancer_status = meta.groupby('patient_id')['is_cancer'].apply(lambda x: 'True' if 'True' in x.values else 'False')

print("Total patients:", len(patient_cancer_status))
print("Cancer patients:", sum(patient_cancer_status == 'True'))
print("Non-cancer patients:", sum(patient_cancer_status == 'False'))

Total patients: 669
Cancer patients: 383
Non-cancer patients: 286


In [18]:
cancer = list(meta_cancer['index'])
non_cancer = list(meta_noncancer['index'])
len(cancer + non_cancer)

8290

In [20]:
labels = []
for i in range(13916):
    if i in cancer:
        labels.append(1) #'cancer'
    if i in non_cancer:
        labels.append(0) #'non_cancer'
len(labels)

8290

In [21]:
np.save(r"D:\aMaster\github_code\VAE_lung_lesion_BMVC\Data" + '/' + 'labels', labels)