In [1]:
import pandas as pd
import os

# Selecting from Paraguai's dataset

In [45]:
parag = pd.read_csv('data/OT_paraguai/dataset_labels.csv')
parag

Unnamed: 0,Image name,Label,Source
0,1.jpg,healthy,Hospital de Clínicas Medical Center
1,2.jpg,healthy,Hospital de Clínicas Medical Center
2,3.jpg,healthy,Hospital de Clínicas Medical Center
3,4.jpg,healthy,Hospital de Clínicas Medical Center
4,5.jpg,healthy,Hospital de Clínicas Medical Center
...,...,...,...
407,OS0660RE.JPG,inactive,Hospital General Pediátrico Acosta Ñu Medical ...
408,OS0661RE.JPG,inactive,Hospital General Pediátrico Acosta Ñu Medical ...
409,OS0662RE.JPG,inactive,Hospital General Pediátrico Acosta Ñu Medical ...
410,OS0609RE.JPG,healthy,Hospital General Pediátrico Acosta Ñu Medical ...


In [30]:
parag['Label'].value_counts()

inactive             187
healthy              132
active/inactive       57
active                34
active/active          1
inactive/inactive      1
Name: Label, dtype: int64

"_The dataset also includes a csv file with the labels for each image: healthy, active and inactive lesions, this two being non healthy._" <br>
Therefore, all images classified as "healthy" will be excluded.

In [46]:
parag = parag[parag.Label != 'healthy']
parag

Unnamed: 0,Image name,Label,Source
80,81.jpg,inactive,Hospital de Clínicas Medical Center
81,82.jpg,inactive,Hospital de Clínicas Medical Center
82,83.jpg,inactive,Hospital de Clínicas Medical Center
83,84.jpg,active/inactive,Hospital de Clínicas Medical Center
84,85.jpg,inactive,Hospital de Clínicas Medical Center
...,...,...,...
406,OS0659RE.JPG,inactive,Hospital General Pediátrico Acosta Ñu Medical ...
407,OS0660RE.JPG,inactive,Hospital General Pediátrico Acosta Ñu Medical ...
408,OS0661RE.JPG,inactive,Hospital General Pediátrico Acosta Ñu Medical ...
409,OS0662RE.JPG,inactive,Hospital General Pediátrico Acosta Ñu Medical ...


In [31]:
parag.Source.unique()

array(['Hospital de Clínicas Medical Center',
       'Hospital General Pediátrico Acosta Ñu Medical Center'],
      dtype=object)

In [71]:
def remove_from_folder(img_to_keep, data_dir):
    for img in os.listdir(data_dir):
        try:
            if img not in img_to_keep:
                os.remove(os.path.join(data_dir, img))
        except Exception as e:
            print(f'Error with image: {e.strerror}')

# remove_from_folder(list(parag['Image name']), 'data/OT_paraguai/images')

# ... from BRSET

In [21]:
brset = pd.read_csv('data/BRSET/labels_jan15.csv')
brset

Unnamed: 0,image_id,patient_id,camera,patient_age,comorbidities,diabetes_time_y,insuline,patient_sex,exam_eye,optic_disc,...,hypertensive_retinopathy,drusens,hemorrhage,retinal_detachment,myopic_fundus,increased_cup_disc,other,quality,diabetes,nationality
0,img00001,1,Canon CR,48.0,diabetes1,12,yes,1,1,1,...,0,0,0,0,0,1,0,Adequate,yes,Brazil
1,img00002,1,Canon CR,48.0,diabetes1,12,yes,1,2,2,...,0,0,0,0,0,1,0,Adequate,yes,Brazil
2,img00003,2,Canon CR,18.0,diabetes1,7,yes,2,1,1,...,0,0,0,0,0,0,0,Adequate,yes,Brazil
3,img00004,2,Canon CR,18.0,diabetes1,7,yes,2,2,1,...,0,0,0,0,0,0,0,Adequate,yes,Brazil
4,img00005,3,Canon CR,22.0,diabetes1,11,yes,1,1,2,...,0,0,0,0,0,0,0,Adequate,yes,Brazil
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16261,img16262,8522,Canon CR,33.0,diabetes1,14,yes,1,2,2,...,0,0,0,0,0,1,0,Adequate,yes,Brazil
16262,img16263,8523,Canon CR,19.0,diabetes1,4,yes,1,1,1,...,0,0,0,0,0,0,0,Adequate,yes,Brazil
16263,img16264,8523,Canon CR,19.0,diabetes1,4,yes,1,2,1,...,0,0,0,0,0,0,0,Adequate,yes,Brazil
16264,img16265,8524,Canon CR,20.0,diabetes1,,yes,1,1,2,...,0,0,0,0,0,1,0,Adequate,yes,Brazil


Features to keep: image_id, scar

In [22]:
brset = brset[['image_id', 'scar']]
brset

Unnamed: 0,image_id,scar
0,img00001,0
1,img00002,0
2,img00003,0
3,img00004,0
4,img00005,0
...,...,...
16261,img16262,0
16262,img16263,0
16263,img16264,0
16264,img16265,0


Although there's +16k rows in our dataframe, there's actually only 2k images in our folder. <br>
Therefore, we need to drop some rows from the dataframe in order to match with the actual number of retina images available

In [24]:
imgs_in_folder = [img for img in os.listdir('data/BRSET/fundus_imgs')]
brset['image_id'] = brset['image_id'].apply(lambda x: f"{x}.jpg")
brset = brset[brset['image_id'].isin(imgs_in_folder)]
brset

Unnamed: 0,image_id,scar
1018,img01019.jpg,0
2718,img02719.jpg,0
2720,img02721.jpg,0
2721,img02722.jpg,0
2723,img02724.jpg,0
...,...,...
11159,img11160.jpg,0
11171,img11172.jpg,0
11174,img11175.jpg,0
11203,img11204.jpg,0


In [72]:
remove_from_folder(list(brset['image_id']), 'data/BRSET/fundus_imgs')

In [25]:
brset.scar.value_counts()

0    2079
1      33
Name: scar, dtype: int64

# Joining all available images and its info in one folder/file

In [33]:
print(brset, parag)

           image_id  scar
1018   img01019.jpg     0
2718   img02719.jpg     0
2720   img02721.jpg     0
2721   img02722.jpg     0
2723   img02724.jpg     0
...             ...   ...
11159  img11160.jpg     0
11171  img11172.jpg     0
11174  img11175.jpg     0
11203  img11204.jpg     0
11239  img11240.jpg     0

[2112 rows x 2 columns]        Image name            Label  \
80         81.jpg         inactive   
81         82.jpg         inactive   
82         83.jpg         inactive   
83         84.jpg  active/inactive   
84         85.jpg         inactive   
..            ...              ...   
406  OS0659RE.JPG         inactive   
407  OS0660RE.JPG         inactive   
408  OS0661RE.JPG         inactive   
409  OS0662RE.JPG         inactive   
411  OD0658RE.JPG         inactive   

                                                Source  
80                 Hospital de Clínicas Medical Center  
81                 Hospital de Clínicas Medical Center  
82                 Hospital de Clín

In [47]:
parag.drop(labels=['Source'], axis=1, inplace=True)
parag.Label = 1
parag

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  parag.drop(labels=['Source'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  parag.Label = 1


Unnamed: 0,Image name,Label
80,81.jpg,1
81,82.jpg,1
82,83.jpg,1
83,84.jpg,1
84,85.jpg,1
...,...,...
406,OS0659RE.JPG,1
407,OS0660RE.JPG,1
408,OS0661RE.JPG,1
409,OS0662RE.JPG,1


In [51]:
parag.rename(columns={'Image name':'image_id', 'Label':'scar'}, inplace=True)
df = pd.concat([brset, parag], ignore_index=True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  parag.rename(columns={'Image name':'image_id', 'Label':'scar'}, inplace=True)


Unnamed: 0,image_id,scar
0,img01019.jpg,0
1,img02719.jpg,0
2,img02721.jpg,0
3,img02722.jpg,0
4,img02724.jpg,0
...,...,...
2387,OS0659RE.JPG,1
2388,OS0660RE.JPG,1
2389,OS0661RE.JPG,1
2390,OS0662RE.JPG,1


In [52]:
df.to_csv('data/brset_n_paraguai.csv', index=False)

# Adding info about the third image source (OT_unifesp)

In [54]:
imgs_in_folder = [img for img in os.listdir('data/OT_unifesp')]
len(imgs_in_folder)

72

In [64]:
unifesp = pd.DataFrame(imgs_in_folder, columns=['image_id'])
unifesp['scar'] = [1] * len(imgs_in_folder)
unifesp

Unnamed: 0,image_id,scar
0,45LMS-L.JPG,1
1,58FAS-R.JPG,1
2,17TPS-L.JPG,1
3,1SG-R.JPG,1
4,66LSF-R.JPG,1
...,...,...
67,29JPM-L.JPG,1
68,44SSC-L.JPG,1
69,35CCS-L.JPG,1
70,12MAF-R.JPG,1


In [67]:
merged_df = pd.read_csv('data/brset_n_paraguai.csv')
df = pd.concat([merged_df, unifesp], ignore_index=True)
df

Unnamed: 0,image_id,scar
0,img01019.jpg,0
1,img02719.jpg,0
2,img02721.jpg,0
3,img02722.jpg,0
4,img02724.jpg,0
...,...,...
2459,29JPM-L.JPG,1
2460,44SSC-L.JPG,1
2461,35CCS-L.JPG,1
2462,12MAF-R.JPG,1


In [68]:
df.scar.value_counts()

0    2079
1     385
Name: scar, dtype: int64

This is the final dataset, from which labelling information will be retrieved.

In [69]:
# dividing into 2 folders
origin = 'data/all_imgs'
pos_folder_path = 'data/all_imgs/OT_positive'
neg_folder_path = 'data/all_imgs/OT_negative'
import shutil

for i in df.index:
    file_path = os.path.join(origin, df['image_id'][i])
    if not os.path.exists(file_path):
        print(f"row {i} with label {df['scar'][i]} and name {df['image_id'][i]} eliminated")
        df.drop(index=i, inplace=True)
        continue
    if df['scar'][i] == 0:
        shutil.move(file_path, neg_folder_path)
    else:
        shutil.move(file_path, pos_folder_path)
df.to_csv('data/labels.csv', index=False)