In [1]:
import pandas as pd
import os
import numpy as np
import cv2
from tqdm import tqdm
from glob import glob
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from imblearn.over_sampling import SMOTE
from timm import create_model

In [2]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [42]:
_HYPER_KVASIR = 'D:\\data\\endoscopic\\hyper_kvasir\\labeled_images'
# _HYPER_KVASIR_LABELED_DEFORM = 'D:\\data\\endoscopic\\hyper_kvasir\\labeled_images_deform'

In [4]:
df_anno = pd.read_csv('../resource/hyper_kvasir/full_labeled_anno.csv')
df_anno.head()

Unnamed: 0,Organ,Classification,Finding,Image,Categories,is_valid
0,upper-gi-tract,anatomical-landmarks,z-line,upper-gi-tract/anatomical-landmarks/z-line/c7e...,0,False
1,lower-gi-tract,therapeutic-interventions,dyed-lifted-polyps,lower-gi-tract/therapeutic-interventions/dyed-...,0,False
2,lower-gi-tract,quality-of-mucosal-views,bbps-2-3,lower-gi-tract/quality-of-mucosal-views/bbps-2...,0,False
3,lower-gi-tract,quality-of-mucosal-views,bbps-2-3,lower-gi-tract/quality-of-mucosal-views/bbps-2...,0,False
4,upper-gi-tract,anatomical-landmarks,z-line,upper-gi-tract/anatomical-landmarks/z-line/b98...,0,False


In [19]:
categories = pd.read_csv('../resource/hyper_kvasir/categories.csv')
categories

Unnamed: 0,target,findings,path
0,0,barretts,upper-gi-tract/pathological-findings/barretts
1,1,bbps-0-1,lower-gi-tract/quality-of-mucosal-views/bbps-0-1
2,2,bbps-2-3,lower-gi-tract/quality-of-mucosal-views/bbps-2-3
3,3,dyed-lifted-polyps,lower-gi-tract/therapeutic-interventions/dyed-...
4,4,dyed-resection-margins,lower-gi-tract/therapeutic-interventions/dyed-...
5,5,hemorrhoids,lower-gi-tract/pathological-findings/hemorrhoids
6,6,ileum,lower-gi-tract/anatomical-landmarks/ileum
7,7,impacted-stool,lower-gi-tract/quality-of-mucosal-views/impact...
8,8,cecum,lower-gi-tract/anatomical-landmarks/cecum
9,9,pylorus,upper-gi-tract/anatomical-landmarks/pylorus


In [12]:
# list_paths = []
# for c in categories['findings']:
#     path = df_anno[df_anno['Finding']==c]['Image'].iloc[0]
#     path = '/'.join(path.split('/')[:-1])
#     list_paths.append(path)

In [14]:
# categories['path'] = list_paths
# categories.to_csv('../resource/hyper_kvasir/categories.csv', index=False, header=True)

## Read k-fold split

In [5]:
df_kfold = pd.read_csv('../resource/hyper_kvasir/2_fold_split.csv', sep = ';')
df_kfold.head()

Unnamed: 0,file-name,class-name,split-index
0,4bf58f1c-8233-41fc-9614-344e6d0fc351.jpg,impacted-stool,0
1,9c3e5380-9621-4e74-b937-0c46f37e7ef3.jpg,impacted-stool,0
2,cf3667be-0c86-4ce7-a0f3-721144b67230.jpg,impacted-stool,0
3,8dbf6ead-8801-4a28-b811-62567dd2edb5.jpg,impacted-stool,0
4,af09c241-1417-4a42-91ef-df86bab65f7c.jpg,impacted-stool,0


In [22]:
dict_correct_cat_name = {'oesophagitis-b-d': 'esophagitis-b-d',
                         'short-segment-barretts': 'barretts-short-segment',
                         'oesophagitis-a': 'esophagitis-a',
                         'normal-pylorus': 'pylorus',
                         'hemorroids': 'hemorrhoids',
                         'normal-cecum': 'cecum',
                         'polyp': 'polyps',
                         'normal-z-line': 'z-line'}

In [47]:
list_img_path = []
list_targets = []
for i in range(len(df_kfold)):
    cls_name = df_kfold['class-name'].iloc[i]
    if cls_name in dict_correct_cat_name.keys():
        df_kfold['class-name'].iloc[i] = dict_correct_cat_name[cls_name]
        
    cls_name = df_kfold['class-name'].iloc[i]
    dir_path = categories[categories['findings']==cls_name]['path'].iloc[0]
    img_path = os.path.join(dir_path, df_kfold['file-name'].iloc[i])
    list_img_path.append(img_path)
    target = categories[categories['findings']==cls_name]['target'].iloc[0]
    list_targets.append(target)

In [25]:
# list_cate_kfold = list(set(list(df_kfold['class-name'])))
# for item in list_cate_kfold:
#     if not item in list(categories['findings']):
#         print(item)

In [49]:
df_kfold['path'] = list_img_path
df_kfold['target'] = list_targets
df_train = df_kfold[df_kfold['split-index']==0]
df_train['is_valid'] = False
df_valid = df_kfold[df_kfold['split-index']==1]
df_valid['is_valid'] = True

df_full = pd.concat([df_train, df_valid], axis=0)
df_full.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,file-name,class-name,split-index,path,target,is_valid
0,4bf58f1c-8233-41fc-9614-344e6d0fc351.jpg,impacted-stool,0,lower-gi-tract/quality-of-mucosal-views/impact...,7,False
1,9c3e5380-9621-4e74-b937-0c46f37e7ef3.jpg,impacted-stool,0,lower-gi-tract/quality-of-mucosal-views/impact...,7,False
2,cf3667be-0c86-4ce7-a0f3-721144b67230.jpg,impacted-stool,0,lower-gi-tract/quality-of-mucosal-views/impact...,7,False
3,8dbf6ead-8801-4a28-b811-62567dd2edb5.jpg,impacted-stool,0,lower-gi-tract/quality-of-mucosal-views/impact...,7,False
4,af09c241-1417-4a42-91ef-df86bab65f7c.jpg,impacted-stool,0,lower-gi-tract/quality-of-mucosal-views/impact...,7,False


In [None]:
df_full.to_csv('../resource/hyper_kvasir/df_full_kfold.csv', index=False, header=True)

In [6]:
list_cat_abn = []
for c in df_anno['Categories']:
    if c != 0:
        list_cat_abn.append(1)
    else:
        list_cat_abn.append(c)
df_anno['Abnormal'] = list_cat_abn

In [7]:
df_anno['Abnormal'].value_counts()

0    8020
1    2642
Name: Abnormal, dtype: int64

In [8]:
# df_anno

In [9]:
df_norm = df_anno[df_anno['Abnormal'] == 0]
df_patho = df_anno[df_anno['Abnormal'] == 1]

In [10]:
## set the new categories
# for idx, c in enumerate(df_patho['Categories']):
#     df_patho['Categories'].iloc[idx] = c - 1

In [11]:
dict_cate = {}
for i in range(13):
    df_ = df_anno[df_anno['Categories']==i]
    cat = set(df_['Finding'])
    dict_cate[i] = cat

In [12]:
dict_cate

{0: {'bbps-0-1',
  'bbps-2-3',
  'cecum',
  'dyed-lifted-polyps',
  'dyed-resection-margins',
  'ileum',
  'impacted-stool',
  'pylorus',
  'retroflex-rectum',
  'retroflex-stomach',
  'z-line'},
 1: {'barretts-short-segment'},
 2: {'ulcerative-colitis-grade-3'},
 3: {'ulcerative-colitis-grade-0-1'},
 4: {'ulcerative-colitis-grade-1'},
 5: {'polyps'},
 6: {'ulcerative-colitis-grade-1-2'},
 7: {'ulcerative-colitis-grade-2'},
 8: {'ulcerative-colitis-grade-2-3'},
 9: {'barretts'},
 10: {'esophagitis-a'},
 11: {'hemorrhoids'},
 12: {'esophagitis-b-d'}}

In [13]:
# df_patho['Categories'].value_counts()[:6]

In [14]:
# major_classes = dict(df_patho['Categories'].value_counts()[:6])
# sum(major_classes.values())

In [15]:
# df_major_patho = pd.DataFrame([])
# for i, c in enumerate(major_classes.keys()):
#     df_ = df_patho[df_patho['Categories']==c]
#     df_['Categories'] = i
#     df_major_patho = pd.concat([df_major_patho, df_], axis=0)

In [16]:
# df_major_patho

## Split all data

In [17]:
# df_norm_train, df_norm_valid = train_test_split(df_norm, test_size=0.2, random_state=0)
# x_abno = df_patho['Image']
# y_abno = df_patho['Categories']
# x_train, x_valid, y_train, y_valid = train_test_split(x_abno, y_abno, test_size=0.2, random_state=0)

# df_train_abno = pd.DataFrame([])
# df_train_abno['image'] = list(df_norm_train['Image']) + list(x_train)
# df_train_abno['target'] = list(df_norm_train['Categories']) + [1]*len(y_train)
# df_train_abno['is_valid'] = False

# df_valid_abno = pd.DataFrame([])
# df_valid_abno['image'] = list(df_norm_valid['Image']) + list(x_valid)
# df_valid_abno['target'] = list(df_norm_valid['Categories']) + [1]*len(y_valid)
# df_valid_abno['is_valid'] = True


# df_train_patho = pd.DataFrame([])
# df_train_patho['image'] = x_train
# df_train_patho['target'] = [(y - 1) for y in list(y_train)]
# df_train_patho['is_valid'] = False

# df_valid_patho = pd.DataFrame([])
# df_valid_patho['image'] = x_valid
# df_valid_patho['target'] = [(y - 1) for y in list(y_valid)]
# df_valid_patho['is_valid'] = True

# df_full_abno = pd.concat([df_train_abno, df_valid_abno], axis=0).reset_index()
# df_full_patho = pd.concat([df_train_patho, df_valid_patho], axis=0).reset_index()

In [18]:
# df_full_abno.to_csv('../resource/hyper_kvasir/df_abnomalies.csv', index=False, header=True)
# df_full_patho.to_csv('../resource/hyper_kvasir/df_pathologies.csv', index=False, header=True)

## Split mock size

In [19]:
df_full_patho = pd.read_csv('../resource/hyper_kvasir/df_pathologies.csv')
x_train = df_full_patho[df_full_patho['is_valid']==False]['image']
y_train = df_full_patho[df_full_patho['is_valid']==False]['target']

df_valid_patho = df_full_patho[df_full_patho['is_valid']==True]

In [20]:
df_valid_patho['is_labeled'] = True
df_valid_patho.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,index,image,target,is_valid,is_labeled
2113,3050,lower-gi-tract/pathological-findings/polyps/dc...,4,True,True
2114,5461,lower-gi-tract/pathological-findings/polyps/f8...,4,True,True
2115,8895,lower-gi-tract/pathological-findings/ulcerativ...,6,True,True
2116,8464,lower-gi-tract/pathological-findings/ulcerativ...,6,True,True
2117,6077,lower-gi-tract/pathological-findings/polyps/ca...,4,True,True


In [48]:
x_train_unlab, x_train_lab, y_train_unlab, y_train_lab  = train_test_split(x_train, 
                                                                          y_train, 
                                                                          test_size=0.1,
                                                                          random_state=10)


df_train_lab = pd.DataFrame([])
df_train_lab['image'] = x_train_lab
df_train_lab['target'] = y_train_lab
df_train_lab['is_valid'] = False
df_train_lab['is_labeled'] = True

df_train_unlab = pd.DataFrame([])
df_train_unlab['image'] = x_train_unlab
df_train_unlab['target'] = y_train_unlab
df_train_unlab['is_valid'] = False
df_train_unlab['is_labeled'] = False

df_full_mock = pd.concat([df_train_lab, df_train_unlab, df_valid_patho], axis=0)
del df_full_mock['index']
set(y_train_lab)

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}

In [49]:
df_  = df_full_mock[df_full_mock['is_labeled']==True]
df_[df_['is_valid']==False]['target'].value_counts()

4     74
9     50
6     33
11    17
1     15
3     11
8      4
5      2
2      2
0      2
10     1
7      1
Name: target, dtype: int64

In [50]:
df_full_mock.to_csv('../resource/hyper_kvasir/df_pathologies_mock_9_1.csv', index=False, header = True)

In [62]:
# df_train_lab['target'].value_counts()

In [63]:
# df_valid

In [22]:
# df_full[df_full['is_labeled']==True]

In [23]:
# df_full['target'].value_counts()

In [24]:
# df_full.to_csv('../resource/hyper_kvasir/df_pathologies.csv', index=False, header=True)

In [11]:
# x = df_anno['path']
# y = df_anno['Categories']

# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
# x_train_lab, x_train_unlab, y_train_lab, y_train_unlab = train_test_split(x_train, 
#                                                                           y_train, 
#                                                                           test_size=0.32, 
#                                                                           random_state=0)


# df_train_lab = pd.DataFrame([])
# df_train_lab = pd.DataFrame([])
# df_train_lab['image'] = x_train_lab
# df_train_lab['target'] = y_train_lab
# df_train_lab['is_valid'] = False
# df_train_lab['is_labeled'] = True

# df_train_unlab = pd.DataFrame([])
# df_train_unlab = pd.DataFrame([])
# df_train_unlab['image'] = x_train_unlab
# df_train_unlab['target'] = y_train_unlab
# df_train_unlab['is_valid'] = False
# df_train_unlab['is_labeled'] = False


# df_valid['image'] = x_test
# df_valid['target'] = y_test
# df_valid['is_valid'] = True
# df_valid['is_labeled'] = True

In [12]:
# df_full = pd.concat([df_train_lab, df_train_unlab, df_valid], axis=0).reset_index()
# del df_full['index']
# df_full

In [13]:
# fig = df_full['target'].value_counts().plot(kind='bar', figsize=(10, 8), fontsize=10, rot=0).get_figure()

In [23]:
# df_full.to_csv('../resource/hyper_kvasir/df_abnormal.csv', index=False, header=True)

In [15]:
# cnt_train = df_train['target'].value_counts()
# cnt_valid = df_valid['target'].value_counts()

# df_cnt = pd.DataFrame({'train': cnt_train,'valid': cnt_valid})
# chart = df_cnt.plot.bar(rot=0, figsize=(10, 8), fontsize=15, stacked=True).get_figure()
# # chart.savefig('../resource/visualize/visual_sup.png')

In [69]:
# cnt_train = df_train['Groupby_Categories']
# cnt_train_labeled, cnt_train_unlabeled = train_test_split(cnt_train, test_size = 0.9, random_state = 0)
# cnt_train_labeled, cnt_train_unlabeled = cnt_train_labeled.value_counts(), cnt_train_unlabeled.value_counts()
# cnt_valid = df_valid['Groupby_Categories'].value_counts()

# df_cnt = pd.DataFrame({'train_labeled': cnt_train_labeled, 
                       
#                        'train_unlabeled': cnt_train_unlabeled,
#                        'valid': cnt_valid,
#                        })
# chart_semi = df_cnt.plot.bar(rot=0, 
#                         figsize=(10, 8), 
#                         fontsize=15,
#                         color={"train_labeled": "#1f76b5", "train_unlabeled": "green", "valid":"#fe7e0e"},
#                         stacked=True).get_figure()
# chart_semi.savefig('../resource/visualize/visual_semi_1_9.png')

In [70]:
# cnt_train_labeled, cnt_train_unlabeled

In [71]:
# df_train['Groupby_Categories'].value_counts()
# fig_train.savefig('../resource/visualize/groupcat_train.png')

In [72]:
# df_valid['Groupby_Categories'].value_counts()
# fig_valid.savefig('../resource/visualize/groupcat_valid.png')

In [14]:
# fig.savefig('../resource/visualize/groupcat.png')

In [15]:
# df_full.to_csv('../resource/hyper_kvasir/full_labeled_group.csv', index=False, header=True)