In [1]:
import pandas as pd
import os
import numpy as np
import cv2
from tqdm import tqdm
from glob import glob
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from imblearn.over_sampling import SMOTE
from timm import create_model

In [2]:
# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [3]:
_HYPER_KVASIR = 'D:\\data\\endoscopic\\hyper_kvasir'
# _HYPER_KVASIR_LABELED_DEFORM = 'D:\\data\\endoscopic\\hyper_kvasir\\labeled_images_deform'

In [4]:
df_anno = pd.read_csv('../resource/hyper_kvasir/full_labeled_anno.csv')
df_anno.head()

Unnamed: 0,Organ,Classification,Finding,Image,Categories,is_valid
0,upper-gi-tract,anatomical-landmarks,z-line,upper-gi-tract/anatomical-landmarks/z-line/c7e...,0,False
1,lower-gi-tract,therapeutic-interventions,dyed-lifted-polyps,lower-gi-tract/therapeutic-interventions/dyed-...,0,False
2,lower-gi-tract,quality-of-mucosal-views,bbps-2-3,lower-gi-tract/quality-of-mucosal-views/bbps-2...,0,False
3,lower-gi-tract,quality-of-mucosal-views,bbps-2-3,lower-gi-tract/quality-of-mucosal-views/bbps-2...,0,False
4,upper-gi-tract,anatomical-landmarks,z-line,upper-gi-tract/anatomical-landmarks/z-line/b98...,0,False


In [5]:
# df_anno['Categories'].value_counts()

In [6]:
list_cat_abn = []
for c in df_anno['Categories']:
    if c != 0:
        list_cat_abn.append(1)
    else:
        list_cat_abn.append(c)
df_anno['Abnormal'] = list_cat_abn

In [7]:
df_anno['Abnormal'].value_counts()

0    8020
1    2642
Name: Abnormal, dtype: int64

In [9]:
df_patho = df_anno[df_anno['Abnormal'] == 1]
df_patho.head()

Unnamed: 0,Organ,Classification,Finding,Image,Categories,is_valid,Abnormal
6,lower-gi-tract,pathological-findings,ulcerative-colitis-grade-1,lower-gi-tract/pathological-findings/ulcerativ...,4,False,1
11,lower-gi-tract,pathological-findings,ulcerative-colitis-grade-1,lower-gi-tract/pathological-findings/ulcerativ...,4,False,1
12,upper-gi-tract,pathological-findings,esophagitis-b-d,upper-gi-tract/pathological-findings/esophagit...,12,False,1
23,upper-gi-tract,pathological-findings,esophagitis-b-d,upper-gi-tract/pathological-findings/esophagit...,12,False,1
24,lower-gi-tract,pathological-findings,polyps,lower-gi-tract/pathological-findings/polyps/35...,5,False,1


In [11]:
for idx, c in enumerate(df_patho['Categories']):
    df_patho['Categories'].iloc[idx] = c - 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [12]:
df_patho['Categories'].value_counts()

4     1028
6      443
9      403
11     260
3      201
1      133
0       53
8       41
2       35
7       28
5       11
10       6
Name: Categories, dtype: int64

In [13]:
x = df_patho['Image']
y = df_patho['Categories']
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=0)

x_train_lab, x_train_unlab, y_train_lab, y_train_unlab = train_test_split(x_train, 
                                                                          y_train, 
                                                                          test_size=0.32, 
                                                                          random_state=0)

# df_train = pd.DataFrame([])
# df_train['image'] = x_train
# df_train['target'] = y_train
# df_train['is_valid'] = False

df_valid = pd.DataFrame([])
df_valid['image'] = x_valid
df_valid['target'] = y_valid
df_valid['is_valid'] = True
df_valid['is_labeled'] = True
# df_full = pd.concat([df_train, df_valid], axis=0).reset_index()
# df_full

df_train_lab = pd.DataFrame([])
df_train_lab['image'] = x_train_lab
df_train_lab['target'] = y_train_lab
df_train_lab['is_valid'] = False
df_train_lab['is_labeled'] = True

df_train_unlab = pd.DataFrame([])
df_train_unlab['image'] = x_train_unlab
df_train_unlab['target'] = y_train_unlab
df_train_unlab['is_valid'] = False
df_train_unlab['is_labeled'] = False



df_full = pd.concat([df_train_lab, df_train_unlab, df_valid], axis=0).reset_index()

In [16]:
df_full

Unnamed: 0,index,image,target,is_valid,is_labeled
0,5611,lower-gi-tract/pathological-findings/ulcerativ...,3,False,True
1,8744,upper-gi-tract/pathological-findings/esophagit...,9,False,True
2,915,lower-gi-tract/pathological-findings/polyps/96...,4,False,True
3,2369,upper-gi-tract/pathological-findings/esophagit...,11,False,True
4,2239,lower-gi-tract/pathological-findings/ulcerativ...,6,False,True
...,...,...,...,...,...
2637,9965,lower-gi-tract/pathological-findings/polyps/87...,4,True,True
2638,4471,upper-gi-tract/pathological-findings/esophagit...,9,True,True
2639,1549,lower-gi-tract/pathological-findings/polyps/aa...,4,True,True
2640,8782,lower-gi-tract/pathological-findings/ulcerativ...,6,True,True


In [15]:
df_full.to_csv('../resource/hyper_kvasir/df_pathologies.csv', index=False, header=True)

In [11]:
# x = df_anno['path']
# y = df_anno['Categories']

# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
# x_train_lab, x_train_unlab, y_train_lab, y_train_unlab = train_test_split(x_train, 
#                                                                           y_train, 
#                                                                           test_size=0.32, 
#                                                                           random_state=0)


# df_train_lab = pd.DataFrame([])
# df_train_lab = pd.DataFrame([])
# df_train_lab['image'] = x_train_lab
# df_train_lab['target'] = y_train_lab
# df_train_lab['is_valid'] = False
# df_train_lab['is_labeled'] = True

# df_train_unlab = pd.DataFrame([])
# df_train_unlab = pd.DataFrame([])
# df_train_unlab['image'] = x_train_unlab
# df_train_unlab['target'] = y_train_unlab
# df_train_unlab['is_valid'] = False
# df_train_unlab['is_labeled'] = False


# df_valid['image'] = x_test
# df_valid['target'] = y_test
# df_valid['is_valid'] = True
# df_valid['is_labeled'] = True

In [12]:
# df_full = pd.concat([df_train_lab, df_train_unlab, df_valid], axis=0).reset_index()
# del df_full['index']
# df_full

In [13]:
# fig = df_full['target'].value_counts().plot(kind='bar', figsize=(10, 8), fontsize=10, rot=0).get_figure()

In [23]:
# df_full.to_csv('../resource/hyper_kvasir/df_abnormal.csv', index=False, header=True)

In [15]:
# cnt_train = df_train['target'].value_counts()
# cnt_valid = df_valid['target'].value_counts()

# df_cnt = pd.DataFrame({'train': cnt_train,'valid': cnt_valid})
# chart = df_cnt.plot.bar(rot=0, figsize=(10, 8), fontsize=15, stacked=True).get_figure()
# # chart.savefig('../resource/visualize/visual_sup.png')

In [69]:
# cnt_train = df_train['Groupby_Categories']
# cnt_train_labeled, cnt_train_unlabeled = train_test_split(cnt_train, test_size = 0.9, random_state = 0)
# cnt_train_labeled, cnt_train_unlabeled = cnt_train_labeled.value_counts(), cnt_train_unlabeled.value_counts()
# cnt_valid = df_valid['Groupby_Categories'].value_counts()

# df_cnt = pd.DataFrame({'train_labeled': cnt_train_labeled, 
                       
#                        'train_unlabeled': cnt_train_unlabeled,
#                        'valid': cnt_valid,
#                        })
# chart_semi = df_cnt.plot.bar(rot=0, 
#                         figsize=(10, 8), 
#                         fontsize=15,
#                         color={"train_labeled": "#1f76b5", "train_unlabeled": "green", "valid":"#fe7e0e"},
#                         stacked=True).get_figure()
# chart_semi.savefig('../resource/visualize/visual_semi_1_9.png')

In [70]:
# cnt_train_labeled, cnt_train_unlabeled

In [71]:
# df_train['Groupby_Categories'].value_counts()
# fig_train.savefig('../resource/visualize/groupcat_train.png')

In [72]:
# df_valid['Groupby_Categories'].value_counts()
# fig_valid.savefig('../resource/visualize/groupcat_valid.png')

In [14]:
# fig.savefig('../resource/visualize/groupcat.png')

In [15]:
# df_full.to_csv('../resource/hyper_kvasir/full_labeled_group.csv', index=False, header=True)