In [1]:
import warnings
warnings.filterwarnings('ignore')

from glob import glob
import pandas as pd
import numpy as np

from tqdm import tqdm
import os

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

import joblib

In [2]:
# Config: glob patterns for the train/test images and the path of the training CSV.
train_png_path, test_png_path, train_pandas_path = (
    "./data/train/*.png",
    "./data/test/*.png",
    "./data/train_df.csv",
)

In [3]:
# List the image files matching each glob pattern, ordered lexicographically.
train_png = glob(train_png_path)
train_png.sort()
test_png = glob(test_png_path)
test_png.sort()

# Table describing the training images, read from the CSV configured above.
train_data = pd.read_csv(train_pandas_path)

In [4]:
# Display the training table as read from the CSV.
train_data

Unnamed: 0,index,file_name,class,state,label
0,0,10000.png,transistor,good,transistor-good
1,1,10001.png,capsule,good,capsule-good
2,2,10002.png,transistor,good,transistor-good
3,3,10003.png,wood,good,wood-good
4,4,10004.png,bottle,good,bottle-good
...,...,...,...,...,...
4272,4272,14272.png,transistor,good,transistor-good
4273,4273,14273.png,transistor,good,transistor-good
4274,4274,14274.png,grid,good,grid-good
4275,4275,14275.png,zipper,good,zipper-good


In [5]:
# Column of image file names; binarySearch() reads this module-level name.
file_names = train_data.loc[:, "file_name"]

In [6]:
def binarySearch(x):
    """Return True if ``x`` equals an entry of the module-level ``file_names``.

    This is a classic bisection, so the result is only meaningful when
    ``file_names`` is in ascending order and ``file_names[i]`` yields the i-th
    entry for every ``i`` in ``0 .. len(file_names) - 1``.

    Args:
        x: The value to look for (compared with ``==`` and ``<``).

    Returns:
        True when a probed entry equals ``x``; False once the search window
        is empty.
    """
    lo = 0
    hi = len(file_names) - 1
    while lo <= hi:
        pivot = (lo + hi) // 2
        candidate = file_names[pivot]
        if candidate == x:
            return True
        if x < candidate:
            # Target sorts before the probe: keep the lower half.
            hi = pivot - 1
        else:
            # Target sorts after the probe: keep the upper half.
            lo = pivot + 1
    return False

In [7]:
# Resolve every training image to an absolute path and make sure each one is
# listed in the CSV's file_name column.
data_path = []
for path in train_png:
    ppath = os.path.abspath(path)
    # basename() honours the platform's path separator; splitting on '/'
    # returns the whole path on Windows, where abspath() uses backslashes.
    filename = os.path.basename(ppath)
    if not binarySearch(filename):
        # Raised explicitly instead of `assert False` so the check still runs
        # under `python -O` and the message names the offending file.
        raise AssertionError(
            f"{ppath} has no matching file_name entry in the training CSV"
        )
    data_path.append(ppath)

# data_path is attached to train_data positionally as the 'path' column, so
# entry i must belong to row i. Fail here rather than store misaligned paths.
found_names = [os.path.basename(p) for p in data_path]
if found_names != list(file_names):
    raise AssertionError(
        "training images on disk do not line up row-for-row with the "
        f"file_name column ({len(found_names)} images vs {len(file_names)} rows)"
    )

In [8]:
# Number of cross-validation folds used to split the training table.
n_folds = 4
kf = StratifiedKFold(n_splits=n_folds)

In [9]:
# Assign each row the number of the fold in which it is part of the validation split.
# -1 marks a row that no validation split has claimed.
n_rows = len(train_data['file_name'])
kfold = [-1 for _ in range(n_rows)]

# Splits are stratified on the 'label' column.
fold_splits = kf.split(train_data['file_name'], train_data['label'])
for fold_id, (_train_rows, val_rows) in enumerate(fold_splits):
    for row in val_rows:
        kfold[row] = fold_id

In [10]:
# Attach the absolute image paths and the fold numbers as new columns
# (both lists are matched to the rows by position).
train_data = train_data.assign(path=data_path, kfold=kfold)

In [11]:
# Display the table again, now including the 'path' and 'kfold' columns.
train_data

Unnamed: 0,index,file_name,class,state,label,path,kfold
0,0,10000.png,transistor,good,transistor-good,/dacon/dacon/anomaly-detection/data/train/1000...,0
1,1,10001.png,capsule,good,capsule-good,/dacon/dacon/anomaly-detection/data/train/1000...,0
2,2,10002.png,transistor,good,transistor-good,/dacon/dacon/anomaly-detection/data/train/1000...,0
3,3,10003.png,wood,good,wood-good,/dacon/dacon/anomaly-detection/data/train/1000...,0
4,4,10004.png,bottle,good,bottle-good,/dacon/dacon/anomaly-detection/data/train/1000...,0
...,...,...,...,...,...,...,...
4272,4272,14272.png,transistor,good,transistor-good,/dacon/dacon/anomaly-detection/data/train/1427...,3
4273,4273,14273.png,transistor,good,transistor-good,/dacon/dacon/anomaly-detection/data/train/1427...,3
4274,4274,14274.png,grid,good,grid-good,/dacon/dacon/anomaly-detection/data/train/1427...,3
4275,4275,14275.png,zipper,good,zipper-good,/dacon/dacon/anomaly-detection/data/train/1427...,3


In [12]:
# Fit an encoder on the 'label' strings, then store their encoded form
# in a new 'encoded_label' column.
label_column = train_data['label']
encoder = LabelEncoder().fit(label_column)
train_data['encoded_label'] = encoder.transform(label_column)

In [13]:
# Persist the fitted encoder to disk with joblib.
with open('encoder.pickle', 'wb') as encoder_file:
    joblib.dump(encoder, encoder_file)

In [14]:
# Write the augmented table without the DataFrame index. `index` is documented
# as a bool; the previous `index=None` only worked because None is falsy.
train_data.to_csv('train_kfold.csv', index=False)