In [101]:
import torch
from torch.utils.data import DataLoader
from torchvision.transforms import transforms
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold

import pandas as pd
import numpy as np

from PIL import Image
import random
from tqdm.notebook import tqdm
from script.tool import ROOT_NFS, ROOT_NFS_DATA, ROOT_NFS_TEST

In [2]:
# Dataset folder under the shared data root (ROOT_NFS_DATA comes from script.tool).
path_dataset = ROOT_NFS_DATA / 'Cosmenet_product_20231018'

# Torch device handle pinned to the CPU.
device = torch.device("cpu")

# Read the CSV index of the dataset; the preview below shows the columns
# file_names / labels / images_path.
annotation_csv = path_dataset / 'datas_20231018.csv'
df_pd = pd.read_csv(annotation_csv)
df_pd.head(1)

Unnamed: 0,file_names,labels,images_path
0,14624_14.jpg,14624,/app/nfs_clientshare/Datasets/Cosmenet_product...


In [3]:
# One row per label with the number of rows carrying that label ('count'),
# ordered from the most to the least populated label.
group_df = (
    df_pd
    .groupby(['labels'])['labels']
    .count()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
group_df.head(1)

Unnamed: 0,labels,count
4172,50348,100


In [4]:
# Rows-per-label counts drive the bucketing below.
images_per_label = group_df['count']

# Labels having 2..8 rows, and labels having at most 8 rows (single-row labels included).
filter_img_2_to_8 = group_df.loc[images_per_label.between(2, 8), 'labels'].values
filter_img_1_to_8 = group_df.loc[images_per_label.le(8), 'labels'].values

# df_more_8: rows whose label has more than 8 rows.
# df_2_to_8: rows whose label has 2..8 rows.
# Rows of single-row labels end up in neither frame.
row_labels = df_pd['labels']
df_more_8 = df_pd[~row_labels.isin(filter_img_1_to_8)]
df_2_to_8 = df_pd[row_labels.isin(filter_img_2_to_8)]

In [110]:
# Split the rows of the 2..8-image labels into two label-stratified folds and
# keep only the first fold: its train indices form df_2_to_8_train and its
# test indices form df_2_to_8_test.
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# `split` returns a generator; the built-in next() pulls its first
# (train, test) index pair instead of calling the `.__next__()` dunder directly.
train_2_to_8, test_2_to_8 = next(skf.split(df_2_to_8, df_2_to_8['labels']))

# The yielded indices are positional, hence .iloc rather than .loc.
df_2_to_8_train = df_2_to_8.iloc[train_2_to_8]
df_2_to_8_test = df_2_to_8.iloc[test_2_to_8]
df_2_to_8_train.head(1)

Unnamed: 0,file_names,labels,images_path
322,39856_2.png,39856,/app/nfs_clientshare/Datasets/Cosmenet_product...


In [5]:
# Single label-stratified shuffle split of the ">8 images" rows:
# 80% of the rows go to df_train, 20% to df_test.
sss_train = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# `split` returns a generator; the built-in next() takes its only
# (train, test) index pair instead of calling the `.__next__()` dunder directly.
train_idx, test_idx = next(sss_train.split(df_more_8, df_more_8['labels']))

# The yielded indices are positional, hence .iloc rather than .loc.
df_train = df_more_8.iloc[train_idx]
df_test = df_more_8.iloc[test_idx]
df_train.head(1)

Unnamed: 0,file_names,labels,images_path
3177,11596_2.jpg,11596,/app/nfs_clientshare/Datasets/Cosmenet_product...


In [6]:
# Size summary of the raw table and of each label bucket.
# Uses the built-in len() rather than calling `.__len__()` directly, matching
# the later split-summary cell; the printed text is unchanged.
print(f"amount of all data : {len(df_pd)}")
print(f"amount of all class : {len(group_df)}")
print(f"amount of data 2-8 img : {len(df_2_to_8)}")
print(f"amount of 2-8 img class : {len(filter_img_2_to_8)}")
print(f"amount of data more 8 img : {len(df_more_8)}")
print(f"amount of more 8 img class : {len(group_df[group_df['count'] > 8]['labels'])}")
# A label with a single row contributes exactly one row, so this one number is
# both the row count and the label count of the single-row bucket.
print(f"amount of data & class only one : {len(group_df[group_df['count'] == 1]['labels'])}")

amount of all data : 60196
amount of all class : 4178
amount of data 2-8 img : 1548
amount of 2-8 img class : 278
amount of data more 8 img : 58631
amount of more 8 img class : 3883
amount of data & class only one : 17


In [7]:
# Label-level table (one row per label) restricted to labels with more than 8 rows.
# `.copy()` makes it an independent frame: without it the `.loc` assignment at the
# end of this cell writes into a slice of group_df and pandas emits
# SettingWithCopyWarning. `count > 8` already implies `count > 1`, so the
# redundant second condition is dropped.
df_count_8 = group_df[group_df['count'] > 8].copy()

# For every distinct per-label count, how many labels have exactly that count.
group_df_count_8 = (
    df_count_8
    .groupby(['count'])['count']
    .count()
    .reset_index(name='counter_count')
    .sort_values(['counter_count'], ascending=False)
)

# Count values that occur for one label only.
singleton_counts = group_df_count_8[group_df_count_8["counter_count"] == 1]["count"]
counter_count_1 = singleton_counts.values
ind_c = singleton_counts.index  # not used in this cell; kept in case a later cell reads it

# Re-tag those one-off counts with the shared value 101, which is above the
# largest count shown in the group_df preview (100).
# NOTE(review): presumably this is so the next cell can stratify on 'count'
# (StratifiedShuffleSplit rejects a stratum with a single member) - confirm.
# After this line 'count' is a stratification key, no longer the true row count.
df_count_8.loc[df_count_8["count"].isin(counter_count_1), "count"] = 101

In [88]:
# Partition the labels themselves (not the image rows): 18% of the labels in
# df_count_8 become validation labels, the rest stay as "split" labels.
# Stratification uses the 'count' column of df_count_8.
sss_val = StratifiedShuffleSplit(n_splits=1, test_size=0.18, random_state=42)

# `split` returns a generator; the built-in next() takes its only index pair
# instead of calling the `.__next__()` dunder directly.
split_idx, val_idx = next(sss_val.split(df_count_8, df_count_8['count']))

# Positional indices -> label ids of each partition.
split_class = df_count_8.iloc[split_idx]["labels"].values
val_class = df_count_8.iloc[val_idx]["labels"].values
val_class[:5]

array([46783, 44043, 40575, 44033, 48695])

In [94]:
# Route the rows of the train/test frames by which label partition they belong to.
train_row_labels = df_train["labels"]
test_row_labels = df_test["labels"]

# Rows whose label is in the "split" partition.
df_train_split = df_train.loc[train_row_labels.isin(split_class)]
df_test_split = df_test.loc[test_row_labels.isin(split_class)]

# Rows whose label is in the validation partition.
df_train_val = df_train.loc[train_row_labels.isin(val_class)]
df_test_val = df_test.loc[test_row_labels.isin(val_class)]

In [111]:
# Append the matching half of the 2..8-image rows to each validation frame
# (validation rows first, original row indices kept).
df_train_val_mix, df_test_val_mix = (
    pd.concat(parts)
    for parts in (
        [df_train_val, df_2_to_8_train],
        [df_test_val, df_2_to_8_test],
    )
)

In [112]:
# Row count and distinct-label count for every frame produced above.
# One loop over a name -> frame mapping replaces twelve copy-pasted print
# lines; the printed text and its order are unchanged.
summary_frames = {
    "train split": df_train_split,
    "test split": df_test_split,
    "train val": df_train_val,
    "test val": df_test_val,
    "train val mix": df_train_val_mix,
    "test val mix": df_test_val_mix,
}
for frame_name, frame in summary_frames.items():
    print(f"amount of {frame_name} : {len(frame)}")
    print(f"amount of {frame_name} class : {frame['labels'].nunique()}")

amount of train split : 38474
amount of train split class : 3184
amount of test split : 9620
amount of test split class : 3184
amount of train val : 8430
amount of train val class : 699
amount of test val : 2107
amount of test val class : 699
amount of train val mix : 9204
amount of train val mix class : 977
amount of test val mix : 2881
amount of test val mix class : 977
