In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Use the D2Coding font for all figure text (presumably so Korean class labels render — confirm the font is installed).
plt.rcParams['font.family'] = 'D2Coding' 
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False 

import sys
sys.path.append('../')
from utils.util import get_parent_path

In [None]:
import matplotlib.font_manager as fm

# Dump every font registered with matplotlib as "<family name> : <file path>".
for font_entry in fm.fontManager.ttflist:
    print(f"{font_entry.name} : {font_entry.fname}")

In [None]:
# Show the path returned by get_parent_path(); later cells build data paths from it.
print(get_parent_path())

In [None]:
# Training images are read from <get_parent_path()>/data/train.
data_dir = os.path.join(get_parent_path(), 'data', 'train')

In [None]:
# Display the resolved training-data directory.
data_dir

In [None]:
# Entries of the training directory in sorted order, plus how many there are.
folder_ls = sorted(os.listdir(data_dir))
print(len(folder_ls))

In [None]:
# Keep only real class folders: skip hidden entries (names starting with '.')
# and anything that is not a directory, because the file-listing step calls
# os.listdir() on every kept entry and would fail on a stray regular file.
folder_ls_fl = [
    folder for folder in folder_ls
    if not folder.startswith('.') and os.path.isdir(os.path.join(data_dir, folder))
]

In [None]:
# Number of entries left after dropping names that start with '.'.
len(folder_ls_fl)

In [None]:
# Build one flat list with the full path of every file in every class folder.
# Hidden files inside a class folder (names starting with '.') are skipped,
# mirroring the hidden-entry filter applied to the folder list, so that
# OS metadata files do not end up as image rows.
file_ls = []
for folder in folder_ls_fl:
    folder_path = os.path.join(data_dir, folder)
    file_ls += [
        os.path.join(folder_path, file)
        for file in os.listdir(folder_path)
        if not file.startswith('.')
    ]

In [None]:
# Total number of collected file paths.
len(file_ls)

In [None]:
# Peek at the first five paths (before the list is sorted).
print(file_ls[:5])

In [None]:
# Order the paths lexicographically so the row order is deterministic.
file_ls.sort()

In [None]:
# Peek at the first five paths again, now in sorted order.
print(file_ls[:5])

In [None]:
# One row per file: full path, file name, and the name of its parent folder.
df = pd.DataFrame(file_ls, columns=['image_path'])
df['image_name'] = df['image_path'].map(os.path.basename)
df['folder_name'] = df['image_path'].map(os.path.dirname).map(os.path.basename)

In [None]:
# The class label is the folder name, stored as a pandas categorical column.
df['label'] = df['folder_name'].astype('category')

In [None]:
# Give every class a stable integer index: labels are sorted, then numbered from 0.
unique_labels = sorted(df['label'].unique())
label_to_index = {label: idx for idx, label in enumerate(unique_labels)}
# 'label' is categorical, so .map() returns a categorical result; cast it to a
# plain integer column so label_index behaves like ordinary numeric data.
df['label_index'] = df['label'].map(label_to_index).astype('int64')


print("총 클래스 수:", len(unique_labels))
print(df.head())
# NOTE(review): this writes to the current working directory, while a later
# cell reads data/train_csv/train_mapped.csv under get_parent_path() — confirm
# the file is moved/copied there.
df.to_csv('train_mapped.csv', index=False)

In [None]:
# Images per class; value_counts() orders the result from most to least frequent.
label_counts = df['label'].value_counts()

# Bar chart of the top_n classes with the most images.
top_n = 50
fig, ax = plt.subplots(figsize=(12, 6))
label_counts.head(top_n).plot.bar(ax=ax)
ax.set(
    title=f'Top {top_n} Most Frequent Classes',
    ylabel='Image Count',
    xlabel='Class Label',
)
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Images per class, ordered from most to least frequent.
label_counts = df['label'].value_counts()

# Bar chart of the last top_n entries, i.e. the classes with the fewest images.
top_n = 50
fig, ax = plt.subplots(figsize=(12, 6))
rarest_counts = label_counts.tail(top_n)
rarest_counts.plot.bar(ax=ax)
ax.set(
    title=f'Top {top_n} Less Frequent Classes',
    ylabel='Image Count',
    xlabel='Class Label',
)
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Locate the label-mapped CSV under <get_parent_path()>/data/train_csv and load it.
train_csv_dir = get_parent_path() / "data" / "train_csv"
data_frame_path = train_csv_dir / "train_mapped.csv"
origin = pd.read_csv(data_frame_path)

In [None]:
# Preview the first rows of the loaded table.
origin.head()

In [None]:
from sklearn.model_selection import train_test_split
import torch

# Work on a copy so the loaded table `origin` stays untouched.
df = origin.copy()
# 80/20 split, stratified on label_index so both parts keep the class
# proportions; the fixed random_state makes the split reproducible.
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label_index'], random_state=42)

# Rebuild the label -> index mapping from the sorted label names.
unique_labels = sorted(df['label'].unique())
label_to_index = {label: idx for idx, label in enumerate(unique_labels)}

# .assign() returns new frames instead of writing into the slices produced by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
train_df = train_df.assign(label_index=train_df['label'].map(label_to_index))
val_df = val_df.assign(label_index=val_df['label'].map(label_to_index))

# Number of training images per class index, in index order.
class_counts = train_df['label_index'].value_counts().sort_index()
print(class_counts)

total = class_counts.sum()

print(total)

# Inverse-frequency weights: the rarer the class, the larger its weight.
class_weights = [total / c for c in class_counts]
print(class_weights)


In [None]:
# Label indices of the training split as a NumPy array, and how many distinct classes it contains.
y_train = train_df['label_index'].values
num_classes = len(np.unique(y_train))
# Keep the original (misspelled) name as an alias so any existing reference still works.
num_clasees = num_classes

In [None]:
from sklearn.utils.class_weight import compute_class_weight
# 'balanced' class weights for the training labels. The class list is
# 0..K-1, where K is the number of distinct label indices in the train split.
train_labels = train_df['label_index'].values
class_weight = compute_class_weight(
    class_weight='balanced',
    classes=np.arange(len(np.unique(train_labels))),
    y=train_labels,
)

In [None]:
# One weight per class: print how many were computed.
print(len(class_weight))

In [None]:
# Bar chart of the number of training images for every class index.
fig, ax = plt.subplots(figsize=(14, 6))
class_counts.plot.bar(ax=ax)
ax.set(
    title='Class Distribution in Train Set (After Mapping)',
    xlabel='Class Index',
    ylabel='Number of Images',
)
plt.tight_layout()
plt.show()

In [None]:
# Locate the class with the fewest training images.
min_class_index = class_counts.idxmin()
min_class_count = class_counts.loc[min_class_index]

# Reverse lookup: every label name whose index equals that class index.
min_class_label = [
    name for name, position in label_to_index.items()
    if position == min_class_index
]

# Cell output: (class index, matching label names, image count).
(min_class_index, min_class_label, min_class_count)

In [None]:
# Reload the label-mapped training table from the project's data directory.
# The path is built from get_parent_path() (the same way `origin` is loaded)
# instead of a user-specific absolute path, so the cell runs on any checkout.
# Note: this rebinds train_df, replacing the train split computed earlier.
train_df = pd.read_csv(get_parent_path() / "data" / "train_csv" / "train_mapped.csv")

In [None]:
# Display the reloaded table.
train_df

## Filtering Noise Data

### Strategies

- 1. Pseudo Labeling
- 2. Image Quality Assessment
- 3. Image Embedding
- 4. Image Clustering
- 5. Checking Manually

In [17]:
import os
from PIL import Image
import cv2
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Same plotting setup as the first cell: D2Coding font, ASCII hyphen for minus signs.
plt.rcParams['font.family'] = 'D2Coding' 
plt.rcParams['axes.unicode_minus'] = False 

In [18]:
# Load the label-mapped training table.
# NOTE(review): absolute, machine-specific Windows path; other cells read the
# same file name through different roots — consider one configurable data root.
train_df = pd.read_csv(r'C:\works\dacon\img_clf\data\train_csv\train_mapped.csv')

In [19]:
# Count rows with a missing label_index.
train_df['label_index'].isna().sum()

0

In [None]:
# Load per-image statistics (later cells use the image_name, mean_brightness, blur_score and exposure columns).
# NOTE(review): the variable name is a misspelling of "image_stats" and is used as-is by later cells; the path is absolute and machine-specific.
iamge_stats = pd.read_csv(r'C:\works\dacon\img_clf\data\image_stats.csv')

In [None]:
# Summary statistics of the numeric columns.
iamge_stats.describe()

In [None]:
# Rows whose mean_brightness is below 60.
iamge_stats.loc[iamge_stats['mean_brightness'] < 60]

In [None]:
# Display the training table.
train_df

In [None]:
# Lookup table: image file name -> full image path, taken from train_df.
# The stored image_name column is used instead of os.path.basename(), whose
# result depends on the OS path separator and therefore breaks when the CSV's
# paths were written on a different platform than the one running this cell.
image_dict = dict(zip(train_df['image_name'], train_df['image_path']))

In [None]:
# Attach each image's full path to the stats table by looking up its file
# name in image_dict; names that are not in image_dict become NaN.

iamge_stats['image_path'] = iamge_stats['image_name'].map(image_dict)

In [None]:
# Preview the stats table with the new image_path column.
iamge_stats.head()

In [None]:
# Display every image whose mean brightness is below 60.
dark_rows = iamge_stats[iamge_stats['mean_brightness'] < 60]
for _, row in dark_rows.iterrows():
    img_path = row['image_path']
    # Read the raw bytes with NumPy and decode them in memory
    # (presumably to cope with non-ASCII file paths — confirm).
    raw_bytes = np.fromfile(img_path, dtype=np.uint8)
    image = cv2.imdecode(raw_bytes, cv2.IMREAD_COLOR)
    if image is None:
        print(f"Could not read image at {img_path}")
        continue
    # OpenCV decodes to BGR; convert to RGB for matplotlib.
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.title(f"Image: {row['image_name']}, Brightness: {row['mean_brightness']:.2f}")
    plt.axis('off')
    plt.show()

In [None]:
# Display every image whose blur_score exceeds 15000.
for index, row in iamge_stats[iamge_stats['blur_score'] > 15000].iterrows():
    img_path = row['image_path']
    # Read the raw bytes with NumPy and decode them in memory
    # (presumably to cope with non-ASCII file paths — confirm).
    stream = np.fromfile(img_path, dtype=np.uint8)
    img = cv2.imdecode(stream, cv2.IMREAD_COLOR)
    if img is not None:
        # OpenCV decodes to BGR; convert to RGB for matplotlib.
        plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        # The value shown is blur_score, so label it as such (it was mislabeled "Brightness").
        plt.title(f"Image: {row['image_name']}, Blur score: {row['blur_score']:.2f}")
        plt.axis('off')
        plt.show()
    else:
        print(f"Could not read image at {img_path}")

In [None]:
# Display every image whose exposure value is below 50.
low_exposure_rows = iamge_stats[iamge_stats['exposure'] < 50]
for _, row in low_exposure_rows.iterrows():
    img_path = row['image_path']
    # Read the raw bytes with NumPy and decode them in memory
    # (presumably to cope with non-ASCII file paths — confirm).
    raw_bytes = np.fromfile(img_path, dtype=np.uint8)
    img = cv2.imdecode(raw_bytes, cv2.IMREAD_COLOR)
    if img is None:
        print(f"Could not read image at {img_path}")
        continue
    # OpenCV decodes to BGR; convert to RGB for matplotlib.
    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.title(f"Image: {row['image_name']}, exposure: {row['exposure']:.2f}")
    plt.axis('off')
    plt.show()

## 1. Pseudo Labeling

## 2. Image Quality Assessment

## 3. Image Embedding

## 4. Image clustering

## 5. Checking manually

In [8]:
# Load train_mapped_2.csv as the table to be noise-filtered.
# NOTE(review): no visible cell writes train_mapped_2.csv, and the path is
# absolute (/workspace/...) — confirm where this file comes from.
train_df = pd.read_csv('/workspace/img_clf/data/train_csv/train_mapped_2.csv')

In [9]:
# Display the loaded table.
train_df

Unnamed: 0,image_path,image_name,folder_name,label,label_index
0,C:\works\dacon\img_clf\data\train\1시리즈_F20_201...,1시리즈_F20_2013_2015_0000.jpg,1시리즈_F20_2013_2015,1시리즈_F20_2013_2015,0
1,C:\works\dacon\img_clf\data\train\1시리즈_F20_201...,1시리즈_F20_2013_2015_0001.jpg,1시리즈_F20_2013_2015,1시리즈_F20_2013_2015,0
2,C:\works\dacon\img_clf\data\train\1시리즈_F20_201...,1시리즈_F20_2013_2015_0002.jpg,1시리즈_F20_2013_2015,1시리즈_F20_2013_2015,0
3,C:\works\dacon\img_clf\data\train\1시리즈_F20_201...,1시리즈_F20_2013_2015_0003.jpg,1시리즈_F20_2013_2015,1시리즈_F20_2013_2015,0
4,C:\works\dacon\img_clf\data\train\1시리즈_F20_201...,1시리즈_F20_2013_2015_0004.jpg,1시리즈_F20_2013_2015,1시리즈_F20_2013_2015,0
...,...,...,...,...,...
32758,C:\works\dacon\img_clf\data\train\프리우스_C_2018_...,프리우스_C_2018_2020_0081.jpg,프리우스_C_2018_2020,프리우스_C_2018_2020,395
32759,C:\works\dacon\img_clf\data\train\프리우스_C_2018_...,프리우스_C_2018_2020_0082.jpg,프리우스_C_2018_2020,프리우스_C_2018_2020,395
32760,C:\works\dacon\img_clf\data\train\프리우스_C_2018_...,프리우스_C_2018_2020_0083.jpg,프리우스_C_2018_2020,프리우스_C_2018_2020,395
32761,C:\works\dacon\img_clf\data\train\프리우스_C_2018_...,프리우스_C_2018_2020_0084.jpg,프리우스_C_2018_2020,프리우스_C_2018_2020,395


In [6]:
import os
import shutil

# List of image file names to exclude (first batch); rows with these
# image_name values are removed from train_df further below.
file_names = [
    "5시리즈_G60_2024_2025_0010.jpg",
    "6시리즈_GT_G32_2018_2020_0018.jpg",
    "7시리즈_G11_2016_2018_0040.jpg",
    "911_992_2020_2024_0030.jpg",
    "E_클래스_W212_2010_2016_0022.jpg",
    "K5_2세대_2016_2018_0007.jpg",
    "F150_2004_2021_0018.jpg",
    "G_클래스_W463b_2019_2025_0030.jpg",
    "GLE_클래스_W167_2019_2024_0068.jpg",
    "Q5_FY_2021_2024_0032.jpg",
    "Q30_2017_2019_0075.jpg",
    "Q50_2014_2017_0031.jpg",
    "SM7_뉴아트_2008_2011_0053.jpg",
    "X3_G01_2022_2024_0029.jpg",
    "XF_X260_2016_2020_0023.jpg",
    "뉴_ES300h_2013_2015_0000.jpg",
    "뉴_G80_2025_2026_0042.jpg",
    "뉴_G80_2025_2026_0043.jpg",
    "뉴_SM5_임프레션_2008_2010_0033.jpg",
    "더_기아_레이_EV_2024_2025_0078.jpg",
    "더_뉴_K3_2세대_2022_2024_0001.jpg",
    "더_뉴_그랜드_스타렉스_2018_2021_0078.jpg",
    "더_뉴_그랜드_스타렉스_2018_2021_0079.jpg",
    "더_뉴_그랜드_스타렉스_2018_2021_0080.jpg",
    "더_뉴_아반떼_2014_2016_0031.jpg",
    "더_뉴_파사트_2012_2019_0067.jpg",
    "레니게이드_2019_2023_0041.jpg",
    "박스터_718_2017_2024_0011.jpg",
    "싼타페_TM_2019_2020_0009.jpg",
    "아반떼_MD_2011_2014_0081.jpg",
    "아반떼_N_2022_2023_0064.jpg",
    "익스플로러_2016_2017_0072.jpg",
    "콰트로포르테_2017_2022_0074.jpg",
    "프리우스_4세대_2019_2022_0052.jpg",
    "아반떼_N_2022_2023_0035.jpg"
]

In [16]:
# Second batch of image file names to exclude; applied after file_names
# in the filtering cells below.
file_list = [
    "E_클래스_W212_2010_2016_0069.jpg",
    "ES300h_7세대_2019_2026_0028.jpg",
    "G_클래스_W463_2009_2017_0011.jpg",
    "GLB_클래스_X247_2020_2023_0008.jpg",
    "GLS_클래스_X167_2020_2024_0013.jpg",
    "K3_2013_2015_0045.jpg",
    "K5_3세대_2020_2023_0081.jpg",
    "Q7_4M_2020_2023_0011.jpg",
    "RAV4_5세대_2019_2024_0020.jpg",
    "S_클래스_W223_2021_2025_0008.jpg",
    "S_클래스_W223_2021_2025_0071.jpg",
    "X4_F26_2015_2018_0068.jpg",
    "그랜드_체로키_WL_2021_2023_0018.jpg",
    "레이_2012_2017_0063.jpg",
    "레인지로버_5세대_2023_2024_0030.jpg",
    "레인지로버_스포츠_2세대_2018_2022_0014.jpg",
    "레인지로버_스포츠_2세대_2018_2022_0017.jpg",
    "마칸_2019_2021_0035.jpg",
    "머스탱_2015_2023_0086.jpg",
    "아반떼_MD_2011_2014_0009.jpg",
    "아반떼_MD_2011_2014_0082.jpg",
    "컨티넨탈_GT_3세대_2018_2023_0007.jpg",
    "타이칸_2021_2025_0065.jpg",
    "파나메라_2010_2016_0000.jpg",
    "파나메라_2010_2016_0036.jpg",
    "3시리즈_F30_2013_2018_0036.jpg",
    "4시리즈_F32_2014_2020_0027.jpg",
    "5시리즈_G60_2024_2025_0056.jpg",
    "7시리즈_F01_2009_2015_0029.jpg",
    "7시리즈_F01_2009_2015_0044.jpg",
    "911_992_2020_2024_0006.jpg",
    "C_클래스_W204_2008_2015_0068.jpg",
    "CLS_클래스_C257_2019_2023_0021.jpg"
]

In [20]:
# Third batch of image file names to exclude; applied last in the
# filtering cells below.
file_list_2 = [
    "Q30_2017_2019_0074.jpg",
    "글래디에이터_JT_2020_2023_0075.jpg",
    "뉴_CC_2012_2016_0001.jpg",
    "뉴_CC_2012_2016_0002.jpg",
    "더_뉴_코나_2021_2023_0081.jpg",
    "2시리즈_액티브_투어러_U06_2022_2024_0004.jpg",
    "A8_D5_2018_2023_0084.jpg"
]

In [11]:
# Number of file names in the first exclusion batch.
len(file_names)

35

In [23]:
# Step 1: drop rows whose image_name appears in the first exclusion batch.
filter_df = train_df[~train_df['image_name'].isin(file_names)]

In [24]:
# Step 2: drop rows whose image_name appears in the second exclusion batch.
filter_filter_Df = filter_df[~filter_df['image_name'].isin(file_list)]

In [25]:
# Rows remaining after the first two exclusion batches.
len(filter_filter_Df)

32719

In [26]:
# Step 3: drop rows whose image_name appears in the third exclusion batch.
noise_filter_df = filter_filter_Df[~filter_filter_Df['image_name'].isin(file_list_2)]

In [27]:
# Display the table after all three exclusion batches.
noise_filter_df

Unnamed: 0,image_path,image_name,folder_name,label,label_index
0,C:\works\dacon\img_clf\data\train\1시리즈_F20_201...,1시리즈_F20_2013_2015_0000.jpg,1시리즈_F20_2013_2015,1시리즈_F20_2013_2015,0
1,C:\works\dacon\img_clf\data\train\1시리즈_F20_201...,1시리즈_F20_2013_2015_0001.jpg,1시리즈_F20_2013_2015,1시리즈_F20_2013_2015,0
2,C:\works\dacon\img_clf\data\train\1시리즈_F20_201...,1시리즈_F20_2013_2015_0002.jpg,1시리즈_F20_2013_2015,1시리즈_F20_2013_2015,0
3,C:\works\dacon\img_clf\data\train\1시리즈_F20_201...,1시리즈_F20_2013_2015_0003.jpg,1시리즈_F20_2013_2015,1시리즈_F20_2013_2015,0
4,C:\works\dacon\img_clf\data\train\1시리즈_F20_201...,1시리즈_F20_2013_2015_0004.jpg,1시리즈_F20_2013_2015,1시리즈_F20_2013_2015,0
...,...,...,...,...,...
32758,C:\works\dacon\img_clf\data\train\프리우스_C_2018_...,프리우스_C_2018_2020_0081.jpg,프리우스_C_2018_2020,프리우스_C_2018_2020,395
32759,C:\works\dacon\img_clf\data\train\프리우스_C_2018_...,프리우스_C_2018_2020_0082.jpg,프리우스_C_2018_2020,프리우스_C_2018_2020,395
32760,C:\works\dacon\img_clf\data\train\프리우스_C_2018_...,프리우스_C_2018_2020_0083.jpg,프리우스_C_2018_2020,프리우스_C_2018_2020,395
32761,C:\works\dacon\img_clf\data\train\프리우스_C_2018_...,프리우스_C_2018_2020_0084.jpg,프리우스_C_2018_2020,프리우스_C_2018_2020,395


In [29]:
# Save the noise-filtered table as train_mapped_3.csv, without the index column.
# NOTE(review): absolute, environment-specific output path.
noise_filter_df.to_csv('/workspace/img_clf/data/train_csv/train_mapped_3.csv', index=False)