In [None]:
# Core libraries for file handling, tabular data, arrays and plotting.
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Use the D2Coding font for all figure text.
# NOTE(review): presumably chosen so Korean class labels render — confirm the font is installed.
plt.rcParams['font.family'] = 'D2Coding' 
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False 

# Add the parent of the current working directory to the import path so that
# the `utils` package can be imported from this notebook.
import sys
sys.path.append('../')
from utils.util import get_parent_path

In [None]:
import matplotlib.font_manager as fm

# Print the name and file path of every font in matplotlib's font manager list,
# which helps to check that the font set in rcParams is available.
for font in fm.fontManager.ttflist:
    print(f"{font.name} : {font.fname}")

In [None]:
# Show the base path returned by get_parent_path(); the data paths below are built from it.
print(get_parent_path())

In [None]:
# Training images are read from <get_parent_path()>/data/train.
data_dir = os.path.join(get_parent_path(), 'data', 'train')

In [None]:
# Display the resolved training-data directory as the cell output.
data_dir

In [None]:
# Alphabetically sorted names of every entry directly under data_dir.
folder_ls = sorted(os.listdir(data_dir))
print(len(folder_ls))

In [None]:
# Keep only the entries whose name does not begin with a dot (hidden entries are dropped).
folder_ls_fl = [name for name in folder_ls if name[:1] != '.']

In [None]:
# Number of entries left after dropping dot-prefixed names.
len(folder_ls_fl)

In [None]:
# Build one flat list holding the full path of every image file in every class folder.

file_ls = []
for folder in folder_ls_fl:
    folder_path = os.path.join(data_dir, folder)
    # A regular file sitting directly under data_dir is not a class folder;
    # calling os.listdir on it would raise NotADirectoryError.
    if not os.path.isdir(folder_path):
        continue
    # Skip dot-prefixed entries (e.g. .DS_Store) so they are not treated as images,
    # using the same dot-prefix rule that is applied to the folder names.
    file_ls += [
        os.path.join(folder_path, file)
        for file in os.listdir(folder_path)
        if not file.startswith('.')
    ]

In [None]:
# Total number of file paths collected across all folders.
len(file_ls)

In [None]:
# Preview the first five collected paths (before sorting).
print(file_ls[:5])

In [None]:
# Sort the paths lexicographically so the row order of the frame built from them is deterministic.
file_ls = sorted(file_ls)

In [None]:
# Preview the first five paths again, now in sorted order.
print(file_ls[:5])

In [None]:
# One row per image: its full path, its file name, and the name of the folder that contains it.
df = pd.DataFrame({'image_path': file_ls})
df['image_name'] = df['image_path'].map(os.path.basename)
df['folder_name'] = df['image_path'].map(os.path.dirname).map(os.path.basename)

In [None]:
# The containing folder's name is used as the class label, stored as a pandas categorical.
df['label'] = df['folder_name'].astype('category')

In [None]:
# Give every class name an integer index that follows the sorted order of the names.
unique_labels = sorted(df['label'].unique())
label_to_index = dict(zip(unique_labels, range(len(unique_labels))))
df['label_index'] = df['label'].map(label_to_index)

# Report the class count, preview the frame, and save the mapping to the working directory.
# NOTE(review): a later cell reads train_mapped.csv from data/train_csv under
# get_parent_path(), not from the working directory — confirm the file is moved there.
print("총 클래스 수:", len(unique_labels))
print(df.head())
df.to_csv('train_mapped.csv', index=False)

In [None]:
# Images per class; value_counts returns the counts in descending order.
label_counts = df['label'].value_counts()

# Bar chart of the first `top_n` entries, i.e. the classes with the most images.
top_n = 50
fig, ax = plt.subplots(figsize=(12, 6))
label_counts.head(top_n).plot.bar(ax=ax)
ax.set(
    title=f'Top {top_n} Most Frequent Classes',
    ylabel='Image Count',
    xlabel='Class Label',
)
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Recompute the per-class image counts so this cell does not depend on the previous plot cell.
label_counts = df['label'].value_counts()

# Bar chart of the last `top_n` entries of the descending counts, i.e. the rarest classes.
top_n = 50
rare_counts = label_counts.tail(top_n)
fig, ax = plt.subplots(figsize=(12, 6))
rare_counts.plot(kind='bar', ax=ax)
ax.set(
    title=f'Top {top_n} Less Frequent Classes',
    ylabel='Image Count',
    xlabel='Class Label',
)
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# Load the label-mapping CSV from data/train_csv under the path returned by get_parent_path().
data_frame_path = get_parent_path() / "data" /"train_csv" / "train_mapped.csv"
origin = pd.read_csv(data_frame_path)

In [None]:
# Preview the first rows of the loaded mapping table.
origin.head()

In [None]:
from sklearn.model_selection import train_test_split
import torch

# Work on a copy so the frame loaded from disk (`origin`) stays untouched.
df = origin.copy()
# 80/20 split that keeps the class proportions of `label_index` in both parts;
# the fixed random_state makes the split repeatable.
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label_index'], random_state=42)

# Rebuild the name -> index mapping from the sorted class names.
unique_labels = sorted(df['label'].unique())
label_to_index = {label: idx for idx, label in enumerate(unique_labels)}

# .assign returns new frames instead of writing a column into the objects returned
# by train_test_split, which avoids pandas' SettingWithCopyWarning.
train_df = train_df.assign(label_index=train_df['label'].map(label_to_index))
val_df = val_df.assign(label_index=val_df['label'].map(label_to_index))

# Number of training images per class, ordered by class index.
class_counts = train_df['label_index'].value_counts().sort_index()
print(class_counts)

total = class_counts.sum()

print(total)

# Inverse-frequency weights: the fewer images a class has, the larger its weight.
class_weights = [total / c for c in class_counts]
print(class_weights)


In [None]:
# Label indices of the training split as a NumPy array, and the number of distinct classes in it.
y_train = train_df['label_index'].values
num_classes = len(np.unique(y_train))
# Misspelled original name kept as an alias so any cell that still uses it keeps working.
num_clasees = num_classes

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Per-class weights from scikit-learn's 'balanced' heuristic, computed on the training labels.
# `classes` is 0..K-1 where K is the number of distinct labels in the training split.
# NOTE(review): this assumes the label indices in the split are contiguous from 0 — confirm.
train_label_indices = train_df['label_index'].values
class_weight = compute_class_weight(
    class_weight='balanced',
    classes=np.arange(len(np.unique(train_label_indices))),
    y=train_label_indices,
)

In [None]:
# Number of weights computed (one per entry passed as `classes`).
print(len(class_weight))

In [None]:
# Bar chart of the image count for every class index in the training split.
fig, ax = plt.subplots(figsize=(14, 6))
class_counts.plot.bar(ax=ax)
ax.set(
    title='Class Distribution in Train Set (After Mapping)',
    xlabel='Class Index',
    ylabel='Number of Images',
)
plt.tight_layout()
plt.show()

In [None]:
# Class index with the fewest training images, and that image count.
min_class_index = class_counts.idxmin()
min_class_count = class_counts.loc[min_class_index]

# Look the index up in label_to_index to recover the class name(s) mapped to it.
min_class_label = []
for label, idx in label_to_index.items():
    if idx == min_class_index:
        min_class_label.append(label)

# Cell output: (index, [name], count) of the rarest class.
min_class_index, min_class_label, min_class_count

In [None]:
# Reload the full mapping table from the project's data/train_csv folder, the same file
# that is loaded into `origin`. The path is built from get_parent_path() instead of a
# hard-coded user directory so the cell runs on any machine.
# Note: this rebinds `train_df`, replacing the training split created earlier.
train_df = pd.read_csv(get_parent_path() / "data" / "train_csv" / "train_mapped.csv")

In [None]:
# Display the reloaded table as the cell output.
train_df

## Filtering Noise Data

### Strategies

- 1. Pseudo Labeling
- 2. Image Quality Assessment
- 3. Image Embedding
- 4. Image Clustering
- 5. Checking Manually

In [17]:
# Imports for the noise-filtering section: image decoding (PIL, OpenCV), paths, plotting, data.
import os
from PIL import Image
import cv2
from pathlib import Path
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Same plot settings as the first cell: D2Coding font, ASCII hyphen for minus signs.
plt.rcParams['font.family'] = 'D2Coding' 
plt.rcParams['axes.unicode_minus'] = False 

In [18]:
# Load the label-mapping table from the project's data/train_csv folder.
# The path is built from get_parent_path() (imported in the notebook's first cell)
# instead of a hard-coded Windows directory, so the cell runs on any machine.
train_df = pd.read_csv(get_parent_path() / "data" / "train_csv" / "train_mapped.csv")

In [19]:
# Count the rows whose label_index is missing.
train_df['label_index'].isna().sum()

0

In [None]:
# Load the per-image statistics table from the project's data folder.
# The path is built from get_parent_path() (imported in the notebook's first cell)
# instead of a hard-coded Windows directory, so the cell runs on any machine.
# The variable name (sic: "iamge") is kept because the following cells use it.
iamge_stats = pd.read_csv(get_parent_path() / "data" / "image_stats.csv")

In [None]:
# Summary statistics of the numeric image-statistics columns.
iamge_stats.describe()

In [None]:
# Rows whose mean_brightness is strictly below 60.
iamge_stats.loc[iamge_stats['mean_brightness'] < 60]

In [None]:
# Display the mapping table as the cell output.
train_df

In [None]:
# Lookup table from image file name to its full path, built from train_df['image_path'].
# NOTE(review): if two folders contain the same file name, the later path wins — confirm names are unique.
image_dict = dict(zip(train_df['image_path'].map(os.path.basename), train_df['image_path']))

In [None]:
# Attach the full path to each statistics row by looking its image_name up in image_dict.
# Names that are not in image_dict get NaN.

iamge_stats['image_path'] = iamge_stats['image_name'].map(image_dict)

In [None]:
# Preview the statistics table with the newly added image_path column.
iamge_stats.head()

In [None]:
# Display every image whose mean brightness is strictly below the threshold.
BRIGHTNESS_THRESHOLD = 60
for index, row in iamge_stats[iamge_stats['mean_brightness'] < BRIGHTNESS_THRESHOLD].iterrows():
    img_path = row['image_path']
    image = None
    # image_path is filled by a name -> path lookup, so an unmatched name leaves NaN
    # here instead of a path string; such rows are reported below, not read.
    if isinstance(img_path, (str, os.PathLike)):
        try:
            # Read the raw bytes and decode them in memory.
            # NOTE(review): presumably used instead of cv2.imread so that paths OpenCV
            # cannot open directly (e.g. non-ASCII names) still load — confirm.
            stream = np.fromfile(img_path, dtype=np.uint8)
        except OSError:
            # Missing or unreadable file: fall through to the message below
            # instead of aborting the whole loop.
            stream = None
        # cv2.imdecode rejects an empty buffer, so a zero-byte file counts as unreadable.
        if stream is not None and stream.size > 0:
            image = cv2.imdecode(stream, cv2.IMREAD_COLOR)
    if image is not None:
        # OpenCV decodes to BGR; matplotlib expects RGB.
        plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        plt.title(f"Image: {row['image_name']}, Brightness: {row['mean_brightness']:.2f}")
        plt.axis('off')
        plt.show()
    else:
        print(f"Could not read image at {img_path}")

In [None]:
# Display every image whose blur_score is strictly above the threshold.
BLUR_SCORE_THRESHOLD = 15000
for index, row in iamge_stats[iamge_stats['blur_score'] > BLUR_SCORE_THRESHOLD].iterrows():
    img_path = row['image_path']
    img = None
    # image_path is filled by a name -> path lookup, so an unmatched name leaves NaN
    # here instead of a path string; such rows are reported below, not read.
    if isinstance(img_path, (str, os.PathLike)):
        try:
            # Read the raw bytes and decode them in memory.
            stream = np.fromfile(img_path, dtype=np.uint8)
        except OSError:
            # Missing or unreadable file: fall through to the message below
            # instead of aborting the whole loop.
            stream = None
        # cv2.imdecode rejects an empty buffer, so a zero-byte file counts as unreadable.
        if stream is not None and stream.size > 0:
            img = cv2.imdecode(stream, cv2.IMREAD_COLOR)
    if img is not None:
        # OpenCV decodes to BGR; matplotlib expects RGB.
        plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        # The value shown is the blur score, so the title is labelled accordingly
        # (it previously said "Brightness").
        plt.title(f"Image: {row['image_name']}, Blur score: {row['blur_score']:.2f}")
        plt.axis('off')
        plt.show()
    else:
        print(f"Could not read image at {img_path}")

In [None]:
# Display every image whose exposure value is strictly below the threshold.
EXPOSURE_THRESHOLD = 50
for index, row in iamge_stats[iamge_stats['exposure'] < EXPOSURE_THRESHOLD].iterrows():
    img_path = row['image_path']
    img = None
    # image_path is filled by a name -> path lookup, so an unmatched name leaves NaN
    # here instead of a path string; such rows are reported below, not read.
    if isinstance(img_path, (str, os.PathLike)):
        try:
            # Read the raw bytes and decode them in memory.
            stream = np.fromfile(img_path, dtype=np.uint8)
        except OSError:
            # Missing or unreadable file: fall through to the message below
            # instead of aborting the whole loop.
            stream = None
        # cv2.imdecode rejects an empty buffer, so a zero-byte file counts as unreadable.
        if stream is not None and stream.size > 0:
            img = cv2.imdecode(stream, cv2.IMREAD_COLOR)
    if img is not None:
        # OpenCV decodes to BGR; matplotlib expects RGB.
        plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        plt.title(f"Image: {row['image_name']}, exposure: {row['exposure']:.2f}")
        plt.axis('off')
        plt.show()
    else:
        print(f"Could not read image at {img_path}")

## 1. Pseudo Labeling

## 2. Image Quality Assessment

## 3. Image Embedding

## 4. Image Clustering

## 5. Checking Manually