# Package Import

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
from torch.utils.data import DataLoader,SubsetRandomSampler
import torchvision
from torchvision.datasets import ImageFolder
from torchvision import datasets, transforms
from sklearn.model_selection import train_test_split
import torch.optim as optim
import torch.nn.functional as F
import os
from PIL import Image
import albumentations as A
import albumentations.pytorch as A1
import pandas as pd
import time
from tqdm.auto import tqdm
from sklearn.model_selection import KFold
# torch.manual_seed(0)

In [6]:
import random


def seed_everything(seed: int = 42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), and configures cuDNN for deterministic behaviour.

    Args:
        seed: Value used for every generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only child processes see this; the hash seed of the running interpreter
    # is fixed at start-up and cannot be changed from here.
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # covers multi-GPU setups; ignored without CUDA
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels between
    # runs, which undoes the determinism requested above.
    torch.backends.cudnn.benchmark = False


seed_everything()

In [7]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


# Dataset 준비하기

In [8]:
# Read the training and test metadata tables from their CSV files.
train_df = pd.read_csv(filepath_or_buffer='./data.csv')
test_df = pd.read_csv(filepath_or_buffer='./testdata.csv')

FileNotFoundError: [Errno 2] No such file or directory: './data.csv'

In [None]:
# Show the image name, label and age of the first training row only.
for _, first_row in train_df.head(1).iterrows():
    print(f'Image : {first_row["Image"]}')
    print(f'Label : {first_row["Label"]}')
    print(f'Age   : {first_row["Age"]}')

In [None]:
# Images live under ./dataset/<label>/<file name>; load each one as RGB.
path = './dataset/{}/{}'
train_image = [
    Image.open(path.format(row['Label'], row['Image'])).convert('RGB')
    for _, row in train_df.iterrows()
]

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """In-memory image dataset built from a metadata DataFrame.

    Every image is opened and converted to RGB once, in `__init__`.  The
    optional transform is applied on each `__getitem__` call rather than once
    at load time, so random augmentations produce a fresh result on every
    access instead of being frozen for the whole run.

    Args:
        dataframe: One row per image.  Mode 'train' reads the columns
            'Image', 'Label', 'Age', 'Gender' and 'Race'; mode 'test' reads
            'Image', 'Gender' and 'Race'.
        train: 'train' loads from './dataset/<Label>/<Image>'; 'test' loads
            from './testset/<Image>'.
        transform: Optional callable invoked as `transform(image=array)` that
            returns a mapping with an 'image' entry (the albumentations
            calling convention).

    Raises:
        ValueError: If `train` is neither 'train' nor 'test'.
    """

    def __init__(self, dataframe, train='train', transform=None):
        if train not in ('train', 'test'):
            raise ValueError(
                "train must be 'train' or 'test', got {!r}".format(train))

        self.transform = transform
        self.image_list = []
        # Class label in 'train' mode, image file name in 'test' mode.
        self.label_list = []
        # (age, gender, race) in 'train' mode, (gender, race) in 'test' mode.
        self.other_list = []

        for _, row in dataframe.iterrows():
            image_name = row['Image']
            if train == 'train':
                label = row['Label']
                image_file = './dataset/{}/{}'.format(label, image_name)
                extras = (row['Age'], row['Gender'], row['Race'])
            else:
                label = image_name
                image_file = './testset/{}'.format(image_name)
                extras = (row['Gender'], row['Race'])

            # convert() returns an independent image, so the source file can
            # be closed as soon as the conversion is done.
            with Image.open(image_file) as handle:
                image = handle.convert('RGB')

            self.image_list.append(image)
            self.label_list.append(label)
            self.other_list.append(extras)

    def __len__(self):
        """Return the number of loaded images."""
        return len(self.image_list)

    def __getitem__(self, idx):
        """Return `(image, label_or_file_name, extras)` for position `idx`."""
        idx = int(idx)
        image = self.image_list[idx]
        if self.transform is not None:
            # The transform is called with a NumPy array, not a PIL image.
            image = self.transform(image=np.array(image))['image']
        return image, self.label_list[idx], self.other_list[idx]

NameError: name 'test_df' is not defined

# Dataset에 대한 Data Loaders 구성

In [None]:
# Batch size shared by every DataLoader in this notebook.
batch_size = 16
# The test set is read in its stored order (no shuffling).
# NOTE(review): `test_dataset` is expected to exist already; it is not created
# in the cells visible here - confirm it is defined before this cell runs.
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=batch_size)


# Model 설정

In [None]:

import torch.nn.functional as F

class VGG16(nn.Module):
    """VGG-style CNN that merges image features with two extra input values.

    Nine 3x3 convolutions (each followed by BatchNorm2d and ReLU), three 2x2
    max-pools and a final 5x5 max-pool produce a feature map that is
    flattened, concatenated with `x_meta`, and passed through four linear
    layers ending in 5 logits.  `fc1` expects 25600 + 2 inputs, i.e. a
    1024 x 5 x 5 feature map plus 2 extra values; a 3 x 224 x 224 image
    yields exactly that feature map.
    """
    def __init__(self):
        super().__init__()
        # Stage 1: 3 -> 64 channels.
        self.conv1 = nn.Conv2d(3, 64, 3, padding=1)  
        self.conv1_bn = nn.BatchNorm2d(64)
        
        
        self.conv2 = nn.Conv2d(64, 64, 3, padding=1)
        self.conv2_bn = nn.BatchNorm2d(64)
        
        # Stage 2: 64 -> 128 channels.
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
        self.conv3_bn = nn.BatchNorm2d(128)

        self.conv4 = nn.Conv2d(128, 128, 3, padding=1)
        self.conv4_bn = nn.BatchNorm2d(128)
        
        # Stage 3: 128 -> 1024 channels.
        self.conv5 = nn.Conv2d(128, 256, 3, padding=1)
        self.conv5_bn = nn.BatchNorm2d(256)

        self.conv6 = nn.Conv2d(256, 256, 3, padding=1)
        self.conv6_bn = nn.BatchNorm2d(256)
        
        self.conv7 = nn.Conv2d(256, 512, 3, padding=1)
        self.conv7_bn = nn.BatchNorm2d(512)
        
        self.conv8 = nn.Conv2d(512, 1024, 3, padding=1)
        self.conv8_bn = nn.BatchNorm2d(1024)
        
        self.conv9 = nn.Conv2d(1024, 1024, 3, padding=1)
        self.conv9_bn = nn.BatchNorm2d(1024)
        
        # Fully connected head.
        # 25600 = 1024 * 5 * 5 flattened image features; +2 for the x_meta columns.

        self.fc1 = nn.Linear(25600+2,12800)
        self.fc1_bn = nn.BatchNorm1d(12800)
        
        self.fc2 = nn.Linear(12800, 4096)
        self.fc2_bn = nn.BatchNorm1d(4096)
    
        self.fc3 = nn.Linear(4096, 1024)
        self.fc3_bn = nn.BatchNorm1d(1024)
        
        # Output layer: 5 logits (there is no fc4).
        self.fc5 = nn.Linear(1024, 5)
        
        # Parameter-free layers shared across the forward pass.
        self.pool = nn.MaxPool2d(2, 2)
        self.pool5 = nn.MaxPool2d(5, 5)
        self.dropout = nn.Dropout(0.5)
        self.relu = nn.ReLU(True)
        
        
    def forward(self, x,x_meta=None):    
        """Return the 5 class logits for a batch of images plus extra features.

        Args:
            x: Image batch; the shape comments below assume 3 x 224 x 224
                per sample.
            x_meta: Tensor concatenated to the flattened image features along
                dim 1; it must supply 2 columns for `fc1` to accept the
                result.  Despite the None default it is required: it is
                passed to `torch.cat` unconditionally.

        Returns:
            The output of `fc5`: one row of 5 logits per sample.
        """
       
    
        # 3 x 224 x 224
        x = self.relu(self.conv1_bn(self.conv1(x)))
        
        # 64 x 224 x 224
        x = self.pool(self.relu(self.conv2_bn(self.conv2(x))))

        # 64 x 112 x 112
        x = self.relu(self.conv3_bn(self.conv3(x)))

        # 128 x 112 x 112
        x = self.pool(self.relu(self.conv4_bn(self.conv4(x))))
        
        # 128 x 56 x 56
        x = self.relu(self.conv5_bn(self.conv5(x)))
        
        # 256 x 56 x 56
        
        x = self.relu(self.conv6_bn(self.conv6(x)))

        # 256 x 56 x 56
        x = self.relu(self.conv7_bn(self.conv7(x)))
        
        # 512 x 56 x 56
        x = self.pool(self.relu(self.conv8_bn(self.conv8(x))))

        # 1024 x 28 x 28
        x = self.relu(self.conv9_bn(self.conv9(x)))
        
        # 1024 x 28 x 28
        x = self.pool5(x)

        # 1024 x 5 x 5 (the 5x5 pool floors 28 / 5 to 5), flattened to 25600
        x = x.view(x.shape[0], -1)

        x = self.dropout(x)
        x = torch.cat((x,x_meta),dim=1) # join the CNN features with the metadata before the linear layers
        x = self.relu(self.fc1_bn(self.fc1(x)))

        x = self.dropout(x)
        x = self.relu(self.fc2_bn(self.fc2(x)))

        x = self.relu(self.fc3_bn(self.fc3(x)))
        
        
        # Raw logits; no softmax is applied here.
        x = self.fc5(x)
        
        return x

In [None]:
# Build the network and move its parameters to the selected device.
# The trailing expression also makes the cell display the module summary.
model = VGG16()
model.to(device)

# Train and Validation

In [None]:
from sklearn.model_selection import StratifiedKFold
from torch.utils.data.sampler import SubsetRandomSampler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

k = 5
n_epochs = 60
classes = ['1~10', '11~20', '21~30', '31~40', '41~']
classes_cm = [0, 1, 2, 3, 4]


def _prepare_batch(images, targets, meta):
    """Move one batch to `device` and build the two-column tensor fed to the model.

    `meta` is the collated (age, gender, race) triple produced by a
    CustomDataset built with train='train'; only gender and race are used.
    """
    gender = meta[1].to(device).float().reshape(-1, 1)
    race = meta[2].to(device).float().reshape(-1, 1)
    return images.to(device), targets.to(device), torch.cat((gender, race), dim=1)


def _run_epoch(net, loader, criterion, optimizer=None):
    """Run one pass over `loader` and return (mean loss, accuracy).

    The network is trained when `optimizer` is given and only evaluated
    otherwise.  Both metrics are averaged per sample, so a shorter final
    batch does not carry extra weight.
    """
    is_training = optimizer is not None
    net.train(is_training)
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    with torch.set_grad_enabled(is_training):
        for images, targets, meta in tqdm(loader):
            images, targets, meta_features = _prepare_batch(images, targets, meta)
            if is_training:
                optimizer.zero_grad()
            logits = net(images, meta_features)
            loss = criterion(logits, targets)
            if is_training:
                loss.backward()
                optimizer.step()

            n_batch = targets.size(0)
            # criterion returns the batch mean; weight it by the batch size.
            loss_sum += loss.item() * n_batch
            # The arg-max of the logits equals the arg-max of their softmax.
            n_correct += (logits.argmax(dim=1) == targets).sum().item()
            n_seen += n_batch
    n_seen = max(n_seen, 1)  # guard against an empty loader
    return loss_sum / n_seen, n_correct / n_seen


skf = StratifiedKFold(n_splits=k, shuffle=True)
# Stratify on the class label so every fold keeps the class balance.
fold_labels = np.array(train_df['Label'])

for fold, (train_index, valid_index) in enumerate(skf.split(train_df, fold_labels)):

    print(f'Fold {fold+1}/{k}')

    # Lowest validation loss seen so far in this fold.
    valid_loss_min = np.inf

    # skf.split yields positional indices, so select with iloc.
    train_fold_df = train_df.iloc[train_index].reset_index(drop=True)
    valid_fold_df = train_df.iloc[valid_index].reset_index(drop=True)

    # Train and validation rows get different preprocessing pipelines.
    train_dataset = CustomDataset(train_fold_df, train='train', transform=train_transform)
    valid_dataset = CustomDataset(valid_fold_df, train='train', transform=test_transform)

    # Per-epoch loss and accuracy histories.
    train_loss = torch.zeros(n_epochs)
    valid_loss = torch.zeros(n_epochs)
    train_acc = torch.zeros(n_epochs)
    valid_acc = torch.zeros(n_epochs)

    # Every fold starts from a freshly initialised network.
    model = VGG16()
    model.to(device)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    # Validation metrics do not depend on sample order, so no shuffling.
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

    # Cross-entropy for the 5-way classification task.  SGD with momentum was
    # chosen over the faster-converging Adam in the expectation that its
    # lighter, finer-grained updates would help.  The learning rate follows a
    # cosine curve from 0.01 down to eta_min over the whole run.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=0.01)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=n_epochs, eta_min=0.0001)

    for e in range(n_epochs):
        epoch_train_loss, epoch_train_acc = _run_epoch(model, train_loader, criterion, optimizer)
        epoch_valid_loss, epoch_valid_acc = _run_epoch(model, valid_loader, criterion)

        train_loss[e], train_acc[e] = epoch_train_loss, epoch_train_acc
        valid_loss[e], valid_acc[e] = epoch_valid_loss, epoch_valid_acc

        print("lr: ", optimizer.param_groups[0]['lr'])
        scheduler.step()
        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
            e, epoch_train_loss, epoch_valid_loss))

        print('Epoch: {} \tTraining accuracy: {:.6f} \tValidation accuracy: {:.6f}'.format(
            e, epoch_train_acc, epoch_valid_acc))

        # Keep the checkpoint with the lowest validation loss.
        if epoch_valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
                valid_loss_min,
                epoch_valid_loss))
            torch.save(model, f'model_{fold+1}.pt')
            valid_loss_min = epoch_valid_loss

    # Reload this fold's best checkpoint and evaluate it on the validation split.
    # NOTE(review): the whole module is pickled; recent PyTorch releases may
    # need weights_only=False here - confirm against the installed version.
    model = torch.load(f'./model_{fold+1}.pt')
    y_pred = []
    y_true = []
    n_correct = 0
    with torch.no_grad():
        model.eval()
        for images, targets, meta in valid_loader:
            images, targets, meta_features = _prepare_batch(images, targets, meta)
            top_class = model(images, meta_features).argmax(dim=1)
            y_pred.extend(top_class.cpu().numpy())
            y_true.extend(targets.cpu().numpy())
            n_correct += (top_class == targets).sum().item()

    # Accuracy of the best checkpoint on the validation split, in percent.
    test_acc = 100 * n_correct / max(len(valid_loader.dataset), 1)

    cm = confusion_matrix(y_true, y_pred, labels=classes_cm, normalize='true')
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
    disp.plot()
    plt.show()
    print('Test accuracy : {}'.format(test_acc))

    # Loss curves for this fold.
    plt.plot(range(n_epochs), train_loss, label='train_loss')
    plt.plot(range(n_epochs), valid_loss, label='valid_loss')
    plt.xlabel('n_epochs')
    plt.ylabel('loss')
    plt.legend(loc='upper right')
    plt.show()

### 앙상블을 위한 fold별 모델 불러오기

In [None]:
# Load the best checkpoint of each of the 5 folds for the ensemble.
# torch.load returns the complete pickled module, so no VGG16() instance has
# to be built first; map_location lets the checkpoints load on CPU-only
# machines as well.
model1, model2, model3, model4, model5 = (
    torch.load(f'./model_{fold_no}.pt', map_location=device)
    for fold_no in range(1, 6)
)

# 결과 CSV 생성

In [None]:
import numpy as np
import pandas as pd

model_list = [model1, model2, model3, model4, model5]  # one model per fold
predictions = []  # one list of per-sample class probabilities per model
id_list = []  # test file names, in loader order

with torch.no_grad():
    for model in model_list:
        model.to(device)
        model.eval()
        model_predictions = []
        # Rebuilt for every model so the names are not repeated once per model.
        id_list = []
        for data, file_name, other in test_loader:
            data = data.to(device)
            # Test-set extras are (gender, race).
            gender = other[0].to(device).float().reshape(-1, 1)
            race = other[1].to(device).float().reshape(-1, 1)
            logits = model(data, torch.cat((gender, race), dim=1))
            model_predictions.extend(F.softmax(logits, dim=1).tolist())
            id_list.extend(file_name)
        predictions.append(model_predictions)

# Soft voting: average the class probabilities over the models, then take the
# most probable class for every sample.
final_predictions = [
    np.argmax(np.mean([pred[i] for pred in predictions], axis=0))
    for i in range(len(predictions[0]))
]

handout_result = pd.DataFrame({'Id': id_list, 'Category': final_predictions})
handout_result.to_csv('./result.csv', index=False)

In [None]:
# Display the submission table that was just written to ./result.csv.
handout_result

In [5]:
def print_index_grid(n_rows=10, n_cols=10):
    """Print every (row, col) index pair, one output line per row.

    The loop lives in a function so its counters stay local and cannot
    overwrite notebook globals such as `k`, the number of folds.

    Args:
        n_rows: Number of output lines.
        n_cols: Number of pairs printed on each line.
    """
    for row in range(n_rows):
        for col in range(n_cols):
            # Pairs are written back to back, with no separator between them.
            print(row, col, end='')
        print()


print_index_grid()

0 00 10 20 30 40 50 60 70 80 9
1 01 11 21 31 41 51 61 71 81 9
2 02 12 22 32 42 52 62 72 82 9
3 03 13 23 33 43 53 63 73 83 9
4 04 14 24 34 44 54 64 74 84 9
5 05 15 25 35 45 55 65 75 85 9
6 06 16 26 36 46 56 66 76 86 9
7 07 17 27 37 47 57 67 77 87 9
8 08 18 28 38 48 58 68 78 88 9
9 09 19 29 39 49 59 69 79 89 9


### final_predictions

# 참고 자료