In [1]:
from google.colab import files
files.upload()  # kaggle.json 파일 업로드

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"tg0120kim","key":"069f32af449c35caea8215527f3a75b8"}'}

In [2]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
!pip install kaggle



In [None]:
import os
os.environ["KAGGLE_USERNAME"]="tg0120kim"
os.environ["KAGGLE_KEY"]="069f32af449c35caea8215527f3a75b8"

In [4]:
!kaggle competitions download -c osai-project

Downloading osai-project.zip to /content
 78% 60.0M/76.7M [00:01<00:00, 48.3MB/s]
100% 76.7M/76.7M [00:01<00:00, 61.9MB/s]


In [5]:
!unzip -o osai-project.zip

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
  inflating: train/39280.png         
  inflating: train/39281.png         
  inflating: train/39285.png         
  inflating: train/39287.png         
  inflating: train/39401.png         
  inflating: train/39416.png         
  inflating: train/39420.png         
  inflating: train/39421.png         
  inflating: train/39427.png         
  inflating: train/39452.png         
  inflating: train/39465.png         
  inflating: train/39470.png         
  inflating: train/39472.png         
  inflating: train/39475.png         
  inflating: train/39482.png         
  inflating: train/39504.png         
  inflating: train/39524.png         
  inflating: train/39526.png         
  inflating: train/39540.png         
  inflating: train/39541.png         
  inflating: train/39542.png         
  inflating: train/39547.png         
  inflating: train/39560.png         
  inflating: train/39562.png         
  inflating: train/39576.png         


In [6]:
import pandas as pd

# 데이터 불러오기
train = pd.read_csv('./train/text_label.csv')
test = pd.read_csv('./test/text_label.csv')
val = pd.read_csv('./val/text_label.csv')
sol = pd.read_csv('./solution_sample.csv')

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import Compose, Resize, Normalize, ToTensor, RandomHorizontalFlip, RandomRotation, ColorJitter
from torchvision.models import efficientnet_b3
from transformers import BertTokenizer, BertModel
from PIL import Image
from tqdm import tqdm
import pandas as pd
import numpy as np
import random

# MixUp 함수 정의
def mixup(data, targets, alpha=1.0):
    indices = torch.randperm(data.size(0))
    shuffled_data = data[indices]
    shuffled_targets = targets[indices]

    lam = np.random.beta(alpha, alpha)
    mixed_data = lam * data + (1 - lam) * shuffled_data

    return mixed_data, targets, shuffled_targets, lam

# 데이터셋 정의
class KaggleDataset(Dataset):
    def __init__(self, dataframe, tokenizer, transform=None, max_len=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.transform = transform if transform else Compose([
            Resize((224, 224)),
            RandomHorizontalFlip(p=0.5),
            RandomRotation(15),
            ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
            ToTensor(),
            Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        self.max_len = max_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        img_path = row['img']
        text = row['text']
        label = row.get('label', -1)

        # 이미지 처리
        image = Image.open(img_path).convert('RGB')
        image = self.transform(image)

        # 텍스트 처리
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        label_tensor = torch.tensor(label, dtype=torch.float32) if label != -1 else torch.tensor(-1, dtype=torch.float32)
        return {
            'image': image,
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': label_tensor
        }

# 모델 정의
class MultimodalModel(nn.Module):
    def __init__(self, cnn_output_size=512, bert_output_size=768, num_classes=2):
        super(MultimodalModel, self).__init__()
        # EfficientNet 기반 이미지 처리
        self.cnn = efficientnet_b3(pretrained=True)
        self.cnn.classifier[1] = nn.Linear(self.cnn.classifier[1].in_features, cnn_output_size)

        # BERT 기반 텍스트 처리
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.text_fc = nn.Sequential(
            nn.Linear(bert_output_size, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.5)
        )

        # 최종 결합 및 분류
        self.fc = nn.Sequential(
            nn.Linear(cnn_output_size + 512, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, images, input_ids, attention_mask):
        # 이미지 처리
        image_features = self.cnn(images)

        # 텍스트 처리
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_features = self.text_fc(bert_output.last_hidden_state[:, 0, :])

        # 결합 및 최종 출력
        combined_features = torch.cat((image_features, text_features), dim=1)
        output = self.fc(combined_features)
        return output

# 데이터 로드 및 전처리
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
transform = Compose([
    Resize((224, 224)),
    RandomHorizontalFlip(p=0.5),
    RandomRotation(15),
    ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

train_dataset = KaggleDataset(dataframe=train, tokenizer=tokenizer, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=4, pin_memory=True)

val_dataset = KaggleDataset(dataframe=val, tokenizer=tokenizer, transform=transform)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=4, pin_memory=True)

# 모델, 손실 함수, 옵티마이저 정의
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MultimodalModel().to(device)
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

# Lookahead + SAM 옵티마이저 정의
base_optimizer = optim.AdamW(model.parameters(), lr=3e-5, weight_decay=1e-4)
optimizer = SAM(model.parameters(), base_optimizer)

# Scheduler
scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=5, T_mult=1)

# Mixed Precision Training
scaler = torch.cuda.amp.GradScaler()

# 학습 루프
num_epochs = 10
grad_accum_steps = 2
for epoch in range(num_epochs):
    model.train()
    train_loss, train_acc = 0, 0

    for idx, batch in enumerate(tqdm(train_loader, desc=f"Training Epoch {epoch+1}")):
        images = batch['image'].to(device, non_blocking=True)
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        labels = batch['label'].to(device, non_blocking=True).long()

        # MixUp 적용
        if random.random() < 0.5:
            images, labels_a, labels_b, lam = mixup(images, labels)
            with torch.cuda.amp.autocast():
                outputs = model(images, input_ids, attention_mask)
                loss = lam * criterion(outputs, labels_a) + (1 - lam) * criterion(outputs, labels_b)
        else:
            with torch.cuda.amp.autocast():
                outputs = model(images, input_ids, attention_mask)
                loss = criterion(outputs, labels)

        loss = loss / grad_accum_steps
        scaler.scale(loss).backward()

        # Gradient Accumulation
        if (idx + 1) % grad_accum_steps == 0 or (idx + 1) == len(train_loader):
            optimizer.first_step(zero_grad=True)
            scaler.step(optimizer)
            scaler.update()
            optimizer.second_step(zero_grad=True)

        train_loss += loss.item() * grad_accum_steps
        train_acc += (outputs.argmax(dim=1) == labels).float().mean()

    scheduler.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {train_loss/len(train_loader):.4f}, Accuracy: {train_acc/len(train_loader):.4f}")




NameError: name 'SAM' is not defined

In [None]:
!kaggle competitions submit -c osai-project -f submission_optimized.csv -m "Final optimized model submission"

100% 12.3k/12.3k [00:00<00:00, 20.2kB/s]
Successfully submitted to [오픈소스AI] 컴퓨터비전+텍스트분석 캐글 프로젝트

In [None]:
# 0.675

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import Compose, Resize, Normalize, ToTensor, RandomHorizontalFlip, RandomRotation, ColorJitter
from torchvision.models import efficientnet_b3
from transformers import BertTokenizer, BertModel
from PIL import Image
from tqdm import tqdm
import pandas as pd

# 데이터셋 정의
class KaggleDataset(Dataset):
    def __init__(self, dataframe, tokenizer, transform=None, max_len=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.transform = transform if transform else Compose([
            Resize((224, 224)),
            RandomHorizontalFlip(p=0.5),
            RandomRotation(15),
            ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
            ToTensor(),
            Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        self.max_len = max_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        img_path = row['img']
        text = row['text']
        label = row.get('label', -1)  # Test 데이터에는 레이블이 없음

        # 이미지 처리
        image = Image.open(img_path).convert('RGB')
        image = self.transform(image)

        # 텍스트 처리
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        label_tensor = torch.tensor(label, dtype=torch.float32) if label != -1 else torch.tensor(-1, dtype=torch.float32)
        return {
            'image': image,
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': label_tensor
        }

# 모델 정의
class Attention(nn.Module):
    def __init__(self, input_dim):
        super(Attention, self).__init__()
        self.attention = nn.Sequential(
            nn.Linear(input_dim, input_dim // 2),
            nn.Tanh(),
            nn.Linear(input_dim // 2, 1),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        weights = self.attention(x)
        return torch.sum(weights * x, dim=1)

class MultimodalModel(nn.Module):
    def __init__(self, cnn_output_size=512, bert_output_size=768, num_classes=2):
        super(MultimodalModel, self).__init__()
        # EfficientNet 기반 이미지 처리
        self.cnn = efficientnet_b3(pretrained=True)
        self.cnn.classifier[1] = nn.Linear(self.cnn.classifier[1].in_features, cnn_output_size)

        # BERT 기반 텍스트 처리
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.text_fc = nn.Sequential(
            nn.Linear(bert_output_size, 512),
            nn.ReLU(),
            nn.Dropout(0.3)
        )

        # Attention Layer
        self.attention = Attention(cnn_output_size + 512)

        # 최종 분류
        self.fc = nn.Sequential(
            nn.Linear(cnn_output_size + 512, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

    def forward(self, images, input_ids, attention_mask):
        # 이미지 처리
        image_features = self.cnn(images)

        # 텍스트 처리
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_features = self.text_fc(bert_output.last_hidden_state[:, 0, :])

        # 결합
        combined_features = torch.cat((image_features, text_features), dim=1)
        attended_features = self.attention(combined_features.unsqueeze(1))

        # 최종 출력
        output = self.fc(attended_features)
        return output

# 데이터 로드 및 전처리
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
transform = Compose([
    Resize((224, 224)),
    RandomHorizontalFlip(p=0.5),
    RandomRotation(15),
    ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

train_dataset = KaggleDataset(dataframe=train, tokenizer=tokenizer, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=4, pin_memory=True)

val_dataset = KaggleDataset(dataframe=val, tokenizer=tokenizer, transform=transform)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=4, pin_memory=True)

test_dataset = KaggleDataset(dataframe=test, tokenizer=tokenizer, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=4, pin_memory=True)

# 모델, 손실 함수, 옵티마이저 정의
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MultimodalModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)

# Mixed Precision Training
scaler = torch.cuda.amp.GradScaler()

# 학습 루프
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss, train_acc = 0, 0

    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        images = batch['image'].to(device, non_blocking=True)
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        labels = batch['label'].to(device, non_blocking=True).long()

        with torch.cuda.amp.autocast():
            outputs = model(images, input_ids, attention_mask)
            loss = criterion(outputs, labels)

        optimizer.zero_grad()
        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item()
        train_acc += (outputs.argmax(dim=1) == labels).float().mean()

    scheduler.step(train_loss / len(train_loader))
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {train_loss/len(train_loader):.4f}, Accuracy: {train_acc/len(train_loader):.4f}")

# 테스트 데이터 예측
model.eval()
resulting_label = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting Test Data"):
        images = batch['image'].to(device, non_blocking=True)
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)

        outputs = model(images, input_ids, attention_mask)
        predictions = outputs.argmax(dim=1)
        resulting_label.extend(predictions.cpu().numpy())

# 결과 저장
test['label'] = resulting_label
test[['id', 'label']].to_csv('submission_optimized.csv', index=False)

print("Submission file saved as 'submission_optimized.csv'")

# 모델 저장
torch.save(model.state_dict(), "multimodal_model_optimized.pth")
print("Model saved as 'multimodal_model_optimized.pth'")


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
Training Epoch 1: 100%|██████████| 422/422 [02:07<00:00,  3.32it/s]


Epoch 1/10, Loss: 0.6424, Accuracy: 0.6406


Training Epoch 2: 100%|██████████| 422/422 [02:00<00:00,  3.51it/s]


Epoch 2/10, Loss: 0.5397, Accuracy: 0.7448


Training Epoch 3: 100%|██████████| 422/422 [01:59<00:00,  3.53it/s]


Epoch 3/10, Loss: 0.4649, Accuracy: 0.7946


Training Epoch 4: 100%|██████████| 422/422 [01:58<00:00,  3.56it/s]


Epoch 4/10, Loss: 0.4029, Accuracy: 0.8267


Training Epoch 5: 100%|██████████| 422/422 [01:58<00:00,  3.56it/s]


Epoch 5/10, Loss: 0.3527, Accuracy: 0.8480


Training Epoch 6: 100%|██████████| 422/422 [02:00<00:00,  3.50it/s]


Epoch 6/10, Loss: 0.3137, Accuracy: 0.8614


Training Epoch 7: 100%|██████████| 422/422 [01:57<00:00,  3.60it/s]


Epoch 7/10, Loss: 0.2627, Accuracy: 0.8854


Training Epoch 8: 100%|██████████| 422/422 [02:00<00:00,  3.51it/s]


Epoch 8/10, Loss: 0.2355, Accuracy: 0.8945


Training Epoch 9: 100%|██████████| 422/422 [02:01<00:00,  3.48it/s]


Epoch 9/10, Loss: 0.2198, Accuracy: 0.8985


Training Epoch 10: 100%|██████████| 422/422 [01:56<00:00,  3.63it/s]


Epoch 10/10, Loss: 0.2122, Accuracy: 0.8977


Predicting Test Data: 100%|██████████| 100/100 [00:18<00:00,  5.36it/s]


Submission file saved as 'submission_optimized.csv'
Model saved as 'multimodal_model_optimized.pth'


In [8]:
# 0.675

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import Compose, Resize, Normalize, ToTensor, RandomHorizontalFlip, RandomRotation, ColorJitter
from torchvision.models import efficientnet_b3
from transformers import BertTokenizer, BertModel
from PIL import Image
from tqdm import tqdm
import pandas as pd

# 데이터셋 정의
class KaggleDataset(Dataset):
    def __init__(self, dataframe, tokenizer, transform=None, max_len=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.transform = transform if transform else Compose([
            Resize((224, 224)),
            RandomHorizontalFlip(p=0.5),
            RandomRotation(15),
            ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
            ToTensor(),
            Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        self.max_len = max_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        img_path = row['img']
        text = row['text']
        label = row.get('label', -1)  # Test 데이터에는 레이블이 없음

        # 이미지 처리
        image = Image.open(img_path).convert('RGB')
        image = self.transform(image)

        # 텍스트 처리
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        label_tensor = torch.tensor(label, dtype=torch.float32) if label != -1 else torch.tensor(-1, dtype=torch.float32)
        return {
            'image': image,
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': label_tensor
        }

# 모델 정의
class Attention(nn.Module):
    def __init__(self, input_dim):
        super(Attention, self).__init__()
        self.attention = nn.Sequential(
            nn.Linear(input_dim, input_dim // 2),
            nn.Tanh(),
            nn.Linear(input_dim // 2, 1),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        weights = self.attention(x)
        return torch.sum(weights * x, dim=1)

class MultimodalModel(nn.Module):
    def __init__(self, cnn_output_size=512, bert_output_size=768, num_classes=2):
        super(MultimodalModel, self).__init__()
        # EfficientNet 기반 이미지 처리
        self.cnn = efficientnet_b3(pretrained=True)
        self.cnn.classifier[1] = nn.Linear(self.cnn.classifier[1].in_features, cnn_output_size)

        # BERT 기반 텍스트 처리
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.text_fc = nn.Sequential(
            nn.Linear(bert_output_size, 512),
            nn.ReLU(),
            nn.Dropout(0.3)
        )

        # Attention Layer
        self.attention = Attention(cnn_output_size + 512)

        # 최종 분류
        self.fc = nn.Sequential(
            nn.Linear(cnn_output_size + 512, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

    def forward(self, images, input_ids, attention_mask):
        # 이미지 처리
        image_features = self.cnn(images)

        # 텍스트 처리
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_features = self.text_fc(bert_output.last_hidden_state[:, 0, :])

        # 결합
        combined_features = torch.cat((image_features, text_features), dim=1)
        attended_features = self.attention(combined_features.unsqueeze(1))

        # 최종 출력
        output = self.fc(attended_features)
        return output

# 데이터 로드 및 전처리
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
transform = Compose([
    Resize((224, 224)),
    RandomHorizontalFlip(p=0.5),
    RandomRotation(15),
    ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

train_dataset = KaggleDataset(dataframe=train, tokenizer=tokenizer, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=4, pin_memory=True)

val_dataset = KaggleDataset(dataframe=val, tokenizer=tokenizer, transform=transform)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=4, pin_memory=True)

test_dataset = KaggleDataset(dataframe=test, tokenizer=tokenizer, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=4, pin_memory=True)

# 모델, 손실 함수, 옵티마이저 정의
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MultimodalModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)

# Mixed Precision Training
scaler = torch.cuda.amp.GradScaler()

# 학습 루프
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss, train_acc = 0, 0

    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        images = batch['image'].to(device, non_blocking=True)
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        labels = batch['label'].to(device, non_blocking=True).long()

        with torch.cuda.amp.autocast():
            outputs = model(images, input_ids, attention_mask)
            loss = criterion(outputs, labels)

        optimizer.zero_grad()
        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item()
        train_acc += (outputs.argmax(dim=1) == labels).float().mean()

    scheduler.step(train_loss / len(train_loader))
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {train_loss/len(train_loader):.4f}, Accuracy: {train_acc/len(train_loader):.4f}")

# 테스트 데이터 예측
model.eval()
resulting_label = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting Test Data"):
        images = batch['image'].to(device, non_blocking=True)
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)

        outputs = model(images, input_ids, attention_mask)
        predictions = outputs.argmax(dim=1)
        resulting_label.extend(predictions.cpu().numpy())

# 결과 저장
test['label'] = resulting_label
test[['id', 'label']].to_csv('submission_optimized.csv', index=False)

print("Submission file saved as 'submission_optimized.csv'")

# 모델 저장
torch.save(model.state_dict(), "multimodal_model_optimized.pth")
print("Model saved as 'multimodal_model_optimized.pth'")


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
Training Epoch 1: 100%|██████████| 422/422 [02:00<00:00,  3.51it/s]


Epoch 1/10, Loss: 0.6311, Accuracy: 0.6611


Training Epoch 2: 100%|██████████| 422/422 [01:59<00:00,  3.53it/s]


Epoch 2/10, Loss: 0.5275, Accuracy: 0.7530


Training Epoch 3: 100%|██████████| 422/422 [01:59<00:00,  3.52it/s]


Epoch 3/10, Loss: 0.4654, Accuracy: 0.7940


Training Epoch 4: 100%|██████████| 422/422 [02:02<00:00,  3.46it/s]


Epoch 4/10, Loss: 0.4321, Accuracy: 0.8109


Training Epoch 5: 100%|██████████| 422/422 [02:00<00:00,  3.50it/s]


Epoch 5/10, Loss: 0.3716, Accuracy: 0.8427


Training Epoch 6: 100%|██████████| 422/422 [01:56<00:00,  3.62it/s]


Epoch 6/10, Loss: 0.3290, Accuracy: 0.8649


Training Epoch 7: 100%|██████████| 422/422 [01:59<00:00,  3.52it/s]


Epoch 7/10, Loss: 0.2938, Accuracy: 0.8769


Training Epoch 8: 100%|██████████| 422/422 [02:01<00:00,  3.47it/s]


Epoch 8/10, Loss: 0.2653, Accuracy: 0.8901


Training Epoch 9: 100%|██████████| 422/422 [01:59<00:00,  3.54it/s]


Epoch 9/10, Loss: 0.2202, Accuracy: 0.9073


Training Epoch 10: 100%|██████████| 422/422 [02:00<00:00,  3.49it/s]


Epoch 10/10, Loss: 0.2389, Accuracy: 0.8975


Predicting Test Data: 100%|██████████| 100/100 [00:19<00:00,  5.01it/s]


Submission file saved as 'submission_optimized.csv'
Model saved as 'multimodal_model_optimized.pth'


In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import Compose, Resize, Normalize, ToTensor
from torchvision.models import efficientnet_b3
from transformers import BertTokenizer, BertModel
from PIL import Image
from tqdm import tqdm
import pandas as pd

# 데이터셋 정의
class KaggleDataset(Dataset):
    def __init__(self, dataframe, tokenizer, transform=None, max_len=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.transform = transform if transform else Compose([
            Resize((224, 224)),
            ToTensor(),
            Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        self.max_len = max_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        img_path = row['img']
        text = row['text']
        label = row.get('label', -1)

        # 이미지 처리
        image = Image.open(img_path).convert('RGB')
        image = self.transform(image)

        # 텍스트 처리
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        label_tensor = torch.tensor(label, dtype=torch.long) if label != -1 else torch.tensor(-1, dtype=torch.long)
        return {
            'image': image,
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': label_tensor
        }

# Fine-Tuning 모델 정의
class MultimodalModel(nn.Module):
    def __init__(self, cnn_output_size=512, bert_output_size=768, num_classes=2):
        super(MultimodalModel, self).__init__()
        self.cnn = efficientnet_b3(pretrained=True)
        self.cnn.classifier[1] = nn.Linear(self.cnn.classifier[1].in_features, cnn_output_size)
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.text_fc = nn.Sequential(
            nn.Linear(bert_output_size, 512),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        self.fc = nn.Sequential(
            nn.Linear(cnn_output_size + 512, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

    def forward(self, images, input_ids, attention_mask):
        image_features = self.cnn(images)
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_features = self.text_fc(bert_output.last_hidden_state[:, 0, :])
        combined_features = torch.cat((image_features, text_features), dim=1)
        output = self.fc(combined_features)
        return output

# 로드 및 데이터 준비
pretrained_model_path = "multimodal_model_optimized.pth"
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Fine-Tuning 데이터 준비
train_dataset = KaggleDataset(dataframe=train, tokenizer=tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=4, pin_memory=True)

test_dataset = KaggleDataset(dataframe=test, tokenizer=tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=4, pin_memory=True)

# 모델 준비
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MultimodalModel().to(device)

# 가중치 로드 시 strict=False 설정
pretrained_dict = torch.load(pretrained_model_path)
model_dict = model.state_dict()
filtered_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
model_dict.update(filtered_dict)
model.load_state_dict(model_dict)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=1e-4)

# Fine-Tuning 루프
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    train_loss, train_acc = 0, 0

    for batch in tqdm(train_loader, desc=f"Fine-Tuning Epoch {epoch+1}"):
        images = batch['image'].to(device, non_blocking=True)
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        labels = batch['label'].to(device, non_blocking=True)

        optimizer.zero_grad()
        outputs = model(images, input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_acc += (outputs.argmax(dim=1) == labels).float().mean()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {train_loss/len(train_loader):.4f}, Accuracy: {train_acc/len(train_loader):.4f}")

# 테스트 데이터 예측
model.eval()
resulting_label = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting Test Data"):
        images = batch['image'].to(device, non_blocking=True)
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)

        outputs = model(images, input_ids, attention_mask)
        predictions = outputs.argmax(dim=1)
        resulting_label.extend(predictions.cpu().numpy())

# 결과 저장
test['label'] = resulting_label
test[['id', 'label']].to_csv('submission_fine_tuned.csv', index=False)

print("Submission file saved as 'submission_fine_tuned.csv'")

# 모델 저장
torch.save(model.state_dict(), "multimodal_model_fine_tuned.pth")
print("Model saved as 'multimodal_model_fine_tuned.pth'")


  pretrained_dict = torch.load(pretrained_model_path)
Fine-Tuning Epoch 1: 100%|██████████| 422/422 [03:45<00:00,  1.87it/s]


Epoch 1/5, Loss: 0.2302, Accuracy: 0.8990


Fine-Tuning Epoch 2:  73%|███████▎  | 308/422 [02:44<01:00,  1.88it/s]Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7d4a6195b1c0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1604, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1535, in _shutdown_workers
    if not self._shutdown:
AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_shutdown'
Fine-Tuning Epoch 2: 100%|██████████| 422/422 [03:45<00:00,  1.87it/s]


Epoch 2/5, Loss: 0.1963, Accuracy: 0.9169


Fine-Tuning Epoch 3: 100%|██████████| 422/422 [03:43<00:00,  1.89it/s]


Epoch 3/5, Loss: 0.1525, Accuracy: 0.9374


Fine-Tuning Epoch 4: 100%|██████████| 422/422 [03:44<00:00,  1.88it/s]


Epoch 4/5, Loss: 0.1215, Accuracy: 0.9526


Fine-Tuning Epoch 5: 100%|██████████| 422/422 [03:44<00:00,  1.88it/s]


Epoch 5/5, Loss: 0.0965, Accuracy: 0.9650


Predicting Test Data: 100%|██████████| 100/100 [00:17<00:00,  5.86it/s]


Submission file saved as 'submission_fine_tuned.csv'
Model saved as 'multimodal_model_fine_tuned.pth'


In [11]:
!kaggle competitions submit -c osai-project -f submission_fine_tuned.csv -m "Final optimized model submission"

100% 12.3k/12.3k [00:00<00:00, 19.8kB/s]
Successfully submitted to [오픈소스AI] 컴퓨터비전+텍스트분석 캐글 프로젝트

In [None]:
## 0.637
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import Compose, Resize, Normalize, ToTensor, RandomHorizontalFlip, RandomRotation, ColorJitter
from torchvision.models import efficientnet_b0
from transformers import DistilBertTokenizer, DistilBertModel
from PIL import Image
from tqdm import tqdm
import pandas as pd

# 데이터셋 정의
class KaggleDataset(Dataset):
    def __init__(self, dataframe, tokenizer, transform=None, max_len=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.transform = transform if transform else Compose([
            Resize((224, 224)),
            RandomHorizontalFlip(p=0.5),
            RandomRotation(15),
            ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
            ToTensor(),
            Normalize(mean=[0.5], std=[0.5])
        ])
        self.max_len = max_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        img_path = row['img']
        text = row['text']
        label = row.get('label', -1)

        # 이미지 처리
        image = Image.open(img_path).convert('RGB')
        image = self.transform(image)

        # 텍스트 처리
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        label_tensor = torch.tensor(label, dtype=torch.float32) if label != -1 else torch.tensor(-1, dtype=torch.float32)
        return {
            'image': image,
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': label_tensor
        }

# 모델 정의
class MultimodalModel(nn.Module):
    def __init__(self, cnn_output_size=512, bert_output_size=768, num_classes=2):
        super(MultimodalModel, self).__init__()
        # EfficientNet 기반 이미지 처리
        self.cnn = efficientnet_b0(weights="IMAGENET1K_V1")
        self.cnn.classifier[1] = nn.Linear(self.cnn.classifier[1].in_features, cnn_output_size)

        # BERT 기반 텍스트 처리
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.text_fc = nn.Sequential(
            nn.Linear(bert_output_size, 512),
            nn.ReLU(),
            nn.Dropout(0.3)
        )

        # 최종 결합 및 분류
        self.fc = nn.Sequential(
            nn.Linear(cnn_output_size + 512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )

    def forward(self, images, input_ids, attention_mask):
        # 이미지 처리
        image_features = self.cnn(images)

        # 텍스트 처리
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_features = self.text_fc(bert_output.last_hidden_state[:, 0, :])

        # 결합 및 최종 출력
        combined_features = torch.cat((image_features, text_features), dim=1)
        output = self.fc(combined_features)
        return output

# 데이터 로드 및 전처리
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
transform = Compose([
    Resize((224, 224)),
    RandomHorizontalFlip(p=0.5),
    RandomRotation(15),
    ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    ToTensor(),
    Normalize(mean=[0.5], std=[0.5])
])

train_dataset = KaggleDataset(dataframe=train, tokenizer=tokenizer, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4, pin_memory=True)

val_dataset = KaggleDataset(dataframe=val, tokenizer=tokenizer, transform=transform)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4, pin_memory=True)

test_dataset = KaggleDataset(dataframe=test, tokenizer=tokenizer, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4, pin_memory=True)

# 모델, 손실 함수, 옵티마이저 정의
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MultimodalModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=3e-5, weight_decay=1e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

# Mixed Precision Training
scaler = torch.cuda.amp.GradScaler()

# 학습 루프
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    train_loss, train_acc = 0, 0

    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        images = batch['image'].to(device, non_blocking=True)
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        labels = batch['label'].to(device, non_blocking=True).long()

        with torch.cuda.amp.autocast():
            outputs = model(images, input_ids, attention_mask)
            loss = criterion(outputs, labels)

        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item()
        train_acc += (outputs.argmax(dim=1) == labels).float().mean()

    scheduler.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {train_loss/len(train_loader):.4f}, Accuracy: {train_acc/len(train_loader):.4f}")

# 테스트 데이터 예측
model.eval()
resulting_label = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting Test Data"):
        images = batch['image'].to(device, non_blocking=True)
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)

        outputs = model(images, input_ids, attention_mask)
        predictions = outputs.argmax(dim=1)
        resulting_label.extend(predictions.cpu().numpy())

# 결과 저장
test['label'] = resulting_label  # 테스트 데이터프레임에 예측 결과 추가
test[['id', 'label']].to_csv('submission_tutorial.csv', index=False)  # 'id'와 'label' 컬럼만 저장

print("Submission file saved as 'submission_tutorial.csv'")

# 모델 저장
torch.save(model.state_dict(), "multimodal_model_optimized_v2.pth")
print("Model saved as 'multimodal_model_optimized_v2.pth'")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth
100%|██████████| 20.5M/20.5M [00:00<00:00, 90.7MB/s]


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
Training Epoch 1: 100%|██████████| 211/211 [01:09<00:00,  3.06it/s]


Epoch 1/10, Loss: 0.5788, Accuracy: 0.7049


Training Epoch 2: 100%|██████████| 211/211 [01:10<00:00,  3.00it/s]


Epoch 2/10, Loss: 0.4641, Accuracy: 0.7941


Training Epoch 3: 100%|██████████| 211/211 [01:14<00:00,  2.85it/s]


Epoch 3/10, Loss: 0.3689, Accuracy: 0.8382


Training Epoch 4: 100%|██████████| 211/211 [01:08<00:00,  3.09it/s]


Epoch 4/10, Loss: 0.2811, Accuracy: 0.8756


Training Epoch 5: 100%|██████████| 211/211 [01:07<00:00,  3.13it/s]


Epoch 5/10, Loss: 0.2106, Accuracy: 0.9062


Training Epoch 6: 100%|██████████| 211/211 [01:15<00:00,  2.79it/s]


Epoch 6/10, Loss: 0.1703, Accuracy: 0.9257


Training Epoch 7: 100%|██████████| 211/211 [01:09<00:00,  3.06it/s]


Epoch 7/10, Loss: 0.1390, Accuracy: 0.9375


Training Epoch 8: 100%|██████████| 211/211 [01:11<00:00,  2.97it/s]


Epoch 8/10, Loss: 0.1205, Accuracy: 0.9463


Training Epoch 9: 100%|██████████| 211/211 [01:14<00:00,  2.82it/s]


Epoch 9/10, Loss: 0.1092, Accuracy: 0.9504


Training Epoch 10: 100%|██████████| 211/211 [01:10<00:00,  2.98it/s]


Epoch 10/10, Loss: 0.1011, Accuracy: 0.9575


Predicting Test Data: 100%|██████████| 50/50 [00:15<00:00,  3.24it/s]


Submission file saved as 'submission_tutorial.csv'
Model saved as 'multimodal_model_optimized_v2.pth'


In [None]:
## 0.637
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import Compose, Resize, Normalize, ToTensor, RandomHorizontalFlip, RandomRotation, ColorJitter
from torchvision.models import efficientnet_b0
from transformers import DistilBertTokenizer, DistilBertModel
from PIL import Image
from tqdm import tqdm
import pandas as pd

# 데이터셋 정의
class KaggleDataset(Dataset):
    def __init__(self, dataframe, tokenizer, transform=None, max_len=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.transform = transform if transform else Compose([
            Resize((224, 224)),
            RandomHorizontalFlip(p=0.5),
            RandomRotation(15),
            ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
            ToTensor(),
            Normalize(mean=[0.5], std=[0.5])
        ])
        self.max_len = max_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        img_path = row['img']
        text = row['text']
        label = row.get('label', -1)

        # 이미지 처리
        image = Image.open(img_path).convert('RGB')
        image = self.transform(image)

        # 텍스트 처리
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        label_tensor = torch.tensor(label, dtype=torch.float32) if label != -1 else torch.tensor(-1, dtype=torch.float32)
        return {
            'image': image,
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': label_tensor
        }

# 모델 정의
class MultimodalModel(nn.Module):
    def __init__(self, cnn_output_size=512, bert_output_size=768, num_classes=2):
        super(MultimodalModel, self).__init__()
        # EfficientNet 기반 이미지 처리
        self.cnn = efficientnet_b0(weights="IMAGENET1K_V1")
        self.cnn.classifier[1] = nn.Linear(self.cnn.classifier[1].in_features, cnn_output_size)

        # BERT 기반 텍스트 처리
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.text_fc = nn.Sequential(
            nn.Linear(bert_output_size, 512),
            nn.ReLU(),
            nn.Dropout(0.3)
        )

        # 최종 결합 및 분류
        self.fc = nn.Sequential(
            nn.Linear(cnn_output_size + 512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )

    def forward(self, images, input_ids, attention_mask):
        # 이미지 처리
        image_features = self.cnn(images)

        # 텍스트 처리
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_features = self.text_fc(bert_output.last_hidden_state[:, 0, :])

        # 결합 및 최종 출력
        combined_features = torch.cat((image_features, text_features), dim=1)
        output = self.fc(combined_features)
        return output

# 데이터 로드 및 전처리
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
transform = Compose([
    Resize((224, 224)),
    RandomHorizontalFlip(p=0.5),
    RandomRotation(15),
    ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    ToTensor(),
    Normalize(mean=[0.5], std=[0.5])
])

train_dataset = KaggleDataset(dataframe=train, tokenizer=tokenizer, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4, pin_memory=True)

val_dataset = KaggleDataset(dataframe=val, tokenizer=tokenizer, transform=transform)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4, pin_memory=True)

test_dataset = KaggleDataset(dataframe=test, tokenizer=tokenizer, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4, pin_memory=True)

# 모델, 손실 함수, 옵티마이저 정의
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MultimodalModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=3e-5, weight_decay=1e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

# Mixed Precision Training
scaler = torch.cuda.amp.GradScaler()

# 학습 루프
num_epochs = 12
for epoch in range(num_epochs):
    model.train()
    train_loss, train_acc = 0, 0

    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        images = batch['image'].to(device, non_blocking=True)
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        labels = batch['label'].to(device, non_blocking=True).long()

        with torch.cuda.amp.autocast():
            outputs = model(images, input_ids, attention_mask)
            loss = criterion(outputs, labels)

        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item()
        train_acc += (outputs.argmax(dim=1) == labels).float().mean()

    scheduler.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {train_loss/len(train_loader):.4f}, Accuracy: {train_acc/len(train_loader):.4f}")

# 테스트 데이터 예측
model.eval()
resulting_label = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting Test Data"):
        images = batch['image'].to(device, non_blocking=True)
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)

        outputs = model(images, input_ids, attention_mask)
        predictions = outputs.argmax(dim=1)
        resulting_label.extend(predictions.cpu().numpy())

# 결과 저장
test['label'] = resulting_label  # 테스트 데이터프레임에 예측 결과 추가
test[['id', 'label']].to_csv('submission_tutorial.csv', index=False)  # 'id'와 'label' 컬럼만 저장

print("Submission file saved as 'submission_tutorial.csv'")

# 모델 저장
torch.save(model.state_dict(), "multimodal_model_optimized_v2.pth")
print("Model saved as 'multimodal_model_optimized_v2.pth'")


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
Training Epoch 1: 100%|██████████| 211/211 [01:06<00:00,  3.15it/s]


Epoch 1/12, Loss: 0.5778, Accuracy: 0.7051


Training Epoch 2: 100%|██████████| 211/211 [01:05<00:00,  3.21it/s]


Epoch 2/12, Loss: 0.4702, Accuracy: 0.7862


Training Epoch 3: 100%|██████████| 211/211 [01:08<00:00,  3.07it/s]


Epoch 3/12, Loss: 0.3763, Accuracy: 0.8327


Training Epoch 4: 100%|██████████| 211/211 [01:08<00:00,  3.06it/s]


Epoch 4/12, Loss: 0.2917, Accuracy: 0.8775


Training Epoch 5: 100%|██████████| 211/211 [01:05<00:00,  3.22it/s]


Epoch 5/12, Loss: 0.2251, Accuracy: 0.9011


Training Epoch 6: 100%|██████████| 211/211 [01:08<00:00,  3.10it/s]


Epoch 6/12, Loss: 0.1768, Accuracy: 0.9215


Training Epoch 7: 100%|██████████| 211/211 [01:09<00:00,  3.05it/s]


Epoch 7/12, Loss: 0.1488, Accuracy: 0.9301


Training Epoch 8: 100%|██████████| 211/211 [01:06<00:00,  3.17it/s]


Epoch 8/12, Loss: 0.1293, Accuracy: 0.9401


Training Epoch 9: 100%|██████████| 211/211 [01:06<00:00,  3.19it/s]


Epoch 9/12, Loss: 0.1135, Accuracy: 0.9489


Training Epoch 10: 100%|██████████| 211/211 [01:08<00:00,  3.06it/s]


Epoch 10/12, Loss: 0.1064, Accuracy: 0.9538


Training Epoch 11: 100%|██████████| 211/211 [01:08<00:00,  3.07it/s]


Epoch 11/12, Loss: 0.1073, Accuracy: 0.9542


Training Epoch 12: 100%|██████████| 211/211 [01:05<00:00,  3.23it/s]


Epoch 12/12, Loss: 0.1059, Accuracy: 0.9555


Predicting Test Data: 100%|██████████| 50/50 [00:12<00:00,  3.91it/s]


Submission file saved as 'submission_tutorial.csv'
Model saved as 'multimodal_model_optimized_v2.pth'


In [None]:
!kaggle competitions submit -c osai-project -f submission_tutorial.csv -m "Final optimized model submission"

100% 12.3k/12.3k [00:00<00:00, 14.8kB/s]
Successfully submitted to [오픈소스AI] 컴퓨터비전+텍스트분석 캐글 프로젝트

In [None]:
# 0.664
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import Compose, Resize, Normalize, ToTensor, RandomHorizontalFlip, RandomRotation, ColorJitter
from torchvision.models import efficientnet_b0
from transformers import DistilBertTokenizer, DistilBertModel
from PIL import Image
from tqdm import tqdm
import pandas as pd

# 데이터셋 정의
class KaggleDataset(Dataset):
    def __init__(self, dataframe, tokenizer, transform=None, max_len=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.transform = transform if transform else Compose([
            Resize((128, 128)),
            RandomHorizontalFlip(p=0.5),
            RandomRotation(15),
            ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
            ToTensor(),
            Normalize(mean=[0.5], std=[0.5])
        ])
        self.max_len = max_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        img_path = row['img']
        text = row['text']
        label = row.get('label', -1)  # Test 데이터에는 레이블이 없음

        # 이미지 처리
        image = Image.open(img_path).convert('RGB')
        image = self.transform(image)

        # 텍스트 처리
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        label_tensor = torch.tensor(label, dtype=torch.float32) if label != -1 else torch.tensor(-1, dtype=torch.float32)
        return {
            'image': image,
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': label_tensor
        }

# 모델 정의
class MultimodalModel(nn.Module):
    def __init__(self, cnn_output_size=256, bert_output_size=768, num_classes=2):
        super(MultimodalModel, self).__init__()
        # EfficientNet 기반 이미지 처리
        self.cnn = efficientnet_b0(pretrained=True)
        self.cnn.classifier[1] = nn.Linear(self.cnn.classifier[1].in_features, cnn_output_size)

        # BERT 기반 텍스트 처리
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.text_fc = nn.Sequential(
            nn.Linear(bert_output_size, 256),
            nn.ReLU(),
            nn.Dropout(0.3)
        )

        # 최종 결합 및 분류
        self.fc = nn.Sequential(
            nn.Linear(cnn_output_size + 256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

    def forward(self, images, input_ids, attention_mask):
        # 이미지 처리
        image_features = self.cnn(images)

        # 텍스트 처리
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_features = self.text_fc(bert_output.last_hidden_state[:, 0, :])

        # 결합 및 최종 출력
        combined_features = torch.cat((image_features, text_features), dim=1)
        output = self.fc(combined_features)
        return output

# 데이터 로드 및 전처리
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
transform = Compose([
    Resize((128, 128)),
    RandomHorizontalFlip(p=0.5),
    RandomRotation(15),
    ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    ToTensor(),
    Normalize(mean=[0.5], std=[0.5])
])

train_dataset = KaggleDataset(dataframe=train, tokenizer=tokenizer, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2, pin_memory=True)

val_dataset = KaggleDataset(dataframe=val, tokenizer=tokenizer, transform=transform)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=2, pin_memory=True)

test_dataset = KaggleDataset(dataframe=test, tokenizer=tokenizer, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=2, pin_memory=True)

# 모델, 손실 함수, 옵티마이저 정의
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MultimodalModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=5e-5, weight_decay=1e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

# Mixed Precision Training
scaler = torch.cuda.amp.GradScaler()

# 학습 루프
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss, train_acc = 0, 0

    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        images = batch['image'].to(device, non_blocking=True)
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        labels = batch['label'].to(device, non_blocking=True).long()

        with torch.cuda.amp.autocast():
            outputs = model(images, input_ids, attention_mask)
            loss = criterion(outputs, labels)

        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item()
        train_acc += (outputs.argmax(dim=1) == labels).float().mean()

    scheduler.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {train_loss/len(train_loader):.4f}, Accuracy: {train_acc/len(train_loader):.4f}")

# 테스트 데이터 예측
model.eval()
resulting_label = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting Test Data"):
        images = batch['image'].to(device, non_blocking=True)
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)

        outputs = model(images, input_ids, attention_mask)
        predictions = outputs.argmax(dim=1)
        resulting_label.extend(predictions.cpu().numpy())

# 결과 저장
test['label'] = resulting_label  # 테스트 데이터프레임에 예측 결과 추가
test[['id', 'label']].to_csv('submission_tutorial.csv', index=False)  # 'id'와 'label' 컬럼만 저장

print("Submission file saved as 'submission_tutorial.csv'")

# 모델 저장
torch.save(model.state_dict(), "multimodal_model_optimized.pth")
print("Model saved as 'multimodal_model_optimized.pth'")


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
Training Epoch 1:   4%|▍         | 8/211 [00:03<01:20,  2.51it/s]


KeyboardInterrupt: 

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import Compose, Resize, Normalize, ToTensor, RandomHorizontalFlip, RandomRotation, ColorJitter
from torchvision.models import efficientnet_b0
from transformers import DistilBertTokenizer, DistilBertModel
from PIL import Image
from tqdm import tqdm
import pandas as pd

# 데이터셋 정의
class KaggleDataset(Dataset):
    def __init__(self, dataframe, tokenizer, transform=None, max_len=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.transform = transform if transform else Compose([
            Resize((224, 224)),
            RandomHorizontalFlip(p=0.5),
            RandomRotation(10),
            ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
            ToTensor(),
            Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        self.max_len = max_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        img_path = row['img']
        text = row['text']
        label = row.get('label', -1)  # Test 데이터에는 레이블이 없음

        # 이미지 처리
        image = Image.open(img_path).convert('RGB')
        image = self.transform(image)

        # 텍스트 처리
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        label_tensor = torch.tensor(label, dtype=torch.float32) if label != -1 else torch.tensor(-1, dtype=torch.float32)
        return {
            'image': image,
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': label_tensor
        }

# 모델 정의
class MultimodalModel(nn.Module):
    def __init__(self, cnn_output_size=512, lstm_hidden_size=256, num_classes=2):
        super(MultimodalModel, self).__init__()
        # EfficientNet 기반 이미지 처리
        self.cnn = efficientnet_b0(pretrained=True)
        self.cnn.classifier[1] = nn.Linear(self.cnn.classifier[1].in_features, cnn_output_size)

        # BERT 기반 텍스트 처리 및 LSTM
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.lstm = nn.LSTM(input_size=768, hidden_size=lstm_hidden_size, batch_first=True, bidirectional=True)
        self.text_fc = nn.Sequential(
            nn.Linear(lstm_hidden_size * 2, 512),
            nn.ReLU(),
            nn.Dropout(0.5)
        )

        # 최종 결합 및 분류
        self.fc = nn.Sequential(
            nn.Linear(cnn_output_size + 512, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )

    def forward(self, images, input_ids, attention_mask):
        # 이미지 처리
        image_features = self.cnn(images)

        # 텍스트 처리: BERT + LSTM
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        lstm_input = bert_output.last_hidden_state  # shape: (batch_size, seq_len, 768)
        lstm_output, _ = self.lstm(lstm_input)
        text_features = self.text_fc(lstm_output[:, -1, :])  # 마지막 타임스텝 출력

        # 결합 및 최종 출력
        combined_features = torch.cat((image_features, text_features), dim=1)
        output = self.fc(combined_features)
        return output

# 데이터 로드 및 전처리
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
transform = Compose([
    Resize((224, 224)),
    RandomHorizontalFlip(p=0.5),
    RandomRotation(10),
    ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

train_dataset = KaggleDataset(dataframe=train, tokenizer=tokenizer, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=2, pin_memory=True)

val_dataset = KaggleDataset(dataframe=val, tokenizer=tokenizer, transform=transform)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=2, pin_memory=True)

test_dataset = KaggleDataset(dataframe=test, tokenizer=tokenizer, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=2, pin_memory=True)

# 모델, 손실 함수, 옵티마이저 정의
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MultimodalModel().to(device)
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
optimizer = optim.AdamW(model.parameters(), lr=3e-5, weight_decay=1e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

# Mixed Precision Training
scaler = torch.cuda.amp.GradScaler()

# 학습 루프
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    train_loss, train_acc = 0, 0

    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        images = batch['image'].to(device, non_blocking=True)
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        labels = batch['label'].to(device, non_blocking=True).long()

        with torch.cuda.amp.autocast():
            outputs = model(images, input_ids, attention_mask)
            loss = criterion(outputs, labels)

        optimizer.zero_grad()
        scaler.scale(loss).backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient Clipping
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item()
        train_acc += (outputs.argmax(dim=1) == labels).float().mean()

    scheduler.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {train_loss/len(train_loader):.4f}, Accuracy: {train_acc/len(train_loader):.4f}")

# 테스트 데이터 예측
model.eval()
resulting_label = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting Test Data"):
        images = batch['image'].to(device, non_blocking=True)
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)

        outputs = model(images, input_ids, attention_mask)
        predictions = outputs.argmax(dim=1)
        resulting_label.extend(predictions.cpu().numpy())

# 결과 저장
test['label'] = resulting_label  # 테스트 데이터프레임에 예측 결과 추가
test[['id', 'label']].to_csv('submission_tutorial.csv', index=False)  # 'id'와 'label' 컬럼만 저장

print("Submission file saved as 'submission_tutorial.csv'")

# 모델 저장
torch.save(model.state_dict(), "multimodal_model_optimized.pth")
print("Model saved as 'multimodal_model_optimized.pth'")


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
Training Epoch 1: 100%|██████████| 422/422 [01:24<00:00,  4.98it/s]


Epoch 1/10, Loss: 0.6616, Accuracy: 0.6400


Training Epoch 2: 100%|██████████| 422/422 [01:26<00:00,  4.87it/s]


Epoch 2/10, Loss: 0.6371, Accuracy: 0.6445


Training Epoch 3: 100%|██████████| 422/422 [01:23<00:00,  5.04it/s]


Epoch 3/10, Loss: 0.5888, Accuracy: 0.7087


Training Epoch 4: 100%|██████████| 422/422 [01:24<00:00,  4.98it/s]


Epoch 4/10, Loss: 0.5510, Accuracy: 0.7651


Training Epoch 5: 100%|██████████| 422/422 [01:28<00:00,  4.79it/s]


Epoch 5/10, Loss: 0.5244, Accuracy: 0.7858


Training Epoch 6: 100%|██████████| 422/422 [01:24<00:00,  4.97it/s]


Epoch 6/10, Loss: 0.5093, Accuracy: 0.7950


Training Epoch 7: 100%|██████████| 422/422 [01:26<00:00,  4.87it/s]


Epoch 7/10, Loss: 0.4844, Accuracy: 0.8181


Training Epoch 8: 100%|██████████| 422/422 [01:24<00:00,  4.98it/s]


Epoch 8/10, Loss: 0.4612, Accuracy: 0.8359


Training Epoch 9: 100%|██████████| 422/422 [01:24<00:00,  4.99it/s]


Epoch 9/10, Loss: 0.4399, Accuracy: 0.8489


Training Epoch 10: 100%|██████████| 422/422 [01:27<00:00,  4.82it/s]


Epoch 10/10, Loss: 0.4281, Accuracy: 0.8603


Predicting Test Data: 100%|██████████| 100/100 [00:15<00:00,  6.56it/s]


Submission file saved as 'submission_tutorial.csv'
Model saved as 'multimodal_model_optimized.pth'


In [None]:
# 0.671

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import ToTensor, Compose, Resize, Normalize
from torchvision.models import resnet18
from transformers import DistilBertTokenizer, DistilBertModel
from PIL import Image
from tqdm import tqdm
import pandas as pd

# 데이터셋 정의
class KaggleDataset(Dataset):
    def __init__(self, dataframe, tokenizer, transform=None, max_len=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.transform = transform if transform else Compose([Resize((128, 128)), ToTensor()])
        self.max_len = max_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        img_path = row['img']
        text = row['text']
        label = row.get('label', -1)  # Test 데이터에는 레이블이 없음

        # 이미지 처리
        image = Image.open(img_path).convert('RGB')
        image = self.transform(image)

        # 텍스트 처리
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        label_tensor = torch.tensor(label, dtype=torch.float32) if label != -1 else torch.tensor(-1, dtype=torch.float32)
        return {
            'image': image,
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': label_tensor
        }

# 모델 정의
class MultimodalModel(nn.Module):
    def __init__(self, cnn_output_size=128, bert_output_size=768, num_classes=2):
        super(MultimodalModel, self).__init__()
        # CNN 기반 이미지 처리
        self.cnn = resnet18(pretrained=True)
        self.cnn.fc = nn.Linear(self.cnn.fc.in_features, cnn_output_size)

        # BERT 기반 텍스트 처리
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.text_fc = nn.Linear(bert_output_size, 128)

        # 최종 분류
        self.fc = nn.Linear(cnn_output_size + 128, num_classes)

    def forward(self, images, input_ids, attention_mask):
        # 이미지 처리
        image_features = self.cnn(images)

        # 텍스트 처리
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_features = self.text_fc(bert_output.last_hidden_state[:, 0, :])

        # 결합
        combined_features = torch.cat((image_features, text_features), dim=1)
        output = self.fc(combined_features)
        return output

# 데이터 로드 및 전처리
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
transform = Compose([Resize((128, 128)), ToTensor(), Normalize(mean=[0.5], std=[0.5])])

train_dataset = KaggleDataset(dataframe=train, tokenizer=tokenizer, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2, pin_memory=True)

val_dataset = KaggleDataset(dataframe=val, tokenizer=tokenizer, transform=transform)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=2, pin_memory=True)

test_dataset = KaggleDataset(dataframe=test, tokenizer=tokenizer, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=2, pin_memory=True)

# 모델, 손실 함수, 옵티마이저 정의
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MultimodalModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)

# 학습 루프
num_epochs = 2
for epoch in range(num_epochs):
    model.train()
    train_loss, train_acc = 0, 0

    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        images = batch['image'].to(device, non_blocking=True)
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        labels = batch['label'].to(device, non_blocking=True).long()

        # Forward
        outputs = model(images, input_ids, attention_mask)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accuracy 계산
        train_loss += loss.item()
        train_acc += (outputs.argmax(dim=1) == labels).float().mean()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {train_loss/len(train_loader):.4f}, Accuracy: {train_acc/len(train_loader):.4f}")

# 테스트 데이터 예측
model.eval()
resulting_label = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Predicting Test Data"):
        images = batch['image'].to(device, non_blocking=True)
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)

        outputs = model(images, input_ids, attention_mask)
        predictions = outputs.argmax(dim=1)
        resulting_label.extend(predictions.cpu().numpy())

# 결과 저장
test['label'] = resulting_label  # 테스트 데이터프레임에 예측 결과 추가
test[['id', 'label']].to_csv('submission_tutorial.csv', index=False)  # 'id'와 'label' 컬럼만 저장

print("Submission file saved as 'submission_tutorial.csv'")


Training Epoch 1: 100%|██████████| 211/211 [01:13<00:00,  2.86it/s]


Epoch 1/5, Loss: 0.5571, Accuracy: 0.7289


Training Epoch 2: 100%|██████████| 211/211 [01:12<00:00,  2.90it/s]


Epoch 2/5, Loss: 0.3749, Accuracy: 0.8334


Training Epoch 3: 100%|██████████| 211/211 [01:12<00:00,  2.90it/s]


Epoch 3/5, Loss: 0.1772, Accuracy: 0.9328


Training Epoch 4: 100%|██████████| 211/211 [01:13<00:00,  2.89it/s]


Epoch 4/5, Loss: 0.0961, Accuracy: 0.9657


Training Epoch 5: 100%|██████████| 211/211 [01:12<00:00,  2.90it/s]


Epoch 5/5, Loss: 0.0719, Accuracy: 0.9760


Predicting Test Data: 100%|██████████| 50/50 [00:05<00:00,  8.76it/s]

Submission file saved as 'submission_tutorial.csv'





In [None]:
!kaggle competitions submit -c osai-project -f submission_tutorial.csv -m "Submission via API"

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.10/dist-packages/kaggle/__init__.py", line 7, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.10/dist-packages/kaggle/api/kaggle_api_extended.py", line 407, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /root/.config/kaggle. Or use the environment method. See setup instructions at https://github.com/Kaggle/kaggle-api/
