#%%
## Import

In [None]:
pip install mlflow

In [None]:
pip install torch

In [None]:
pip install torchvision

In [None]:
pip install scikit-learn

In [None]:
pip install pandas

In [20]:
pip install torchvision

Note: you may need to restart the kernel to use updated packages.


In [80]:
pip install opencv-python

Collecting opencv-python
  Downloading opencv_python-4.6.0.66-cp36-abi3-win_amd64.whl (35.6 MB)
     --------------------------------------- 35.6/35.6 MB 14.9 MB/s eta 0:00:00
Installing collected packages: opencv-python
Successfully installed opencv-python-4.6.0.66
Note: you may need to restart the kernel to use updated packages.


In [1]:
import random
import pandas as pd
import numpy as np
import os
import tempfile
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def set_image_dpi(file_path):
    """Downscale an image to at most 1024 px wide and save it as a 300-DPI PNG.

    The scale factor is capped at 1, so images narrower than 1024 px keep
    their size. The result is written to a new temporary ``.png`` file that is
    created with ``delete=False``: it is NOT removed automatically and the
    caller is responsible for deleting it.

    Args:
        file_path: Path of the image to read.

    Returns:
        str: Path of the temporary PNG that holds the resized image.
    """
    # The context manager closes the source file handle deterministically.
    with Image.open(file_path) as im:
        length_x, width_y = im.size
        factor = min(1, float(1024.0 / length_x))
        # Clamp to 1 px so extremely thin images cannot produce a 0-sized target.
        size = max(1, int(factor * length_x)), max(1, int(factor * width_y))
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        im_resized = im.resize(size, Image.LANCZOS)

    # Only the unique file name is needed; close the handle right away so it
    # is not leaked and the path can be re-opened for writing.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as temp_file:
        temp_filename = temp_file.name

    im_resized.save(temp_filename, dpi=(300, 300))
    return temp_filename

In [3]:

from torchvision.models import resnet18
from torchvision import transforms

from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings(action='ignore') 

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Hyperparameter Setting

In [5]:
# Every tunable setting of the notebook lives in this one dictionary.
CFG = dict(
    IMG_HEIGHT_SIZE=64,    # height every image is resized to
    IMG_WIDTH_SIZE=224,    # width every image is resized to
    EPOCHS=20,             # number of passes over the training loader
    LEARNING_RATE=1e-3,    # initial learning rate handed to the optimizer
    BATCH_SIZE=256,        # samples per batch for every DataLoader
    NUM_WORKERS=0,         # set this to match your own GPU / CPU environment
    SEED=41,               # seed for the RNGs and the train/validation split
)

## Fix Random Seed

In [6]:
def seed_everything(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        seed (int): Seed value applied to every generator.
    """
    random.seed(seed)
    # Exported for completeness; presumably only interpreters started after
    # this point pick it up, since hashing is configured at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The auto-tuner may pick different kernels from run to run, which defeats
    # the deterministic flag above, so it has to stay off.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Fix the seed

In [7]:
print(os.getcwd())

c:\Users\II\Downloads\open (2)


## Data Load & Train/Validation Split

In [8]:
# Labelled training manifest; the cells below read its `label` and `img_path`
# columns.
# NOTE(review): this is an absolute, machine-specific path - adjust it (or
# derive it from a configurable data directory) before running elsewhere.
train_path = 'C:/Users/II/Downloads/open (2)/train.csv'

df = pd.read_csv(train_path)

In [9]:
# The single-character samples of the provided training data already cover
# every character that occurs in the train/test data, so all of them are
# placed in the training set up front.
label_lengths = df['label'].str.len()
df['len'] = label_lengths
train_v1 = df.loc[label_lengths == 1]

In [10]:
print(train_v1)

                id                 img_path label  len
1      TRAIN_00001  ./train/TRAIN_00001.png     머    1
3      TRAIN_00003  ./train/TRAIN_00003.png     써    1
7      TRAIN_00007  ./train/TRAIN_00007.png     빈    1
10     TRAIN_00010  ./train/TRAIN_00010.png     윷    1
27     TRAIN_00027  ./train/TRAIN_00027.png     훵    1
...            ...                      ...   ...  ...
76869  TRAIN_76869  ./train/TRAIN_76869.png     틈    1
76872  TRAIN_76872  ./train/TRAIN_76872.png     부    1
76878  TRAIN_76878  ./train/TRAIN_76878.png     잔    1
76883  TRAIN_76883  ./train/TRAIN_76883.png     회    1
76886  TRAIN_76886  ./train/TRAIN_76886.png     톼    1

[23703 rows x 4 columns]


In [11]:
# Samples with two or more characters are split into Train (80%) /
# Validation (20%).
# NOTE(review): this is a plain random split - no `stratify=` argument is
# given, so the label length does not influence which side a sample lands on.
df = df.loc[df['len'] > 1]
train_v2, val = train_test_split(df, test_size=0.2, random_state=CFG['SEED'])

In [12]:
# Final training set = the single-character samples set aside earlier plus
# the training share of the multi-character samples.
train = pd.concat((train_v1, train_v2), axis=0)
print(train.shape[0], val.shape[0])

66251 10637


## Get Vocabulary

In [13]:
# Build the vocabulary from the training labels: concatenate every label and
# keep the sorted set of distinct characters.
train_gt = "".join(train['label'])
letters = sorted(set(train_gt))
print(len(letters))

2349


In [14]:
# "-" is placed first so that it receives index 0; the remaining indices
# follow the sorted character list.
vocabulary = ["-", *letters]
print(len(vocabulary))
# Two-way lookup tables between class index and character.
idx2char = dict(enumerate(vocabulary))
char2idx = {char: idx for idx, char in idx2char.items()}

2350


## CustomDataset

In [15]:
class CustomDataset(Dataset):
    """Dataset of word images with optional text labels.

    Args:
        img_path_list: Indexable collection of image file paths.
        label_list: Indexable collection of label strings aligned with
            ``img_path_list``, or ``None`` for unlabelled data.
        train_mode: When true, samples go through ``train_transform``,
            otherwise through ``test_transform``.

    Each item is ``(image, text, img_path)`` when labels are present and the
    bare ``image`` tensor when ``label_list`` is ``None``.
    """

    def __init__(self, img_path_list, label_list, train_mode=True):
        self.img_path_list = img_path_list
        self.label_list = label_list
        self.train_mode = train_mode

    def __len__(self):
        """Return the number of samples."""
        return len(self.img_path_list)

    def __getitem__(self, index):
        """Load, convert and transform the sample at ``index``."""
        img_path = self.img_path_list[index]
        # Close the file handle as soon as the pixels are decoded instead of
        # leaving it to the garbage collector; convert() returns a new image
        # that no longer depends on the open file.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')

        if self.train_mode:
            image = self.train_transform(image)
        else:
            image = self.test_transform(image)

        if self.label_list is not None:
            return image, self.label_list[index], img_path
        return image

    @staticmethod
    def _build_transform():
        """Build the preprocessing pipeline shared by train and test.

        Resize to the configured height/width, convert to a tensor, collapse
        to grayscale replicated over 3 channels, then normalise with mean 0.5
        and std 0.5. (An ImageNet mean/std normalisation was left disabled in
        the original code.)
        """
        return transforms.Compose([
            transforms.Resize((CFG['IMG_HEIGHT_SIZE'], CFG['IMG_WIDTH_SIZE'])),
            transforms.ToTensor(),
            transforms.Grayscale(num_output_channels=3),
            transforms.Normalize([0.5], [0.5]),
        ])

    def train_transform(self, image):
        """Preprocess a training image (currently no extra augmentation)."""
        return self._build_transform()(image)

    def test_transform(self, image):
        """Preprocess a validation / test image."""
        return self._build_transform()(image)

In [16]:

# Training data: reshuffled on every epoch.
train_dataset = CustomDataset(train['img_path'].values, train['label'].values)
train_loader = DataLoader(train_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True, num_workers=CFG['NUM_WORKERS'])

# Validation data: evaluation-time transform and a fixed order, so the
# validation loss is computed over identical batches every epoch and stays
# comparable between epochs.
val_dataset = CustomDataset(val['img_path'].values, val['label'].values, train_mode=False)
val_loader = DataLoader(val_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=CFG['NUM_WORKERS'])

In [17]:
# Sanity check: pull a single batch from the training loader and show the
# image tensor size, the path of the sample at position 14 and all labels of
# the batch.
image_batch, text_batch, name= next(iter(train_loader))
print(image_batch.size(), name[14], text_batch)



torch.Size([256, 3, 64, 224]) ./train/TRAIN_33423.png ('신비', '그나마', '씻기다', '쉴', '일반', '셋', '수입되다', '세', '달리다', '뜁', '선원', '추가', '대량', '예매하다', '콩', '쯩', '못', '사모님', '뱀', '호주머니', '젤', '벌', '불', '베개', '수도권', '회복되다', '날씨', '유산', '견해', '체조', '얘', '튀김', '삼계탕', '어려움', '세', '발자국', '터', '함께', '스타', '샛', '참여하다', '븐', '학생증', '신인', '신청', '양복', '추진하다', '집중하다', '구분되다', '기타', '지구', '꽐', '명예', '대륙', '천장', '신', '칡', '창조', '걋', '나무', '향', '여행사', '강', '흄', '출발', '전철', '꿈', '캠페인', '유산', '자극', '품', '대출', '관광버스', '내외', '법', '딱', '알리다', '먹이다', '왼발', '이제', '씁', '뒷골목', '만들다', '특정하다', '학교생활', '잘나다', '륵', '우유', '찜', '가늘다', '낮다', '적다', '척', '도', '이', '맛', '바탕', '끌다', '애쓰다', '이거', '어느덧', '주사', '겁', '늰', '빵', '곡', '바탕', '상대편', '옆', '헤', '중단', '회', '악', '농민', '긴급', '실은', '활용', '수십', '사흘', '실리다', '확대되다', '읍', '택하다', '화', '꼬마', '이해하다', '불편하다', '전문직', '멀리', '죽', '가다', '많아지다', '암', '일자', '이념', '수입되다', '팀', '돌다', '악몽', '간단하다', '불리다', '발휘하다', '미술관', '과학자', '정확하다', '턱', '큽', '텍스트', '신', '이곳저곳', '무용가', '무', '자동', '참석', '기술'

In [18]:
print(name[28])

./train/TRAIN_29309.png


In [19]:
# Drop the first 8 and the last 4 characters of the path; for paths shaped
# like './train/TRAIN_00035.png' this leaves the bare sample id.
print(name[0][8:-4])


TRAIN_00035


In [20]:
#img = image_batch[28].numpy()
#img = img.transpose(1,2,0)
#plt.imshow(img,cmap='Greys_r')
#plt.savefig('C:/Users/II/Downloads/open (2)/'+name[28][8:-4]+'.png',format='png',dpi=96)
#plt.savefig('C:/Users/II/Downloads/open (2)/'+name[28][8:-4]+'.png',format='png',dpi=300)

#plt.imshow(image_batch[0].numpy().transpose(1,2,0))
#plt.show()

In [21]:
import sys

## Model Definition

In [22]:
class RecognitionModel(nn.Module):
    """Text recogniser: ResNet-18 features -> bidirectional RNN -> char logits.

    Args:
        num_chars: Number of output classes; defaults to the size of the
            character lookup table at the time the class is defined.
        rnn_hidden_size: Width of the projection fed to the RNN and hidden
            size of each RNN direction.
    """

    def __init__(self, num_chars=len(char2idx), rnn_hidden_size=256):
        super().__init__()
        self.num_chars = num_chars
        self.rnn_hidden_size = rnn_hidden_size

        # Pretrained ResNet-18 (https://arxiv.org/abs/1512.03385) used as the
        # CNN backbone, with its last three child modules cut off.
        # NOTE(review): newer torchvision releases presumably prefer a
        # `weights=` argument over `pretrained=True` - confirm before upgrading.
        backbone = resnet18(pretrained=True)
        backbone_layers = list(backbone.children())[:-3]

        # Backbone followed by one extra conv / batch-norm / ReLU stage.
        self.feature_extract = nn.Sequential(
            *backbone_layers,
            nn.Conv2d(256, 256, kernel_size=(3, 6), stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
        )

        # Projects the flattened (channels * height) features of one column.
        # NOTE(review): 1024 assumes 256 channels x height 4 at this point,
        # i.e. it is tied to the configured input height - TODO confirm.
        self.linear1 = nn.Linear(1024, rnn_hidden_size)

        # Sequence model running over the width axis.
        self.rnn = nn.RNN(
            input_size=rnn_hidden_size,
            hidden_size=rnn_hidden_size,
            bidirectional=True,
            batch_first=True,
        )
        # Both directions are concatenated, hence twice the hidden size.
        self.linear2 = nn.Linear(self.rnn_hidden_size * 2, num_chars)

    def forward(self, x):
        """Return per-time-step class logits of shape [T, batch_size, num_chars]."""
        features = self.feature_extract(x)        # [batch_size, channels, height, width]
        features = features.permute(0, 3, 1, 2)   # [batch_size, width, channels, height]

        n_batch, n_steps = features.size(0), features.size(1)
        # Every image column becomes one time step: [batch_size, T, channels*height]
        sequence = features.view(n_batch, n_steps, -1)
        sequence = self.linear1(sequence)

        # Only the per-step outputs are needed; the final hidden state is dropped.
        rnn_out, _ = self.rnn(sequence)

        logits = self.linear2(rnn_out)
        # Time-major layout: [T, batch_size, num_chars]
        return logits.permute(1, 0, 2)

## Define CTC Loss

In [23]:
# CTC loss; class index 0 (the '-' entry of the vocabulary) is the blank.
criterion = nn.CTCLoss(blank=0, reduction='mean')

In [24]:
def encode_text_batch(text_batch):
    """Turn a batch of label strings into flat CTC targets.

    Args:
        text_batch: Sequence of label strings.

    Returns:
        tuple: ``(targets, target_lengths)`` - a 1-D int tensor holding the
        character indices of all labels concatenated, and a 1-D int tensor
        holding the length of each label.

    Raises:
        KeyError: If a character is missing from ``char2idx``.
    """
    lengths = torch.IntTensor([len(word) for word in text_batch])
    flat_indices = torch.IntTensor([char2idx[ch] for ch in "".join(text_batch)])
    return flat_indices, lengths

In [25]:
def compute_loss(text_batch, text_batch_logits):
    """Compute the CTC loss of a batch.

    Args:
        text_batch: Sequence of label strings, one per batch element.
        text_batch_logits: Tensor of size [T, batch_size, num_classes].

    Returns:
        The value returned by the module-level ``criterion``.
    """
    log_probs = F.log_softmax(text_batch_logits, dim=2)  # [T, batch_size, num_classes]
    num_steps, batch_size = log_probs.size(0), log_probs.size(1)

    # Every sample uses the full sequence length T as its input length.
    input_lengths = torch.full(
        (batch_size,), num_steps, dtype=torch.int32
    ).to(device)  # [batch_size]

    targets, target_lengths = encode_text_batch(text_batch)
    return criterion(log_probs, targets, input_lengths, target_lengths)

## Train

In [26]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    """Train ``model`` and return it carrying its best validation weights.

    Runs ``CFG['EPOCHS']`` epochs. After each epoch the validation loss is
    computed, printed and handed to ``scheduler`` (when one is given). The
    weights of the epoch with the lowest validation loss are snapshotted and
    restored into ``model`` before returning.

    NOTE(review): defining this function rebinds the notebook-level name
    ``train``, which earlier cells use for the training DataFrame; re-running
    those cells after this one will fail.

    Args:
        model: Network to optimise; moved to ``device`` in place.
        optimizer: Optimizer bound to ``model``'s parameters.
        train_loader: Iterable of ``(image_batch, text_batch, paths)``.
        val_loader: Iterable passed on to ``validation``.
        scheduler: Object with ``step(val_loss)``, or ``None``.
        device: Device the batches are moved to.

    Returns:
        ``model`` with the best-epoch weights loaded, or ``None`` when no
        epoch produced a validation loss that compares lower than the
        starting value (e.g. every loss was NaN).
    """
    import copy  # local import keeps this cell self-contained

    model.to(device)

    best_loss = float('inf')
    best_state = None
    for epoch in range(1, CFG['EPOCHS'] + 1):
        model.train()
        train_loss = []
        for image_batch, text_batch, _ in tqdm(iter(train_loader)):
            image_batch = image_batch.to(device)

            optimizer.zero_grad()

            text_batch_logits = model(image_batch)
            loss = compute_loss(text_batch, text_batch_logits)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        _train_loss = np.mean(train_loss)

        _val_loss = validation(model, val_loader, device)
        print(f'Epoch : [{epoch}] Train CTC Loss : [{_train_loss:.5f}] Val CTC Loss : [{_val_loss:.5f}]')

        if scheduler is not None:
            scheduler.step(_val_loss)

        if best_loss > _val_loss:
            best_loss = _val_loss
            # Keeping a reference to `model` would just alias the live network
            # and end up with the last-epoch weights; take a real copy of the
            # weights instead.
            best_state = copy.deepcopy(model.state_dict())

    if best_state is None:
        return None

    # Roll the network back to the best epoch before handing it out.
    model.load_state_dict(best_state)
    return model

## Validation

In [27]:
def validation(model, val_loader, device):
    """Return the mean per-batch CTC loss of ``model`` over ``val_loader``.

    The model is switched to eval mode and gradients are disabled while the
    batches are processed.

    Args:
        model: Network to evaluate.
        val_loader: Iterable of ``(image_batch, text_batch, paths)``.
        device: Device the image batches are moved to.

    Returns:
        Mean of the individual batch losses.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for images, texts, _paths in tqdm(iter(val_loader)):
            logits = model(images.to(device))
            batch_losses.append(compute_loss(texts, logits).item())
    return np.mean(batch_losses)

## Run!!

In [28]:
# Build the network, its optimizer and a scheduler that halves the learning
# rate when the validation loss stops improving, then start training.
model = RecognitionModel()
model.eval()
optimizer = torch.optim.Adam(model.parameters(), lr=CFG["LEARNING_RATE"])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=2,
    threshold_mode='abs',
    min_lr=1e-8,
    verbose=True,
)

infer_model = train(model, optimizer, train_loader, val_loader, scheduler, device)

100%|██████████| 259/259 [18:04<00:00,  4.19s/it]
100%|██████████| 42/42 [01:08<00:00,  1.64s/it]


Epoch : [1] Train CTC Loss : [6.61364] Val CTC Loss : [4.27441]


100%|██████████| 259/259 [17:43<00:00,  4.10s/it]
100%|██████████| 42/42 [01:08<00:00,  1.62s/it]


Epoch : [2] Train CTC Loss : [3.57647] Val CTC Loss : [1.87195]


100%|██████████| 259/259 [17:37<00:00,  4.08s/it]
100%|██████████| 42/42 [01:08<00:00,  1.63s/it]


Epoch : [3] Train CTC Loss : [1.85236] Val CTC Loss : [0.88643]


100%|██████████| 259/259 [17:42<00:00,  4.10s/it]
100%|██████████| 42/42 [01:07<00:00,  1.62s/it]


Epoch : [4] Train CTC Loss : [1.11185] Val CTC Loss : [0.56616]


100%|██████████| 259/259 [17:26<00:00,  4.04s/it]
100%|██████████| 42/42 [01:07<00:00,  1.60s/it]


Epoch : [5] Train CTC Loss : [0.75377] Val CTC Loss : [0.46574]


100%|██████████| 259/259 [17:24<00:00,  4.03s/it]
100%|██████████| 42/42 [01:07<00:00,  1.61s/it]


Epoch : [6] Train CTC Loss : [0.53553] Val CTC Loss : [0.42370]


100%|██████████| 259/259 [17:24<00:00,  4.03s/it]
100%|██████████| 42/42 [01:07<00:00,  1.62s/it]


Epoch : [7] Train CTC Loss : [0.38406] Val CTC Loss : [0.30661]


100%|██████████| 259/259 [17:57<00:00,  4.16s/it]
100%|██████████| 42/42 [01:08<00:00,  1.63s/it]


Epoch : [8] Train CTC Loss : [0.27229] Val CTC Loss : [0.27216]


100%|██████████| 259/259 [18:00<00:00,  4.17s/it]
100%|██████████| 42/42 [01:09<00:00,  1.65s/it]


Epoch : [9] Train CTC Loss : [0.20219] Val CTC Loss : [0.25576]


100%|██████████| 259/259 [17:56<00:00,  4.16s/it]
100%|██████████| 42/42 [01:08<00:00,  1.63s/it]


Epoch : [10] Train CTC Loss : [0.16884] Val CTC Loss : [0.27504]


100%|██████████| 259/259 [17:49<00:00,  4.13s/it]
100%|██████████| 42/42 [01:08<00:00,  1.64s/it]


Epoch : [11] Train CTC Loss : [0.13824] Val CTC Loss : [0.24485]


100%|██████████| 259/259 [18:00<00:00,  4.17s/it]
100%|██████████| 42/42 [01:09<00:00,  1.66s/it]


Epoch : [12] Train CTC Loss : [0.12293] Val CTC Loss : [0.23333]


100%|██████████| 259/259 [18:01<00:00,  4.18s/it]
100%|██████████| 42/42 [01:08<00:00,  1.64s/it]


Epoch : [13] Train CTC Loss : [0.09902] Val CTC Loss : [0.23179]


100%|██████████| 259/259 [17:55<00:00,  4.15s/it]
100%|██████████| 42/42 [01:09<00:00,  1.64s/it]


Epoch : [14] Train CTC Loss : [0.08867] Val CTC Loss : [0.22244]


100%|██████████| 259/259 [17:53<00:00,  4.14s/it]
100%|██████████| 42/42 [01:09<00:00,  1.65s/it]


Epoch : [15] Train CTC Loss : [0.08102] Val CTC Loss : [0.23615]


100%|██████████| 259/259 [18:08<00:00,  4.20s/it]
100%|██████████| 42/42 [01:09<00:00,  1.65s/it]


Epoch : [16] Train CTC Loss : [0.09233] Val CTC Loss : [0.21940]


100%|██████████| 259/259 [18:03<00:00,  4.19s/it]
100%|██████████| 42/42 [01:09<00:00,  1.65s/it]


Epoch : [17] Train CTC Loss : [0.08884] Val CTC Loss : [0.24458]


100%|██████████| 259/259 [18:07<00:00,  4.20s/it]
100%|██████████| 42/42 [01:09<00:00,  1.65s/it]


Epoch : [18] Train CTC Loss : [0.10521] Val CTC Loss : [0.23580]


100%|██████████| 259/259 [18:00<00:00,  4.17s/it]
100%|██████████| 42/42 [01:08<00:00,  1.63s/it]


Epoch : [19] Train CTC Loss : [0.08438] Val CTC Loss : [0.22495]
Epoch 00019: reducing learning rate of group 0 to 5.0000e-04.


100%|██████████| 259/259 [18:00<00:00,  4.17s/it]
100%|██████████| 42/42 [01:09<00:00,  1.65s/it]

Epoch : [20] Train CTC Loss : [0.01963] Val CTC Loss : [0.13968]





## Inference

In [29]:
# Test manifest; its `img_path` column feeds the test dataset.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
test = pd.read_csv('C:/Users/II/Downloads/open (2)/test.csv')

In [30]:
# Unlabelled test data: no label list, evaluation-time transform, and a fixed
# order so that the predictions line up with the rows of the test manifest.
test_dataset = CustomDataset(test['img_path'].values, None, train_mode=False)
test_loader = DataLoader(test_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False, num_workers=CFG['NUM_WORKERS'])

In [31]:
def decode_predictions(text_batch_logits):
    """Greedy-decode logits into one raw character string per sample.

    The strings are raw: they contain one character per time step, including
    repeated characters and the '-' entries of ``idx2char``.

    Args:
        text_batch_logits: Tensor of size [T, batch_size, num_classes]. It may
            live on any device and may be attached to the autograd graph.

    Returns:
        list[str]: ``batch_size`` strings of length ``T``.
    """
    # Softmax is monotonic, so the argmax of the raw logits selects the same
    # class without the extra pass (and without probabilities that round to
    # equal values). detach()/cpu() make the function safe for CUDA tensors
    # and for tensors that require grad.
    token_ids = text_batch_logits.detach().argmax(dim=2).cpu()  # [T, batch_size]

    decoded = []
    for sample_tokens in token_ids.t().tolist():  # iterate over [batch_size, T]
        decoded.append("".join(idx2char[idx] for idx in sample_tokens))
    return decoded

def inference(model, test_loader, device):
    """Run ``model`` over ``test_loader`` and collect the decoded strings.

    Args:
        model: Network to run; switched to eval mode.
        test_loader: Iterable yielding image batches only.
        device: Device the image batches are moved to.

    Returns:
        list: Decoded predictions of all batches, in loader order.
    """
    model.eval()
    decoded = []
    with torch.no_grad():
        for images in tqdm(iter(test_loader)):
            logits = model(images.to(device))
            # Decoding happens on the CPU copy of the logits.
            decoded += decode_predictions(logits.cpu())
    return decoded

In [32]:
# Raw (not yet post-processed) predictions of the trained model on the test loader.
predictions = inference(infer_model, test_loader, device)

100%|██████████| 290/290 [08:00<00:00,  1.66s/it]


## Submission

In [33]:
# Post-process the inference result of each sample independently.
def remove_duplicates(text):
    """Collapse every run of identical consecutive characters to one.

    Args:
        text: String to clean up.

    Returns:
        str: ``text`` with consecutive repeats removed ("" for empty input).
    """
    if not text:
        return ""
    kept = [text[0]]
    for previous, current in zip(text, text[1:]):
        if current != previous:
            kept.append(current)
    return "".join(kept)

def correct_prediction(word):
    """Turn a raw per-time-step string into the final predicted word.

    The string is cut at every "-", consecutive repeats inside each piece are
    collapsed, and the pieces are glued back together without the "-".

    Args:
        word: Raw decoded string.

    Returns:
        str: The cleaned-up prediction.
    """
    return "".join(remove_duplicates(segment) for segment in word.split("-"))

In [34]:
# Load the submission template and fill its `label` column with the
# post-processed prediction of every test sample.
submit = pd.read_csv('C:/Users/II/Downloads/open (2)/sample_submission.csv')
submit['label'] = [correct_prediction(raw_text) for raw_text in predictions]

In [35]:
# Write the submission without the index column. 'utf-8-sig' prepends a BOM -
# presumably so spreadsheet tools detect the Korean text as UTF-8.
submit.to_csv('C:/Users/II/Downloads/open (2)/submission.csv',encoding='utf-8-sig',index=False)