In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import pathlib
import torch.utils.data
from sklearn.preprocessing import MultiLabelBinarizer

import torchvision.transforms as transforms
import numpy as np
import torch.optim as optim
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import PIL

In [2]:
RANDOM_SEED = 666
cuda = torch.device('cuda')

LABEL_MAP = {
0: "Nucleoplasm" ,
1: "Nuclear membrane"   ,
2: "Nucleoli"   ,
3: "Nucleoli fibrillar center",   
4: "Nuclear speckles"   ,
5: "Nuclear bodies"   ,
6: "Endoplasmic reticulum"   ,
7: "Golgi apparatus"  ,
8: "Peroxisomes"   ,
9:  "Endosomes"   ,
10: "Lysosomes"   ,
11: "Intermediate filaments"  , 
12: "Actin filaments"   ,
13: "Focal adhesion sites"  ,
14: "Microtubules"   ,
15: "Microtubule ends"   ,
16: "Cytokinetic bridge"   ,
17: "Mitotic spindle"  ,
18: "Microtubule organizing center",  
19: "Centrosome",
20: "Lipid droplets"   ,
21: "Plasma membrane"  ,
22: "Cell junctions"   ,
23: "Mitochondria"   ,
24: "Aggresome"   ,
25: "Cytosol" ,
26: "Cytoplasmic bodies",
27: "Rods & rings"}

In [3]:
class ProtienDataset(Dataset):
    BANDS_NAMES = ['_red.png','_green.png','_blue.png','_yellow.png']
    
    def __len__(self):
        return len(self.images_df)
    
    def __init__(self, images_df, 
                 base_path, 
                 image_transform, 
                 augmentator=None,
                 train_mode=True    
                ):
        if not isinstance(base_path, pathlib.Path):
            base_path = pathlib.Path(base_path)
            
        self.images_df = images_df.copy()
        self.image_transform = image_transform
        self.augmentator = augmentator
        self.images_df.Id = self.images_df.Id.apply(lambda x: base_path / x)
        self.mlb = MultiLabelBinarizer(classes=list(LABEL_MAP.keys()))
        self.train_mode = train_mode
        
    def __getitem__(self, index):
        y = None
        X = self._load_multiband_image(index)
        if self.train_mode:
            y = self._load_multilabel_target(index)
        
        # augmentator can be for instance imgaug augmentation object
        if self.augmentator is not None:
            X = self.augmentator(X)
            
        X = self.image_transform(X)
            
        return X, y 
        
    def _load_multiband_image(self, index):
        row = self.images_df.iloc[index]
        image_bands = []
        for band_name in self.BANDS_NAMES:
            p = str(row.Id.absolute()) + band_name
            pil_channel = PIL.Image.open(p)
            image_bands.append(pil_channel)
            
        # lets pretend its a RBGA image to support 4 channels
        band4image = PIL.Image.merge('RGBA', bands=image_bands)
        return band4image
    
    def _load_multilabel_target(self, index):
        return list(map(int, self.images_df.iloc[index].Target.split(' ')))
    
        
    def collate_func(self, batch):
        labels = None
        images = [x[0] for x in batch]
        
        if self.train_mode:
            labels = [x[1] for x in batch]
            labels_one_hot  = self.mlb.fit_transform(labels)
            labels = torch.FloatTensor(labels_one_hot)
            
        
        return torch.stack(images)[:,:4,:,:], labels



In [10]:
# 모델 셋팅
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.C1 = nn.Conv2d(in_channels=4, out_channels=8, kernel_size=3,padding=1)
        self.C2 = nn.Conv2d(in_channels=8, out_channels=8, kernel_size=3,padding=1)
        
        self.C3 = nn.Conv2d(in_channels=8, out_channels=16, kernel_size=3,padding=1)
        self.C4 = nn.Conv2d(in_channels=16, out_channels=16, kernel_size=3,padding=1)
        
        self.C5 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3,padding=1)
        self.C6 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3,padding=1)
        self.C7 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3,padding=1)
        self.C8 = nn.Conv2d(in_channels=32, out_channels=32, kernel_size=3,padding=1)
        
        self.C9 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3,padding=1)
        self.C10 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3,padding=1)
        self.C11 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3,padding=1)
        self.C12 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3,padding=1)

        self.L1 = nn.Linear(16*16*64, 512)
        self.L2 = nn.Linear(512, 28)

    def forward(self, x):
        x=self.C1(x)
        x=self.C2(x)
        x=F.max_pool2d(F.relu(x),2)

        x=self.C3(x)
        x=self.C4(x)
        x=F.max_pool2d(F.relu(x),2)
        
        x=self.C5(x)
        x=self.C6(x)
        x=self.C7(x)
        x=self.C8(x)
        x=F.max_pool2d(F.relu(x),2)
        
        x=self.C9(x)
        x=self.C10(x)
        x=self.C11(x)
        x=self.C12(x)
        x=F.max_pool2d(F.relu(x),2)
        
        x = x.view(-1, self.num_flat_features(x))

        x = F.relu(self.L1(x))
        x = F.relu(self.L2(x))
        
#         x=F.sigmoid(x)

        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

In [11]:
## Submission
def predict_submission(model, submission_load):
    all_preds = []
#     model.eval()
    for i, b in enumerate(submission_load):
        if i % 100: print('processing batch {}/{}'.format(i, len(submission_load)))
        X, _ = b
#         if torch.cuda.is_available():
#             X = X.cuda()
        pred = model(X)
        all_preds.append(pred.sigmoid().cpu().data.numpy())
    return np.concatenate(all_preds)
        
         
def make_submission_file(sample_submission_df, predictions):
    submissions = []
    for row in predictions:
        subrow = ' '.join(list([str(i) for i in np.nonzero(row)[0]]))
        submissions.append(subrow)
    
    sample_submission_df['Predicted'] = submissions
    sample_submission_df.to_csv('submission.csv', index=None)
    
    return sample_submission_df

In [12]:
PATH_TO_IMAGES = './input/train/'
PATH_TO_TEST_IMAGES = './input/test/'
PATH_TO_META = './input/train.csv'
SAMPLE_SUBMI = './input/sample_submission.csv'

In [15]:
SEED = 666
DEV_MODE = True
    
df = pd.read_csv(PATH_TO_META)
df_train, df_test  = train_test_split(df, test_size=0.2, random_state=SEED)
df_submission = pd.read_csv(SAMPLE_SUBMI)

# if DEV_MODE:
#     df_train = df_train[:200]
#     df_test = df_test[:50]
#     df_submission = df_submission[:50]

# image_transform = transforms.Compose([transforms.ToTensor()])

image_transform = transforms.Compose([
           transforms.Resize(256),
           transforms.ToTensor(),
       ])

 
# Prepare datasets and loaders
   
gtrain = ProteinDataset(df_train, base_path=PATH_TO_IMAGES, image_transform=image_transform)
gtest = ProteinDataset(df_test, base_path=PATH_TO_IMAGES, image_transform=image_transform)
gsub = ProteinDataset(df_submission, base_path=PATH_TO_TEST_IMAGES, train_mode=False, image_transform=image_transform)

train_load = DataLoader(gtrain, collate_fn=gtrain.collate_func, batch_size=256, num_workers=6)
test_load = DataLoader(gtest, collate_fn=gtest.collate_func, batch_size=256, num_workers=6)
submission_load = DataLoader(gsub, collate_fn=gsub.collate_func, batch_size=256, num_workers=6)


In [16]:
model = Net()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
criterion = nn.CrossEntropyLoss()
# optimizer = optim.SGD(model.parameters(),lr = 0.01, momentum = 0.5)
optimizer = optim.Adam(model.parameters(), lr=0.0005, betas=(0.9, 0.99))

params = list(model.parameters())
print(len(params))
print(params[0].size())

def train(epoch):
    model.train()
    for batch_idx,(data,target) in enumerate(train_load):

        data = data.to(device)
        target = target.to(device)
        target = target.long()
#         print(target.type())
#         print(target)
        print(data.size())

        output = model(data)
#         print(output)
# 
        optimizer.zero_grad()
#         loss = criterion(output,target)
        loss = criterion(output, torch.max(target, 1)[1])
#         print(loss)
        loss.backward()
        optimizer.step()
#ㅇㄹㄴㅇㄹㅇ
        if batch_idx%100==0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                                                                           epoch, batch_idx * len(data), len(train_load.dataset),100. * batch_idx / len(train_load), loss))

def find(data):
    model.eval()
    data = data.to(device)
    output = model(data)
    return output

for epoch in range(1, 5):
    train(epoch)

28
torch.Size([8, 4, 3, 3])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size(

torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Size([256, 4, 256, 256])
torch.Si

In [17]:
submission_predictions =predict_submission(model, submission_load)

processing batch 1/46
processing batch 2/46
processing batch 3/46
processing batch 4/46
processing batch 5/46
processing batch 6/46
processing batch 7/46
processing batch 8/46
processing batch 9/46
processing batch 10/46
processing batch 11/46
processing batch 12/46
processing batch 13/46
processing batch 14/46
processing batch 15/46
processing batch 16/46
processing batch 17/46
processing batch 18/46
processing batch 19/46
processing batch 20/46
processing batch 21/46
processing batch 22/46
processing batch 23/46
processing batch 24/46
processing batch 25/46
processing batch 26/46
processing batch 27/46
processing batch 28/46
processing batch 29/46
processing batch 30/46
processing batch 31/46
processing batch 32/46
processing batch 33/46
processing batch 34/46
processing batch 35/46
processing batch 36/46
processing batch 37/46
processing batch 38/46
processing batch 39/46
processing batch 40/46
processing batch 41/46
processing batch 42/46
processing batch 43/46
processing batch 44/

In [31]:
# prepare the submission file and 
THRESHOLD = 0.8
print(submission_predictions)
p = submission_predictions>THRESHOLD

submission_file = make_submission_file(sample_submission_df=df_submission,predictions=p)

[[0.5        0.5        0.9006689  ... 0.5840846  0.5        0.5       ]
 [0.5        0.5        0.79495066 ... 0.933582   0.5        0.5       ]
 [0.5        0.5        0.5        ... 0.9072634  0.5        0.5       ]
 ...
 [0.5        0.5        0.7778057  ... 0.900845   0.5        0.5       ]
 [0.5        0.5        0.8303375  ... 0.7795607  0.5        0.5       ]
 [0.5        0.5        0.68358487 ... 0.96920514 0.5        0.5       ]]


In [32]:
submission_file.head()

Unnamed: 0,Id,Predicted
0,00008af0-bad0-11e8-b2b8-ac1f6b6435d0,2 4 5 7 19
1,0000a892-bacf-11e8-b2b8-ac1f6b6435d0,2 5 7 19 21 23 25
2,0006faa6-bac7-11e8-b2b7-ac1f6b6435d0,6 21 25
3,0008baca-bad7-11e8-b2b9-ac1f6b6435d0,2 5 7 19 21 23 25
4,000cce7e-bad4-11e8-b2b8-ac1f6b6435d0,6 7 19 21 23 25
