In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader
import torchvision
from torchvision import transforms,models
import torch.optim as optim

from sklearn.model_selection import train_test_split
from sklearn import metrics
from PIL import Image
from PIL import ImageFile
import albumentations

In [2]:
class ClassificationDataset(Dataset):
    def __init__(self ,image_paths, targets, resize=None, augmentations=None):
        self.image_paths = image_paths
        self.targets = targets
        self.resize = resize
        self.augmentations = augmentations

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, item):
        image = Image.open(self.image_paths[item])
        image = image.convert("RGB")
        targets = self.targets[item]

        if self.resize is not None:
            image = image.resize((self.resize[1], self.resize[0]),
                                 resample=Image.BILINEAR)

        image = np.array(image)

        if self.augmentations is not None:
            augmented = self.augmentations(image=image)
            image = augmented['image']

        # torch expects CxHxW instead of HxWxC
        image = np.transpose(image, (2,0,1)).astype(np.float32)

        image = torch.tensor(image, dtype=torch.float)
        targets = torch.tensor(targets, dtype=torch.long)

        return (image, targets)


In [3]:
root_dir = "../input/jpeg-melanoma-256x256/"
train_csv = pd.read_csv(root_dir + 'train.csv')

def get_train_val_split(df):
    #Removing Duplicates
    df = df[df.tfrecord != -1].reset_index(drop=True)
    train_tf_records = list(range(len(df.tfrecord.unique())))[:12]
    split_cond = df.tfrecord.apply(lambda x: x in train_tf_records)
    train_df = df[split_cond].reset_index()
    valid_df = df[~split_cond].reset_index()
    return train_df,valid_df

train_files, test_files = get_train_val_split(train_csv)

In [4]:
train_image_ids = train_files.image_name.values.tolist()
train_images = [os.path.join(root_dir + 'train/' ,i +'.jpg') for i in train_image_ids]
train_targets = train_files.target.values

test_image_ids = test_files.image_name.values.tolist()
test_images = [os.path.join(root_dir + 'train/' ,i +'.jpg') for i in test_image_ids]
test_targets = test_files.target.values

In [5]:
test_files.head()

Unnamed: 0,index,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,tfrecord,width,height
0,6,ISIC_0074542,IP_4698288,male,25.0,lower extremity,unknown,benign,0,14,5184,3456
1,10,ISIC_0076545,IP_9802602,male,55.0,upper extremity,unknown,benign,0,14,4288,2848
2,17,ISIC_0079038,IP_5295861,male,70.0,torso,unknown,benign,0,13,6000,4000
3,18,ISIC_0080512,IP_1870306,male,75.0,torso,unknown,benign,0,12,6000,4000
4,24,ISIC_0082934,IP_6572129,male,65.0,torso,unknown,benign,0,12,6000,4000


In [6]:
train_files.head()

Unnamed: 0,index,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,tfrecord,width,height
0,0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0,0,6000,4000
1,1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0,0,6000,4000
2,2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0,6,1872,1053
3,3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0,0,1872,1053
4,4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0,11,6000,4000


In [7]:
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)
# adding a simple augmentation
aug = albumentations.Compose([
    albumentations.Normalize(mean, std, max_pixel_value=255.0, always_apply=True)
])


In [8]:
train_dataset = ClassificationDataset(image_paths=train_images, targets=train_targets, resize=(224,224), augmentations=aug)
test_dataset = ClassificationDataset(image_paths=test_images, targets= test_targets, resize=(224,224), augmentations=aug)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=4)

In [9]:
for images,targets in test_loader:
    print(images.shape)

torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size([64, 3, 224, 224])
torch.Size

In [10]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [11]:
# defining model
model = models.resnext50_32x4d(pretrained=True)
model.fc = nn.Sequential(
    nn.Linear(2048,1000),
    nn.Dropout(p=0.5),
    nn.Linear(1000,1)
)
model.to(device)

Downloading: "https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth" to /root/.cache/torch/checkpoints/resnext50_32x4d-7cdf4587.pth


HBox(children=(FloatProgress(value=0.0, max=100441675.0), HTML(value='')))




ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1

In [12]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=5e-4)

In [22]:
def train(data_loader, model, optimizer, device):
    model.train()
    
    for images,targets in data_loader:
        
        images = images.to(device, dtype=torch.float)
        targets = targets.to(device, dtype=torch.float) 
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, targets.view(-1,1))
        loss.backward()
        optimizer.step()
    

In [23]:
def evaluate(data_loader, model, device):
    model.eval()
    
    final_targets =  []
    final_outputs = []
    
    with torch.no_grad():
        for images, targets in data_loader:
            
            images = images.to(device, dtype=torch.float)
            targets = targets.to(device, dtype=torch.float) 
            
            output = model(images)
            targets = targets.detach().cpu().numpy().tolist()
            output = output.detach().cpu().numpy().tolist()
            
            # extend the original list
            final_targets.extend(targets)
            final_outputs.extend(output)
            
    return final_outputs, final_targets

In [24]:
for epoch in range(5):
    train(train_loader, model, optimizer, device=device)
    predictions, valid_targets = evaluate(test_loader, model, device=device)
    roc_auc = metrics.roc_auc_score(valid_targets, predictions)
    # f1_score = metrics.f1_score(valid_targets,predictions)
    print(f"Epochs={epoch} Valid ROC AUC={roc_auc}")

Epochs=0 Valid ROC AUC=0.8165465096065935
Epochs=1 Valid ROC AUC=0.8287134901014732
Epochs=2 Valid ROC AUC=0.8505806153686918
Epochs=3 Valid ROC AUC=0.8344791712281232
Epochs=4 Valid ROC AUC=0.8367204221885172


In [25]:
# predictions on test data
df = pd.read_csv(root_dir + 'test.csv')
df.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,width,height
0,ISIC_0052060,IP_3579794,male,70.0,,6000,4000
1,ISIC_0052349,IP_7782715,male,40.0,lower extremity,6000,4000
2,ISIC_0058510,IP_7960270,female,55.0,torso,6000,4000
3,ISIC_0073313,IP_6375035,female,50.0,torso,6000,4000
4,ISIC_0073502,IP_0589375,female,45.0,lower extremity,1920,1080


In [26]:
class submissionDataset(Dataset):
    def __init__(self ,image_paths, resize=None, augmentations=None):
        self.image_paths = image_paths
        self.resize = resize
        self.augmentations = augmentations

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, item):
        image = Image.open(self.image_paths[item])
        image = image.convert("RGB")

        if self.resize is not None:
            image = image.resize((self.resize[1], self.resize[0]),
                                 resample=Image.BILINEAR)

        image = np.array(image)

        if self.augmentations is not None:
            augmented = self.augmentations(image=image)
            image = augmented['image']

        # torch expects CxHxW instead of HxWxC
        image = np.transpose(image, (2,0,1)).astype(np.float32)

        image = torch.tensor(image, dtype=torch.float)

        return image

In [30]:
sub_image_ids = df.image_name.values.tolist()
sub_images = [os.path.join(root_dir + 'test/' ,i +'.jpg') for i in sub_image_ids]

sub_dataset = submissionDataset(image_paths=sub_images, resize=(224,224), augmentations=aug)
sub_loader = DataLoader(sub_dataset, batch_size=16, shuffle=False, num_workers=4)

In [31]:
for image in sub_loader:
    print(image.shape)
    break

torch.Size([16, 3, 224, 224])


In [32]:
def get_preds(model,device=None,tta=3):
    if device is None:
        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    preds = np.zeros(len(sub_dataset))
    for tta_id in range(tta):
        test_preds = []
        with torch.no_grad():
            for image in sub_loader:
                image = image.to(device)
                output = model(image)
                output = torch.sigmoid(output)
                test_preds.extend(output.cpu().numpy())
            preds += np.array(test_preds).reshape(-1)
        print(f'TTA {tta_id}')
    preds /= tta
    return preds

#Changing tta to 25 from 10
preds = get_preds(model,tta=25)  

TTA 0
TTA 1
TTA 2
TTA 3
TTA 4
TTA 5
TTA 6
TTA 7
TTA 8
TTA 9
TTA 10
TTA 11
TTA 12
TTA 13
TTA 14
TTA 15
TTA 16
TTA 17
TTA 18
TTA 19
TTA 20
TTA 21
TTA 22
TTA 23
TTA 24


In [34]:
subm = pd.read_csv(root_dir + 'sample_submission.csv')
subm.target = preds
subm.to_csv('submission.csv',index=False)

NameError: name 'path' is not defined