In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install wtfml
!pip install pretrainedmodels

In [None]:
import os
import torch

import albumentations
import pretrainedmodels

import numpy as np
import pandas as pd
import torch.nn as nn

import cv2
import torch

import numpy as np


from PIL import Image
from PIL import ImageFile


ImageFile.LOAD_TRUNCATED_IMAGES = True

#from apex import amp
from sklearn import metrics
from torch.nn import functional as F

#from wtfml.data_loaders.image import ClassificationDataset
#from wtfml.engine import Engine
from wtfml.utils import EarlyStopping

from sklearn.model_selection import StratifiedKFold


In [None]:
class SEResNext50_32x4d(nn.Module):
    """SE-ResNeXt50 (32x4d) backbone followed by a single-logit linear head.

    The backbone is looked up in ``pretrainedmodels``; its feature map is
    globally average-pooled, flattened and passed through ``nn.Linear(2048, 1)``.
    """

    def __init__(self, pretrained="imagenet"):
        """Build the backbone and the output layer.

        Args:
            pretrained: passed through unchanged as the ``pretrained`` argument
                of the ``pretrainedmodels`` ``se_resnext50_32x4d`` constructor.
        """
        super(SEResNext50_32x4d, self).__init__()
        self.model = pretrainedmodels.__dict__[
            "se_resnext50_32x4d"
        ](pretrained=pretrained)
        self.out = nn.Linear(2048, 1)

    def forward(self, image, targets=None):
        """Compute the logits and, when labels are supplied, the BCE loss.

        Args:
            image: batched input tensor; its first dimension is the batch size.
            targets: optional labels. They are reshaped to ``(-1, 1)`` and cast
                to the dtype/device of the logits before the loss is computed.

        Returns:
            ``(out, loss)`` where ``out`` has shape ``(batch, 1)`` and ``loss``
            is the ``BCEWithLogitsLoss`` value, or ``None`` when ``targets`` is
            not given (inference without dummy labels).
        """
        bs = image.shape[0]
        x = self.model.features(image)
        x = F.adaptive_avg_pool2d(x, 1)
        x = x.reshape(bs, -1)
        out = self.out(x)
        if targets is None:
            # No labels available: return the raw logits only.
            return out, None
        loss = nn.BCEWithLogitsLoss()(
            out, targets.reshape(-1, 1).type_as(out)
        )
        return out, loss

In [None]:
class AverageMeter:
    """Keeps the most recent value together with a running weighted average."""

    def __init__(self):
        # A fresh meter is simply a meter that has just been reset.
        self.reset()

    def reset(self):
        """Clear the last value, the weighted sum, the count and the average."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the average.

        Args:
            val: the newly observed value.
            n: weight of the observation (e.g. the batch size).
        """
        weighted = val * n
        self.val = val
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count

In [None]:

import datetime
import torch
from tqdm import tqdm

try:
    from torch.cuda import amp
    _amp_available = True
except ImportError:
    _amp_available = False

try:
    import torch_xla.core.xla_model as xm
    import torch_xla.distributed.parallel_loader as pl

    _xla_available = True
except ImportError:
    _xla_available = False


def reduce_fn(vals):
    """Return the arithmetic mean of ``vals`` (used as the reducer for ``xm.mesh_reduce``)."""
    total = sum(vals)
    count = len(vals)
    return total / count


class Engine:
    """Runs the train / evaluate / predict loops of a model over one data loader.

    Unless ``model_fn`` is given, every batch yielded by ``data_loader`` must be
    a dict of tensors; each tensor is moved to ``device`` and the dict is
    unpacked into ``model(**batch)``, which must return ``(predictions, loss)``.
    """

    def __init__(
        self,
        model,
        optimizer,
        device,
        data_loader,
        scheduler=None,
        accumulation_steps=1,
        use_tpu=False,
        tpu_print=10,
        fp16=False,
        model_fn=None,
        use_mean_loss=False,
    ):
        """Store the loop configuration and validate the fp16 / TPU options.

        Args:
            model: the model to train or evaluate.
            optimizer: optimizer used by ``train``; not touched by
                ``evaluate`` or ``predict``.
            device: device every batch tensor is moved to.
            data_loader: iterable of batches exposing ``__len__`` and
                ``batch_size``.
            scheduler: optional scheduler; ``train`` calls
                ``scheduler.step(loss)`` after every optimizer step.
            accumulation_steps: number of batches whose gradients are
                accumulated before each optimizer step (non-TPU path).
            use_tpu: run through ``torch_xla`` instead of the regular path.
            tpu_print: progress is logged roughly every ``tpu_print`` percent
                of the loader when running on TPU.
            fp16: use ``torch.cuda.amp`` autocast and gradient scaling.
            model_fn: optional callable ``model_fn(data, device, model)`` that
                returns the loss, for example::

                    def model_fn(data, device, model):
                        images, targets = data
                        images = list(image.to(device) for image in images)
                        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
                        _, loss = model(images, targets)
                        return loss

            use_mean_loss: reduce the loss with ``.mean()`` before using it.

        Raises:
            Exception: if fp16 is requested without amp, TPU is requested
                without ``torch_xla``, or fp16 and TPU are combined.
        """
        super(Engine, self).__init__()
        self.model = model
        self.optimizer = optimizer
        self.device = device
        self.scheduler = scheduler
        self.data_loader = data_loader
        self.accumulation_steps = accumulation_steps
        self.use_tpu = use_tpu
        self.tpu_print = tpu_print
        self.model_fn = model_fn
        self.fp16 = fp16
        if self.fp16 and not _amp_available:
            raise Exception(
                "You want to use fp16 but dont have amp installed"
            )
        self.use_mean_loss = use_mean_loss
        self.scaler = None

        if self.use_tpu and not _xla_available:
            raise Exception(
                "You want to use TPUs but you dont have pytorch_xla installed"
            )
        if self.fp16 and use_tpu:
            raise Exception("Apex fp16 is not available when using TPUs")
        if self.fp16:
            self.scaler = amp.GradScaler()

    def _print_interval(self):
        """Batches between TPU progress messages; at least 1 so ``%`` never divides by zero."""
        return max(1, int(len(self.data_loader) * self.tpu_print / 100))

    def train(self):
        """Run one pass over the loader, updating the model.

        Returns:
            The running average of the per-batch loss, weighted by
            ``data_loader.batch_size``.
        """
        losses = AverageMeter()
        self.model.train()
        print_idx = self._print_interval()
        n_batches = len(self.data_loader)
        # Always start the pass from clean gradients.
        self.optimizer.zero_grad()
        if self.use_tpu:
            para_loader = pl.ParallelLoader(self.data_loader, [self.device])
            tk0 = para_loader.per_device_loader(self.device)
        else:
            tk0 = tqdm(self.data_loader, total=n_batches)

        for b_idx, data in enumerate(tk0):
            if self.model_fn is None:
                for key, value in data.items():
                    data[key] = value.to(self.device)
                if self.fp16:
                    # Run the forward pass in mixed precision, as evaluate() does.
                    with amp.autocast():
                        _, loss = self.model(**data)
                else:
                    _, loss = self.model(**data)
            else:
                if self.fp16:
                    with amp.autocast():
                        loss = self.model_fn(data, self.device, self.model)
                else:
                    loss = self.model_fn(data, self.device, self.model)

            if not self.use_tpu:
                with torch.set_grad_enabled(True):
                    if self.use_mean_loss:
                        loss = loss.mean()

                    if self.fp16:
                        self.scaler.scale(loss).backward()
                    else:
                        loss.backward()

                    if (b_idx + 1) % self.accumulation_steps == 0:
                        if self.fp16:
                            self.scaler.step(self.optimizer)
                        else:
                            self.optimizer.step()

                        if self.scheduler is not None:
                            self.scheduler.step(loss)

                        if self.fp16:
                            self.scaler.update()

                        # Clear gradients after EVERY step (including the first
                        # batch) so they never leak into the next update.
                        self.optimizer.zero_grad()

            else:
                loss.backward()
                xm.optimizer_step(self.optimizer)
                if self.scheduler is not None:
                    self.scheduler.step(loss)
                self.optimizer.zero_grad()

            if self.use_tpu:
                reduced_loss = xm.mesh_reduce("loss_reduce", loss, reduce_fn)
                losses.update(reduced_loss.item(), self.data_loader.batch_size)
            else:
                losses.update(loss.item(), self.data_loader.batch_size)

            if not self.use_tpu:
                tk0.set_postfix(loss=losses.avg)
            else:
                # Log periodically and on the final batch (index n_batches - 1).
                if b_idx % print_idx == 0 or b_idx == n_batches - 1:
                    xm.master_print(
                        f"{datetime.datetime.now()}: Batch {b_idx} / {len(self.data_loader)}, loss={losses.avg}"
                    )
        if not self.use_tpu:
            tk0.close()
        return losses.avg

    def evaluate(self, return_predictions=False):
        """Run one pass over the loader without gradient tracking.

        Args:
            return_predictions: when True, collect the model's prediction
                tensor of every batch.

        Returns:
            ``(average_loss, predictions)``; ``predictions`` is a list with one
            tensor per batch, or an empty list if ``return_predictions`` is False.
        """
        losses = AverageMeter()
        print_idx = self._print_interval()
        n_batches = len(self.data_loader)
        self.model.eval()
        final_predictions = []
        with torch.no_grad():
            if self.use_tpu:
                para_loader = pl.ParallelLoader(self.data_loader, [self.device])
                tk0 = para_loader.per_device_loader(self.device)
            else:
                tk0 = tqdm(self.data_loader, total=n_batches)
            for b_idx, data in enumerate(tk0):
                for key, value in data.items():
                    data[key] = value.to(self.device)
                if self.fp16:
                    with amp.autocast():
                        batch_preds, loss = self.model(**data)
                else:
                    batch_preds, loss = self.model(**data)
                if return_predictions:
                    final_predictions.append(batch_preds)
                if self.use_tpu:
                    reduced_loss = xm.mesh_reduce("loss_reduce", loss, reduce_fn)
                    losses.update(reduced_loss.item(), self.data_loader.batch_size)
                else:
                    if self.use_mean_loss:
                        loss = loss.mean()
                    losses.update(loss.item(), self.data_loader.batch_size)
                if not self.use_tpu:
                    tk0.set_postfix(loss=losses.avg)
                else:
                    # Log periodically and on the final batch (index n_batches - 1).
                    if b_idx % print_idx == 0 or b_idx == n_batches - 1:
                        xm.master_print(
                            f"{datetime.datetime.now()}: Batch {b_idx} / {len(self.data_loader)}, loss={losses.avg}"
                        )
            if not self.use_tpu:
                tk0.close()
        return losses.avg, final_predictions

    def predict(self):
        """Run inference over the loader and collect the predictions on the CPU.

        Returns:
            A list with one CPU prediction tensor per batch.

        Raises:
            Exception: when the engine was configured with ``use_tpu=True``.
        """
        self.model.eval()
        final_predictions = []
        if self.use_tpu:
            raise Exception("TPU not available for predict yet!")
        with torch.no_grad():
            tk0 = tqdm(self.data_loader, total=len(self.data_loader))
            for data in tk0:
                for key, value in data.items():
                    data[key] = value.to(self.device)
                predictions, _ = self.model(**data)
                predictions = predictions.cpu()
                final_predictions.append(predictions)
        return final_predictions

In [None]:
class ClassificationDataset:
    """Map-style dataset that loads an image from disk and pairs it with its target."""

    def __init__(self, image_paths, targets, resize, augmentations=None, backend="pil", channel_first=True,):
        """
        Args:
            image_paths: sequence of image file paths.
            targets: sequence of labels, indexed in parallel with ``image_paths``.
            resize: ``None`` to keep the stored size, otherwise an indexable
                pair; ``resize[1]`` is passed first and ``resize[0]`` second
                to the backend's resize call.
            augmentations: optional callable invoked as
                ``augmentations(image=array)`` that returns a dict holding the
                transformed array under the ``"image"`` key.
            backend: ``"pil"`` or ``"cv2"``.
            channel_first: when True the array is transposed with
                ``(2, 0, 1)`` and cast to ``float32``.
        """
        super(ClassificationDataset, self).__init__()

        self.image_paths = image_paths
        self.targets = targets
        self.resize = resize
        self.augmentations = augmentations
        self.backend = backend
        self.channel_first = channel_first

    def __len__(self):
        """Number of samples (one per image path)."""
        return len(self.image_paths)

    def __getitem__(self, item):
        """Load sample ``item``.

        Returns:
            dict with ``"image"`` and ``"targets"`` tensors.

        Raises:
            FileNotFoundError: if the image path does not point to a file.
            Exception: if ``backend`` is neither ``"pil"`` nor ``"cv2"``.
        """
        targets = self.targets[item]
        path = self.image_paths[item]
        # Fail with a clear message instead of an unbound-variable error at the return.
        if not os.path.isfile(path):
            raise FileNotFoundError(f"Image file not found: {path}")

        if self.backend == "pil":
            # The context manager closes the file handle once the pixels are copied.
            with Image.open(path) as pil_image:
                resized = pil_image
                if self.resize is not None:
                    resized = pil_image.resize(
                        (self.resize[1], self.resize[0]), resample=Image.BILINEAR
                    )
                image = np.array(resized)
        elif self.backend == "cv2":
            image = cv2.imread(path)
            # OpenCV decodes to BGR; convert so both backends yield RGB.
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            if self.resize is not None:
                image = cv2.resize(
                    image,
                    (self.resize[1], self.resize[0]),
                    interpolation=cv2.INTER_CUBIC,
                )
        else:
            raise Exception("Backend not implemented")

        # Augmentations are optional for both backends.
        if self.augmentations is not None:
            augmented = self.augmentations(image=image)
            image = augmented["image"]

        if self.channel_first:
            image = np.transpose(image, (2, 0, 1)).astype(np.float32)

        return {
            "image": torch.tensor(image),
            "targets": torch.tensor(targets),
        }

In [None]:
# create folds
# Shuffle the training table, assign each row to one of 5 stratified folds
# (stratified on `target`) and persist the result for the training runs.
df = pd.read_csv("../input/siim-isic-melanoma-classification/train.csv")
df["kfold"] = -1
# Fixed seed: without it every re-run of this cell reshuffles the rows and
# changes which samples land in which fold, so models trained on different
# folds at different times would not share one consistent split.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
y = df.target.values
kf = StratifiedKFold(n_splits=5)

# v_ holds the row positions of the validation part of fold f.
for f, (t_, v_) in enumerate(kf.split(X=df, y=y)):
    df.loc[v_, 'kfold'] = f

df.to_csv("train_folds.csv", index=False)

In [None]:
# Reload the fold assignments from disk and display the frame's (rows, columns) shape.
df = pd.read_csv("./train_folds.csv")
df.shape



In [None]:
def train(fold):
    """Train the SE-ResNeXt50 model with ``fold`` held out for validation.

    Rows of ``./train_folds.csv`` whose ``kfold`` equals ``fold`` form the
    validation set; all other rows are used for training. After each epoch the
    validation ROC AUC is printed, fed to the LR scheduler and to early
    stopping, which is given ``model_fold_{fold}.bin`` as the model path.

    Args:
        fold: value of the ``kfold`` column to hold out.
    """
    training_data_path = "../input/siic-isic-224x224-images/train"
    df = pd.read_csv("./train_folds.csv")
    device = "cuda"
    epochs = 50
    train_bs = 32
    valid_bs = 16
    mean = (0.485, 0.456, 0.406)
    std = (0.229, 0.224, 0.225)

    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    train_aug = albumentations.Compose(
        [
            albumentations.Normalize(mean, std, max_pixel_value=255.0, always_apply=True),
        ]
    )

    valid_aug = albumentations.Compose(
        [
            albumentations.Normalize(mean, std, max_pixel_value=255.0, always_apply=True),
        ]
    )

    train_images = df_train.image_name.values.tolist()
    train_images = [os.path.join(training_data_path, i + ".png") for i in train_images]
    train_targets = df_train.target.values

    valid_images = df_valid.image_name.values.tolist()
    valid_images = [os.path.join(training_data_path, i + ".png") for i in valid_images]
    valid_targets = df_valid.target.values

    train_dataset = ClassificationDataset(
        image_paths=train_images,
        targets=train_targets,
        resize=None,
        augmentations=train_aug
    )

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=train_bs,
        shuffle=True,
        num_workers=4
    )

    valid_dataset = ClassificationDataset(
        image_paths=valid_images,
        targets=valid_targets,
        resize=None,
        augmentations=valid_aug,
    )

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=valid_bs,
        shuffle=False,
        num_workers=4
    )

    model = SEResNext50_32x4d(pretrained="imagenet")
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    # mode="max": the scheduler tracks the validation AUC, which should increase.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        patience=3,
        mode="max"
    )

    # The scheduler is deliberately NOT handed to the engines: Engine.train
    # calls scheduler.step(loss) after every optimizer step, and a falling
    # training loss looks like "no improvement" to a mode="max" scheduler, so
    # the learning rate would be cut every few batches. It is stepped once per
    # epoch with the AUC below instead.
    # The engines are built once so the fp16 gradient scaler keeps its state
    # across epochs.
    train_engine = Engine(model, optimizer, device, train_loader, fp16=True)
    valid_engine = Engine(model, optimizer, device, valid_loader, fp16=True)

    es = EarlyStopping(patience=5, mode="max")
    for epoch in range(epochs):
        train_engine.train()
        _, predictions = valid_engine.evaluate(return_predictions=True)
        predictions = [tensor.detach().cpu().numpy() for tensor in predictions]
        predictions = np.vstack((predictions)).ravel()
        auc = metrics.roc_auc_score(valid_targets, predictions)
        print(f"Epoch = {epoch}, AUC = {auc}")
        scheduler.step(auc)

        es(auc, model, model_path=f"model_fold_{fold}.bin")
        if es.early_stop:
            print("Early stopping")
            break
    

In [None]:
def predict(fold):
    """Predict the test set with the weights stored for ``fold``.

    Loads ``model_fold_{fold}.bin`` into a fresh model and runs it over every
    image listed in the competition ``test.csv``.

    Args:
        fold: fold number selecting which saved model file to load.

    Returns:
        1-D numpy array holding one raw model output (logit, no sigmoid
        applied) per test row, in ``test.csv`` order.
    """
    test_data_path = "../input/siic-isic-224x224-images/test"
    model_path = f"model_fold_{fold}.bin"
    df_test = pd.read_csv("../input/siim-isic-melanoma-classification/test.csv")
    # Placeholder labels: the dataset always yields a "targets" entry.
    df_test.loc[:, "target"] = 0

    device = "cuda"
    test_bs = 16
    mean = (0.485, 0.456, 0.406)
    std = (0.229, 0.224, 0.225)

    test_aug = albumentations.Compose(
        [
            albumentations.Normalize(mean, std, max_pixel_value=255.0, always_apply=True),
        ]
    )

    test_images = df_test.image_name.values.tolist()
    test_images = [os.path.join(test_data_path, i + ".png") for i in test_images]
    test_targets = df_test.target.values

    test_dataset = ClassificationDataset(
        image_paths=test_images,
        targets=test_targets,
        resize=None,
        augmentations=test_aug
    )

    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=test_bs,
        shuffle=False,
        num_workers=4
    )

    model = SEResNext50_32x4d(pretrained="imagenet")
    model.load_state_dict(torch.load(os.path.join(model_path)))
    model.to(device)

    # Engine's positional order is (model, optimizer, device, data_loader).
    # Inference needs no optimizer, so pass None explicitly and use keywords
    # so the device and the loader cannot be shifted into the wrong slots.
    engine = Engine(model, optimizer=None, device=device, data_loader=test_loader)
    predictions = engine.predict()
    predictions = [tensor.detach().cpu().numpy() for tensor in predictions]
    predictions = np.vstack((predictions)).ravel()
    return predictions


In [None]:
# Train the model with fold 0 held out as the validation set.
train(0)

In [None]:
# Train the remaining folds (1 through 4), one after the other.
for fold in (1, 2, 3, 4):
    train(fold)

In [None]:
# Test-set predictions produced with the model file saved for fold 0.
p0 = predict(0)

In [None]:
import torch
# Select the GPU when CUDA is available, otherwise fall back to the CPU, and show the choice.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
# Load the original competition training table (this rebinds `df`) and preview the first rows.
df= pd.read_csv("../input/siim-isic-melanoma-classification/train.csv")
df.head()

In [None]:
# Values of the image_name column of the training table, and how many there are.
images_names = list(df.image_name.values)
print(len(images_names))

In [None]:
# Count the entries in the original JPEG training directory.
image_path_name = os.listdir("../input/siim-isic-melanoma-classification/jpeg/train")
print(len(image_path_name))

In [None]:
image_path_pre = os.listdir("../input/siic-isic-224x224-images/train")
# Show the number of entries and a small sample rather than dumping the whole
# directory listing into the cell output.
print(len(image_path_pre))
print(image_path_pre[:5])

In [None]:
# train.csv stores bare image ids while the resized files are loaded elsewhere
# in this notebook as "<id>.png", so comparing ids against raw directory
# entries can never match. Compare against the file stems instead.
preprocessed_stems = {os.path.splitext(fname)[0] for fname in image_path_pre}
print(pd.Series(images_names).isin(preprocessed_stems))

In [None]:
# Turn every image id of the training table into the path of its resized PNG file.
image_name = df.image_name.values.tolist()
image_name_path = [
    os.path.join("../input/siic-isic-224x224-images/train", stem + ".png")
    for stem in image_name
]

In [None]:
# The original expression evaluated `image_name.replace` on the list itself,
# which raises AttributeError (lists have no `replace`).
# NOTE(review): presumably the goal was to turn the image ids into the on-disk
# file names ("<id>.png") so they can be matched against the directory
# listing -- confirm the intended conversion.
new_names_converted_of_df = [name + ".png" for name in image_name]

In [None]:
# Compare image ids against the stems of the files on disk: the directory
# entries carry a file extension that the ids in train.csv do not have, so a
# direct comparison would report False for every row.
preprocessed_stems = {os.path.splitext(fname)[0] for fname in image_path_pre}
print(pd.Series(images_names).isin(preprocessed_stems))