## Reference

Custom Dataset classes in pytorch
https://pytorch.org/tutorials/beginner/data_loading_tutorial.html

k-Fold validation pytorch
--
1. https://stackoverflow.com/questions/58996242/cross-validation-for-mnist-dataset-with-pytorch-and-sklearn
2. https://discuss.pytorch.org/t/i-need-help-in-this-k-fold-cross-validation-implementation/90705/5
3. https://github.com/buomsoo-kim/PyTorch-learners-tutorial/blob/master/PyTorch%20Basics/pytorch-datasets-2.ipynb


kFold split sklearn
--
1. sklearn.model_selection.KFold -  normal ordered splits without any shuffle by default. 
2. sklearn.model_selection.StratifiedKFold - tries to preserve the distribution of each class in each set
3. GroupFold - ensures the group of data is not repeated in any fold; little complex concept
4. RepeatedKFold - repeat kfold n times with different random state each instance

## Library imports

In [3]:
# common imports
import os
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
#import math
#import time
#from skimage import io, transform
#from typing import Dict
#from pathlib import Path

# interactive plot libraries
import matplotlib.pyplot as plt
import seaborn as sns
#from plotly.offline import init_notebook_mode, iplot # download_plotlyjs, plot
#import plotly.graph_objs as go
#from plotly.subplots import make_subplots
#init_notebook_mode(connected=True)

# torch imports
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from torchvision.models.resnet import resnet50, resnet18, resnet34, resnet101
import torch.nn.functional as F


# sklearn imports
from sklearn.model_selection import KFold, StratifiedKFold

## Config files

In [4]:
cfg = {
    'train_img_path': "cassava-leaf-disease-classification/train_images/",
    'train_csv_path': 'cassava-leaf-disease-classification/train.csv',
    
    'model_params': {
        'model_architecture': 'resnet18', 'model_name': "R18_pretrain_imagenet",
        'lr': 1e-4, 'weight_path': "", 
        'lr_find' : 0, 'train': 1, 'validate': 0,'test': 0 },

    'train_data_loader': { 'batch_size': 16, 'shuffle': True, 'num_workers': 4 },
    
    'val_data_loader': {'batch_size': 16, 'shuffle': True, 'num_workers': 4 },

    'test_data_loader': {'batch_size': 32, 'shuffle': False, 'num_workers': 4 },

    'train_params': {'train_start_batch_index' : 117001, 'max_num_steps': 11, 'checkpoint_every_n_steps': 5 } }

In [6]:
index_label_map = {
                0: "Cassava Bacterial Blight (CBB)", 
                1: "Cassava Brown Streak Disease (CBSD)",
                2: "Cassava Green Mottle (CGM)", 
                3: "Cassava Mosaic Disease (CMD)", 
                4: "Healthy"
                }

## EDA

In [7]:
#train_csv = pd.read_csv('cassava-leaf-disease-classification/train.csv')
#train_csv['disease'] = train_csv['label'].map(index_label_map);
#print(train_csv.shape)
#train_csv.head()

In [8]:
#_, axes = plt.subplots(1, 1, figsize=(16, 8))
#sns.countplot(x='disease', data=train_csv, ax=axes);

In [9]:
#print(train_csv['disease'].value_counts(normalize=True))

## TODO

1. load images into dataset (Dataset class of pytorch maybe)
2. split into 5 fold data - scikit learn
3. simple network -r18, r50 with last layers changed to 5 lables
4. adam optimizer, lr_finder, cross entropy loss
5. cv score

## Helper functions

In [10]:
def find_no_of_trainable_params(model):
    total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    #print(total_trainable_params)
    return total_trainable_params

In [11]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    
set_seed(42)

## Dataset class

In [12]:
class CassavaDataset(Dataset):
    """Cassave leaf disease detection dataset."""

    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.cassava_leaf_disease = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.cassava_leaf_disease)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = os.path.join(self.root_dir,
                                self.cassava_leaf_disease.iloc[idx, 0])
        image = Image.open(img_name)
        if self.transform:
            image = self.transform(image)
        
        label = np.array(self.cassava_leaf_disease.iloc[idx, 1])
        return (image, label)

## Transforms and Dataloader

In [13]:
train_transform = transforms.Compose([transforms.Resize(255),
                                transforms.CenterCrop(224),
                                transforms.ToTensor()])

cassava_train_dataset = CassavaDataset( csv_file=cfg['train_csv_path'],
                                        root_dir=cfg['train_img_path'],
                                        transform=train_transform)

trainloader = DataLoader(cassava_train_dataset,
                         batch_size=cfg['train_data_loader']['batch_size'], 
                         shuffle=cfg['train_data_loader']['shuffle'])

## Pretrained model

In [14]:
model = models.resnet34(pretrained=True)
print('1 : ', find_no_of_trainable_params(model))
# Freeze parameters so we don't backprop through them
for param in model.parameters():
    param.requires_grad = False
print('2 : ', find_no_of_trainable_params(model))
fc_layer = nn.Sequential(nn.Linear(in_features=512, out_features=128), nn.ReLU(),
                         nn.Linear(in_features=128, out_features=5))
                        #nn.LogSoftmax(dim=1))
model.fc = fc_layer
print('3 : ', find_no_of_trainable_params(model))
#print(model)

1 :  21797672
2 :  0
3 :  66309


## Device, loss fn, optimizer

In [15]:
# Use GPU if it's available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.to(device);

# loss function
criterion = nn.CrossEntropyLoss()

# Only train the classifier parameters, feature parameters are frozen
optimizer = optim.Adam(model.fc.parameters(), lr=0.001)

## Training loop

In [19]:
splits = KFold(n_splits = 5, shuffle = True, random_state = 42)

In [20]:
for fold, (train_idx, valid_idx) in enumerate(splits.split(CassavaDataset)):
    print('Fold : {}'.format(fold))
    print(train_idx.shape)
    print(valid_idx.shape)

TypeError: Expected sequence or array-like, got <class 'type'>

In [None]:


.............
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)
    train_loader = torch.utils.data.DataLoader(
                      WrapperDataset(total_set,  transform=transforms['train']), 
                      batch_size=64, sampler=train_sampler)
    valid_loader = torch.utils.data.DataLoader(
                      WrapperDataset(total_set, transform = transforms['valid']),
                      batch_size=64, sampler=valid_sampler)
    model.load_state_dict(model_wts)