In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm, trange

import pytorch_lightning as pl
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Root folder of the competition data; train.csv, test.csv and images/ are read from here.
DATA_DIR = '/kaggle/input/plant-pathology-2020-fgvc7'
# Seed for the reproducible shuffle used when building the cross-validation folds.
SEED = 1279

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Initial Data Analysis

In [None]:
!pip install ImageHash

In [None]:
# Load the label tables; 'multiple_diseases' is shortened to 'multi' and the known
# repeated/mislabeled training samples are removed.
to_drop = ['Train_379', 'Train_782', 'Train_1661'] # dropping repeated/mislabeled samples
train_df = pd.read_csv(f'{DATA_DIR}/train.csv').rename(columns={'multiple_diseases': 'multi'})
train_df = train_df.loc[~train_df['image_id'].isin(to_drop)]
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

for frame in (train_df, test_df):
    print(frame.head())

# Inspect what a single row looks like: its type and the names it is indexed by.
first_row = train_df.iloc[0]
print(type(first_row))
for iterable in first_row.index:
    print(iterable)

In [None]:
# image display stuff

def disp(img_row):
    """Show one image in its own figure, titled with every field of its row.

    img_row must support ``img_row['image_id']`` and expose ``.index`` (e.g. a
    DataFrame row). The image is read from ``{DATA_DIR}/images/<image_id>.jpg`` and
    the title lists each ``name=value`` pair of the row, comma separated.
    """
    caption_parts = []
    for lbl in img_row.index:
        caption_parts.append(f'{lbl}={img_row[lbl]}')

    image_path = f"{DATA_DIR}/images/{img_row['image_id']}.jpg"
    img = Image.open(image_path)

    plt.figure()
    plt.axis('off')
    plt.title(', '.join(caption_parts))
    plt.imshow(img)

In [None]:
# Show up to 10 distinct rust-positive leaves. Sampling is seeded and without replacement,
# so the same images appear on every run and no image is shown twice.
rusty_leaves = train_df[train_df['rust'] > 0]
idxs = np.random.default_rng(SEED).choice(
    len(rusty_leaves), size=min(10, len(rusty_leaves)), replace=False)
for idx in idxs:
    disp(rusty_leaves.iloc[idx])

In [None]:
# Show up to 10 distinct scab-positive leaves. Sampling is seeded and without replacement,
# so the same images appear on every run and no image is shown twice.
scabby = train_df[train_df['scab'] > 0]
idxs = np.random.default_rng(SEED).choice(
    len(scabby), size=min(10, len(scabby)), replace=False)
for idx in idxs:
    disp(scabby.iloc[idx])

In [None]:
# Show up to 10 distinct healthy leaves. Sampling is seeded and without replacement,
# so the same images appear on every run and no image is shown twice.
helth = train_df[train_df['healthy'] > 0]
idxs = np.random.default_rng(SEED).choice(
    len(helth), size=min(10, len(helth)), replace=False)
for idx in idxs:
    disp(helth.iloc[idx])

In [None]:
# Show up to 10 distinct test images. Sampling is seeded and without replacement,
# so the same images appear on every run and no image is shown twice.
idxs = np.random.default_rng(SEED).choice(
    len(test_df), size=min(10, len(test_df)), replace=False)
for i in idxs:
    disp(test_df.iloc[i])

In [None]:
# looking for similar images (https://github.com/JohannesBuchner/imagehash/blob/master/find_similar_images.py)
import imagehash
def find_similar_images(userpaths, hashfunc=imagehash.phash):
    """Find images whose hash collides with that of an earlier image.

    Args:
        userpaths: iterable of directory paths; each directory is listed
            non-recursively and its image files are hashed.
        hashfunc: callable that maps an opened PIL image to a hashable value.
            Defaults to ``imagehash.phash``.

    Returns:
        A flat list of file paths. Every time a file's hash matches files seen
        earlier (in sorted path order), that file's path is appended, followed by
        the paths of all earlier files with the same hash. With exactly two files
        per hash the list reads as consecutive (duplicate, original) pairs; three
        or more files sharing a hash produce longer runs.

    Files that cannot be opened or hashed are reported on stdout and skipped.
    """
    def is_image(filename):
        f = filename.lower()
        # `'.jpg' in f` also accepts names that merely contain '.jpg'
        # (behaviour kept from the upstream find_similar_images.py script).
        return f.endswith((".png", ".jpg", ".jpeg", ".bmp", ".gif", ".svg")) or '.jpg' in f

    image_filenames = []
    for userpath in userpaths:
        image_filenames += [os.path.join(userpath, path) for path in os.listdir(userpath) if is_image(path)]
    images = {}  # hash -> list of file paths that produced it
    problematic = []
    print("filenames obtained")
    for img in tqdm(sorted(image_filenames)):
        try:
            # The context manager closes the file handle once the hash is computed,
            # so hashing a large folder does not accumulate open files.
            with Image.open(img) as opened:
                img_hash = hashfunc(opened)
        except Exception as e:
            # Best effort: report the unreadable file and keep scanning.
            print('Problem:', e, 'with', img)
            continue
        if img_hash in images:
            print(img, '  already exists as', ' '.join(images[img_hash]))
            problematic += [img, *images[img_hash]]
        images.setdefault(img_hash, []).append(img)
    return problematic
    
# Hash every file in the competition's images/ folder and collect the colliding paths.
image_dirs = [f'{DATA_DIR}/images/']
problematic = find_similar_images(image_dirs)

In [None]:
# Pair up consecutive entries of `problematic` as (first, second), keeping only the file name
# after the last '/'. zip() stops at the shorter slice, so an odd-length list (possible when
# three or more images share a hash) no longer raises IndexError.
if len(problematic) % 2:
    print('Warning: odd number of entries in problematic; the pairs below may be misaligned')
pairs = [
    (first.rsplit('/', 1)[-1], second.rsplit('/', 1)[-1])
    for first, second in zip(problematic[::2], problematic[1::2])
]
print(pairs[-3:])

In [None]:
# Display the suspected duplicate images side by side.
# train_df has already had Train_379 / Train_782 / Train_1661 removed, so positional
# .iloc lookups on it are shifted and those rows are gone. Reload the untouched CSV and
# look each image up by its image_id instead.
idxs = [379, 1173, 782, 592, 815, 1661]
raw_train_df = (
    pd.read_csv(f'{DATA_DIR}/train.csv')
    .rename(columns={'multiple_diseases': 'multi'})
    .set_index('image_id', drop=False)  # keep image_id as a column: disp() reads it
)
for i in idxs:
    disp(raw_train_df.loc[f'Train_{i}'])

# seems like we should drop Train_379, Train_782, and Train_1661

In [None]:
# Print the column total for each label column.
for col in ('healthy', 'multi', 'rust', 'scab'):
    total = train_df[col].sum()
    print(f"{col}, {total}")

# data distribution: fairly even, except that 'multi' has fewer samples

## Model Training
Note: This section is designed to be runnable even if the data analysis section above has not been run, so some code is repeated.

### Data Setup

In [None]:
from torch.utils.data.dataset import Dataset, Subset
from torch.utils.data import DataLoader

In [None]:
def get_train_data():
    """Return ``(files, labels)`` for the training set.

    Reads train.csv under DATA_DIR, removes the known repeated/mislabeled samples,
    and returns the list of image paths together with an array holding, for each
    remaining row, the argmax over all columns after the first one.
    """
    excluded_ids = ['Train_379', 'Train_782', 'Train_1661'] # dropping repeated/mislabeled samples
    frame = pd.read_csv(f'{DATA_DIR}/train.csv').rename(columns={'multiple_diseases': 'multi'})
    frame = frame[~frame.image_id.isin(excluded_ids)]

    # Every column after the first is treated as a label column.
    label_matrix = frame.iloc[:, 1:].to_numpy()
    labels = label_matrix.argmax(axis=1)

    files = []
    for fname in frame['image_id']:
        files.append(f"{DATA_DIR}/images/{fname}.jpg")
    return files, labels

In [None]:
def get_test_data():
    """Return the image file paths listed in test.csv, in file order."""
    image_ids = pd.read_csv(f'{DATA_DIR}/test.csv')['image_id']
    files = []
    for fname in image_ids:
        files.append(f"{DATA_DIR}/images/{fname}.jpg")
    return files

In [None]:
def get_transforms(train=True, size=224):
    """Build the image preprocessing pipeline used by the datasets in this notebook.

    Args:
        train: whether the pipeline is for training data. Training and evaluation
            currently share the same deterministic pipeline; the flag is accepted so
            train-only augmentation can be added later without changing callers.
        size: edge length in pixels of the square image produced by the resize step.

    Returns:
        A ``torchvision.transforms.Compose`` that resizes to ``(size, size)`` and
        converts the image to a tensor.
    """
    # Local import: this cell runs before the notebook's torch/torchvision import cell.
    import torchvision

    return torchvision.transforms.Compose([
        torchvision.transforms.Resize((size, size)),
        torchvision.transforms.ToTensor(),
    ])

In [None]:
class AppleDataset(Dataset):
    """Image dataset over the competition's train or test split.

    Args:
        is_train: True loads paths and labels via get_train_data(); False loads
            paths only via get_test_data().
        transforms: callable applied to each RGB PIL image before it is returned.

    Each item is an ``(img, lbl)`` tuple. For the training split ``lbl`` is the
    entry of the label array for that sample; for the test split it is
    ``AppleDataset.NO_LABEL``.
    """

    # Placeholder label for unlabeled (test) samples. An integer is used instead of
    # None because DataLoader's default collate function cannot batch None values.
    NO_LABEL = -1

    def __init__(self, is_train, transforms):
        self.transforms = transforms
        self.is_train = is_train
        if is_train:
            self.filepaths, self.labels = get_train_data()
        else:
            self.filepaths = get_test_data()

    def __len__(self):
        return len(self.filepaths)

    def __getitem__(self, i):
        # convert() returns a new, fully loaded image, so the file handle can be
        # closed right away instead of being left open for every sample.
        with Image.open(self.filepaths[i]) as raw:
            img = raw.convert('RGB')
        img = self.transforms(img)
        lbl = self.labels[i] if self.is_train else self.NO_LABEL
        return (img, lbl)

In [None]:
# Lightning folds !
class AppleKFoldDataModule(pl.LightningDataModule):
    """Data module that serves the training set as k cross-validation folds.

    Typical call order: ``setup()`` -> ``setup_folds(k)`` -> ``setup_fold_index(i)``
    -> ``train_dataloader()`` / ``val_dataloader()``.

    Args:
        batch_size: batch size used by every dataloader.
        image_size: edge length in pixels of the square images fed to the model.
    """

    def __init__(self, batch_size=32, image_size=224):
        super().__init__()
        self.batch_size = batch_size
        self.image_size = image_size

    def setup(self, stage=None):
        """Create the full train and test datasets with a resize + to-tensor pipeline."""
        transforms = torchvision.transforms.Compose([
            torchvision.transforms.Resize((self.image_size, self.image_size)),
            torchvision.transforms.ToTensor()
        ])
        self.train_dset = AppleDataset(True, transforms)
        self.test_dset = AppleDataset(False, transforms)

    def setup_folds(self, k_folds):
        """Shuffle the training indices (seeded with SEED) and split them into k_folds parts."""
        if k_folds < 2:
            raise ValueError(f"k_folds must be at least 2, got {k_folds}")
        rng = np.random.default_rng(seed=SEED)
        shuffled_idx = rng.permutation(len(self.train_dset))

        self.splits = np.array_split(shuffled_idx, k_folds)

    def setup_fold_index(self, i):
        """Select fold i as validation data and all remaining folds as training data."""
        # self.splits is a list of index arrays: join the two list slices first, then
        # concatenate the arrays. Concatenating the slices themselves fails because each
        # slice is a list of arrays, not an array.
        train_idx = np.concatenate(self.splits[:i] + self.splits[i+1:])
        val_idx = self.splits[i]
        self.train_fold = Subset(self.train_dset, train_idx)
        self.val_fold = Subset(self.train_dset, val_idx)

    def train_dataloader(self):
        return DataLoader(self.train_fold, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        # Evaluation does not benefit from shuffling.
        return DataLoader(self.val_fold, batch_size=self.batch_size, shuffle=False)

    def test_dataloader(self):
        # Keep file order so predictions line up with the rows of test.csv.
        return DataLoader(self.test_dset, batch_size=self.batch_size, shuffle=False)

    def __post_init__(self):
        # Dataclass-style hook kept for backward compatibility. This class is not a
        # dataclass, so nothing calls it automatically; __init__ does the real setup.
        super().__init__()

### Model(s) Setup

In [None]:
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F

In [None]:
def initialize_base_model(model_name, pretrain=True):
    """Create the backbone network for the given model name.

    Args:
        model_name: identifier of the backbone; only 'resnet-50' is supported.
        pretrain: passed to torchvision as ``pretrained`` to request pretrained weights.

    Returns:
        ``(model, inp_size, out_size)``: the torchvision model, the image edge length
        it is used with (224) and the size of its output vector (1000).

    Raises:
        ValueError: if model_name is not a supported backbone.
    """
    if model_name == 'resnet-50':
        # NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
        # `weights=` - confirm against the installed version before switching.
        model = torchvision.models.resnet50(pretrained=pretrain)
        inp_size = 224
        out_size = 1000
    else:
        # Previously an unknown name fell through to the return and raised UnboundLocalError.
        raise ValueError(f"Unsupported base model {model_name!r}; supported: 'resnet-50'")

    return model, inp_size, out_size

In [None]:
class AppleModel(pl.LightningModule): #Lightning-fied
    """Backbone network followed by a linear classification head.

    Args:
        base_model_name: backbone identifier passed to initialize_base_model.
        out_classes: number of classes predicted by the head.

    The model outputs per-class log-probabilities (apply ``.exp()`` to get
    probabilities).
    """

    def __init__(self, base_model_name='resnet-50', out_classes=4):
        super().__init__()
        self.ft_extractor, self.inp_size, out_size = initialize_base_model(base_model_name, pretrain=True)
        # LogSoftmax rather than Softmax: F.nll_loss expects log-probabilities, and feeding
        # it plain probabilities does not give the negative log-likelihood.
        self.clfr = nn.Sequential(nn.Linear(out_size, out_classes), nn.LogSoftmax(dim=1))

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images."""
        return self.clfr(self.ft_extractor(x))

    def training_step(self, batch, batch_idx):
        """Compute the negative log-likelihood loss for one (images, labels) batch."""
        x, y = batch
        out = self(x)
        loss = F.nll_loss(out, y)
        return loss

    def configure_optimizers(self):
        """Optimize all parameters (backbone and head) with Adam at lr=1e-3."""
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer
        

### Dataset and Dataloader setup

In [None]:
# Lightning Train Loop
# Replaces the trainer's default fit loop with a 5-fold KFoldLoop and hands the default
# loop to it via connect(), then fits the model on the k-fold data module.
# NOTE(review): KFoldLoop is not defined or imported in the cells shown here - confirm
# where it comes from before running this cell.

model = AppleModel()
datamodule = AppleKFoldDataModule()
trainer = pl.Trainer(
    limit_train_batches=32,
    max_epochs=2)
# Keep a handle on the default loop before it is replaced on the next line.
internal_fit_loop = trainer.fit_loop
trainer.fit_loop = KFoldLoop(5, export_path="./")
trainer.fit_loop.connect(internal_fit_loop)
trainer.fit(model, datamodule)

In [None]:
# Plain (non k-fold) run: fit a fresh AppleModel on the whole training set for two epochs.
model = AppleModel()
resize_then_tensor = [
    torchvision.transforms.Resize((224, 224)), # change this to some variable later!
    torchvision.transforms.ToTensor(),
]
transforms = torchvision.transforms.Compose(resize_then_tensor)
# Note: despite its name, `datamodule` here is a plain DataLoader over the training set.
datamodule = DataLoader(AppleDataset(True, transforms), batch_size=32, shuffle=True)
device_kind = "gpu" if torch.cuda.is_available() else "cpu"
trainer = pl.Trainer(max_epochs=2, accelerator=device_kind)
trainer.fit(model, datamodule)

In [None]:
# Manual fold loop: trains one fresh model per fold and collects the trained models.
# range(1) means only fold 0 is run.
# NOTE(review): image_folds, label_folds and run_train_loop are not defined in the cells
# shown here - confirm they exist before running this cell.
models = []
for k in range(1):
    curr_model = AppleModel()
    # presumably image_folds / label_folds are lists of per-fold arrays, so `+` joins
    # every fold except fold k - verify against where they are built
    train_files = np.concatenate(image_folds[:k] + image_folds[k+1:])
    train_lbl = np.concatenate(label_folds[:k] + label_folds[k+1:])
    # fold k is passed to run_train_loop as the validation data
    val_files = image_folds[k]
    val_lbl = label_folds[k]
    run_train_loop(curr_model, train_files, train_lbl, val_files, val_lbl)
    models.append(curr_model)