In [59]:
!pip install split-folders

Collecting split-folders
  Downloading split_folders-0.5.1-py3-none-any.whl (8.4 kB)
Installing collected packages: split-folders
Successfully installed split-folders-0.5.1


In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time
import copy

import torch
import torchvision.models as models
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.datasets as ds
from torchvision import models, transforms, utils, datasets
from torch.utils import data
from torchsummary import summary
from torch.optim import lr_scheduler

# Data Read-In

In [2]:
fish_df = pd.read_csv("data/fish_df.csv")

In [3]:
is_fish_df = pd.read_csv("data/is_fish.csv").iloc[:,1:]
is_fish_df["local_paths"] = is_fish_df["Species"].astype(str) + "/" + is_fish_df["Filename"]
path_set = set(is_fish_df["local_paths"])

In [4]:
is_fish_df["Species"].unique()

array([ 972, 6652, 5689, ..., 1266, 3255, 1776])

In [7]:
# We normalize to imagenet mean for the data (https://stackoverflow.com/questions/58151507/why-pytorch-officially-use-mean-0-485-0-456-0-406-and-std-0-229-0-224-0-2)
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]),
    'val': transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])]),
    'test': transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])])}

data_dir = "/media/shivaram/SharedVolum/Projects/FishID/scraped_images/scientific_split/"
image_datasets = {x: datasets.ImageFolder(os.path.join(data_dir, x),
                                          data_transforms[x])
                  for x in ['train', 'val', 'test']}

In [12]:
batch_size = 64
epoch_samples = 2560# len(samples_weight)

In [13]:
weighted_samplers = {}
for subset in ["train", "val", "test"]:
    target = image_datasets[subset].targets
    
    if subset == "train":
        class_sample_count =np.array([ len(np.where(target == t)[0]) for t in np.unique(target)])
        weight = 1. / class_sample_count
        samples_weight = np.array([weight[t] for t in target])
        samples_weight = torch.from_numpy(samples_weight)
        epoch_samples = len(samples_weight)
        sampler = data.WeightedRandomSampler(samples_weight, epoch_samples)
        weighted_samplers[subset] = sampler
    else:
        sampler = data.RandomSampler(image_datasets[subset])
        weighted_samplers[subset] = sampler
    


dataloaders = {x: torch.utils.data.DataLoader(image_datasets[x], sampler = weighted_samplers[x], 
                                              batch_size=batch_size, num_workers=4)
              for x in ['train', 'val', 'test']}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val', 'test']}
class_names = image_datasets['train'].classes

19748
41736


# Create Custom Dataset and Load Data

In [None]:
files_path = "/media/shivaram/SharedVolum/Projects/FishID/scraped_images/scientific_split/"

In [None]:
def is_valid_file(image_path, base_path = files_path, path_set = path_set):
    local_path = image_path.split("/", 7)[7]
    return (local_path in path_set)

In [None]:
import torch
from torchvision import transforms, datasets

In [None]:
data_transform = transforms.Compose([
        transforms.Resize([224]),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

### Generate Train-Test-Val

In [None]:
from collections import Counter
counts = dict(Counter(species_dataset.targets))

In [None]:
species_dataset = datasets.ImageFolder(root=files_path,
                                           transform=data_transform)

In [None]:
len(species_dataset)

In [None]:
train_size = int(.7*len(species_dataset))
val_size = int(.1*len(species_dataset))
test_size = len(species_dataset) - train_size - val_size

https://discuss.pytorch.org/t/dataloader-using-subsetrandomsampler-and-weightedrandomsampler-at-the-same-time/29907/2

In [None]:
# Generate the indices of the elements in the training, test, and validation samples
train_proportion = .7
val_proportion = .1
test_proportion = .2

class_targets = species_dataset.targets #list that gives the class label for each sample
target_count = {t:len(np.where(class_targets == t)[0]) for t in np.unique(class_targets)} #dictionary with frequency count of each class

current_start = 0
train_indices = []
val_indices = []
test_indices = []

# Iterate through each class and add indices to corresponding set for train, val, test
for class_label, class_count in target_count.items():
    
    curr_ind = current_start
    
    train_indices.extend(np.arange(curr_ind, curr_ind + int(train_proportion*class_count)))
    curr_ind += int(train_proportion*class_count)
    val_indices.extend(np.arange(curr_ind, curr_ind + int(val_proportion*class_count)))
    curr_ind += int(val_proportion*class_count)
    test_indices.extend(np.arange(curr_ind, current_start + class_count))
    current_start += class_count

train_indices = torch.tensor(train_indices).int()
val_indices = torch.tensor(val_indices).int()
test_indices = torch.tensor(test_indices).int()
indices = {"train": train_indices, "val": val_indices, "test": test_indices}

In [55]:
# Generate Sample Weights
class_sample_count =np.array([ len(np.where(class_targets == t)[0]) for t in np.unique(class_targets)])
weight = 1. / class_sample_count
samples_weight = np.array([weight[t] for t in class_targets])
samples_weight = torch.from_numpy(samples_weight)
sampler = data.WeightedRandomSampler(samples_weight, len(samples_weight))

In [None]:
Either have to create custom Dataset class that loads in or create new directory with symlinks for train_val_test

In [58]:
image_datasets = {}
weighted_samplers = {}

# Create WeightedRandomSampler, DataLoader, and  Dataset for each subset
for phase in ["train", "val", "test"]:
    
    Image
    image_datasets[phase] = data.TensorDataset(species_dataset[indices[phase]], class_targets[indices[phase]])
    
    
    weighted_samplers[phase] = sampler #MAYBE MAKE ONE FOR EACH SET???????????????????????
    
    

TypeError: only integer tensors of a single element can be converted to an index

In [52]:
species_dataset.

[('/media/shivaram/SharedVolum/Projects/FishID/scraped_images/scientific/10/imagesqtbnand9gcq-8zyssq4vqejlroit88riqipqkdid6g9wmwusqpcau.jpg',
  0),
 ('/media/shivaram/SharedVolum/Projects/FishID/scraped_images/scientific/10/imagesqtbnand9gcq0xy8cgnubaci08wnzg03hsyzagjunnvxgmwusqpcau.jpg',
  0),
 ('/media/shivaram/SharedVolum/Projects/FishID/scraped_images/scientific/10/imagesqtbnand9gcq0yipz6csqd_4bnnbz9cebnqagp8ba5y1gxwusqpcau.jpg',
  0),
 ('/media/shivaram/SharedVolum/Projects/FishID/scraped_images/scientific/10/imagesqtbnand9gcq2diuc7cv4r8spzajzomcahikvzujzcqrftausqpcau.jpg',
  0),
 ('/media/shivaram/SharedVolum/Projects/FishID/scraped_images/scientific/10/imagesqtbnand9gcq2jy087321zfxzu6sqj_yzn1er_8ggxensyausqpcau.jpg',
  0),
 ('/media/shivaram/SharedVolum/Projects/FishID/scraped_images/scientific/10/imagesqtbnand9gcq3242ab1bdcufkjf1tjzsoyeiczu7thh2i2qusqpcau.jpg',
  0),
 ('/media/shivaram/SharedVolum/Projects/FishID/scraped_images/scientific/10/imagesqtbnand9gcq3rpekj2h7denib8btbp

In [33]:
# Create dummy data with class imbalance 99 to 1
numDataPoints = 1000
data_dim = 5
bs = 100
a_data = torch.randn(numDataPoints, data_dim)
target = torch.cat((torch.zeros(int(numDataPoints * 0.99), dtype=torch.long),
                    torch.ones(int(numDataPoints * 0.01), dtype=torch.long)))

print('target train 0/1: {}/{}'.format(
    (target == 0).sum(), (target == 1).sum()))

# Create subset indices
subset_idx = torch.cat((torch.arange(100), torch.arange(-5, 0)))
print(subset_idx)

# Compute samples weight (each sample should get its own weight)
class_sample_count = torch.tensor(
    [(target[subset_idx] == t).sum() for t in torch.unique(target, sorted=True)])
weight = 1. / class_sample_count.float()
samples_weight = torch.tensor([weight[t] for t in target[subset_idx]])

# Create sampler, dataset, loader
sampler = data.WeightedRandomSampler(samples_weight, len(samples_weight))
train_dataset = data.TensorDataset(
    a_data[subset_idx], target[subset_idx])
train_loader = data.DataLoader(
    train_dataset, batch_size=bs, num_workers=1, sampler=sampler)

# Iterate DataLoader and check class balance for each batch
for i, (x, y) in enumerate(train_loader):
    print("batch index {}, 0/1: {}/{}".format(
        i, (y == 0).sum(), (y == 1).sum()))

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
.7 of each class in order 
next .2 of each class
remaining .1  of each class

In [1]:
image_datasets = data.random_split(species_dataset, [train_size, val_size, test_size], generator = torch.Generator().manual_seed(42))

NameError: name 'data' is not defined

In [37]:

sep = ["train", "val", "test"]
image_datasets = {sep[i]:image_datasets[i] for i in range(len(sep))}

3


In [41]:
image_datasets["train"].dataset

Dataset ImageFolder
    Number of datapoints: 203076
    Root location: /media/shivaram/SharedVolum/Projects/FishID/scraped_images/scientific/
    StandardTransform
Transform: Compose(
               Resize(size=[224], interpolation=bilinear, max_size=None, antialias=None)
               RandomHorizontalFlip(p=0.5)
               ToTensor()
               Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
           )

In [31]:
weighted_samplers = {}
for subset in ["train", "val", "test"]:
    target = image_datasets[subset].targets
    class_sample_count =np.array([ len(np.where(target == t)[0]) for t in np.unique(target)])
    weight = 1. / class_sample_count
    samples_weight = np.array([weight[t] for t in target])
    samples_weight = torch.from_numpy(samples_weight)
    epoch_samples = len(samples_weight)

    sampler = data.WeightedRandomSampler(samples_weight, epoch_samples)
    weighted_samplers[subset] = sampler

NameError: name 'image_datasets' is not defined

# Build Model

In [83]:
# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Using device: cuda

NVIDIA GeForce RTX 3060 Ti
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [86]:
model_ft = models.resnet18(pretrained=True)
num_ftrs = model_ft.fc.in_features
# Here the size of each output sample is set to 2.
# Alternatively, it can be generalized to nn.Linear(num_ftrs, len(class_names)).
model_ft.fc = nn.Linear(num_ftrs, len(species_dataset.classes))

model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()

# Observe that all parameters are being optimized
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

# Decay LR by a factor of 0.1 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)
#exp_lr_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer_ft, factor = .2, patience = 3)|

## Train Model

# Todo
- Train Standard Network
- Adjust sample weighting
- Train Siamese Network or GAN
- Refactor code
- [Don't weighted random sample for validation and test](https://discuss.pytorch.org/t/dataloader-using-subsetrandomsampler-and-weightedrandomsampler-at-the-same-time/29907/7)