# Kelp Competition

## Data preparation

In [43]:
%reset -f
import gc
gc.collect()

0

In [44]:
import os
import shutil
from glob import glob
import re
import numpy as np
import pandas as pd
from random import shuffle, seed
# CV tools
import tifffile as tiff
#Visulization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from typing import Tuple
from pathlib import Path
# Pytorch
import torch
import torch.nn as nn
from torchvision import transforms, models
import torchvision.transforms.functional as TF
from torch.utils.data import Dataset, DataLoader, random_split
import torch.optim as optim
# from torchsummary import summary
from tqdm import *
import matplotlib.ticker as mtick
import matplotlib.ticker as mticker


In [45]:
# Mac OS
# device = "mps" if torch.backends.mps.is_available else "cpu"
# print(f"device is {device}")
# # Windows
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"device is {device}")

device is cuda


### Set working directory

In [46]:
current_dir = os.getcwd()
# Define your data and label folder paths
data_dir = os.path.join(current_dir, 'data')
test_dir = os.path.join(data_dir, 'test')
train_dir = os.path.join(data_dir, 'train')
label_dir = os.path.join(data_dir, 'label')

temp_dir = [data_dir,train_dir,test_dir,label_dir]

for dir in temp_dir:
    if not os.path.exists(dir):
        os.makedirs(dir)

# for dirpath, dirnames, filenames in os.walk(data_dir):
#     for filename in filenames:
#         print(os.path.join(dirpath, filename))

In [47]:
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, 'data')

training_data_folder = os.path.join(data_dir, 'train')
label_data_folder = os.path.join(data_dir, 'label')

# Path to the mini_train folder
mini_train_folder = os.path.join(data_dir, 'mini_train')
mini_label_folder = os.path.join(data_dir, 'mini_label')

def create_mini_dataset(source_folder, destination_folder, num_files=40):
    """
    Create a mini dataset by copying a specified number of files from the source folder to the destination folder.

    Args:
    source_folder (str): Path to the source folder containing the files to be copied.
    destination_folder (str): Path to the destination folder where the files will be copied.
    num_files (int): Number of files to copy. Default is 40.
    """
    if not os.path.exists(destination_folder):
        os.makedirs(destination_folder)

    # List the files in the source folder
    files = sorted(os.listdir(source_folder))

    # Take the first 'num_files' files
    selected_files = files[:num_files]

    # Copy the selected files to the destination folder
    for file in selected_files:
        source_path = os.path.join(source_folder, file)
        destination_path = os.path.join(destination_folder, file)
        shutil.copyfile(source_path, destination_path)

    print(f"Mini dataset folder created with the first {num_files} files.")

create_mini_dataset(training_data_folder, mini_train_folder, 40)
create_mini_dataset(label_data_folder, mini_label_folder, 40)


Mini dataset folder created with the first 40 files.
Mini dataset folder created with the first 40 files.


In [48]:
pattern_train = mini_train_folder + "/*.tif"
train_list = glob(pattern_train)

pattern_label = mini_label_folder + "/*.tif"
label_list = glob(pattern_label)

# pattern_test = test_dir + "/*.tif"
# test_list = glob(pattern_test)

In [49]:
type(label_list)

list

### Understand the dataset

In [50]:
# image = tiff.imread('/Users/carriexia/Desktop/spring_2023/computer_vision/kelp_competition/data/train/AA498489_satellite.tif')

# # Get the shape of the image
# shape = image.shape

# print("Image shape:", shape)

In [51]:
# image = tiff.imread('/Users/carriexia/Desktop/spring_2023/computer_vision/kelp_competition/data/label/AA498489_kelp.tif')

# print("label image shape:", image.shape)
# image_array = np.array(image)

# # Find the unique values in the image
# unique_values = np.unique(image_array)
# print(unique_values)


In [52]:
# def tensor_transform(list):
#     for image in list:
#         with tiff.tifffile(image) as temp:
#             temp.asarray




# #  randomly print the map in training list
# def show_image():
#     n = np.random.randint(1,4509)
#     with tiff.TiffFile(sorted(train_list)[n]) as train_temp:
#         train = train_temp.asarray()
#     with tiff.TiffFile(sorted(label_list)[n]) as label_temp:
#         label = label_temp.asarray()

#     n_channels = train.shape[2]

#     plt.figure(figsize=(20, 10))  # Adjust the size as needed

#     # Add the single image as the first subplot
#     plt.subplot(2, 4, 1)  # Consider it's placed in a 2x4 grid
#     plt.imshow(label)  # Assuming it's grayscale; adjust as needed
#     plt.title('Kelp Image Label')
#     plt.axis('off')

#     # Loop through each channel of the satellite image and add as subsequent subplots
#     for i in range(n_channels):
#         plt.subplot(2, 4, i + 2)  # Offset by 2 to account for the first kelp image
#         plt.imshow(train[:, :, i])  # Display each channel in grayscale
#         plt.title(f'Channel {i+1}')
#         plt.axis('off')

#     plt.tight_layout()
#     plt.show()
    
    
# show_image()

## Custom Dataset class

In [53]:
def count_files_in_folder(folder):
    # Get list of filenames in the folder
    files = os.listdir(folder)
    
    # Count the number of files
    num_files = len(files)
    
    return num_files

# Example usage:
num_files_in_folder1 = count_files_in_folder(mini_train_folder)
print(f"Number of files in training folder: {num_files_in_folder1}")
# Example usage:
num_files_in_folder1 = count_files_in_folder(mini_label_folder)
print(f"Number of files in label folder: {num_files_in_folder1}")

Number of files in training folder: 40
Number of files in label folder: 40


In [54]:
def compare_folders(folder1, folder2):
    # Get list of filenames in each folder
    files1 = sorted(os.listdir(folder1))
    files2 = sorted(os.listdir(folder2))
    
    for file1, file2 in zip(files1, files2):
        # Split filenames by underscore and compare the first part
        name1 = file1.split('_')[0]
        name2 = file2.split('_')[0]
        
        if name1 != name2:
            print(f"Files '{file1}' and '{file2}' have different prefixes.")
            return False
    
    print("All filenames in both folders have the same prefixes.")
    return True


compare_folders(mini_train_folder, mini_label_folder)

All filenames in both folders have the same prefixes.


True

In [55]:
from sklearn.preprocessing import MinMaxScaler

class auto_dataset():
    def __init__(self, folder_train: Path, folder_target: Path, split_type="train", val_size=0.2) -> None:
        self.split_type = split_type
        # Populate file paths
        self.fpaths = glob(folder_train + "/*")
        self.tpaths = glob(folder_target + "/*")
    
        # Calculate split index based on val_size
        split_index = int(len(self.fpaths) * (1 - val_size))
    
        # Create indices for train and val sets
        indices = list(range(len(self.fpaths)))
        if split_type == "train":
            self.train_indices = indices[:split_index]
        elif split_type == "val":
            self.val_indices = indices[split_index:]
        elif split_type == "all":
            self.train_indices = indices
            self.val_indices = list(range(len(self.tpaths)))
        
        # Initialize MinMaxScaler
        self.scaler = MinMaxScaler()

    def __len__(self) -> int:
        if self.split_type == "train":
            return len(self.train_indices)
        elif self.split_type == "val":
            return len(self.val_indices)
        elif self.split_type == "all":
            return len(self.train_indices) + len(self.val_indices)
    
    def __getitem__(self, index) -> Tuple[torch.Tensor]:
        with tiff.TiffFile(self.fpaths[index]) as train_temp:
            train = train_temp.asarray()
            # Normalize train data using MinMaxScaler
            train = self.scaler.fit_transform(train.reshape(-1, 1)).reshape(train.shape)

        with tiff.TiffFile(self.tpaths[index]) as label_temp:
            label = label_temp.asarray()
            # Normalize label data using MinMaxScaler
            label = self.scaler.fit_transform(label.reshape(-1, 1)).reshape(label.shape)
            
        return torch.Tensor(train).permute(2, 0, 1).to(device=device), torch.Tensor(label).to(device=device)


In [56]:
# Initialize train and validation datasets
train_dataset = auto_dataset(mini_train_folder, mini_label_folder, split_type="train", val_size=0.2)
val_dataset = auto_dataset(mini_train_folder, mini_label_folder, split_type="val", val_size=0.2)
for i in range(len(train_dataset)):
    train_data, label_data = train_dataset[i]
    print(f"Item {i+1}:")
    print(f"Train data shape: {train_data.shape}")
    print(f"Label data shape: {label_data.shape}")



Item 1:
Train data shape: torch.Size([7, 350, 350])
Label data shape: torch.Size([350, 350])
Item 2:
Train data shape: torch.Size([7, 350, 350])
Label data shape: torch.Size([350, 350])


Item 3:
Train data shape: torch.Size([7, 350, 350])
Label data shape: torch.Size([350, 350])
Item 4:
Train data shape: torch.Size([7, 350, 350])
Label data shape: torch.Size([350, 350])
Item 5:
Train data shape: torch.Size([7, 350, 350])
Label data shape: torch.Size([350, 350])
Item 6:
Train data shape: torch.Size([7, 350, 350])
Label data shape: torch.Size([350, 350])
Item 7:
Train data shape: torch.Size([7, 350, 350])
Label data shape: torch.Size([350, 350])
Item 8:
Train data shape: torch.Size([7, 350, 350])
Label data shape: torch.Size([350, 350])
Item 9:
Train data shape: torch.Size([7, 350, 350])
Label data shape: torch.Size([350, 350])
Item 10:
Train data shape: torch.Size([7, 350, 350])
Label data shape: torch.Size([350, 350])
Item 11:
Train data shape: torch.Size([7, 350, 350])
Label data shape: torch.Size([350, 350])
Item 12:
Train data shape: torch.Size([7, 350, 350])
Label data shape: torch.Size([350, 350])
Item 13:
Train data shape: torch.Size([7, 350, 350])
Label data sha

In [57]:
for i in range(len(val_dataset)):
    train_data, label_data = val_dataset[i]
    print(f"Item {i+1}:")
    print(f"Train data shape: {train_data.shape}")
    print(f"Label data shape: {label_data.shape}")


Item 1:
Train data shape: torch.Size([7, 350, 350])
Label data shape: torch.Size([350, 350])
Item 2:
Train data shape: torch.Size([7, 350, 350])
Label data shape: torch.Size([350, 350])
Item 3:
Train data shape: torch.Size([7, 350, 350])
Label data shape: torch.Size([350, 350])
Item 4:
Train data shape: torch.Size([7, 350, 350])
Label data shape: torch.Size([350, 350])
Item 5:
Train data shape: torch.Size([7, 350, 350])
Label data shape: torch.Size([350, 350])
Item 6:
Train data shape: torch.Size([7, 350, 350])
Label data shape: torch.Size([350, 350])
Item 7:
Train data shape: torch.Size([7, 350, 350])
Label data shape: torch.Size([350, 350])
Item 8:
Train data shape: torch.Size([7, 350, 350])
Label data shape: torch.Size([350, 350])


In [58]:
print("Train Dataset Length:", len(train_dataset))
print("Validation Dataset Length:", len(val_dataset))

Train Dataset Length: 32
Validation Dataset Length: 8


## Data loader

In [59]:
lr = 0.001
batch_size = 8
num_epochs = 2

In [60]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

## Define model

In [61]:
class DoubleConv(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(DoubleConv, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )
    def forward(self, x):
            return self.conv(x)


In [62]:
class UNET(nn.Module):
    def __init__(
        self, in_channels, out_channels, features
        ):
            super(UNET, self).__init__()
            self.ups = nn.ModuleList()
            self.downs = nn.ModuleList()
            self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

            # Down part
            for feature in features:
                self.downs.append(DoubleConv(in_channels, feature))
                in_channels = feature

            # UP sampling part
            for feature in reversed(features):
                self.ups.append(
                    nn.ConvTranspose2d(
                        feature*2, feature, kernel_size=2, stride=2,
                        )
                )
                self.ups.append(DoubleConv(feature*2, feature))
            self.bottleneck = DoubleConv(features[-1], features[-1]*2)
            self.final_conv = nn.Conv2d(features[0], out_channels, kernel_size=1)

    def forward(self, x):
        print("Input size:", x.size())
        
        skip_connections = []
        for idx, down in enumerate(self.downs):
            x = down(x)
            skip_connections.append(x)
            x = self.pool(x)
            print(f"Down {idx + 1} output size:", x.size())
        
        x = self.bottleneck(x)
        skip_connections = skip_connections[::-1]

        for idx in range(0, len(self.ups), 2):
            x = self.ups[idx](x)
            skip_connection = skip_connections[idx//2]
            if x.shape != skip_connection.shape:
                x = TF.resize(x, skip_connection.shape[2:])
            concat_skip = torch.cat((skip_connection, x), dim=1)
            x = self.ups[idx+1](concat_skip)
            print(f"Up {idx // 2 + 1} output size:", x.size())
            
        output = self.final_conv(x)
        print("Final output size:", output.size())
        
        return output


In [63]:
# Create the U-Net model with 7 input channels and 1 output channel
in_channels = 7  # Number of input channels
out_channels = 1  # Number of output channels
features = [64, 128, 256, 512]
model = UNET(in_channels, out_channels, features)
# Specify data type (e.g., torch.float32)
model.to(device)

# Define loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=lr)

In [65]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x1cd141fc940>

In [66]:
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    
    for ix, data in enumerate(iter(train_loader)):
        inputs, targets = data

        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs).squeeze(1)
        # Compute the loss
        loss = criterion(outputs, targets)
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * inputs.size(0)
    
    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

Input size: torch.Size([8, 7, 350, 350])


OutOfMemoryError: CUDA out of memory. Tried to allocate 240.00 MiB. GPU 0 has a total capacty of 8.00 GiB of which 0 bytes is free. Of the allocated memory 7.05 GiB is allocated by PyTorch, and 233.59 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF