# Filamentous Fungi Cell Phenotyping using Meta's DINO

## Install dependencies

In [2]:
!pip install scikit-image

Collecting scikit-image
  Downloading scikit_image-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 14.7/14.7 MB 38.2 MB/s eta 0:00:00
Collecting tifffile>=2022.8.12 (from scikit-image)
  Downloading tifffile-2023.12.9-py3-none-any.whl (223 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 223.6/223.6 kB 3.7 MB/s eta 0:00:00
Collecting lazy_loader>=0.3 (from scikit-image)
  Downloading lazy_loader-0.3-py3-none-any.whl (9.1 kB)
Installing collected packages: tifffile, lazy_loader, scikit-image
Successfully installed lazy_loader-0.3 scikit-image-0.22.0 tifffile-2023.12.9

[notice] A new release of pip is available: 23.1.2 -> 23.3.2
[notice] To update, run: pip install --upgrade pip

In [3]:
!pip install imgaug

Collecting imgaug
  Using cached imgaug-0.4.0-py2.py3-none-any.whl (948 kB)
Collecting Shapely (from imgaug)
  Downloading shapely-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.5/2.5 MB 20.4 MB/s eta 0:00:00
Installing collected packages: Shapely, imgaug
Successfully installed Shapely-2.0.2 imgaug-0.4.0

[notice] A new release of pip is available: 23.1.2 -> 23.3.2
[notice] To update, run: pip install --upgrade pip


## Set Up Environment

In [2]:
import pandas as pd

In [3]:
import os
from urllib.parse import urlparse, urlunparse

import imgaug
import numpy as np
import torch
from imgaug import augmenters as iaa
from PIL import Image
from skimage.exposure import equalize_adapthist, equalize_hist
from skimage.filters import threshold_otsu, threshold_triangle
from torch.utils.data import Dataset
from torch.utils.data import DataLoader, RandomSampler
from tqdm import tqdm

## Custom Functions and Classes

In [4]:
def rescale_intensity(img, q):
    """Stretch the intensities of ``img`` between two symmetric quantiles.

    Args:
        img: Image array passed to ``np.quantile`` and then to
            ``skimage.exposure.rescale_intensity``.
        q: Lower quantile; the upper quantile is ``1 - q``.

    Returns:
        The result of ``skimage.exposure.rescale_intensity`` called with
        ``in_range`` set to the ``(q, 1 - q)`` quantile values of ``img``.
    """
    import numpy as np
    import skimage as sk

    # The input range is bounded by the q-th and (1 - q)-th quantiles of the pixels.
    quantile_bounds = np.quantile(img, q=(q, 1 - q))
    return sk.exposure.rescale_intensity(img, in_range=tuple(quantile_bounds))

In [6]:
class CellDataset(Dataset):
    """Bright-field cell images listed in ``train.csv``, served as binary masks.

    Each item is loaded from the location stored in the ``uri_field`` column,
    optionally augmented, contrast-equalised, thresholded with Otsu's method
    and returned as a float array containing only 0.0 and 1.0.
    """

    def __init__(self, meta_data_path, transforms=None, uri_field="uri",
                 image_root=None, max_rows=10):
        """
        Args:
            meta_data_path: Directory that contains ``train.csv``.
            transforms: Optional callable invoked as ``transforms(image=array)``
                (for example an imgaug augmenter); its return value replaces
                the loaded image.
            uri_field: Name of the metadata column holding the image location.
            image_root: Text substituted for the ``file:///images/`` prefix of
                every image location. When omitted, the notebook-level
                variable ``image_root`` is used, as before.
            max_rows: Keep only the first ``max_rows`` bright-field rows. The
                default of 10 is the limit that used to be hard-coded; pass
                ``None`` to keep every row.

        Raises:
            NameError: If ``image_root`` is neither passed nor defined as a
                notebook-level variable.
        """
        if image_root is None:
            # Backward-compatible fallback to the global the original code read.
            try:
                image_root = globals()["image_root"]
            except KeyError:
                raise NameError(
                    "name 'image_root' is not defined; pass image_root=... "
                    "to CellDataset"
                ) from None

        meta_file = os.path.join(meta_data_path, 'train.csv')
        df = pd.read_csv(meta_file)
        # .copy() makes the filtered frame independent, so the column
        # assignment below does not write into a view of the CSV frame.
        df = df.loc[df['modality_name'] == 'BrightField'].copy()
        # Rewrite the column that get_image() actually reads; regex=False
        # treats both the prefix and image_root as literal text.
        df[uri_field] = df[uri_field].str.replace(
            'file:///images/', image_root, regex=False
        )
        if max_rows is not None:
            df = df.head(max_rows)

        self.uri_field = uri_field
        self.transforms = transforms
        self.meta_data = df

    def get_image(self, row: pd.Series):
        """Load the image referenced by ``row`` as a ``uint16`` array.

        Args:
            row: One row of ``self.meta_data``.

        Returns:
            The image array, after ``self.transforms`` when transforms are set.
        """
        url = row[self.uri_field]
        # The context manager releases the file handle as soon as the pixel
        # data has been copied into the array.
        with Image.open(url) as pil_image:
            image = np.array(pil_image).astype(np.uint16)
        if self.transforms:
            image = self.transforms(image=image)

        return image

    def __len__(self):
        """Return the number of metadata rows kept by ``__init__``."""
        return len(self.meta_data)

    def __getitem__(self, idx):
        """Return item ``idx`` as a binary mask with values 0.0 and 1.0.

        Args:
            idx: Positional index into ``self.meta_data``.
        """
        row = self.meta_data.iloc[idx]
        image = self.get_image(row)

        # Local contrast equalisation, then clip to the 1st-99th percentile range.
        image = equalize_adapthist(image)
        image = rescale_intensity(image, q=1e-2)

        # Pixels brighter than the Otsu threshold become foreground.
        thresh = threshold_otsu(image)
        binary = image > thresh

        # Boolean mask -> 8-bit grayscale through PIL, then scale to the 0-1 range.
        image = Image.fromarray(binary).convert('L')
        image = np.array(image) / 255.0
        return image


In [20]:
from imgaug import augmenters as iaa

# Augmentation pipeline; CellDataset.get_image() calls it as transforms(image=...).
# NOTE(review): `dataset` must already exist when this cell runs; no visible
# cell creates it, so confirm where the CellDataset instance is built.
dataset.transforms = iaa.Sequential([
    iaa.Fliplr(0.5),
    iaa.Flipud(0.5),
    # A tuple is a continuous range, so the angle is sampled from [-180, 180].
    # A list would only ever pick -180 or 180, which are the same rotation.
    iaa.Rotate((-180, 180)),
    # iaa.Affine(scale=(0.95, 1.0)),
    iaa.Resize(512),
])

### Distributed Training with SageMaker

In [None]:
import boto3

# S3 client. In the visible notebook it is referenced only by the
# commented-out S3 loading code inside CellDataset.get_image.
client = boto3.client('s3')

In [7]:
import sagemaker

# Session object handed to the estimators below as `sagemaker_session`.
session = sagemaker.Session()

# Execution role handed to the estimators below as `role`.
role = sagemaker.get_execution_role()

## Setup for FSx for Lustre

### Check for Attached FSx for Lustre File-system

In [9]:
import boto3
import socket

def fsx_file_systems(fsx_client):
    """Yield every FSx file-system description, following ``NextToken`` paging.

    Args:
        fsx_client: Object exposing ``describe_file_systems``; the notebook
            passes ``boto3.client("fsx")``.

    Yields:
        Each element of the ``'FileSystems'`` list of every response page.
    """
    request_kwargs = {}
    while True:
        resp = fsx_client.describe_file_systems(**request_kwargs)
        yield from resp['FileSystems']

        # A missing, None or empty token all mean "last page". Stopping on any
        # falsy token prevents restarting from the first page and looping forever.
        next_token = resp.get('NextToken')
        if not next_token:
            break
        request_kwargs = {'NextToken': next_token}

#file_system_id = 'fs-09f1c1fdba99021df'

notebook_attached_fsx = !df -kh | grep '@tcp:/' \
    | sed 's/\([0-9a-zA-Z\.]*\)@tcp:\/\([a-zA-Z0-9]*\).*/\1 \2/'
#print(notebook_attached_fsx)
#fsx_mount_name = notebook_attached_fsx[0].split()[1]
fsx_mount_name = 'jhhnlbev'
fsx_client = boto3.client("fsx")

for fsx_fs in fsx_file_systems(fsx_client):
    mount_name = fsx_fs['LustreConfiguration']['MountName']
    fs_id = fsx_fs['FileSystemId']
    if mount_name == fsx_mount_name:
        file_system_id = fs_id
        break
        
if file_system_id:
    print(f"FSx for Lustre file-system is attached: {file_system_id}")
else:
    print(f"No FSx for Lustre file-system is attached")

FSx for Lustre file-system is attached: fs-0795f30d11e52d61c


### Define Subnets and Security Groups

In [10]:
import os
import boto3

# Both stay None when no FSx file system was found by the previous cell.
subnets = None
security_group_ids = None

if file_system_id:
    fsx_client = boto3.client("fsx")
    ec2_client = boto3.client('ec2')

    # Subnets and network interfaces come from the file-system description.
    file_system = fsx_client.describe_file_systems(
        FileSystemIds=[file_system_id]
    )['FileSystems'][0]
    subnets = file_system['SubnetIds']
    network_interface_ids = file_system['NetworkInterfaceIds']

    # Security groups are read from the first network interface returned.
    first_interface = ec2_client.describe_network_interfaces(
        NetworkInterfaceIds=network_interface_ids
    )['NetworkInterfaces'][0]
    security_group_ids = [group['GroupId'] for group in first_interface['Groups']]

# De-duplicate list values; anything that is not a list becomes None.
if isinstance(subnets, list):
    subnets = list(set(subnets))
else:
    subnets = None

if isinstance(security_group_ids, list):
    security_group_ids = list(set(security_group_ids))
else:
    security_group_ids = None

print(f"Subnets: {subnets}")
print(f"Security groups: {security_group_ids}")


Subnets: ['subnet-d35737dd']
Security groups: ['sg-7413544b', 'sg-07ddb349c10a25863']


### Define Amazon FSx Lustre Train Data Channel

In [11]:
from sagemaker.inputs import FileSystemInput

# Stays None when no FSx for Lustre file system was found.
fsx_data_channels = None

if not file_system_id:
    print("FSx for Lustre file-system is not available")
else:
    # Both channels use the same file-system type and a read-only ('ro') mount.
    file_system_type = 'FSxLustre'
    file_system_access_mode = 'ro'

    # Image directory, relative to the mount name; the absolute path handed to
    # SageMaker is built as /<mount name>/<directory>.
    file_system_directory_path = 'data/images'
    print(f'FSx file-system data input path: {file_system_directory_path}')
    train = FileSystemInput(
        file_system_id=file_system_id,
        file_system_type=file_system_type,
        directory_path=f"/{fsx_mount_name}/{file_system_directory_path}",
        file_system_access_mode=file_system_access_mode,
    )

    # Parent directory, exposed as the 'meta' channel.
    file_system_directory_path = 'data'
    meta = FileSystemInput(
        file_system_id=file_system_id,
        file_system_type=file_system_type,
        directory_path=f"/{fsx_mount_name}/{file_system_directory_path}",
        file_system_access_mode=file_system_access_mode,
    )

    fsx_data_channels = {'train': train, 'meta': meta}


FSx file-system data input path: data/images


In [12]:
# Display the channel mapping to confirm both inputs were created.
fsx_data_channels

{'train': <sagemaker.inputs.FileSystemInput at 0x7f078d0b8cd0>,
 'meta': <sagemaker.inputs.FileSystemInput at 0x7f078d0b9510>}

In [13]:
from sagemaker.inputs import FileSystemInput

# Hard-coded variant of the FSx channels.
# NOTE(review): this differs from `fsx_data_channels` built above (extra
# 'fsx/' path segment, 'rw' instead of 'ro'), and the estimator.fit call in
# this section is given `fsx_data_channels`, not `data_channels`. Confirm
# which definition is intended.
_fsx_shared_args = {
    'file_system_id': 'fs-0795f30d11e52d61c',
    'file_system_type': 'FSxLustre',
    'file_system_access_mode': 'rw',
}
train_data = FileSystemInput(directory_path='/jhhnlbev/fsx/data/images', **_fsx_shared_args)
meta_data = FileSystemInput(directory_path='/jhhnlbev/fsx/data', **_fsx_shared_args)
data_channels = {'train': train_data, 'meta': meta_data}

In [14]:
import time
from sagemaker.pytorch import PyTorch

# Bucket meant for the final model artifacts; in this cell it is referenced
# only by the commented-out checkpoint_s3_uri option below.
DINO_OUTPUT_BUCKET = 'mlbucket-eff85bfb'

# Paths on the training instance
#sm_metadata_path = '/opt/ml/input/data/metadata'
#sm_data_path = '/opt/ml/input/data/train'
sm_output_path = '/opt/ml/output/data'
sm_checkpoint_path = '/opt/ml/checkpoints'

# The timestamp keeps the job-name prefix unique per run.
dino_base_job_name = f'dino-model-{int(time.time())}'

# Hyperparameters forwarded to the entry-point script.
dino_hyperparameters = {
    'arch': 'vit_small',
    'patch_size': 16,
    #'metadata_dir': sm_metadata_path,
    #'data_dir': sm_data_path,
    'output_dir': sm_checkpoint_path,
    #'checkpoint_dir': sm_checkpoint_path,
    'epochs': 100,
    'saveckp_freq': 20,
}

# SageMaker estimator for the FSx-backed training job.
# NOTE(review): checkpoint_s3_uri is commented out while the script writes to
# sm_checkpoint_path; confirm the files written there are persisted after the job.
estimator = PyTorch(
    entry_point='ddp_dino_single_channel.py',
    source_dir='code',
    base_job_name=dino_base_job_name,
    role=role,
    sagemaker_session=session,
    framework_version="2.0.0",
    py_version="py310",
    instance_type="ml.p3.16xlarge",
    instance_count=1,
    volume_size=100,
    #input_mode='FastFile',
    distribution={'smdistributed': {'dataparallel': {'enabled': True}}},
    hyperparameters=dino_hyperparameters,
    max_run=24 * 60 * 60,
    checkpoint_local_path=sm_checkpoint_path,
    #checkpoint_s3_uri =f's3://{DINO_OUTPUT_BUCKET}/checkpoints',
    disable_profiler=True,
    debugger_hook_config=False,
    # Network settings derived from the FSx file system in an earlier cell.
    subnets=subnets,
    security_group_ids=security_group_ids,
)

In [15]:
# Start the training job using the FSx for Lustre 'train' and 'meta' channels.
estimator.fit(fsx_data_channels)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: dino-model-1704818744-2024-01-09-16-45-45-443


Using provided s3_resource
2024-01-09 16:45:45 Starting - Starting the training job......
2024-01-09 16:46:32 Starting - Preparing the instances for training.........
2024-01-09 16:48:07 Downloading - Downloading input data...
2024-01-09 16:48:32 Downloading - Downloading the training image........................
2024-01-09 16:52:48 Training - Training image download completed. Training in progress......
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2024-01-09 16:53:26,555 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2024-01-09 16:53:26,614 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2024-01-09 16:53:26,625 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2024-01-09 16:53:26,627 sagemaker_pytorch_container.training INFO     Invoking SMDataPara

UnexpectedStatusException: Error for Training job dino-model-1704818744-2024-01-09-16-45-45-443: Failed. Reason: AlgorithmError: Framework Error: 
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/urllib3/connection.py", line 174, in _new_conn
    conn = connection.create_connection(
  File "/opt/conda/lib/python3.10/site-packages/urllib3/util/connection.py", line 95, in create_connection
    raise err
  File "/opt/conda/lib/python3.10/site-packages/urllib3/util/connection.py", line 85, in create_connection
    sock.connect(sa)
TimeoutError: timed out

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/botocore/httpsession.py", line 465, in send
    urllib_response = conn.urlopen(
  File "/opt/conda/lib/python3.10/site-packages/urllib3/connectionpool.py", line 787, in urlopen
    retries = retries.increment(
  File "/opt/conda/lib/python3.10/site-packages/urllib3/util/retry.py", line 525, in increment
    raise six.reraise(type(error), error, _stacktrace)
  File "/opt/conda/lib/python3.10

## Setup for Training with S3

In [12]:
import time
from sagemaker.pytorch import PyTorch

# Bucket meant for the final model artifacts; in this cell it is referenced
# only by the commented-out checkpoint_s3_uri option below.
DINO_OUTPUT_BUCKET = 'mlbucket-eff85bfb'

# Paths on the training instance
#sm_metadata_path = '/opt/ml/input/data/metadata'
#sm_data_path = '/opt/ml/input/data/train'
sm_output_path = '/opt/ml/output/data'
sm_checkpoint_path = '/opt/ml/checkpoints'

# The timestamp keeps the job-name prefix unique per run.
dino_base_job_name = f'dino-model-{int(time.time())}'

# Hyperparameters forwarded to the entry-point script.
dino_hyperparameters = {
    'arch': 'vit_base',
    'patch_size': 16,
    #'metadata_dir': sm_metadata_path,
    #'data_dir': sm_data_path,
    'output_dir': sm_checkpoint_path,
    #'checkpoint_dir': sm_checkpoint_path,
    'epochs': 300,
    'batch_size_per_gpu': 8,
    'teacher_temp': 0.07,
    'warmup_teacher_temp_epochs': 30,
    'saveckp_freq': 20,
}

# SageMaker estimator for the S3-backed training job (FastFile input mode).
# NOTE(review): checkpoint_s3_uri is commented out while the script writes to
# sm_checkpoint_path; confirm the files written there are persisted after the job.
estimator = PyTorch(
    entry_point='ddp_dino_single_channel_sm.py',
    source_dir='code',
    base_job_name=dino_base_job_name,
    role=role,
    sagemaker_session=session,
    framework_version="2.0.0",
    py_version="py310",
    instance_type="ml.p3.16xlarge",
    instance_count=1,
    volume_size=100,
    input_mode='FastFile',
    distribution={'smdistributed': {'dataparallel': {'enabled': True}}},
    hyperparameters=dino_hyperparameters,
    max_run=24 * 60 * 60,
    checkpoint_local_path=sm_checkpoint_path,
    #checkpoint_s3_uri =f's3://{DINO_OUTPUT_BUCKET}/checkpoints',
    disable_profiler=True,
    debugger_hook_config=False,
)

In [None]:
# S3 prefixes used as the 'meta' and 'train' input channels.
meta_data = "s3://mlbucket-eff85bfb/cell-data/"
train_data="s3://mlbucket-eff85bfb/cell-data/images/"
# NOTE(review): `fsx_data_channels` is reused here for S3 URIs and overwrites
# the FSx channel mapping defined earlier; consider a distinct name.
fsx_data_channels = {'train': train_data, 'meta': meta_data}
estimator.fit(fsx_data_channels)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: dino-model-1704886396-2024-01-10-11-33-17-222


Using provided s3_resource
2024-01-10 11:33:17 Starting - Starting the training job.........
2024-01-10 11:34:25 Starting - Preparing the instances for training.........
2024-01-10 11:36:03 Downloading - Downloading input data...
2024-01-10 11:36:36 Downloading - Downloading the training image........................
2024-01-10 11:40:32 Training - Training image download completed. Training in progress....
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2024-01-10 11:41:11,294 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2024-01-10 11:41:11,354 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2024-01-10 11:41:11,364 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2024-01-10 11:41:11,366 sagemaker_pytorch_container.training INFO     Invoking SMDataPar