# Clog Loss: Advance Alzheimer’s Research with Stall Catchers

https://www.drivendata.org/competitions/65/clog-loss-alzheimers-research/page/217/

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os

import cv2 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from IPython.display import clear_output
from IPython.lib.display import YouTubeVideo
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, TimeDistributed,Dropout, Activation, Flatten,Conv2D, MaxPooling2D,LSTM,Bidirectional
from tensorflow.keras.models import Sequential

# Default figure size for every matplotlib figure in this notebook.
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (15, 10)

# NOTE(review): the star import hides where names come from. The helpers used
# below that are not defined in this notebook (display_clip, read_clip,
# extract_masks, display_fused) presumably come from clog_utils; confirm and
# consider importing them explicitly.
from clog_utils import *

# Lets tf.data pick parallelism / prefetch buffer sizes at runtime.
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Print numpy arrays with 3 decimals and without scientific notation.
np.set_printoptions(precision=3, suppress=True)

# Display the TensorFlow version as the cell output.
tf.__version__

### Constants

In [None]:
# Root folder of the ClogLoss dataset (CSV files plus train/test/nano/micro clips).
# Set the CLOG_DATASET_PATH environment variable to use another location
# without editing the notebook; the default is the original hard-coded path.
DATASET_PATH = os.environ.get('CLOG_DATASET_PATH', '/home/user/percepto/datasets/ClogLoss')

# Number of consecutive frames grouped into one model input window (see preprocess).
CONSECUTIVE_FRAMES = 10
# Frame height and width passed to read_clip; IMG_C is the channel count
# (not referenced by the visible code).
IMG_H, IMG_W, IMG_C = (256, 256, 3)

### Train Data

In [None]:
# Load the training labels table and display it as the cell output.
train_labels_df = pd.read_csv(f'{DATASET_PATH}/train_labels.csv')
train_labels_df

In [None]:
# Keep only the rows whose 'stalled' value is positive.
train_stalled_df = train_labels_df[train_labels_df['stalled'] > 0]
train_stalled_df

In [None]:
# Load the training metadata; the columns used later in this notebook are
# 'filename', 'crowd_score', 'nano' and 'micro'.
train_metadata_df = pd.read_csv(f'{DATASET_PATH}/train_metadata.csv')
train_metadata_df

### Test Data

In [None]:
# Load the test metadata; only its 'filename' column is used below.
test_metadata_df = pd.read_csv(f'{DATASET_PATH}/test_metadata.csv')
test_metadata_df

#### Nano Data

In [None]:
!ls {DATASET_PATH}/nano | wc -l

In [None]:
# Training-metadata rows flagged as belonging to the 'nano' subset.
nano_df = train_metadata_df[train_metadata_df['nano'] == True]
nano_df

#### Micro Data

In [None]:
!ls {DATASET_PATH}/micro | wc -l

In [None]:
# Training-metadata rows flagged as belonging to the 'micro' subset.
micro_df = train_metadata_df[train_metadata_df['micro'] == True]
micro_df

#### Validate that Nano dataset is included inside Micro

In [None]:
# Evaluates to True when every nano filename also appears in micro_df.
np.all(nano_df['filename'].isin(micro_df['filename']))

### Paths and Statistics

In [None]:
def get_label(x):
    """Binarize a crowd score: 1.0 when the score is strictly above 0.75, else 0.0.

    Both branches return a float so the label type does not depend on the value.
    NOTE(review): the exploration cells below split stalled/flowing clips at
    0.5, not 0.75 -- confirm that the different thresholds are intentional.
    """
    return 1.0 if x > 0.75 else 0.0

# Element-wise version of get_label for arrays of crowd scores.
# otypes pins the output dtype to float; without it np.vectorize infers the
# dtype from the first element only and refuses empty inputs.
get_labels = np.vectorize(get_label, otypes=[float])

In [None]:
# Select the working subset: `df` is the metadata table and `folder` the
# sub-directory of DATASET_PATH that holds its clips. Swap the two lines to
# work on the micro subset instead of nano.
df, folder = nano_df, 'nano'
#df, folder = micro_df, 'micro'

In [None]:
# 70/30 train/validation split with a fixed seed for reproducibility.
# NOTE(review): no `stratify` argument is passed, so the class balance of the
# two parts is not guaranteed to match.
train_df, val_df = train_test_split(df, test_size=0.3, random_state=42)

In [None]:
# Full clip paths for every row of the training metadata (DATASET_PATH/train/<filename>).
train_metadata_paths = DATASET_PATH+'/train/'+train_metadata_df['filename'].values
#train_metadata_y = get_labels(train_metadata_df['crowd_score'].values)

# Full clip paths for the test set (DATASET_PATH/test/<filename>).
test_metadata_paths = DATASET_PATH+'/test/'+test_metadata_df['filename'].values

# Paths and binarized labels of the training part of the selected subset.
train_paths = DATASET_PATH+'/'+folder+'/'+train_df['filename'].values
train_y = get_labels(train_df['crowd_score'].values)

# Paths and binarized labels of the validation part of the selected subset.
val_paths = DATASET_PATH+'/'+folder+'/'+val_df['filename'].values
val_y = get_labels(val_df['crowd_score'].values)

In [None]:
# Report how many clips are in each path list built above.
print(f'Train Metadata: {len(train_metadata_paths)}')
print(f'Test: {len(test_metadata_paths)}')
print(f'Trainig: train {len(train_paths)}, validation {len(val_paths)}')

### YouTube

In [None]:
# Embed the YouTube video with id -2aW6m60mYg in the cell output.
YouTubeVideo('-2aW6m60mYg')

In [None]:
# Embed the YouTube video with id c6TtoQhMrbA in the cell output.
YouTubeVideo('c6TtoQhMrbA')

In [None]:
# Embed the YouTube video with id _uJ_dcy-OXQ in the cell output.
YouTubeVideo('_uJ_dcy-OXQ')

In [None]:
# Embed the YouTube video with id 9RFnguYmd_8 in the cell output.
YouTubeVideo('9RFnguYmd_8')

### Stalled and Flowing Clip Paths

In [None]:
# Clips of the selected subset with crowd_score >= 0.5, used for visual inspection.
# NOTE(review): the training labels use a different cut-off (get_label: > 0.75).
stalled_df = df[df['crowd_score'] >=0.5]
stalled_paths = DATASET_PATH+'/'+folder+'/'+stalled_df['filename'].values
# Show the number of such clips and the first five paths.
len(stalled_paths), stalled_paths[:5]

In [None]:
# Clips of the selected subset with crowd_score < 0.5, used for visual inspection.
flowing_df = df[df['crowd_score'] < 0.5]
flowing_paths = DATASET_PATH+'/'+folder+'/'+flowing_df['filename'].values
# Show the number of such clips and the first five paths.
len(flowing_paths), flowing_paths[:5]

### Display Flowing Clips

In [None]:
# Show the first flowing clip (display_clip is not defined in this notebook;
# presumably provided by clog_utils).
display_clip(flowing_paths[0])

### Display Stalled Clips

In [None]:
# Show the first stalled clip.
display_clip(stalled_paths[0])

### Display Test Clips

In [None]:
# Show the test clip at index 10 of the test path list.
display_clip(test_metadata_paths[10])

### Read Clip as Tensors

In [None]:
# Load one stalled clip into memory and show the resulting array shape.
# NOTE(review): index 100 is hard-coded and assumes the selected subset has at
# least 101 stalled clips. read_clip is called without a size here, unlike the
# dataset/evaluation code which passes IMG_H and IMG_W -- confirm its defaults.
frames = read_clip(stalled_paths[100])
frames.shape

##### Show some clip frames

In [None]:
# Show frames 0, 20 and 40 of the loaded clip side by side (1 row x 3 columns).
# NOTE(review): the indices are hard-coded and assume the clip has at least 41 frames.
plt.subplot(131)
plt.imshow(frames[0])
plt.subplot(132)
plt.imshow(frames[20])
plt.subplot(133)
plt.imshow(frames[40])

##### Show a mask together with its frame

In [None]:
# Compute masks for the loaded clip; they are indexed per frame below
# (extract_masks is not defined in this notebook; presumably from clog_utils).
masks = extract_masks(frames)

In [None]:
# Visualize frame 30 together with its mask (presumably as an overlay -- confirm in clog_utils).
display_fused(frames[30], masks[30])

### Strategy

In [None]:
# Distribution strategy used to build and compile the model below.
strategy = tf.distribute.MirroredStrategy()

# The global batch size scales with the number of replicas kept in sync.
BATCH_SIZE_PER_REPLICA = 4
BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
BATCH_SIZE

### DataSet

##### Methods

In [None]:
def read_frames(path, y):
    """Load one clip and its masks inside a tf.data pipeline.

    Args:
        path: scalar string tensor with the clip path.
        y: label for the clip, passed through unchanged.

    Returns:
        ((frames, masks), y), where frames and masks are uint8 tensors
        produced by read_clip and extract_masks through tf.numpy_function.
        Their static shapes are unknown to TensorFlow.
    """
    # Pass height AND width; the original passed IMG_H twice, which only
    # worked because both constants are currently 256.
    frames = tf.numpy_function(read_clip, [path, IMG_H, IMG_W], tf.uint8)
    masks = tf.numpy_function(extract_masks, [frames], tf.uint8)
    return (frames, masks), y

def preprocess(X, y):
    """Turn one clip into fixed-length windows of masked, scaled frames.

    Args:
        X: pair (frames, masks); X[0] is multiplied element-wise by X[1].
        y: label of the clip.

    Returns:
        (windows, labels): windows has shape
        (k, CONSECUTIVE_FRAMES, H, W, C) with k = n_frames // CONSECUTIVE_FRAMES
        (trailing frames that do not fill a window are dropped); labels repeats
        the float32 clip label k times.
    """
    rgb = tf.cast(X[0], tf.float32)
    mask = tf.cast(X[1], tf.float32)

    # Apply the mask, then divide by 255.
    masked = (rgb * mask) / 255

    # Keep only as many frames as fill whole windows.
    n_windows = tf.shape(masked)[0] // CONSECUTIVE_FRAMES
    masked = masked[:n_windows * CONSECUTIVE_FRAMES]

    dims = tf.shape(masked)
    windows = tf.reshape(
        masked,
        (n_windows, CONSECUTIVE_FRAMES, dims[1], dims[2], dims[3]),
    )

    # One label per window, all equal to the clip label.
    labels = tf.repeat(tf.reshape(tf.cast(y, tf.float32), (1,)), n_windows)
    return windows, labels

##### Datasets

In [None]:
# Clip-level datasets: each element is ((frames, masks), label). Decoded clips
# are cached on disk so later passes skip read_clip/extract_masks.
# The cache file name includes `folder` so that switching between the nano and
# micro subsets does not silently reuse the other subset's cache.
# NOTE: the cache is still keyed only by name -- delete the /tmp/clog_* files
# after changing the train/validation split or the clip reading code.
train = tf.data.Dataset.from_tensor_slices((train_paths, train_y)).map(read_frames).cache(f'/tmp/clog_{folder}_train')
val = tf.data.Dataset.from_tensor_slices((val_paths, val_y)).map(read_frames).cache(f'/tmp/clog_{folder}_val')

In [None]:
%%time 
# Peek at the first two training clips: print shapes and label, then show
# frame 30 with its mask.
# NOTE: the loop rebinds the notebook-level names `frames`, `masks` and `y`.
for i, ((frames, masks), y) in enumerate(train.take(2)):
    print(f'{i}: frames={frames.shape}, masks={masks.shape}, y={y}')
    display_fused(frames[30].numpy(), masks[30].numpy())

In [None]:
%%time 
# Peek at the first two validation clips: print shapes and label, then show
# frame 30 with its mask.
# NOTE: the loop rebinds the notebook-level names `frames`, `masks` and `y`.
for i, ((frames, masks), y) in enumerate(val.take(2)):
    print(f'{i}: frames={frames.shape}, masks={masks.shape}, y={y}')
    display_fused(frames[30].numpy(), masks[30].numpy())

##### Dataset Pipelines

In [None]:
# Training pipeline: repeat the clips indefinitely, split each clip into
# windows, flatten the windows into a single stream and re-batch them.
train_ds = (
    train
    .repeat()
    # .shuffle(10)  -- shuffling is currently disabled
    .map(preprocess, num_parallel_calls=AUTOTUNE, deterministic=False)
    .unbatch()
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)
train_ds

In [None]:
# Validation pipeline: same window extraction and batching as training,
# but a single pass over the clips (no repeat).
val_ds = (
    val
    .map(preprocess, num_parallel_calls=AUTOTUNE, deterministic=False)
    .unbatch()
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)
val_ds

##### Show Datasets

In [None]:
# Inspect one training batch: print shapes and labels, then show the first
# and last frame of the first window in the batch.
for i, (X, y) in enumerate(train_ds.take(1)):
    print(f'{i}: X={X.shape}, y={y.shape}, label={y.numpy()}')
    # plt.show must be called; the bare attribute reference was a no-op.
    plt.figure(); plt.imshow(X[0,0]); plt.show()
    plt.figure(); plt.imshow(X[0,-1]); plt.show()

In [None]:
# Inspect one validation batch: print shapes and labels, then show the first
# and last frame of the first window in the batch.
for i, (X,y) in enumerate(val_ds.take(1)):
    print(f'{i}: X={X.shape}, y={y.shape}, label={y.numpy()}')
    # plt.show must be called; the bare attribute reference was a no-op.
    plt.figure(); plt.imshow(X[0,0]); plt.show()
    plt.figure(); plt.imshow(X[0,-1]); plt.show()

### Model

In [None]:
# Shape of one input batch: (batch, CONSECUTIVE_FRAMES, H, W, C).
# Take a validation batch explicitly instead of relying on the loop variable
# `X` left over from an earlier cell; the evaluation cell rebinds `X` to a
# whole clip, so re-running this cell afterwards would give a different shape.
sample_X = next(iter(val_ds))[0]
input_shape = sample_X.shape
print(f'Input Shape: {input_shape}')

In [None]:
def make_backbone():
    """Build the per-frame convolutional feature extractor.

    Four 3x3 convolutions with 64 filters each, max-pooling after the
    second, third and fourth, a dropout after the first pooling, and a final
    Flatten. The input shape is the per-frame part of the notebook-level
    `input_shape` (everything after the batch and time axes).
    """
    frame_shape = input_shape[2:]
    stack = [
        Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=frame_shape),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Dropout(0.25),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Flatten(),
    ]
    return Sequential(stack)

In [None]:
def make_model(backbone):
    """Build the window classifier on top of a per-frame backbone.

    The backbone is applied to every frame of a window (TimeDistributed), the
    resulting sequence goes through a bidirectional LSTM, then two dense
    layers and a 2-way softmax output.

    Args:
        backbone: Keras model applied to each frame.

    Returns:
        The assembled, uncompiled Sequential model.
    """
    window_shape = input_shape[1:]
    stack = [
        TimeDistributed(backbone, input_shape=window_shape),
        Bidirectional(LSTM(32)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(2, activation='softmax'),
    ]
    return Sequential(stack)

In [None]:
# Create and compile the model inside the distribution strategy scope.
with strategy.scope():
    backbone = make_backbone()
    model = make_model(backbone)
    # Labels are class indices (0/1) against the 2-way softmax output,
    # hence the sparse categorical cross-entropy.
    model.compile(
        optimizer=tf.optimizers.Adam(2*1e-4),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

In [None]:
# Print the layer-by-layer summary of the per-frame backbone.
backbone.summary()

In [None]:
# Print the layer-by-layer summary of the full window classifier.
model.summary()

### Train

In [None]:
# Number of batches per training epoch and per validation pass.
# NOTE(review): 5698 and 2448 are hard-coded -- presumably the number of
# training / validation windows for one particular subset and split; confirm,
# and recompute them when `df`/`folder`, the split or CONSECUTIVE_FRAMES change.
steps_per_epoch = 5698 // BATCH_SIZE
validation_steps = 2448 // BATCH_SIZE

In [None]:
# Show the step counts that will be passed to model.fit.
print(f'steps_per_epoch {steps_per_epoch}')
print(f'validation_steps {validation_steps}')

In [None]:
# Write a checkpoint to checkpoints/cp.ckpt only when validation accuracy improves.
modelCheckpoint = tf.keras.callbacks.ModelCheckpoint(filepath='checkpoints/cp.ckpt', save_best_only=True, monitor='val_accuracy', mode='max', verbose=1)

# train_ds repeats forever, so steps_per_epoch defines the epoch length;
# validation runs for validation_steps batches of val_ds after each epoch.
model.fit(train_ds, 
          initial_epoch=0,
          epochs=60, 
          steps_per_epoch=steps_per_epoch, 
          validation_data=val_ds, 
          validation_steps=validation_steps,
          callbacks=[modelCheckpoint], 
          verbose=1)

In [None]:
# Save the model as it is in memory to saved_models/1.
# NOTE(review): if run right after training, this is the last-epoch model; the
# best-val_accuracy weights are in checkpoints/cp.ckpt -- confirm which one
# is meant to be evaluated.
model.save('saved_models/1')

### Load Model

In [None]:
# Reload the saved model from saved_models/1, replacing the in-memory `model`.
model = tf.keras.models.load_model('saved_models/1')

### Evaluation

In [None]:
# Clip-level prediction on the validation set: classify every window of a
# clip, then label the clip 1 when more than half of its windows are
# predicted as class 1.
# NOTE: the loop rebinds the notebook-level names `frames`, `masks`, `X`, `y`
# and `p`.
y_pred = []
for i, (path, y) in enumerate(zip(val_paths, val_y)):
    frames = read_clip(path, IMG_H, IMG_W)
    masks = extract_masks(frames)
    # Reuse the training preprocessing; the repeated labels it returns are not needed here.
    X, _ = preprocess([frames, masks], y)
    # Per-window predicted class index.
    p = np.argmax(model(X).numpy(), axis=-1)  
    # Majority vote over the windows of this clip.
    p = (np.sum(p)/len(p) > 0.5).astype(np.int32)
    y_pred.append(p)
    print(f'{i}[{len(val_y)}]: path={path}, y={y}, p={p}')

In [None]:
# Align ground truth with the predictions collected so far (the evaluation
# loop may have been interrupted early) and convert the list to an array.
# The predictions live in `y_pred`; the original referenced an undefined
# `pred_y`, which raised NameError.
y_true = val_y[:len(y_pred)]
y_pred = np.array(y_pred)

In [None]:
# Matthews correlation coefficient between clip labels and clip-level predictions.
matthews_corrcoef(y_true, y_pred)