In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
import glob
import sys
from dotenv import load_dotenv
from tqdm import tqdm
tqdm.pandas()
import math
from sklearn.model_selection import train_test_split
import os
import PIL
import tensorflow.keras.backend as K
sys.path.append('../scripts/helper_functions_cv/tensorflow_helpers/')
from save_weights_every_epoch import CallbackForSavingModelWeights
from multiprocessing import Pool

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Distribution / input-pipeline globals shared by the cells below.
# Whatever tf.distribute strategy is current; models are built inside its scope.
strategy = tf.distribute.get_strategy()
# Autotune sentinel; passed to prefetch() in get_data.
AUTO = tf.data.experimental.AUTOTUNE
# Replica count of the strategy; get_data multiplies the batch size by it.
REPLICAS = strategy.num_replicas_in_sync

In [3]:
# Load environment variables from the dev config file; later cells read
# os.environ["files_path"] and os.environ["tb_path"].
load_dotenv('../config_files/dev.env')

True

In [4]:
# Paths of all .jpg files under <files_path>train_v2/. The env value is
# concatenated as-is, so it needs to end with a path separator.
train_files = glob.glob(f'{os.environ["files_path"]}train_v2/*.jpg')

In [5]:
# Segmentation annotations: columns ImageId and EncodedPixels (see the preview below).
train_csv = pd.read_csv(f'{os.environ["files_path"]}train_ship_segmentations_v2.csv')

In [6]:
# Preview the raw annotations; an ImageId can appear on several rows.
train_csv.head()

Unnamed: 0,ImageId,EncodedPixels
0,00003e153.jpg,
1,0001124c7.jpg,
2,000155de5.jpg,264661 17 265429 33 266197 33 266965 33 267733...
3,000194a2d.jpg,360486 1 361252 4 362019 5 362785 8 363552 10 ...
4,000194a2d.jpg,51834 9 52602 9 53370 9 54138 9 54906 9 55674 ...


In [7]:
# Collapse to one row per ImageId, gathering that image's EncodedPixels values into a list.
updated_train_csv = train_csv.groupby('ImageId')['EncodedPixels'].apply(list).reset_index()

In [8]:
# (updated_train_csv['EncodedPixels'].values.tolist()[0][0])

In [9]:
def create_classification_labels(data):
    """Turn one image's list of EncodedPixels values into a binary label.

    Only the first element is inspected: a NaN first element means the image
    has no encoded mask and yields 0; any other first element yields 1.

    Args:
        data: Sequence of EncodedPixels values for a single image.

    Returns:
        0 when ``data`` is empty or its first element is NaN, otherwise 1.
    """
    # An empty list carries no mask at all, so treat it as the negative class
    # instead of relying on an IndexError being swallowed.
    if len(data) == 0:
        return 0
    first = data[0]
    try:
        missing = math.isnan(first)
    except TypeError:
        # Non-numeric values (e.g. the run-length string) are real masks.
        missing = False
    # Always return an int: the previous version fell through and returned
    # None for a numeric, non-NaN first element.
    return 0 if missing else 1

In [10]:
# Derive the binary classification target from each image's EncodedPixels list.
updated_train_csv['class_labels'] = updated_train_csv['EncodedPixels'].progress_apply(create_classification_labels)

100%|██████████| 192556/192556 [00:00<00:00, 732557.15it/s]


In [11]:
# Sanity-check the labels: rows whose list is [nan] should have class_labels == 0.
updated_train_csv.head()

Unnamed: 0,ImageId,EncodedPixels,class_labels
0,00003e153.jpg,[nan],0
1,0001124c7.jpg,[nan],0
2,000155de5.jpg,[264661 17 265429 33 266197 33 266965 33 26773...,1
3,000194a2d.jpg,[360486 1 361252 4 362019 5 362785 8 363552 10...,1
4,0001b1832.jpg,[nan],0


In [12]:
# Build each image's file path by prefixing its ImageId with <files_path>train_v2/.
updated_train_csv['fixed_paths'] = updated_train_csv['ImageId'].progress_apply(lambda x: f"{os.environ['files_path']}train_v2/" + x)

100%|██████████| 192556/192556 [00:00<00:00, 520142.92it/s]


In [16]:
def find_bad_ones(data):
    """Return the paths in ``data`` that cannot be read and decoded as JPEG.

    Each path is read with ``tf.io.read_file`` and decoded with
    ``tf.image.decode_jpeg`` (3 channels); the decoded image is discarded.

    Args:
        data: Iterable of image file paths.

    Returns:
        List of the paths for which reading or decoding raised an exception,
        in the order they were encountered.
    """
    bad_ones = []
    for path in tqdm(data):
        try:
            raw = tf.io.read_file(path)
            tf.image.decode_jpeg(raw, channels = 3)
        except Exception:
            # Exception (not a bare ``except``) so KeyboardInterrupt and
            # SystemExit still abort this long scan instead of being
            # recorded as a bad file.
            bad_ones.append(path)
    return bad_ones

In [17]:
# Every image path, as a plain list, for the readability scan.
to_check = updated_train_csv['fixed_paths'].values.tolist()

In [18]:
# Slow: reads and decodes every image once (about 8 minutes in the recorded run below).
bad_ones = find_bad_ones(to_check)

100%|██████████| 192556/192556 [08:15<00:00, 388.26it/s]


In [22]:
# Show which files failed to read/decode.
bad_ones

['/home/ubuntu/files/train_v2/6384c3e78.jpg']

In [25]:
# Drop every file the scan flagged as unreadable. Filtering on ``bad_ones``
# (rather than a hard-coded absolute path) keeps this cell correct when
# files_path points somewhere else or the set of bad files changes.
updated_train_csv = updated_train_csv[~updated_train_csv['fixed_paths'].isin(bad_ones)]

In [26]:
def split_datasets(data, test_size = 0.01):
    """Split ``data`` into train, validation and test subsets.

    ``train_test_split`` is applied twice with the same ``test_size`` and a
    fixed ``random_state`` of 42: first to carve the test subset off ``data``,
    then to carve the validation subset off what remains.

    Args:
        data: Collection accepted by ``train_test_split``.
        test_size: Value forwarded as ``test_size`` to both splits.

    Returns:
        Tuple ``(train, val, test)``.
    """
    split_options = {'test_size': test_size, 'random_state': 42}
    remainder, test = train_test_split(data, **split_options)
    train, val = train_test_split(remainder, **split_options)
    return train, val, test

In [27]:
# Default test_size of 0.01 is used for both the test and the validation split.
train, val, test = split_datasets(updated_train_csv)

In [28]:
def read_train_imgs(img, label, shape):
    """Load one image file and return it with its label.

    The file at ``img`` is read, decoded as a 3-channel JPEG, resized to
    ``shape`` and divided by 255 (presumably to scale pixel values to [0, 1]).

    Args:
        img: Path of the image file.
        label: Label passed through unchanged.
        shape: Target size forwarded to ``tf.image.resize``.

    Returns:
        Tuple ``(image, label)``.
    """
    raw_bytes = tf.io.read_file(img)
    decoded = tf.image.decode_jpeg(raw_bytes, channels = 3)
    resized = tf.image.resize(decoded, size = shape)
    return resized / 255, label

In [35]:
def get_data(data, shape = (256, 256), shuffle = True, repeat = True, batch = True, batch_size = 32):
    """Build a ``tf.data`` pipeline of (image, label) pairs from a DataFrame.

    Args:
        data: DataFrame with ``fixed_paths`` (image paths) and
            ``class_labels`` columns.
        shape: Target size handed to ``read_train_imgs`` for every image.
        shuffle: If true, shuffle with a 4096-element buffer and allow
            non-deterministic element order.
        repeat: If true, repeat the dataset indefinitely.
        batch: If true, batch with ``batch_size * REPLICAS`` elements.
        batch_size: Per-replica batch size.

    Returns:
        The resulting ``tf.data.Dataset``, prefetched with ``AUTO``.
    """
    imgs, labels = data['fixed_paths'].values.tolist(), data['class_labels'].values.tolist()
    # One copy of ``shape`` per row so it travels with each element into map().
    shapes = [shape for _ in range(len(imgs))]
    tensor = tf.data.Dataset.from_tensor_slices((imgs, labels, shapes))
    # Caches only the (path, label, shape) triples; decoding happens after.
    tensor = tensor.cache()
    if shuffle:
        tensor = tensor.shuffle(2048 * 2)
        opt = tf.data.Options()
        opt.experimental_deterministic = False
        tensor = tensor.with_options(opt)
    # Repeat AFTER shuffling so each pass over the data is a full, separately
    # shuffled epoch; repeating first lets the shuffle buffer mix elements
    # from adjacent epochs.
    if repeat:
        tensor = tensor.repeat()
    # Decode in parallel: without num_parallel_calls the map is sequential
    # and the non-deterministic option above has nothing to act on.
    tensor = tensor.map(read_train_imgs, num_parallel_calls = AUTO)
    if batch:
        tensor = tensor.batch(batch_size * REPLICAS)
    tensor = tensor.prefetch(AUTO)
    return tensor

In [30]:
# Training data is shuffled and repeated (the get_data defaults).
train_dataset = get_data(train)
# Evaluation data must be finite and in a fixed order, matching how the
# training cell builds val_dataset; a repeated dataset never ends when
# evaluated without an explicit step count.
val_dataset = get_data(val, repeat = False, shuffle = False)
test_dataset = get_data(test, repeat = False, shuffle = False)

In [36]:
def create_model(model_name, shape):
    """Build a single-output sigmoid classifier on a ``keras.applications`` backbone.

    The backbone constructor is looked up by name on ``keras.applications``
    and instantiated without its top, with ``weights = None`` and average
    pooling; a one-unit sigmoid ``Dense`` layer is stacked on its output.
    Everything is created inside ``strategy.scope()``.

    Args:
        model_name: Attribute name on ``keras.applications`` (e.g. 'ResNet50').
        shape: Input shape passed to ``tf.keras.Input``.

    Returns:
        The assembled, uncompiled ``keras.Model``.
    """
    with strategy.scope():
        inputs = tf.keras.Input(shape = shape)
        backbone_factory = getattr(keras.applications, model_name)
        backbone = backbone_factory(include_top = False,
                                    weights = None,
                                    pooling = 'avg')
        features = backbone(inputs)
        head = keras.layers.Dense(1, activation = 'sigmoid')
        return keras.Model(inputs, head(features))
def compile_new_model(model):
    """Compile ``model`` for binary classification and return it.

    Uses ``BinaryCrossentropy`` loss, an ``SGD`` optimizer constructed with
    its defaults, and precision / recall metrics named 'prec' and 'rec'.
    The objects are created and the model compiled inside ``strategy.scope()``.

    Args:
        model: Keras model to compile (compiled in place).

    Returns:
        The same ``model`` object.
    """
    with strategy.scope():
        compile_kwargs = {
            'loss': keras.losses.BinaryCrossentropy(),
            'optimizer': keras.optimizers.SGD(),
            'metrics': [
                keras.metrics.Precision(name = 'prec'),
                keras.metrics.Recall(name = 'rec'),
            ],
        }
        model.compile(**compile_kwargs)
    return model

In [37]:
# Build and compile the ResNet50-based classifier for 256x256 3-channel inputs.
# NOTE(review): the training cell further down rebuilds and recompiles the
# model, so this instance is replaced there.
model = create_model('ResNet50', (256, 256, 3))
model = compile_new_model(model)

In [38]:
# Reset Keras global state before building a fresh model for this run.
K.clear_session()

# TensorBoard logging.
# NOTE(review): 'renset_50_baseline' looks like a typo for 'resnet_50_baseline'
# (compare weights_path below); left unchanged so existing logs stay in place.
log_dir = f"{os.environ['tb_path']}classification/renset_50_baseline/"
os.makedirs(log_dir, exist_ok = True)
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir = log_dir)

# Weight-saving callback from the project's helper module.
weights_path = '../../weights/classification/resnet_50_baseline/'
weights_save = CallbackForSavingModelWeights(weights_path)

# Pass batch_size to get_data explicitly so the pipeline and the
# steps_per_epoch computation below cannot drift apart.
batch_size = 32
train_dataset = get_data(train, batch_size = batch_size)
val_dataset = get_data(val, repeat = False, shuffle = False, batch_size = batch_size)

model = create_model('ResNet50', (256, 256, 3))
model = compile_new_model(model)

# train_dataset repeats indefinitely, so steps_per_epoch defines an epoch.
model_hist = model.fit(
    train_dataset,
    validation_data = val_dataset,
    verbose = 1,
    epochs = 20,
    steps_per_epoch = len(train) // (batch_size * REPLICAS),
    callbacks = [
        tensorboard_callback,
        weights_save
    ]
)

Epoch 1/20
 766/5897 [==>...........................] - ETA: 5:28 - loss: 0.6161 - prec: 0.4005 - rec: 0.1918