## Code to move heightmaps to single folder for easier testing. Also storing ids for future backtracking

In [1]:
import fnmatch
import scipy.io as sio
import os
import numpy as np
import pandas as pd

from shutil import copyfile
from random import shuffle

import matplotlib.pyplot as plt

from PIL import Image

from collections import Counter


In [2]:
dest_path = "data\\heparin\\agg"
id_file_name = 'ids.csv'
id_file_name2 = 'ids2.csv'

source_folders = []

source_folders.append(("data\\heparin\\2018.5.15\\prp1\\data", "2018.5.15", 1800))
source_folders.append(("data\\heparin\\2018.5.15\\prp2\\data", "2018.5.15", 3600))
source_folders.append(("data\\heparin\\2018.5.15\\prp3\\data", "2018.5.15", 7200))
source_folders.append(("data\\heparin\\2018.6.1\\prp1\\data", "2018.6.1", 1800))
source_folders.append(("data\\heparin\\2018.6.1\\prp2\\data", "2018.6.1", 3600))
source_folders.append(("data\\heparin\\2018.6.1\\prp3\\data", "2018.6.1", 7200))
source_folders.append(("data\\heparin\\2018.6.5\\prp1\\data", "2018.6.5", 1800))
source_folders.append(("data\\heparin\\2018.6.5\\prp2\\data", "2018.6.5", 3600))
source_folders.append(("data\\heparin\\2018.6.5\\prp3\\data", "2018.6.5", 7200))
source_folders.append(("data\\heparin\\2018.6.6\\prp1\\data", "2018.6.6", 1800))
source_folders.append(("data\\heparin\\2018.6.6\\prp2\\data", "2018.6.6", 3600))
source_folders.append(("data\\heparin\\2018.6.6\\prp3\\data", "2018.6.6", 7200))

In [3]:
write_files = True

Load previous database of ids if available

In [4]:
df = None

for file in os.listdir(dest_path):
    if fnmatch.fnmatch(file, id_file_name):
        df = pd.read_csv(f'{dest_path}\\{file}')

if df is None:
    df = pd.DataFrame(columns=['name', 'date', 'flow_rate', 'source'])

In [5]:
print(df.shape)

(0, 4)


In [6]:
files = []

count = df.shape[0]

for source_path, date, flow in source_folders:
    for file in os.listdir(source_path):
        if fnmatch.fnmatch(file, '*.mat'):            
            hm = sio.loadmat(f'{source_path}\\{file}')['height_r']
        
            if np.max(hm) == 0:
                continue
                
            files.append([str(count), date, flow, file])
            
            if write_files:
                copyfile(f'{source_path}\\{file}', f'{dest_path}\\{count}.mat')
            count += 1
        
df2 = pd.DataFrame(files, columns=['name', 'date', 'flow_rate', 'source'])
df = df.append(df2, ignore_index=True)

In [7]:
df.to_csv(os.path.join(dest_path, id_file_name))

## Find max image size

In [8]:
files = []

max_x = 0
max_y = 0

filenames = []
images = []

for file in os.listdir(dest_path):
    if fnmatch.fnmatch(file, '*.mat'):
        files.append(file)
        
        hm = sio.loadmat(f'{dest_path}\\{file}')['height_r']
        
        if np.max(hm) == 0:
            continue
        
        images.append((hm, file))
        filenames.append(file)
        
        max_x=max(hm.shape[0],max_x)
        max_y=max(hm.shape[1],max_y)
                
print(max_x)
print(max_y)
print(len(images))

685
377
138


## Pad all images to max size

In [9]:
padded_images = []

for m, filename in images:
    filename = filename[:-4]
    padded_images.append((np.pad(np.copy(m), ((0,max_x-m.shape[0]),(0,max_y-m.shape[1])), 'constant', constant_values=(0)), filename))

## Save Padded images as numpy array

In [10]:
for image, filename in padded_images:
    if write_files:
        np.save(f'{dest_path}\\{filename}', np.array(image))


## Data Augmentation

### Flip images along x-axis 

In [11]:
all_imgs = padded_images.copy()
files = []

for image, filename in padded_images:
    new_name = filename+"_flipped"
    flipped_img = np.fliplr(image)
    
    old_image = df.loc[df['name'] == filename].copy()
    date = old_image['date'].values[0]
    flow = old_image['flow_rate'].values[0]
    source = old_image['source'].values[0]
        
    if write_files:
        np.save(f'{dest_path}\\{new_name}', flipped_img)
        
    all_imgs.append((flipped_img, new_name))
    
    files.append([new_name, date, flow, source])

df2 = pd.DataFrame(files, columns=['name', 'date', 'flow_rate', 'source'])
df = df.append(df2, ignore_index=True)
    
df.to_csv(os.path.join(dest_path, id_file_name))

In [12]:
print(len(all_imgs))

276


## Data Analysis

In [13]:
df['flow_rate'].value_counts()

3600    122
1800     78
7200     76
Name: flow_rate, dtype: int64

In [14]:
def plot_images_for_filenames(filenames, labels, rows=4):
    imgs = [np.load(f'{train_path}{filename}.npy') for filename in filenames]
    
    return plot_images(imgs, labels, rows)
    
        
def plot_images(imgs, labels, rows=4):
    # Set figure to 13 inches x 8 inches
    figure = plt.figure(figsize=(20, 20))

    cols = len(imgs) // rows + 1

    for i in range(min(len(imgs), 1000)):
        subplot = figure.add_subplot(rows, cols, i + 1)
        subplot.axis('Off')
        if labels:
            subplot.set_title(labels[i], fontsize=16)
        plt.imshow(imgs[i], cmap='gray')

In [15]:
slow = df[df['flow_rate'] == 1800]
plot_images_for_filenames(list(slow['name']), None, rows=9)

NameError: name 'train_path' is not defined

In [None]:
medium = df[df['flow_rate'] == 3600]
plot_images_for_filenames(list(medium['name']), None, rows=9)

In [None]:
fast = df[df['flow_rate'] == 7200]
plot_images_for_filenames(list(fast['name']), None, rows=9)

In [None]:
df = []

for file in os.listdir(dest_path):
    if fnmatch.fnmatch(file, '*.mat'):
        hm = sio.loadmat(f'{dest_path}\\{file}')['height_r']
        df.append(hm)
        
img_sizes = Counter([i.shape for i in df])

size, freq = zip(*Counter({i: v for i, v in img_sizes.items()}).most_common(20))

plt.figure(figsize=(10, 6))

plt.bar(range(len(freq)), list(freq), align='center')
plt.xticks(range(len(size)), list(size), rotation=70)
plt.title("Image size frequencies (where freq > 1)")

plt.show()