In [None]:
import os
import sys
from glob import glob
from loguru import logger
from yaml import load, FullLoader

import pandas as pd
import numpy as np
import rasterio as rio
from matplotlib import pyplot as plt
from skimage import color
from skimage.feature import hog
from skimage.transform import resize

from collections import Counter
from math import floor


In [None]:
# Put the parent directory on the module search path before importing from the
# `functions` package (presumably located there; verify against the repository layout).
sys.path.insert(1,'..')
import functions.fct_misc as misc
from functions.fct_rasters import remove_black_border

# Rebind `logger` to whatever the project helper returns for the loguru logger.
logger = misc.format_logger(logger)

## Functions

In [None]:
def im_list_to_hog(im_list, channel_axis=None):
    """Compute the HOG descriptor and its visualisation for every image.

    The cell size is adapted to each image so that its smallest spatial side
    is covered by about six cells.

    Args:
        im_list (dict): mapping of image name to image array. Despite the
            parameter name, a dict is expected (`.items()` is called on it).
        channel_axis (int or None): axis of the arrays holding the channels,
            forwarded to `skimage.feature.hog`. None means single-channel images.

    Returns:
        tuple: `(hog_images, hog_features)`, two dicts with the same keys as
        `im_list` holding the HOG visualisation and the feature vector.
    """
    hog_images = {}
    hog_features = {}
    for name, image in im_list.items():
        if channel_axis is None:
            spatial_shape = image.shape
        else:
            # The channel axis must not take part in the cell-size computation:
            # with 3 channels it would always be the smallest "side" and give 0.
            channel_idx = channel_axis % image.ndim
            spatial_shape = tuple(
                size for axis, size in enumerate(image.shape) if axis != channel_idx
            )
        # Clamp to 1 so that images with a side below 6 px do not yield a zero-sized cell.
        ppc = max(1, floor(min(spatial_shape) / 6))
        fd, hog_image = hog(
            image,
            orientations=4,
            pixels_per_cell=(ppc, ppc),
            cells_per_block=(4, 4),
            block_norm='L2',
            visualize=True,
            channel_axis=channel_axis,
        )
        hog_images[name] = hog_image
        hog_features[name] = fd

    return hog_images, hog_features

In [None]:
def print_images(image_dict, v_max, tile_names=None):
    """Show up to nine images on a 3x3 grid with a common color scale.

    Args:
        image_dict (dict): mapping of tile file name to image array.
        v_max (float): upper bound of the color scale; the lower bound is 0.
        tile_names (sequence of str, optional): keys of `image_dict` to show,
            filled row by row. Names beyond the ninth are ignored. Defaults to
            the nine reference tiles used throughout the notebook.

    Raises:
        KeyError: if one of the requested names is missing from `image_dict`.
    """
    if tile_names is None:
        tile_names = (
            '0_2570184_1148461.tif', '4_2569842_1149296.tif', '1_2571614_1152259.tif',
            '5_2569300_1148156.tif', '0_2570190_1148491.tif', '10_2580845_1165703.tif',
            '4_2569483_1149035.tif', '5_2569281_1148151.tif', '6_2567727_1147671.tif',
        )

    f, axarr = plt.subplots(3,3)
    # `axarr.flat` walks the grid row by row, matching the order of the names.
    for ax, tile_name in zip(axarr.flat, tile_names):
        ax.imshow(image_dict[tile_name], vmin=0, vmax=v_max)

## Processing

Argument and parameter specification

In [None]:
# Parse the whole YAML configuration, then keep the section dedicated to this notebook.
# NOTE(review): FullLoader is acceptable for a trusted project file; use a safe
# loader if the configuration can come from an untrusted source.
with open('../../config/config_symbol_classif.yaml') as config_file:
    all_settings = load(config_file, Loader=FullLoader)
cfg = all_settings['test_notebooks.py']

Load input parameters

In [None]:
# Directories used by the notebook, all read from the configuration.
WORKING_DIR, OUTPUT_DIR, TILE_DIR = (
    cfg[setting] for setting in ('working_dir', 'output_dir', 'tile_dir')
)

In [None]:
# Move to the working directory first: a relative OUTPUT_DIR is then created inside it.
os.chdir(WORKING_DIR)
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
logger.info('Read data...')
# `glob` returns paths in arbitrary order; sorting keeps the tile order, and the
# row order of everything derived from it, identical from one run to the next.
tile_list = sorted(glob(os.path.join(TILE_DIR, '*.tif')))
if not tile_list:
    # Fail here with a clear message instead of a KeyError in a later cell.
    raise FileNotFoundError(f'No .tif tile found in {TILE_DIR}.')

In [None]:
# Read every tile into memory, keyed by file name, with the first array axis moved last.
image_data = {}
for tile_file in tile_list:
    tile_name = os.path.basename(tile_file)
    with rio.open(tile_file) as dataset:
        tile_array = dataset.read()
    image_data[tile_name] = np.transpose(tile_array, (1, 2, 0))

In [None]:
# Preview nine reference tiles as read from disk, on a 3x3 grid filled row by row.
image_dict = image_data
preview_tiles = (
    '0_2570184_1148461.tif', '4_2569842_1149296.tif', '1_2571614_1152259.tif',
    '5_2569300_1148156.tif', '0_2570190_1148491.tif', '10_2580845_1165703.tif',
    '4_2569483_1149035.tif', '5_2569281_1148151.tif', '6_2567727_1147671.tif',
)
f, axarr = plt.subplots(3,3)
for ax, tile_name in zip(axarr.flat, preview_tiles):
    ax.imshow(image_dict[tile_name])

In [None]:
# Grayscale version of every tile.
data_gray = {
    tile_name: color.rgb2gray(rgb_image)
    for tile_name, rgb_image in image_data.items()
}

In [None]:
# Show the grayscale reference tiles with a color scale going from 0 to 1.
image_dict = data_gray
vmax = 1
print_images(image_dict, vmax)

In [None]:
# Ratio of the band at index 2 over the band at index 0, for every tile.
# Where band 0 is zero the division is skipped and the output keeps its initial
# content, i.e. the band-2 value itself converted to float.
# NOTE(review): those untouched pixels are on a different scale than real ratios.
data_ratio = {}
for tile_name, image in image_data.items():
    numerator = image[:, :, 2]
    denominator = image[:, :, 0]
    data_ratio[tile_name] = np.divide(
        numerator,
        denominator,
        out=numerator.astype(np.float64),
        where=denominator != 0,
    )

In [None]:
# Min-max normalization of every ratio image to the range 0-255.
norm_data_ratio = {}
for tile_name, ratio in data_ratio.items():
    lowest = np.min(ratio)
    value_range = np.max(ratio) - lowest
    norm_data_ratio[tile_name] = (ratio - lowest) / value_range * 255

In [None]:
# Show the raw band ratios of the nine reference tiles; every tile gets its own
# color scale, from 0 to the maximum of that tile.
image_dict = data_ratio
preview_tiles = (
    '0_2570184_1148461.tif', '4_2569842_1149296.tif', '1_2571614_1152259.tif',
    '5_2569300_1148156.tif', '0_2570190_1148491.tif', '10_2580845_1165703.tif',
    '4_2569483_1149035.tif', '5_2569281_1148151.tif', '6_2567727_1147671.tif',
)
f, axarr = plt.subplots(3,3)
for ax, tile_name in zip(axarr.flat, preview_tiles):
    tile_ratio = image_dict[tile_name]
    ax.imshow(tile_ratio, vmin=0, vmax=tile_ratio.max())

In [None]:
# Show the min-max normalized ratios on a common 0-255 color scale.
image_dict = norm_data_ratio
v_max = 255
print_images(image_dict, v_max)

In [None]:
# Value distribution of the raw ratio for one tile, in 25 bins (counts and bin edges).
np.histogram(data_ratio['10_2580845_1165703.tif'], bins=25)

In [None]:
# Same tile after min-max normalization, in 50 bins.
np.histogram(norm_data_ratio['10_2580845_1165703.tif'], bins=50)

In [None]:
# Second-highest distinct ratio value of the tile (`np.unique` returns sorted values).
np.unique(data_ratio['10_2580845_1165703.tif'])[-2]

In [None]:
# Show the min-max normalized ratios again on the 0-255 color scale
# (same display as the earlier cell, kept next to the alternative normalizations).
image_dict = norm_data_ratio
v_max = 255
print_images(image_dict, v_max)

In [None]:
# Normalize on the second-highest distinct value instead of the maximum:
# values up to it are scaled linearly from the minimum, anything above it
# keeps the initial output value of 1. The result is then multiplied by 255.
second_norm_data = {}
for tile_name, ratio in data_ratio.items():
    lowest = np.min(ratio)
    second_highest = np.unique(ratio)[-2]
    scaled = np.divide(
        ratio - lowest,
        second_highest - lowest,
        out=np.ones_like(ratio),
        where=ratio <= second_highest,
    )
    second_norm_data[tile_name] = scaled * 255

In [None]:
# Value distribution after normalizing on the second-highest value, in 50 bins.
np.histogram(second_norm_data['10_2580845_1165703.tif'], bins=50)

In [None]:
# Show the tiles normalized on their second-highest value with the shared
# plotting helper, like the other normalizations. The smallest pixel maps to 0
# and every pixel at or above the second-highest value maps to 255, so the
# fixed 0-255 scale matches the automatic scaling of a bare `imshow`.
image_dict = second_norm_data
v_max = 255
print_images(image_dict, v_max)

In [None]:
# Rescale the ratios below 2 linearly so that the tile minimum maps to 0 and 2
# would map to 255; ratios of 2 or more keep the initial output value of 1 and
# therefore end up at 255.
third_norm_data = {}
for tile_name, ratio in data_ratio.items():
    lowest = np.min(ratio)
    scaled = np.divide(
        ratio - lowest,
        2 - lowest,
        out=np.ones_like(ratio),
        where=ratio < 2,
    )
    third_norm_data[tile_name] = scaled * 255

In [None]:
# Value distribution of one tile after rescaling the ratios capped at 2, in 50 bins.
np.histogram(third_norm_data['6_2567727_1147671.tif'], bins=50)

In [None]:
# Show the ratios capped at 2 and rescaled, on a common 0-255 color scale.
image_dict = third_norm_data
v_max = 255
print_images(image_dict, v_max)

### HOG on scaled data

In [None]:
# HOG visualisations and feature vectors of the rescaled ratio images
# (no channel axis is passed, so the helper uses its default of None).
hog_scaled_images, hog_scaled_features = im_list_to_hog(third_norm_data)

In [None]:
# Value distribution of one HOG visualisation, in 50 bins.
np.histogram(hog_scaled_images['6_2567727_1147671.tif'], bins=50)

In [None]:
# Show the HOG visualisations of the scaled data with the color scale capped at 25.
image_dict = hog_scaled_images
v_max = 25
print_images(image_dict, v_max)

### HOG on grey images

#### Scale images

In [None]:
# Pass every grayscale tile through the project helper `remove_black_border`
# (presumably trims the black frame around the content; see functions.fct_rasters).
cropped_images = {
    tile_name: remove_black_border(gray_image)
    for tile_name, gray_image in data_gray.items()
}

In [None]:
# Shape of one tile before and after the border removal.
for images in (data_gray, cropped_images):
    print(images['13_2584431_1160733.tif'].shape)

In [None]:
# Give every image the same orientation: images wider than tall are transposed
# so that the first axis is never the shorter one.
oriented_grey_images = {}
for tile_name, cropped in cropped_images.items():
    is_wider_than_tall = cropped.shape[1] > cropped.shape[0]
    oriented_grey_images[tile_name] = cropped.transpose(1, 0) if is_wider_than_tall else cropped

Get ratio and size of the images

In [None]:
# Ratio between the first and the second axis length of every oriented image,
# followed by the smallest and largest ratio over all images.
size_ratio = {
    tile_name: image.shape[0] / image.shape[1]
    for tile_name, image in oriented_grey_images.items()
}
ratio_values = size_ratio.values()
print(min(ratio_values), max(ratio_values))

In [None]:
# Largest side of every oriented image, then the minimum, maximum and median of those sides.
max_size_images = {k: max(v.shape) for k, v in oriented_grey_images.items()}
array_values = np.array(list(max_size_images.values()))
print(min(max_size_images.values()), max(max_size_images.values()), np.median(array_values))

In [None]:
# Smallest side of every oriented image, then the minimum, maximum and median of those sides.
# NOTE: `array_values` is rebound here; from now on it holds the smallest sides,
# not the largest ones computed in the previous cell.
min_size_images = {k: min(v.shape) for k, v in oriented_grey_images.items()}
array_values = np.array(list(min_size_images.values()))
print(min(min_size_images.values()), max(min_size_images.values()), np.median(array_values))

In [None]:
# Histogram of the side ratios (one row per image); the title is set on the
# current axes created by `df.hist`.
df = pd.DataFrame.from_dict(size_ratio, orient='index')
df.hist(bins=50)
plt.title('Size Ratio')

In [None]:
# Histogram of the largest image sides (one row per image); `df` is rebound for this plot.
df = pd.DataFrame.from_dict(max_size_images, orient='index')
df.hist(bins=50)
plt.title('Image size')

In [None]:
# Resize every cropped image to one common square shape.
# The target side is the median of the smallest image sides. This is the value
# `array_values` holds at this point in a top-to-bottom run; it is read from
# `min_size_images` explicitly so that re-running the size-statistics cells in
# another order cannot silently change the target size.
# NOTE(review): confirm that the smallest side (not the largest) is the intended reference.
# A median can be fractional, whereas an output shape is a number of pixels:
# truncate it to an integer once, outside of the loop.
new_size = int(np.median(list(min_size_images.values())))

resized_images = {}
for name, image in cropped_images.items():
    if max(image.shape) <= new_size:
        # No side shrinks: keep the library default for anti-aliasing.
        resized_images[name] = resize(image, (new_size, new_size))
    else:
        # At least one side shrinks: smooth before downsampling.
        resized_images[name] = resize(image, (new_size, new_size), anti_aliasing=True)

In [None]:
# Shapes of the nine reference tiles after cropping, three per printed line.
shape_rows = (
    ('0_2570184_1148461.tif', '4_2569842_1149296.tif', '1_2571614_1152259.tif'),
    ('5_2569300_1148156.tif', '0_2570190_1148491.tif', '10_2580845_1165703.tif'),
    ('4_2569483_1149035.tif', '5_2569281_1148151.tif', '6_2567727_1147671.tif'),
)
for row_names in shape_rows:
    print(*(cropped_images[tile_name].shape for tile_name in row_names))

In [None]:
# Shapes of the nine reference tiles after resizing, three per printed line.
shape_rows = (
    ('0_2570184_1148461.tif', '4_2569842_1149296.tif', '1_2571614_1152259.tif'),
    ('5_2569300_1148156.tif', '0_2570190_1148491.tif', '10_2580845_1165703.tif'),
    ('4_2569483_1149035.tif', '5_2569281_1148151.tif', '6_2567727_1147671.tif'),
)
for row_names in shape_rows:
    print(*(resized_images[tile_name].shape for tile_name in row_names))

In [None]:
# Show the resized grayscale tiles with a color scale going from 0 to 1.
image_dict = resized_images
v_max = 1
print_images(image_dict, v_max)

In [None]:
# Check the result of the resizing: minimum, maximum and median of the largest
# side of the resized images.
# NOTE: this rebinds `max_size_images` and `array_values` to the resized images.
max_size_images = {k: max(v.shape) for k, v in resized_images.items()}
array_values = np.array(list(max_size_images.values()))
print(min(max_size_images.values()), max(max_size_images.values()), np.median(array_values))

#### Apply HOG

In [None]:
# HOG visualisations and feature vectors of the resized grayscale images.
hog_gray_images, hog_gray_features = im_list_to_hog(resized_images)

In [None]:
# Value distribution of one HOG visualisation, in 50 bins.
np.histogram(hog_gray_images['6_2567727_1147671.tif'], bins=50)

In [None]:
# Show the HOG visualisations of the grayscale images with the color scale capped at 0.1.
image_dict = hog_gray_images
v_max = 0.1
print_images(image_dict, v_max)

In [None]:
# Shortest and longest feature vector; the two differ if the images do not all
# yield the same number of HOG features.
test_list = list(map(len, hog_gray_features.values()))
min(test_list), max(test_list)

In [None]:
# Feature vector of one reference tile.
print(hog_gray_features['0_2570184_1148461.tif'])

#### Check correlation to keep only the necessary features

In [None]:
# One row per image and one column per HOG feature: the dict yields one column
# per image, hence the transposition.
hog_features_df = pd.DataFrame(hog_gray_features).T

In [None]:
# First rows of the feature table (one row per image).
hog_features_df.head()

In [None]:
# Pairwise correlation between the feature columns.
corr_hog_features = hog_features_df.corr()

In [None]:
# Positions in the correlation matrix whose absolute value exceeds 0.667.
# The diagonal and both (i, j) / (j, i) positions of a pair are still included here.
high_corr_mask = corr_hog_features.abs() > 0.667
high_corr_indices = np.where(high_corr_mask)
row_indices, column_indices = high_corr_indices

In [None]:
# Keep every correlated pair once: only positions above the diagonal (row < column)
# are retained, which also discards the diagonal itself.
upper_pairs = [(row, col) for row, col in zip(row_indices, column_indices) if row < col]
filtered_row_indices = [row for row, _ in upper_pairs]
filtered_column_indices = [col for _, col in upper_pairs]
# Every feature index, repeated once per pair it belongs to.
filtered_indices_list = filtered_row_indices + filtered_column_indices

# One row per pair, with the two feature indices in columns 0 and 1.
filtered_pos_dict = {pair_nbr: [row, col] for pair_nbr, (row, col) in enumerate(upper_pairs)}
filtered_pos_df = pd.DataFrame(filtered_pos_dict).transpose()

In [None]:
# Number of highly correlated pairs each feature takes part in; show the 25 most frequent.
values_count = Counter(filtered_indices_list)
print(values_count.most_common(25))

In [None]:
# Greedily remove features until no highly correlated pair is left.
# Features are visited from the most to the least frequently involved one.
# Dropping a feature resolves every pair it belongs to.
left_duplicates_df = filtered_pos_df.copy()
features_to_drop = []
for value, _ in values_count.most_common():
    if left_duplicates_df.empty:
        break

    involved = (left_duplicates_df[0] == value) | (left_duplicates_df[1] == value)
    if not involved.any():
        # Every pair of this feature was already resolved by an earlier drop:
        # removing it as well would only throw information away.
        continue

    features_to_drop.append(value)
    left_duplicates_df = left_duplicates_df[~involved].copy()

# Build the cleaned table in one step; the original feature table is left untouched.
cleaned_hog_features_df = hog_features_df.drop(columns=features_to_drop)

In [None]:
# Summary of the correlation filter: features dropped, features that appeared in
# at least one highly correlated pair, and features left.
logger.info(f'{hog_features_df.shape[1] - cleaned_hog_features_df.shape[1]} features were dropped because of a high correlation.')
logger.info(f'{len(values_count.keys())} features were listed in the list of indices')
logger.info(f'{cleaned_hog_features_df.shape[1]} hog features are left.')

In [None]:
# First five rows of the feature table after the correlation filter.
cleaned_hog_features_df.head(5)

In [None]:
# Save the retained features to the output directory; the index (tile file names)
# is written as the first column.
cleaned_hog_features_df.to_csv(os.path.join(OUTPUT_DIR, 'hog_features.csv'))