In [None]:
import os
import sys
from loguru import logger
from tqdm import tqdm
from yaml import load, FullLoader

import geopandas as gpd
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Add the parent directory to the module search path so the project-local
# `functions` package can be imported. The entry is relative, so this cell has
# to run before the working directory is changed by the later os.chdir call.
sys.path.insert(1,'..')
import functions.fct_misc as misc

# Rebind `logger` to whatever the project helper returns.
# NOTE(review): presumably the loguru logger with project formatting applied - confirm in fct_misc.
logger = misc.format_logger(logger)

# Processing

In [None]:
# Read the whole YAML configuration, then keep only the section used by this notebook.
config_path = '../../config/config_symbol_classif.yaml'
with open(config_path) as config_file:
    full_config = load(config_file, Loader=FullLoader)

cfg = full_config['test_notebooks.py']

In [None]:
# Directories taken from the configuration section.
WORKING_DIR, OUTPUT_DIR, SVM_DIR = (
    cfg[key] for key in ('working_dir', 'output_dir', 'svm_dir')
)

# Input files taken from the configuration section.
IMAGES_FILE, HOG_FEATURES, BAND_STATS = (
    cfg[key] for key in ('image_gpkg', 'hog_features', 'band_stats')
)

In [None]:
# Move to the working directory before any data file is opened.
os.chdir(WORKING_DIR)

# Both directories are written to further down (the correlation table goes to
# OUTPUT_DIR, the classification results to SVM_DIR), so make sure each one
# exists. Previously only SVM_DIR was created.
for directory in (OUTPUT_DIR, SVM_DIR):
    os.makedirs(directory, exist_ok=True)

Read files

In [None]:
images_gdf = gpd.read_file(IMAGES_FILE)
band_stats_df = pd.read_csv(BAND_STATS)
hog_features_df = pd.read_csv(HOG_FEATURES)

In [None]:
images_gdf.head()

In [None]:
band_stats_df.head()

In [None]:
hog_features_df.columns

In [None]:
# Attach the band statistics to the image table: for each band, the statistic
# columns are suffixed with the band name and joined on `image_name`.
stat_names = ['mean', 'std', 'median', 'min', 'max']

images_w_stats_gdf = images_gdf.copy()
stat_list = []  # names of all the statistic columns added by the loop
for band in band_stats_df.band.unique():
    band_columns = {stat: f'{stat}_{band}' for stat in stat_names}

    sub_band_stats_df = (
        band_stats_df[band_stats_df.band == band]
        .rename(columns=band_columns)
        .drop(columns=['CATEGORY', 'band'])
        # Remove only the '.tif' extension. str.rstrip('.tif') strips every
        # trailing '.', 't', 'i' or 'f' character, which truncates names ending
        # with one of those letters and makes them vanish in the inner join.
        .assign(image_name=lambda df: df.image_name.str.removesuffix('.tif'))
    )

    # Inner join: images without statistics for this band are dropped.
    images_w_stats_gdf = images_w_stats_gdf.merge(sub_band_stats_df, how='inner', on='image_name')

    stat_list.extend(band_columns.values())

In [None]:
# The joins with the band statistics are inner joins, so images without
# statistics disappear silently. Report how many rows were lost.
# (The original f-string had an empty `{}` placeholder, which is a SyntaxError.)
lost_count = len(images_gdf) - len(images_w_stats_gdf)
if lost_count > 0:
    logger.error(f'{lost_count} elements were lost when joining the images and stats.')
else:
    logger.info('No element was lost when joining the images and stats.')

In [None]:
images_w_stats_gdf.columns

In [None]:
corr_band_stats = images_w_stats_gdf[stat_list].corr()
corr_band_stats.to_excel(os.path.join(OUTPUT_DIR, 'band_stats_corr.xlsx'))

In [None]:
# Columns with a high correlation with at least one other column
correlated_stats = ['mean_R', 'std_R', 'mean_G', 'min_G', 'mean_B', 'std_B']
# Columns unlikely to bring information based on the boxplot
uninformative_stats = ['max_R', 'max_G']

images_w_stats_gdf = images_w_stats_gdf.drop(columns=correlated_stats + uninformative_stats)

In [None]:
# Prefix every feature column with 'hog_'. 'Unnamed: 0' is left out of the map
# because it is the column the image name is derived from.
name_map = {col: f'hog_{col}' for col in hog_features_df.columns if col != 'Unnamed: 0'}
hog_features_df = (
    hog_features_df
    .rename(columns=name_map)
    # Remove only the '.tif' extension. str.rstrip('.tif') strips every trailing
    # '.', 't', 'i' or 'f' character and corrupts names ending with those
    # letters, so they no longer match in the join on `image_name`.
    .assign(image_name=lambda df: df['Unnamed: 0'].str.removesuffix('.tif'))
    .drop(columns=['Unnamed: 0'])
)


In [None]:
hog_features_df.head()

In [None]:
features_gdf = images_w_stats_gdf.merge(hog_features_df, how='inner', on='image_name')

In [None]:
features_gdf.head()

In [None]:
len(images_w_stats_gdf) - len(features_gdf)

In [None]:
features_gdf.columns

In [None]:
features_list = [col for col in features_gdf.columns if col.split('_')[0] in ['min', 'median', 'std', 'max', 'hog']]

In [None]:
features_gdf[features_list].to_numpy()

In [None]:
# Split features, labels and image names with the same shuffle: 20% of the
# rows go to the test set, with a fixed seed for reproducibility.
feature_matrix = features_gdf[features_list].to_numpy()
data_trn, data_tst, label_trn, label_tst, image_trn, image_tst = train_test_split(
    feature_matrix,
    features_gdf.CATEGORY,
    features_gdf.image_name,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Number of samples per class in each split. The outer join keeps the classes
# that are missing from one of the splits; with the default inner join a class
# without any test sample disappeared from the table and was never counted in
# `too_low`.
split_control_df = pd.merge(
    label_trn.value_counts().reset_index(),
    label_tst.value_counts().reset_index(),
    on='CATEGORY', how='outer', suffixes=('_trn', '_tst'),
)
count_columns = ['count_trn', 'count_tst']
# A class absent from a split has no count there: treat it as zero samples.
split_control_df[count_columns] = split_control_df[count_columns].fillna(0).astype(int)

# Share of each class that ended up in the test set.
split_control_df['part_tst'] = round(
    split_control_df['count_tst'] / (split_control_df['count_tst'] + split_control_df['count_trn']), 3
)
# NOTE(review): 0.17 is presumably a tolerance below the 0.2 test_size requested for the split - confirm.
too_low = split_control_df[split_control_df['part_tst'] < 0.17].shape[0]

print(too_low)
split_control_df

In [None]:
# https://scikit-learn.org/stable/modules/svm.html#tips-on-practical-use
scaler = StandardScaler()
svc_model = svm.SVC(cache_size=1000, random_state=42)

# Candidate values of C: 0.5 to 4.9 in steps of 0.1.
c_values = [tenths / 10 for tenths in range(5, 50)]
parameters = dict(
    C=c_values,
    kernel=['linear', 'rbf', 'sigmoid'],
    gamma=['scale', 'auto'],
)
# Search over every combination, ranked by weighted F1 score.
clf = GridSearchCV(
    estimator=svc_model,
    param_grid=parameters,
    scoring='f1_weighted',
    n_jobs=10,
    verbose=2,
)

In [None]:
data_trn_scaled = scaler.fit_transform(data_trn)
data_tst_scaled = scaler.transform(data_tst)

In [None]:
clf.fit(data_trn_scaled, label_trn)

In [None]:
clf.best_params_

In [None]:
pred_tst = clf.predict(data_tst_scaled)
pred_trn = clf.predict(data_trn_scaled)

In [None]:
# Confusion matrix on the test set (scikit-learn convention: rows are the true
# classes, columns the predicted ones).
# `labels` is passed explicitly so the matrix rows and columns always follow
# clf.classes_. Without it, the matrix only covers the labels found in the test
# labels and predictions, and building the frame with clf.classes_ as
# index/columns fails with a shape mismatch when a class is missing from them.
confusion_matrix_df = pd.DataFrame(
    confusion_matrix(label_tst, pred_tst, labels=clf.classes_),
    index=clf.classes_,
    columns=clf.classes_,
)
confusion_matrix_df

In [None]:
print('METRICS FOR TEST DATA')
print("Accuracy: "+str(round(accuracy_score(label_tst, pred_tst), 3)))
print('\n')
print(classification_report(label_tst, pred_tst))
print('\n')

In [None]:
print('METRICS FOR TRAINING DATA')
print("Accuracy: "+str(round(accuracy_score(label_trn, pred_trn), 3)))
print('\n')
print(classification_report(label_trn, pred_trn))
print('\n')

In [None]:
results_df = pd.DataFrame({'image_name': image_tst, 'pred': pred_tst})
results_gdf = images_gdf.merge(results_df, how='inner', on='image_name')
results_gdf['correct'] = [True if row.CATEGORY == row.pred else False for row in results_gdf.itertuples()]

In [None]:
results_gdf.to_file(os.path.join(SVM_DIR, 'svm_results.gpkg'))