# Ablation Study 1 - Simple Regression Model for Classification

##### This notebook contains all the code needed to run our first ablation test. Here, we test whether a simple logistic regression model, using only the three peak-fit parameters (height, width, and position), can successfully classify each pixel as either material or substrate.

# Load in Libraries, Data Files, and Ground Truth Data

In [None]:
import hyperspy.api as hs
import hyperspy.signal_tools as hs_st
import hyperspy.axes as axes
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from hyperspy.signals import Signal1D
from tqdm import tqdm
import csv
import os
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from Ground_Truth_Creator import getGT
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

# Set HyperSpy's logging level to INFO for the rest of the notebook.
hs.set_log_level('INFO')

In [None]:
# Collect the .tif stacks and build the ground-truth result for each one.
#
# os.listdir() returns entries in arbitrary, filesystem-dependent order, so the
# listing is sorted to make the order of `data_list` reproducible between runs.
# NOTE(review): these results are later concatenated in this order and paired
# positionally with the peak-fit CSV values -- confirm that both orders agree.
file_list = sorted(
    name
    for name in os.listdir('Time Series Oxidation Files')
    if name.endswith('.tif')
)

# getGT receives the bare file name (no directory component), exactly as before;
# presumably it resolves the folder itself -- verify in Ground_Truth_Creator.
data_list = [getGT(name) for name in tqdm(file_list)]

# Data Frame Creation and Visualization

In [None]:
# Gather the per-pixel peak-fit parameters from every CSV export and stack them
# together with the ground-truth labels and the acquisition time.
CSV_DIR = 'Peak Parameter CSV Files'

height_Array = []
position_Array = []
width_Array = []
time_Array = []
classification_Array = []


def _read_flat_csv(path):
    """Return every cell of the CSV file at `path` as one flat list.

    Cells are kept as the strings produced by csv.reader; numeric conversion
    is left to the code that consumes the combined array.
    """
    with open(path, 'r', newline='') as f:
        return list(np.array(list(csv.reader(f))).flatten())


# os.listdir() yields bare names in arbitrary order. Sorting keeps the
# Amp/Center/Sigma files in one consistent order so their values line up, and
# joining with CSV_DIR lets the files open regardless of the working directory.
for file in tqdm(sorted(os.listdir(CSV_DIR))):
    path = os.path.join(CSV_DIR, file)

    if file.endswith('Amp.csv'):
        amp_data = _read_flat_csv(path)
        height_Array.extend(amp_data)
        # The file name starts with the time value followed by 'min'.
        minutes = int(file.split('min')[0])
        # One time entry per value in this file (instead of a fixed 90000).
        time_Array.extend([float(minutes)] * len(amp_data))

    elif file.endswith('Center.csv'):
        position_Array.extend(_read_flat_csv(path))

    elif file.endswith('Sigma.csv'):
        width_Array.extend(_read_flat_csv(path))

# Append the ground-truth results in the order they were loaded.
# NOTE(review): assumes each entry of data_list is a flat sequence of labels
# whose order matches the CSV files above -- TODO confirm.
for item in data_list:
    classification_Array.extend(item)

# Fail early with a readable message if the columns cannot be stacked.
column_lengths = {
    'Height': len(height_Array),
    'Center': len(position_Array),
    'Width': len(width_Array),
    'Classification': len(classification_Array),
    'Time': len(time_Array),
}
if len(set(column_lengths.values())) != 1:
    raise ValueError(f"Column lengths do not match: {column_lengths}")

# Combine into one large array (one row per value, one column per quantity)
data_Array = np.array([height_Array, position_Array, width_Array, classification_Array, time_Array]).T

In [None]:
# Wrap the combined array in a DataFrame, naming its five columns in the order
# they were stacked and coercing every value to float.
curve_columns = ['Height', 'Center', 'Width', 'Classification', 'Time']
Curve_DF = pd.DataFrame(data=data_Array, columns=curve_columns, dtype='float')
Curve_DF

In [None]:
# Train a class-weighted logistic regression on the three peak-fit parameters
# and report accuracy plus a confusion matrix on a held-out 20% test split.
SEED = 42  # fixed seed so the split (and the reported metrics) are reproducible

features = Curve_DF[['Height', 'Center', 'Width']]
labels = Curve_DF['Classification']

# Split BEFORE scaling so that no test-set statistics leak into training.
# Stratifying keeps the class ratio the same in the train and test sets.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    features, labels, train_size=0.8, random_state=SEED, stratify=labels
)

# Fit the scaler on the training rows only, then apply it unchanged to the test rows.
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

# The class_weight parameter weights class 1 twice as heavily as class 0 to
# offset the mismatch in class sizes.
LogReg = LogisticRegression(class_weight={1:2, 0:1}, solver='lbfgs')
LogReg.fit(x_train, y_train)
print(f"Accuracy = {np.round(LogReg.score(x_test, y_test), 5)}")
ConfusionMatrixDisplay.from_predictions(y_test, LogReg.predict(x_test), display_labels=LogReg.classes_, cmap='viridis')
plt.show()