# Notebook for preprocessing and visualizing the measurements

Preprocessing includes assembling the snippets of a measurement into one file, converting them to the same measuring units, deleting NaN values (∞), and cropping the beginning of the measurement so that all measurements have the same length.

In [None]:
import os
import random

import pandas
import numpy as np
from matplotlib import pyplot as plt

In [None]:
def get_file_paths(directory: str, file_format: str = "csv") -> list:
    """Returns the paths of all files with the given ending below the given directory.

    The directory tree is searched recursively. Directories and file names are visited
    in sorted order, so the result is deterministic and does not depend on the order in
    which the file system happens to list its entries.

    Args:
        directory (str): the directory whose tree should be searched
        file_format (str): file ending of the files that should be considered (without the dot)

    Returns:
        list: paths to the matching files as a list of strings (empty if nothing matches
            or the directory does not exist)
    """
    suffix = "." + file_format
    files = []
    for root, dirnames, filenames in os.walk(directory):
        # Sorting in place makes os.walk descend into the subdirectories in sorted order.
        dirnames.sort()
        files.extend(
            os.path.join(root, filename)
            for filename in sorted(filenames)
            if filename.endswith(suffix)
        )
    return files


def load_measurement_original_format(filename: str, return_measuring_units: bool = False) -> pandas.DataFrame:
    """Read a measurement csv file that is stored in the original standard format.

    The file is expected to be ";"-separated, with the channel names in the first line
    and the measuring units in the second line. Values use "," as decimal separator.
    Cells that are exactly "∞" / "-∞" are replaced by the placeholders
    1000000000 / -1000000000 before everything is converted to float.

    Args:
        filename (str): path to the csv file
        return_measuring_units (bool): whether the measuring-unit row should be returned as well

    Returns:
        pandas.DataFrame: dataframe containing the time series of the csv file
        pandas.DataFrame (optional): one-row dataframe holding the measuring units; only
            returned (as second element of a tuple) if return_measuring_units is True
    """
    raw = pandas.read_csv(filename, delimiter=";")

    # The first parsed row holds the measuring units, the samples start in the row after it.
    values = (
        raw[1:]
        .apply(lambda column: column.str.replace(",", "."))
        .replace("∞", 1000000000)
        .replace("-∞", -1000000000)
        .astype(float)
    )

    if not return_measuring_units:
        return values
    return values, raw[:1]


def save_in_original_format(
    df: pandas.DataFrame, output_filename: str,
    first_line: str = "Zeit;Kanal A\n",
    second_line: str = "(s);(V)\n"
):
    """Saves the DataFrame in the original format.

    The file starts with first_line, second_line and an empty line, followed by the
    values of df: ";"-separated, "," as decimal separator, eight decimal places,
    without header and index.

    Args:
        df: pandas.DataFrame that should be saved
        output_filename: csv file to which the time series should be saved
        first_line (str): first line of the created file; should contain channel names
        second_line (str): second line of the created file; should contain measuring units
    """
    # Local import so that this function does not depend on a changed import cell.
    import inspect

    # pandas 1.5 renamed `line_terminator` to `lineterminator` and pandas 2.0 removed the
    # old spelling, so use whichever keyword the installed version accepts.
    to_csv_parameters = inspect.signature(pandas.DataFrame.to_csv).parameters
    terminator_keyword = "lineterminator" if "lineterminator" in to_csv_parameters else "line_terminator"

    with open(output_filename, "w", encoding='UTF-8') as file:
        file.write(first_line)
        file.write(second_line)
        file.write("\n")
        df.to_csv(
            file, index=False, sep=";", header=False,
            float_format="%.8f", decimal=",", **{terminator_keyword: "\n"}
        )

## Preprocessing of the measurement snippets

In [None]:
# Selects which group of measurements is processed by the cells below: "POS" or "NEG".
label = "POS"
assert label in ("POS", "NEG")

In [None]:
# Folder names that belong to the selected label. The same two folder levels are used
# below both the input and the output root directory.
if label == "POS":
    quality_dir = "Gut-Bilder"
    scenario_dir = "Zyklusfahrt ohne Fehler mit angeklemmten Saugrohrdrucksensor"
else:
    quality_dir = "Schlecht-Bilder"
    scenario_dir = (
        "Zyklusfahrten, angeklemmter (defekt - P0172) "
        "Saugrohrdrucksensor+vorkat Lambdasonde,"
    )

# os.path.join instead of hand-written "\" separators: avoids invalid escape sequences
# such as "\M" in the string literals and uses the separator of the current platform.
input_dir = os.path.join(".", "Messdaten", "Messwoche_KW16", quality_dir, scenario_dir)

output_dir = os.path.join(
    ".", "Messdaten", "Messwoche_KW16_zusammengesetzt_gekürzt", quality_dir, scenario_dir
)

In [None]:
# Every subdirectory of input_dir is treated as one measurement; plain files are ignored.
sub_directory_paths = (os.path.join(input_dir, entry) for entry in os.listdir(input_dir))
measurement_dirs = [path for path in sub_directory_paths if os.path.isdir(path)]

In [None]:
# Create the output directory (including missing parents) unless it already exists.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Measuring units every channel is converted to (first entry: time channel).
expected_measuring_units = np.array(['(s)', '(V)', '(V)', '(V)', '(V)', '(V)', '(V)', '(V)', '(V)'],
      dtype=object)

# If True, the units of the first snippet of a measurement replace expected_measuring_units.
use_second_line_of_first_snippet_as_measuring_units = False
# If True, everything up to and including the last row containing an ∞ placeholder is removed.
cut_NaN = True
# Number of samples kept from the end of each measurement (None: keep everything).
desired_length = 42676

for m_dir in measurement_dirs:
    whole_measurement = None
    time_offset = 0

    # Sorted, so that the snippets are concatenated in a reproducible (lexicographic) order
    # instead of the arbitrary order in which the file system lists them.
    csv_snippets = sorted(get_file_paths(m_dir))
    # NOTE(review): presumably guards against snippet numbers with more than one digit,
    # for which lexicographic order would differ from numeric order - confirm.
    assert len(csv_snippets) < 10
    if not csv_snippets:
        # Nothing to assemble; skip instead of writing a file that only contains the header.
        print("Warning: no csv snippets found in ", m_dir)
        continue

    for snippet_file in csv_snippets:
        ts, current_measuring_units = load_measurement_original_format(snippet_file, return_measuring_units = True)
        current_measuring_units = current_measuring_units.to_numpy().ravel()
        # Channels as rows. Explicit copy because the array is modified in place below and
        # to_numpy() may return a read-only view of the DataFrame's data.
        ts_array = ts.to_numpy(copy=True).T

        if whole_measurement is None and use_second_line_of_first_snippet_as_measuring_units:
            expected_measuring_units = current_measuring_units

        # convert between mV and V and between ms and s where the units do not match
        for i, current_unit in enumerate(current_measuring_units):
            expected_unit = expected_measuring_units[i]
            if expected_unit == current_unit:
                continue
            conversion = (expected_unit, current_unit)
            if conversion in (("(mV)", "(V)"), ("(ms)", "(s)")):
                ts_array[i] *= 1000
            elif conversion in (("(V)", "(mV)"), ("(s)", "(ms)")):
                ts_array[i] /= 1000
            else:
                print(
                    "Warning: Unexpected case of measurement unit mismatch!",
                    expected_unit, current_unit
                )

        if whole_measurement is None:
            whole_measurement = ts_array
        else:
            # Shift the time axis of this snippet so that it continues one sampling step
            # after the last time stamp of the previous snippet.
            time_point_difference = ts_array[0,1] - ts_array[0,0]
            ts_array[0] = ts_array[0] - ts_array[0,0] + time_offset + time_point_difference
            whole_measurement = np.append(whole_measurement, ts_array, axis = 1)

        time_offset = ts_array[0, -1]

    # Back to one row per sample and one column per channel.
    df = pandas.DataFrame(whole_measurement).T
    # The unit conversion above scales the ±1000000000 placeholders for ∞ by a factor of
    # 1000 in either direction; map the scaled values back to the placeholder.
    df = df.replace(1000000,1000000000)
    df = df.replace(1000000000000,1000000000)
    df = df.replace(-1000000,-1000000000)
    df = df.replace(-1000000000000,-1000000000)

    # Cells holding the placeholder for +∞ or -∞.
    placeholder_mask = (df == 1000000000) | (df == -1000000000)
    if placeholder_mask.any().any():
        print("NaN in ", m_dir)

        if cut_NaN:
            # Position of the last row containing a placeholder of either sign; only the
            # rows after it are kept.
            last_NaN = np.where(placeholder_mask.any(axis=1))[0][-1]
            df = df[last_NaN+1:]

    if desired_length is not None:
        if len(df) < desired_length:
            print("Warning: fewer than the desired", desired_length, "samples in ", m_dir)
        # Keep the end of the measurement, i.e. crop its beginning.
        df = df[-desired_length:]

    # save measurements
    save_in_original_format(
        df,
        output_filename = os.path.join(output_dir, label + "_" + os.path.basename(m_dir) + ".csv"),
        first_line = "Zeit;Kanal A;Kanal B;Kanal C;Kanal D;Kanal E;Kanal F;Kanal G;Kanal H\n",
        second_line = ";".join(str(unit) for unit in expected_measuring_units) + "\n"
    )

## Visualize the new files

In [None]:
# os.path.join instead of a hand-written "\" separator: avoids the invalid escape
# sequence "\M" in the string literal and uses the separator of the current platform.
new_file_input_dir = os.path.join(".", "Messdaten", "Messwoche_KW16_zusammengesetzt_gekürzt")

csv_files = get_file_paths(new_file_input_dir)

# The preprocessing cell names its output files "<label>_<measurement directory>.csv".
# Only the file name is checked, so that "POS"/"NEG" occurring somewhere in a directory
# name cannot put a file into the wrong list.
positive_files = [csv_file for csv_file in csv_files if os.path.basename(csv_file).startswith("POS_")]
negative_files = [csv_file for csv_file in csv_files if os.path.basename(csv_file).startswith("NEG_")]

### Visualize a positive measurement

In [None]:
# Pick one of the positive measurements at random, print its path and plot every channel
# in its own subplot.
pos_idx = random.randrange(len(positive_files))
chosen_positive_file = positive_files[pos_idx]
print(chosen_positive_file)
df = load_measurement_original_format(chosen_positive_file)
df.plot(subplots=True, figsize=(20,30))

### Visualize a negative measurement

In [None]:
# Pick one of the negative measurements at random, print its path and plot every channel
# in its own subplot.
neg_idx = random.randrange(len(negative_files))
chosen_negative_file = negative_files[neg_idx]
print(chosen_negative_file)
df = load_measurement_original_format(chosen_negative_file)
df.plot(subplots=True, figsize=(20,30))