# PHME21 Data Challenge Submission by Team-GTU
## Cluster Performance

Tested with
* Python 3.7
* Scikit-learn 0.23.2


In [None]:
from os.path import isfile, join
from os import listdir
from numpy import nan
import pandas as pd
import numpy as np
import os
import pickle
from os.path import isfile, join
import re
from collections import Counter

In [None]:
# Get class id and run id from filename
def parse_class_name(fname):
    """Extract the class id and run id from an experiment file name.

    The name must start with ``class``, optionally followed by non-digit
    characters, then ``<class id>_<run id>``, and must contain ``.csv``
    somewhere after the run id (e.g. ``class0_101.csv``).

    Args:
        fname: File name (without directory) to parse.

    Returns:
        A ``(class_id, run_id)`` tuple of digit strings.

    Raises:
        ValueError: If ``fname`` does not follow the expected pattern.
    """
    # Raw string so that ``\d`` is a regex escape rather than an (invalid)
    # string escape; the dot before "csv" is escaped so it only matches a
    # literal ".", otherwise "class0_101csv" would parse with run id "10".
    match = re.match(r"^class[^\d]*(\d+)_(\d+).*\.csv", fname)
    if match is None:
        raise ValueError(
            f"File name {fname!r} does not match 'class<id>_<run>*.csv'"
        )

    return match.groups()

In [None]:
def load_pickle_files():
    """Load the pickled artefacts stored in ``models_to_submit.pkl``.

    The pickle is read from the current working directory and is expected
    to hold an indexable object with at least six entries.

    Returns:
        A 6-tuple ``(fields_dict, sensor_list, scaler, lda, model4, model)``
        taken from positions 0 to 5 of the unpickled object.
    """
    # NOTE: unpickling executes code embedded in the file; only use a
    # models_to_submit.pkl that comes from a trusted source.
    with open("models_to_submit.pkl", "rb") as pkl_handle:
        artefacts = pickle.load(pkl_handle)

    # Positions: 0 fields_dict, 1 sensor_list, 2 scaler, 3 lda, 4 model4, 5 model
    return tuple(artefacts[position] for position in range(6))

In [None]:
# Load one data file and return in a data frame
def load_data_file(path, fname, fields_dict):
    """Load one experiment CSV and return its content as a single data frame.

    The CSV is read as a two-column table (renamed to ``name`` and ``data``).
    For every key of ``fields_dict`` the ``data`` cell of the row with that
    label is evaluated into a Python object, turned into a data frame and
    given the column names listed under ``fields_dict[key]['fields']``. All
    per-key frames are then concatenated column-wise.

    Args:
        path: Directory that contains the file.
        fname: File name; it must be parseable by ``parse_class_name``.
        fields_dict: Mapping from a row label of the CSV to a dict holding
            a ``'fields'`` list with the column names for that row's data.

    Returns:
        The merged data frame with two extra columns, ``class`` and ``run``,
        holding the integer ids parsed from ``fname``.
    """
    df = pd.read_csv(join(path, fname))
    df.columns = ['name', 'data']

    frames = []

    for f in fields_dict:
        # SECURITY NOTE: eval() executes whatever expression the cell holds,
        # so only load files from a trusted source. It is kept (instead of
        # ast.literal_eval) because the cell text may reference names that
        # are resolved at evaluation time -- presumably the reason `nan` is
        # imported from numpy at the top of this file; verify before changing.
        data = eval(df.loc[f, 'data'])  # convert data to array

        new_df = pd.DataFrame(data)
        if (f == 33) and (new_df.shape[1] == 6):  # NumberFuseDetected has a special case!
            # Shift the last column one position to the right and leave a
            # NaN column in its place so the frame has seven columns.
            new_df[6] = new_df[5]
            # np.nan instead of np.NaN: the upper-case alias was removed in
            # NumPy 2.0.
            new_df[5] = np.nan

        new_df.columns = fields_dict[f]['fields']

        frames.append(new_df)

    merged_df = pd.concat(frames, axis=1)  # Merge columns

    c, r = parse_class_name(fname)  # Get class id and run id

    # Add class labels and run id
    merged_df['class'] = int(c)
    merged_df['run'] = int(r)

    return merged_df

In [None]:
def fill_nan_values(data, name, fields):
    """Fill missing values in the given columns by linear interpolation.

    When any of the ``fields`` columns of ``data`` contains a null value,
    those columns are overwritten in ``data`` with their linearly
    interpolated version (filling in both directions). Columns without
    nulls are left untouched.

    Args:
        data: Data frame to process; it is modified in place when nulls
            are present.
        name: Unused; kept for the existing call signature.
        fields: List of column names to check and fill.

    Returns:
        The ``fields`` columns of ``data`` after the optional fill.
    """
    selected = data[fields]
    has_missing = selected.isnull().values.any()

    if not has_missing:
        return data[fields]

    data[fields] = selected.interpolate(method='linear', limit_direction='both')
    return data[fields]

In [None]:
#BONUS POINT: This script is used to assess the performance of the clustering result
#Test CreateCluster is the prototype of the function that each team can develop to cluster fault-free experiments
#This function must handle all the operation to: read the input files and return the clustering result
#Input: 
# - FolderName: The name of the folder where the experiment files are stored
#Output:
# - ExperimentList: the names of the experiments in the input folder.
### IMPORTANT: This list must return the experiments in the same order as processed by the clustering.
# - ClusterLabels: The cluster ID for each experiment in the ExperimentList list

def CreateCluster(FolderName):
    """Assign a cluster label to every experiment file in a folder.

    Each regular file in ``FolderName`` is loaded, its missing values are
    interpolated per field group, the sensor columns are scaled with the
    pickled scaler, and the pickled model predicts a label for every row
    from the scaled ``Temperature_value`` and ``Humidity_value`` columns.
    The most frequent predicted label becomes the label of the experiment.

    Args:
        FolderName: Folder that holds the experiment files.

    Returns:
        ``(ExperimentList, ClusterLabels)``: the file names in processing
        order and the cluster label chosen for each of them.
    """
    ExperimentList = [
        entry for entry in listdir(FolderName) if isfile(join(FolderName, entry))
    ]
    ClusterLabels = []

    fields_dict, sensor_list, scaler, lda, _, model = load_pickle_files()

    for experiment_file in ExperimentList:
        frame = load_data_file(FolderName, experiment_file, fields_dict)

        # Interpolate missing values one field group at a time.
        for key in fields_dict:
            group_name = fields_dict[key]['name']
            group_fields = fields_dict[key]['fields']

            filled = frame.groupby(["class", "run"]).apply(
                fill_nan_values, group_name, group_fields
            )
            filled = filled.reset_index(drop=True)
            frame[group_fields] = filled[group_fields]

        # Keep the sensor columns plus the ids; 'run' is renamed to 'runId'.
        frame = frame[sensor_list + ["class", "run"]]
        frame = frame.rename(columns={'run': 'runId'})
        features = frame[sensor_list + ["class", "runId"]].copy()

        # Scale all sensor columns, then keep the two the model is fed with.
        feature_cols = list(sensor_list)
        scaled = pd.DataFrame(
            scaler.transform(features[feature_cols]),
            index=features.index,
            columns=feature_cols,
        )
        model_input = scaled[['Temperature_value', 'Humidity_value']]

        # Majority vote over the per-row predictions.
        votes = Counter(model.predict(model_input))
        ClusterLabels.append(votes.most_common(1)[0][0])

    return ExperimentList, ClusterLabels

In [None]:
#The LogPerformance function stores the final performance. Only this performance will be used to compute the penalty score of each team
def LogPerformance(ExperimentList, ClusterLabels):
    """Write the clustering result to ``Cluster_Results/ClusterPerformance.csv``.

    The output is a semicolon-separated file with the header
    ``Experiment;ClusterLabel`` followed by one line per experiment. The
    ``Cluster_Results`` directory is created in the current working
    directory when it does not exist yet.

    Args:
        ExperimentList: Experiment names, in the order they were clustered.
        ClusterLabels: Cluster label for each entry of ``ExperimentList``;
            extra trailing labels are ignored.

    Raises:
        IndexError: If there are fewer labels than experiments.
    """
    # Fail before touching the file so no half-written result is left behind.
    if len(ClusterLabels) < len(ExperimentList):
        raise IndexError(
            "ClusterLabels has fewer entries than ExperimentList "
            f"({len(ClusterLabels)} < {len(ExperimentList)})"
        )

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs('Cluster_Results', exist_ok=True)

    # The context manager closes the file even if a write raises.
    with open("Cluster_Results/ClusterPerformance.csv", "w") as performance_output:
        performance_output.write("Experiment;ClusterLabel\n")
        for experiment, cluster_label in zip(ExperimentList, ClusterLabels):
            performance_output.write(f"{experiment};{cluster_label}\n")

    return

In [None]:

#Example of the validation pipeline for the BONUS POINT.
#Data_Cluster/ is the folder where the experiments are stored

def main(FolderName="Data_Cluster/"):
    """Cluster the experiments of a folder, print and log the result.

    Args:
        FolderName: Folder that holds the experiment files; defaults to
            ``"Data_Cluster/"``.
    """
    ExperimentList, ClusterLabels = CreateCluster(FolderName)
    print(ExperimentList, ClusterLabels)

    LogPerformance(ExperimentList, ClusterLabels)
    return


# Run the pipeline only when executed as a script or notebook cell, so that
# importing this module does not trigger file access.
if __name__ == "__main__":
    main()