# 1. TEST PERFORMANCE

In [75]:
from numpy import nan
import pandas as pd
import numpy as np
import os
import pickle
from os.path import isfile, join
import re
from collections import Counter


In [45]:
def load_pickle_files():
    """Unpickle the artefacts used by the classification pipeline.

    Returns:
        A tuple ``(fields_dict, sensor_list, scaler, lda, model)`` with the
        objects unpickled, in that order, from the five ``_sub_*.pickle``
        files listed below.
    """
    pickle_paths = (
        "../../data/results/models/_sub_fields_dict.pickle",
        "../../data/results/models/_sub_sensor_list.pickle",
        "../../data/results/models/_sub_scaler.pickle",
        "../../data/results/models/_sub_lda.pickle",
        "../../data/results/models/_sub_model_xgb.pickle",
    )

    loaded = []
    for pickle_path in pickle_paths:
        with open(pickle_path, "rb") as input_file:
            loaded.append(pickle.load(input_file))

    # Unpack explicitly so a wrong number of paths fails loudly here.
    fields_dict, sensor_list, scaler, lda, model = loaded
    return fields_dict, sensor_list, scaler, lda, model

In [11]:
# Get class id and run id from filename
def parse_class_name(fname):
    """Extract the class id and run id from an experiment file name.

    The name must start with ``class``, followed by optional non-digit
    characters, the class id, an underscore, the run id, and later a
    literal ``.csv`` (for example ``"class_ 0_13_data.csv"``).

    Args:
        fname: File name (not a full path) to parse.

    Returns:
        A tuple ``(class_id, run_id)`` of digit strings.

    Raises:
        ValueError: If ``fname`` does not follow the expected pattern.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence, and
    # the dot before "csv" is escaped so it only matches a literal ".".
    pattern = re.compile(r"^class[^\d]*(\d+)_(\d+).*\.csv")
    match = pattern.match(fname)

    if match is None:
        raise ValueError(
            "File name {!r} does not match 'class<sep><class>_<run>...csv'".format(fname)
        )

    return match.groups()

In [26]:
# Load one data file and return in a data frame
def load_data_file(path, fname):
    """Load one experiment CSV and expand it into a wide data frame.

    Each entry of ``fields_dict`` selects one row of the CSV (by index label)
    whose ``data`` cell holds the text of a nested list; that list becomes a
    group of columns named after the entry's ``fields``. All groups are
    concatenated side by side and ``class`` / ``run`` columns are added from
    the file name.

    NOTE(review): ``fields_dict`` is read from the enclosing (module) scope,
    not passed in - it must be defined before this function is called.

    Args:
        path: Folder that contains the experiment file.
        fname: Experiment name without the ``.csv`` extension.

    Returns:
        The merged data frame with one column per field plus ``class`` and
        ``run``.
    """
    fname = "{}.csv".format(fname)
    df = pd.read_csv(join(path, fname))
    df.columns = ['name', 'data']

    frames = []

    for f in fields_dict:
        fields = fields_dict[f]['fields']

        # SECURITY: eval() executes arbitrary code from the file; only use on
        # trusted data. It is kept (rather than ast.literal_eval) because the
        # cell text presumably contains bare `nan` tokens resolved through the
        # module-level `from numpy import nan` - TODO confirm.
        data = eval(df.loc[f, 'data'])  # convert data to array

        new_df = pd.DataFrame(data)
        if (f == 33) and (new_df.shape[1] == 6):  # NumberFuseDetected has a special case!
            # Shift the last column right and leave an all-NaN column in its
            # place so the group has 7 columns.
            new_df[6] = new_df[5]
            new_df[5] = np.nan  # np.NaN alias was removed in NumPy 2.0

        new_df.columns = fields

        frames.append(new_df)

    merged_df = pd.concat(frames, axis=1)  # Merge columns

    c, r = parse_class_name(fname)  # Get class id and run id

    # Add class labels and run id
    merged_df['class'] = int(c)
    merged_df['run'] = int(r)

    return merged_df

In [32]:
def fill_nan_values(data, name, fields):
    """Linearly interpolate missing values in the given columns.

    Args:
        data: Data frame to fix; its ``fields`` columns are overwritten in
            place when they contain any missing value.
        name: Unused; kept so the function can be passed to ``apply`` with
            the same positional arguments as before.
        fields: List of column names to interpolate.

    Returns:
        The ``fields`` columns of ``data`` after interpolation.
    """
    subset = data[fields]

    if not subset.isnull().values.any():
        # Nothing to fill: hand back the selected columns untouched.
        return data[fields]

    # limit_direction='both' also fills leading and trailing gaps.
    data[fields] = subset.interpolate(method='linear', limit_direction='both')
    return data[fields]

In [56]:
# split a sequence into samples
def create_sequence(sequence, n_steps):
    """Cut a sequence into overlapping windows of ``n_steps`` items.

    Windows start at every position (stride 1) and only complete windows are
    kept, so a sequence shorter than ``n_steps`` yields an empty array.

    Args:
        sequence: Sliceable sequence (list or array) to window.
        n_steps: Number of consecutive items per window.

    Returns:
        ``np.array`` built from the list of windows.
    """
    total = len(sequence)
    windows = [
        sequence[start:start + n_steps]
        for start in range(total)
        if start + n_steps <= total  # drop windows that would run past the end
    ]
    return np.array(windows)

def create_dataset_for_run(df, ws):
    """Build flattened sliding-window samples and labels for one run.

    The sensor columns are selected with the ``sensor_list`` read from the
    enclosing scope.

    Args:
        df: Data frame of a single run; must contain a ``class`` column.
        ws: Window size (number of consecutive rows per sample).

    Returns:
        ``(seq, labels)`` where ``seq`` has one row per window with all window
        values flattened (1D layout), and ``labels`` holds the first
        ``len(seq)`` values of the ``class`` column.
    """
    sensor_values = df.filter(sensor_list).values

    # One window per start position, then flatten each window to a single row.
    windows = create_sequence(sensor_values, n_steps=ws)
    n_windows = windows.shape[0]
    flat_windows = windows.reshape((n_windows, -1))

    # TODO: What is RUL_Max in this context?
    labels = df['class'].values[:n_windows]

    return flat_windows, labels

# TODO: X_t, X_tp1, y_t, y_tp1 should be calculated per run.
# TODO: Then should be merged into one X_t, X_tp1, y_t, y_tp1.

def create_datasets(df, ws):
    """Build windowed samples for every run and stack them.

    Args:
        df: Data frame with a ``runId`` column plus whatever
            ``create_dataset_for_run`` needs.
        ws: Window size forwarded to ``create_dataset_for_run``.

    Returns:
        ``(X, y, run_list, l_len_runs)``: the stacked sample matrix, the
        flattened label vector, the unique run ids in order of appearance,
        and the number of samples contributed by each run.
    """
    run_list = df['runId'].unique()
    l_len_runs = []

    feature_frames = []
    label_frames = []

    for run_id in run_list:
        run_rows = df[df['runId'] == run_id]
        windows, labels = create_dataset_for_run(run_rows, ws)

        # Work on a copy of the windows (placeholder for model-specific padding).
        windows = windows.copy()

        feature_frames.append(pd.DataFrame(windows))
        label_frames.append(pd.DataFrame(labels))
        l_len_runs.append(len(windows))

    stacked_features = pd.concat(feature_frames, axis=0)  # stack runs row-wise
    stacked_labels = pd.concat(label_frames, axis=0)

    return stacked_features.values, stacked_labels.values.flatten(), run_list, l_len_runs

In [214]:
def get_optimum_point_degradation(y_pred, ws):
    """Find the earliest point at which the final majority class is leading.

    The majority class over all predictions is taken as the label. The
    predictions are then consumed in chunks of 5; the scan stops at the first
    chunk after which that class is the most common one so far.

    Args:
        y_pred: Sequence of per-window predicted classes (must be non-empty).
        ws: Window size; added to the returned position.

    Returns:
        ``(label, time)`` where ``time`` is ``ws + chunk_start + 5`` for the
        chunk at which the scan stopped, ``ws + 5`` if only one class was
        ever predicted, or ``len(y_pred)`` if the scan never stops.
    """
    ws_step = 5

    overall = Counter(y_pred)
    true_class = overall.most_common(1)[0][0]

    if len(overall) < 2:  # if only one class
        return true_class, ws + ws_step

    # Running counts of the prefix seen so far, extended chunk by chunk.
    running = Counter()
    for start in range(0, len(y_pred), ws_step):
        running.update(list(y_pred[start:start + ws_step]))

        leader = running.most_common(1)[0][0]
        if leader == true_class:
            return true_class, ws + start + ws_step

    return true_class, len(y_pred)

In [182]:
# true_class, opt_cut_point = get_optimum_point_degradation(run_data, ws)
# true_class, opt_cut_point

In [183]:
# true_class, opt_cut_point = get_optimum_point_degradation(run_data[:opt_cut_point], ws)
# true_class, opt_cut_point

In [242]:
# ws = 40
# x_data = pd.read_excel("../../data/results/excels/xgb_ws_40_fold_1_mcc.xlsx", index_col=False)
# x_data = x_data[["Unnamed: 0", "pred"]]
# run_data = x_data.loc[x_data["Unnamed: 0"] == 25]["pred"]
# run_data

In [243]:
# Counter(run_data)

In [204]:
def get_rank_class(label):
    """Return the feature ranking associated with a predicted label.

    Args:
        label: Predicted class label.

    Returns:
        A new list of four feature names; labels without a dedicated entry
        get the default ranking ``["S1", "S2", "S4", "S3"]``.
    """
    # NOTE(review): the ranking for label 2 lists "S3" twice and omits "S2";
    # reproduced as-is - confirm the intended order.
    known_rankings = (
        (0, ("S4", "S2", "S1", "S3")),
        (2, ("S3", "S4", "S1", "S3")),
        # ...
    )

    for known_label, ranking in known_rankings:
        if label == known_label:
            return list(ranking)

    return ["S1", "S2", "S4", "S3"]
    

In [273]:
#Test Classification is the prototype of the function that each team must develop to classify new data
#This function must handle all the operations to: read the input file in streaming order, make the earliest possible classification, and return the required data
#Input: 
# - Folder Name: The name of the folder where the experiment file is stored
# - Experiment: The experiment name that must be read 
#Output:
# - Predicted Label: the label predicted by the classifier
# - Time for classification: how much time of the input data was required to perform the classification task
# - Ranking: The Features ranked according to the team solution

def TestClassification(FolderName, Experiment):
    """Classify one experiment file and report label, time and ranking.

    Steps: load the pickled artefacts, load the experiment, interpolate
    missing values per field group, scale the sensor columns, build sliding
    windows of 40 rows, project them with the LDA transformer and predict a
    class per window.

    NOTE(review): ``load_data_file`` is called without ``fields_dict``, so it
    relies on a ``fields_dict`` visible in its own scope, not the local one
    loaded here.

    Args:
        FolderName: Folder that contains the experiment file.
        Experiment: Experiment name without the ``.csv`` extension.

    Returns:
        ``(Label, Time, Ranking)`` where ``Label`` and ``Time`` come from
        ``get_optimum_point_degradation`` applied to the per-window
        predictions and ``Ranking`` is ``get_rank_class(Label)``.
    """
    window_size = 40

    fields_dict, sensor_list, scaler, lda, model = load_pickle_files()

    frame = load_data_file(FolderName, Experiment)

    # Interpolate missing values one field group at a time, per (class, run).
    for key in fields_dict:
        group_name = fields_dict[key]['name']
        group_fields = fields_dict[key]['fields']

        filled = frame.groupby(["class", "run"]).apply(fill_nan_values, group_name, group_fields)
        filled = filled.reset_index(drop=True)
        frame[group_fields] = filled[group_fields]

    # Keep only the model's sensor columns plus the bookkeeping columns.
    id_cols = ["class", "runId"]
    frame = frame[sensor_list + ["class", "run"]].rename(columns={'run': 'runId'})
    X_test_df = frame[sensor_list + id_cols].copy()

    scaler_cols = sensor_list.copy()
    scaled = pd.DataFrame(
        scaler.transform(X_test_df[scaler_cols]),
        index=X_test_df.index,
        columns=scaler_cols,
    )
    X_test_df = pd.concat([X_test_df[id_cols], scaled], axis=1)

    X_test, _y_test, _run_list, _run_lengths = create_datasets(X_test_df, window_size)
    y_pred = model.predict(lda.transform(X_test))

    # Outputs
    Label, Time = get_optimum_point_degradation(y_pred, window_size)
    return Label, Time, get_rank_class(Label)

In [206]:
#The Record Performance function store the information of the performance achieved in the 1st run of the classification
def RecordPerformance(Experiment, Label, Time, Ranking):
    """Store the first-run result of an experiment in ``First/<Experiment>.csv``.

    The file has a header line and one ``;``-separated record; an existing
    file for the same experiment is overwritten.

    Args:
        Experiment: Experiment name, used as the file stem.
        Label: Predicted label.
        Time: Time needed for the classification.
        Ranking: Feature ranking; written via ``str()``.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs('First', exist_ok=True)

    # The context manager closes the file even if a write raises.
    with open("First/%s.csv" % Experiment, "w") as PerformanceOutput:
        PerformanceOutput.write("Experiment;Label;Time;Ranking\n")
        PerformanceOutput.write(f"{Experiment};{Label};{Time};{Ranking}\n")

    return

#The CutExperiment function is used to cut the input experiment in time order. For each experiment, the cut is performed according to the time to classification declared by the team
def CutExperiment(FolderName, Experiment, Time):
    """Write a copy of an experiment truncated to its first ``Time`` records.

    Every row of ``<FolderName>/<Experiment>.csv`` holds a field identifier
    and the text of a list; each list is cut with ``[:Time]`` and the result
    is saved as ``Cut/<Experiment>.csv`` with an empty first column name and
    the original name of the second column.

    Args:
        FolderName: Folder that contains the input experiment.
        Experiment: Experiment name without the ``.csv`` extension.
        Time: Number of leading records to keep per field.
    """
    os.makedirs('Cut', exist_ok=True)

    data = pd.read_csv(os.path.join(FolderName, "%s.csv" % Experiment), sep=",")

    # SECURITY: eval() executes arbitrary code from the file; only use on
    # trusted data. Kept because the list text presumably contains bare `nan`
    # tokens that ast.literal_eval cannot parse - TODO confirm.
    # Rows are collected first and the frame is built once: DataFrame.append
    # was removed in pandas 2.0 (and was quadratic when called in a loop).
    rows = [
        (field, eval(raw_records)[:Time])
        for field, raw_records in zip(data.iloc[:, 0], data.iloc[:, 1])
    ]

    df = pd.DataFrame(rows, columns=["", data.columns[1]])
    df.to_csv("Cut/%s.csv" % Experiment, index=False)
    return

#ComparePerformance checks whether the team achieved the same performance in the 1st and in the 2nd run
def ComparePerformance(Experiment,Label, Time, Ranking):
    """Check a result against the first-run record of the same experiment.

    Args:
        Experiment: Experiment name; the record is read from
            ``First/<Experiment>.csv``.
        Label: Label obtained in the current run.
        Time: Classification time obtained in the current run.
        Ranking: Ranking obtained in the current run; compared via ``str()``.

    Returns:
        ``True`` if label, time and ranking all equal the stored values,
        ``False`` at the first difference.
    """
    recorded = pd.read_csv("First/%s.csv"%Experiment,sep=";")

    # Compared in this order; the ranking was stored as its str() form.
    expected = (("Label", Label), ("Time", Time), ("Ranking", str(Ranking)))
    for column, current in expected:
        if recorded[column].iloc[0] != current:
            return False

    return True

#GetWorst function returns the worst performance in case the 1st and 2nd run performance does not match
def GetWorst(Experiment, Label, Time, Ranking):
    """Return the result to log when the two runs disagree.

    The stored first-run record is read from ``First/<Experiment>.csv``.

    Args:
        Experiment: Experiment name.
        Label: Label obtained in the second run.
        Time: Classification time obtained in the second run.
        Ranking: Ranking obtained in the second run; compared via ``str()``.

    Returns:
        ``(label, -1, ranking)`` from the first run if the labels differ;
        the full first-run ``(label, time, ranking)`` if only the time or the
        ranking differs; ``None`` if nothing differs.
    """
    recorded = pd.read_csv("First/%s.csv"%Experiment,sep=";")

    first_label = recorded["Label"].iloc[0]
    first_ranking = recorded["Ranking"].iloc[0]

    # A label mismatch is reported with time -1.
    if first_label != Label:
        return first_label, -1, first_ranking

    first_time = recorded["Time"].iloc[0]
    if first_time != Time or first_ranking != str(Ranking):
        return first_label, first_time, first_ranking

    return

#Logperformance function stores the final performance. Only this performance will be used to compute the Penalty score of each team
def LogPerformance(Experiment,Label, Time, Ranking):
    """Store the final result of an experiment in ``Results/<Experiment>.csv``.

    The file has a header line and one ``;``-separated record; an existing
    file for the same experiment is overwritten.

    Args:
        Experiment: Experiment name, used as the file stem.
        Label: Final label.
        Time: Final classification time.
        Ranking: Final feature ranking; written via ``str()``.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs('Results', exist_ok=True)

    # The context manager closes the file even if a write raises.
    with open("Results/%s.csv" % Experiment, "w") as PerformanceOutput:
        PerformanceOutput.write("Experiment;Label;Time;Ranking\n")
        PerformanceOutput.write(f"{Experiment};{Label};{Time};{Ranking}\n")

    return

In [241]:
#Example of the validation pipeline using a single experiment.
#Data/ is the folder where the experiment is stored
#class_0_0_data is the experiment name
#Cut/ is the folder where only the cut experiment will be saved

def main():
    """Run the two-pass validation pipeline on a single experiment.

    The experiment is classified from ``Data/``, the result is recorded, the
    experiment is cut at the reported time into ``Cut/``, and the cut file is
    classified again. If the second result differs from the first, the result
    chosen by ``GetWorst`` is logged instead.
    """
    experiment = "class_ 0_0_data"

    # First pass on the full experiment.
    source_folder = "Data/"
    Label, Time, Ranking = TestClassification(source_folder, experiment)
    RecordPerformance(experiment, Label, Time, Ranking)
    CutExperiment(source_folder, experiment, Time)

    # Second pass on the truncated copy.
    Label, Time, Ranking = TestClassification("Cut/", experiment)

    if not ComparePerformance(experiment, Label, Time, Ranking):
        Label, Time, Ranking = GetWorst(experiment, Label, Time, Ranking)

    LogPerformance(experiment, Label, Time, Ranking)
    return
    
    
# Run the single-experiment validation example when this cell executes.
main()

(361, 249)
most_commons: {0: 285, 5: 20}
0 5 [11, 11, 11, 11, 11] 5
5 10 [11, 11, 11, 11, 11] 10
10 15 [11, 11, 11, 11, 11] 15
15 20 [11, 5, 5, 5, 5] 20
20 25 [5, 5, 5, 5, 5] 25
25 30 [5, 5, 5, 5, 5] 30
30 35 [5, 5, 5, 5, 5] 35
35 40 [5, 2, 0, 0, 0] 40
40 45 [0, 0, 0, 0, 0] 45
45 50 [0, 0, 0, 0, 0] 50
50 55 [0, 0, 0, 0, 0] 55
55 60 [0, 0, 0, 0, 0] 60
0 100 ['S4', 'S2', 'S1', 'S3']
(100, 249)
most_commons: {0: 64, 5: 20}
0 5 [11, 11, 11, 11, 11] 5
5 10 [11, 11, 11, 11, 11] 10
10 15 [11, 11, 11, 11, 11] 15
15 20 [11, 5, 5, 5, 5] 20
20 25 [5, 5, 5, 5, 5] 25
25 30 [5, 5, 5, 5, 5] 30
30 35 [5, 5, 5, 5, 5] 35
35 40 [5, 2, 0, 0, 0] 40
40 45 [0, 0, 0, 0, 0] 45
45 50 [0, 0, 0, 0, 0] 50
50 55 [0, 0, 0, 0, 0] 55
55 60 [0, 0, 0, 0, 0] 60
0 100 ['S4', 'S2', 'S1', 'S3']
Equal: True


# 2. CLUSTERING

In [248]:
from os.path import isfile, join
from os import listdir

In [258]:
def load_pickle_files2():
    """Unpickle the artefacts used by the clustering pipeline.

    Returns:
        A tuple ``(fields_dict, sensor_list, scaler, model)`` with the objects
        unpickled, in that order, from the four ``_sub_*.pickle`` files
        listed below.
    """
    pickle_paths = (
        "../../data/results/models/_sub_fields_dict.pickle",
        "../../data/results/models/_sub_sensor_list.pickle",
        "../../data/results/models/_sub_scaler.pickle",
        "../../data/results/models/_sub_model_kmeans.pickle",
    )

    loaded = []
    for pickle_path in pickle_paths:
        with open(pickle_path, "rb") as input_file:
            loaded.append(pickle.load(input_file))

    # Unpack explicitly so a wrong number of paths fails loudly here.
    fields_dict, sensor_list, scaler, model = loaded
    return fields_dict, sensor_list, scaler, model

In [259]:
# Load one data file and return in a data frame
def load_data_file2(path, fname):
    """Load one experiment CSV and expand it into a wide data frame.

    Each entry of ``fields_dict`` selects one row of the CSV (by index label)
    whose ``data`` cell holds the text of a nested list; that list becomes a
    group of columns named after the entry's ``fields``. All groups are
    concatenated side by side and ``class`` / ``run`` columns are added from
    the file name.

    NOTE(review): ``fields_dict`` is read from the enclosing (module) scope,
    not passed in - it must be defined before this function is called.

    Args:
        path: Folder that contains the experiment file.
        fname: File name including the ``.csv`` extension.

    Returns:
        The merged data frame with one column per field plus ``class`` and
        ``run``.
    """
    df = pd.read_csv(join(path, fname))
    df.columns = ['name', 'data']

    frames = []

    for f in fields_dict:
        fields = fields_dict[f]['fields']

        # SECURITY: eval() executes arbitrary code from the file; only use on
        # trusted data. It is kept (rather than ast.literal_eval) because the
        # cell text presumably contains bare `nan` tokens resolved through a
        # module-level `from numpy import nan` - TODO confirm.
        data = eval(df.loc[f, 'data'])  # convert data to array

        new_df = pd.DataFrame(data)
        if (f == 33) and (new_df.shape[1] == 6):  # NumberFuseDetected has a special case!
            # Shift the last column right and leave an all-NaN column in its
            # place so the group has 7 columns.
            new_df[6] = new_df[5]
            new_df[5] = np.nan  # np.NaN alias was removed in NumPy 2.0

        new_df.columns = fields

        frames.append(new_df)

    merged_df = pd.concat(frames, axis=1)  # Merge columns

    c, r = parse_class_name(fname)  # Get class id and run id

    # Add class labels and run id
    merged_df['class'] = int(c)
    merged_df['run'] = int(r)

    return merged_df

In [271]:
#BONUS POINT: This script is used to assess the performance of the clustering result
#Test CreateCluster is the prototype of the function that each team can develop to cluster fault-free experiments
#This function must handle all the operation to: read the input files and return the clustering result
#Input: 
# - Folder Name: The name of the folder where the experiment file are stored
#Output:
# - ExperimentList: the names of the experiments in the input Folder. 
### IMPORTANT: This list must return the experiment in the same order as processed by the clustering. 
# - ClusterLabels: The cluster ID for each Experiment in the ExperimentList list

def CreateCluster(FolderName):
    """Assign a cluster id to every experiment file in a folder.

    For each file the data is loaded, missing values are interpolated per
    field group, the sensor columns are scaled, and the clustering model
    predicts a cluster for every row from the scaled ``Temperature_value``
    and ``Humidity_value`` columns. The most frequent row-level cluster
    becomes the experiment's cluster id.

    Args:
        FolderName: Folder that contains the experiment files.

    Returns:
        ``(ExperimentList, ClusterLabels)``: the file names in processing
        order (the order returned by ``listdir``) and one cluster id per
        file.
    """
    ExperimentList = []
    for entry in listdir(FolderName):
        if isfile(join(FolderName, entry)):
            ExperimentList.append(entry)

    ClusterLabels = []

    fields_dict, sensor_list, scaler, model = load_pickle_files2()

    for Experiment in ExperimentList:

        frame = load_data_file2(FolderName, Experiment)

        # Interpolate missing values one field group at a time, per (class, run).
        for key in fields_dict:
            group_name = fields_dict[key]['name']
            group_fields = fields_dict[key]['fields']

            filled = frame.groupby(["class", "run"]).apply(fill_nan_values, group_name, group_fields)
            filled = filled.reset_index(drop=True)
            frame[group_fields] = filled[group_fields]

        frame = frame[sensor_list + ["class", "run"]].rename(columns={'run': 'runId'})
        X_test_df = frame[sensor_list + ["class", "runId"]].copy()

        # The scaler is applied to all sensor columns; only two of the scaled
        # columns are passed to the clustering model.
        scaler_cols = sensor_list.copy()
        scaled = pd.DataFrame(
            scaler.transform(X_test_df[scaler_cols]),
            index=X_test_df.index,
            columns=scaler_cols,
        )
        X_test = scaled[['Temperature_value', 'Humidity_value']]

        y_pred = model.predict(X_test)

        # Majority vote over the row-level cluster predictions.
        majority_cluster = Counter(y_pred).most_common(1)[0][0]
        ClusterLabels.append(majority_cluster)

    return ExperimentList, ClusterLabels

In [266]:
#Logperformance function stores the final performance. Only this performance will be used to compute the Penalty score of each team
def LogPerformance(ExperimentList, ClusterLabels):
    """Store the clustering result in ``Cluster_Results/ClusterPerformance.csv``.

    The file has a header line followed by one ``;``-separated line per
    experiment; an existing file is overwritten.

    Args:
        ExperimentList: Experiment names, in clustering order.
        ClusterLabels: Cluster id for each entry of ``ExperimentList``
            (entries beyond ``len(ExperimentList)`` are ignored).

    Raises:
        ValueError: If ``ClusterLabels`` has fewer entries than
            ``ExperimentList``.
    """
    # Validate before touching the file so a mismatch cannot leave a
    # half-written result behind.
    if len(ClusterLabels) < len(ExperimentList):
        raise ValueError(
            "ClusterLabels has %d entries but ExperimentList has %d"
            % (len(ClusterLabels), len(ExperimentList))
        )

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs('Cluster_Results', exist_ok=True)

    # The context manager closes the file even if a write raises.
    with open("Cluster_Results/ClusterPerformance.csv", "w") as PerformanceOutput:
        PerformanceOutput.write("Experiment;ClusterLabel\n")
        PerformanceOutput.writelines(
            f"{Experiment};{ClusterLabel}\n"
            for Experiment, ClusterLabel in zip(ExperimentList, ClusterLabels)
        )

    return

In [272]:

#Example of the validation pipeline for the BONUS POINT.
#Data/ is the folder where the experiment is stored

def main2():
    """Cluster every experiment in ``Data/``, print and log the result."""
    experiments, cluster_ids = CreateCluster("Data/")
    print(experiments, cluster_ids)

    LogPerformance(experiments, cluster_ids)
    return

# Run the clustering example when this cell executes.
main2()

['class_ 0_0_data.csv', 'class_ 0_100_data.csv', 'class_ 0_10_data.csv', 'class_ 0_13_data.csv', 'class_ 0_26_data.csv', 'class_ 0_28_data.csv', 'class_ 0_42_data.csv', 'class_ 0_44_data.csv', 'class_ 0_49_data.csv', 'class_ 0_51_data.csv', 'class_ 0_58_data.csv', 'class_ 0_61_data.csv', 'class_ 0_64_data.csv', 'class_ 0_69_data.csv', 'class_ 0_73_data.csv', 'class_ 0_82_data.csv', 'class_ 0_89_data.csv', 'class_ 0_92_data.csv', 'class_ 0_94_data.csv', 'class_ 0_99_data.csv'] [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
