# Data Processing 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mat4py
import stumpy
import matplotlib.pylab as pl
#import ot 
import tensorflow as tf
import keras 
from keras.layers import Dense, Dropout
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from vmdpy import VMD 

import time

KeyboardInterrupt: 

In [None]:
# Load the building-model recordings and reshape them into one DataFrame per sensor.
# Previous data structure from a prior data61 collaboration
# Not the form you would find on the previous website

# Input file is .mat type from Matlab --- engineering collaborations
data = mat4py.loadmat('Building_Model.mat')
print(data.keys())

# Extracting scalar metadata and the raw recordings
nTests = data['nTests']
nHealthy = data['nHealthyTests']
nDamage = data['nDamagedTests']
nSensors = data['nSensors']
X = data['X']

# Constructing columns for the per-sensor data frames built in the loop below
info_cols = ['y_labels','damage_location','damage_level','voltage_level']
X_cols_time = ['Xt_'+str(i) for i in range(len(X[0]))]
# NOTE(review): the extra list nesting is kept as-is. pandas treats a list of
# lists passed as `columns` as index levels, and the later `sens1[col_names]`
# lookup with nested names presumably relies on that -- confirm before flattening.
cols = [X_cols_time + info_cols]

print('Total number of sensors: '+str(nSensors))
print('Total number of tests: '+str(nTests))
print()
print('Total number of healthy data: '+str(nHealthy))
print('Total number of damage data: '+str(nDamage))
print()
# Recordings are stored per sensor, so the instance count is sensors x events
# (24 x 270 = 6480 for this file); the message is built from the loaded values.
print('Total number of data instances: '+str(len(X)) + f' (which is equal to {nSensors} sensors x {nTests} events)')
print('Length of each data recording: '+str(len(X[0]))) # Each data recording is 8192 long, sampled at 1600 Hz = 1600 samples/sec

# data["labels_binary"]: healthy or not. Labels are arranged linearly
# (the 1st sensor is the first 270 datapoints); data["EventIDs"] follows the
# same ordering.

time_start = time.time()

data_dict = {}
# TODO: decide whether the input signals need noise reduction first.
for i in range(1,nSensors+1):

    # Rows that belong to sensor i (nTests consecutive rows per sensor)
    sensor_rows = slice(nTests*(i-1), nTests*i)

    # Extracting Data
    X_time = X[sensor_rows]
    y = data['labels_binary'][sensor_rows]
    event_ids = data['EventIDs'][sensor_rows]
    # Each event ID string carries the LXX, DXX and VXX fields at fixed offsets
    locations = [ L[0][0:3] for L in event_ids ]
    damage_levels = [ L[0][4:7] for L in event_ids ]
    shaker_voltage = [ L[0][8:11] for L in event_ids ]

    # Building Lists for Pandas: signal samples, then label, then the three ID fields
    sensor_data = [
        signal + label + [location, damage_level, voltage]
        for signal, label, location, damage_level, voltage
        in zip(X_time, y, locations, damage_levels, shaker_voltage)
    ]

    # Constructing Time DataFrame
    sensor_time_df = pd.DataFrame(data=sensor_data, columns=cols)
    data_dict['Sensor'+str(i)] = sensor_time_df

    print(f'Data processed for sensor {i}.')

time_end = time.time()

print(f'Time Taken: {time_end-time_start:.2f}s')

sens1 = data_dict["Sensor1"]

sens1.to_csv("sens1.csv")


dict_keys(['SensorList', 'nSensors', 'EventIDs', 'X', 'nTests', 'nDamagedTests', 'nHealthyTests', 'EventSizes', 'labels_binary', 'F'])
Total number of sensors: 24
Total number of tests: 270

Total number of healthy data: 150
Total number of damage data: 120

Total number of data instances: 6480 (which is equal to 24 sensors x 270 events)
Length of each data recording: 8192
Data processed for sensor 1.
Data processed for sensor 2.
Data processed for sensor 3.
Data processed for sensor 4.
Data processed for sensor 5.
Data processed for sensor 6.
Data processed for sensor 7.
Data processed for sensor 8.
Data processed for sensor 9.
Data processed for sensor 10.
Data processed for sensor 11.
Data processed for sensor 12.
Data processed for sensor 13.
Data processed for sensor 14.
Data processed for sensor 15.
Data processed for sensor 16.
Data processed for sensor 17.
Data processed for sensor 18.
Data processed for sensor 19.
Data processed for sensor 20.
Data processed for sensor 21.
Dat

In [None]:
# Some helper functions

# plot function
def multiplot(data, orientation = "vertical", col_names = False):
    '''Plot several datasets as a row or column of line-plot panels.

    Parameters
    ----------
    data : sequence
        Datasets to plot; each item must expose ``.values`` (it is plotted
        with ``axs[i].plot(item.values)``). Must contain at least one item.
    orientation : {"vertical", "horizontal"}
        "vertical" puts the panels in a single row sharing the y-axis;
        "horizontal" stacks them in a single column sharing the x-axis.
    col_names : sequence of str, or False
        Optional x-axis label for each panel. When given, every panel also
        gets the y-label "Amplitude". ``False`` (default) or ``None`` skips
        the labels.

    Raises
    ------
    ValueError
        If ``orientation`` is not one of the two supported values, or if
        ``data`` is empty.
    '''
    # Validate first so a bad argument fails with a clear message instead of
    # an unbound `fig` further down.
    if orientation not in ("vertical", "horizontal"):
        raise ValueError(
            f'orientation must be "vertical" or "horizontal", got {orientation!r}'
        )
    n = len(data)
    if n == 0:
        raise ValueError("data must contain at least one dataset to plot")

    # squeeze=False always returns a 2-D array of Axes, so a single dataset
    # (n == 1) can be indexed the same way as several.
    if orientation == "vertical":
        fig, axs = plt.subplots(1, n, sharey=True, squeeze=False)
    else:
        fig, axs = plt.subplots(n, 1, sharex=True, squeeze=False)
    axs = axs.ravel()

    fig.suptitle("Comparing differing damage levels")
    fig.set_size_inches(15,10)

    # Identity checks avoid the ambiguous truth value of `array != False`.
    label_panels = col_names is not False and col_names is not None
    for i in range(n):
        axs[i].plot(data[i].values, linewidth=0.2)
        if label_panels:
            axs[i].set_xlabel(col_names[i])
            axs[i].set_ylabel("Amplitude")

    return

# Boolean for df.apply()
def boolean_rows(a):
    '''Return the logical AND of the first three entries of ``a``.

    Intended as the callable for ``DataFrame.apply(..., axis=1)``, where ``a``
    is one row of boolean indicators. Plain sequences (list/tuple) are
    accepted as well. Entries beyond the third are ignored.
    '''
    # Access by position explicitly: an integer key on a label-indexed Series
    # only works through pandas' deprecated positional fallback.
    values = a.iloc if hasattr(a, "iloc") else a
    return(values[0] & values[1] & values[2])

def extract_inf(sens):
    '''Split one test record into its signal samples and its metadata fields.

    Input: ``sens`` is a single record such as ``data_dict["Sensor1"].iloc[i]``
    (the i-th test), whose last four entries are the metadata fields
    "y_labels", "damage_location", "damage_level" and "voltage_level".

    Returns the tuple ``(data, y, loc, dam_level, volt)`` where ``data`` is
    ``sens`` without its last four entries.
    '''
    meta_keys = ("y_labels", "damage_location", "damage_level", "voltage_level")
    label, location, level, voltage = [sens[key] for key in meta_keys]
    # Everything before the four trailing metadata entries is the signal.
    samples = sens[:-4]
    return samples, label, location, level, voltage

In [None]:
# Build one long concatenated time series (TS_CAT) from the first few
# recordings of each case. Each case is [damage_location, damage_level, voltage_level].
cases = [['L00','D00','V08'],   ['L1C','DB0','V08'],     ['L1C','DBB','V08'],      ['L13','DBB','V08']]
col_names = [["damage_location"], ["damage_level"], ["voltage_level"]]

# Generating TS_CAT
L00_D = [] # matching rows of sens1, one DataFrame per case
data_L00_D = [] # signal columns only, first num_samples rows per case
ind_D = [] # per-case boolean indicator frames (one column per metadata field)
num_samples = 5 # number of segments you wish to extract from each type of data

for case in cases:
    # Element-wise comparison of the three metadata columns against the case.
    case_indicator = sens1[col_names]==case
    ind_D.append(case_indicator)
    # A row matches when all three indicators are True.
    case_rows = sens1[case_indicator.all(axis=1)]
    L00_D.append(case_rows)
    # Drop the four trailing metadata columns. A dedicated local name is used
    # so the loaded dataset held in `data` is not overwritten by this cell.
    data_L00_D.append(case_rows.iloc[0:num_samples, :-4])

ts_pre_cat = []
for case, case_segments in zip(cases, data_L00_D):
    if case_segments.empty:
        raise ValueError(f"No recordings in sens1 match case {case}")
    # Iterate over the rows actually present, so a case with fewer than
    # num_samples recordings still concatenates instead of raising IndexError.
    segment_rows = [case_segments.iloc[k, :] for k in range(len(case_segments))]
    ts_pre_cat.append(pd.concat(segment_rows))
ts_cat = pd.concat(ts_pre_cat)

ts_cat.to_csv("ts_cat_555.csv")