# Initialization

In [None]:
from google.colab import drive
drive.mount('/gdrive')

In [None]:
%cd /gdrive/My Drive/phuc_code_file

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
import numpy as np 
import pandas as pd 
from google.colab import widgets
import matplotlib.pyplot as plt
from sklearn import preprocessing 
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from utils import file_helper, feature_extraction

import time

In [None]:
%ls 

# Construct Filter

In [None]:
from scipy.signal import butter, lfilter, freqz
from scipy import signal 
from utils.peaks_util import get_echo_peaks


class DataProcessor:
    """Turn raw time-domain capture files into echo waveforms and FFT features.

    All tunables are read from ``file_helper.get_config()``. The entry
    ``config['options'][frequency]`` supplies the per-frequency settings
    (``noise_size``, ``echo_size``, ``echo_size_left``,
    ``variance_threshold``, ``value`` and optional ``low``/``high``
    overrides of the global filter cut-offs).
    """

    # Window length (in samples) of the rolling variance used to find echoes.
    ROLLING_WINDOW = 100

    def __init__(self, frequency = "1.14MHz"):
        """Load the configuration and select the settings for *frequency*.

        Raises:
            KeyError: if *frequency* (or a required setting) is not present
                in the configuration.
        """
        self.config = file_helper.get_config()
        # Reuse the configuration loaded above instead of reading it twice.
        self.options = self.config['options']
        self.selected_frequency = frequency
        self.selected_element = self.options[self.selected_frequency]

        # Global cut-offs, overridden by the per-frequency entry when present.
        self.LOW_PASS = self.config['low']
        self.HIGH_PASS = self.config['high']
        if 'low' in self.selected_element:
            self.LOW_PASS = self.selected_element['low']
        if 'high' in self.selected_element:
            self.HIGH_PASS = self.selected_element['high']

        self.NOISE_SIZE = self.selected_element['noise_size']
        self.DATA_HEADERS_SIZE = self.config['raw_data_header']
        self.ECHO_SIZE_LEFT = self.selected_element['echo_size_left']
        self.ECHO_SIZE = self.selected_element['echo_size']
        self.order = self.config['order']
        self.VARIANCE_THRESHOLD = self.selected_element['variance_threshold']

    @staticmethod
    def _butter_coefficients(cutoff, fs, order, btype):
        """Design a digital Butterworth filter with *cutoff* normalised to Nyquist."""
        nyq = 0.5 * fs
        return butter(order, cutoff / nyq, btype=btype, analog=False)

    def butter_lowpass(self, cutoff, fs, order):
        """Return the ``(b, a)`` coefficients of a digital low-pass Butterworth filter."""
        return self._butter_coefficients(cutoff, fs, order, 'low')

    def butter_highpass(self, cutoff, fs, order):
        """Return the ``(b, a)`` coefficients of a digital high-pass Butterworth filter."""
        return self._butter_coefficients(cutoff, fs, order, 'high')

    def apply_filter(self, data):
        """Apply the low-pass and then the high-pass filter with ``filtfilt``.

        ``self.selected_element['value']`` is passed as the sampling rate
        ``fs`` of both filter designs.
        """
        fs = self.selected_element['value']
        b, a = self.butter_lowpass(self.LOW_PASS, fs, order=self.order)
        low_passed = signal.filtfilt(b, a, data)

        b, a = self.butter_highpass(self.HIGH_PASS, fs, self.order)
        return signal.filtfilt(b, a, low_passed)

    def get_time_domain_without_offset(self, filename):
        """Step 1 and 2: load a capture file and remove the offset of every row.

        The first line of the CSV is skipped, the first
        ``DATA_HEADERS_SIZE`` columns are dropped, and each remaining row
        has its own mean subtracted.

        Args:
            filename: path of the CSV file to read.

        Returns:
            pandas.DataFrame with one zero-mean row per input row.
        """
        data_frame = pd.read_csv(filename, skiprows=[0], header=None)
        required_data_frame = data_frame.iloc[:, self.DATA_HEADERS_SIZE:]
        return required_data_frame.sub(required_data_frame.mean(axis=1), axis=0)

    def get_filtered_values(self, data_frame):
        """Step 3: return the rows of *data_frame* as a NumPy array.

        NOTE(review): despite its name this method does not call
        ``apply_filter``; the rows are copied unchanged. Confirm whether
        filtering was disabled on purpose before relying on the name.

        Args:
            data_frame: pandas.DataFrame of time-domain rows.

        Returns:
            numpy.ndarray with one row per data-frame row.
        """
        return np.array(list(data_frame.values))

    def get_echo_set_location(self, data, threshold=None):
        """Locate echo peaks by scanning fixed windows of ``ECHO_SIZE`` samples.

        For every row, each window whose maximum reaches *threshold*
        contributes the position of that maximum. A maximum in the first
        window is ignored when it lies within the first ``ECHO_SIZE_LEFT``
        samples. From those positions the first one is kept, plus every one
        that is more than ``ECHO_SIZE`` samples after the previous position.

        Args:
            data: 2-D array-like of waveforms, one per row (a single 1-D
                waveform is treated as one row).
            threshold: minimum window peak amplitude. Defaults to
                ``self.THRESHOLD`` when that attribute has been set.

        Returns:
            list with one list of peak positions per row.

        Raises:
            ValueError: if no threshold is given and ``self.THRESHOLD`` is
                not set.
        """
        if threshold is None:
            # THRESHOLD is never assigned by __init__, so fail with a clear
            # message instead of an AttributeError deep inside the loop.
            threshold = getattr(self, 'THRESHOLD', None)
        if threshold is None:
            raise ValueError(
                'No threshold given: pass threshold=... or set self.THRESHOLD')

        data = np.atleast_2d(np.array(data))
        # ndarray.size is a plain int; the row length is shape[1].
        no_of_windows = round(data.shape[1] / self.ECHO_SIZE)
        echo_set = []
        for d in data:
            window_peak_locations = []
            for i in range(no_of_windows):
                window = d[i * self.ECHO_SIZE:(i + 1) * self.ECHO_SIZE]
                if window.max() < threshold:
                    continue
                window_peak_location = i * self.ECHO_SIZE + window.argmax()
                if i > 0 or window_peak_location > self.ECHO_SIZE_LEFT:
                    window_peak_locations.append(window_peak_location)

            echos = []
            if window_peak_locations:
                echos = [window_peak_locations[0]]
                prev_echo = window_peak_locations[0]
                for w in window_peak_locations:
                    # Locations are ascending, so the gap is w - prev_echo;
                    # the reversed difference could never exceed ECHO_SIZE.
                    if w - prev_echo > self.ECHO_SIZE:
                        echos.append(w)
                    prev_echo = w
            echo_set.append(echos)
        return echo_set

    def get_echo_with_index(self, data_values, rolling_variance_dataframe_value=None):
        """Return one entry per echo detected in a single waveform.

        Peaks of the rolling variance above ``VARIANCE_THRESHOLD`` are
        reduced to echo positions by ``get_echo_peaks``. Every echo whose
        right edge fits inside the waveform contributes the entire
        waveform, not just the echo window.

        Args:
            data_values: 1-D waveform. When no rolling variance is supplied
                the first ``NOISE_SIZE`` samples are dropped here.
            rolling_variance_dataframe_value: optional precomputed rolling
                variance of *data_values*; computed here when missing or
                empty.

        Returns:
            list of ``{'ECHO': waveform}`` dicts (possibly empty), or
            ``None`` when the variance has no peak above the threshold.
        """
        if (rolling_variance_dataframe_value is None
                or not len(rolling_variance_dataframe_value)):
            data_values = data_values[self.NOISE_SIZE:]
            rolling_variance = pd.Series(data_values).rolling(
                window=self.ROLLING_WINDOW).var().to_numpy()
        else:
            rolling_variance = rolling_variance_dataframe_value

        peaks, _ = signal.find_peaks(
            rolling_variance, height=self.VARIANCE_THRESHOLD)
        if not len(peaks):
            return None

        echo_peaks = get_echo_peaks(
            peaks, self.ECHO_SIZE, self.ECHO_SIZE_LEFT)
        echo_list = []
        for e in echo_peaks:
            echo_right_size = e - self.ECHO_SIZE_LEFT + self.ECHO_SIZE
            if echo_right_size > len(data_values):
                # The echo window would run past the end of the waveform.
                continue
            # Take the entire waveform
            echo_list.append({'ECHO': data_values})
        return echo_list

    def find_echos(self, data_values):
        """Collect the waveforms of all rows that contain at least one echo.

        Args:
            data_values: 2-D numpy array, one waveform per row. The first
                ``NOISE_SIZE`` columns are discarded.

        Returns:
            pandas.DataFrame with one row per detected echo, or ``None``
            when nothing was detected.
        """
        data_values = data_values[:, self.NOISE_SIZE:]
        # Rolling along the sample axis; transposing avoids the deprecated
        # ``axis=1`` argument of DataFrame.rolling.
        rolling_variance = pd.DataFrame(data_values).T.rolling(
            window=self.ROLLING_WINDOW).var().T.to_numpy()

        echo_list = []
        for waveform, variance in zip(data_values, rolling_variance):
            echo_data = self.get_echo_with_index(waveform, variance)
            if echo_data:
                echo_list.extend(echo_data)
        if not echo_list:
            return None
        return pd.DataFrame([entry['ECHO'] for entry in echo_list])

    def get_features_from_echo(self, echos_data, row):
        """Compute FFT features for *echos_data* and label them with ``row['type']``.

        NOTE(review): the first column of *echos_data* is skipped and the
        global ``config['low']``/``config['high']`` are used rather than the
        per-frequency ``LOW_PASS``/``HIGH_PASS`` -- confirm both are intended.

        Args:
            echos_data: pandas.DataFrame of echo waveforms.
            row: mapping with a ``'type'`` entry used as the class label.

        Returns:
            pandas.DataFrame of FFT features with ``type`` as first column.
        """
        df_fft = echos_data.iloc[:, 1:]
        fft_list = feature_extraction.fft_from_data_frame(
            df_fft, self.selected_element['value'],
            self.config['low'],
            self.config['high'])
        fft_set = pd.DataFrame(fft_list)
        fft_set['type'] = row['type']
        # Moving 'type' through the index puts it in the first column.
        fft_set = fft_set.set_index(
            ['type']).reset_index()

        return fft_set


# Data Filtering

In [None]:
def load_file(directory):
    """Return the file entries found under *directory*.

    Thin wrapper around ``file_helper.files_from_directory``.

    Raises:
        Exception: with the message 'Please choose a correct file!' when the
            listing fails; the underlying error is attached as ``__cause__``.
    """
    try:
        return file_helper.files_from_directory(directory)
    except Exception as err:
        # ``except Exception`` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate; chaining keeps the real reason visible.
        raise Exception('Please choose a correct file!') from err

In [None]:
def save_echo_to_file(directory, freq = '1.14MHz', return_fft=False):
    """Extract the echo waveforms of every capture file under *directory*.

    Each file is offset-corrected, scanned for echoes and labelled with the
    upper-cased fourth-from-last component of its absolute path. Files whose
    path has fewer than four '/'-separated components are ignored, and files
    that cannot be read are reported and skipped.

    Args:
        directory: directory handed to ``load_file``.
        freq: key of the frequency configuration used by ``DataProcessor``.
        return_fft: when True, FFT features are computed as well and the
            function returns ``(echo_data_set, fft_data_set)``. By default
            they are not computed, because the result was never returned.

    Returns:
        pandas.DataFrame with a leading ``type`` column followed by the
        waveform samples, rows containing NaN removed (empty when no echo
        was found); or a tuple of two such frames when ``return_fft`` is set.
    """
    files = load_file(directory)
    print(files)
    processor = DataProcessor(frequency = freq)
    # Collect per-file frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole data set on every call.
    echo_frames = []
    fft_frames = []
    begin = time.time()
    for index, file in enumerate(files):
        print("Index: {}".format(index))
        path_parts = file['absolute_path'].split('/')
        if len(path_parts) < 4:
            continue
        try:
            time_domain_data_set = processor.get_time_domain_without_offset(
                file['absolute_path'])
        except Exception as err:
            # Best effort: report the unreadable file and carry on.
            print("Skipping {}: {}".format(file['absolute_path'], err))
            continue
        start = time.time()
        filtered_data_values = processor.get_filtered_values(
            time_domain_data_set)
        echos_data = processor.find_echos(
            filtered_data_values)
        end = time.time()
        print("Process time: {}".format(end - start))
        if not isinstance(echos_data, pd.DataFrame):
            continue
        row = {
            'type': path_parts[-4].upper(),
        }
        if return_fft:
            # Must run before the 'type' column is added to echos_data.
            fft_frames.append(
                processor.get_features_from_echo(echos_data, row))
        echos_data['type'] = row['type']
        # Moving 'type' through the index puts it in the first column.
        echo_frames.append(echos_data.set_index(['type']).reset_index())
    final = time.time()
    print("Total run time: {}".format(final - begin))

    def _combine(frames):
        # pd.concat rejects an empty list, so fall back to an empty frame.
        if not frames:
            return pd.DataFrame()
        return pd.concat(frames, ignore_index=True).dropna()

    echo_data_set = _combine(echo_frames)
    if return_fft:
        return echo_data_set, _combine(fft_frames)
    return echo_data_set

# Export Echo

In [None]:
echo_dataset= save_echo_to_file(directory = './moving_class/114MHz', freq='1.14MHz')

In [None]:
echo_dataset

# Save Datasets into CSV files

In [None]:
# echo_dataset.to_csv(r'full_echo.csv', index = False,header = True)

# fft_dataset.to_csv(r'updated_fft_dataset_merged_newestfreq.csv',index = False, header = True)

In [None]:
# echo_dataset = pd.read_csv('full_echo.csv')

In [None]:
calculate_instance(echo_dataset)

Calculating Instance:

HUMAN :  47607 training instances

BICYCLE :  11057 training instances

WALL :  12974 training instances

PILLAR :  16832 training instances

CAR :  28367 training instances

Non Human:  69230 training instances

# Binary Split

In [None]:
from sklearn.preprocessing import LabelEncoder
def process_data(line, sample_length=5704):
    """Split a chunk of the echo data set into binary labels and features.

    The first column holds the class name; 'HUMAN' becomes 1 and
    'BICYCLE', 'WALL', 'PILLAR', 'CAR' (and 'NON_HUMAN') become 0. Any other
    value is left untouched. The remaining columns are normalised row-wise
    with ``preprocessing.normalize`` and reshaped to
    ``(-1, 1, sample_length)``.

    Args:
        line: pandas.DataFrame chunk, label in column 0.
        sample_length: number of samples per waveform (default 5704, the
            previously hard-coded value).

    Returns:
        ``(label, feature_X)``: a pandas.Series of labels and a 3-D
        numpy.ndarray of features.
    """
    label, feature = line.iloc[:, 0], line.iloc[:, 1:]
    non_human = dict.fromkeys(
        ['BICYCLE', 'WALL', 'PILLAR', 'CAR', 'NON_HUMAN'], 0)
    label = label.replace({**non_human, 'HUMAN': 1})
    feature_norm = preprocessing.normalize(feature)
    feature_X = feature_norm.reshape(-1, 1, sample_length)
    return label, feature_X

def split_dataset(dataframe, method = True):
    """Split *dataframe* into its label column and its feature columns.

    Copies are returned so that later modifications of the labels or
    features are not reflected in *dataframe*.

    Args:
        dataframe: pandas.DataFrame with the label in the first column.
        method: must be truthy. The alternative branch was never
            implemented (it previously failed with UnboundLocalError).

    Returns:
        ``(labels, features)``: first column as a Series, remaining columns
        as a DataFrame.

    Raises:
        ValueError: if *method* is falsy.
    """
    if not method:
        raise ValueError(
            'split_dataset only supports method=True '
            '(the alternative split is not implemented)')
    labels = dataframe.iloc[:, 0].copy()
    features = dataframe.iloc[:, 1:].copy()
    return labels, features

def calculate_instance(data):
    """Print how many rows of *data* belong to each class.

    Rows are counted by their ``type`` column for HUMAN, BICYCLE, WALL,
    PILLAR and CAR, followed by the number of rows whose type is anything
    other than HUMAN.
    """
    kinds = data['type']
    print("Calculating Instance:")
    for kind in ('HUMAN', 'BICYCLE', 'WALL', 'PILLAR', 'CAR'):
        matching = int((kinds == kind).sum())
        print(kind, ": ", matching, "training instances")
    others = int((kinds != 'HUMAN').sum())
    print("Non Human: ", others, "training instances")

def calculate_valid_size (test_size, train_size):
    """Return the validation fraction to request from the train+validation part.

    The result is ``test_size / (1 - test_size)`` rounded to two decimals,
    i.e. the fraction of the data left after removing the test split that
    is as large as the test split itself. ``train_size`` is accepted but
    not used.
    """
    remainder = 1.0 - test_size
    fraction = (test_size * 1.0) / remainder
    return np.around(fraction, 2)

# Train/Test/Valid split function
def ttv_split(features, labels, size_test, size_valid):
    """Stratified train/validation/test split in two stages.

    First ``size_test`` of the data is held out as the test set, then
    ``size_valid`` of the remainder becomes the validation set. Both stages
    shuffle with ``random_state=42`` and stratify on the labels.

    Returns:
        ``X_train, y_train, X_val, y_val, X_test, y_test``.
    """
    common = dict(shuffle = True, random_state = 42)
    X_rest, X_test, y_rest, y_test = train_test_split(
        features, labels, test_size = size_test, stratify = labels, **common)
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size = size_valid, stratify = y_rest, **common)
    return X_train, y_train, X_val, y_val, X_test, y_test

In [None]:
import csv
# Read the echo data set in chunks and build the full label/feature arrays.
chunksize = 1000
# Collect the chunks and concatenate once at the end: np.append inside the
# loop re-copies everything accumulated so far on every iteration.
label_chunks, feature_chunks = [], []
for chunk in pd.read_csv("full_echo.csv",chunksize=chunksize, skiprows = 0):
    label_en, feature_X = process_data(chunk)
    label_chunks.append(np.asarray(label_en))
    feature_chunks.append(feature_X)

# Both results stay None for an empty file; otherwise they are always
# ndarrays, whatever the number of chunks.
label = np.concatenate(label_chunks, axis = 0) if label_chunks else None
feature = np.concatenate(feature_chunks, axis = 0) if feature_chunks else None
del label_chunks, feature_chunks  # release the per-chunk copies
        

In [None]:
# len(val_y[val_y == 1])/len(val_y)
print(len(label[label == 1]))
print(len(label[label == 0]))

In [None]:
X_train_val, X_test, y_train_val, y_test = train_test_split(feature,label, test_size = 0.1, shuffle = True, 
                                                                random_state = 42, stratify = label)

In [None]:
# from numpy import save 
# save('X_train_val.npy', X_train_val)
# save('y_train_val.npy', y_train_val)
# save('X_test.npy', X_test)
# save('y_test.npy',y_test)
train_val_X = np.load('X_train_val.npy')
train_val_y = np.load('y_train_val.npy')
# test_X = np.load('Dataset/split_data/Binary Split/X_test.npy')
# test_y = np.load('Dataset/split_data/Binary Split/y_test.npy')

In [None]:
 X_train, X_val, y_train, y_val = train_test_split(train_val_X, train_val_y, test_size = 0.11 , shuffle = True, 
                                                      random_state = 42, stratify = train_val_y)

In [None]:
from numpy import save 
save('X_train.npy', X_train)
save('y_train.npy', y_train)
save('X_val.npy', X_val)
save('y_val.npy',y_val)

# Multiclass Split

In [None]:
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

def process_data(line, sample_length=5704):
    """Split a chunk of the echo data set into one-hot labels and features.

    The class name in the first column is encoded as HUMAN=0, BICYCLE=1,
    PILLAR=2, WALL=3, CAR=4 and one-hot encoded over five classes. The
    remaining columns are normalised row-wise with
    ``preprocessing.normalize`` and reshaped to ``(-1, 1, sample_length)``.

    Args:
        line: pandas.DataFrame chunk, label in column 0.
        sample_length: number of samples per waveform (default 5704, the
            previously hard-coded value).

    Returns:
        ``(label_en, feature_X)``: one-hot labels and 3-D features.
    """
    class_index = {'HUMAN': 0, 'BICYCLE': 1, 'PILLAR': 2, 'WALL': 3, 'CAR': 4}
    names = line.iloc[:, 0].to_numpy(dtype=object)
    feature = line.iloc[:, 1:]
    # Work on a copy so the chunk itself is never modified; values that are
    # not a known class name are passed on unchanged.
    label = names.copy()
    for name, index in class_index.items():
        label[names == name] = index
    label_en = tf.keras.utils.to_categorical(label, num_classes = 5)
    feature_norm = preprocessing.normalize(feature)
    feature_X = feature_norm.reshape(-1, 1, sample_length)
    return label_en, feature_X

def print_label_en(label):
    """Print the number of entries of *label* per integer class code.

    The captions follow the multiclass encoding HUMAN=0, BICYCLE=1,
    PILLAR=2, WALL=3, CAR=4 (the Wall and Pillar captions used to be
    swapped).

    Args:
        label: array-like of integer class codes supporting boolean-mask
            indexing (e.g. a numpy array).
    """
    captions = ("Human: ", "Bicycle: ", "Pillar: ", "Wall: ", "Car: ")
    for code, caption in enumerate(captions):
        print(caption, len(label[label == code]))

In [None]:
import csv
import time 
# Read the echo data set in chunks and build the one-hot label/feature arrays.
chunksize = 1000
# Collect the chunks and concatenate once at the end: np.append inside the
# loop re-copies everything accumulated so far on every iteration.
label_chunks, feature_chunks = [], []
count = 0

for chunk in pd.read_csv("full_echo.csv",engine = 'python', chunksize=chunksize, skiprows = 0):
    label_en, feature_X = process_data(chunk)
    count += 1 
    print(count,"\n",label_en, label_en.shape)
    label_chunks.append(label_en)
    feature_chunks.append(feature_X)
    stop = time.time()

# Both results stay None for an empty file.
label = np.concatenate(label_chunks, axis = 0) if label_chunks else None
feature = np.concatenate(feature_chunks, axis = 0) if feature_chunks else None
del label_chunks, feature_chunks  # release the per-chunk copies

In [None]:
count_label = tf.math.reduce_sum(label, axis = 0)
print(count_label) 
#HUMAN, BICYCLE, PILLAR, WALL, CAR
#0    ,    1   ,    2  ,  3  ,  4
#Desired Output:
#47607,11057, 16832, 12974, 28367

In [None]:
#Train_Val/Test split
X_train_val, X_test, y_train_val, y_test = train_test_split(feature,label, test_size = 0.1, shuffle = True, 
                                                                random_state = 42, stratify = label)

In [None]:
from numpy import save 
# save('X_train_val.npy', X_train_val)
# save('y_train_val.npy', y_train_val)
# save('X_test.npy', X_test)
# save('y_test.npy',y_test)
train_val_X = np.load('X_train_val.npy')
train_val_y = np.load('y_train_val.npy')

In [None]:
#Train/Val split
X_train, X_val, y_train, y_val = train_test_split(train_val_X, train_val_y, test_size = 0.11 , shuffle = True, 
                                                      random_state = 42, stratify = train_val_y)

In [None]:
from numpy import save 
save('X_train.npy', X_train)
save('y_train.npy', y_train)
save('X_val.npy', X_val)
save('y_val.npy',y_val)

# Binary Split (with one-hot encoding)

In [None]:
# HUMAN [1,0]
# NONHUMAN [0,1]
def process_data(line, sample_length=5704):
    """Split a chunk of the echo data set into one-hot binary labels and features.

    'HUMAN' is encoded as class 0 (one-hot ``[1, 0]``); 'BICYCLE', 'WALL',
    'PILLAR', 'CAR' and 'NON_HUMAN' as class 1 (one-hot ``[0, 1]``). Labels
    are returned as float16. The remaining columns are normalised row-wise
    with ``preprocessing.normalize`` and reshaped to
    ``(-1, 1, sample_length)``.

    Args:
        line: pandas.DataFrame chunk, label in column 0.
        sample_length: number of samples per waveform (default 5704, the
            previously hard-coded value).

    Returns:
        ``(label_en, feature_X)``: one-hot labels and 3-D features.
    """
    class_index = {'HUMAN': 0, 'NON_HUMAN': 1,
                   'BICYCLE': 1, 'WALL': 1, 'PILLAR': 1, 'CAR': 1}
    names = line.iloc[:, 0].to_numpy(dtype=object)
    feature = line.iloc[:, 1:]
    # Work on a copy so the chunk itself is never modified; values that are
    # not a known class name are passed on unchanged.
    label = names.copy()
    for name, index in class_index.items():
        label[names == name] = index
    label_en = tf.keras.utils.to_categorical(label, num_classes = 2, dtype ='float16')
    feature_norm = preprocessing.normalize(feature)
    feature_X = feature_norm.reshape(-1, 1, sample_length)
    return label_en, feature_X

In [None]:
import csv
import time 
# Read the echo data set in chunks and build the one-hot label/feature arrays,
# printing the processing time and the total time of every chunk.
chunksize = 1000
# Collect the chunks and concatenate once at the end: np.append inside the
# loop re-copies everything accumulated so far on every iteration.
label_chunks, feature_chunks = [], []
count = 0

for chunk in pd.read_csv("Dataset/full_echo.csv",engine = 'python', chunksize=chunksize, skiprows = 0):
    start = time.time()
    label_en, feature_X = process_data(chunk)
    end = time.time()
    count += 1 
    print(count,label_en, label_en.shape)
    print(end-start)
    label_chunks.append(label_en)
    feature_chunks.append(feature_X)
    stop = time.time()
    print(stop - start)

# Both results stay None for an empty file.
label = np.concatenate(label_chunks, axis = 0) if label_chunks else None
feature = np.concatenate(feature_chunks, axis = 0) if feature_chunks else None
del label_chunks, feature_chunks  # release the per-chunk copies

In [None]:
# count_label = tf.math.reduce_sum(label,0)
# counting = label.sum(axis=0)
# print(counting)
label_count = tf.math.argmax(label,axis=-1)
unique, counts = np.unique(label_count, return_counts=True)
dict(zip(unique, counts))

In [None]:
X_train_val, X_test, y_train_val, y_test = train_test_split(feature,label, test_size = 0.1, shuffle = True, 
                                                                random_state = 42, stratify = label)

In [None]:
len(X_train_val), len(X_test)

In [None]:
from numpy import save 

# save('X_train_val.npy', X_train_val)
# save('y_train_val.npy', y_train_val)
# save('X_test.npy', X_test)
# save('y_test.npy',y_test)
train_val_X = np.load('X_train_val.npy')
train_val_y = np.load('y_train_val.npy')

In [None]:
 X_train, X_val, y_train, y_val = train_test_split(train_val_X, train_val_y, test_size = 0.11 , shuffle = True, 
                                                      random_state = 42, stratify = train_val_y)

In [None]:
len(X_train), len(X_val)

In [None]:
from numpy import save 
save('X_train.npy', X_train)
save('y_train.npy', y_train)
save('X_val.npy', X_val)
save('y_val.npy',y_val)