<a href="https://colab.research.google.com/github/sunnnymskang/NIST_BLE_challenge/blob/master/1D_ConvNet_NewData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# MOUNT DRIVE + UNZIP THE DATASET ZIPS
# Colab-only cell: `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
# force_remount=True re-mounts even if the drive is already mounted in this session.
drive.mount('/content/gdrive', force_remount=True)
# The `!` lines are IPython shell escapes (not plain Python); each archive is
# extracted into the notebook's current working directory.
# NOTE(review): later cells read from /content/tc4tl and /content/tc4tl_mitre_dev,
# so the working directory is presumably /content -- confirm.
!unzip /content/gdrive/My\ Drive/tc4tl2.zip
!unzip /content/gdrive/My\ Drive/tc4tl_mitre_dev.zip
!unzip /content/gdrive/My\ Drive/MITRE-Range-Angle-Structured-master.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: tc4tl_mitre_dev/data/train/ivjuwoze_tc4tl20.csv  
  inflating: tc4tl_mitre_dev/data/train/fldkwlev_tc4tl20.csv  
  inflating: tc4tl_mitre_dev/data/train/nlbwsooh_tc4tl20.csv  
  inflating: tc4tl_mitre_dev/data/train/conwqlwu_tc4tl20.csv  
  inflating: tc4tl_mitre_dev/data/train/etwnpobu_tc4tl20.csv  
  inflating: tc4tl_mitre_dev/data/train/nyoxsukg_tc4tl20.csv  
  inflating: tc4tl_mitre_dev/data/train/yebkyrrs_tc4tl20.csv  
  inflating: tc4tl_mitre_dev/data/train/jqplylmi_tc4tl20.csv  
  inflating: tc4tl_mitre_dev/data/train/qofbzkgv_tc4tl20.csv  
  inflating: tc4tl_mitre_dev/data/train/kksjdinw_tc4tl20.csv  
  inflating: tc4tl_mitre_dev/data/train/trakzoec_tc4tl20.csv  
  inflating: tc4tl_mitre_dev/data/train/umimzqwb_tc4tl20.csv  
  inflating: tc4tl_mitre_dev/data/train/qbfmuwmg_tc4tl20.csv  
  inflating: tc4tl_mitre_dev/data/train/ylzmucwt_tc4tl20.csv  
  inflating: tc4tl_mitre_dev/data/train/fsjjtahf_tc4t

In [None]:
# HELPER FUNCTIONS

import statistics
from sklearn.model_selection import train_test_split
import os
import random
import math
import time
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt


# Reproducibility setup: pin cuDNN to deterministic behaviour and seed every
# random number generator this notebook uses (torch, numpy, stdlib random).
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(0)
import numpy as np
np.random.seed(0)
random.seed(0)


# Every interval is resampled (readings dropped or duplicated) to exactly this
# many rows by the data loaders below.
NUM_READINGS_PER_INTERVAL = 150
INTERVAL_LENGTH = 4  # length in seconds of each interval


def CSELoss(predictions, targets, epsilon=1e-12):
    """Cross-entropy between predicted probabilities and (soft) one-hot targets.

    Args:
        predictions: tensor of probabilities; the first dimension is the batch.
        targets: tensor of the same shape holding the target distribution.
        epsilon: probabilities are clipped to ``[epsilon, 1 - epsilon]`` first.

    Returns:
        A scalar tensor: the summed cross-entropy divided by the batch size.
    """
    clipped = predictions.clamp(min=epsilon, max=1. - epsilon)
    # A further 1e-9 offset is added before the log, on top of the clipping.
    log_probs = torch.log(clipped + 1e-9)
    weighted_total = (targets * log_probs).sum()
    return -weighted_total / clipped.shape[0]

def _nist_blank_sensor_state():
    """Return the zero-filled "last known reading" table used at the start of an interval."""
    # Insertion order matters: it fixes the column order of every feature row.
    return {
        "Bluetooth": (0,),
        "Accelerometer": (0, 0, 0),
        "Gyroscope": (0, 0, 0),
        "Altitude": (0, 0),
        "Attitude": (0, 0, 0),
        # "Activity": (0,0),
        "Gravity": (0, 0, 0),
        "Magnetic-field": (0, 0, 0),
        "Heading": (0, 0, 0, 0),
    }


def _nist_fit_interval(interval_data):
    """Resize a non-empty interval in place to exactly NUM_READINGS_PER_INTERVAL rows."""
    surplus = len(interval_data) - NUM_READINGS_PER_INTERVAL
    if surplus > 0:
        # randomly remove readings
        for _ in range(surplus):
            interval_data.pop(math.floor(random.random() * len(interval_data)))
    else:
        # randomly duplicate readings
        # todo: try other methods such as averaging
        for _ in range(-surplus):
            random_index = math.floor(random.random() * len(interval_data))
            interval_data.insert(random_index, interval_data[random_index])
    return interval_data


def load_data(key_path, data_path):
    """
    This loads the NIST dataset format data.

    Args:
        key_path: path of a tab-separated key/metadata file with one header
            line. Column 0 is the data file name, column 1 is
            "<transmitter position>_<receiver position>"; rows with exactly
            five columns additionally carry the distance label in column 2.
        data_path: directory holding the data files named in the key file.
            Each data file starts with seven "name,value" lines followed by
            comma-separated readings "time,sensor,value...".

    Returns:
        A 4-tuple ``(X, y, class_labels_possible_values, interval_to_file)``:
        ``X`` is a list of tensors, one per interval, each with
        NUM_READINGS_PER_INTERVAL rows; ``y`` is the list of one-hot label
        tensors (only filled for labelled rows); ``class_labels_possible_values``
        maps a class index back to its distance; ``interval_to_file`` gives the
        source file name of every interval.

    Intervals that end up with no usable readings are skipped instead of
    raising IndexError while padding.
    """
    # first pass to find the various values for the categorical features to use for one-hot encoding
    fixed_variables_possible_values = [set() for _ in range(9)]
    class_labels_possible_values = set()
    for file_id in os.listdir(data_path):
        if file_id.startswith("."):
            # there are some weird extra files starting with .
            continue
        with open(os.path.join(data_path, file_id), 'r', errors="ignore") as data_file:
            for index in range(7):
                value = data_file.readline().strip().split(",")[1]
                fixed_variables_possible_values[index].add(value)
    with open(key_path, 'r', errors="ignore") as key_file:
        key_file.readline()  # skip header
        for line in key_file:
            if not line.strip():
                continue  # tolerate blank lines in the key file
            record = line.split("\t")
            transmitter_position, receiver_position = record[1].split("_")
            if len(record) == 5:
                # has labels
                class_labels_possible_values.add(float(record[2]))
            fixed_variables_possible_values[7].add(transmitter_position)
            fixed_variables_possible_values[8].add(receiver_position)
    fixed_variables_possible_values = [list(values) for values in fixed_variables_possible_values]
    # Deliberately NOT sorted: the class index order is the set's iteration
    # order, exactly as before, so existing index-based code keeps working.
    class_labels_possible_values = list(class_labels_possible_values)

    X = list()  # each intervals index in this list gives you the input features
    y = list()  # each intervals index in this list gives you the label
    interval_to_file = list()  # used to output predictions: source file of each interval
    with open(key_path, 'r', errors="ignore") as key_file:
        key_file.readline()  # skip header
        for line in key_file:
            if not line.strip():
                continue
            record = line.split("\t")
            file_id = record[0]
            num_intervals = 0
            with open(os.path.join(data_path, file_id), 'r', errors="ignore") as data_file:
                # fixed variables tx_device, tx_power, rx_device, tx_carry, rx_carry, rx_pose, tx_pose,
                #                               transmitter_position, receiver_position
                fixed_variables = [data_file.readline().strip().split(",")[1] for _ in range(7)]
                fixed_variables.extend(record[1].split("_"))
                fixed_part = list()
                for variable, key in zip(fixed_variables, fixed_variables_possible_values):
                    fixed_part.extend([int(possible_value == variable) for possible_value in key])
                # fixed_part is currently only reported, not appended to the features
                print("Loading file {} with fixed variables of {}".format(file_id, fixed_part))
                interval_start_time = 0
                interval_data = list()
                # each line only has one sensor's reading; the other values are
                # populated from the previous readings of the same interval
                previous_value = _nist_blank_sensor_state()
                for reading_line in data_file:
                    reading = reading_line.strip().split(",")
                    curr_time = float(reading[0])
                    if (curr_time - interval_start_time) > INTERVAL_LENGTH:
                        # close the current interval (skipped when it has no rows)
                        if interval_data:
                            X.append(_nist_fit_interval(interval_data))
                            num_intervals += 1

                        # reset values
                        interval_start_time = curr_time
                        interval_data = list()
                        previous_value = _nist_blank_sensor_state()
                    sensor_type = reading[1]
                    if sensor_type in {"Pedometer", "Activity"}:
                        # these sensors are ignored for now
                        continue
                    elif sensor_type == "Bluetooth":
                        previous_value[sensor_type] = (float(reading[2]), )
                    elif sensor_type == "Heading":
                        previous_value[sensor_type] = (float(reading[2]), float(reading[3]), float(reading[4]),
                                                       float(reading[5]))
                    elif sensor_type == "Altitude":
                        previous_value[sensor_type] = (float(reading[2]), float(reading[3]))
                    else:
                        previous_value[sensor_type] = (float(reading[2]), float(reading[3]), float(reading[4]))
                    # combine the values in previous_value into one giant list
                    # Uses 0 as angle for nist data
                    interval_data.append([0] + [v for value in previous_value.values() for v in value])  # + fixed_part)
                # the last interval needs to be added manually
                if interval_data:
                    X.append(_nist_fit_interval(interval_data))
                    num_intervals += 1

            if len(record) == 5:
                # this means this file has labels
                distance = float(record[2])
                # one slot per class (previously sized by INTERVAL_LENGTH, which
                # only worked because both happened to be 4)
                label = torch.zeros(len(class_labels_possible_values))
                label[class_labels_possible_values.index(distance)] = 1
                # the same label tensor object is shared by all intervals of the file
                y.extend([label] * num_intervals)

            interval_to_file.extend([file_id] * num_intervals)

            # good for debugging
            # if len(X) > 105:
            #     break
    return [torch.Tensor(interval) for interval in X], y, class_labels_possible_values, interval_to_file

# tc4tl challenge organizer has asked us not to use these files.
# (A missing comma used to fuse "set_70" and "set_1" into the single string
# "set_70set_1", so set_1 was never actually excluded. Duplicates below are
# harmless because this is a set.)
BAD_DATASETS = {
    "set_7", "set_8", "set_15", "set_24", "set_25", "set_29", "set_35", "set_38",
    "set_47", "set_48", "set_58", "set_61", "set_62", "set_63", "set_66", "set_67",
    "set_68", "set_69", "set_70",
    "set_1", "set_18", "set_30", "set_32", "set_33", "set_36", "set_41", "set_45",
    "set_51", "set_59", "set_61", "set_62", "set_63", "set_66", "set_68", "set_69",
    "set_70", "set_72", "set_73",
}
print(len(BAD_DATASETS))

def chunks(lst, n):
    """Yield consecutive slices of ``lst``, each ``len(lst) // n`` items long.

    This splits ``lst`` into roughly ``n`` pieces: exactly ``n`` when the
    length divides evenly, otherwise extra (shorter) trailing pieces carry the
    remainder. When ``lst`` has fewer than ``n`` items the slice length is
    clamped to 1 (one item per chunk) instead of producing a zero step, which
    ``range`` rejects with ValueError. An empty ``lst`` yields nothing.
    """
    interval = max(1, len(lst) // n)
    for i in range(0, len(lst), interval):
        yield lst[i:i + interval]


def _mitre_blank_sensor_state():
    """Return the zero-filled "last known reading" table used at the start of a block."""
    # Insertion order matters: it fixes the column order of every feature row.
    return {
        "Bluetooth": (0,),
        "Accelerometer": (0, 0, 0),
        "Gyroscope": (0, 0, 0),
        "Altitude": (0, 0),
        "Attitude": (0, 0, 0),
        # "Activity": (0, 0),
        "Gravity": (0, 0, 0),
        "Magnetic-field": (0, 0, 0),
        "Heading": (0, 0, 0, 0),
    }


def _mitre_sliding_windows(ten_sec_data):
    """Cut one block of readings into overlapping windows of NUM_READINGS_PER_INTERVAL rows."""
    # nist dataset uses 4 second intervals, so the block is split into ~10
    # pieces and INTERVAL_LENGTH consecutive pieces are joined per window.
    # NOTE(review): the range stops at len - INTERVAL_LENGTH, so exactly 10
    # pieces give 6 windows (the final possible window is not emitted), although
    # the original comment spoke of 7. Kept unchanged to preserve the dataset.
    seconds_data = list(chunks(ten_sec_data, 10))
    windows = []
    for start in range(0, len(seconds_data) - INTERVAL_LENGTH):
        interval_data = [item for sublist in seconds_data[start:start + INTERVAL_LENGTH] for item in sublist]
        if len(interval_data) > NUM_READINGS_PER_INTERVAL:
            # remove readings based on skip (evenly spaced selection)
            selected_indexes = np.round(
                np.linspace(0, len(interval_data) - 1, NUM_READINGS_PER_INTERVAL)).astype(int)
            interval_data = [interval_data[idx] for idx in selected_indexes]
        else:
            # randomly duplicate readings
            # todo: try other methods such as averaging
            for _ in range(NUM_READINGS_PER_INTERVAL - len(interval_data)):
                random_index = math.floor(random.random() * len(interval_data))
                interval_data.insert(random_index, interval_data[random_index])
        windows.append(interval_data)
    return windows


def load_mitre_data(data_path):
    """
    loading the mitre dataset

    Args:
        data_path: directory whose sub-directories (one per experiment, those
            listed in BAD_DATASETS or starting with "." are skipped) contain
            the log files. ``*.swp`` files are ignored.

    Returns:
        ``(X, y)``: ``X`` is a list of tensors with NUM_READINGS_PER_INTERVAL
        rows each, ``y`` the matching one-hot labels over
        ``[1.2, 1.8, 3.0, 4.5]``. Blocks in the 4.5 class are not emitted.

    A log file is a sequence of blocks. Each block starts with a line
    containing "app_name", followed (possibly after other lines) by a line
    containing "Range" whose last comma field is the integer range, then a
    line whose third comma field is the integer angle, then the readings.
    Per-block state (label, range, angle) is reset for every file; readings
    that appear before the first header of a file are ignored because no
    label/angle is known for them. A header truncated by end-of-file ends
    the file instead of raising StopIteration.
    """
    # one hot encoded label vector
    class_labels_possible_values = [1.2, 1.8, 3.0, 4.5]

    X = list()  # each intervals index in this list gives you the input features
    y = list()  # each intervals index in this list gives you the label

    experiment_paths = [os.path.join(data_path, name) for name in os.listdir(data_path)
                        if os.path.isdir(os.path.join(data_path, name))
                        and not name.startswith(".") and name not in BAD_DATASETS]
    for experiment_path in experiment_paths:
        print(experiment_path)
        log_file_paths = [os.path.join(experiment_path, log_filename)
                          for log_filename in os.listdir(experiment_path)
                          if os.path.isfile(os.path.join(experiment_path, log_filename))]
        for log_file_path in log_file_paths:
            if log_file_path.endswith(".swp"):
                continue
            with open(log_file_path, 'r', errors='ignore') as log_file:
                lines = iter(log_file)
                # same "carry the last reading forward" logic as load_data
                previous_value = _mitre_blank_sensor_state()
                # all the feature rows of the block currently being read
                ten_sec_data = list()
                label = None     # one-hot label of the current block
                range_m = None   # class distance of the current block
                angle = None     # angle of the current block; None until a header is seen
                for line in lines:
                    record = line.strip().split(",")
                    if "app_name" in line:
                        # new block: flush the previous one first
                        if label is not None and len(ten_sec_data) > 10 and range_m != 4.5:
                            for window in _mitre_sliding_windows(ten_sec_data):
                                X.append(window)
                                y.append(label)
                        ten_sec_data = list()
                        previous_value = _mitre_blank_sensor_state()
                        try:
                            header_line = next(lines)
                            while "Range" not in header_line:
                                # skip fixed for now
                                header_line = next(lines)
                            distance = int(header_line.split(",")[-1])

                            # bucket the logged range into the class distances
                            if distance <= 5.5:
                                range_m = 1.2
                            elif distance <= 8:
                                range_m = 1.8
                            elif distance <= 12:
                                range_m = 3.0
                            else:
                                range_m = 4.5
                            label = torch.zeros(len(class_labels_possible_values))
                            label[class_labels_possible_values.index(range_m)] = 1

                            angle = int(next(lines).split(",")[2])

                            # the line after the angle is already the first reading
                            record = next(lines).split(",")
                        except StopIteration:
                            # header cut off by end-of-file: nothing left to read
                            break
                    if angle is None:
                        # reading before the first header of this file
                        continue
                    sensor_type = record[1]

                    # these are the sensor readings we want to ignore for now
                    if sensor_type in {"Pedometer", "Activity"}:
                        continue
                    elif sensor_type == "Bluetooth":
                        previous_value[sensor_type] = (float(record[3]),)
                    elif sensor_type == "Heading":
                        previous_value[sensor_type] = (float(record[2]), float(record[3]), float(record[4]),
                                                       float(record[5]))
                    elif sensor_type == "Altitude":
                        previous_value[sensor_type] = (float(record[2]), float(record[3]))
                    else:
                        previous_value[sensor_type] = (float(record[2]), float(record[3]), float(record[4]))
                    # combine the values in previous_value into one giant list
                    ten_sec_data.append(
                        [angle] + [v for value in previous_value.values() for v in value])  # + fixed_part)
                # last block needs to be added manually
                if label is not None and len(ten_sec_data) > 10 and range_m != 4.5:
                    for window in _mitre_sliding_windows(ten_sec_data):
                        X.append(window)
                        y.append(label)
    return [torch.Tensor(interval) for interval in X], y


def eval_model(model, val_data_loader, best_acc, save=True):
  """Evaluate ``model`` on ``val_data_loader`` and optionally checkpoint it.

  Each batch is ``(features, one_hot_labels)``. The loss comes from the
  module-level ``loss_fn``. Besides the 4-class accuracy, a binary accuracy is
  computed after collapsing classes {0, 3} versus {1, 2}. All metrics are
  averaged per sample (each batch value is repeated once per sample).

  When ``save`` is true and the accuracy beats ``best_acc``, the state dict is
  written to the module-level ``MODEL_PATH`` and ``best_acc`` is raised.
  The predicted and true class histograms are printed.

  Returns:
      ``(mean loss, mean accuracy %, mean binary accuracy %, best_acc)``.
  """
  model.eval()
  predicted_counts = {0:0,1:0,2:0,3:0}
  true_counts = {0:0,1:0,2:0,3:0}
  with torch.no_grad():
      per_sample_loss = []
      per_sample_acc = []
      per_sample_tc_acc = []
      for batch in val_data_loader:
          features, one_hot = batch[0], batch[1]
          targets = torch.max(one_hot, axis=1)[1]
          for target in targets:
            true_counts[int(target)] += 1
          outputs = model(features)
          batch_loss = loss_fn(outputs, one_hot)
          guesses = torch.max(outputs, 1)[1].view(targets.size())
          for guess in guesses:
            predicted_counts[int(guess)] += 1
          batch_len = len(targets)
          acc = 100.0 * (guesses == targets).float().sum() / batch_len
          # collapse to a binary indicator: 1.0 for class 0 or 3, else 0.0
          guesses_binary = ((guesses == 0) + (guesses == 3).float())
          targets_binary = ((targets == 0) + (targets == 3).float())
          tc_acc = 100.0 * (targets_binary == guesses_binary).float().sum() / batch_len
          per_sample_loss.extend([batch_loss.item()] * batch_len)
          per_sample_acc.extend([acc.item()] * batch_len)
          per_sample_tc_acc.extend([tc_acc.item()] * batch_len)
      curr_acc = statistics.mean(per_sample_acc)
      if best_acc < curr_acc and save:
        torch.save(model.state_dict(), MODEL_PATH)
        best_acc = curr_acc
      print(predicted_counts)
      print(true_counts)
      mean_loss = statistics.mean(per_sample_loss)
      mean_tc_acc = statistics.mean(per_sample_tc_acc)
      return mean_loss, curr_acc, mean_tc_acc, best_acc
import json

def output_predictions(model, X, labels_to_distance, intervals_to_file, output_path):
    """Predict a distance per file by majority vote and write a TSV.

    Args:
        model: callable module; called on one interval at a time, reshaped to
            ``(1, rows, features)``. Its output's arg-max over dim 1 selects
            the class.
        X: iterable of 2-D interval tensors.
        labels_to_distance: sequence mapping a class index to its distance.
        intervals_to_file: for every interval in ``X``, the file it came from.
        output_path: destination file; gets a ``fileid\\tdistance`` header and
            one line per file, sorted, without a trailing newline.

    The per-file vote picks the most frequent interval prediction; ties go to
    the prediction seen first for that file. (The previous
    ``max(set(preds), key=preds.count)`` broke ties by string-set iteration
    order, which changes between interpreter runs.)
    """
    from collections import Counter

    model.eval()
    print(list(intervals_to_file))
    file_to_interval_pred = dict()
    with torch.no_grad():
        for i, (tensor, file_id) in enumerate(zip(X, intervals_to_file), start=1):
            print(i)
            single_batch = tensor.view(1, tensor.shape[0], tensor.shape[1])
            prediction = model(single_batch)  # , batch_size=1)
            class_index = int(torch.max(prediction, 1)[1])
            distance = labels_to_distance[class_index]
            file_to_interval_pred.setdefault(file_id, list()).append(str(distance))
    file_to_pred = [
        file_id + "\t" + Counter(preds).most_common(1)[0][0]
        for file_id, preds in file_to_interval_pred.items()
    ]
    print(len(file_to_pred))
    file_to_pred.sort()
    with open(output_path, "w") as f:
        f.write("fileid\tdistance\n")
        f.write("\n".join(file_to_pred))
    return

31


In [None]:
class GRU(nn.Module):
    """GRU encoder followed by a two-layer classifier head.

    ``forward`` takes a batch-first tensor ``(batch, seq, input_dim)`` (it is
    permuted to sequence-first for ``nn.GRU``) and returns class probabilities
    of shape ``(batch, output_dim)``.
    """

    def __init__(self, input_dim, hidden_dim, batch_size, output_dim, num_layers=1):
        super(GRU, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        # stored for reference only; forward() does not use it
        self.batch_size = batch_size

        # GRU Layer
        self.gru = nn.GRU(self.input_dim, self.hidden_dim, num_layers=num_layers)

        # Define the output layer
        self.linear = nn.Linear(self.hidden_dim, 128)
        self.linear2 = nn.Linear(128, output_dim)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        # Explicit dim: the head output is (batch, classes). The implicit-dim
        # form is deprecated and resolved to dim=1 for 2-D input anyway.
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout()

    def forward(self, input, batch_size=None):
        """Return class probabilities; ``batch_size`` is accepted but unused."""
        input = input.permute(1, 0, 2)
        gru_out, final_hidden_state = self.gru(input)

        # Only the last layer's final hidden state feeds the classifier head.
        hidden = self.sigmoid(self.linear(final_hidden_state[-1]))
        y_pred = self.softmax(self.linear2(self.dropout(hidden)))
        return y_pred


class LSTM(nn.Module):
    """LSTM encoder followed by a two-layer classifier head.

    ``forward`` takes a batch-first tensor ``(batch, seq, input_dim)`` and
    returns class probabilities of shape ``(batch, output_dim)``.
    """

    def __init__(self, input_dim, hidden_dim, batch_size, output_dim, num_layers=1):
        super(LSTM, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        # stored for reference only; forward() infers the batch size from its input
        self.batch_size = batch_size
        self.num_layers = num_layers

        # LSTM Layer
        self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, num_layers=num_layers)

        # Define the output layer
        self.linear = nn.Linear(self.hidden_dim, 128)
        self.linear2 = nn.Linear(128, output_dim)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        # Explicit dim: the head output is (batch, classes).
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout()

    def init_states(self, batch_size, device=None, dtype=None):
        """Return zero ``(h0, c0)`` of shape ``(num_layers, batch_size, hidden_dim)``."""
        shape = (self.num_layers, batch_size, self.hidden_dim)
        h0 = torch.zeros(shape, device=device, dtype=dtype)
        c0 = torch.zeros(shape, device=device, dtype=dtype)
        return h0, c0

    def forward(self, input, batch_size=None):
        """Return class probabilities.

        ``batch_size`` is optional; when omitted it is taken from ``input``
        (previously the default ``None`` was passed straight to
        ``torch.zeros`` and failed).
        """
        input = input.permute(1, 0, 2)
        if batch_size is None:
            batch_size = input.shape[1]
        # Initial state follows the input's device/dtype and the layer count.
        h0, c0 = self.init_states(batch_size, device=input.device, dtype=input.dtype)
        lstm_out, (final_hidden_state, final_cell_state) = self.lstm(input, (h0, c0))

        # Only the last layer's final hidden state feeds the classifier head.
        hidden = self.sigmoid(self.linear(final_hidden_state[-1]))
        y_pred = self.softmax(self.linear2(self.dropout(hidden)))
        return y_pred

class OldTwoCellGRU(nn.Module):
    """Legacy model: two chained GRUs followed by a two-layer classifier head.

    The second GRU consumes the *final hidden state* of the first one (a
    length-1 sequence), not its full output sequence. ``forward`` takes a
    batch-first tensor ``(batch, seq, input_dim)`` and returns class
    probabilities of shape ``(batch, output_dim)``.
    """

    def __init__(self, input_dim, hidden_dim, batch_size, output_dim):
        super(OldTwoCellGRU, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        # stored for reference only; forward() does not use it
        self.batch_size = batch_size

        # LSTM Layer (not used by forward(); kept so the parameter set and
        # initialisation order stay the same)
        self.lstm = nn.LSTM(self.input_dim, self.hidden_dim)

        # GRU Layers
        self.gru = nn.GRU(self.input_dim, self.hidden_dim)
        self.gru2 = nn.GRU(self.hidden_dim, self.hidden_dim)

        # Define the output layer
        self.linear = nn.Linear(self.hidden_dim, 128)
        self.linear2 = nn.Linear(128, output_dim)
        # Explicit dim: the head output is (batch, classes).
        self.softmax = nn.Softmax(dim=1)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout()

    def init_states(self, batch_size):
        """Return a pair of zero tensors of shape ``(1, batch_size, hidden_dim)``."""
        shape = (1, batch_size, self.hidden_dim)
        return torch.zeros(shape), torch.zeros(shape)

    def forward(self, input, batch_size=None):
        """Return class probabilities; ``batch_size`` is accepted but unused."""
        input = input.permute(1, 0, 2)
        gru_out, hidden_state = self.gru(input)
        # hidden_state is (1, batch, hidden): fed to gru2 as a one-step sequence
        gru_out, final_hidden_state = self.gru2(hidden_state)

        hidden = self.sigmoid(self.linear(final_hidden_state[-1]))
        y_pred = self.softmax(self.linear2(self.dropout(hidden)))
        return y_pred


import numpy as np
import torch.nn.functional as F
from torch.nn import init


class ConvGRUCell(nn.Module):
    """
    Generate a convolutional GRU cell

    The three gates are ``nn.Conv1d`` layers over the channel-wise
    concatenation of the input and the previous state, so tensors are
    ``(batch, channels, length)``. Padding is ``kernel_size // 2``.
    """

    def __init__(self, input_size, hidden_size, kernel_size):
        super().__init__()
        padding = kernel_size // 2
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.reset_gate = nn.Conv1d(input_size + hidden_size, hidden_size, kernel_size, padding=padding)
        self.update_gate = nn.Conv1d(input_size + hidden_size, hidden_size, kernel_size, padding=padding)
        self.out_gate = nn.Conv1d(input_size + hidden_size, hidden_size, kernel_size, padding=padding)

        # In-place initialisers; the un-suffixed names are deprecated aliases.
        init.orthogonal_(self.reset_gate.weight)
        init.orthogonal_(self.update_gate.weight)
        init.orthogonal_(self.out_gate.weight)
        init.constant_(self.reset_gate.bias, 0.)
        init.constant_(self.update_gate.bias, 0.)
        init.constant_(self.out_gate.bias, 0.)

    def forward(self, input_, prev_state):
        """Return the new hidden state ``(batch, hidden_size, length)``.

        ``prev_state`` may be ``None``, in which case a zero state is created
        on the same device and with the same dtype as ``input_``. (It used to
        be moved to CUDA whenever CUDA was available, even for CPU inputs.)
        """
        # get batch and spatial sizes
        batch_size = input_.size(0)
        spatial_size = input_.size()[2:]

        # generate empty prev_state, if None is provided
        if prev_state is None:
            state_size = [batch_size, self.hidden_size] + list(spatial_size)
            prev_state = torch.zeros(state_size, dtype=input_.dtype, device=input_.device)

        # data size is [batch, channel, length]
        stacked_inputs = torch.cat([input_, prev_state], dim=1)
        update = torch.sigmoid(self.update_gate(stacked_inputs))
        reset = torch.sigmoid(self.reset_gate(stacked_inputs))
        out_inputs = torch.tanh(self.out_gate(torch.cat([input_, prev_state * reset], dim=1)))
        new_state = prev_state * (1 - update) + out_inputs * update

        return new_state


class ConvGRU(nn.Module):
    """Stack of ``ConvGRUCell`` layers followed by a two-layer classifier head."""

    def __init__(self, input_size, hidden_sizes, kernel_sizes, num_layers, output_dim, spatial_size=23):
        '''
        Generates a multi-layer convolutional GRU.
        Preserves spatial dimensions across cells, only altering depth.
        Parameters
        ----------
        input_size : integer. depth (channel) dimension of input tensors.
        hidden_sizes : integer or list. depth dimensions of hidden state.
            if integer, the same hidden size is used for all cells.
        kernel_sizes : integer or list. sizes of Conv1d gate kernels.
            if integer, the same kernel size is used for all cells.
        num_layers : integer. number of chained `ConvGRUCell`.
        output_dim : integer. number of classes produced by the head.
        spatial_size : integer. length of the last input dimension; the
            flattened size fed to the head is `spatial_size` times the last
            hidden size. Defaults to 23, the value previously hard-coded.
        '''

        super(ConvGRU, self).__init__()

        self.input_size = input_size

        if not isinstance(hidden_sizes, list):
            self.hidden_sizes = [hidden_sizes]*num_layers
        else:
            assert len(hidden_sizes) == num_layers, '`hidden_sizes` must have the same length as num_layers'
            self.hidden_sizes = hidden_sizes
        if not isinstance(kernel_sizes, list):
            self.kernel_sizes = [kernel_sizes]*num_layers
        else:
            assert len(kernel_sizes) == num_layers, '`kernel_sizes` must have the same length as num_layers'
            self.kernel_sizes = kernel_sizes

        # The head sees the flattened state of the LAST cell. (Computing this
        # as `23 * hidden_sizes` broke when a list of sizes was passed.)
        self.linear_input_size = spatial_size * self.hidden_sizes[-1]

        self.n_layers = num_layers

        cells = []
        for i in range(self.n_layers):
            if i == 0:
                input_dim = self.input_size
            else:
                input_dim = self.hidden_sizes[i-1]

            cell = ConvGRUCell(input_dim, self.hidden_sizes[i], self.kernel_sizes[i])
            name = 'ConvGRUCell_' + str(i).zfill(2)

            # setattr registers the cell as a sub-module under this name;
            # self.cells is only a plain list used for ordered iteration.
            setattr(self, name, cell)
            cells.append(getattr(self, name))

        self.cells = cells
        self.linear = nn.Linear(self.linear_input_size, 128)
        self.linear2 = nn.Linear(128, output_dim)
        # Explicit dim: the head output is (batch, classes).
        self.softmax = nn.Softmax(dim=1)
        self.sigmoid = nn.Sigmoid()
        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout()

    def forward(self, x, hidden=None):
        '''
        Parameters
        ----------
        x : 3D input tensor. (batch, channels, length).
        hidden : optional list with one 3D hidden state (batch, channels, length)
            per layer; missing/None entries start from zeros.
        Returns
        -------
        class probabilities of shape (batch, output_dim), computed from the
        flattened updated state of the last cell.
        '''
        if not hidden:
            hidden = [None]*self.n_layers

        input_ = x

        upd_hidden = []

        for layer_idx in range(self.n_layers):
            cell = self.cells[layer_idx]
            cell_hidden = hidden[layer_idx]

            # pass through layer
            upd_cell_hidden = cell(input_, cell_hidden)
            upd_hidden.append(upd_cell_hidden)
            # update input_ to the last updated hidden layer for next pass
            input_ = upd_cell_hidden

        # dropout is applied both to the flattened state and inside the head
        flattened = self.dropout(self.flatten(upd_hidden[-1]))

        return self.softmax(self.linear2(self.dropout(self.sigmoid(self.linear(flattened)))))

In [None]:
# dev_X, dev_y = load_mitre_data("/content/MITRE-Range-Angle-Structured-master")
# train_X, val_X, train_y, val_y = train_test_split(dev_X, dev_y, test_size=.1, random_state=100)


# Parse the labelled training split and cache it on Drive as .npy files so the
# "RUN TO LOAD SAVED NPY DATA" cell can skip the slow parsing step.
# (The file names contain no "{}" placeholder, so the former
# `.format(EXP_NAME)` calls did nothing except fail with NameError when
# EXP_NAME, which is defined in a later cell, had not been set yet.)
train_key_path = os.path.join("/content", "tc4tl_mitre_dev", "docs", "tc4tl_train_key.tsv")
train_data_path = os.path.join("/content", "tc4tl_mitre_dev", "data", "train")
train_X, train_y, train_labels_to_distance, train_intervals_to_file = load_data(train_key_path, train_data_path)
np.save("/content/gdrive/My Drive/train_X_numpy.npy", [tensor.numpy() for tensor in train_X])
np.save("/content/gdrive/My Drive/train_y_numpy.npy", [tensor.numpy() for tensor in train_y])
np.save("/content/gdrive/My Drive/train_file_names_numpy.npy", np.array(train_intervals_to_file))
print(set(train_y))
# Test split: the label list returned by load_data is discarded here.
test_key_path = os.path.join("/content", "tc4tl", "docs", "tc4tl_test_metadata.tsv")
test_data_path = os.path.join("/content", "tc4tl", "data", "test")
test_X, _, test_labels_to_distance, test_intervals_to_file = load_data(test_key_path, test_data_path)
# Dev split, used as the validation set.
val_key_path = os.path.join("/content", "tc4tl", "docs", "tc4tl_dev_key.tsv")
val_data_path = os.path.join("/content", "tc4tl", "data", "dev")
val_X, val_y, labels_to_distance, dev_interval_to_file = load_data(val_key_path, val_data_path)

# print(set(val_y))
# np.save("/content/gdrive/My Drive/_X_mitre_removed_sensors_numpy.npy".format(EXP_NAME), [tensor.numpy() for tensor in test_X])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Loading file nltvhoak_tc4tl20.csv with fixed variables of [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0]
Loading file nlvtaurs_tc4tl20.csv with fixed variables of [0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0]
Loading file nlvtsnuh_tc4tl20.csv with fixed variables of [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0]
Loading file nlvwfigx_tc4tl20.csv with fixed variables of [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0]
Loading file nlxpbvqj_tc4tl20.csv with fixed variables of [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0]
Loading file nlzllykf_tc4tl20.csv with fixed variables of [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1]
Loading file nlzmtdke_t

In [None]:
# RUN TO LOAD SAVED NPY DATA
# Name of the current experiment.
# NOTE(review): not used anywhere in the visible cells except as an argument to
# placeholder-free .format() calls; presumably used for model/output paths
# further down -- confirm.
EXP_NAME = "CONV_GRU_k3_hs_100"

import numpy as np
def generate_synthetic_hybrids(X, y, num_new_samples=1000):
  """Augment (X, y) in place with mixup-style blends of existing samples.

  Each new sample is ``lam * a + (1 - lam) * b`` for both the features and
  the label, with ``lam`` drawn from Beta(0.2, 0.2). Partners are drawn only
  from the samples present when the function is called, never from hybrids
  created during the call.

  A candidate partner ``b`` is rejected and redrawn while either
    * the sum of elements 1 and 2 of ``a_y + b_y`` is odd, or
    * ``b_y`` equals ``a_y``.

  Args:
    X: list of feature tensors; new hybrids are appended to this same list.
    y: list of label tensors aligned with ``X``; also appended to.
    num_new_samples: number of hybrids to create.

  Returns:
    The same ``X`` and ``y`` list objects that were passed in.

  NOTE(review): the redraw loop never terminates if no acceptable partner
  exists for the first pick (e.g. every label is identical).
  """
  pool_X = X.copy()
  pool_y = y.copy()
  upper = len(pool_y) - 1

  def _rejects(anchor_y, candidate_y):
    # Same test as the partner-selection rule described in the docstring.
    odd_overlap = int(sum((candidate_y + anchor_y)[1:3])) % 2 != 0
    return odd_overlap or torch.equal(candidate_y, anchor_y)

  for _ in range(num_new_samples):
    first = random.randint(0, upper)
    first_y, first_X = pool_y[first], pool_X[first]

    second = random.randint(0, upper)
    while _rejects(first_y, pool_y[second]):
      second = random.randint(0, upper)
    second_y, second_X = pool_y[second], pool_X[second]

    mix = np.random.beta(0.2, 0.2)
    X.append(mix * first_X + (1 - mix) * second_X)
    y.append(mix * first_y + (1 - mix) * second_y)
  return X, y

# (a stray paste referencing ".../train_y_mitre_numpy.npy" was fused into the next line; path reconstructed)
# with open("/content/gdrive/My Drive/test_X_mitre_numpy.npy", 'rb') as f:
#   test_X = [torch.from_numpy(el) for el in np.load(f)]
# Reload the cached training arrays from Drive and turn each element back
# into a torch tensor.
with open("/content/gdrive/My Drive/train_X_numpy.npy", 'rb') as f:
  train_X = [torch.from_numpy(el) for el in np.load(f)]
with open("/content/gdrive/My Drive/train_y_numpy.npy", 'rb') as f:
  train_y = [torch.from_numpy(el) for el in np.load(f)]
# The loaders below for validation data / file names are disabled, so val_X
# and val_y must already exist in the session from an earlier cell.
# with open("/content/gdrive/My Drive/val_X_numpy.npy", 'rb') as f:
#   val_X = [torch.from_numpy(el) for el in np.load(f)]
# with open("/content/gdrive/My Drive/val_y_numpy.npy", 'rb') as f:
#   val_y = [torch.from_numpy(el) for el in np.load(f)]
# with open("/content/gdrive/My Drive/train_file_names_numpy.npy", 'rb') as f:
#   train_intervals_to_file = np.load(f)
# with open("/content/gdrive/My Drive/dev_file_names_numpy.npy", 'rb') as f:
#   val_intervals_to_file = np.load(f)
# with open("/content/gdrive/My Drive/test_file_names_numpy.npy", 'rb') as f:
#   test_intervals_to_file = np.load(f)

# Batches of 25 (sample, label) pairs; no shuffle argument is passed.
train_data_loader = DataLoader(list(zip(train_X, train_y)), batch_size=25)
val_data_loader = DataLoader(list(zip(val_X, val_y)), batch_size=25)
# dev_data_loader = DataLoader(list(zip(dev_X, dev_y)), batch_size=25)
print(len(train_y))
# NOTE(review): train_data_loader was built above from the full lists, so the
# truncation to 500 samples (and any hybrids, when num_new_samples > 0) made
# here is not seen by that loader — confirm whether this is intentional.
train_X, train_y = generate_synthetic_hybrids(train_X[:500], train_y[:500], num_new_samples=0)
print(len(train_y))

127369
500


In [None]:
# Train a ConvGRU classifier on train_data_loader, evaluating on
# val_data_loader after every epoch, then plot the loss curves.
EXP_NAME = "CONV_GRU_k3_hs_20"
# NOTE(review): the file name says "layers_2_epochs_20" while the model below
# uses num_layers=1 and the loop runs 200 epochs — confirm the intended name.
MODEL_PATH = "/content/gdrive/My Drive/MITRE_layers_2_epochs_20_{}".format(EXP_NAME)
# Distance value associated with each class index (index -> distance).
labels_to_distance = [1.2, 3.0, 4.5, 1.8]


model = ConvGRU(150, 5, output_dim=len(labels_to_distance), num_layers=1, kernel_sizes=3)
best_acc = 0
total_loss = []  # (epoch, mean training loss) pairs for plotting
total_test_loss = []  # (epoch, evaluation loss) pairs for plotting
loss_fn = CSELoss
# Build the optimizer once. Creating Adam inside the epoch loop would discard
# its running moment estimates and step count at the start of every epoch.
optim = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-3)
train_start_time = time.time()
for epoch in range(200):
    total_epoch_loss = 0
    total_epoch_acc = 0
    total_epoch_tc_acc = 0
    steps = 0
    model.train()
    for idx, batch in enumerate(train_data_loader):
        # `inputs` rather than `input`, so the builtin is not shadowed for
        # the rest of the notebook session.
        inputs = batch[0]
        # Hard class index = argmax over the label vector.
        label = torch.max(batch[1], axis=1)[1]
        optim.zero_grad()
        prediction = model(inputs)#, batch_size=len(label))
        # The loss receives the full label vector (batch[1]), not the argmax.
        loss = loss_fn(prediction, batch[1])
        prediction = torch.max(prediction, 1)[1].view(label.size())
        num_corrects = (prediction == label).float().sum()
        acc = 100.0 * num_corrects / len(label)
        # Collapse to a binary target: 1.0 for class 0 or 3 (the 1.2 and 1.8
        # entries of labels_to_distance), 0.0 otherwise.
        prediction = ((prediction == 0) + (prediction == 3).float())
        label = ((label == 0) + (label == 3).float())
        num_tc_corrects = (label == prediction).float().sum()
        tc_acc = 100.0 * num_tc_corrects / len(label)
        loss.backward()
        optim.step()
        steps += 1
        total_epoch_loss += loss.item()
        total_epoch_acc += acc.item()
        total_epoch_tc_acc += tc_acc.item()
    # eval
    loss, acc, tc_acc, best_acc = eval_model(model, val_data_loader, best_acc)
    print("Testing Loss: {}, AVG: {}, TC4TL or Not AVG: {}".format(loss, acc, tc_acc))
    total_loss.append((epoch, total_epoch_loss/steps))
    total_test_loss.append((epoch, loss))
    print (f'Epoch: {epoch+1}, Training Loss: {total_epoch_loss/steps:.4f}, Training Accuracy: {total_epoch_acc/steps: .2f}% TC4TL or Not ACC: {total_epoch_tc_acc/steps: .2f}%')
print("finished training, took {} seconds".format(time.time() - train_start_time))
plt.scatter(*zip(*total_loss))
plt.xlabel("Epochs")
plt.ylabel("loss")
plt.show()
plt.scatter(*zip(*total_test_loss))
plt.xlabel("Epochs")
plt.ylabel("test loss")
plt.show()

# eval
loss, acc, tc_acc, best_acc = eval_model(model, val_data_loader, best_acc)
print("Testing Loss: {}, AVG: {}, TC4TL or Not AVG: {}".format(loss, acc, tc_acc))



{0: 6714, 1: 0, 2: 0, 3: 0}
{0: 758, 1: 881, 2: 2657, 3: 2418}
Testing Loss: 1.67492096170024, AVG: 11.289842120941316, TC4TL or Not AVG: 47.30414060172773
Epoch: 1, Training Loss: 1.2739, Training Accuracy:  37.95% TC4TL or Not ACC:  50.61%
{0: 6607, 1: 0, 2: 107, 3: 0}
{0: 758, 1: 881, 2: 2657, 3: 2418}
Testing Loss: 1.6300492211221973, AVG: 12.243074173369079, TC4TL or Not AVG: 48.03395889186774
Epoch: 2, Training Loss: 1.2560, Training Accuracy:  39.99% TC4TL or Not ACC:  53.28%
{0: 6395, 1: 0, 2: 319, 3: 0}
{0: 758, 1: 881, 2: 2657, 3: 2418}
Testing Loss: 1.6187896088524734, AVG: 13.032469466785821, TC4TL or Not AVG: 48.03395889186774
Epoch: 3, Training Loss: 1.2387, Training Accuracy:  41.69% TC4TL or Not ACC:  55.63%
{0: 6264, 1: 0, 2: 450, 3: 0}
{0: 758, 1: 881, 2: 2657, 3: 2418}
Testing Loss: 1.580395136253799, AVG: 12.764372952040512, TC4TL or Not AVG: 47.63181411974978
Epoch: 4, Training Loss: 1.2290, Training Accuracy:  42.22% TC4TL or Not ACC:  56.29%
{0: 5824, 1: 0, 2: 89

In [None]:
# TEST ON VAL SECTION OF DEV DATA
# Rebuild a model, restore the weights stored at MODEL_PATH and score it on
# the validation loader.
print(best_acc)  # value carried over from whichever cell last set best_acc
val_data_loader = DataLoader(list(zip(val_X, val_y)), batch_size=25)
# NOTE(review): these constructor arguments (200 as second positional
# argument, num_layers=2) differ from the ConvGRU(150, 5, ..., num_layers=1)
# built in the training cell. load_state_dict requires a checkpoint with
# matching parameter shapes — confirm which model was saved to MODEL_PATH.
model2 = ConvGRU(150,200, output_dim=len(labels_to_distance), num_layers=2, kernel_sizes=3)
model2.load_state_dict(torch.load(MODEL_PATH))
# save=False is passed so this evaluation-only pass presumably does not write
# a checkpoint — verify against eval_model.
loss, acc, tc_acc, best_acc = eval_model(model2, val_data_loader, best_acc, save=False)
print("Testing Loss: {}, AVG: {}, TC4TL or Not AVG: {}".format(loss, acc, tc_acc))

33.88442061335905




KeyboardInterrupt: ignored

In [None]:
# OUTPUT SAVE TO FILE
# Restore the checkpoint at MODEL_PATH and pass the model, the val_X samples,
# the class-to-distance table and the per-sample file names to
# output_predictions together with the destination .tsv path.
labels_to_distance = [1.2, 3.0, 4.5, 1.8]
EXP_NAME = "CONV_GRU_k3_hs_20"
MODEL_PATH = "/content/gdrive/My Drive/MITRE_layers_2_epochs_20_{}".format(EXP_NAME)
# NOTE(review): architecture arguments must match the checkpoint being loaded;
# the training cell in this notebook builds ConvGRU(150, 5, ..., num_layers=1).
model2 = ConvGRU(150,200, output_dim=len(labels_to_distance), num_layers=2, kernel_sizes=3)
# Total number of parameter elements in the model.
print(sum(p.numel() for p in model2.parameters()))
model2.load_state_dict(torch.load(MODEL_PATH))
output_predictions(model2, val_X, labels_to_distance, dev_interval_to_file, "/content/gdrive/My Drive/MITRE_{}_layers_2_final_output.tsv".format(EXP_NAME))



1991844
['abgikaek_tc4tl20.csv', 'abgikaek_tc4tl20.csv', 'abgikaek_tc4tl20.csv', 'abgikaek_tc4tl20.csv', 'acehqsss_tc4tl20.csv', 'acehqsss_tc4tl20.csv', 'acehqsss_tc4tl20.csv', 'acehqsss_tc4tl20.csv', 'acehqsss_tc4tl20.csv', 'acehqsss_tc4tl20.csv', 'adcmsfnp_tc4tl20.csv', 'adcmsfnp_tc4tl20.csv', 'adcmsfnp_tc4tl20.csv', 'adcmsfnp_tc4tl20.csv', 'adljjzjj_tc4tl20.csv', 'adljjzjj_tc4tl20.csv', 'adzvqmmg_tc4tl20.csv', 'adzvqmmg_tc4tl20.csv', 'adzvqmmg_tc4tl20.csv', 'adzvqmmg_tc4tl20.csv', 'adzvqmmg_tc4tl20.csv', 'adzvqmmg_tc4tl20.csv', 'adzvqmmg_tc4tl20.csv', 'adzvqmmg_tc4tl20.csv', 'adzvqmmg_tc4tl20.csv', 'adzvqmmg_tc4tl20.csv', 'adzvqmmg_tc4tl20.csv', 'adzvqmmg_tc4tl20.csv', 'adzvqmmg_tc4tl20.csv', 'adzvqmmg_tc4tl20.csv', 'adzvqmmg_tc4tl20.csv', 'adzvqmmg_tc4tl20.csv', 'aedrrlnv_tc4tl20.csv', 'aedrrlnv_tc4tl20.csv', 'aedrrlnv_tc4tl20.csv', 'afgsvjro_tc4tl20.csv', 'agfvzuay_tc4tl20.csv', 'agfvzuay_tc4tl20.csv', 'agfvzuay_tc4tl20.csv', 'agfvzuay_tc4tl20.csv', 'agfvzuay_tc4tl20.csv', 'agfvzu



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
