#**Enhanced Pedestrian Intention Detection for ADAS and Autonomous Vehicles**
Training and evaluation of a new model through the use of the PIE dataset.


##Step 1: Mount google drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells write the model JSON,
# weights and evaluation reports under "/content/drive/My Drive".
drive.mount('/content/drive')

## Steps 2-4
## Step 2: Clone the PIE repository to the content directory on Google Colab:
https://github.com/aras62/PIE.git

##Step 3: Unzip annotations and annotations_vehicle

##Step 4: Clone the YOLOP repository into the content directory:
https://github.com/hustvl/YOLOP.git

In [None]:
!git clone https://github.com/aras62/PIE.git
!unzip /content/PIE/annotations/annotations.zip -d /content/PIE
!unzip /content/PIE/annotations/annotations_vehicle.zip -d /content/PIE
!git clone https://github.com/hustvl/YOLOP.git
!mkdir /content/PIE/PIE_clips

##Step 5: Download set1/set2/set3/set4/set5/set6 video clips from the PIE dataset

In [None]:
!mkdir /content/PIE/PIE_clips/set01
!wget -r --no-parent -P/content/PIE/PIE_clips/set01 -nH -nd "https://data.nvision2.eecs.yorku.ca/PIE_dataset/PIE_clips/set01/"

In [None]:
!mkdir /content/PIE/PIE_clips/set02
!wget -r --no-parent -P/content/PIE/PIE_clips/set02 -nH -nd "https://data.nvision2.eecs.yorku.ca/PIE_dataset/PIE_clips/set02/"

In [None]:
!mkdir /content/PIE/PIE_clips/set03
!wget -r --no-parent -P/content/PIE/PIE_clips/set03 -nH -nd "https://data.nvision2.eecs.yorku.ca/PIE_dataset/PIE_clips/set03/"

In [None]:
!mkdir /content/PIE/PIE_clips/set04
!wget -r --no-parent -P/content/PIE/PIE_clips/set04 -nH -nd "https://data.nvision2.eecs.yorku.ca/PIE_dataset/PIE_clips/set04/"

In [None]:
!mkdir /content/PIE/PIE_clips/set05
!wget -r --no-parent -P/content/PIE/PIE_clips/set05 -nH -nd "https://data.nvision2.eecs.yorku.ca/PIE_dataset/PIE_clips/set05/"

In [None]:
!mkdir /content/PIE/PIE_clips/set06
!wget -r --no-parent -P/content/PIE/PIE_clips/set06 -nH -nd "https://data.nvision2.eecs.yorku.ca/PIE_dataset/PIE_clips/set06/"

## Step 6: Import keras and tensorflow utilities

In [None]:
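# NOTE(review): originally stated "Requires Keras 2.2.5 and Tensorflow 1.15.0", but
# tf.keras.metrics.F1Score (imported below) only exists in much newer TensorFlow
# releases -- confirm the intended Keras/TensorFlow versions.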
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import keras
#import keras.backend as K
#K.image_data_format('channels_last')
from keras.layers import Flatten
from keras.layers import Activation
from keras.layers import AveragePooling3D
from keras.layers import BatchNormalization
from keras.layers import Conv3D
from keras.layers import Conv3DTranspose
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Add
from keras.layers import Multiply
from keras.layers import GlobalAveragePooling3D
from keras.layers import GlobalMaxPooling3D
from keras.layers import Input
from keras.layers import MaxPooling3D
from keras.layers import Reshape
from keras.layers import UpSampling3D
from keras.layers import concatenate
from keras.models import Model
from keras.regularizers import l2
!sudo pip install git+https://www.github.com/keras-team/keras-contrib.git
from keras_contrib.layers import SubPixelUpscaling
import csv
import os
import pandas as pd
from keras.utils import to_categorical
import h5py
import numpy as np
import matplotlib.pyplot as plt
import cv2

import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.keras.utils import get_source_inputs
from tensorflow.keras.metrics import Precision, Recall, AUC, F1Score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

from PIE.utilities.pie_data import PIE
# Dataset interface rooted at the cloned PIE repository; parse_video() uses it
# to read the pedestrian annotations and vehicle attributes of a video.
pie = PIE(data_path='/content/PIE')

import gc

## Step 7: Define a function YOLOPdetect that returns the drivable area and lane segmentation for an image.

 This is a modified version of the YOLOPdetect function that is in demo.py in the YOLOP directory. All credit goes to its authors.

In [None]:
import argparse
import os, sys

BASE_DIR = os.path.dirname('/content/YOLOP')
sys.path.append('/content/YOLOP')

import shutil
import time
from pathlib import Path
import imageio

print(sys.path)
import torch
import torch.backends.cudnn as cudnn
from numpy import random
import scipy.special
import torchvision.transforms as transforms
import PIL.Image as image

!sudo pip install yacs
from lib.config import cfg
from lib.config import update_config
from lib.utils.utils import create_logger, select_device, time_synchronized
from lib.models import get_net
from lib.dataset import LoadImages, LoadStreams
from lib.core.general import non_max_suppression, scale_coords
from lib.utils import plot_one_box,show_seg_result
from lib.core.function import AverageMeter
from lib.core.postprocess import morphological_process, connect_lane
from lib.utils import letterbox_for_img, clean_str
from tqdm import tqdm
from numba import cuda

from google.colab.patches import cv2_imshow

# Per-channel normalisation applied to every frame before it is fed to YOLOP.
normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
    )

# ndarray/PIL image -> float tensor -> normalised tensor.
transform=transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])

class opt:
      # Static stand-in for the command-line options of YOLOP's demo script.
      device = '0'                                       # device spec handed to select_device
      weights = '/content/YOLOP/weights/End-to-end.pth'  # checkpoint loaded below
      source = '/content/YOLOPstream.png'
      img_size = 640                                     # letterbox target size
      conf_thres = 0.25                                  # NMS confidence threshold
      iou_thres = 0.45                                   # NMS IoU threshold
      save_dir = '/content/YOLOP/output'

# The name is rebound to an instance so that attribute access reads like argparse output.
opt = opt()

# logger, _, _ = create_logger(
#     cfg, cfg.LOG_DIR, 'demo')

device = select_device(None,opt.device)
half = device.type != 'cpu'  # half precision only supported on CUDA

# Load model
yolop_model = get_net(cfg)
checkpoint = torch.load(opt.weights, map_location= device)
yolop_model.load_state_dict(checkpoint['state_dict'])
yolop_model = yolop_model.to(device)
if half:
    yolop_model.half()  # to FP16

# Switch to inference mode BEFORE the warm-up pass: a forward pass in training
# mode would update the BatchNorm running statistics of the pretrained model
# with an all-zeros image.
yolop_model.eval()

img = torch.zeros((1, 3, opt.img_size, opt.img_size), device=device)  # init img
if device.type != 'cpu':
    # Warm-up run on the GPU only; no autograd graph is needed for it.
    with torch.no_grad():
        _ = yolop_model(img.half() if half else img)  # run once
else:
    _ = None

def YOLOPdetect(img0):
    """Run YOLOP on one frame and return the frame with segmentation drawn on it.

    The frame is letterboxed to ``opt.img_size``, normalised with the
    module-level ``transform`` and passed through ``yolop_model``. The
    drivable-area and lane-line outputs are cropped to remove the letterbox
    padding, upsampled, reduced to per-pixel class indices and handed to
    ``show_seg_result`` together with the original frame.

    Args:
        img0: image array whose first two dimensions are (height, width).
            Presumably a BGR frame as read by OpenCV -- confirm with callers.

    Returns:
        The image returned by ``show_seg_result``, resized to 1920x1080
        regardless of the input resolution.
    """

    h0, w0 = img0.shape[:2]
    img, ratio, pad = letterbox_for_img(img0, new_shape=opt.img_size, auto=True)
    h, w = img.shape[:2]
    # (original size), ((height ratio, width ratio), letterbox padding)
    shapes = (h0, w0), ((h / h0, w / w0), pad)

    # Single-element "dataset" so the loop below keeps the shape of YOLOP's demo loop.
    dataset = [(None, img, img0, None, shapes)]
    bs = 1  # batch_size


    # Get names and colors
    # NOTE(review): names/colors are never used below; building colors still
    # draws from numpy's global random generator on every call.
    names = yolop_model.module.names if hasattr(yolop_model, 'module') else yolop_model.names
    colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(names))]


    # Run inference
    t0 = time.time()

    # Timing accumulators; they are updated below but never read or reported.
    inf_time = AverageMeter()
    nms_time = AverageMeter()



    da_seg_mask = None

    for i, (path, img, img_det, vid_cap,shapes) in enumerate(dataset):
        with torch.no_grad():
          torch.cuda.empty_cache()
          img = transform(img).to(device)
          img = img.half() if half else img.float()  # uint8 to fp16/32
          if img.ndimension() == 3:
              # add the batch dimension
              img = img.unsqueeze(0)
          # Inference
          t1 = time_synchronized()
          det_out, da_seg_out,ll_seg_out= yolop_model(img)
          t2 = time_synchronized()
          # if i == 0:
          #     print(det_out)
          inf_out, _ = det_out
          inf_time.update(t2-t1,img.size(0))

          # Apply NMS
          # NOTE(review): the detections (det_pred / det) are not used for the returned image.
          t3 = time_synchronized()
          det_pred = non_max_suppression(inf_out, conf_thres=opt.conf_thres, iou_thres=opt.iou_thres, classes=None, agnostic=False)
          t4 = time_synchronized()

          nms_time.update(t4-t3,img.size(0))
          det=det_pred[0]

          _, _, height, width = img.shape
          h,w,_=img_det.shape
          pad_w, pad_h = shapes[1][1]
          pad_w = int(pad_w)
          pad_h = int(pad_h)
          # width ratio (letterboxed width / original width)
          ratio = shapes[1][0][1]

          # Drivable area: strip the letterbox padding, upsample by the integer
          # part of 1/ratio, then take the arg-max over the channel dimension.
          da_predict = da_seg_out[:, :, pad_h:(height-pad_h),pad_w:(width-pad_w)]
          da_seg_mask = torch.nn.functional.interpolate(da_predict, scale_factor=int(1/ratio), mode='bilinear')
          _, da_seg_mask = torch.max(da_seg_mask, 1)
          da_seg_mask = da_seg_mask.int().squeeze().cpu().numpy()
          # da_seg_mask = morphological_process(da_seg_mask, kernel_size=7)


          # Lane lines: same crop / upsample / arg-max steps as above.
          ll_predict = ll_seg_out[:, :,pad_h:(height-pad_h),pad_w:(width-pad_w)]
          ll_seg_mask = torch.nn.functional.interpolate(ll_predict, scale_factor=int(1/ratio), mode='bilinear')
          _, ll_seg_mask = torch.max(ll_seg_mask, 1)
          ll_seg_mask = ll_seg_mask.int().squeeze().cpu().numpy()
          # Lane line post-processing
          #ll_seg_mask = morphological_process(ll_seg_mask, kernel_size=7, func_type=cv2.MORPH_OPEN)
          #ll_seg_mask = connect_lane(ll_seg_mask)

          # `_` is whatever torch.max left behind (the max values); it is passed
          # only to fill two positional arguments -- presumably ignored when
          # is_demo=True, TODO confirm against show_seg_result.
          img_det = show_seg_result(img_det, (da_seg_mask, ll_seg_mask), _, _, is_demo=True)

          # Drop tensor references before emptying the CUDA cache.
          del ll_predict
          del da_predict
          del ll_seg_mask
          del da_seg_mask
          del inf_out
          del det_out
          del det_pred
          del img
          torch.cuda.empty_cache()

    # Output size is fixed at 1920x1080, independent of the input frame size.
    img_det = cv2.resize(img_det, (1920,1080))
    return img_det

##Step 7: Setup posenet

In [None]:
# Create a new directory and initialize Git
# NOTE(review): every `!` line runs in its own subshell, so `!cd posenet` does
# not change the directory for the commands that follow; `git init` and the
# later git commands therefore run in the notebook's current working directory.
# The sparse checkout of "/posenet" then lands in ./posenet, which is what the
# `import posenet` in the next cell relies on -- verify before switching to `%cd`.
!mkdir posenet
!cd posenet
!git init

# Add the remote repository
!git remote add origin https://github.com/michellelychan/posenet-pytorch.git

# Enable sparse checkout
!git config core.sparseCheckout true

# Specify the folder to clone
!echo "/posenet" >> .git/info/sparse-checkout

# Pull the specified folder from the main branch
!git pull origin master

In [None]:
import posenet
from posenet.decode_multi import decode_multiple_poses
from posenet.decode import decode_pose
# Load PoseNet model 101 and move it to the GPU; the model's own output stride
# is reused when decoding poses.
pose_model = posenet.load_model(101).cuda()
output_stride = pose_model.output_stride

def process_input(source_img, scale_factor=1.0, output_stride=16):
    """Prepare a BGR image for PoseNet.

    The image is resized to the resolution returned by
    ``posenet.valid_resolution``, converted to RGB, scaled to [-1, 1] and
    rearranged into a (1, 3, height, width) float32 array.

    Args:
        source_img: image array indexed as (height, width, channel).
        scale_factor: multiplier applied to the source size before a valid
            resolution is chosen.
        output_stride: stride forwarded to ``posenet.valid_resolution``.

    Returns:
        Tuple ``(input_img, source_img, scale)`` where ``scale`` is
        ``[source_height / target_height, source_width / target_width]``.
    """
    src_height, src_width = source_img.shape[0], source_img.shape[1]
    target_width, target_height = posenet.valid_resolution(
        src_width * scale_factor, src_height * scale_factor, output_stride=output_stride)
    scale = np.array([src_height / target_height, src_width / target_width])

    resized = cv2.resize(source_img, (target_width, target_height), interpolation=cv2.INTER_LINEAR)
    rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB).astype(np.float32)
    # map [0, 255] onto [-1, 1]
    normalised = rgb * (2.0 / 255.0) - 1.0
    # HWC -> CHW, then add the batch dimension
    batched = normalised.transpose((2, 0, 1)).reshape(1, 3, target_height, target_width)
    return batched, source_img, scale

##Step 8: Setup the DenseNet model through keras

In [None]:
from keras.layers import GRU, concatenate
def DenseNet3D_3(input_shape_image, input_shape_pose, input_shape_speed, input_shape_action, input_shape_bbox, num_classes):
    """Build the five-input fusion classifier.

    The image sequence goes through four Conv3D/BatchNorm/ReLU stages (average
    pooling after the first three, global average pooling after the last); the
    pose, speed, action and bounding-box sequences each go through a GRU. Every
    branch ends in a 128-unit dense layer, is scaled by its own learned sigmoid
    gate, and the gated branches are summed before the fully connected head.

    Args:
        input_shape_image: shape of the image-sequence input (without batch).
        input_shape_pose: shape of the pose-sequence input.
        input_shape_speed: shape of the speed-sequence input.
        input_shape_action: shape of the action-sequence input.
        input_shape_bbox: shape of the bounding-box-sequence input.
        num_classes: width of the softmax output layer.

    Returns:
        An uncompiled Keras ``Model`` taking
        ``[image, pose, speed, action, bbox]`` inputs.
    """

    def conv_stage(tensor, filters):
        # 3x3x3 convolution -> batch normalisation -> ReLU
        tensor = Conv3D(filters, (3, 3, 3), padding='same')(tensor)
        tensor = BatchNormalization()(tensor)
        return Activation('relu')(tensor)

    def sequence_branch(shape, name, gru_units):
        # Input -> GRU (last state only) -> 128-unit projection
        branch_input = Input(shape=shape, name=name)
        features = GRU(gru_units, return_sequences=False)(branch_input)
        features = Dense(128, activation='relu')(features)
        return branch_input, features

    # Image branch
    image_input = Input(shape=input_shape_image, name='image_input')
    image_x = image_input
    for filters in (64, 128, 256):
        image_x = conv_stage(image_x, filters)
        image_x = AveragePooling3D(pool_size=(2, 2, 2))(image_x)
    image_x = conv_stage(image_x, 512)
    image_x = GlobalAveragePooling3D()(image_x)
    image_x = Dense(128, activation='relu')(image_x)

    # Sequence branches (created in the same order as the model inputs)
    pose_input, pose_x = sequence_branch(input_shape_pose, 'pose_input', 64)
    speed_input, speed_x = sequence_branch(input_shape_speed, 'speed_input', 32)
    action_input, action_x = sequence_branch(input_shape_action, 'action_input', 32)
    bbox_input, bbox_x = sequence_branch(input_shape_bbox, 'bbox_input', 32)

    # One scalar sigmoid gate per branch, then gate * features, then sum.
    branch_features = [image_x, pose_x, speed_x, action_x, bbox_x]
    gates = [Dense(1, activation='sigmoid')(features) for features in branch_features]
    gated = [Multiply()([features, gate]) for features, gate in zip(branch_features, gates)]
    combined = Add()(gated)

    # Fully connected head
    head = Dense(256, activation='relu')(combined)
    head = Dropout(0.5)(head)
    head = Dense(128, activation='relu')(head)
    head = Dropout(0.5)(head)
    outputs = Dense(num_classes, activation='softmax')(head)

    return Model(inputs=[image_input, pose_input, speed_input, action_input, bbox_input], outputs=outputs)

## Step 9: Define a function parse_video that returns 100x100x16x3 images, 16x34 pose, 16x2 vehicle speed, 16x2 ped action and 16x4 bbox coords for each pedestrian in each frame of a video from the PIE dataset

The input to the function is the ID of the set and ID of the video (eg. 'set01','video_0001')

In [None]:
def _push_window(window, item, size=16):
    """Append ``item`` to ``window``, evicting the oldest entry once ``size`` entries are held."""
    if len(window) >= size:
        del window[0]
    window.append(item)


def _window_sequence(window, axis, size=16):
    """Stack ``window`` along ``axis``; until it holds ``size`` entries the newest one is repeated."""
    entries = window if len(window) == size else [window[-1]] * size
    return np.stack(np.array(entries), axis=axis)


def parse_video(set_id,video_id):
    """Build model inputs and labels for every annotated pedestrian of one PIE video.

    For each pedestrian that has behaviour annotations, the frames of its
    track are read from the clip, passed through ``YOLOPdetect``, cropped
    around the bounding box (10 px margin, clipped to the frame) and fed to
    PoseNet. A 16-frame rolling window is kept per modality and one sample is
    emitted on every second frame of the track.

    Args:
        set_id: PIE set identifier, e.g. ``'set01'``.
        video_id: PIE video identifier, e.g. ``'video_0001'``.

    Returns:
        Six lists with one entry per emitted sample, in this order:
        image sequences (100x100 crops stacked on axis 2), crossing labels,
        pose sequences (16 x 34), vehicle-speed sequences (16 x 2), pedestrian
        action sequences (16 x 2) and bounding-box sequences (16 per-frame boxes).
        A crossing label of -1 is mapped to 1.

    Raises:
        OSError: if the video clip cannot be opened.
        RuntimeError: if a frame of a pedestrian track cannot be read.
    """
    annotations = pie._get_annotations(set_id,video_id)
    annotations = annotations["ped_annotations"]
    vehicle_annotations = pie._get_vehicle_attributes(set_id,video_id)
    print(vehicle_annotations)

    video_bboxes = []
    video_speed = []
    video_poses = []
    video_labels = []
    video_action = []
    video_bbox_coords = []

    video_path = f"/content/PIE/PIE_clips/{set_id}/{video_id}.mp4"
    video = cv2.VideoCapture(video_path)
    if not video.isOpened():
        video.release()
        raise OSError(f"could not open video clip: {video_path}")

    try:
        for key in annotations.keys():
            ped = annotations[key]
            if ped["behavior"] == {}:
                # pedestrian without behaviour annotations: nothing to learn from
                continue
            bboxes = ped["bbox"]
            frames = ped["frames"]
            labels = ped["behavior"]["cross"]
            actions = ped["behavior"]["action"]

            # Rolling 16-frame windows, one per modality, for this pedestrian.
            img_window, pose_window, speed_window = [], [], []
            action_window, bbox_window = [], []
            # Samples emitted for this pedestrian.
            cropped_imgs, poses, speed, action_list, bbox_coords = [], [], [], [], []

            # Seek once, then read sequentially.
            # NOTE(review): this assumes the track's frame ids are consecutive -- confirm.
            video.set(cv2.CAP_PROP_POS_FRAMES, frames[0]-1)
            for i, frame_id in enumerate(frames):
                ret, frame = video.read()
                if not ret:
                    raise RuntimeError(
                        f"could not read frame {frame_id} of {set_id}/{video_id} (pedestrian {key})")

                frame = YOLOPdetect(frame)

                # Bounding box with a 10 px margin, clipped to the frame.
                bbox = np.array(bboxes[i]).astype('int')
                x1 = max(bbox[0] - 10, 0)
                y1 = max(bbox[1] - 10, 0)
                x2 = min(bbox[2] + 10, frame.shape[1])
                y2 = min(bbox[3] + 10, frame.shape[0])
                cropped_img = frame[y1:y2, x1:x2]

                # Pose estimation on the full-resolution crop (at most one pose).
                with torch.no_grad():
                    input_image, _, _ = process_input(cropped_img)
                    input_image = torch.Tensor(input_image).cuda()
                    heatmaps_result, offsets_result, displacement_fwd_result, displacement_bwd_result = pose_model(input_image)
                    pose_scores, keypoint_scores, keypoint_coords, pose_offsets = decode_multiple_poses(
                        heatmaps_result.squeeze(0),
                        offsets_result.squeeze(0),
                        displacement_fwd_result.squeeze(0),
                        displacement_bwd_result.squeeze(0),
                        output_stride=output_stride,
                        max_pose_detections=1,
                        min_pose_score=0.1)

                cropped_img = cv2.resize(cropped_img, (100,100))
                keypoint_coords = np.reshape(keypoint_coords[0],(34))

                _push_window(img_window, np.asarray(cropped_img))
                _push_window(pose_window, np.asarray(keypoint_coords))
                # Scalars are duplicated so each of these inputs has two features per frame.
                _push_window(speed_window, np.asarray([vehicle_annotations[frame_id]["OBD_speed"]]*2))
                _push_window(action_window, np.asarray([actions[i]]*2))
                _push_window(bbox_window, np.asarray(bboxes[i]))

                # Windows advance on every frame, samples are emitted on even ones only.
                if i % 2 != 0:
                    continue

                cropped_imgs.append(_window_sequence(img_window, axis=2))  # (100*100*16*3)
                poses.append(_window_sequence(pose_window, axis=0))
                speed.append(_window_sequence(speed_window, axis=0))
                action_list.append(_window_sequence(action_window, axis=0))
                bbox_coords.append(_window_sequence(bbox_window, axis=0))

            video_bboxes = video_bboxes + cropped_imgs
            video_poses = video_poses + poses
            video_speed = video_speed + speed
            video_action = video_action + action_list
            video_bbox_coords = video_bbox_coords + bbox_coords

            # Labels follow the same every-second-frame sampling; -1 becomes 1.
            video_labels = video_labels + [1 if label == -1 else label for label in labels[::2]]
    finally:
        # Always give the decoder handle back, also when a frame fails.
        video.release()

    print(f"parsed {video_id}", flush=True)

    return video_bboxes, video_labels, video_poses, video_speed, video_action, video_bbox_coords

## Step 10: Compile the model
* 100x100x16x3 input (16 consecutive 100x100 images of pedestrians)
* 16x34 pose input
* 16x2 speed input
* 16x2 action input
* 16x4 BBox coords input
* Evaluation metrics for accuracy, precision, recall, AUC and F1 score.

In [None]:
# Inputs: image sequence, pose, speed, action and bounding-box sequences; 2 output classes.
model = DenseNet3D_3((100, 100, 16, 3), (16, 34), (16, 2), (16, 2), (16, 4), 2)

model.compile(
    loss=keras.losses.categorical_crossentropy,
    optimizer = tf.keras.optimizers.RMSprop(learning_rate=1e-4),
    metrics=['accuracy', Precision(name='precision'), Recall(name='recall'), AUC(name='auc'), F1Score(name='f1score')]
)
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

## Run to train the model
Set *curr_set* and *curr_video* to desired starting set and video.
This trains the model video by video till the end of the set to save RAM.

In [None]:
# Number of videos in set01..set06.
video_num_per_set = [4, 3, 19, 16, 2, 9]
curr_set = 1
curr_video = 1

# Train video by video, from curr_video to the last video of curr_set.
for i in range(curr_video,video_num_per_set[curr_set-1]+1):

  if i!=curr_video:
    # Drop EVERY array of the previous video (all five inputs, both splits)
    # before the next video is parsed, so two videos never sit in RAM together.
    del (X_train, X_val, X_train2, X_val2, X_train3, X_val3,
         X_train4, X_val4, X_train5, X_val5, y_train, y_val,
         dataset_bboxes, dataset_labels, dataset_poses, dataset_speed,
         dataset_actions, dataset_bbox_coords)
  gc.collect()

  video_id = f"video_{i:04d}"
  (dataset_bboxes, dataset_labels, dataset_poses, dataset_speed,
   dataset_actions, dataset_bbox_coords) = parse_video(f'set0{curr_set}', video_id)
  gc.collect()

  print(dataset_labels)

  # train_test_split returns a train/validation pair per input array, in input order.
  X_train, X_val, y_train, y_val, X_train2, X_val2, X_train3, X_val3, X_train4, X_val4, X_train5, X_val5 = train_test_split(np.array(dataset_bboxes), np.array(dataset_labels), np.array(dataset_poses), np.array(dataset_speed), np.array(dataset_actions), np.array(dataset_bbox_coords), test_size=0.2, random_state=42)

  y_train=tf.one_hot(y_train,depth=2)
  y_val=tf.one_hot(y_val,depth=2)

  for split in (X_train, X_train2, X_train3, X_train4, X_train5):
    print(split.shape)

  print(y_train.shape)

  # Add a checkpoint callback to store the checkpoint that has the highest
  # validation accuracy.
  checkpoint_path = f"weights{curr_set}v{i}.best.hdf5.keras"
  checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_path,
                              monitor='val_accuracy',
                              verbose=1,
                              save_best_only=True,
                              mode='max')
  # NOTE(review): patience equals the number of epochs, so early stopping
  # cannot trigger with the current settings.
  earlystopping = keras.callbacks.EarlyStopping(monitor='val_accuracy',
                                                patience=20)

  # Train the model
  history = model.fit([X_train, X_train2, X_train3, X_train4, X_train5], y_train,
                      batch_size=16,
                      epochs=20,
                      validation_data=([X_val, X_val2, X_val3, X_val4, X_val5], y_val),
                      callbacks=[checkpoint, earlystopping])

  # Save architecture (JSON) and weights twice: in the working directory and on Google Drive.
  model_json = model.to_json()
  for prefix in ("", "/content/drive/My Drive/"):
    with open(f"{prefix}densenet_model_11 (s{curr_set}v{i}).json", "w") as json_file:
        json_file.write(model_json)
    model.save_weights(f"{prefix}densenet_weights_11 (s{curr_set}v{i}).weights.h5")

## Run to evaluate the model on the 6th set

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# Confusion matrix accumulated over all evaluated videos (rows: true 0/1, columns: predicted 0/1).
total_cm = np.array([[0,0],[0,0]])
for j in range (1,10):
  if(j!=1):
    # Free the previous video's arrays before the next one is parsed.
    del X_test, X_test2, X_test3, X_test4, X_test5, y_test, dataset_bboxes, dataset_labels, dataset_poses, dataset_speed, dataset_action, dataset_bbox_coords
    gc.collect()

  video_id = f"video_{j:04d}"
  (dataset_bboxes, dataset_labels, dataset_poses, dataset_speed,
   dataset_action, dataset_bbox_coords) = parse_video('set06', video_id)
  gc.collect()

  # summary() prints the table itself and returns None.
  model.summary()

  # Evaluate the model
  X_test = np.array(dataset_bboxes)
  X_test2 = np.array(dataset_poses)
  X_test3 = np.array(dataset_speed)
  X_test4 = np.array(dataset_action)
  X_test5 = np.array(dataset_bbox_coords)
  y_test = tf.one_hot(np.array(dataset_labels),depth=2)
  results = model.evaluate([X_test,X_test2,X_test3,X_test4,X_test5], y_test , verbose=1)
  # results: loss followed by the compiled metrics (accuracy, precision, recall, AUC, F1).
  keras_summary = "Test Loss: {}, Test Accuracy: {}, Test Precision: {}, Test Recall: {}, Test AUC: {}".format(results[0], results[1], results[2], results[3], results[4])
  print(keras_summary)

  # Persist the Keras metrics right away so they survive a failure further
  # down; the file is rewritten with the full report at the end of the iteration.
  report_path = f'/content/drive/My Drive/pie model 11 accuracy {j}.txt'
  with open(report_path, 'w') as file:
    file.write(keras_summary)
    file.write("\nF1 Score: {}\n".format(results[5]))

  # Predict class probabilities for the test set and threshold them at 0.5;
  # column 1 is the positive class.
  y_probs = model.predict([X_test,X_test2,X_test3,X_test4,X_test5])
  y_pred = (np.array(y_probs) > 0.5).astype('int32')  # convert probabilities to binary predictions
  y_pred = [x[1] for x in y_pred]

  # Same -1 -> 1 mapping as used for the training labels.
  dataset_labels = [1 if label == -1 else label for label in dataset_labels]

  # Calculate F1 Score
  print(y_pred)
  print(dataset_labels)
  f1 = f1_score(dataset_labels, y_pred)
  print("F1 Score:", f1)

  # labels=[0, 1] forces a 2x2 matrix even when a video contains a single
  # class; a 1x1 matrix would otherwise broadcast into every cell of total_cm.
  cm = confusion_matrix(dataset_labels, y_pred, labels=[0, 1])
  print("Confusion Matrix:\n", cm)

  total_cm = np.add(total_cm,cm)

  accuracy = accuracy_score(dataset_labels, y_pred)
  precision = precision_score(dataset_labels, y_pred, zero_division=1)
  recall = recall_score(dataset_labels, y_pred, zero_division=1)

  print("Accuracy:", accuracy)
  print("Precision:", precision)
  print("Recall:", recall)

  # Rewrite the report with the Keras metrics plus the scikit-learn metrics.
  with open(report_path, 'w') as file:
    file.write(keras_summary)
    file.write("\nF1 Score: {}\n".format(results[5]))
    file.write("Confusion Matrix:\n{}\n".format(cm))
    file.write("Accuracy: {}\n".format(accuracy))
    file.write("Precision: {}\n".format(precision))
    file.write("Recall: {}\n".format(recall))
    file.write("F1 Score: {}\n".format(f1))
    file.write("Total Confusion Matrix:\n{}\n".format(total_cm))