In [None]:
import cv2
import numpy as np
import tensorflow as tf

class VideoFeatureExtractor:
    """Cut a video into fixed-length clips and run a feature model on each clip.

    Args:
        model: Callable applied to one clip batch laid out as
            [batch, channels, frames, height, width]. Its result (or the first
            element of its result, when it returns a list/tuple) must expose
            ``.numpy()``.
        resolution: Frames are resized to ``resolution x resolution``.
        clip_length: Number of frames per clip.
        stride: Number of frames between the starts of consecutive clips.
    """

    def __init__(self, model, resolution=224, clip_length=32, stride=32):
        self.model = model
        self.resolution = resolution
        self.clip_length = clip_length
        self.stride = stride

    def sliding_window(self, arr):
        """Split ``arr`` into windows of exactly ``clip_length`` items.

        Windows start every ``stride`` items. A trailing window that would hold
        fewer than ``clip_length`` items is not produced, so the result is never
        ragged.

        Args:
            arr: Sequence supporting ``len`` and slicing (e.g. a list of frames).

        Returns:
            ``np.ndarray`` whose first axis indexes the windows; shape ``(0,)``
            when ``arr`` is shorter than ``clip_length``.
        """
        size = self.clip_length
        stride = self.stride
        # Only start positions that leave room for a full window are visited.
        # The previous check (`len(result) % clip_length`) looked at the number
        # of clips rather than the length of the last clip, so it could discard
        # a valid full clip or keep a partial one.
        result = [arr[i:i + size] for i in range(0, len(arr) - size + 1, stride)]

        # Convert once and reuse the array for logging.
        clips = np.array(result)
        print("\tSLIDED IN2",clips.shape[0],clips.shape[1:])
        return clips

    def get_video_clips(self, video_path):
        """Read every frame of ``video_path`` and return it as full-length clips.

        Frames are resized to ``resolution x resolution`` and converted from
        BGR to RGB before windowing.

        Returns:
            The array produced by ``sliding_window`` for the decoded frames.
        """
        cap = cv2.VideoCapture(video_path)
        frames = []
        try:
            while (cap.isOpened()):
                sucess, frame = cap.read()
                if not sucess: break
                frame = cv2.resize(frame, (self.resolution,self.resolution))
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(frame)
        finally:
            # Release the capture even if resizing/conversion raises.
            cap.release()
        # Same values as np.shape(frames) without stacking every frame just to log.
        frame_shape = frames[0].shape if frames else ()
        print("\tFRAMED",len(frames),frame_shape)

        # Create clips using sliding window
        clips = self.sliding_window(frames)
        return clips

    def normalize_input(self, batch_clips):
        """Standardise pixel values with per-channel mean/std scaled to [0, 255].

        The constants are broadcast over the last axis, which therefore must
        have length 3.
        """
        mean = np.array([0.485 * 255, 0.456 * 255, 0.406 * 255], dtype=np.float32)
        std = np.array([0.229 * 255, 0.224 * 255, 0.225 * 255], dtype=np.float32)
        return (batch_clips - mean) / std

    def extract_features_per_clip(self, video_path):
        """Run the model on every clip of the video.

        Returns:
            ``np.ndarray`` stacking one model output per clip along axis 0.
        """
        # Load video clips
        clips = self.get_video_clips(video_path)

        # Preprocess and extract features
        features = []
        for clip in clips:
            input_batch = clip.astype(np.float32)  # Keep pixel values in the [0, 255] range
            input_batch = self.normalize_input(input_batch)  # Normalize input using ImageNet mean and std values
            input_batch = np.expand_dims(input_batch, axis=0)  # Add batch dimension
            input_batch = np.transpose(input_batch, (0, 4, 1, 2, 3))  # Rearrange to [batch_size, channels, frames, height, width]
            feature = self.model(input_batch)
            # Some models return a list/tuple of outputs; use the first one,
            # which is what the standalone extraction cell in this notebook does.
            if isinstance(feature, (list, tuple)):
                feature = feature[0]
            features.append(feature.numpy())

        return np.array(features)

    def extract_features_per_video(self, video_path):
        """Average the per-clip features over the clip axis (axis 0)."""
        clip_features = self.extract_features_per_clip(video_path)
        video_features = np.mean(clip_features, axis=0)
        return video_features

### FEATURE EXTRACTOR USING VSWIN
- LOAD XDV TRAIN
- SPLIT EACH VIDEO INTO CLIPS OF clip_length FRAMES
- EXTRACT FEATURES WITH VSWIN TRANSFORMER
- THEN TRY TO INTERPOLATE FEATURES USING THE SULTANI METHOD SO FEATURES ARE ALL THE SAME LENGTH

In [1]:
from utils import globo , xdv
import cv2 , os
import numpy as np
import tensorflow as tf

# Load the 'train' split. The recorded output reports three arrays of 3210 entries each;
# later cells use them as video file paths (fn), labels, and per-video frame counts (tframes).
fn , labels , tframes = xdv.load_train_valdt_npy('train')

/raid/DATASETS/.zuble/vigia /raid/DATASETS/anomaly

LOADING train data (3210,) (3210,) (3210,) 


	normal 1618
	abnormal 1592


In [8]:
# Load the pretrained SavedModel used as the feature extractor in the cells below.
# NOTE(review): absolute, machine-specific path — consider deriving it from a configurable base directory.
model = tf.saved_model.load("/raid/DATASETS/.zuble/vigia/zurgb11/.pretrained/swin_tiny_patch244_window877_kinetics400_1k_1")

In [10]:
# Shared settings for the clip-extraction and feature-segmentation cells.
resolution = 224  # frames are resized to resolution x resolution in get_video_clips
channels = 3  # NOTE(review): not referenced by any visible cell
clip_length = 32  # default clip length and default stride of get_video_clips
features_per_bag = 32  # NOTE(review): presumably the segment count for fsegmentation_sultani — confirm

def get_video_clips(video_path, clip_length = clip_length , stride = clip_length):
    """Read a video and return its frames grouped into full-length clips.

    Every frame is resized to ``resolution x resolution`` (module-level
    ``resolution``) and converted from BGR to RGB.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        clip_length: Number of frames per clip.
        stride: Number of frames between the starts of consecutive clips.

    Returns:
        ``np.ndarray`` whose first axis indexes the clips. Only clips with
        exactly ``clip_length`` frames are returned; the array has shape
        ``(0,)`` when the video has fewer than ``clip_length`` frames.
    """

    def sliding_window(arr, size, stride):
        # Only start positions that leave room for a full window are visited,
        # so a short trailing clip is never produced. The previous check
        # (`len(result) % clip_length`) tested the number of clips instead of
        # the frame count of the last clip, so it could drop a valid full clip
        # or keep a partial one.
        result = [arr[i:i + size] for i in range(0, len(arr) - size + 1, stride)]

        # Convert once and reuse the array for logging.
        windows = np.array(result)
        print("\tSLIDED IN2",windows.shape[0],windows.shape[1:])
        return windows

    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        while (cap.isOpened()):
            sucess, frame = cap.read()
            if not sucess: break
            frame = cv2.resize(frame, (resolution,resolution))
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(frame)
    finally:
        # Release the capture even if resizing/conversion raises.
        cap.release()
    # Same values as np.shape(frames) without stacking every frame just to log.
    frame_shape = frames[0].shape if frames else ()
    print("\tFRAMED",len(frames),frame_shape)

    return sliding_window(frames, clip_length, stride)


In [11]:
# Find the first video with more than 1000 frames and label 1, print its details, then stop.
for i,tframe in enumerate(tframes):
    if not (tframe > 1000 and labels[i] == 1):
        continue
    print(f'{i} {os.path.basename(fn[i])}\n\tlabel {labels[i]}\n\t{tframes[i]} frames')
    break

3 Kingsman.The.Golden.Circle.2017__#02-00-02_02-01-21_label_B1-0-0.mp4
	label 1
	1898 frames


In [12]:
# Split one sample video into clips (index 3 is the entry printed by the search cell's
# recorded output) and display the shape of the resulting array.
clips = get_video_clips(fn[3])
clips.shape

	FRAMED 1898 (224, 224, 3)
	SLIDED IN2 59 (32, 224, 224, 3)


(59, 32, 224, 224, 3)

In [13]:
def normalize_input(batch_clips):
    """Standardise pixel values with per-channel mean/std scaled to [0, 255].

    The per-channel constants are broadcast over the last axis of
    ``batch_clips``, which therefore must have length 3.
    """
    channel_mean = np.array([c * 255 for c in (0.485, 0.456, 0.406)], dtype=np.float32)
    channel_std = np.array([c * 255 for c in (0.229, 0.224, 0.225)], dtype=np.float32)
    centered = batch_clips - channel_mean
    return centered / channel_std

# Run the model on each clip and collect the first element of its output as a NumPy array.
features = []
for clip in clips:
    # Cast to float32 (values stay in [0, 255]) and apply the per-channel normalisation.
    input_batch = normalize_input(clip.astype(np.float32))
    # Prepend a batch axis, then reorder to [batch_size, channels, frames, height, width].
    input_batch = np.transpose(input_batch[np.newaxis, ...], (0, 4, 1, 2, 3))
    feature = model(input_batch)[0].numpy()
    features.append(feature)

In [17]:
# Inspect the shape of the feature array extracted for the first clip.
np.shape(features[0])

(1, 768, 16, 7, 7)

In [None]:
## AnomalyDetection_CVPR18
def fsegmentation_sultani(features, features_per_bag):
    """Resample a sequence of feature vectors to a fixed number of unit-norm segments.

    ``features_per_bag + 1`` evenly spaced, rounded indices are laid over the
    input rows. Each output row is the mean of the input rows between two
    consecutive indices (both ends inclusive), divided by its L2 norm.

    Args:
        features: 2-D array-like of shape (n_vectors, feature_size); lists of
            lists are accepted.
        features_per_bag: Number of output segments.

    Returns:
        ``np.ndarray`` of shape (features_per_bag, feature_size). A segment
        whose mean vector has zero norm is reported with ``print("Error")``
        and left as zeros instead of becoming NaN.

    Raises:
        ValueError: If ``features`` is not a non-empty 2-D array.
    """
    # Convert once so list input supports the 2-D indexing used below.
    features = np.asarray(features)
    if features.ndim != 2 or features.shape[0] == 0:
        raise ValueError(
            f"features must be a non-empty 2-D array, got shape {features.shape}"
        )

    feature_size = features.shape[1]
    interpolated_features = np.zeros((features_per_bag, feature_size))
    interpolation_indicies = np.round(np.linspace(0, len(features) - 1, num=features_per_bag + 1))
    for index in range(0, len(interpolation_indicies)-1):
        print("interpolate",index)
        start = int(interpolation_indicies[index])
        end = int(interpolation_indicies[index + 1])

        assert end >= start

        if start == end: temp_vect = features[start, :]
        else: temp_vect = np.mean(features[start:end+1, :], axis=0)

        # Check the norm BEFORE dividing: dividing a zero vector by its norm
        # yields NaN, and a check made after the division can never fire.
        norm = np.linalg.norm(temp_vect)
        if norm == 0:
            print("Error")
        else:
            temp_vect = temp_vect / norm

        interpolated_features[index,:]=temp_vect
    return np.array(interpolated_features)

## tfm-anomaly-detection
def fsegmentation_tfm(features, n_segments=32):
    """Average consecutive rows of ``features`` into ``n_segments`` normalised rows.

    Segment start offsets are ``n_segments`` evenly spaced integers over
    ``[0, features.shape[0])``. Each segment is averaged over axis 0, and the
    stacked means are passed through ``sklearn.preprocessing.normalize`` with
    ``axis=1``.

    Raises:
        RuntimeError: If ``features`` has fewer rows than ``n_segments``.
    """
    import sklearn.preprocessing

    n_rows = features.shape[0]
    if n_rows < n_segments:
        raise RuntimeError("Number of prev segments lesser than expected output size")

    starts = np.linspace(0, n_rows, n_segments, dtype=int, endpoint=False)

    # Every segment except the last ends where the next one begins.
    segment_means = [
        np.mean(features[lo:hi, :], axis=0)
        for lo, hi in zip(starts[:-1], starts[1:])
    ]
    # The last segment runs to the end of the input.
    segment_means.append(np.mean(features[starts[-1]:, :], axis=0))

    return sklearn.preprocessing.normalize(np.array(segment_means), axis=1)

In [None]:
def extract_features_per_clip(video_path, model):
    """Run ``model`` on every clip of a video and stack the results.

    Each clip from ``get_video_clips`` is cast to float32, normalised with
    ``normalize_input``, given a batch axis and reordered to
    [batch_size, channels, frames, height, width] before being passed to
    ``model``.

    Args:
        video_path: Path of the video to process.
        model: Callable applied to one clip batch. Its result (or the first
            element of its result, when it returns a list/tuple) must expose
            ``.numpy()``.

    Returns:
        ``np.ndarray`` stacking one model output per clip along axis 0.
    """
    # Load video clips
    clips = get_video_clips(video_path)

    features = []
    for clip in clips:
        input_batch = clip.astype(np.float32)  # Keep pixel values in the [0, 255] range
        input_batch = normalize_input(input_batch)  # Normalize input using ImageNet mean and std values
        input_batch = np.expand_dims(input_batch, axis=0)  # Add batch dimension
        # Same channels-first layout as the other extraction code in this notebook;
        # the transpose was missing here, so the model received channels-last input.
        input_batch = np.transpose(input_batch, (0, 4, 1, 2, 3))  # Rearrange to [batch_size, channels, frames, height, width]
        feature = model(input_batch)
        # Some models return a list/tuple of outputs; use the first one,
        # which is what the standalone extraction cell in this notebook does.
        if isinstance(feature, (list, tuple)):
            feature = feature[0]
        features.append(feature.numpy())

    return np.array(features)

def extract_features_per_video(video_path, model, spatial_size=224):
    """Return the per-clip features of a video averaged over the clip axis.

    Args:
        video_path: Path of the video to process.
        model: Feature model forwarded to ``extract_features_per_clip``.
        spatial_size: Unused; kept so existing callers keep working. The frame
            size is determined inside ``get_video_clips``.

    Returns:
        ``np.ndarray``: mean of the per-clip feature array over axis 0.
    """
    # extract_features_per_clip takes (video_path, model) only; forwarding
    # spatial_size as a third positional argument raised TypeError on every call.
    clip_features = extract_features_per_clip(video_path, model)
    video_features = np.mean(clip_features, axis=0)
    return video_features

# NOTE(review): this loads the same SavedModel path as an earlier cell; reuse that `model`
# if it is still in scope. The absolute path is machine-specific.
model = tf.saved_model.load("/raid/DATASETS/.zuble/vigia/zurgb11/.pretrained/swin_tiny_patch244_window877_kinetics400_1k_1")

# Compute the clip-averaged feature array for the training video at index 1.
vf = extract_features_per_video(fn[1] , model)