In [1]:
from transformers import CLIPProcessor, CLIPModel
import torch



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pretrained CLIP model and its matching processor.
# Both are created from the same checkpoint identifier.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

In [5]:
import cv2
import numpy as np
from PIL import Image
import librosa

def load_video(video_path, frame_skip=30):
    """Sample frames from a video file and return them as RGB PIL images.

    Args:
        video_path: Path of the video file, handed to ``cv2.VideoCapture``.
        frame_skip: Sampling interval. A frame is kept when its zero-based
            index is a multiple of this value. Must be at least 1.

    Returns:
        A list of ``PIL.Image`` objects in reading order. The list is empty
        when no frame could be read (the read loop never runs if the capture
        does not report itself as opened).

    Raises:
        ValueError: If ``frame_skip`` is smaller than 1.
    """
    # Validate before touching the file: a zero interval would otherwise
    # surface as a ZeroDivisionError from the modulo below.
    if frame_skip < 1:
        raise ValueError(f"frame_skip must be >= 1, got {frame_skip!r}")

    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % frame_skip == 0:
                # OpenCV decodes to BGR; convert to RGB before building the PIL image.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(frame_rgb))
            frame_count += 1
    finally:
        # Release the capture handle even if a conversion above raises.
        cap.release()
    return frames

def load_audio(audio_path):
    """Load an audio file and return its mel spectrogram on a decibel scale.

    Args:
        audio_path: Path of the audio file, handed to ``librosa.load``.

    Returns:
        The result of ``librosa.power_to_db`` applied to a 128-band mel
        spectrogram (``fmax=8000``), using the spectrogram maximum as the
        reference value.
    """
    waveform, sample_rate = librosa.load(audio_path)
    mel_power = librosa.feature.melspectrogram(
        y=waveform, sr=sample_rate, n_mels=128, fmax=8000
    )
    return librosa.power_to_db(mel_power, ref=np.max)




In [4]:
# !pip install librosa

Collecting librosa
  Downloading librosa-0.10.2.post1-py3-none-any.whl (260 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m260.1/260.1 KB[0m [31m79.6 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting soundfile>=0.12.1
  Downloading soundfile-0.12.1-py2.py3-none-manylinux_2_31_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m272.2 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting soxr>=0.3.2
  Downloading soxr-0.3.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hCollecting pooch>=1.1
  Downloading pooch-1.8.2-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.6/64.6 KB[0m [31m566.9 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting audioread>=2.1.9
  Downloading audioread-

In [6]:
# Assume we have a list of video files and a list of audio files.
# The two lists are paired by position: entry i of one belongs to entry i of the other.
video_files = ["data/Bald_Headed_Eagle_catches_salmon.mp4"]
audio_files = ["data/file_example_WAV_1MG.wav"]
if len(video_files) != len(audio_files):
    # The stacked tensors are later combined sample-by-sample, so the counts must match.
    raise ValueError(
        f"Expected as many audio files as video files, got "
        f"{len(video_files)} video(s) and {len(audio_files)} audio file(s)"
    )
videos = [load_video(video) for video in video_files]
audios = [load_audio(audio) for audio in audio_files]

video_embeddings = []
audio_embeddings = []

for video_file, video in zip(video_files, videos):
    if not video:
        # load_video returns an empty list when no frame could be read;
        # fail here with the file name instead of deep inside the processor.
        raise ValueError(f"No frames could be read from {video_file!r}")
    inputs = processor(images=video, return_tensors="pt", padding=True)
    with torch.no_grad():
        embeddings = model.get_image_features(**inputs)
    # Average the per-frame embeddings to get one vector per video
    video_embeddings.append(embeddings.mean(dim=0))

for audio in audios:
    # No learned audio encoder is used: the flattened spectrogram itself is the audio vector.
    audio_embeddings.append(torch.from_numpy(audio.flatten()))

# torch.stack needs equally sized entries.
# NOTE(review): audio clips of different duration presumably give different sizes; confirm before adding files.
video_embeddings = torch.stack(video_embeddings)
audio_embeddings = torch.stack(audio_embeddings)

In [8]:
# Inspect the dimensions of the stacked audio tensor
audio_embeddings.shape

torch.Size([1, 32768])

In [32]:
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# Define the model
class MIModel(nn.Module):
    """Single-layer LSTM followed by a linear projection.

    Every input vector is fed to the LSTM as a sequence of length one; the
    final hidden state is projected to ``output_size`` values.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers.

        Args:
            input_size: Number of features in each input vector.
            hidden_size: Size of the LSTM hidden state.
            output_size: Number of values produced per input vector.
        """
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Project each input vector to an output vector.

        Args:
            x: Tensor whose total number of elements is a multiple of
                ``input_size``, e.g. ``(input_size,)`` or
                ``(batch, input_size)``.

        Returns:
            Tensor of shape ``(batch, output_size)``, where ``batch`` is the
            number of ``input_size``-long vectors contained in ``x``.
        """
        # Lay the data out as (seq_len=1, batch, input_size). For a single
        # vector this equals the former view(1, 1, -1); larger batches are now
        # kept as separate rows instead of being flattened into one vector.
        sequence = x.reshape(1, -1, self.lstm.input_size)
        _, (hidden, _) = self.lstm(sequence)
        # hidden is (num_layers, batch, hidden_size); the LSTM has one layer.
        return self.fc(hidden[0])

In [34]:
# Create the model: input width follows the video embeddings,
# output width follows the audio vectors.
model_mi = MIModel(
    input_size=video_embeddings.size(1),
    hidden_size=128,
    output_size=audio_embeddings.size(1),
)

# Define the loss function
criterion = nn.MSELoss()

# Define the optimizer
optimizer = torch.optim.Adam(model_mi.parameters())

# Create the dataset and the dataloader (one shuffled sample per step)
dataset = TensorDataset(video_embeddings, audio_embeddings)
dataloader = DataLoader(dataset, shuffle=True, batch_size=1)

In [11]:
# Train the model
epochs = 10
# Make sure the model is in training mode even if this cell is re-run after an evaluation step.
model_mi.train()
for epoch in range(epochs):
    # The dataset was built as TensorDataset(video_embeddings, audio_embeddings),
    # so each item is (video embedding, audio vector) in that order:
    # the video embedding is the model input and the audio vector is the target.
    for video_batch, audio_target in dataloader:
        optimizer.zero_grad()
        output = model_mi(video_batch)
        loss = criterion(output, audio_target)
        loss.backward()
        optimizer.step()

    # Reports the loss of the last batch processed in this epoch
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

Epoch 1/10, Loss: 0.25863176584243774
Epoch 2/10, Loss: 0.24095959961414337
Epoch 3/10, Loss: 0.22759336233139038
Epoch 4/10, Loss: 0.215243399143219
Epoch 5/10, Loss: 0.20388157665729523
Epoch 6/10, Loss: 0.19346177577972412
Epoch 7/10, Loss: 0.18392232060432434
Epoch 8/10, Loss: 0.17519310116767883
Epoch 9/10, Loss: 0.1672051101922989
Epoch 10/10, Loss: 0.15989616513252258


In [35]:
import torch
import torch.nn as nn 

# ... (definizione del modello linguistico e altri parametri)

def calcola_entropia(modello, dati_minacce):
    """Compute the mean entropy of the model's predicted distributions.

    The model output is treated as logits: a softmax over dimension 1 turns
    each row into a probability distribution, the Shannon entropy (natural
    logarithm) of every row is computed, and the row entropies are averaged.

    Args:
        modello: The model to evaluate. It is switched to evaluation mode
            and left in that mode.
        dati_minacce: A tensor containing the threat vectors fed to the model.

    Returns:
        A zero-dimensional tensor holding the mean entropy.
    """
    # Put the model in evaluation mode
    modello.eval()

    # Predict without tracking gradients. log_softmax is used instead of
    # log(softmax(...)): when a probability underflows to zero the latter
    # evaluates 0 * log(0) = NaN, whereas here the log stays finite.
    with torch.no_grad():
        predizioni = modello(dati_minacce)
        log_prob = torch.log_softmax(predizioni, dim=1)

    # Entropy per row, then the mean over rows
    prob = log_prob.exp()
    entropia = -torch.sum(prob * log_prob, dim=1).mean()

    return entropia

In [36]:
# Create the random threat dataset: uniform noise with one row per video
# embedding and the same number of columns as the embeddings.
dati_minacce_casuali = torch.rand(
    video_embeddings.shape[0],
    video_embeddings.shape[1],
)

# Create the real threat dataset: the video embeddings themselves
dati_minacce = video_embeddings

In [39]:
# Compute the entropy H(S) from the predictions on the random threat vectors
entropia_s = calcola_entropia(modello=model_mi, dati_minacce=dati_minacce_casuali)

# Compute the conditional entropy H(S|T) from the predictions on the real threat vectors
entropia_s_t = calcola_entropia(modello=model_mi, dati_minacce=dati_minacce)

In [42]:
# Difference between the two entropies, expressed as a percentage of entropia_s
(entropia_s - entropia_s_t) / entropia_s *100

tensor(-0.0075)

In [None]:
# 4.2 Entropy Calculation
    # Using the trained model, generate alarm signal predictions
    # using a set of random threat vectors (or real threat vectors)
    # and calculate the entropy of the probability distribution over the predicted signals.
# 4.3 Conditional Entropy Calculation
    # Using the trained model, generate alarm signal predictions
    # using the real dataset of threat-alarm signal pairs
    # and calculate the conditional entropy of the probability distribution over the predicted signals.
# 4.4 MI Calculation
    # Subtract the conditional entropy from the entropy to obtain the MI.