Create dataset for downloaded videos

In [15]:
import pandas as pd
import seaborn as sns
import matplotlib as plt
import os
from EDA.eda_functions import get_label

In [16]:
ijmond_df = pd.read_json('data/datasets/ijmond_dataset.json')
#predict label for each video in ijmond dir
files = os.listdir("data/ijmond_videos/")

# Extract file names without extensions
file_names = [os.path.splitext(file)[0] for file in files]

# Filter the ijmond DataFrame to include only rows with matching file names
filtered_ijmond_df = ijmond_df[ijmond_df['file_name'].isin(file_names)]

In [18]:
filtered_ijmond_df['label'] = filtered_ijmond_df.apply(get_label, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_ijmond_df['label'] = filtered_ijmond_df.apply(get_label, axis = 1)


In [19]:
filtered_ijmond_df['label'].value_counts()

1    13
0    13
Name: label, dtype: int64

In [None]:
#Save the created dataset
filtered_ijmond_df.to_json("data/datasets/experimental_ijmond_dataset.json", orient='records')

Create scrpit that can transform videos to images and use clip on them

In [10]:
import os
import clip
import torch
import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import json
import cv2
from PIL import Image
from EDA.eda_functions import get_label

# Define device
if torch.cuda.is_available():
    device = torch.device("cuda") # use CUDA device
elif torch.backends.mps.is_available():
    device = torch.device("mps") # use MacOS GPU device (e.g., for M2 chips)
else:
    device = torch.device("cpu") # use CPU device
device

#Load the model
model, preprocess = clip.load('ViT-B/32', device)

#Define class names in a list - it needs prompt engineering
class_names = ["a photo of industrial plants emiting smoke from chimney", "a photo of industrial plants with no smoke above chimney"]

#Function to create an image from the video
def preprocess_video_to_image(video_path):
    # Open the video file
    video = cv2.VideoCapture(video_path)
    frames = []
    if not video.isOpened():
        print("Error: Could not open video file")
    else:
        while True:
            ret, frame = video.read()
            if not ret:
                break
            frames.append(frame)
        video.release()
    # Concatenate frames horizontally to create a single image
    concatenated_frame = np.concatenate(frames, axis=1)
    return concatenated_frame

#func to get the true label of the video
def get_true_label(file_name):
    row = ijmond_df[ijmond_df['file_name'] == file_name].iloc[0]
    return get_label(row)

#function to gwt label of the image produced from the video
def vanilla_clip(video_path):
    # Preprocess video into a single image
    video_image = preprocess_video_to_image(video_path)
    
    # Read image and preprocess
    image = preprocess(Image.fromarray(video_image)).unsqueeze(0).to(device)

    # Prepare text inputs based on class names list
    text_inputs = clip.tokenize(class_names).to(device)

    # Calculate features
    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text_inputs)

    # Calculate similarity
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
    #values are the probabilities, indices are the classes
    values, indices = similarity[0].topk(2)

    # Print predictions
    print(f"\nPredictions for the entire video:\n")
    for value, index in zip(values, indices):
        print(f"{class_names[index]:>16s}: {100 * value.item():.2f}%")

#predict label for each video in ijmond dir
files = os.listdir("data/ijmond_videos/")
for file in files:
    video_path = f"data/ijmond_videos/{file}"
    file_name = file.split('.')[0]
    vanilla_clip(video_path)
    get_true_label(file_name)




Predictions for the entire video:

a photo of industrial plants with no smoke above chimney: 92.43%
a photo of industrial plants emiting smoke from chimney: 7.59%

Predictions for the entire video:

a photo of industrial plants with no smoke above chimney: 91.99%
a photo of industrial plants emiting smoke from chimney: 8.04%

Predictions for the entire video:

a photo of industrial plants with no smoke above chimney: 92.33%
a photo of industrial plants emiting smoke from chimney: 7.70%

Predictions for the entire video:

a photo of industrial plants with no smoke above chimney: 85.60%
a photo of industrial plants emiting smoke from chimney: 14.42%

Predictions for the entire video:

a photo of industrial plants with no smoke above chimney: 80.57%
a photo of industrial plants emiting smoke from chimney: 19.43%

Predictions for the entire video:

a photo of industrial plants with no smoke above chimney: 87.35%
a photo of industrial plants emiting smoke from chimney: 12.59%

Predictions 

In [16]:
import os
import clip
import torch
import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import json
import cv2
from PIL import Image
from EDA.eda_functions import get_label

# Define device
if torch.cuda.is_available():
    device = torch.device("cuda") # use CUDA device
elif torch.backends.mps.is_available():
    device = torch.device("mps") # use MacOS GPU device (e.g., for M2 chips)
else:
    device = torch.device("cpu") # use CPU device
device

#Load the model
model, preprocess = clip.load('ViT-B/32', device)

#Define class names in a list - it needs prompt engineering
class_names = ["a photo of industrial plants emiting smoke from chimney", "a photo of industrial plants with no smoke above chimney"]

#Function to create an image from the video
def preprocess_video_to_image_grid_version(video_path, num_rows=6, num_cols=6):
    # Open the video file
    video = cv2.VideoCapture(video_path)
    frames = []
    if not video.isOpened():
        print("Error: Could not open video file")
    else:
        while True:
            ret, frame = video.read()
            if not ret:
                break
            frames.append(frame)
        video.release()
    
        # Split frames into grid
    grid = []
    for i in range(num_rows):
        row = np.concatenate(frames[i * num_cols: (i + 1) * num_cols], axis=1)
        grid.append(row)
    
    # Concatenate grid vertically to create a single image
    concatenated_frame = np.concatenate(grid, axis=0)
    return concatenated_frame

#func to get the true label of the video
def get_true_label(file_name):
    row = ijmond_df[ijmond_df['file_name'] == file_name].iloc[0]
    return get_label(row)

#function to gwt label of the image produced from the video
def vanilla_clip(video_path):
    # Preprocess video into a single image
    video_image = preprocess_video_to_image_grid_version(video_path)
    # Convert numpy array to PIL Image
    #image = Image.fromarray(video_image)
    # Save the image
    #image.save('square_image.png')
    
    # Read image and preprocess
    image = preprocess(Image.fromarray(video_image)).unsqueeze(0).to(device)

    # Prepare text inputs based on class names list
    text_inputs = clip.tokenize(class_names).to(device)

    # Calculate features
    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text_inputs)

    # Calculate similarity
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)
    similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
    #values are the probabilities, indices are the classes
    values, indices = similarity[0].topk(2)

    # Print predictions
    print(f"\nPredictions for the entire video {video_path}:\n")
    for value, index in zip(values, indices):
        print(f"{class_names[index]:>16s}: {100 * value.item():.2f}%")

#predict label for each video in ijmond dir
files = os.listdir("data/ijmond_videos/")
for file in files:
    video_path = f"data/ijmond_videos/{file}"
    file_name = file.split('.')[0]
    vanilla_clip(video_path)
    get_true_label(file_name)

#test:
#vanilla_clip('data/ijmond_videos/5PurGkmy0aw-1.mp4')



Predictions for the entire video data/ijmond_videos/5PurGkmy0aw-1.mp4:

a photo of industrial plants with no smoke above chimney: 95.90%
a photo of industrial plants emiting smoke from chimney: 4.15%


In [6]:
import os
import clip
import torch
import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import json
import cv2
import pandas as pd

# Define device
if torch.cuda.is_available():
    device = torch.device("cuda") # use CUDA device
elif torch.backends.mps.is_available():
    device = torch.device("mps") # use MacOS GPU device (e.g., for M2 chips)
else:
    device = torch.device("cpu") # use CPU device
device

#Load the model
model, preprocess = clip.load('ViT-B/32', device)
ijmond_df = pd.read_json('data/datasets/ijmond_dataset.json')

#Define class names in a list - it needs prompt engineering
class_names = ["a photo of industrial plants emiting smoke from chimney", "a photo of industrial plants with no smoke above chimney"]

def preprocess_video_to_image(video_path):
    # Open the video file
    video = cv2.VideoCapture(video_path)
    frames = []
    if not video.isOpened():
        print("Error: Could not open video file")
    else:
        while True:
            ret, frame = video.read()
            if not ret:
                break
            frames.append(frame)
        video.release()
    # Concatenate frames horizontally to create a single image
    concatenated_frame = np.concatenate(frames, axis=1)
    return concatenated_frame

# Define the dataset class
class MyDataset(Dataset):
    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        file_name = self.data[idx]['file_name']
        label = int(self.data[idx]['label'])  # Convert label to integer
        video_path = os.path.join('data/ijmond_videos', f"{file_name}.mp4")
        video_frames = preprocess_video_to_image(video_path)
        if self.transform:
            video_frames = self.transform(video_frames)
        return video_frames, label

# Load the dataset
with open('data/datasets/experimental_ijmond_dataset.json', 'r') as f:
    dataset = json.load(f)

# Split the dataset into train and test sets
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

# Load the dataset
train_dataset = MyDataset(train_data, transform=preprocess)
test_dataset = MyDataset(test_data, transform=preprocess)

def get_features(dataset):
    all_features = []
    all_labels = []
    
    with torch.no_grad():
        for videos, labels in tqdm(DataLoader(dataset, batch_size=1)):
            features = model.encode_image(videos.to(device))

            all_features.append(features)
            all_labels.append(labels)

    return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()

# Calculate the video features
train_features, train_labels = get_features(train_dataset)
test_features, test_labels = get_features(test_dataset)

# Perform logistic regression
classifier = LogisticRegression(random_state=0, C=0.316, max_iter=1000, verbose=1)
classifier.fit(train_features, train_labels)

# Evaluate using the logistic regression classifier
predictions = classifier.predict(test_features)
accuracy = np.mean((test_labels == predictions).astype(float)) * 100.
print(f"Accuracy = {accuracy:.3f}")


  0%|          | 0/20 [00:00<?, ?it/s]


TypeError: Unexpected type <class 'numpy.ndarray'>

In [17]:

import clip

clip.available_models()

['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [19]:

model, preprocess = clip.load("ViT-B/32")
model.eval()
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

Model parameters: 151,277,313
Input resolution: 224
Context length: 77
Vocab size: 49408


In [21]:

model, preprocess = clip.load("ViT-L/14")
model.eval()
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

Model parameters: 427,616,513
Input resolution: 224
Context length: 77
Vocab size: 49408


In [22]:

model, preprocess = clip.load("ViT-B/16")
model.eval()
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

100%|███████████████████████████████████████| 335M/335M [00:28<00:00, 12.4MiB/s]


Model parameters: 149,620,737
Input resolution: 224
Context length: 77
Vocab size: 49408
