To-do:
- Create the output (generated features) directory if it does not exist

### Marlin Preprocessing

The code reads video files, processes them using a Marlin model to extract features, and saves these features as PyTorch (.pt) files while maintaining logs of processed files and errors. \
\
Currently, loading the model from a local file does not work, so the model is downloaded online at run time instead.

### Input and Output
Input files:
- Videos: in `marlin_preprocessing/chunks` folder, with `mp4` videos
- Model: in `marlin_preprocessing/marlin_models` folder, with `pt` format [[Marlin models download here](https://github.com/ControlNet/MARLIN/releases)]

Output files:
- Marlin feature files: `pt` files, in `marlin_preprocessing/marlin_features_large`

In [None]:
from marlin_pytorch import Marlin
import torch.multiprocessing as mp

from tqdm import tqdm
import torch
import os
import sys
import pandas as pd

# Root folder of the preprocessing workspace; video paths are joined onto it.
base_path = "marlin_preprocessing"

# Marlin model size to use: one of 'small', 'base' or 'large'.
marlin_feature_type = "large"

In [None]:
def read_file(path):
    """Read a text file and return its lines without trailing newlines.

    Args:
        path: Path of the text file to read.

    Returns:
        A list with one string per line. An empty list is returned when the
        file cannot be opened or read (e.g. it does not exist yet), so a
        missing log file is treated as "nothing recorded so far".
    """
    try:
        with open(path) as f:
            # Iterating the handle yields the same lines as readlines().
            return [line.rstrip('\n') for line in f]
    except OSError:
        # Only I/O failures are tolerated; KeyboardInterrupt and genuine
        # programming errors are no longer silently swallowed.
        return []

def log(path, content):
    """Append ``content`` followed by a newline to the file at ``path``.

    The file is opened in append mode, so it is created when missing and
    earlier entries are kept.
    """
    with open(path, 'a') as handle:
        handle.writelines((content, '\n'))
        
def load_model(feature_type):
    """Return the Marlin model named ``marlin_vit_<feature_type>_ytf``.

    The model is obtained through ``Marlin.from_online``.
    """
    # Loading from a local checkpoint is disabled for now because it was not
    # working; the local variant was:
    #   Marlin.from_file(f"marlin_vit_{feature_type}_ytf", f"marlin_models/marlin_vit_{feature_type}_ytf.encoder.pt")
    model_name = f"marlin_vit_{feature_type}_ytf"
    return Marlin.from_online(model_name)

def _strip_pt_suffix(filename):
    """Return ``filename`` without a trailing '.pt' extension, if it has one."""
    suffix = '.pt'
    # str.strip('.pt') would remove any of the characters '.', 'p', 't' from
    # BOTH ends (e.g. 'test.mp4.pt' -> 'es.mp4'), so cut the suffix explicitly.
    return filename[:-len(suffix)] if filename.endswith(suffix) else filename


def _pending_videos(todo_names, processed, feature_files):
    """Return the 'chunks/<name>' entries that still need feature extraction.

    Entries already listed in ``processed`` or that already have a saved
    feature file are dropped. Blank names are skipped, duplicates are removed,
    and the order of ``todo_names`` is preserved.
    """
    done = set(processed)
    done.update('chunks/' + _strip_pt_suffix(f) for f in feature_files)
    candidates = ('chunks/' + name for name in todo_names if name)
    return list(dict.fromkeys(v for v in candidates if v not in done))


def main(marlin_feature_type, rank):
    """Extract Marlin features for every pending video of one work list.

    Reads the video names from ``todo<rank>.txt``, skips those already logged
    in ``<marlin_feature_type>_processed_<rank>.txt`` or already present in
    ``marlin_features_<marlin_feature_type>/``, and saves one ``.pt`` feature
    file per remaining video. Successes and failures are appended to the
    processed and error log files respectively.

    Args:
        marlin_feature_type: Model size passed to ``load_model``; also used in
            the output folder and log file names.
        rank: Identifier used to select the todo list and name the log files.
    """
    model = load_model(marlin_feature_type)
    model = model.cuda()

    # NOTE(review): the feature folder, todo list and logs are resolved
    # relative to the current working directory, while videos are looked up
    # under base_path -- confirm this is the intended layout.
    feature_dir = f'marlin_features_{marlin_feature_type}'
    processed_log = f'{marlin_feature_type}_processed_{rank}.txt'
    error_log = f'{marlin_feature_type}_errors_{rank}.txt'

    # Create the output folder up front so listing it and saving into it
    # cannot fail on a fresh checkout.
    os.makedirs(feature_dir, exist_ok=True)

    todo_names = read_file(f'todo{rank}.txt')
    processed = read_file(processed_log)
    todo = _pending_videos(todo_names, processed, os.listdir(feature_dir))

    for vname in tqdm(todo):
        try:
            path = os.path.join(base_path, vname)
            print(path)
            features = model.extract_video(path, crop_face=True)

            # Save the features under the video's file name plus '.pt'.
            torch.save(features, f"{feature_dir}/{vname.split('/')[-1]}.pt")
            log(processed_log, vname)
        except Exception as e:
            # Best effort: record the failing video and continue with the rest.
            print(e)
            log(error_log, vname)
            
if __name__ == '__main__':
    # Command-line arguments are captured but not parsed.
    args = sys.argv
    # Use the module-level setting instead of a hard-coded 'large', so that
    # changing marlin_feature_type actually changes the model being run.
    main(marlin_feature_type, 'ESC')