In [1]:
from pathlib import Path
import datetime
import re
from my_videolibrary.mkv_enzyme import MkvFile

In [2]:

DATA_DIR = Path("//OnzeNas/FILM_Eigen/")

In [3]:
FILENAME_PATTERN = re.compile("^(?P<title>.+?) \((?P<year>\d{4})\).mkv$")

In [8]:
def parse_track(track, movie_id):
    d = track.__dict__
    d["movie_id"] = movie_id
    if "duration" in d:
        d["duration"] = str(d["duration"]).split(".")[0]
    return {key: value for key, value in d.items() if value is not None}


def parse_tracks(tracks, movie_id):
    return [parse_track(track, movie_id) for track in tracks]


def parse_info(movie_file: Path, movie_info: MkvFile):
    movie_name = movie_file.name
    movie_id = hash(f"{movie_name} + {str(movie_info.info)}")
    match = FILENAME_PATTERN.match(movie_name)
    if match:
        name, year = FILENAME_PATTERN.match(movie_name).groups()
    else:
        print(f"{movie_name} does not match the pattern")
        name, year = movie_name, None
    info = dict(name=name, year=year, location=str(movie_file))
    info.update(parse_track(movie_info.info, movie_id))

    video_info = parse_tracks(movie_info.video_tracks, movie_id)
    if len(video_info) != 1:
        raise ValueError(f"more than 1 videotrack for {movie_name}")
    info.update(video_info[0])
        
    audio_info = parse_tracks(movie_info.audio_tracks, movie_id)
    sub_info = parse_tracks(movie_info.subtitle_tracks, movie_id)
    return dict(
        info=info,
        audio_info=audio_info,
        sub_info=sub_info,
    )

In [5]:
import pymongo
from pymongo import MongoClient


In [6]:
def insert_non_existing(collection: pymongo.collection.Collection, document):
    if collection.find_one(document):
        return None
    return collection.insert_one(document).inserted_id

In [None]:
DATA_DIR = Path("//OnzeNas/FILM_Eigen/")
# DATA_DIR = DATA_DIR / "Madagascar Collection"
movies = list(DATA_DIR.glob("**/*.mkv"))
movies_info = {
    file: MkvFile(file=file)
    for file in movies
}

In [59]:
with MongoClient() as client:
    db = client.my_videos
    collection_movies = db.movies
    collection_videos = db.video_tracks
    collection_audio = db.audio_tracks
    collection_sub = db.subtitle_tracks
    for movie_name, movie_info in movies_info.items():
        print(movie_name.name)
        info = parse_info(movie_name, movie_info)
        insert_non_existing(collection_movies, info["info"])
        for info_track in info["audio_info"]:
            insert_non_existing(collection_audio, info_track)
        for info_track in info["sub_info"]:
            insert_non_existing(collection_sub, info_track)

West Side Story (1961).mkv
3 -10 to Yuma (2007).mkv
Ben-Hur (1959).mkv
The Lord of the Rings - The Two Towers (2002).mkv
The Lord of the Rings - The Fellowship of the Ring (2001).mkv
The Lord of the Rings - The Return of the King (2003).mkv
C'est arrivé pres de chez vous (1992).mkv
Fight Club (1999).mkv
Walk the Line (2005).mkv
There Will Be Blood (2007).mkv
Taxi Driver (1976).mkv
The Hidden Fortress (1958).mkv
Cars (2006).mkv
The Shining (1980).mkv
Waltz with Bashir (2008).mkv
Eyes Wide Shut (1999).mkv
Dog Day Afternoon (1975).mkv
Seven Samurai (1954).mkv
American Gangster (2007).mkv
The Great Gatsby (2013).mkv
Full Metal Jacket (1987).mkv
Steve + Sky (2004).mkv
Spring, Summer, Fall, Winter... and Spring (2003).mkv
Flags of Our Fathers (2006).mkv
Se7en (1995).mkv
Ratatouille (2007).mkv
House of Flying Daggers (2004).mkv
Romeo + Juliet (1996).mkv
Apocalypse Now (1979).mkv
Sanjuro (1962).mkv
The Wolf of Wall Street (2013).mkv
Carlito's Way (1993).mkv
Yojimbo (1961).mkv
Scarface (1983).m

In [52]:
def _summarize_tracks(tracks):
    return dict(amount=len(tracks), summary="\n".join(map(str, tracks)))


class Movie:
    def __init__(self, properties):
        self._properties = properties.copy()
        self._audiotracks = [
            AudioTrack(track) for track in properties.pop("audio_tracks", [])
        ]
        self._subtitle_tracks = [
            SubtitleTrack(track)
            for track in properties.pop("subtitle_tracks", [])
        ]

    def __getitem__(self, key):
        return self._properties[key]

    def get(self, key, default=None):
        self._properties.get(key, default)

    @property
    def summary_general(self):
        items = (
            "name",
            "year",
            "duration",
            "display_width",
            "display_height",
            "codec_id",
            "location",
        )
        return {item: self[item] for item in items}

    @property
    def summary_audio(self):
        return _summarize_tracks(self._audiotracks)

    @property
    def summary_subtitle(self):
        return _summarize_tracks(self._subtitle_tracks)

    @property
    def summary_complete(self):
        return dict(
            **self.summary_general,
            **{
                f"audio_{key}": value
                for key, value in self.summary_audio.items()
            },
            **{
                f"sub_{key}": value for key, value in self.summary_subtitle.items()
            },
        )

    def __repr__(self):
        properties = dict(
            **self._properties,
            audiotracks=self._audiotracks,
            subtitletracks=self._subtitle_tracks,
        )
        return f"{type(self).__name__}({properties})"

In [53]:
class Track:
    _class_fields: tuple
    _suffixes: dict = {}

    def __init__(self, properties):
        self._properties = properties

    @property
    def _fields(self):
        return [
            f"{self._properties.get(field, '')}{type(self)._suffixes.get(field,'')}"
            for field in self._class_fields
        ]

    def __str__(self):
        return (
            ";".join(self._fields) + (":default" if self._properties["default"] else "")
        )

    def __repr__(self):
        return f"{type(self).__name__}(properties={self._properties.copy()})"


class AudioTrack(Track):
    _class_fields = ("name", "language", "number", "codec_id", "channels")


class SubtitleTrack(Track):
    _class_fields = ("language", "number")

In [54]:
lookup_query = [
            {
                "$lookup": {
                    "from": "audio_tracks",
                    "localField": "movie_id",
                    "foreignField": "movie_id",
                    "as": "audio_tracks",
                }
            },
            {
                "$lookup": {
                    "from": "subtitle_tracks",
                    "localField": "movie_id",
                    "foreignField": "movie_id",
                    "as": "subtitle_tracks",
                }
            },
        ]

In [55]:
with MongoClient() as client:
    db = client.my_videos
    collection_movies = db.movies
    query_results = list(
        Movie(movie) for movie in collection_movies.aggregate(lookup_query)
    )


query_results[:5]

[Movie({'_id': ObjectId('5bda0e07231d8f1b7c667e49'), 'name': 'West Side Story', 'year': '1961', 'location': '\\\\OnzeNas\\FILM_Eigen\\West Side Story (1961)\\West Side Story (1961).mkv', 'duration': '2:52:55', 'muxing_app': 'Lavf55.12.0', 'writing_app': 'HandBrake 0.10.1 2015030800', 'movie_id': -2078791735790941593, 'type': 1, 'number': 1, 'language': 'und', 'enabled': True, 'default': True, 'forced': False, 'lacing': False, 'codec_id': 'V_MPEG4/ISO/AVC', 'width': 1920, 'height': 880, 'interlaced': False, 'crop': {}, 'display_width': 1928, 'display_height': 880, 'display_unit': 3, 'audio_tracks': [{'_id': ObjectId('5bda0e07231d8f1b7c667e4a'), 'type': 2, 'number': 2, 'name': 'Surround', 'language': 'eng', 'enabled': True, 'default': True, 'forced': False, 'lacing': False, 'codec_id': 'A_DTS', 'sampling_frequency': 48000.0, 'channels': 6, 'movie_id': -2078791735790941593}, {'_id': ObjectId('5bda0e07231d8f1b7c667e4b'), 'type': 2, 'number': 3, 'name': 'Stereo', 'language': 'eng', 'enabled

In [56]:
q = query_results[0]

In [58]:
q.summary_complete

{'name': 'West Side Story',
 'year': '1961',
 'duration': '2:52:55',
 'display_width': 1928,
 'display_height': 880,
 'codec_id': 'V_MPEG4/ISO/AVC',
 'location': '\\\\OnzeNas\\FILM_Eigen\\West Side Story (1961)\\West Side Story (1961).mkv',
 'audio_amount': 2,
 'audio_summary': 'Surround;eng;2;A_DTS;6:default\nStereo;eng;3;A_AAC;2',
 'sub_amount': 2,
 'sub_summary': 'eng;4:default\ndut;5'}