https://codereview.stackexchange.com/questions/23277/trying-to-get-output-of-ffprobe-into-variable

https://stackoverflow.com/questions/9896644/getting-ffprobe-information-with-python

In [56]:
import os, sys, subprocess
from pathlib import Path
import xml.etree.ElementTree as eTree
import collections
import xmltodict
import json
import datetime

In [2]:
# Directory holding the sample files used while developing the parser.
data_dir = Path('data')
# The single file that the exploratory cells below are run against.
filename = data_dir / 'Blackadder - S01E01 - The Foretelling.mkv'

In [3]:
[str(i) for i in data_dir.glob('*')]

['data\\Barbapapa - S02E01 - De Spaarpot.mkv',
 'data\\Blackadder - S01E01 - The Foretelling.mkv']

In [4]:
def probe_file_json(executable, filename):
    '''Runs ``ffprobe`` executable over ``filename``, returns its raw JSON report

    Parameters:

        executable (str): Full path leading to ``ffprobe``
        filename (str): Full path leading to the file to be probed

    Returns:

        bytes: the JSON document ``ffprobe`` wrote to stdout, still undecoded
        and unparsed (feed it to ``json.loads``)

    Raises:

        subprocess.CalledProcessError: if ``ffprobe`` exits with a non-zero status

    '''

    options = (
        '-v', 'quiet',
        '-print_format', 'json',  # this is the trick: machine-readable output
        '-show_format',
        '-show_streams',
        '-sexagesimal',  # durations come back as e.g. '0:33:31.480000'
        # '-count_frames',  (currently disabled)
        '-unit',  # values carry their unit, e.g. '872778464 byte'
    )
    command = [executable, *options, filename]

    return subprocess.check_output(command)

In [19]:
def parse_disposition(info):
    '''Lists the names of the disposition flags that are switched on

    Parameters:

        info (dict): one stream entry; its optional ``'disposition'`` value
            maps flag names to truthy/falsy values

    Returns:

        list: flag names with a truthy value, in mapping order; empty when the
        stream has no ``'disposition'`` entry

    '''
    active = []
    for flag, enabled in info.get('disposition', {}).items():
        if enabled:
            active.append(flag)
    return active
def get_lang(stream):
    '''Returns the ``language`` tag of ``stream``, or ``None`` when it has none

    Parameters:

        stream (dict): one stream entry; the language is looked up under
            ``stream['tags']['language']``

    '''
    tags = stream.get('tags', {})
    return tags.get('language')

In [45]:
# Record type for one audio stream. Defined once at module level so that every
# parsed stream shares the same class instead of getting a freshly built one.
AudioStream = collections.namedtuple(
    'AudioStream',
    ['index', 'channels', 'codec_name', 'channel_layout', 'bit_rate', 'duration', 'lang', 'disposition'],
)


def parse_audio(stream):
    '''Condenses one audio stream entry into an ``AudioStream`` record

    Parameters:

        stream (dict): one entry of the ``streams`` list

    Returns:

        tuple: ``('audio', AudioStream)``; attributes missing from ``stream``
        are stored as ``None``

    '''
    # All fields but the trailing 'lang' and 'disposition' are copied verbatim
    # from the stream entry; those two need the dedicated helpers.
    attrib_info = AudioStream._fields[:-2]
    values = [stream.get(name) for name in attrib_info]

    return 'audio', AudioStream(*values, get_lang(stream), parse_disposition(stream))
    
# Record type for one video stream. Defined once at module level so that every
# parsed stream shares the same class instead of getting a freshly built one.
VideoStream = collections.namedtuple(
    'VideoStream',
    ['index', 'codec_name', 'width', 'height', 'lang', 'disposition'],
)


def parse_video(stream):
    '''Condenses one video stream entry into a ``VideoStream`` record

    Parameters:

        stream (dict): one entry of the ``streams`` list

    Returns:

        tuple: ``('video', VideoStream)``; attributes missing from ``stream``
        are stored as ``None``

    '''
    # All fields but the trailing 'lang' and 'disposition' are copied verbatim
    # from the stream entry; those two need the dedicated helpers.
    attrib_info = VideoStream._fields[:-2]
    values = [stream.get(name) for name in attrib_info]

    return 'video', VideoStream(*values, get_lang(stream), parse_disposition(stream))
    
# Record type for one subtitle stream. Defined once at module level so that
# every parsed stream shares the same class instead of getting a fresh one.
SubtitleStream = collections.namedtuple(
    'SubtitleStream',
    ['index', 'codec_name', 'duration', 'lang', 'disposition'],
)


def parse_sub(stream):
    '''Condenses one subtitle stream entry into a ``SubtitleStream`` record

    Parameters:

        stream (dict): one entry of the ``streams`` list

    Returns:

        tuple: ``('sub', SubtitleStream)``; attributes missing from ``stream``
        are stored as ``None``

    '''
    # All fields but the trailing 'lang' and 'disposition' are copied verbatim
    # from the stream entry; those two need the dedicated helpers.
    attrib_info = SubtitleStream._fields[:-2]
    values = [stream.get(name) for name in attrib_info]

    return 'sub', SubtitleStream(*values, get_lang(stream), parse_disposition(stream))

In [46]:
# Record type for the container-level information. Defined once at module level
# so that repeated calls share one class instead of rebuilding it every time.
FormatInfo = collections.namedtuple(
    'FormatInfo',
    ['duration', 'size', 'nb_streams', 'format_name'],
)


def parse_format(format):
    '''Condenses the ``format`` section of the probe report into a ``FormatInfo``

    Parameters:

        format (dict): the ``format`` mapping of the parsed report; the
            parameter keeps its original name for keyword callers even though
            it shadows the builtin inside this function

    Returns:

        FormatInfo: ``duration``, ``size``, ``nb_streams`` and ``format_name``,
        each ``None`` when the key is absent

    '''
    return FormatInfo(*(format.get(field) for field in FormatInfo._fields))

In [47]:
def parse_streams(streams: list):
    '''Yields one parsed record per video, audio or subtitle stream

    Parameters:

        streams (list): the ``streams`` list of the parsed report, one dict
            per stream

    Yields:

        tuple: whatever the matching parser returns, i.e. a ``(kind, record)``
        pair with ``kind`` one of ``'video'``, ``'audio'`` or ``'sub'``

    Streams of any other ``codec_type`` (Matroska files may carry e.g.
    attachment or data streams) are skipped instead of raising ``KeyError``.

    '''
    parsers = {'subtitle': parse_sub, 'video': parse_video, 'audio': parse_audio}
    for stream in streams:
        parser = parsers.get(stream.get('codec_type'))
        if parser is None:
            # Nothing to summarise for this kind of stream.
            continue
        yield parser(stream)

In [9]:
def parse_file(filename: Path):
    '''Probes ``filename`` and returns its condensed format and stream records

    Parameters:

        filename (Path): file to hand to ``ffprobe``, which is invoked by its
            bare name

    Returns:

        tuple: ``(filename, records)`` where ``records`` is a list holding the
        format record first, followed by one entry per parsed stream

    '''
    raw_report = probe_file_json('ffprobe', str(filename))
    report = json.loads(raw_report)

    records = [parse_format(report['format'])]
    records.extend(parse_streams(report['streams']))

    return filename, records

In [10]:
file_info_json = probe_file_json('ffprobe', str(filename))
json.loads(file_info_json)['format']

{'bit_rate': '3471189 bit/s',
 'duration': '0:33:31.480000',
 'filename': 'data\\Blackadder - S01E01 - The Foretelling.mkv',
 'format_long_name': 'Matroska / WebM',
 'format_name': 'matroska,webm',
 'nb_programs': 0,
 'nb_streams': 4,
 'probe_score': 100,
 'size': '872778464 byte',
 'start_time': '0:00:00.000000',
 'tags': {'encoder': 'libmkv 0.6.5'}}

In [11]:
json.loads(file_info_json)['streams']

[{'avg_frame_rate': '25/1',
  'bits_per_raw_sample': '8',
  'chroma_location': 'left',
  'codec_long_name': 'H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10',
  'codec_name': 'h264',
  'codec_tag': '0x0000',
  'codec_tag_string': '[0][0][0][0]',
  'codec_time_base': '1/50',
  'codec_type': 'video',
  'coded_height': 592,
  'coded_width': 720,
  'color_primaries': 'bt470bg',
  'color_range': 'tv',
  'color_space': 'smpte170m',
  'color_transfer': 'bt709',
  'display_aspect_ratio': '765:592',
  'disposition': {'attached_pic': 0,
   'clean_effects': 0,
   'comment': 0,
   'default': 1,
   'dub': 0,
   'forced': 0,
   'hearing_impaired': 0,
   'karaoke': 0,
   'lyrics': 0,
   'original': 0,
   'timed_thumbnails': 0,
   'visual_impaired': 0},
  'field_order': 'progressive',
  'has_b_frames': 2,
  'height': 592,
  'index': 0,
  'is_avc': 'true',
  'level': 31,
  'nal_length_size': '4',
  'pix_fmt': 'yuv420p',
  'profile': 'High',
  'r_frame_rate': '25/1',
  'refs': 1,
  'sample_aspect_ratio': '17:

In [139]:
class MkvInfo:
    '''Condensed description of one media file, built from an ``ffprobe`` report

    Parameters:

        filename (Path): file to probe; ``ffprobe`` is invoked by its bare name

    The parsed records are kept in ``_format`` (container level) and in the
    lists ``_video``, ``_audio`` and ``_sub`` (one record per stream).
    '''

    def __init__(self, filename: Path):
        self._filename = filename
        file_info = json.loads(probe_file_json('ffprobe', str(filename)))

        self._format = parse_format(file_info['format'])
        # One list of records per kind, unpacked in this fixed order.
        self._video, self._audio, self._sub = _parse_stream_info(file_info['streams'])

    def __str__(self) -> str:
        '''Full dump: file name, format record, then every stream record'''
        # The streams are stored per kind, so list them video, audio, subtitles.
        streams = [*self._video, *self._audio, *self._sub]
        stream_str = '\n\t'.join(str(i) for i in streams)
        return f'{self._filename}\n    {self._format}\n    {stream_str}'

    def __repr__(self) -> str:
        '''Short multi-line summary built from the ``summarize_*`` methods'''
        return (
            f'{self._filename}\n'
            f'    video: {self.summarize_format() + self.summarize_video()}\n'
            f'    audio: {self.summarize_audio()}\n'
            f'    sub: {self.summarize_sub()}'
        )

    @property
    def audio_streams(self) -> int:
        '''Number of audio streams found in the file'''
        return len(self._audio)

    def summarize_format(self) -> tuple:
        '''Returns ``(duration, size)`` as display strings

        The duration is cut to whole seconds (``'0:33:31'``) and the size is
        expressed in GB with one decimal (``'0.9 GB'``).
        '''
        info = self._format
        # Durations look like '0:33:31.480000'. They are split by hand because
        # strptime's '%X' depends on the locale and rejects hours above 23.
        hours, minutes, seconds = info.duration.split(':')
        duration = datetime.timedelta(
            hours=int(hours),
            minutes=int(minutes),
            seconds=int(float(seconds)),  # drop the fractional part
        )
        # Sizes look like '872778464 byte'; only the number matters.
        size = round(int(info.size.split()[0]) / 1e9, 1)

        return str(duration), f'{size} GB'

    def summarize_video(self) -> tuple:
        '''Returns ``(codec_name, (width, height))`` of the first video stream

        Yields ``(None, None)`` when the file has no video stream at all.
        '''
        if not self._video:
            return None, None

        info = self._video[0]
        resolution = info.width, info.height

        return info.codec_name, resolution

    def summarize_audio(self) -> list:
        '''Returns one ``(codec, channels, bit rate, lang, disposition)`` tuple
        per audio stream; bit rate and disposition are ``None`` when unknown
        or empty
        '''
        def summarize(info):
            bit_rate = None
            if info.bit_rate:
                # Bit rates look like '448000 bit/s'; shown as whole kb/s.
                bit_rate = f'{round(int(info.bit_rate.split()[0]) / 1000)} kb/s'
            return info.codec_name, info.channels, bit_rate, info.lang, info.disposition or None

        return [summarize(i) for i in self._audio]

    def summarize_sub(self) -> list:
        '''Returns the language of every subtitle stream, in stored order'''
        return [sub.lang for sub in self._sub]
        
    
def _parse_stream_info(info):
    '''Groups the parsed streams by kind

    Parameters:

        info (list): the ``streams`` list of the parsed report

    Returns:

        a view over three lists, always in the order video, audio, sub; a
        list is empty when the file has no stream of that kind

    '''
    result = collections.OrderedDict((typ, []) for typ in ('video', 'audio', 'sub'))
    # The loop variable is deliberately not called ``info`` so the parameter
    # is not shadowed.
    for typ, stream in parse_streams(info):
        result[typ].append(stream)
    return result.values()


In [141]:
m = (MkvInfo(filename))
m.audio_streams

1

In [87]:
datetime.timedelta(*(int(i)'0:33:31.480000'.split('.')[0].split(':')))

TypeError: unsupported type for timedelta microseconds component: str

In [96]:
datetime.datetime.strptime('0:33:31.480000', '%X.%f').hour

0

In [86]:
('0:33:31.480000'.split('.')[0].split(':'))

['0', '33', '31']

In [14]:
data_dir = Path(r'Y:\\')

In [22]:
from pprint import pprint

In [145]:
# Walk the whole library and report how many audio streams each file carries.
mkv_files = data_dir.glob('**/*.mkv')
# NOTE: the report file is opened (and therefore truncated), but nothing is
# written to it while the two calls at the bottom stay commented out; the
# per-file summary goes to stdout only.
with open('mkv_info2.txt', mode='w', encoding='utf8') as output_file:
    for mkvfile in mkv_files:
        m = MkvInfo(mkvfile)
        s = m.audio_streams
        print(f'{mkvfile}: contains {s} audio streams')
        # pprint(m, output_file)
        # print('-' * 6, file=output_file)

Y:\West Side Story (1961)\West Side Story (1961).mkv: contains 2 audio streams
Y:\3 -10 to Yuma (2007)\3 -10 to Yuma (2007).mkv: contains 2 audio streams
Y:\Ben-Hur (1959)\Ben-Hur (1959).mkv: contains 2 audio streams
Y:\Ben-Hur (1959)\Ben Hur-disc2.mkv: contains 2 audio streams
Y:\The Lord of the Rings Collection\The Lord of the Rings - The Two Towers (2002)\The Lord of the Rings - The Two Towers (2002).mkv: contains 2 audio streams
Y:\The Lord of the Rings Collection\The Lord of the Rings - The Fellowship of the Ring (2001)\The Lord of the Rings - The Fellowship of the Ring (2001).mkv: contains 2 audio streams
Y:\The Lord of the Rings Collection\The Lord of the Rings - The Return of the King (2003)\The Lord of the Rings - The Return of the King (2003).mkv: contains 2 audio streams
Y:\C'est arrivé pres de chez vous (1992)\C'est arrivé pres de chez vous (1992).mkv: contains 1 audio streams
Y:\Fight Club (1999)\Fight Club (1999).mkv: contains 2 audio streams
Y:\Walk the Line (2005)\Walk 

Y:\300 Collection\300 (2006)\300 (2006).mkv: contains 2 audio streams
Y:\Jungle Boek Collectie\The Jungle Book (1967)\The Jungle Book (1967).mkv: contains 5 audio streams
Y:\Shakespeare in Love (1998)\Shakespeare in Love (1998).mkv: contains 2 audio streams
Y:\Mulholland Drive (2001)\Mulholland Drive (2001).mkv: contains 2 audio streams
Y:\9 (2009)\9 (2009).mkv: contains 2 audio streams
Y:\The Space Odyssey Series\2001 - A Space Odyssey (1968)\2001 - A Space Odyssey (1968).mkv: contains 1 audio streams
Y:\Once Upon a Time in America (1984)\Once Upon a Time in America (1984).mkv: contains 2 audio streams
Y:\Barry Lyndon (1975)\Barry Lyndon (1975).mkv: contains 2 audio streams
Y:\Dr. Strangelove or - How I Learned to Stop Worrying and Love the Bomb (1964)\Dr. Strangelove or - How I Learned to Stop Worrying and Love the Bomb (1964).mkv: contains 1 audio streams
Y:\Blade Runner (1982)\Blade Runner (1982).mkv: contains 2 audio streams
Y:\Big Fish (2003)\Big Fish (2003).mkv: contains 2 audio