Skip to content

Commit

Permalink
Add format, screenSize, videoCodec, audioCodec, website and year patterns
Browse files Browse the repository at this point in the history
  • Loading branch information
Toilal committed Oct 10, 2015
1 parent 9b988a7 commit 71fa283
Show file tree
Hide file tree
Showing 23 changed files with 939 additions and 54 deletions.
14 changes: 13 additions & 1 deletion guessit/rules/__init__.py
Expand Up @@ -7,7 +7,19 @@
from rebulk import Rebulk

from .episodes import EPISODES
from .format import FORMAT
from .video_codec import VIDEO_CODEC
from .audio_codec import AUDIO_CODEC
from .screen_size import SCREEN_SIZE
from .website import WEBSITE
from .year import YEAR

# Root rule set: aggregates the per-property Rebulk objects imported above.
# Each rule set is registered exactly once; registering EPISODES a second
# time would only add the same patterns again.
REBULK = Rebulk()

REBULK.rebulk(EPISODES)
REBULK.rebulk(FORMAT)
REBULK.rebulk(VIDEO_CODEC)
REBULK.rebulk(AUDIO_CODEC)
REBULK.rebulk(SCREEN_SIZE)
REBULK.rebulk(WEBSITE)
REBULK.rebulk(YEAR)
22 changes: 22 additions & 0 deletions guessit/rules/audio_codec.py
@@ -0,0 +1,22 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
audioCodec property
"""
from rebulk import Rebulk

from .common import dash
from .common.validators import seps_surround

import regex as re

# Rule set for the "audioCodec" property. Regex defaults: case-insensitive
# flags and the shared `dash` abbreviation; every match is named "audioCodec"
# and validated with seps_surround.
AUDIO_CODEC = Rebulk().regex_defaults(flags=re.IGNORECASE, abbreviations=[dash])
AUDIO_CODEC.defaults(name="audioCodec", validator=seps_surround)

# (reported value, accepted spellings) pairs, registered in this order.
_AUDIO_CODEC_PATTERNS = (
    ("MP3", ("MP3", "LAME", r"LAME(?:\d)+-(?:\d)+")),
    ("DD", ("DolbyDigital", "DD")),
    ("AAC", ("AAC",)),
    ("AC3", ("AC3",)),
    ("FLAC", ("Flac",)),
    ("DTS", ("DTS",)),  # TODO: LeftValidator
    ("True-HD", ("True-HD",)),
)

for _codec, _spellings in _AUDIO_CODEC_PATTERNS:
    AUDIO_CODEC.regex(*_spellings, value=_codec)
9 changes: 9 additions & 0 deletions guessit/rules/common/__init__.py
@@ -0,0 +1,9 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Common module
"""

# Characters considered to separate tags/words in a release name. It is bound
# as the first argument of rebulk's chars_before/chars_after/chars_surround
# validators in common/validators.py.
# NOTE(review): because this is a raw string, the trailing `\.` contributes a
# literal backslash AND a dot; presumably the string is used as a plain list
# of characters (not a regex character class) - confirm against rebulk.
seps = r' [](){},:;!?+*|&=%§-_~#/\.' # list of tags/words separators

# (token, regex) pair handed to `regex_defaults(abbreviations=[dash])` by the
# rule modules. NOTE(review): presumably rebulk replaces each "-" in a pattern
# with the optional-separator regex `[\W_]?` - confirm against rebulk docs.
dash = ("-", r"[\W_]?") # abbreviation used by many rebulk objects.
16 changes: 16 additions & 0 deletions guessit/rules/common/validators.py
@@ -0,0 +1,16 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Validators
"""

from rebulk.validators import chars_before, chars_after, chars_surround

from . import seps

from functools import partial


# Separator-aware validators: rebulk's chars_* validators with `seps`
# pre-bound as their first positional argument, so callers only pass the
# remaining argument(s) (the rule modules use them as `validator=` callbacks).
seps_before = partial(chars_before, seps)
seps_after = partial(chars_after, seps)
seps_surround = partial(chars_surround, seps)
5 changes: 2 additions & 3 deletions guessit/rules/episodes.py
Expand Up @@ -8,10 +8,9 @@

import regex as re

# Rule set for season/episode numbering. Case-insensitivity is configured once
# through regex_defaults instead of being repeated on each pattern.
EPISODES = Rebulk().regex_defaults(flags=re.IGNORECASE)

# Accepts "<season>x<episode>" (e.g. 2x05) and "S<season>E<episode>" /
# "S<season>x<episode>" (e.g. S02E05). `formatter=int` converts the captured
# digits to integers.
# NOTE(review): children=True presumably makes rebulk report the named groups
# (season, episodeNumber) rather than the whole match - confirm.
EPISODES.regex(r'(?P<season>\d+)x(?P<episodeNumber>\d+)',
               r'S(?P<season>\d+)[ex](?P<episodeNumber>\d+)',
               formatter=int,
               children=True)
30 changes: 30 additions & 0 deletions guessit/rules/format.py
@@ -0,0 +1,30 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
format property
"""
from rebulk import Rebulk

from .common import dash

import regex as re
from .common.validators import seps_surround

# Rule set for the "format" (release source) property. Regex defaults:
# case-insensitive flags and the shared `dash` abbreviation; every match is
# named "format" and validated with seps_surround.
FORMAT = Rebulk().regex_defaults(flags=re.IGNORECASE, abbreviations=[dash])
FORMAT.defaults(name="format", validator=seps_surround)

# (reported value, accepted spellings) pairs, registered in this order.
_FORMAT_PATTERNS = (
    ("VHS", ("VHS", "VHS-Rip")),
    ("Cam", ("CAM", "CAM-Rip", "HD-CAM")),
    ("Telesync", ("TELESYNC", "TS", "HD-TS")),
    ("Workprint", ("WORKPRINT", "WP")),
    ("Telecine", ("TELECINE", "TC")),
    ("PPV", ("PPV", "PPV-Rip")),  # Pay Per View
    # TV is too common to allow matching
    ("TV", ("SD-TV", "SD-TV-Rip", "Rip-SD-TV", "TV-Rip", "Rip-TV")),
    ("DVB", ("DVB-Rip", "DVB", "PD-TV")),
    ("DVD", ("DVD", "DVD-Rip", "VIDEO-TS", "DVD-R", "DVD-9", "DVD-5")),
    ("HDTV", ("HD-TV", "TV-RIP-HD", "HD-TV-RIP", "HD-RIP")),
    ("VOD", ("VOD", "VOD-Rip")),
    ("WEBRip", ("WEB-Rip",)),
    ("WEB-DL", ("WEB-DL", "WEB-HD", "WEB")),
    ("HD-DVD", ("HD-DVD-Rip", "HD-DVD")),
    ("BluRay", ("Blu-ray(?:-Rip)?", "B[DR]", "B[DR]-Rip", "BD[59]", "BD25", "BD50")),
)

for _source, _spellings in _FORMAT_PATTERNS:
    FORMAT.regex(*_spellings, value=_source)
5 changes: 0 additions & 5 deletions guessit/rules/properties.py

This file was deleted.

25 changes: 25 additions & 0 deletions guessit/rules/screen_size.py
@@ -0,0 +1,25 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
screenSize property
"""
from rebulk import Rebulk

import regex as re
from .common.validators import seps_surround

# Rule set for the "screenSize" property: case-insensitive regexes whose
# matches are named "screenSize" and validated with seps_surround.
SCREEN_SIZE = Rebulk().regex_defaults(flags=re.IGNORECASE)
SCREEN_SIZE.defaults(name="screenSize", validator=seps_surround)

# Optional leading width (3+ digits) followed by one of: \  /  x  *
_WIDTH_PREFIX = r"(?:\d{3,}(?:\\|\/|x|\*))?"
# Optional scan-type suffix: "i", or any of "", "p", "x", "px".
_SCAN_SUFFIX = r"(?:i|p?x?)"

# Heights for which interlaced and progressive spellings share one value.
for _height in ("360", "368", "480", "576", "720", "900"):
    SCREEN_SIZE.regex(_WIDTH_PREFIX + _height + _SCAN_SUFFIX, value=_height + "p")

# 1080 is the only height where "i" and "p" are reported separately.
SCREEN_SIZE.regex(_WIDTH_PREFIX + r"1080i", value="1080i")
SCREEN_SIZE.regex(_WIDTH_PREFIX + r"1080p?x?", value="1080p")

# 2160 limits the optional width to 3-4 digits and is reported as "4K".
SCREEN_SIZE.regex(r"(?:\d{3,4}(?:\\|\/|x|\*))?2160" + _SCAN_SUFFIX, value="4K")

# TODO: implement validators from guessit 1
# validator=ChainedValidator(DefaultValidator(), OnlyOneValidator()))
21 changes: 21 additions & 0 deletions guessit/rules/video_codec.py
@@ -0,0 +1,21 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
videoCodec property
"""
from rebulk import Rebulk

from .common import dash
from .common.validators import seps_surround

import regex as re

# Rule set for the "videoCodec" property. Regex defaults: case-insensitive
# flags and the shared `dash` abbreviation; every match is named "videoCodec"
# and validated with seps_surround.
VIDEO_CODEC = Rebulk().regex_defaults(flags=re.IGNORECASE, abbreviations=[dash])
VIDEO_CODEC.defaults(name="videoCodec", validator=seps_surround)

# (reported value, accepted spellings) pairs, registered in this order.
_VIDEO_CODEC_PATTERNS = (
    ("Real", (r"Rv\d{2}",)),
    ("Mpeg2", ("Mpeg2",)),
    ("DivX", ("DVDivX", "DivX")),
    ("XviD", ("XviD",)),
    # "MPEG-4" only counts as h264 when followed by the AVC suffix.
    ("h264", ("[hx]-264(?:-AVC)?", "MPEG-4(?:-AVC)")),
    ("h265", ("[hx]-265(?:-HEVC)?", "HEVC")),
)

for _codec, _spellings in _VIDEO_CODEC_PATTERNS:
    VIDEO_CODEC.regex(*_spellings, value=_codec)
30 changes: 30 additions & 0 deletions guessit/rules/website.py
@@ -0,0 +1,30 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Website property.
"""

from pkg_resources import resource_stream # @UnresolvedImport

from rebulk import Rebulk

import regex as re

# Rule set for the "website" property: case-insensitive regexes whose matches
# are named "website".
WEBSITE = Rebulk().regex_defaults(flags=re.IGNORECASE)
WEBSITE.defaults(name="website")

# All registered domain extensions, read from the list bundled with the
# package. The stream is closed explicitly (try/finally rather than `with`,
# so it works whatever file-like object resource_stream returns).
# Lines containing "--" are skipped and the first remaining line is dropped.
# NOTE(review): presumably these are the punycode (XN--...) entries and the
# file's header comment respectively - confirm against the data file.
_tlds_stream = resource_stream('guessit', 'tlds-alpha-by-domain.txt')
try:
    TLDS = [line.strip().decode('utf-8')
            for line in _tlds_stream.readlines()
            if b'--' not in line][1:]
finally:
    _tlds_stream.close()

SAFE_TLDS = ['com', 'org', 'net']  # For sure a website extension
SAFE_SUBDOMAINS = ['www']  # For sure a website subdomain
SAFE_PREFIX = ['co', 'com', 'org', 'net']  # Those words before a tlds are sure

# `\L<name>` is the regex module's named-list syntax; each list is supplied as
# a keyword argument of the same name. In all three patterns the website is
# the single capturing group, delimited by a non-alphanumeric character or a
# string boundary on each side.

# 1. At least one safe subdomain, one or more labels, then any known TLD.
WEBSITE.regex(r'(?:[^a-z0-9]|^)((?:\L<safe_subdomains>\.)+(?:[a-z-]+\.)+(?:\L<tlds>))(?:[^a-z0-9]|$)',
              safe_subdomains=SAFE_SUBDOMAINS, tlds=TLDS, children=True)
# 2. Optional safe subdomain(s), a single label, then a safe TLD.
WEBSITE.regex(r'(?:[^a-z0-9]|^)((?:\L<safe_subdomains>\.)*[a-z-]+\.(?:\L<safe_tlds>))(?:[^a-z0-9]|$)',
              safe_subdomains=SAFE_SUBDOMAINS, safe_tlds=SAFE_TLDS, children=True)
# 3. Optional safe subdomain(s), a single label, a safe prefix, then any TLD.
WEBSITE.regex(r'(?:[^a-z0-9]|^)((?:\L<safe_subdomains>\.)*[a-z-]+\.(?:\L<safe_prefix>\.)+(?:\L<tlds>))(?:[^a-z0-9]|$)',
              safe_subdomains=SAFE_SUBDOMAINS, safe_prefix=SAFE_PREFIX, tlds=TLDS, children=True)

26 changes: 26 additions & 0 deletions guessit/rules/year.py
@@ -0,0 +1,26 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
year property
"""
from rebulk import Rebulk

from .common.validators import seps_surround

# Rule set for the "year" property. By default every match is named "year"
# and validated with seps_surround.
# NOTE(review): a pattern that passes its own `validator=` presumably replaces
# this default rather than adding to it - confirm against rebulk.
YEAR = Rebulk()
YEAR.defaults(name="year", validator=seps_surround)


def validate_year(match, min_year=1920, max_year=2030):
    """
    Check if match is a valid year.

    The accepted range is half-open: ``min_year <= match.value < max_year``.
    The bounds default to the values previously hard-coded, so existing
    callers that pass only ``match`` behave exactly as before.

    :param match: match whose ``value`` attribute is the candidate year
    :type match: object exposing an integer ``value`` attribute
    :param min_year: first year accepted (inclusive)
    :type min_year: int
    :param max_year: first year rejected (exclusive)
    :type max_year: int
    :return: True if the year lies within the accepted range
    :rtype: bool
    """
    return min_year <= match.value < max_year


# An explicit `validator` keyword takes the place of the one configured with
# YEAR.defaults(), so the separator check has to be combined with the range
# check here; otherwise four digits embedded in a longer token would be
# accepted as a year.
YEAR.regex(r"\d{4}", formatter=int,
           validator=lambda match: seps_surround(match) and validate_year(match))
8 changes: 8 additions & 0 deletions guessit/test/movies.yml
@@ -0,0 +1,8 @@
? Movies/Fear and Loathing in Las Vegas (1998)/Fear.and.Loathing.in.Las.Vegas.720p.HDDVD.DTS.x264-ESiR.mkv
: #title: Fear and Loathing in Las Vegas
year: 1998
screenSize: 720p
format: HD-DVD
audioCodec: DTS
videoCodec: h264
#releaseGroup: ESiR
30 changes: 30 additions & 0 deletions guessit/test/rules/audioCodec.yml
@@ -0,0 +1,30 @@
# Multiple input strings having same expected results can be chained.
# Use - marker to check inputs that should not match results.


? +MP3
? +lame
? +lame3.12
? +lame3.100
: audioCodec: MP3

? +DolbyDigital
? +DD
? -Dolby Digital
: audioCodec: DD

? +AAC
: audioCodec: AAC

? +AC3
: audioCodec: AC3

? +Flac
: audioCodec: FLAC

? +DTS
: audioCodec: DTS

? +True-HD
? +trueHD
: audioCodec: True-HD
18 changes: 9 additions & 9 deletions guessit/test/rules/episodes.yml
@@ -1,17 +1,17 @@
# Multiple input strings having same expected results can be chained.
# Use $ marker to check inputs that should not match results.
? 2x5
? 2X5
? 02x05
? 2X05
? 02x5
? +2x5
? +2X5
? +02x05
? +2X05
? +02x5
? S02E05
? s02e05
? s02e5
? s2e05
? $s03e05
? $s02e06
? $3x05
? $2x06
? -s03e05
? -s02e06
? -3x05
? -2x06
: season: 2
episodeNumber: 5

0 comments on commit 71fa283

Please sign in to comment.