Bump rebulk 0.6.1 and enhance title guessing

guessit-io · Nov 11, 2015 · ca469a9 · ca469a9
1 parent 6733b78
commit ca469a9
Show file tree

Hide file tree

Showing 9 changed files with 62 additions and 36 deletions.
diff --git a/HISTORY.rst b/HISTORY.rst
@@ -4,7 +4,8 @@ History
 2.0b1 (unreleased)
 ------------------
 
-- Nothing changed yet.
+- Enhance title guessing.
+- Bump rebulk to ``0.6.1`` to use deep copy instead of shallow copy inside Match crop/split methods.
 
 
 2.0a4 (2015-11-09)

diff --git a/guessit/rules/common/numeral.py b/guessit/rules/common/numeral.py
@@ -108,7 +108,7 @@ def __parse_word(value):
             return word_list.index(value.lower())
         except ValueError:
             pass
-    raise ValueError
+    raise ValueError  # pragma: no cover
 
 
 _clean_re = re.compile(r'[^\d]*(\d+)[^\d]*')
@@ -159,9 +159,9 @@ def parse_numeral(value, int_enabled=True, roman_enabled=True, word_enabled=True
                 for word in value.split():
                     try:
                         return __parse_word(word)
-                    except ValueError:
+                    except ValueError:  # pragma: no cover
                         pass
-            return __parse_word(value)
-        except ValueError:
+            return __parse_word(value)  # pragma: no cover
+        except ValueError:  # pragma: no cover
             pass
-    raise ValueError('Invalid numeral: ' + value)
+    raise ValueError('Invalid numeral: ' + value)   # pragma: no cover
diff --git a/guessit/rules/common/words.py b/guessit/rules/common/words.py
@@ -10,17 +10,6 @@
 _words_rexp = re.compile(r'\w+', re.UNICODE)
 
 
-def find_words(string):
-    """
-    Find all words in a string.
-    :param string:
-    :type string:
-    :return: list of word
-    :rtype: list[str]
-    """
-    return _words_rexp.findall(string.replace('_', ' '))
-
-
 def iter_words(string):
     """
     Iterate on all words in a string

diff --git a/guessit/rules/processors.py b/guessit/rules/processors.py
@@ -9,9 +9,10 @@
 import copy
 
 import six
+from guessit.rules.common.words import iter_words
 
 from rebulk import Rebulk, Rule, CustomRule, POST_PROCESS, PRE_PROCESS, AppendMatch, RemoveMatch
-from .common.formatters import strip
+from .common.formatters import cleanup
 from .common.comparators import marker_sorted
 from .common.date import valid_year
 
@@ -66,12 +67,14 @@ def when(self, matches, context):
         new_matches = []
 
         for filepath in marker_sorted(matches.markers.named('path'), matches):
-            holes = matches.holes(start=filepath.start, end=filepath.end, formatter=strip)
+            holes = matches.holes(start=filepath.start, end=filepath.end, formatter=cleanup)
             for name in matches.names:
                 for hole in list(holes):
                     for current_match in matches.named(name):
                         if isinstance(current_match.value, six.string_types) and \
                                         hole.value.lower() == current_match.value.lower():
+                            if 'equivalent-ignore' in current_match.tags:
+                                continue
                             new_value = _preferred_string(hole.value, current_match.value)
                             if hole.value != new_value:
                                 hole.value = new_value
@@ -151,23 +154,33 @@ def _preferred_string(value1, value2):  # pylint:disable=too-many-return-stateme
     :return: The preferred title
     :rtype: str
     """
-    if value1 and not value2:
-        return value1
-    if value2 and not value1:
-        return value2
     if value1 == value2:
         return value1
     if value1.istitle() and not value2.istitle():
         return value1
-    if value2.istitle() and not value1.istitle():
-        return value2
-    if value1[0].isupper() and not value1[0].isupper():
+    if not value1.isupper() and value2.isupper():
+        return value1
+    if not value1.isupper() and value1[0].isupper() and not value2[0].isupper():
+        return value1
+    if _count_title_words(value1) > _count_title_words(value2):
         return value1
-    if value2[0].isupper() and not value1[0].isupper():
-        return value2
-    return value1
+    return value2
 
 
+def _count_title_words(value):
+    """
+    Count how many words in value are title-cased.
+    :param value: string whose words are enumerated with iter_words
+    :type value: str
+    :return: number of words for which str.istitle() is true
+    :rtype: int
+    """
+    ret = 0
+    for word in iter_words(value):
+        if word.group(0).istitle():
+            ret += 1
+    return ret
+
 class SeasonYear(Rule):
     """
     If a season is a valid year and no year was found, create an match with year.
@@ -195,7 +208,7 @@ class Processors(CustomRule):
     def when(self, matches, context):
         pass
 
-    def then(self, matches, when_response, context):
+    def then(self, matches, when_response, context):  # pragma: no cover
         pass
 
 

diff --git a/guessit/rules/properties/title.py b/guessit/rules/properties/title.py
@@ -8,7 +8,7 @@
 from guessit.rules.properties.film import FilmTitleRule
 from guessit.rules.properties.language import SubtitlePrefixLanguageRule, SubtitleSuffixLanguageRule, \
     SubtitleExtensionRule
-from rebulk import Rebulk, Rule, AppendMatch, RemoveMatch
+from rebulk import Rebulk, Rule, AppendMatch, RemoveMatch, AppendTags
 from rebulk.formatters import formatters
 
 from ..common.formatters import cleanup, reorder_title
@@ -288,7 +288,7 @@ class PreferTitleWithYear(Rule):
     Prefer title where filepart contains year.
     """
     dependency = TitleFromPosition
-    consequence = RemoveMatch
+    consequence = [RemoveMatch, AppendTags(['equivalent-ignore'])]
 
     properties = {'title': [None]}
 
@@ -311,15 +311,18 @@ def when(self, matches, context):
                     else:
                         with_year.append(title_match)
 
+        to_tag = []
         if with_year_in_group:
             title_values = set([title_match.value for title_match in with_year_in_group])
+            to_tag.extend(with_year_in_group)
         elif with_year:
             title_values = set([title_match.value for title_match in with_year])
+            to_tag.extend(with_year)
         else:
             title_values = set([title_match.value for title_match in titles])
 
         to_remove = []
         for title_match in titles:
             if title_match.value not in title_values:
                 to_remove.append(title_match)
-        return to_remove
+        return to_remove, to_tag
diff --git a/guessit/test/episodes.yml b/guessit/test/episodes.yml
@@ -361,7 +361,7 @@
   release_group: CtrlHD
 
 ? /home/disaster/Videos/TV/Merlin/merlin_2008.5x02.arthurs_bane_part_two.repack.720p_hdtv_x264-fov.mkv
-: title: Merlin
+: title: merlin
   season: 5
   episode: 2
   part: 2

diff --git a/guessit/test/rules/title.yml b/guessit/test/rules/title.yml
@@ -14,3 +14,19 @@
 ? title_only.mkv
 : title: title only
 
+? Some Title/some.title.mkv
+? some.title/Some.Title.mkv
+: title: Some Title
+
+? SOME TITLE/Some.title.mkv
+? Some.title/SOME TITLE.mkv
+: title: Some title
+
+? some title/Some.title.mkv
+? Some.title/some title.mkv
+: title: Some title
+
+? Some other title/Some.Other.title.mkv
+? Some.Other title/Some other title.mkv
+: title: Some Other title
+
diff --git a/guessit/test/test_yml.py b/guessit/test/test_yml.py
@@ -185,7 +185,11 @@ def check(self, string, expected):
         negates, global_, string = self.parse_token_options(string)
 
         options = expected.get('options')
-        result = guessit(string, options)
+        try:
+            result = guessit(string, options)
+        except Exception as exc:
+            logger.error('[' + string + '] Exception: ' + str(exc))
+            raise exc
 
         entry = EntryResult(string, negates)
 

diff --git a/setup.py b/setup.py
@@ -11,7 +11,7 @@
 README = open(os.path.join(here, 'README.rst')).read()
 HISTORY = open(os.path.join(here, 'HISTORY.rst')).read()
 
-install_requires = ['rebulk>=0.6.0', 'regex', 'babelfish>=0.5.5', 'python-dateutil']
+install_requires = ['rebulk>=0.6.1', 'regex', 'babelfish>=0.5.5', 'python-dateutil']
 
 setup_requires = ['pytest-runner']