Skip to content

Commit

Permalink
Less memory usage of language module
Browse files Browse the repository at this point in the history
  • Loading branch information
Toilal committed Feb 10, 2014
1 parent fc77c67 commit cc6a4ea
Showing 1 changed file with 25 additions and 28 deletions.
53 changes: 25 additions & 28 deletions guessit/language.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,9 @@

from __future__ import absolute_import, division, print_function, unicode_literals

from guessit.patterns import sep
from guessit import UnicodeMixin, base_text_type, u, s
from guessit.textutils import find_words
from babelfish import Language, LANGUAGES, COUNTRIES
from babelfish import Language
import babelfish
import re
import logging
Expand Down Expand Up @@ -65,7 +64,7 @@ def __init__(self):
self.alpha2 = babelfish.language_converters['alpha2']
self.name = babelfish.language_converters['name']

self.codes |= LANGUAGES | self.alpha3b.codes | self.alpha2.codes | self.name.codes
self.codes |= babelfish.LANGUAGES | self.alpha3b.codes | self.alpha2.codes | self.name.codes

for (alpha3, country), synlist in SYN.items():
for syn in synlist:
Expand Down Expand Up @@ -105,8 +104,6 @@ def reverse(self, name):
raise babelfish.LanguageReverseError(name)


ALL_NAMES = frozenset(c.lower() for c in GuessitConverter().codes)

babelfish.language_converters['guessit'] = GuessitConverter()

COUNTRIES_SYN = {'ES': ['españa'],
Expand All @@ -125,7 +122,7 @@ def __init__(self):

self.name = babelfish.country_converters['name']

self.codes |= set(COUNTRIES.keys()) | self.name.codes
self.codes |= set(babelfish.COUNTRIES.keys()) | self.name.codes

for alpha2, synlist in COUNTRIES_SYN.items():
for syn in synlist:
Expand Down Expand Up @@ -302,29 +299,34 @@ def __repr__(self):
subtitle_prefixes = ['sub', 'subs', 'st', 'vost', 'subforced', 'fansub', 'hardsub']
subtitle_suffixes = ['subforced', 'fansub', 'hardsub']

_possible_languages_hashed = {}
for valid_name in set(ALL_NAMES) - LNG_COMMON_WORDS:
_possible_languages_hashed[valid_name] = ('language', valid_name)
for subtitle_prefix in subtitle_prefixes:
_possible_languages_hashed[subtitle_prefix + valid_name] = ('subtitleLanguage', valid_name)
for subtitle_suffix in subtitle_suffixes:
_possible_languages_hashed[valid_name + subtitle_suffix] = ('subtitleLanguage', valid_name)


def find_possible_languages(string):
    """Find possible languages in the string.

    Each word of ``string`` is lowercased, stripped of any known subtitle
    prefix/suffix, and then tried as a language name or code.

    :param string: the text to scan for language words
    :return: list of tuple (property, Language, lang_word, word) where
        ``property`` is ``'language'`` or ``'subtitleLanguage'``,
        ``Language`` is the resolved language object, ``lang_word`` is the
        lowercased word with subtitle prefixes/suffixes removed, and ``word``
        is the word exactly as it was found in ``string``
    """
    valid_words = []

    for word in find_words(string):
        lang_word = word.lower()
        key = 'language'

        # A subtitle marker glued to the word (e.g. 'vostfr', 'subfrench')
        # means the remaining part names a subtitle language.
        # NOTE(review): prefixes are tested in list order against the
        # progressively stripped word, so 'sub' is removed before 'subs' or
        # 'subforced' can ever match -- confirm this is the intended behaviour.
        for prefix in subtitle_prefixes:
            if lang_word.startswith(prefix):
                lang_word = lang_word[len(prefix):]
                key = 'subtitleLanguage'

        for suffix in subtitle_suffixes:
            if lang_word.endswith(suffix):
                # Drop the suffix itself; slicing with [:len(suffix)] would
                # instead keep only the first len(suffix) characters.
                lang_word = lang_word[:-len(suffix)]
                key = 'subtitleLanguage'

        # Nothing left after stripping (the word was only a prefix/suffix),
        # or the candidate is a common word that is too ambiguous to trust.
        if not lang_word or lang_word in LNG_COMMON_WORDS:
            continue

        try:
            lang = Language(lang_word)
            # Keep language with alpha2 equivalent. Others are probably an
            # uncommon language. The check stays inside the ``try`` because
            # the attribute lookup may itself raise a babelfish error
            # (presumably a conversion error -- TODO confirm).
            if lang == 'mul' or hasattr(lang, 'alpha2'):
                valid_words.append((key, lang, lang_word, word))
        except babelfish.Error:
            # Not a recognised language name/code: ignore this word.
            pass

    return valid_words


Expand All @@ -347,18 +349,13 @@ def search_language(string, lang_filter=None):

confidence = 1.0 # for all of them

for prop, lang, word in find_possible_languages(string):
for prop, language, lang, word in find_possible_languages(string):
pos = string.find(word)
end = pos + len(word)

language = Language(lang)
if lang_filter and language not in lang_filter:
continue

if language != 'mul' and not hasattr(language, 'alpha2'):
# Found language has no alpha2 equivalent. It's probably an uncommon language.
continue

# only allow those languages that have a 2-letter code, those that
# don't are too esoteric and probably false matches
#if language.lang not in lng3_to_lng2:
Expand Down

0 comments on commit cc6a4ea

Please sign in to comment.