In [1]:
import functools
import os
import urllib
import urllib.request


# There are two pretrained models: lid.176.bin (~130 MB, more accurate)
# and the smaller, less accurate lid.176.ftz.
# https://fasttext.cc/docs/en/language-identification.html
MODEL = 'lid.176.bin'
MODEL_DIR = 'models'
MODEL_PATH = MODEL_DIR + '/' + MODEL

# Download the model only when it is not already on disk.
if not os.path.isfile(MODEL_PATH):
    # urlretrieve() does not create missing parent directories.
    os.makedirs(MODEL_DIR, exist_ok=True)
    # Fetch under a temporary name and rename on success, so an interrupted
    # download is never mistaken for a complete model on the next run.
    partial_path = MODEL_PATH + '.part'
    urllib.request.urlretrieve(
        'https://dl.fbaipublicfiles.com/fasttext/supervised-models/' + MODEL,
        partial_path)
    os.replace(partial_path, MODEL_PATH)

In [2]:
import fasttext
from pycountry import languages


def prediction_wrapper(func):
    """
    Decorator that turns raw fastText predictions into readable records.

    'func' must take the sentence(s) as its first argument and return the
    (labels, probabilities) pair produced by a fastText model.predict() call.

    The returned wrapper accepts the same arguments as 'func' (positional or
    keyword) and always returns a list of dicts, one per input sentence, with
    the keys:
        'sentence'   -- the input sentence, unchanged;
        'language'   -- the language name looked up in pycountry, or the bare
                        language code when pycountry does not know it;
        'confidence' -- probability of the first label as a plain float
                        rounded to two decimals.
    The output has the same shape whether 'sentences' is a str or a list of
    strs. Only the first label and confidence are used, so if model.predict()
    is called with k > 1 the other predictions are ignored.

    The wrapper raises ValueError when 'sentences' is neither a str nor a
    list of strs.
    """
    label_prefix = '__label__'

    def language_name(label):
        """Translate a fastText label such as '__label__en' into a name."""
        code = label[len(label_prefix):] if label.startswith(label_prefix) else label
        # The lid.176 models emit both two-letter and three-letter codes
        # (e.g. 'en' and 'ceb'), so the lookup field depends on code length.
        field = 'alpha_2' if len(code) == 2 else 'alpha_3'
        try:
            match = languages.get(**{field: code})
        except KeyError:
            # Depending on the pycountry release an unknown code is reported
            # either as None or as KeyError; treat both as "not found".
            match = None
        return match.name if match is not None else code

    def record(sentence, labels, probabilities):
        """Build the output dict for one sentence from its top prediction."""
        return {'sentence': sentence,
                'language': language_name(labels[0]),
                # float() converts numpy scalars so the result is JSON-friendly.
                'confidence': round(float(probabilities[0]), 2)}

    @functools.wraps(func)
    def wrapper(sentences, *args, **kwargs):
        if isinstance(sentences, str):
            prediction = func(sentences, *args, **kwargs)
            return [record(sentences, prediction[0], prediction[1])]
        if isinstance(sentences, list) and all(isinstance(s, str) for s in sentences):
            prediction = func(sentences, *args, **kwargs)
            return [record(s, prediction[0][i], prediction[1][i])
                    for i, s in enumerate(sentences)]
        raise ValueError("First argument 'sentences' must be str or list of strs.")
    return wrapper
    

@functools.lru_cache(maxsize=None)
def _load_model(model_file):
    """Load a fastText model once per path and reuse it on later calls."""
    return fasttext.load_model(model_file)


@prediction_wrapper
def identify_language(sentences, model_file='models/'+MODEL):
    """
    Predict the language of one sentence or a list of sentences.

    sentences  -- str or list of strs to classify.
    model_file -- path to the fastText language-identification model.

    The body returns the raw result of model.predict(); the
    prediction_wrapper decorator converts it into the value callers receive.

    The loaded model is cached per 'model_file', so repeated calls do not
    reload it from disk. A model file replaced on disk is therefore not
    picked up until the kernel is restarted.
    """
    model = _load_model(model_file)
    # fastText's predict() handles one line per input and raises ValueError
    # on embedded newlines, so flatten them to spaces before predicting.
    if isinstance(sentences, str):
        cleaned = sentences.replace('\n', ' ')
    else:
        cleaned = [s.replace('\n', ' ') for s in sentences]
    return model.predict(cleaned)

In [3]:
from json import dumps

# Sample sentences collected from:
# https://tatoeba.org/eng/sentences/show/random
# The trailing comment on each line is the expected language code.
sample_sentences = [
    'Tom está no jardim.',                                         # pt
    'Unsere berechtigte Forderung beantworteten sie abschlägig.',  # de
    'Where is the bathroom?',                                      # en
    "Je n'aimerais pas le rencontrer dans un endroit sombre.",     # fr
    'John dice que quiere morir en el mismo lugar en que nació.',  # es
    'Том согласился помочь Мэри.',                                 # ru
    'Io non mi fido dei politici.',                                # it
]

# Classify every sample, then pretty-print the records as JSON;
# ensure_ascii=False keeps the accented and Cyrillic text readable.
predictions = identify_language(sample_sentences)
print(dumps(predictions, indent=2, sort_keys=True, ensure_ascii=False))

[
  {
    "confidence": 0.97,
    "language": "Portuguese",
    "sentence": "Tom está no jardim."
  },
  {
    "confidence": 0.98,
    "language": "German",
    "sentence": "Unsere berechtigte Forderung beantworteten sie abschlägig."
  },
  {
    "confidence": 0.96,
    "language": "English",
    "sentence": "Where is the bathroom?"
  },
  {
    "confidence": 1.0,
    "language": "French",
    "sentence": "Je n'aimerais pas le rencontrer dans un endroit sombre."
  },
  {
    "confidence": 1.0,
    "language": "Spanish",
    "sentence": "John dice que quiere morir en el mismo lugar en que nació."
  },
  {
    "confidence": 1.0,
    "language": "Russian",
    "sentence": "Том согласился помочь Мэри."
  },
  {
    "confidence": 0.99,
    "language": "Italian",
    "sentence": "Io non mi fido dei politici."
  }
]



