Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 29 additions & 37 deletions dateparser/custom_language_detection/fasttext.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,35 @@
import os
"""
Deprecated fastText language detection module.

import fasttext
This module is deprecated as the fastText library is archived and unmaintained.
Please use langdetect instead.
"""

from dateparser_cli.exceptions import FastTextModelNotFoundException
from dateparser_cli.fasttext_manager import fasttext_downloader
from dateparser_cli.utils import create_data_model_home, dateparser_model_home
import warnings

_supported_models = ["large.bin", "small.bin"]
_DEFAULT_MODEL = "small"


class _FastTextCache:
model = None


def _load_fasttext_model():
if _FastTextCache.model:
return _FastTextCache.model
create_data_model_home()
downloaded_models = [
file for file in os.listdir(dateparser_model_home) if file in _supported_models
]
if not downloaded_models:
fasttext_downloader(_DEFAULT_MODEL)
return _load_fasttext_model()
model_path = os.path.join(dateparser_model_home, downloaded_models[0])
if not os.path.isfile(model_path):
raise FastTextModelNotFoundException("Fasttext model file not found")
_FastTextCache.model = fasttext.load_model(model_path)
return _FastTextCache.model
# Import-time notice for anyone still importing this module.
# detect_languages() in this module raises unconditionally, so fastText
# support is already gone; the message says "removed" rather than promising a
# future removal. stacklevel=2 reports the warning one frame above this
# module's own code instead of against this line.
warnings.warn(
    "fastText support has been removed and this module will be deleted in a future version. "
    "The fastText library is archived and unmaintained. "
    "Please migrate to langdetect: from dateparser.custom_language_detection.langdetect import detect_languages",
    DeprecationWarning,
    stacklevel=2,
)


def detect_languages(text, confidence_threshold):
_language_parser = _load_fasttext_model()
text = text.replace("\n", " ").replace("\r", "")
language_codes = []
parser_data = _language_parser.predict(text)
for idx, language_probability in enumerate(parser_data[1]):
if language_probability > confidence_threshold:
language_code = parser_data[0][idx].replace("__label__", "")
language_codes.append(language_code)
return language_codes
"""
Deprecated function. FastText support has been removed.

Args:
text: The text to detect languages from (unused)
confidence_threshold: Minimum confidence threshold (unused)

Raises:
ImportError: Always, as fastText is no longer supported.
"""
raise ImportError(
"fastText is no longer supported as the library is archived and unmaintained. "
"Please use langdetect instead:\n"
" pip install dateparser[langdetect]\n"
" from dateparser.custom_language_detection.langdetect import detect_languages"
)
18 changes: 13 additions & 5 deletions dateparser_cli/cli.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import argparse
import logging
import warnings

from .fasttext_manager import fasttext_downloader
from .utils import clear_cache


Expand All @@ -12,7 +12,7 @@ def entrance():
dateparser_argparse.add_argument(
"--fasttext",
type=str,
help='To download a fasttext language detection models. Supported models are "small" and "large"',
help="[DEPRECATED] fastText is no longer supported. Please use langdetect instead.",
)
dateparser_argparse.add_argument(
"--clear",
Expand All @@ -28,9 +28,17 @@ def entrance():
logging.info("dateparser-download: All cache deleted")

if args.fasttext:
fasttext_downloader(args.fasttext)
warnings.warn(
"fastText support has been removed as the library is archived and unmaintained. "
"Please migrate to langdetect. Install with: pip install dateparser[langdetect]",
DeprecationWarning,
stacklevel=2,
)
dateparser_argparse.error(
"fastText is no longer supported. Please use langdetect for language detection."
)

if not (args.clear or args.fasttext):
if not args.clear:
dateparser_argparse.error(
"dateparser-download: You need to specify the command (i.e.: --fasttext or --clear)"
"dateparser-download: You need to specify the command (i.e.: --clear)"
)
44 changes: 8 additions & 36 deletions docs/custom_language_detection.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@ Custom language detection
=========================

`dateparser` lets you customize the language detection behavior through the ``detect_languages_function`` parameter.
It currently supports two language detection libraries out of the box: `fastText <https://github.com/facebookresearch/fastText>`_
and `langdetect <https://github.com/Mimino666/langdetect>`_, and allows you to implement your own custom language detection.
It supports the `langdetect <https://github.com/Mimino666/langdetect>`_ library out of the box, and allows you to implement your own custom language detection.

.. warning::

Expand All @@ -14,51 +13,24 @@ and `langdetect <https://github.com/Mimino666/langdetect>`_, and allows you to i
Built-in implementations
========================

fastText
~~~~~~~~
Language detection with fastText.

Import the fastText wrapper and pass it as ``detect_languages_function``
parameter. Example::

>>> from dateparser.custom_language_detection.fasttext import detect_languages
>>> dateparser.parse('12/12/12', detect_languages_function=detect_languages)

The fastText integration currently supports the large and the small models.
Find more information about `fasttext <https://fasttext.cc/blog/2017/10/02/blog-post.html>`__ models.
You can download your model of choice using ``dateparser-download``.

Downloading small model::

>>> dateparser-download --fasttext small

Downloading large model::

>>> dateparser-download --fasttext large

Deleting all cached models::

>>> dateparser-download --clear_cache

.. note::

If no model has been downloaded, the fastText wrapper downloads and uses
the small model by default.

langdetect
~~~~~~~~~~
Language detection with langdetect.

Import the langdetect wrapper and pass it as ``detect_languages_function``
To use langdetect, first install it::

pip install dateparser[langdetect]

Then import the langdetect wrapper and pass it as ``detect_languages_function``
parameter. Example::

>>> from dateparser.custom_language_detection.langdetect import detect_languages
>>> dateparser.parse('12/12/12', detect_languages_function=detect_languages)


.. note::

From some tests we did, we recommend to use ``fastText`` for faster and more accurate results.
**fastText support removed**: The fastText integration has been removed because the fastText library
is archived and no longer maintained. If you were using fastText, please migrate to langdetect.

Custom implementation
=====================
Expand Down
1 change: 0 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
},
extras_require={
"calendars": ["convertdate>=2.2.1", "hijridate"],
"fasttext": ["fasttext>=0.9.1", "numpy>=1.22.0,<2"],
"langdetect": ["langdetect>=1.0.0"],
},
license="BSD",
Expand Down
141 changes: 104 additions & 37 deletions tests/test_language_detect.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest

pytest.importorskip("fasttext")
pytest.importorskip("langdetect")

import unittest
from datetime import datetime
Expand All @@ -9,50 +9,117 @@
from parameterized import param, parameterized

from dateparser import parse
from dateparser.custom_language_detection.fasttext import (
detect_languages as fast_text_detect_languages,
)
from dateparser.custom_language_detection.langdetect import (
detect_languages as lang_detect_detect_languages,
)
from dateparser.date import DateDataParser
from dateparser.search import search_dates

detect_languages = Mock()
detect_languages.return_value = ["en"]

class LangDetectBasicTest(unittest.TestCase):
"""Tests for basic langdetect functionality"""

class CustomLangDetectParserTest(unittest.TestCase):
def check_is_returned_list(self):
self.assertEqual(type(self.result), list)
def test_returns_list(self):
    """The langdetect wrapper returns its detections as a list."""
    self.assertIsInstance(
        lang_detect_detect_languages("14 June 2020", 0.0),
        list,
    )

@parameterized.expand(
[
param(dt_string="14 June 2020", confidence_threshold=0.0),
param(dt_string="26 July 2021", confidence_threshold=0.0),
]
)
def test_custom_language_detect_fast_text(self, dt_string, confidence_threshold):
self.result = fast_text_detect_languages(dt_string, confidence_threshold)
self.check_is_returned_list()
def test_detects_english(self):
    """An English sentence yields "en" among the codes returned at threshold 0.5."""
    sentence = "The meeting is scheduled for Tuesday, July 22, 2014"
    detected = lang_detect_detect_languages(sentence, 0.5)
    self.assertIn("en", detected)

@parameterized.expand(
[
param(dt_string="14 June 2020", confidence_threshold=0.0),
]
)
def test_custom_language_detect_lang_detect(self, dt_string, confidence_threshold):
self.result = lang_detect_detect_languages(dt_string, confidence_threshold)
self.check_is_returned_list()
def test_detects_spanish(self):
    """A Spanish sentence yields "es" among the codes returned at threshold 0.5."""
    sentence = "La reunión está programada para el martes 22 de julio de 2014"
    detected = lang_detect_detect_languages(sentence, 0.5)
    self.assertIn("es", detected)

@parameterized.expand(
[
param(dt_string="10-10-2021", confidence_threshold=0.5),
]
)
def test_lang_detect_doesnt_raise_error(self, dt_string, confidence_threshold):
result = lang_detect_detect_languages(dt_string, confidence_threshold)
assert result == []
def test_detects_french(self):
    """A French sentence yields "fr" among the codes returned at threshold 0.5."""
    sentence = "La réunion est prévue pour le mardi 22 juillet 2014"
    detected = lang_detect_detect_languages(sentence, 0.5)
    self.assertIn("fr", detected)

def test_detects_german(self):
    """A German sentence yields "de" among the codes returned at threshold 0.5."""
    sentence = "Das Treffen ist für Dienstag, den 22. Juli 2014 geplant"
    detected = lang_detect_detect_languages(sentence, 0.5)
    self.assertIn("de", detected)

def test_handles_numeric_only_input(self):
    """A digits-and-dashes string produces an empty list instead of an error."""
    self.assertEqual(lang_detect_detect_languages("10-10-2021", 0.5), [])

def test_handles_empty_string(self):
    """An empty string produces an empty list instead of an error."""
    self.assertEqual(lang_detect_detect_languages("", 0.5), [])

def test_confidence_threshold_filters_results(self):
    """Check that ``confidence_threshold`` really filters the detections.

    The same text is detected twice so the two results are comparable:
    once with the lowest threshold, once with a threshold no candidate
    can reach. Comparing different strings, or only checking the return
    type, would pass even if the threshold were ignored.
    """
    text = "14 June 2020"

    # With the lowest threshold at least one language must be reported.
    result_low = lang_detect_detect_languages(text, 0.0)
    self.assertGreater(len(result_low), 0)

    # A probability cannot exceed 1.0, so a threshold above 1.0 must drop
    # every candidate. This stays deterministic even though detection
    # confidence on short strings can vary between runs.
    # NOTE(review): assumes the wrapper compares each candidate's
    # probability against the threshold - confirm against the wrapper.
    result_high = lang_detect_detect_languages(text, 1.1)
    self.assertEqual(result_high, [])


class LangDetectIntegrationTest(unittest.TestCase):
    """Run dateparser's entry points with the langdetect wrapper plugged in."""

    def test_parse_with_langdetect_english(self):
        """``parse`` resolves an English date string to the exact datetime."""
        parsed = parse(
            "Tuesday Jul 22, 2014",
            detect_languages_function=lang_detect_detect_languages,
        )
        self.assertEqual(parsed, datetime(2014, 7, 22, 0, 0, 0))

    def test_parse_with_langdetect_spanish(self):
        """``parse`` resolves a Spanish date string to 22 July 2014."""
        parsed = parse(
            "martes 22 de julio de 2014",
            detect_languages_function=lang_detect_detect_languages,
        )
        self.assertIsNotNone(parsed)
        # Checked field by field, in year/month/day order.
        for field, expected in (("year", 2014), ("month", 7), ("day", 22)):
            self.assertEqual(getattr(parsed, field), expected)

    def test_parse_with_langdetect_french(self):
        """``parse`` resolves a French date string to 22 July 2014."""
        parsed = parse(
            "mardi 22 juillet 2014",
            detect_languages_function=lang_detect_detect_languages,
        )
        self.assertIsNotNone(parsed)
        # Checked field by field, in year/month/day order.
        for field, expected in (("year", 2014), ("month", 7), ("day", 22)):
            self.assertEqual(getattr(parsed, field), expected)

    def test_datedataparser_with_langdetect(self):
        """``DateDataParser`` accepts the wrapper and returns the expected ``date_obj``."""
        parser = DateDataParser(
            detect_languages_function=lang_detect_detect_languages
        )
        date_data = parser.get_date_data("Tuesday Jul 22, 2014")
        self.assertEqual(date_data["date_obj"], datetime(2014, 7, 22, 0, 0, 0))

    def test_search_dates_with_langdetect(self):
        """``search_dates`` finds both dates in an English sentence."""
        matches = search_dates(
            "The event is on January 3, 2017 and ends February 1st",
            detect_languages_function=lang_detect_detect_languages,
        )
        self.assertIsNotNone(matches)
        self.assertEqual(len(matches), 2)
        # Each match is indexed at [1] for its parsed datetime.
        first_match, second_match = matches
        self.assertEqual(first_match[1], datetime(2017, 1, 3, 0, 0))
        self.assertEqual(second_match[1], datetime(2017, 2, 1, 0, 0))

    def test_parse_without_langdetect_still_works(self):
        """``parse`` still works when no custom detection function is supplied."""
        parsed = parse("Tuesday Jul 22, 2014")
        self.assertEqual(parsed, datetime(2014, 7, 22, 0, 0, 0))


class MockLangDetectTest(unittest.TestCase):
"""Tests with mocked language detection"""

# Mock test for parse, search_dates and DateDataParser

Expand All @@ -62,7 +129,7 @@ def test_lang_detect_doesnt_raise_error(self, dt_string, confidence_threshold):
# parse

def when_date_is_parsed_using_parse(self, dt_string):
self.result = parse(dt_string, detect_languages_function=detect_languages)
self.result = parse(dt_string, detect_languages_function=self.detect_languages)

def then_date_obj_exactly_is(self, expected_date_obj):
self.assertEqual(expected_date_obj, self.result)
Expand All @@ -79,7 +146,7 @@ def test_custom_language_detect_mock_parse(self, dt_string, expected_date_obj):
# DateDataParser

def when_date_is_parsed_using_with_datedataparser(self, dt_string):
ddp = DateDataParser(detect_languages_function=detect_languages)
ddp = DateDataParser(detect_languages_function=self.detect_languages)
self.result = ddp.get_date_data(dt_string)["date_obj"]

@parameterized.expand(
Expand All @@ -97,7 +164,7 @@ def test_custom_language_detect_mock_datedataparser(

def when_date_is_parsed_using_with_search_dates(self, dt_string):
self.result = search_dates(
dt_string, detect_languages_function=detect_languages
dt_string, detect_languages_function=self.detect_languages
)

@parameterized.expand(
Expand Down
4 changes: 0 additions & 4 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ passenv = TZ
basepython = python3.14
extras =
calendars
fasttext
langdetect

[testenv:min]
Expand All @@ -39,9 +38,6 @@ deps =
convertdate==2.2.1
hijridate==2.3.0
langdetect==1.0.0
numpy==1.22.0
# fasttext excluded due to
# https://github.com/facebookresearch/fastText/issues/512

[testenv:scripts]
deps =
Expand Down
Loading