This repository has been archived by the owner on Jan 3, 2024. It is now read-only.

Deal with the French version of Wiktionary #92

Open · wants to merge 1 commit into base: master
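For context, a minimal usage sketch of what this change enables, based on the constructor, helpers, and test fixtures added in the diff below (the word "échelle" and the "Français" label come from the new French tests; the snippet itself is illustrative, not part of the PR):

```python
from wiktionaryparser import WiktionaryParser

# "fr" is expanded to "français" by the new abbreviation_to_language helper,
# which selects https://fr.wiktionary.org/wiki/{}?printable=yes together with
# the French section headings (étymologie, prononciation, ...).
parser = WiktionaryParser(language="fr")

# On fr.wiktionary the per-word language sections carry French names,
# so the language argument is "Français" rather than "French".
word_data = parser.fetch("échelle", language="Français")
print(word_data)
```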
8 changes: 6 additions & 2 deletions setup.py
@@ -5,12 +5,16 @@

setup(
    name = 'wiktionaryparser',
    version = '0.0.96',
    version = '0.0.99',
    description = 'A tool to parse word data from wiktionary.com into a JSON object',
    long_description = long_desc,
    long_description_content_type='text/markdown',
    packages = ['wiktionaryparser', 'tests'],
    data_files=[('testOutput', ['tests/testOutput.json']), ('readme', ['readme.md']), ('requirements', ['requirements.txt'])],
    data_files=[('testOutput_en', ['tests/testOutput_en.json']),
                ('testOutput_fr', ['tests/testOutput_fr.json']),
                ('readme', ['readme.md']),
                ('requirements', ['requirements.txt']),
                ],
    author = 'Suyash Behera',
    author_email = 'sne9x@outlook.com',
    url = 'https://github.com/Suyash458/WiktionaryParser',
605 changes: 605 additions & 0 deletions tests/html_test_files_fr/anarchie-20220207.html

Large diffs are not rendered by default.

474 changes: 474 additions & 0 deletions tests/html_test_files_fr/roquefort-20220207.html

Large diffs are not rendered by default.

543 changes: 543 additions & 0 deletions tests/html_test_files_fr/song-20220207.html

Large diffs are not rendered by default.

879 changes: 879 additions & 0 deletions tests/html_test_files_fr/échelle-20220207.html

Large diffs are not rendered by default.

File renamed without changes: tests/testOutput.json → tests/testOutput_en.json
328 changes: 328 additions & 0 deletions tests/testOutput_fr.json

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions tests/test_core.py → tests/test_core_en.py
@@ -4,15 +4,16 @@
from wiktionaryparser import WiktionaryParser
from deepdiff import DeepDiff
from typing import Dict, List
import mock
from unittest import mock
from urllib import parse
import os

parser = WiktionaryParser()


tests_dir = os.path.dirname(__file__)
html_test_files_dir = os.path.join(tests_dir, 'html_test_files')
html_test_files_dir = os.path.join(tests_dir, 'html_test_files_en')
output_test_json = os.path.join(tests_dir, "testOutput_en.json")
markup_test_files_dir = os.path.join(tests_dir, 'markup_test_files')

test_words = [
@@ -74,7 +75,7 @@ class TestParser(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        self.expected_results = {}

        with open('tests/testOutput.json', 'r') as f:
        with open(output_test_json, 'r') as f:
            self.expected_results = json.load(f)

        super(TestParser, self).__init__(*args, **kwargs)
92 changes: 92 additions & 0 deletions tests/test_core_fr.py
@@ -0,0 +1,92 @@
from parameterized import parameterized
import unittest
import json
from wiktionaryparser import WiktionaryParser
from deepdiff import DeepDiff
from typing import Dict, List
from unittest import mock
from urllib import parse
import os

parser = WiktionaryParser(language="français")


tests_dir = os.path.dirname(__file__)
html_test_files_dir = os.path.join(tests_dir, 'html_test_files_fr')
output_test_json = os.path.join(tests_dir, "testOutput_fr.json")

test_words = [
    ('anarchie', 20220207, ['Français']),
    ('anarchie', 20220207, ['Italien']),
    ('échelle', 20220207, ['Français']),
    ('roquefort', 20220207, ['Français']),
    ('song', 20220207, ['Anglais']),
]


def get_test_words_table(*allowed_words):
    """Convert the test_words array to an array of three element tuples."""
    result = []

    for word, old_id, languages in test_words:
        for language in languages:
            if len(allowed_words) == 0 or (word in allowed_words):
                result.append((language, word, old_id))

    return result


class MockResponse:
    def __init__(self, text: str):
        self.text = text


def mocked_requests_get(*args, **kwargs):
    url = args[0]
    parsed_url = parse.urlparse(url)
    params = kwargs['params']

    word = parsed_url.path.split('/')[-1]
    filepath = os.path.join(html_test_files_dir,
                            f'{word}-{params["oldid"]}.html')
    with open(filepath, 'r', encoding='utf-8') as f:
        text = f.read()

    return MockResponse(text)


class TestParser(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        self.expected_results = {}

        with open(output_test_json, 'r') as f:
            self.expected_results = json.load(f)

        super(TestParser, self).__init__(*args, **kwargs)

    @parameterized.expand(get_test_words_table())
    @mock.patch("requests.Session.get", side_effect=mocked_requests_get)
    def test_fetch_using_mock_session(self, lang: str, word: str, old_id: int, mock_get):
        self.__test_fetch(lang, word, old_id)

    def __test_fetch(self, lang: str, word: str, old_id: int):
        fetched_word = parser.fetch(word, language=lang, old_id=old_id)

        print("Testing \"{}\" in \"{}\"".format(word, lang))
        expected_result = self.expected_results[lang][word]

        diff = DeepDiff(fetched_word,
                        expected_result,
                        ignore_order=True)

        if diff != {}:
            print("Found mismatch in \"{}\" in \"{}\"".format(word, lang))
            print(json.dumps(json.loads(diff.to_json()), indent=4))
            print("Actual result:")
            print(json.dumps(fetched_word, indent=4))

        self.assertEqual(diff, {})


if __name__ == '__main__':
    unittest.main()
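The mocked tests resolve each (word, old_id) pair to a saved printable page under tests/html_test_files_fr/ and compare the parse against tests/testOutput_fr.json, so extending coverage only takes a new fixture plus an expected entry. A sketch with a hypothetical word "fromage" (not included in this PR):

```python
# Hypothetical extension (not part of this PR): cover one more French word.
# It would need tests/html_test_files_fr/fromage-20220207.html (a saved
# printable page) and an expected_results["Français"]["fromage"] entry in
# tests/testOutput_fr.json; mocked_requests_get then serves the saved HTML
# instead of hitting fr.wiktionary.org.
test_words = [
    ('anarchie', 20220207, ['Français']),
    ('anarchie', 20220207, ['Italien']),
    ('échelle', 20220207, ['Français']),
    ('roquefort', 20220207, ['Français']),
    ('song', 20220207, ['Anglais']),
    ('fromage', 20220207, ['Français']),  # hypothetical new fixture
]
```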
8 changes: 5 additions & 3 deletions wiktionaryparser/__init__.py
@@ -1,11 +1,13 @@
from pkg_resources import get_distribution
__version__ = get_distribution("wiktionaryparser").version

from wiktionaryparser.utils import WordData, Definition, RelatedWord
from wiktionaryparser.core import PARTS_OF_SPEECH, RELATIONS, WiktionaryParser
from wiktionaryparser.core import WiktionaryParser


__all__ = [
    'WordData',
    'Definition',
    'RelatedWord',
    'PARTS_OF_SPEECH',
    'RELATIONS',
    'WiktionaryParser'
]
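Worth flagging for downstream users: PARTS_OF_SPEECH and RELATIONS are no longer exported at the package level; with this change the lists live on the parser instance and depend on the configured language. A minimal sketch of the replacement access pattern (the printed values follow from the English table in languages.py below):

```python
from wiktionaryparser import WiktionaryParser

# The former module-level constants are now per-instance, per-language lists.
parser = WiktionaryParser(language="english")
print(parser.PARTS_OF_SPEECH[:3])  # ['noun', 'verb', 'adjective']
print(parser.RELATIONS[:2])        # ['synonyms', 'antonyms']
```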
44 changes: 17 additions & 27 deletions wiktionaryparser/core.py
@@ -1,25 +1,11 @@
import re, requests
from wiktionaryparser.utils import WordData, Definition, RelatedWord
from wiktionaryparser.languages import abbreviation_to_language, get_language

from bs4 import BeautifulSoup
from itertools import zip_longest
from copy import copy
from string import digits

PARTS_OF_SPEECH = [
"noun", "verb", "adjective", "adverb", "determiner",
"article", "preposition", "conjunction", "proper noun",
"letter", "character", "phrase", "proverb", "idiom",
"symbol", "syllable", "numeral", "initialism", "interjection",
"definitions", "pronoun", "particle", "predicative", "participle",
"suffix",
]

RELATIONS = [
"synonyms", "antonyms", "hypernyms", "hyponyms",
"meronyms", "holonyms", "troponyms", "related terms",
"coordinate terms",
]

def is_subheading(child, parent):
    child_headings = child.split(".")
    parent_headings = parent.split(".")
@@ -31,17 +17,17 @@ def is_subheading(child, parent):
    return True

class WiktionaryParser(object):
    def __init__(self):
        self.url = "https://en.wiktionary.org/wiki/{}?printable=yes"
    def __init__(self, language: str="english"):
        self.language = abbreviation_to_language(str(language))
        self.current_word = None
        self.soup = None
        self.PARTS_OF_SPEECH, self.RELATIONS, \
            self.ETYMOLOGY, self.PRONUNCIATION, self.url = get_language(self.language)
        self.session = requests.Session()
        self.session.mount("http://", requests.adapters.HTTPAdapter(max_retries = 2))
        self.session.mount("https://", requests.adapters.HTTPAdapter(max_retries = 2))
        self.language = 'english'
        self.current_word = None
        self.PARTS_OF_SPEECH = copy(PARTS_OF_SPEECH)
        self.RELATIONS = copy(RELATIONS)
        self.INCLUDED_ITEMS = self.RELATIONS + self.PARTS_OF_SPEECH + ['etymology', 'pronunciation']
        self.INCLUDED_ITEMS = self.RELATIONS + self.PARTS_OF_SPEECH \
            + self.ETYMOLOGY + self.PRONUNCIATION

    def include_part_of_speech(self, part_of_speech):
        part_of_speech = part_of_speech.lower()
@@ -85,9 +71,9 @@ def count_digits(self, string):

    def get_id_list(self, contents, content_type):
        if content_type == 'etymologies':
            checklist = ['etymology']
            checklist = self.ETYMOLOGY
        elif content_type == 'pronunciation':
            checklist = ['pronunciation']
            checklist = self.PRONUNCIATION
        elif content_type == 'definitions':
            checklist = self.PARTS_OF_SPEECH
            if self.language == 'chinese':
@@ -222,12 +208,16 @@ def parse_etymologies(self, word_contents):
            next_tag = span_tag.parent.find_next_sibling()
            while next_tag and next_tag.name not in ['h3', 'h4', 'div', 'h5']:
                etymology_tag = next_tag
                etymology_tag_text = ''
                next_tag = next_tag.find_next_sibling()
                if etymology_tag.name == 'p':
                    etymology_text += etymology_tag.text
                    etymology_tag_text += etymology_tag.text
                elif etymology_tag.name in ['dd', 'dl']:
                    etymology_tag_text += etymology_tag.text
                else:
                    for list_tag in etymology_tag.find_all('li'):
                        etymology_text += list_tag.text + '\n'
                        etymology_tag_text += list_tag.text + '\n'
                etymology_text += etymology_tag_text
            etymology_list.append((etymology_index, etymology_text))
        return etymology_list

81 changes: 81 additions & 0 deletions wiktionaryparser/languages.py
@@ -0,0 +1,81 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Language tools for switching the language used by the WiktionaryParser class.
"""

ABBREVIATION_TO_LANGUAGE = {
    'en': 'english',
    'fr': 'français',
}

LANGUAGES = {
"english": {
"ETYMOLOGIES_HEADER": [
'etymology',
],
"PRONUNCIATION_HEADER": [
'pronunciation',
],
"PART_OF_SPEECH": [
"noun", "verb", "adjective", "adverb", "determiner",
"article", "preposition", "conjunction", "proper noun",
"letter", "character", "phrase", "proverb", "idiom",
"symbol", "syllable", "numeral", "initialism", "interjection",
"definitions", "pronoun", "particle", "predicative", "participle",
"suffix",
],
"RELATIONS": [
"synonyms", "antonyms", "hypernyms", "hyponyms",
"meronyms", "holonyms", "troponyms", "related terms",
"coordinate terms",
],
"URL": "https://en.wiktionary.org/wiki/{}?printable=yes",
},
"français": {
"ETYMOLOGIES_HEADER": [
'étymologie',
],
"PRONUNCIATION_HEADER": [
'prononciation',
],
"PART_OF_SPEECH": [
"nom commun", "verbe", "adjectif", "adverbe", "déterminant",
"article", "preposition", "conjonction", "nom propre",
"lettre", "caractère", "expression", "proverbe", "idiome",
"symbole", "syllabe", "nombre", "acronyme", "interjection",
"définitions", "pronom", "particule", "prédicat", "participe",
"suffixe", "locution nominale",
],
"RELATIONS": [
"synonymes", "antonymes", "hypéronymes", "hyponymes",
"méronymes", "holonymes", "paronymes", "troponymes",
"vocabulaire apparenté par le sens", "dérivés",
"anagrammes", "proverbes et phrases toutes faites",
"apparentés étymologiques", "quasi-synonymes",
],
"URL": "https://fr.wiktionary.org/wiki/{}?printable=yes",
},
}

def abbreviation_to_language(language="en"):
    """If given a language abbreviation (e.g. 'en' for English), return the
    corresponding language name from the `ABBREVIATION_TO_LANGUAGE` mapping
    (e.g. 'english'); unknown values are returned unchanged.
    """
    try:
        language = ABBREVIATION_TO_LANGUAGE[language]
    except KeyError:
        pass
    return language

def get_language(language="english"):
    """Return the parts of speech, relations, etymology and pronunciation
    headers, and the URL template for the given language, as defined in the
    LANGUAGES dictionary above."""
    pos = LANGUAGES.get(language, {}).get("PART_OF_SPEECH", [])
    rel = LANGUAGES.get(language, {}).get("RELATIONS", [])
    ety = LANGUAGES.get(language, {}).get("ETYMOLOGIES_HEADER", [])
    pronun = LANGUAGES.get(language, {}).get("PRONUNCIATION_HEADER", [])
    url = LANGUAGES.get(language, {}).get("URL", "")
    return pos, rel, ety, pronun, url
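The WiktionaryParser constructor above consumes this module through abbreviation_to_language and get_language. A quick sketch of the contract the two helpers provide, with the return order taken from the unpacking in core.py (the printed values follow from the tables above):

```python
from wiktionaryparser.languages import abbreviation_to_language, get_language

language = abbreviation_to_language("fr")   # -> 'français'; unknown values pass through unchanged
pos, relations, etymology, pronunciation, url = get_language(language)

print(etymology, pronunciation)             # ['étymologie'] ['prononciation']
print(url.format("échelle"))                # https://fr.wiktionary.org/wiki/échelle?printable=yes
```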