This repository has been archived by the owner on Jan 3, 2024. It is now read-only.

Deal with the French version of Wiktionary #92

Open · wants to merge 1 commit into base: master
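For context, a minimal usage sketch of what this change enables, based on the constructor, helpers, and test fixtures added in the diff below (the word "échelle" and the "Français" label come from the new French tests; the snippet itself is illustrative, not part of the PR):

```python
from wiktionaryparser import WiktionaryParser

# "fr" is expanded to "français" by the new abbreviation_to_language helper,
# which selects https://fr.wiktionary.org/wiki/{}?printable=yes together with
# the French section headings (étymologie, prononciation, ...).
parser = WiktionaryParser(language="fr")

# On fr.wiktionary the per-word language sections carry French names,
# so the language argument is "Français" rather than "French".
word_data = parser.fetch("échelle", language="Français")
print(word_data)
```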
8 changes: 6 additions & 2 deletions setup.py
@@ -5,12 +5,16 @@

setup(
    name = 'wiktionaryparser',
    version = '0.0.96',
    version = '0.0.99',
    description = 'A tool to parse word data from wiktionary.com into a JSON object',
    long_description = long_desc,
    long_description_content_type='text/markdown',
    packages = ['wiktionaryparser', 'tests'],
    data_files=[('testOutput', ['tests/testOutput.json']), ('readme', ['readme.md']), ('requirements', ['requirements.txt'])],
    data_files=[('testOutput_en', ['tests/testOutput_en.json']),
                ('testOutput_fr', ['tests/testOutput_fr.json']),
                ('readme', ['readme.md']),
                ('requirements', ['requirements.txt']),
                ],
    author = 'Suyash Behera',
    author_email = 'sne9x@outlook.com',
    url = 'https://github.com/Suyash458/WiktionaryParser',
605 changes: 605 additions & 0 deletions tests/html_test_files_fr/anarchie-20220207.html

Large diffs are not rendered by default.

474 changes: 474 additions & 0 deletions tests/html_test_files_fr/roquefort-20220207.html

Large diffs are not rendered by default.

543 changes: 543 additions & 0 deletions tests/html_test_files_fr/song-20220207.html

Large diffs are not rendered by default.

879 changes: 879 additions & 0 deletions tests/html_test_files_fr/échelle-20220207.html

Large diffs are not rendered by default.

File renamed without changes: tests/testOutput.json → tests/testOutput_en.json
328 changes: 328 additions & 0 deletions tests/testOutput_fr.json

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions tests/test_core.py → tests/test_core_en.py
@@ -4,15 +4,16 @@
from wiktionaryparser import WiktionaryParser
from deepdiff import DeepDiff
from typing import Dict, List
import mock
from unittest import mock
from urllib import parse
import os

parser = WiktionaryParser()


tests_dir = os.path.dirname(__file__)
html_test_files_dir = os.path.join(tests_dir, 'html_test_files')
html_test_files_dir = os.path.join(tests_dir, 'html_test_files_en')
output_test_json = os.path.join(tests_dir, "testOutput_en.json")
markup_test_files_dir = os.path.join(tests_dir, 'markup_test_files')

test_words = [
@@ -74,7 +75,7 @@ class TestParser(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        self.expected_results = {}

        with open('tests/testOutput.json', 'r') as f:
        with open(output_test_json, 'r') as f:
            self.expected_results = json.load(f)

        super(TestParser, self).__init__(*args, **kwargs)
92 changes: 92 additions & 0 deletions tests/test_core_fr.py
@@ -0,0 +1,92 @@
from parameterized import parameterized
import unittest
import json
from wiktionaryparser import WiktionaryParser
from deepdiff import DeepDiff
from typing import Dict, List
from unittest import mock
from urllib import parse
import os

parser = WiktionaryParser(language="français")


tests_dir = os.path.dirname(__file__)
html_test_files_dir = os.path.join(tests_dir, 'html_test_files_fr')
output_test_json = os.path.join(tests_dir, "testOutput_fr.json")

test_words = [
    ('anarchie', 20220207, ['Français']),
    ('anarchie', 20220207, ['Italien']),
    ('échelle', 20220207, ['Français']),
    ('roquefort', 20220207, ['Français']),
    ('song', 20220207, ['Anglais']),
]


def get_test_words_table(*allowed_words):
    """Convert the test_words array to an array of three element tuples."""
    result = []

    for word, old_id, languages in test_words:
        for language in languages:
            if len(allowed_words) == 0 or (word in allowed_words):
                result.append((language, word, old_id))

    return result


class MockResponse:
    def __init__(self, text: str):
        self.text = text


def mocked_requests_get(*args, **kwargs):
    url = args[0]
    parsed_url = parse.urlparse(url)
    params = kwargs['params']

    word = parsed_url.path.split('/')[-1]
    filepath = os.path.join(html_test_files_dir,
                            f'{word}-{params["oldid"]}.html')
    with open(filepath, 'r', encoding='utf-8') as f:
        text = f.read()

    return MockResponse(text)


class TestParser(unittest.TestCase):
    def __init__(self, *args, **kwargs):
        self.expected_results = {}

        with open(output_test_json, 'r') as f:
            self.expected_results = json.load(f)

        super(TestParser, self).__init__(*args, **kwargs)

    @parameterized.expand(get_test_words_table())
    @mock.patch("requests.Session.get", side_effect=mocked_requests_get)
    def test_fetch_using_mock_session(self, lang: str, word: str, old_id: int, mock_get):
        self.__test_fetch(lang, word, old_id)

    def __test_fetch(self, lang: str, word: str, old_id: int):
        fetched_word = parser.fetch(word, language=lang, old_id=old_id)

        print("Testing \"{}\" in \"{}\"".format(word, lang))
        expected_result = self.expected_results[lang][word]

        diff = DeepDiff(fetched_word,
                        expected_result,
                        ignore_order=True)

        if diff != {}:
            print("Found mismatch in \"{}\" in \"{}\"".format(word, lang))
            print(json.dumps(json.loads(diff.to_json()), indent=4))
            print("Actual result:")
            print(json.dumps(fetched_word, indent=4))

        self.assertEqual(diff, {})


if __name__ == '__main__':
    unittest.main()
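The mocked tests resolve each (word, old_id) pair to a saved printable page under tests/html_test_files_fr/ and compare the parse against tests/testOutput_fr.json, so extending coverage only takes a new fixture plus an expected entry. A sketch with a hypothetical word "fromage" (not included in this PR):

```python
# Hypothetical extension (not part of this PR): cover one more French word.
# It would need tests/html_test_files_fr/fromage-20220207.html (a saved
# printable page) and an expected_results["Français"]["fromage"] entry in
# tests/testOutput_fr.json; mocked_requests_get then serves the saved HTML
# instead of hitting fr.wiktionary.org.
test_words = [
    ('anarchie', 20220207, ['Français']),
    ('anarchie', 20220207, ['Italien']),
    ('échelle', 20220207, ['Français']),
    ('roquefort', 20220207, ['Français']),
    ('song', 20220207, ['Anglais']),
    ('fromage', 20220207, ['Français']),  # hypothetical new fixture
]
```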
8 changes: 5 additions & 3 deletions wiktionaryparser/__init__.py
@@ -1,11 +1,13 @@
from pkg_resources import get_distribution
__version__ = get_distribution("wiktionaryparser").version

from wiktionaryparser.utils import WordData, Definition, RelatedWord
from wiktionaryparser.core import PARTS_OF_SPEECH, RELATIONS, WiktionaryParser
from wiktionaryparser.core import WiktionaryParser


__all__ = [
    'WordData',
    'Definition',
    'RelatedWord',
    'PARTS_OF_SPEECH',
    'RELATIONS',
    'WiktionaryParser'
]
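Worth flagging for downstream users: PARTS_OF_SPEECH and RELATIONS are no longer exported at the package level; with this change the lists live on the parser instance and depend on the configured language. A minimal sketch of the replacement access pattern (the printed values follow from the English table in languages.py below):

```python
from wiktionaryparser import WiktionaryParser

# The former module-level constants are now per-instance, per-language lists.
parser = WiktionaryParser(language="english")
print(parser.PARTS_OF_SPEECH[:3])  # ['noun', 'verb', 'adjective']
print(parser.RELATIONS[:2])        # ['synonyms', 'antonyms']
```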
44 changes: 17 additions & 27 deletions wiktionaryparser/core.py
@@ -1,25 +1,11 @@
import re, requests
from wiktionaryparser.utils import WordData, Definition, RelatedWord
from wiktionaryparser.languages import abbreviation_to_language, get_language

from bs4 import BeautifulSoup
from itertools import zip_longest
from copy import copy
from string import digits

PARTS_OF_SPEECH = [
"noun", "verb", "adjective", "adverb", "determiner",
"article", "preposition", "conjunction", "proper noun",
"letter", "character", "phrase", "proverb", "idiom",
"symbol", "syllable", "numeral", "initialism", "interjection",
"definitions", "pronoun", "particle", "predicative", "participle",
"suffix",
]

RELATIONS = [
"synonyms", "antonyms", "hypernyms", "hyponyms",
"meronyms", "holonyms", "troponyms", "related terms",
"coordinate terms",
]

def is_subheading(child, parent):
    child_headings = child.split(".")
    parent_headings = parent.split(".")
@@ -31,17 +17,17 @@ def is_subheading(child, parent):
    return True

class WiktionaryParser(object):
    def __init__(self):
        self.url = "https://en.wiktionary.org/wiki/{}?printable=yes"
    def __init__(self, language: str="english"):
        self.language = abbreviation_to_language(str(language))
        self.current_word = None
        self.soup = None
        self.PARTS_OF_SPEECH, self.RELATIONS, \
            self.ETYMOLOGY, self.PRONUNCIATION, self.url = get_language(self.language)
        self.session = requests.Session()
        self.session.mount("http://", requests.adapters.HTTPAdapter(max_retries = 2))
        self.session.mount("https://", requests.adapters.HTTPAdapter(max_retries = 2))
        self.language = 'english'
        self.current_word = None
        self.PARTS_OF_SPEECH = copy(PARTS_OF_SPEECH)
        self.RELATIONS = copy(RELATIONS)
        self.INCLUDED_ITEMS = self.RELATIONS + self.PARTS_OF_SPEECH + ['etymology', 'pronunciation']
        self.INCLUDED_ITEMS = self.RELATIONS + self.PARTS_OF_SPEECH \
            + self.ETYMOLOGY + self.PRONUNCIATION

    def include_part_of_speech(self, part_of_speech):
        part_of_speech = part_of_speech.lower()
@@ -85,9 +71,9 @@ def count_digits(self, string):

    def get_id_list(self, contents, content_type):
        if content_type == 'etymologies':
            checklist = ['etymology']
            checklist = self.ETYMOLOGY
        elif content_type == 'pronunciation':
            checklist = ['pronunciation']
            checklist = self.PRONUNCIATION
        elif content_type == 'definitions':
            checklist = self.PARTS_OF_SPEECH
            if self.language == 'chinese':
@@ -222,12 +208,16 @@ def parse_etymologies(self, word_contents):
            next_tag = span_tag.parent.find_next_sibling()
            while next_tag and next_tag.name not in ['h3', 'h4', 'div', 'h5']:
                etymology_tag = next_tag
                etymology_tag_text = ''
                next_tag = next_tag.find_next_sibling()
                if etymology_tag.name == 'p':
                    etymology_text += etymology_tag.text
                    etymology_tag_text += etymology_tag.text
                elif etymology_tag.name in ['dd', 'dl']:
                    etymology_tag_text += etymology_tag.text
                else:
                    for list_tag in etymology_tag.find_all('li'):
                        etymology_text += list_tag.text + '\n'
                        etymology_tag_text += list_tag.text + '\n'
                etymology_text += etymology_tag_text
            etymology_list.append((etymology_index, etymology_text))
        return etymology_list

81 changes: 81 additions & 0 deletions wiktionaryparser/languages.py
@@ -0,0 +1,81 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Language tools for switching the language used by the WiktionaryParser class.
"""

ABBREVIATION_TO_LANGUAGE = {
    'en': 'english',
    'fr': 'français',
}

LANGUAGES = {
"english": {
"ETYMOLOGIES_HEADER": [
'etymology',
],
"PRONUNCIATION_HEADER": [
'pronunciation',
],
"PART_OF_SPEECH": [
"noun", "verb", "adjective", "adverb", "determiner",
"article", "preposition", "conjunction", "proper noun",
"letter", "character", "phrase", "proverb", "idiom",
"symbol", "syllable", "numeral", "initialism", "interjection",
"definitions", "pronoun", "particle", "predicative", "participle",
"suffix",
],
"RELATIONS": [
"synonyms", "antonyms", "hypernyms", "hyponyms",
"meronyms", "holonyms", "troponyms", "related terms",
"coordinate terms",
],
"URL": "https://en.wiktionary.org/wiki/{}?printable=yes",
},
"français": {
"ETYMOLOGIES_HEADER": [
'étymologie',
],
"PRONUNCIATION_HEADER": [
'prononciation',
],
"PART_OF_SPEECH": [
"nom commun", "verbe", "adjectif", "adverbe", "déterminant",
"article", "preposition", "conjonction", "nom propre",
"lettre", "caractère", "expression", "proverbe", "idiome",
"symbole", "syllabe", "nombre", "acronyme", "interjection",
"définitions", "pronom", "particule", "prédicat", "participe",
"suffixe", "locution nominale",
],
"RELATIONS": [
"synonymes", "antonymes", "hypéronymes", "hyponymes",
"méronymes", "holonymes", "paronymes", "troponymes",
"vocabulaire apparenté par le sens", "dérivés",
"anagrammes", "proverbes et phrases toutes faites",
"apparentés étymologiques", "quasi-synonymes",
],
"URL": "https://fr.wiktionary.org/wiki/{}?printable=yes",
},
}

def abbreviation_to_language(language="en"):
    """If given a language abbreviation (e.g. 'en' for English), return the
    corresponding language name from the `ABBREVIATION_TO_LANGUAGE` mapping
    (e.g. 'english'); unknown values are returned unchanged.
    """
    try:
        language = ABBREVIATION_TO_LANGUAGE[language]
    except KeyError:
        pass
    return language

def get_language(language="english"):
    """Return the parts of speech, relations, etymology and pronunciation
    headers, and the URL template for the given language, as defined in the
    LANGUAGES dictionary above."""
    pos = LANGUAGES.get(language, {}).get("PART_OF_SPEECH", [])
    rel = LANGUAGES.get(language, {}).get("RELATIONS", [])
    ety = LANGUAGES.get(language, {}).get("ETYMOLOGIES_HEADER", [])
    pronun = LANGUAGES.get(language, {}).get("PRONUNCIATION_HEADER", [])
    url = LANGUAGES.get(language, {}).get("URL", "")
    return pos, rel, ety, pronun, url
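The WiktionaryParser constructor above consumes this module through abbreviation_to_language and get_language. A quick sketch of the contract the two helpers provide, with the return order taken from the unpacking in core.py (the printed values follow from the tables above):

```python
from wiktionaryparser.languages import abbreviation_to_language, get_language

language = abbreviation_to_language("fr")   # -> 'français'; unknown values pass through unchanged
pos, relations, etymology, pronunciation, url = get_language(language)

print(etymology, pronunciation)             # ['étymologie'] ['prononciation']
print(url.format("échelle"))                # https://fr.wiktionary.org/wiki/échelle?printable=yes
```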