forked from adhikara/linguee
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlinguee2.py
112 lines (95 loc) · 3.77 KB
/
linguee2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# Translator that looks up word definitions on Linguee for the language pairs listed in DICT_LANGUAGE (not only FR-EN)
import sys
import os
try:
from urllib.request import urlopen
except ImportError:
from urllib2 import urlopen
try:
import simplejson as json
except (ImportError):
import json
from urllib.parse import quote
from bs4 import BeautifulSoup
import string
# from wiktionaryparser import WiktionaryParser
import requests
# Two-letter language code -> capitalized English language name.
# linguee() uses the lower-cased names in the request URL and the upper-cased
# codes when matching the "data-source-lang" attribute.
DICT_LANGUAGE = {
    "de": "German",
    "en": "English",
    "fr": "French",
    "es": "Spanish",
    "pt": "Portuguese",
    "it": "Italian",
    "ru": "Russian",
    "ja": "Japanese",
    "zh": "Chinese",
    "pl": "Polish",
    "nl": "Dutch",
    "sv": "Swedish",
    "da": "Danish",
    "fi": "Finnish",
    "el": "Greek",
    "cs": "Czech",
    "ro": "Romanian",
    "hu": "Hungarian",
    "sk": "Slovak",
    "bg": "Bulgarian",
    "sl": "Slovene",
    "lt": "Lithuanian",
    "lv": "Latvian",
    "et": "Estonian",
    "mt": "Maltese",
}
# Word-type filter name -> list of "title" attribute values accepted for it.
# The noun entries appear both with a regular space and with a non-breaking
# space (\xa0) after the comma.
TYPES = {
    "noun": [
        "noun, masculine",
        "noun, neuter",
        "noun, feminine",
        "noun",
        "noun,\xa0masculine",
        "noun,\xa0neuter",
        "noun,\xa0feminine",
    ],
    "adjective": ["adjective"],
    "verb": ["verb"],
}
def _resolve_language(lang):
    """Return ``(lowercase language name, two-letter code)`` for *lang*.

    *lang* is either a two-letter code or a full language name from
    DICT_LANGUAGE; both are matched case-insensitively.
    """
    if len(lang) == 2:
        code = lang.lower()
        return DICT_LANGUAGE[code].lower(), code
    name = lang.lower()
    for code, full_name in DICT_LANGUAGE.items():
        if full_name.lower() == name:
            return name, code
    raise ValueError("unsupported language: {!r}".format(lang))


def _has_ancestors(element, source_code, ancestor_classes):
    """Tell whether *element* sits under the source-language container and
    under an ancestor carrying each class in *ancestor_classes*."""
    if element.find_parent(attrs={"data-source-lang": source_code}) is None:
        return False
    return all(element.find_parent(class_=css_class) is not None
               for css_class in ancestor_classes)


def _matches_type(element, titles):
    """Tell whether *element*'s parent contains a tag titled with one of
    *titles*; ``None`` means no filtering."""
    if titles is None:
        return True
    return any(element.parent.find(attrs={"title": title}) is not None
               for title in titles)


def linguee(word, from_l, to_l, type=None):
    """
    Crawl the Linguee search page for translations of a word.

    :param word: str, the word to look up (lower-cased before querying)
    :param from_l: str, source language as a two-letter code or a full name
        from DICT_LANGUAGE
    :param to_l: str, target language, same format as ``from_l``
    :param type: optional key of TYPES ("noun", "adjective", "verb"); when
        given, only links whose parent contains a tag with a matching
        ``title`` attribute are kept. A falsy value disables the filter.
        (The name shadows the builtin but is kept for keyword callers.)
    :return: list of str, the text of every matching link, in page order
    :raises KeyError: for an unknown two-letter code or an unknown ``type``
    :raises ValueError: for an unknown full language name
    """
    word = word.lower()
    from_name, from_code = _resolve_language(from_l)
    # Only the target's name is used, but resolving it still rejects unknown
    # languages before any request is made.
    to_name, _ = _resolve_language(to_l)
    # Looked up once up front so a bad ``type`` fails before the request.
    titles = TYPES[type] if type else None

    linguee_link = "http://www.linguee.com/{}-{}/search?source=auto&query={}".format(
        from_name, to_name, quote(word))
    page = urlopen(linguee_link)
    try:
        # BeautifulSoup reads the whole response while building the tree, so
        # the connection can be released right afterwards.
        soup = BeautifulSoup(page, "lxml")
    finally:
        page.close()

    source_code = from_code.upper()
    # (class of the <a> links to scan, classes required among their ancestors)
    # NOTE(review): class_="dictLink" presumably also matches the featured
    # links from the first pass, so a featured exact match may be listed
    # twice -- confirm whether callers rely on that before de-duplicating.
    passes = (
        ("dictLink featured", ("lemma_content",)),
        ("dictLink", ("lemma_content", "translation_group", "exact")),
    )
    definitions = []
    for link_class, ancestor_classes in passes:
        for element in soup.find_all('a', class_=link_class):
            if (_has_ancestors(element, source_code, ancestor_classes)
                    and _matches_type(element, titles)):
                definitions.append(element.get_text())
    return definitions
if __name__ == "__main__":
    # Demo: translate a German noun to English, then translate each English
    # result back to German and show the collected back-translations.
    lang_from, lang_to = "de", "en"
    key = "Messwert"
    forward = linguee(key, lang_from, lang_to, "noun")
    backward = []
    for candidate in forward:
        print("{},{},{}".format(candidate, lang_to, lang_from))
        backward.append(linguee(candidate, lang_to, lang_from, "noun"))
    print(backward)