forked from ourresearch/oadoi
-
Notifications
You must be signed in to change notification settings - Fork 0
/
repository.py
140 lines (114 loc) · 4.46 KB
/
repository.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import shortuuid
from sqlalchemy import or_
from sqlalchemy.orm import defer
from journal import Journal
from app import db
def get_repos_by_ids(ids):
    """Return the Repository rows whose ``id`` is one of ``ids``.

    :param ids: collection of repository id values, used in a SQL ``IN`` test.
    :return: list of matching ``Repository`` objects.
    """
    id_is_wanted = Repository.id.in_(ids)
    matching_query = db.session.query(Repository).filter(id_is_wanted)
    return matching_query.all()
def get_sources_data(query_string=None):
    """Return the filtered repositories followed by the matching journals.

    :param query_string: optional search text forwarded unchanged to both
        ``get_repository_data`` and ``get_journal_data``.
    :return: a single list, repositories first, journals after.
    """
    repositories = get_repository_data(query_string)
    journals = get_journal_data(query_string)
    return repositories + journals
def get_sources_data_fast():
    """Return every journal followed by every repository, with no filtering.

    The journal columns ``api_raw_crossref`` and ``api_raw_issn`` are marked
    as deferred on the journal query.

    :return: a single list, journals first, repositories after.
    """
    journal_query = Journal.query.options(
        defer('api_raw_crossref'),
        defer('api_raw_issn'),
    )
    journals = journal_query.all()
    repositories = Repository.query.all()
    return journals + repositories
def get_journal_data(query_string=None):
    """Return journals, optionally restricted to those matching a search.

    When ``query_string`` is truthy, a journal matches if its title or its
    publisher contains the text (case-insensitive ``ILIKE``). The text is
    placed between two ``%`` wildcards as-is, so any ``%`` or ``_`` it
    contains also acts as a wildcard.

    :param query_string: optional search text; falsy means "all journals".
    :return: list of ``Journal`` objects with ``api_raw_crossref`` and
        ``api_raw_issn`` deferred.
    """
    base_query = Journal.query.options(defer('api_raw_crossref'), defer('api_raw_issn'))

    if not query_string:
        return base_query.all()

    like_pattern = u"%{}%".format(query_string)
    title_or_publisher_matches = or_(
        Journal.title.ilike(like_pattern),
        Journal.publisher.ilike(like_pattern),
    )
    return base_query.filter(title_or_publisher_matches).all()
def get_raw_repo_meta(query_string=None):
    """Return repositories distinct on (repository_name, institution_name).

    When ``query_string`` is truthy, only rows whose repository name,
    institution name, home page or id contains the text (case-insensitive
    ``ILIKE``) are returned. The text is placed between two ``%`` wildcards
    as-is.

    :param query_string: optional search text; falsy means "no filter".
    :return: list of ``Repository`` objects.
    """
    repo_query = Repository.query.distinct(
        Repository.repository_name,
        Repository.institution_name,
    )

    if query_string:
        like_pattern = u"%{}%".format(query_string)
        searched_columns = (
            Repository.repository_name,
            Repository.institution_name,
            Repository.home_page,
            Repository.id,
        )
        any_column_matches = or_(
            *[column.ilike(like_pattern) for column in searched_columns]
        )
        repo_query = repo_query.filter(any_column_matches)

    return repo_query.all()
def _contains_block_word(text, block_words):
    """Return True if ``text`` (case-insensitively) contains any block word.

    ``None`` and empty strings never match, so nullable columns can be passed
    in directly.
    """
    if not text:
        return False
    lowered = text.lower()
    return any(word in lowered for word in block_words)


def get_repository_data(query_string=None):
    """Return the repositories from ``get_raw_repo_meta`` that look genuine.

    A repository is kept only when all of the following hold:

    * both ``repository_name`` and ``institution_name`` are non-empty;
    * neither ``bad_data`` nor ``is_journal`` is truthy;
    * none of the block words appears (case-insensitively) in its repository
      name, institution name, home page, or in the ``pmh_url`` of any of its
      endpoints.

    A missing (``None``) ``home_page`` is treated as containing no block
    word; previously it raised ``AttributeError`` on ``.lower()``.

    :param query_string: optional search text forwarded to
        ``get_raw_repo_meta``.
    :return: list of the repository objects that passed every check, in the
        order ``get_raw_repo_meta`` returned them.
    """
    # Substrings that suggest the record is a journal/publisher, not a repo.
    block_words = (
        "journal",
        "jurnal",
        "review",
        "revista",
        "annals",
        "annales",
        "magazine",
        "conference",
        "proceedings",
        "anales",
        "publisher",
        "press",
        "ojs",
        "bulletin",
        "acta",
    )

    good_repo_meta = []
    for repo_meta in get_raw_repo_meta(query_string):
        if not (repo_meta.repository_name and repo_meta.institution_name):
            continue
        if repo_meta.bad_data or repo_meta.is_journal:
            continue

        described_by = (
            repo_meta.repository_name,
            repo_meta.institution_name,
            repo_meta.home_page,  # nullable: the helper tolerates None
        )
        if any(_contains_block_word(text, block_words) for text in described_by):
            continue

        # Checked last so endpoints are only touched for otherwise-good repos.
        if any(
            _contains_block_word(endpoint.pmh_url, block_words)
            for endpoint in repo_meta.endpoints
        ):
            continue

        good_repo_meta.append(repo_meta)

    return good_repo_meta
class Repository(db.Model):
    """Database model describing one repository.

    Besides the persisted columns it offers a few derived strings (for
    comparison, de-duplication and display) and CSV / dict serialisers.
    """

    id = db.Column(db.Text, primary_key=True)
    home_page = db.Column(db.Text)
    institution_name = db.Column(db.Text)
    repository_name = db.Column(db.Text)
    error_raw = db.Column(db.Text)
    bad_data = db.Column(db.Text)
    is_journal = db.Column(db.Boolean)

    def __init__(self, **kwargs):
        """Create a repository with a fresh 10-character shortuuid id.

        The id is assigned before ``kwargs`` are handed to the base
        constructor (presumably letting an explicit ``id=`` override it;
        confirm against the base model's constructor).
        """
        self.id = shortuuid.uuid()[0:10]
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this model is subclassed.
        super(Repository, self).__init__(**kwargs)

    @property
    def text_for_comparision(self):
        """Lower-cased home page + repository name + institution name + id.

        Missing (``None``) fields contribute an empty string.
        """
        fields = (self.home_page, self.repository_name, self.institution_name, self.id)
        return u"".join((field or u"").lower() for field in fields)

    @property
    def dedup_name(self):
        """Lower-cased ``"<institution_name> <repository_name>"``.

        Missing (``None``) fields contribute an empty string.
        """
        institution = (self.institution_name or u"").lower()
        repository = (self.repository_name or u"").lower()
        return institution + " " + repository

    def display_name(self):
        """Return ``"<institution> - <repository>"`` from the non-blank names.

        Each name is stripped; blank or missing names are left out. Returns
        ``None`` when neither name has any content.
        """
        stripped = [
            (name or '').strip()
            for name in (self.institution_name, self.repository_name)
        ]
        return ' - '.join(name for name in stripped if name) or None

    def __repr__(self):
        return u"<Repository ({}) {}>".format(self.id, self.institution_name)

    def to_csv_row(self):
        """Return home page, institution name and repository name as one CSV line.

        Missing values become empty strings, and commas inside a value are
        replaced with ``"; "`` so they cannot be mistaken for separators.
        """
        cells = []
        for attr in ("home_page", "institution_name", "repository_name"):
            value = getattr(self, attr) or u''
            cells.append(value.replace(u',', u'; '))
        return u','.join(cells)

    def to_dict(self):
        """Return the home page and both names as a plain dict."""
        # "id" and the endpoint "pmh_url" are currently left out of the response.
        return {
            "home_page": self.home_page,
            "institution_name": self.institution_name,
            "repository_name": self.repository_name,
        }