-
Notifications
You must be signed in to change notification settings - Fork 48
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
(PXP:9493):(feat)Non-dbgap whitelist in usersync #1020
Changes from all commits
485cc34
3cfae50
bce50df
d681451
0fa758f
371ace1
c81e2a9
0cdf78c
0437d33
9c90043
492a969
16bbe64
eb16450
5f7d11a
04e6063
85bb785
15d43d7
1afd7c2
ba1c456
0aa3d45
c480e0c
d99ceca
2add969
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -356,9 +356,10 @@ def __init__( | |
self.storage_manager = StorageManager( | ||
storage_credentials, logger=self.logger | ||
) | ||
self.id_patterns = [] | ||
|
||
@staticmethod | ||
def _match_pattern(filepath, encrypted=True): | ||
def _match_pattern(filepath, id_patterns, encrypted=True): | ||
""" | ||
Check if the filename matches dbgap access control file pattern | ||
|
||
|
@@ -369,11 +370,18 @@ def _match_pattern(filepath, encrypted=True): | |
Returns: | ||
bool: whether the pattern matches | ||
""" | ||
pattern = r"authentication_file_phs(\d{6}).(csv|txt)" | ||
if encrypted: | ||
pattern += ".enc" | ||
pattern += "$" | ||
return re.match(pattern, os.path.basename(filepath)) | ||
id_patterns.append("authentication_file_phs(\d{6}).(csv|txt)") | ||
for pattern in id_patterns: | ||
pattern = r"{}".format(pattern) | ||
if encrypted: | ||
pattern += ".enc" | ||
pattern += "$" | ||
pattern = pattern.encode().decode( | ||
"unicode_escape" | ||
) # when converting the YAML from fence-config, python reads it as Python string literal. So "\" turns into "\\" which messes with the regex match | ||
if re.match(pattern, os.path.basename(filepath)): | ||
return True | ||
return False | ||
|
||
def _get_from_sftp_with_proxy(self, server, path): | ||
""" | ||
|
@@ -489,15 +497,20 @@ def _parse_csv(self, file_dict, sess, dbgap_config={}, encrypted=True): | |
|
||
# parse dbGaP sftp server information | ||
dbgap_key = dbgap_config.get("decrypt_key", None) | ||
parse_consent_code = dbgap_config.get("parse_consent_code", True) | ||
|
||
self.id_patterns += ( | ||
dbgap_config.get("allowed_whitelist_patterns", []) | ||
if dbgap_config.get("allow_non_dbGaP_whitelist", False) | ||
else [] | ||
) | ||
enable_common_exchange_area_access = dbgap_config.get( | ||
"enable_common_exchange_area_access", False | ||
) | ||
study_common_exchange_areas = dbgap_config.get( | ||
"study_common_exchange_areas", {} | ||
) | ||
|
||
if parse_consent_code and enable_common_exchange_area_access: | ||
if self.parse_consent_code and enable_common_exchange_area_access: | ||
self.logger.info( | ||
f"using study to common exchange area mapping: {study_common_exchange_areas}" | ||
) | ||
|
@@ -506,7 +519,9 @@ def _parse_csv(self, file_dict, sess, dbgap_config={}, encrypted=True): | |
if os.stat(filepath).st_size == 0: | ||
self.logger.warning("Empty file {}".format(filepath)) | ||
continue | ||
if not self._match_pattern(filepath, encrypted=encrypted): | ||
if not self._match_pattern( | ||
filepath, id_patterns=self.id_patterns, encrypted=encrypted | ||
): | ||
self.logger.warning( | ||
"Filename {} does not match dbgap access control filename pattern;" | ||
" this could mean that the filename has an invalid format, or has" | ||
|
@@ -527,9 +542,13 @@ def _parse_csv(self, file_dict, sess, dbgap_config={}, encrypted=True): | |
continue | ||
|
||
phsid_privileges = {} | ||
phsid = row.get("phsid", "").split(".") | ||
if dbgap_config.get("allow_non_dbGaP_whitelist", False): | ||
phsid = row.get("phsid", row.get("project_id", "")).split(".") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should this be configurable? Or I guess we can just require that people either use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the way usersync is written right now makes ait a little difficult for us to have a per sftp server for picking There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah that's fine |
||
else: | ||
phsid = row.get("phsid", "").split(".") | ||
|
||
dbgap_project = phsid[0] | ||
if len(phsid) > 1 and parse_consent_code: | ||
if len(phsid) > 1 and self.parse_consent_code: | ||
consent_code = phsid[-1] | ||
|
||
# c999 indicates full access to all consents and access | ||
|
@@ -1192,6 +1211,7 @@ def _process_dbgap_files(self, dbgap_config, sess): | |
dbgap_file_list = [] | ||
hostname = dbgap_config["info"]["host"] | ||
username = dbgap_config["info"]["username"] | ||
encrypted = dbgap_config["info"].get("encrypted", True) | ||
folderdir = os.path.join(str(self.folder), str(hostname), str(username)) | ||
|
||
try: | ||
|
@@ -1200,13 +1220,17 @@ def _process_dbgap_files(self, dbgap_config, sess): | |
os.path.join(folderdir, "*") | ||
) # get lists of file from folder | ||
else: | ||
self.logger.info("Downloading files from: {}".format(hostname)) | ||
dbgap_file_list = self._download(dbgap_config) | ||
except Exception as e: | ||
self.logger.error(e) | ||
exit(1) | ||
self.logger.info("dbgap files: {}".format(dbgap_file_list)) | ||
user_projects, user_info = self._get_user_permissions_from_csv_list( | ||
dbgap_file_list, encrypted=True, session=sess, dbgap_config=dbgap_config | ||
dbgap_file_list, | ||
encrypted=encrypted, | ||
session=sess, | ||
dbgap_config=dbgap_config, | ||
) | ||
|
||
user_projects = self.parse_projects(user_projects) | ||
|
@@ -1236,6 +1260,34 @@ def _get_user_permissions_from_csv_list( | |
) | ||
return user_projects, user_info | ||
|
||
def _merge_multiple_local_csv_files( | ||
self, dbgap_file_list, encrypted, dbgap_configs, session | ||
): | ||
""" | ||
Args: | ||
dbgap_file_list (list): a list of whitelist file locations stored locally | ||
encrypted (bool): whether the file is encrypted (comes from fence config) | ||
dbgap_configs (list): list of dictionaries containing information about the dbgap server (comes from fence config) | ||
session (sqlalchemy.Session): database session | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe a quick docstring here? |
||
Return: | ||
merged_user_projects (dict) | ||
merged_user_info (dict) | ||
""" | ||
merged_user_projects = {} | ||
merged_user_info = {} | ||
|
||
for dbgap_config in dbgap_configs: | ||
user_projects, user_info = self._get_user_permissions_from_csv_list( | ||
dbgap_file_list, | ||
encrypted, | ||
session=session, | ||
dbgap_config=dbgap_config, | ||
) | ||
self.sync_two_user_info_dict(user_info, merged_user_info) | ||
self.sync_two_phsids_dict(user_projects, merged_user_projects) | ||
return merged_user_projects, merged_user_info | ||
|
||
def _merge_multiple_dbgap_sftp(self, dbgap_servers, sess): | ||
""" | ||
Args: | ||
|
@@ -1417,13 +1469,11 @@ def _sync(self, sess): | |
os.path.join(self.sync_from_local_csv_dir, "*") | ||
) | ||
|
||
# if syncing from local csv dir dbgap configurations | ||
# come from the first dbgap instance in the fence config file | ||
user_projects_csv, user_info_csv = self._get_user_permissions_from_csv_list( | ||
user_projects_csv, user_info_csv = self._merge_multiple_local_csv_files( | ||
local_csv_file_list, | ||
encrypted=False, | ||
session=sess, | ||
dbgap_config=self.dbGaP[0], | ||
dbgap_configs=self.dbGaP, | ||
) | ||
|
||
try: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
user name, login, project_id | ||
USER D,TESTUSERD,PROJECT-12345 | ||
USER B,TESTUSERB,PROJECT-12345 | ||
USER C,USERC,PROJECT-12345 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is this intended to return early on the first matched pattern? Or should this only return True if ALL patterns match that are passed in?