Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

(PXP:9493):(feat)Non-dbgap whitelist in usersync #1020

Merged
merged 23 commits into from
Jul 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@ ENV appname=fence
RUN pip install --upgrade pip
RUN pip install --upgrade poetry
RUN apt-get update \
&& apt-get install -y --no-install-recommends curl bash git vim \
&& apt-get install -y --no-install-recommends curl bash git \
&& apt-get -y install vim \
libmcrypt4 libmhash2 mcrypt \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/
&& rm -rf /var/lib/apt/lists/

RUN mkdir -p /var/www/$appname \
&& mkdir -p /var/www/.cache/Python-Eggs/ \
Expand Down
82 changes: 66 additions & 16 deletions fence/sync/sync_users.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,9 +356,10 @@ def __init__(
self.storage_manager = StorageManager(
storage_credentials, logger=self.logger
)
self.id_patterns = []

@staticmethod
def _match_pattern(filepath, encrypted=True):
def _match_pattern(filepath, id_patterns, encrypted=True):
"""
Check if the filename matches dbgap access control file pattern

Expand All @@ -369,11 +370,18 @@ def _match_pattern(filepath, encrypted=True):
Returns:
bool: whether the pattern matches
"""
pattern = r"authentication_file_phs(\d{6}).(csv|txt)"
if encrypted:
pattern += ".enc"
pattern += "$"
return re.match(pattern, os.path.basename(filepath))
id_patterns.append("authentication_file_phs(\d{6}).(csv|txt)")
for pattern in id_patterns:
pattern = r"{}".format(pattern)
if encrypted:
pattern += ".enc"
pattern += "$"
pattern = pattern.encode().decode(
"unicode_escape"
) # when converting the YAML from fence-config, python reads it as Python string literal. So "\" turns into "\\" which messes with the regex match
if re.match(pattern, os.path.basename(filepath)):
return True
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this intended to return early on the first matched pattern? Or should this only return True if ALL patterns match that are passed in?

return False

def _get_from_sftp_with_proxy(self, server, path):
"""
Expand Down Expand Up @@ -489,15 +497,20 @@ def _parse_csv(self, file_dict, sess, dbgap_config={}, encrypted=True):

# parse dbGaP sftp server information
dbgap_key = dbgap_config.get("decrypt_key", None)
parse_consent_code = dbgap_config.get("parse_consent_code", True)

self.id_patterns += (
dbgap_config.get("allowed_whitelist_patterns", [])
if dbgap_config.get("allow_non_dbGaP_whitelist", False)
else []
)
enable_common_exchange_area_access = dbgap_config.get(
"enable_common_exchange_area_access", False
)
study_common_exchange_areas = dbgap_config.get(
"study_common_exchange_areas", {}
)

if parse_consent_code and enable_common_exchange_area_access:
if self.parse_consent_code and enable_common_exchange_area_access:
self.logger.info(
f"using study to common exchange area mapping: {study_common_exchange_areas}"
)
Expand All @@ -506,7 +519,9 @@ def _parse_csv(self, file_dict, sess, dbgap_config={}, encrypted=True):
if os.stat(filepath).st_size == 0:
self.logger.warning("Empty file {}".format(filepath))
continue
if not self._match_pattern(filepath, encrypted=encrypted):
if not self._match_pattern(
filepath, id_patterns=self.id_patterns, encrypted=encrypted
):
self.logger.warning(
"Filename {} does not match dbgap access control filename pattern;"
" this could mean that the filename has an invalid format, or has"
Expand All @@ -527,9 +542,13 @@ def _parse_csv(self, file_dict, sess, dbgap_config={}, encrypted=True):
continue

phsid_privileges = {}
phsid = row.get("phsid", "").split(".")
if dbgap_config.get("allow_non_dbGaP_whitelist", False):
phsid = row.get("phsid", row.get("project_id", "")).split(".")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should this be configurable? Or I guess we can just require that people either use phsid or project_id, but this feels like something that could be a cfg

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the way usersync is written right now makes ait a little difficult for us to have a per sftp server for picking phsid and project_id. We could include it in the next design for usersync that i want to do for fix-it-Friday though?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah that's fine

else:
phsid = row.get("phsid", "").split(".")

dbgap_project = phsid[0]
if len(phsid) > 1 and parse_consent_code:
if len(phsid) > 1 and self.parse_consent_code:
consent_code = phsid[-1]

# c999 indicates full access to all consents and access
Expand Down Expand Up @@ -1192,6 +1211,7 @@ def _process_dbgap_files(self, dbgap_config, sess):
dbgap_file_list = []
hostname = dbgap_config["info"]["host"]
username = dbgap_config["info"]["username"]
encrypted = dbgap_config["info"].get("encrypted", True)
folderdir = os.path.join(str(self.folder), str(hostname), str(username))

try:
Expand All @@ -1200,13 +1220,17 @@ def _process_dbgap_files(self, dbgap_config, sess):
os.path.join(folderdir, "*")
) # get lists of file from folder
else:
self.logger.info("Downloading files from: {}".format(hostname))
dbgap_file_list = self._download(dbgap_config)
except Exception as e:
self.logger.error(e)
exit(1)
self.logger.info("dbgap files: {}".format(dbgap_file_list))
user_projects, user_info = self._get_user_permissions_from_csv_list(
dbgap_file_list, encrypted=True, session=sess, dbgap_config=dbgap_config
dbgap_file_list,
encrypted=encrypted,
session=sess,
dbgap_config=dbgap_config,
)

user_projects = self.parse_projects(user_projects)
Expand Down Expand Up @@ -1236,6 +1260,34 @@ def _get_user_permissions_from_csv_list(
)
return user_projects, user_info

def _merge_multiple_local_csv_files(
self, dbgap_file_list, encrypted, dbgap_configs, session
):
"""
Args:
dbgap_file_list (list): a list of whitelist file locations stored locally
encrypted (bool): whether the file is encrypted (comes from fence config)
dbgap_configs (list): list of dictionaries containing information about the dbgap server (comes from fence config)
session (sqlalchemy.Session): database session

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe a quick docstring here?

Return:
merged_user_projects (dict)
merged_user_info (dict)
"""
merged_user_projects = {}
merged_user_info = {}

for dbgap_config in dbgap_configs:
user_projects, user_info = self._get_user_permissions_from_csv_list(
dbgap_file_list,
encrypted,
session=session,
dbgap_config=dbgap_config,
)
self.sync_two_user_info_dict(user_info, merged_user_info)
self.sync_two_phsids_dict(user_projects, merged_user_projects)
return merged_user_projects, merged_user_info

def _merge_multiple_dbgap_sftp(self, dbgap_servers, sess):
"""
Args:
Expand Down Expand Up @@ -1417,13 +1469,11 @@ def _sync(self, sess):
os.path.join(self.sync_from_local_csv_dir, "*")
)

# if syncing from local csv dir dbgap configurations
# come from the first dbgap instance in the fence config file
user_projects_csv, user_info_csv = self._get_user_permissions_from_csv_list(
user_projects_csv, user_info_csv = self._merge_multiple_local_csv_files(
local_csv_file_list,
encrypted=False,
session=sess,
dbgap_config=self.dbGaP[0],
dbgap_configs=self.dbGaP,
)

try:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
user name, login, project_id
USER D,TESTUSERD,PROJECT-12345
USER B,TESTUSERB,PROJECT-12345
USER C,USERC,PROJECT-12345
Loading