Skip to content

Commit

Permalink
feat(passport): add db and memory cache and logic to use it under the…
Browse files Browse the repository at this point in the history
… right circumstances - also add tests
  • Loading branch information
Avantol13-machine-user committed Jan 24, 2022
1 parent 8d6e69b commit 0c32f71
Show file tree
Hide file tree
Showing 7 changed files with 1,043 additions and 39 deletions.
14 changes: 3 additions & 11 deletions .secrets.baseline
Original file line number Diff line number Diff line change
Expand Up @@ -195,27 +195,19 @@
}
],
"tests/conftest.py": [
{
"type": "Secret Keyword",
"filename": "tests/conftest.py",
"hashed_secret": "9801ff058ba790388c9efc095cb3e89a819d5ed6",
"is_verified": false,
"line_number": 164,
"is_secret": false
},
{
"type": "Private Key",
"filename": "tests/conftest.py",
"hashed_secret": "1348b145fa1a555461c1b790a2f66614781091e9",
"is_verified": false,
"line_number": 1472
"line_number": 1482
},
{
"type": "Base64 High Entropy String",
"filename": "tests/conftest.py",
"hashed_secret": "227dea087477346785aefd575f91dd13ab86c108",
"is_verified": false,
"line_number": 1495
"line_number": 1505
}
],
"tests/credentials/google/test_credentials.py": [
Expand Down Expand Up @@ -284,5 +276,5 @@
}
]
},
"generated_at": "2021-12-20T23:13:45Z"
"generated_at": "2022-01-24T16:20:56Z"
}
1 change: 0 additions & 1 deletion fence/blueprints/login/ras.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@ def post_login(self, user=None, token_result=None, id_from_idp=None):
db_session=current_session,
)
user_ids_from_passports = list(users_from_passports.keys())
logger.debug(f"user_ids_from_passports: {user_ids_from_passports}")

# TODO?
# put_gen3_usernames_for_passport_into_cache(
Expand Down
12 changes: 10 additions & 2 deletions fence/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

from authlib.flask.oauth2.sqla import OAuth2AuthorizationCodeMixin, OAuth2ClientMixin
import bcrypt
import flask
from sqlalchemy import (
Integer,
BigInteger,
Expand All @@ -25,7 +24,7 @@
text,
event,
)
from sqlalchemy.dialects.postgresql import ARRAY, JSONB
from sqlalchemy.dialects.postgresql import ARRAY, JSONB, UUID
from sqlalchemy.orm import relationship, backref
from sqlalchemy.sql import func
from sqlalchemy import exc as sa_exc
Expand Down Expand Up @@ -617,6 +616,15 @@ class AssumeRoleCacheGCP(Base):
gcp_key_db_entry = Column(String())


class GA4GHPassportCache(Base):
    """
    Cache of previously validated GA4GH passports mapped to the Gen3 users
    they refer to, so repeat requests can skip full passport validation.
    """

    __tablename__ = "ga4gh_passport_cache"

    # md5 hash of the raw passport JWT stored as a UUID (both are 128 bits);
    # collisions are detected on read by comparing the full `passport` column
    passport_hash = Column(UUID(as_uuid=True), primary_key=True)
    # the full encoded passport JWT
    passport = Column(Text, nullable=False)
    # unix time after which this cache entry is stale
    expires_at = Column(BigInteger, nullable=False)
    # user identifiers referred to by the passport
    # NOTE(review): callers in fence/resources/ga4gh/passports.py appear to
    # store usernames here despite the column name — verify before relying on it
    user_ids = Column(ARRAY(String(255)), nullable=False)


class GA4GHVisaV1(Base):

__tablename__ = "ga4gh_visa_v1"
Expand Down
187 changes: 163 additions & 24 deletions fence/resources/ga4gh/passports.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import flask
import os
import collections
import hashlib
import time
import datetime
import uuid
import jwt

# the whole fence_create module is imported to avoid issue with circular imports
Expand All @@ -20,12 +22,19 @@
query_for_user,
query_for_user_by_id,
GA4GHVisaV1,
GA4GHPassportCache,
IdentityProvider,
IssSubPairToUser,
)

logger = get_logger(__name__)

# cache will be in following format
# passport: ([user_id_0, user_id_1, ...], expires_at)
#
# NOTE: we'll want to watch the memory usage on this since passports can be pretty large
PASSPORT_CACHE = {}


def sync_gen3_users_authz_from_ga4gh_passports(
passports,
Expand All @@ -50,21 +59,32 @@ def sync_gen3_users_authz_from_ga4gh_passports(
embedded within the passports passed in
"""
db_session = db_session or current_session
logger.info("Getting gen3 users from passports")

# {"username": user, "username2": user2}
users_from_all_passports = {}
for passport in passports:
try:
cached_users = get_gen3_usernames_for_passport_from_cache(passport)
if cached_users:
# TODO get user from id - perhaps we can avoid this?
for user_id in cached_users:
user = query_for_user_by_id(session=db_session, user_id=user_id)
users_from_all_passports[user.username] = user
# existence in the cache means that this passport was validated
# previously (expiration was also checked)
continue
cached_usernames = get_gen3_usernames_for_passport_from_cache(
passport=passport, db_session=db_session
)
if cached_usernames:
# there's a chance a given username exists in the cache but no longer in
# the database. if not all are in db, ignore the cache and actually parse
# and validate the passport
all_users_exist_in_db = True
usernames_to_update = {}
for username in cached_usernames:
user = query_for_user(session=db_session, username=username)
if not user:
all_users_exist_in_db = False
continue
usernames_to_update[user.username] = user

if all_users_exist_in_db:
users_from_all_passports.update(usernames_to_update)
# existence in the cache and a user in db means that this passport
# was validated previously (expiration was also checked)
continue

# below function also validates passport (or raises exception)
raw_visas = get_unvalidated_visas_from_valid_passport(
Expand Down Expand Up @@ -139,29 +159,24 @@ def sync_gen3_users_authz_from_ga4gh_passports(
)
users_from_current_passport.append(gen3_user)

for user in users_from_current_passport:
users_from_all_passports[user.username] = user

put_gen3_usernames_for_passport_into_cache(
passport,
[user.id for user in users_from_current_passport],
passport=passport,
user_ids_from_passports=list(users_from_all_passports.keys()),
expires_at=min_visa_expiration,
db_session=db_session,
)
for user in users_from_current_passport:
users_from_all_passports[user.username] = user

db_session.commit()

logger.info(
f"Got Gen3 usernames from passport(s): {list(users_from_all_passports.keys())}"
)
return users_from_all_passports


def get_gen3_usernames_for_passport_from_cache(passport, db_session=None):
return


def put_gen3_usernames_for_passport_into_cache(
passport, user_ids_from_passports, expires_at, db_session=None
):
return


def get_unvalidated_visas_from_valid_passport(passport, pkey_cache=None):
"""
Return encoded visas after extracting and validating encoded passport
Expand Down Expand Up @@ -401,6 +416,130 @@ def _sync_validated_visa_authorization(
db_session.add(visa)


def get_gen3_usernames_for_passport_from_cache(passport, db_session=None):
    """
    Attempt to retrieve a cached list of usernames for a previously validated and
    non-expired passport.

    Checks the local in-memory cache first, then falls back to the database
    cache (populating the in-memory cache on a database hit). Expired entries
    are evicted from whichever cache they are found in.

    Args:
        passport (str): ga4gh encoded passport JWT
        db_session (None, sqlalchemy session): optional database session to use

    Returns:
        list[str]: usernames for users referred to by the previously validated
            and non-expired passport, or None on a cache miss
    """
    db_session = db_session or current_session
    user_ids_from_passports = None
    current_time = int(time.time())

    # try to retrieve from local in-memory cache
    if passport in PASSPORT_CACHE:
        user_ids_from_passports, expires = PASSPORT_CACHE[passport]
        if expires > current_time:
            logger.debug(
                f"Got users {user_ids_from_passports} for provided passport from in-memory cache. "
                f"Expires: {expires}, Current Time: {current_time}"
            )
            return user_ids_from_passports
        # expired, so remove it and clear the stale value so a database-cache
        # miss below cannot accidentally return the expired list
        del PASSPORT_CACHE[passport]
        user_ids_from_passports = None

    # try to retrieve from database cache

    # get an md5 hash of passport (which is 128 bits) and convert to UUID (which is 128 bits)
    # for optimal usage of database's underlying UUID column type
    passport_hash_as_uuid = uuid.UUID(hashlib.md5(passport.encode("utf-8")).hexdigest())
    cached_passport = (
        db_session.query(GA4GHPassportCache)
        .filter(GA4GHPassportCache.passport_hash == passport_hash_as_uuid)
        .first()
    )
    # we retrieved based on hash, which has a small chance of collision. Mitigate that by
    # now verifying that the full passport in the db matches what was provided
    if cached_passport and cached_passport.passport == passport:
        if cached_passport.expires_at > current_time:
            user_ids_from_passports = cached_passport.user_ids

            # update local cache
            PASSPORT_CACHE[passport] = (
                user_ids_from_passports,
                cached_passport.expires_at,
            )

            logger.debug(
                f"Got users {user_ids_from_passports} for provided passport from "
                f"database cache and placed in in-memory cache. "
                f"Expires: {cached_passport.expires_at}, Current Time: {current_time}"
            )
            return user_ids_from_passports
        else:
            # expired, so remove the row. NOTE: `Session.remove()` does not
            # exist on a plain sqlalchemy Session (it is a scoped_session
            # method that discards the whole session) — deleting the stale
            # row requires `Session.delete()`
            db_session.delete(cached_passport)
            db_session.commit()

    return user_ids_from_passports


def put_gen3_usernames_for_passport_into_cache(
    passport, user_ids_from_passports, expires_at, db_session=None
):
    """
    Cache a validated and non-expired passport and map it to the user_ids
    referenced by its content. Writes to both the in-memory cache and the
    database cache.

    Args:
        passport (str): ga4gh encoded passport JWT
        db_session (None, sqlalchemy session): optional database session to use
        user_ids_from_passports (list[str]): list of user identifiers referred to by
            the previously validated and non-expired passport
        expires_at (int): expiration time in unix time
    """
    db_session = db_session or current_session
    # stores back to in-memory cache first, then the database
    PASSPORT_CACHE[passport] = user_ids_from_passports, expires_at

    # get an md5 hash of passport (which is 128 bits) and convert to UUID (which is 128 bits)
    # for optimal usage of database's underlying UUID column type
    passport_hash_as_uuid = uuid.UUID(hashlib.md5(passport.encode("utf-8")).hexdigest())

    # the improbable collision of hash on 2 different passports will result in an overwrite
    # of the previous passport information; the discrepancy is detected on retrieval (the
    # full stored passport is compared against the provided one) and treated as a cache
    # miss. e.g. this collision will NOT get caught here but instead on the "GET" from
    # cache functionality.
    # raw SQL upsert: ON CONFLICT atomically replaces an existing row for the same
    # passport_hash, so concurrent writers cannot raise a duplicate-key error
    db_session.execute(
        """\
        INSERT INTO ga4gh_passport_cache (
            passport_hash,
            passport,
            expires_at,
            user_ids
        ) VALUES (
            :passport_hash,
            :passport,
            :expires_at,
            :user_ids
        ) ON CONFLICT (passport_hash) DO UPDATE SET
            passport = EXCLUDED.passport,
            expires_at = EXCLUDED.expires_at,
            user_ids = EXCLUDED.user_ids;""",
        dict(
            passport_hash=passport_hash_as_uuid,
            passport=passport,
            expires_at=expires_at,
            user_ids=user_ids_from_passports,
        ),
    )

    logger.debug(
        f"Cached users {user_ids_from_passports} for provided passport in "
        f"database cache and placed in in-memory cache. "
        f"Expires: {expires_at}"
    )


# TODO to be called after login
def map_gen3_iss_sub_pair_to_user(gen3_issuer, gen3_subject_id, gen3_user):
    """
    Placeholder: map a GA4GH (issuer, subject) pair to a Gen3 user.

    Not yet implemented; per the TODO above, intended to be called after login.

    Args:
        gen3_issuer (str): token issuer
        gen3_subject_id (str): subject identifier from the issuer
        gen3_user: Gen3 user to associate with the (issuer, subject) pair

    Returns:
        None
    """
    pass
6 changes: 6 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -565,13 +565,17 @@ def db_session(db, patch_app_db_session):

patch_app_db_session(session)

session.query(models.GA4GHPassportCache).delete()
session.commit()

yield session

# clear out user and project tables upon function close in case unit test didn't
session.query(models.User).delete()
session.query(models.IssSubPairToUser).delete()
session.query(models.Project).delete()
session.query(models.GA4GHVisaV1).delete()
session.query(models.GA4GHPassportCache).delete()
session.commit()

session.close()
Expand Down Expand Up @@ -1247,6 +1251,8 @@ def do_patch(session):
"fence.user",
"fence.blueprints.login.synapse",
"fence.blueprints.login.ras",
"fence.blueprints.data.indexd",
"fence.resources.ga4gh.passports",
]
for module in modules_to_patch:
monkeypatch.setattr("{}.current_session".format(module), session)
Expand Down
2 changes: 1 addition & 1 deletion tests/test-fence-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -635,7 +635,7 @@ GA4GH_VISA_V1_CLAIM_REQUIRED_FIELDS:
- "https://stsstg.nih.gov/passport/dbgap/v1.1"
source:
- "https://ncbi.nlm.nih.gov/gap"
EXPIRED_AUTHZ_REMOVAL_JOB_FREQ_IN_SECONDS: 300
EXPIRED_AUTHZ_REMOVAL_JOB_FREQ_IN_SECONDS: 1
# Global sync visas during login
# None(Default): Allow per client i.e. a fence client can pick whether or not to sync their visas during login with parse_visas param in /authorization endpoint
# True: Parse for all clients i.e. a fence client will always sync their visas during login
Expand Down
Loading

0 comments on commit 0c32f71

Please sign in to comment.