Skip to content

Commit

Permalink
feat(hashing): use sha256 and don't store plain text passport
Browse files Browse the repository at this point in the history
  • Loading branch information
Avantol13-machine-user committed Feb 9, 2022
1 parent 5141b8f commit 02a157c
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 62 deletions.
10 changes: 5 additions & 5 deletions fence/config-default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -897,12 +897,12 @@ GA4GH_VISA_ISSUER_ALLOWLIST:
- 'https://stsstg.nih.gov'
GA4GH_VISA_V1_CLAIM_REQUIRED_FIELDS:
type:
- "https://ras.nih.gov/visas/v1.1"
- 'https://ras.nih.gov/visas/v1.1'
value:
- "https://sts.nih.gov/passport/dbgap/v1.1"
- "https://stsstg.nih.gov/passport/dbgap/v1.1"
- 'https://sts.nih.gov/passport/dbgap/v1.1'
- 'https://stsstg.nih.gov/passport/dbgap/v1.1'
source:
- "https://ncbi.nlm.nih.gov/gap"
- 'https://ncbi.nlm.nih.gov/gap'
EXPIRED_AUTHZ_REMOVAL_JOB_FREQ_IN_SECONDS: 300
# Global sync visas during login
# None(Default): Allow per client i.e. a fence client can pick whether or not to sync their visas during login with parse_visas param in /authorization endpoint
Expand All @@ -912,5 +912,5 @@ GLOBAL_PARSE_VISAS_ON_LOGIN:
# Settings for usersync with visas
USERSYNC:
visa_types:
ras: ["https://ras.nih.gov/visas/v1", "https://ras.nih.gov/visas/v1.1"]
ras: ['https://ras.nih.gov/visas/v1', 'https://ras.nih.gov/visas/v1.1']
RAS_USERINFO_ENDPOINT: '/openid/connect/v1.1/userinfo'
5 changes: 2 additions & 3 deletions fence/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
text,
event,
)
from sqlalchemy.dialects.postgresql import ARRAY, JSONB, UUID
from sqlalchemy.dialects.postgresql import ARRAY, JSONB
from sqlalchemy.orm import relationship, backref
from sqlalchemy.sql import func
from sqlalchemy import exc as sa_exc
Expand Down Expand Up @@ -619,8 +619,7 @@ class AssumeRoleCacheGCP(Base):
class GA4GHPassportCache(Base):
__tablename__ = "ga4gh_passport_cache"

passport_hash = Column(UUID(as_uuid=True), primary_key=True)
passport = Column(Text, nullable=False)
passport_hash = Column(String(64), primary_key=True)
expires_at = Column(BigInteger, nullable=False)
user_ids = Column(ARRAY(String(255)), nullable=False)

Expand Down
45 changes: 14 additions & 31 deletions fence/resources/ga4gh/passports.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import hashlib
import time
import datetime
import uuid
import jwt

# the whole fence_create module is imported to avoid issue with circular imports
Expand All @@ -30,9 +29,7 @@
logger = get_logger(__name__)

# cache will be in following format
# passport: ([user_id_0, user_id_1, ...], expires_at)
#
# NOTE: we'll want to watch the memory usage on this since passports can be pretty large
# passport_hash: ([user_id_0, user_id_1, ...], expires_at)
PASSPORT_CACHE = {}


Expand Down Expand Up @@ -433,10 +430,11 @@ def get_gen3_usernames_for_passport_from_cache(passport, db_session=None):
user_ids_from_passports = None
current_time = int(time.time())

# try to retrieve from local in-memory cache
passport_hash = hashlib.sha256(passport.encode("utf-8")).hexdigest()

if passport in PASSPORT_CACHE:
user_ids_from_passports, expires = PASSPORT_CACHE[passport]
# try to retrieve from local in-memory cache
if passport_hash in PASSPORT_CACHE:
user_ids_from_passports, expires = PASSPORT_CACHE[passport_hash]
if expires > current_time:
logger.debug(
f"Got users {user_ids_from_passports} for provided passport from in-memory cache. "
Expand All @@ -445,26 +443,20 @@ def get_gen3_usernames_for_passport_from_cache(passport, db_session=None):
return user_ids_from_passports
else:
# expired, so remove it
del PASSPORT_CACHE[passport]
del PASSPORT_CACHE[passport_hash]

# try to retrieve from database cache

# get an md5 hash of passport (which is 128 bits) and convert to UUID (which is 128 bits)
# for optimal usage of database's underlying UUID column type
passport_hash_as_uuid = uuid.UUID(hashlib.md5(passport.encode("utf-8")).hexdigest())
cached_passport = (
db_session.query(GA4GHPassportCache)
.filter(GA4GHPassportCache.passport_hash == passport_hash_as_uuid)
.filter(GA4GHPassportCache.passport_hash == passport_hash)
.first()
)
# we retrieved based on hash, which has a small chance of collision. Mitigate that by
# now verifying that the full passport in the db matches what was provided
if cached_passport and cached_passport.passport == passport:
if cached_passport:
if cached_passport.expires_at > current_time:
user_ids_from_passports = cached_passport.user_ids

# update local cache
PASSPORT_CACHE[passport] = (
PASSPORT_CACHE[passport_hash] = (
user_ids_from_passports,
cached_passport.expires_at,
)
Expand Down Expand Up @@ -498,36 +490,27 @@ def put_gen3_usernames_for_passport_into_cache(
expires_at (int): expiration time in unix time
"""
db_session = db_session or current_session
# stores back to cache and db
PASSPORT_CACHE[passport] = user_ids_from_passports, expires_at

# get an md5 hash of passport (which is 128 bits) and convert to UUID (which is 128 bits)
# for optimal usage of database's underlying UUID column type
passport_hash_as_uuid = uuid.UUID(hashlib.md5(passport.encode("utf-8")).hexdigest())
passport_hash = hashlib.sha256(passport.encode("utf-8")).hexdigest()

# stores back to cache and db
PASSPORT_CACHE[passport_hash] = user_ids_from_passports, expires_at

# the improbable collision of hash on 2 different passports will result in an overwrite
# of the previous passport information and the discrepancy will raise an error on
# retrieval (after a comparison of the full stored passport vs provided). e.g. this
# collision will NOT get caught here but instead on the "GET" from cache functionality
db_session.execute(
"""\
INSERT INTO ga4gh_passport_cache (
passport_hash,
passport,
expires_at,
user_ids
) VALUES (
:passport_hash,
:passport,
:expires_at,
:user_ids
) ON CONFLICT (passport_hash) DO UPDATE SET
passport = EXCLUDED.passport,
expires_at = EXCLUDED.expires_at,
user_ids = EXCLUDED.user_ids;""",
dict(
passport_hash=passport_hash_as_uuid,
passport=passport,
passport_hash=passport_hash,
expires_at=expires_at,
user_ids=user_ids_from_passports,
),
Expand Down
55 changes: 32 additions & 23 deletions tests/test_drs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import flask
import httpx
import hashlib
import json
import jwt
import pytest
Expand Down Expand Up @@ -1004,14 +1005,16 @@ def test_passport_cache_valid_passport(
keys = [keypair.public_key_to_jwk() for keypair in flask.current_app.keypairs]
mock_httpx_get.return_value = httpx.Response(200, json={"keys": keys})

passport_hash = hashlib.sha256(encoded_passport.encode("utf-8")).hexdigest()

# check database cache
cached_passports = [
item.passport for item in db_session.query(GA4GHPassportCache).all()
item.passport_hash for item in db_session.query(GA4GHPassportCache).all()
]
assert encoded_passport not in cached_passports
assert passport_hash not in cached_passports

# check in-memory cache
assert not PASSPORT_CACHE.get(encoded_passport)
assert not PASSPORT_CACHE.get(passport_hash)

before_cache_start = time.time()
res = client.post(
Expand All @@ -1027,12 +1030,12 @@ def test_passport_cache_valid_passport(

# check that database cache populated
cached_passports = [
item.passport for item in db_session.query(GA4GHPassportCache).all()
item.passport_hash for item in db_session.query(GA4GHPassportCache).all()
]
assert encoded_passport in cached_passports
assert passport_hash in cached_passports

# check that in-memory cache populated
assert PASSPORT_CACHE.get(encoded_passport)
assert PASSPORT_CACHE.get(passport_hash)

after_cache_start = time.time()
res = client.post(
Expand Down Expand Up @@ -1215,14 +1218,16 @@ def test_passport_cache_invalid_passport(
keys = [keypair.public_key_to_jwk() for keypair in flask.current_app.keypairs]
mock_httpx_get.return_value = httpx.Response(200, json={"keys": keys})

passport_hash = hashlib.sha256(invalid_encoded_passport.encode("utf-8")).hexdigest()

# check database cache
cached_passports = [
item.passport for item in db_session.query(GA4GHPassportCache).all()
item.passport_hash for item in db_session.query(GA4GHPassportCache).all()
]
assert invalid_encoded_passport not in cached_passports
assert passport_hash not in cached_passports

# check in-memory cache
assert not PASSPORT_CACHE.get(invalid_encoded_passport)
assert not PASSPORT_CACHE.get(passport_hash)

res = client.post(
"/ga4gh/drs/v1/objects/" + test_guid + "/access/" + access_id,
Expand All @@ -1235,12 +1240,12 @@ def test_passport_cache_invalid_passport(

# check that database cache NOT populated
cached_passports = [
item.passport for item in db_session.query(GA4GHPassportCache).all()
item.passport_hash for item in db_session.query(GA4GHPassportCache).all()
]
assert invalid_encoded_passport not in cached_passports
assert passport_hash not in cached_passports

# check that in-memory cache NOT populated
assert not PASSPORT_CACHE.get(invalid_encoded_passport)
assert not PASSPORT_CACHE.get(passport_hash)

res = client.post(
"/ga4gh/drs/v1/objects/" + test_guid + "/access/" + access_id,
Expand Down Expand Up @@ -1430,6 +1435,8 @@ def test_passport_cache_expired_in_memory_valid_in_db(
keys = [keypair.public_key_to_jwk() for keypair in flask.current_app.keypairs]
mock_httpx_get.return_value = httpx.Response(200, json={"keys": keys})

passport_hash = hashlib.sha256(encoded_passport.encode("utf-8")).hexdigest()

# simulate db cache with a valid passport by first calling the endpoint to cache
# res = client.post(
# "/ga4gh/drs/v1/objects/" + test_guid + "/access/" + access_id,
Expand All @@ -1446,7 +1453,7 @@ def test_passport_cache_expired_in_memory_valid_in_db(
# double-check database cache
cached_passport = (
db_session.query(GA4GHPassportCache)
.filter(GA4GHPassportCache.passport == encoded_passport)
.filter(GA4GHPassportCache.passport_hash == passport_hash)
.first()
)
# greater and NOT == b/c of logic to set internal expiration less than real to allow
Expand All @@ -1456,8 +1463,8 @@ def test_passport_cache_expired_in_memory_valid_in_db(
# simulate in-memory cache with an expired passport by overriding the in-memory cache
from fence.resources.ga4gh import passports as passports_module

PASSPORT_CACHE = {f"{encoded_passport}": ([test_username], current_time - 1)}
assert PASSPORT_CACHE.get(encoded_passport, ("", 0))[1] == current_time - 1
PASSPORT_CACHE = {f"{passport_hash}": ([test_username], current_time - 1)}
assert PASSPORT_CACHE.get(passport_hash, ("", 0))[1] == current_time - 1
monkeypatch.setattr(passports_module, "PASSPORT_CACHE", PASSPORT_CACHE)

res = client.post(
Expand All @@ -1468,15 +1475,15 @@ def test_passport_cache_expired_in_memory_valid_in_db(
data=json.dumps(data),
)
assert res.status_code == 200
# patch_method.stop()

# check that database cache still populated
assert (
len([item.passport for item in db_session.query(GA4GHPassportCache).all()]) == 1
len([item.passport_hash for item in db_session.query(GA4GHPassportCache).all()])
== 1
)
cached_passport = (
db_session.query(GA4GHPassportCache)
.filter(GA4GHPassportCache.passport == encoded_passport)
.filter(GA4GHPassportCache.passport_hash == passport_hash)
.first()
)
# greater and NOT == b/c of logic to set internal expiration less than real to allow
Expand All @@ -1486,11 +1493,11 @@ def test_passport_cache_expired_in_memory_valid_in_db(
# check that in-memory cache populated with db expiration
# greater and NOT == b/c of logic to set internal expiration less than real to allow
# time for expiration job to run
if PASSPORT_CACHE.get(encoded_passport, ("", 0))[1] == 0:
if PASSPORT_CACHE.get(passport_hash, ("", 0))[1] == 0:
from fence.resources.ga4gh.passports import PASSPORT_CACHE as import_cache

assert PASSPORT_CACHE == None
assert PASSPORT_CACHE.get(encoded_passport, ("", 0))[1] > current_time
assert PASSPORT_CACHE.get(passport_hash, ("", 0))[1] > current_time


@responses.activate
Expand Down Expand Up @@ -1659,14 +1666,16 @@ def test_passport_cache_expired(
keys = [keypair.public_key_to_jwk() for keypair in flask.current_app.keypairs]
mock_httpx_get.return_value = httpx.Response(200, json={"keys": keys})

passport_hash = hashlib.sha256(encoded_passport.encode("utf-8")).hexdigest()

# check database cache
cached_passports = [
item.passport for item in db_session.query(GA4GHPassportCache).all()
item.passport_hash for item in db_session.query(GA4GHPassportCache).all()
]
assert encoded_passport not in cached_passports
assert passport_hash not in cached_passports

# check in-memory cache
assert not PASSPORT_CACHE.get(encoded_passport)
assert not PASSPORT_CACHE.get(passport_hash)

res = client.post(
"/ga4gh/drs/v1/objects/" + test_guid + "/access/" + access_id,
Expand Down

0 comments on commit 02a157c

Please sign in to comment.