# Part 2: API calls to tranSMART


## Preparation: loading libraries and authentication

#### Set up packages and connection

In [None]:
# general packages
import json
import copy
import requests

# python API client
import transmart
from transmart.api.v2.api import Query
from transmart.api.v2.constraints import atomic
# Report which version of the tranSMART python client is installed.
print(f'transmart python client version: {transmart.__version__}')

In [None]:
# ENVIRONMENT CONNECTION SETTINGS

# --- demo environment (active) ---
keycloak_url = 'https://keycloak-dwh-test.thehyve.net'
transmart_url = 'https://transmart.thehyve.net'
realm = 'transmart'
keycloak_client_id = 'transmart-client'

# --- workshop environment (kept for reference, currently disabled) ---
#
# node_id = 18  # the number in the username
# server_id = 3  # 1 for nodes 1-6; 2 for nodes 7-12; 3 for nodes 13-18; 4 for nodes 19-24
# password = 'FILL IN YOUR PASSWORD'
#
# user = f'user{node_id}'
# client_id = f'transmart-node{node_id}'
# gb_url = f'https://gb{node_id}.tuebingen{server_id}.thehyve.net'

# OpenID Connect token endpoint of the Keycloak realm, derived from the
# settings above.
keycloak_token_url = f'{keycloak_url}/auth/realms/{realm}/protocol/openid-connect/token'

In [None]:
# Credentials used to request the offline token from Keycloak.
#
# To avoid typing a personal password into a notebook that may be shared or
# committed, both values can be supplied through the TRANSMART_USER and
# TRANSMART_PASSWORD environment variables. When those are not set, the
# shared account below is used, exactly as before.
import os

# demo user credentials
# user = ...
# password = ...

# workshop environment (default when no environment override is present)
user = os.environ.get('TRANSMART_USER', 'tuebingen2019')
password = os.environ.get('TRANSMART_PASSWORD', 'tuebingen2019')

Retrieve offline token for API access

In [None]:
# Exchange the username/password for an offline (refresh) token at the
# Keycloak token endpoint.
r = requests.post(
    url=keycloak_token_url,
    data=dict(
        grant_type='password',
        client_id=keycloak_client_id,
        scope='offline_access',
        username=user,
        password=password,
    ),
    timeout=30,  # do not hang indefinitely when Keycloak is unreachable
)

# Fail here, with the HTTP status in the message (e.g. 401 for a wrong
# username/password), instead of silently continuing with a missing token.
r.raise_for_status()

offline_token = r.json().get('refresh_token')
if not offline_token:
    raise RuntimeError('Keycloak response did not contain a refresh_token')

# The offline token is a long-lived credential, so it is not echoed in full
# into the notebook output; only confirm that one was received.
print('Offline token received ({} characters)'.format(len(offline_token)))

In [None]:
# Build the API client object that all queries below are sent through,
# authenticated with the offline token of our user.
api = transmart.get_api(
    # URL of tranSMART server connected to your Glowing Bear
    host=transmart_url,
    # URL of Keycloak connected to your Glowing Bear
    kc_url=keycloak_url,
    client_id=keycloak_client_id,
    # Realm in Keycloak for the tranSMART application
    kc_realm=realm,
    offline_token=offline_token,
    # Whether or not to print the API URLs used behind the scenes, to learn the API calls.
    print_urls=True,
    # Prevents pre-loading of studies etc.
    interactive=False,
)

# Troubleshooting - common errors and their usual cause:
# * '401 Client Error: Unauthorized' - Wrong username/password
# * 'HTTPSConnectionPool' - Wrong tranSMART or Keycloak URL or no internet
# * '404 Client Error: Not Found' - Wrong Keycloak realm

## Querying the data

### Explore available studies and tree structure

#### Get list of available studies

In [None]:
# Retrieve the list of studies through the API client.
studies = api.get_studies()
# Last expression of the cell: shows the dataframe view of the result.
studies.dataframe

#### Get counts

Get observation and subject counts for a given study.

In [None]:
# Constraint that limits a query to one study ('SYNTHETICMASS' is presumably
# the study ID as shown in the studies table - confirm against that table).
csr_study_constraint = atomic.StudyConstraint('SYNTHETICMASS')
# Request the counts for the data matching that constraint.
study_counts = api.observations.counts(constraint=csr_study_constraint)
study_counts

Get observation and subject counts for all studies.

In [None]:
# No constraint is passed, so the counts are not restricted to one study.
all_counts = api.observations.counts()
all_counts

#### Get tree

Visualize the tree structure up to X levels deep (change `depth` as needed).

In [None]:
# Request the tree nodes below the Demographics root, with depth=5 and
# counts switched on.
tree = api.tree_nodes(depth=5, counts=True, root='\\Demographics\\')
tree

In [None]:
# Same call for the Conditions / ICD10 sub-tree, with depth=3.
# Note: this rebinds `tree`, replacing the result of the previous cell.
tree = api.tree_nodes(depth=3, counts=True, root='\\Conditions\\ICD10\\')
tree

Visualize the tree structure only for a certain top node.

In [None]:
# Request the tree nodes below one specific top node (depth=3, no counts).
# Every backslash of the path is written as '\\': the single backslashes used
# before ('\P', '\T') are invalid escape sequences that Python only tolerates
# with a warning. The resulting string value is unchanged.
# The cells below build the `concepts` table from this `tree`.
tree = api.tree_nodes(root='\\Public Studies\\Tumor Samples\\', depth=3, counts=False)
tree

### Obtain list of tree nodes and corresponding concept codes

(Assuming tree is already filtered to include paths under a specific root node - as above)

In [None]:
# Keep only the tree nodes that have a concept code.
# (An earlier filter that also required a concept path was computed here but
# immediately overwritten, so it never had any effect and has been dropped.)
ftree = tree.dataframe[tree.dataframe['conceptCode'].notna()]

# display result (sorted by concept path)
concepts = (
    ftree
    .loc[:, ['conceptPath', 'name', 'metadata.subject_dimension', 'conceptCode']]
    .dropna(how='all')  # drop rows where all four selected columns are empty
    .sort_values(by=['conceptPath'])
)
concepts

#### Check available values for a given concept

Check the values for a given concept (e.g. a categorical one).

In [None]:
# Constraint on a single concept, addressed by its concept code.
tumor_type_constraint = atomic.ConceptCodeConstraint('4fc4afd045e321c824da01fae5c679cb34d71cac') # tumor type
# Ask for the per-concept aggregates of the data matching the constraint
# (for a categorical concept this is presumably the value counts - confirm).
aggregates_per_concept = api.observations.aggregates_per_concept(constraint=tumor_type_constraint)
aggregates_per_concept

#### Query with a simple constraint

In [None]:
# Concept codes of the two concepts used in the simple queries below.
gender_concept_code = 'e24277e1af6ded3d1a0b231f32e7723c566d5e67'
tumor_type_concept_code = '4fc4afd045e321c824da01fae5c679cb34d71cac'

# Constraint: gender concept with value 'female'.
gender_constraint = api.new_constraint(
    concept=gender_concept_code,
    value_list=['female'],
)

# Constraint: tumor type concept with value 'Malignant lymphoma, non-Hodgkin'.
malignant_lymphoma_constraint = api.new_constraint(
    concept=tumor_type_concept_code,
    value_list=['Malignant lymphoma, non-Hodgkin'],
)

In [None]:
# treat as patient selection -> i.e. patient counts
# NOTE(review): no subselection is set on the constraints used in this cell,
# whereas the following cell sets subselection = 'patient'. The heading of
# this cell and of the following one may therefore be swapped - confirm.

# Aggregates and counts for the gender constraint
print(api.observations.aggregates_per_concept(constraint=gender_constraint))
print(api.observations.counts(constraint=gender_constraint))

# Aggregates and counts for the tumor type constraint
print(api.observations.aggregates_per_concept(constraint=malignant_lymphoma_constraint))
print(api.observations.counts(constraint=malignant_lymphoma_constraint))

# Show one example row of the observations returned for each constraint
display(api.observations(constraint=gender_constraint).dataframe.head(1))
display(api.observations(constraint=malignant_lymphoma_constraint).dataframe.head(1))

In [None]:
# treat as observation selection -> i.e. observation counts
# NOTE(review): this cell is the one that sets subselection = 'patient', while
# the preceding cell uses the constraints without subselection. The heading of
# this cell and of the preceding one may therefore be swapped - confirm.

# Work on copies so the original constraints stay without subselection.
gender_constraint_sub = copy.copy(gender_constraint)
gender_constraint_sub.subselection = 'patient'

malignant_lymphoma_constraint_sub = copy.copy(malignant_lymphoma_constraint)
malignant_lymphoma_constraint_sub.subselection = 'patient'

# Print the JSON form of the constraint, then the counts it yields.
print(gender_constraint_sub.json())
# print(api.observations.aggregates_per_concept(constraint=gender_constraint_sub))
print(api.observations.counts(constraint=gender_constraint_sub))

print(malignant_lymphoma_constraint_sub.json())
# print(api.observations.aggregates_per_concept(constraint=malignant_lymphoma_constraint_sub))
print(api.observations.counts(constraint=malignant_lymphoma_constraint_sub))

# Show one example row of the observations returned for each constraint
display(api.observations(constraint=gender_constraint_sub).dataframe.head(1))
display(api.observations(constraint=malignant_lymphoma_constraint_sub).dataframe.head(1))

In [None]:
# Look the concept code up by its concept path in the `concepts` table
# instead of hard-coding it.
gender_concept_path = '\\Public Studies\\Tumor Samples\\01. Patient information\\02. Gender\\'
gender_rows = concepts.loc[concepts['conceptPath'] == gender_concept_path]
# Take the first distinct code found for that path.
gender_concept_code = gender_rows['conceptCode'].unique()[0]
gender_concept_code

In [None]:
# Rebuild the 'female' constraint, now using the concept code looked up by path.
gender_constraint = api.new_constraint(concept=gender_concept_code, value_list=['female'])

# Aggregates and counts for this constraint
print(api.observations.aggregates_per_concept(constraint=gender_constraint))
print(api.observations.counts(constraint=gender_constraint))

#### Query with a complex constraint

In [None]:
# List every parameter name that a query constraint accepts, one per line.
for key in api.new_constraint().params:
    print(f"* {key}")

In [None]:
# Create constraint: given value for a concept
# Two constraints on the same gender concept, one per value, to be combined
# with other constraints in the cells below.
all_females = api.new_constraint(concept=gender_concept_code, value_list=['female'])
all_males = api.new_constraint(concept=gender_concept_code, value_list=['male'])

In [None]:
# WOMEN
# Combine the two constraints with AND.
women_with_malignant_lymphoma = all_females & malignant_lymphoma_constraint

# Counts (observations and patients) for the combined constraint, pretty-printed
output = api.observations.counts(
    constraint=women_with_malignant_lymphoma,
)
print(json.dumps(output, indent=2))

# First rows of the matching patients
display(
    api.patients(constraint=women_with_malignant_lymphoma).dataframe.head()
)

# First rows of the matching observations
display(
    api.observations(constraint=women_with_malignant_lymphoma).dataframe.head()
)

In [None]:
# MEN
# Combine the two constraints with AND.
men_with_malignant_lymphoma = all_males & malignant_lymphoma_constraint

# Counts (observations and patients) for the combined constraint, pretty-printed
output = api.observations.counts(
    constraint=men_with_malignant_lymphoma,
)
print(json.dumps(output, indent=2))

# First rows of the matching patients
display(
    api.patients(constraint=men_with_malignant_lymphoma).dataframe.head()
)

# First rows of the matching observations
display(
    api.observations(constraint=men_with_malignant_lymphoma).dataframe.head()
)

### Show that you can use a mix of AND & OR

In [None]:
# creating complementary sets (MEN / WOMEN) to be able to test boolean logic

In [None]:
# "AND" (WORKS IN PYTHON CLIENT TOO)
# -> should return 0 patients/observations, because no patient is both male and female

# Group constraint: AND of the "women" and the "men" constraint.
# The right-hand operand is a shallow copy of the "men" constraint.
both_male_and_female_with_malignant_lymphoma = (
    women_with_malignant_lymphoma & copy.copy(men_with_malignant_lymphoma)
)

# Uncomment to inspect the JSON representation of the constraint
#print(json.dumps(both_male_and_female_with_malignant_lymphoma.json(), indent=2))

# Counts (observations and patients) for the group constraint, pretty-printed
output = api.observations.counts(
    constraint=both_male_and_female_with_malignant_lymphoma,
)
print(json.dumps(output, indent=2))

In [None]:
# Nested OR (tumor type combined with male or female)
# -> expected to return ALL patients/observations: the logic includes every
#    patient that is male, or female, or has the tumor type

# Group constraint: (female OR male) OR tumor type
male_or_female_or_malignant_lymphoma = (
    (all_females | all_males) | malignant_lymphoma_constraint
)

# Counts (observations and patients) for the group constraint, pretty-printed
output = api.observations.counts(
    constraint=male_or_female_or_malignant_lymphoma,
)
print(json.dumps(output, indent=2))

# First rows of the matching patients
display(
    api.patients(constraint=male_or_female_or_malignant_lymphoma).dataframe.head()
)

# First rows of the matching observations
display(
    api.observations(constraint=male_or_female_or_malignant_lymphoma).dataframe.head()
)

In [None]:
# Combine OR with AND: women with a particular tumor type OR men with that
# tumor type (each operand is itself an AND constraint).
all_with_malignant_lymphoma = (
    women_with_malignant_lymphoma | men_with_malignant_lymphoma
)

# Uncomment to inspect the JSON representation of the constraint
#print(json.dumps(all_with_malignant_lymphoma.json(), indent=2))

# Counts (observations and patients) for the combined constraint, pretty-printed
output = api.observations.counts(
    constraint=all_with_malignant_lymphoma,
)
print(json.dumps(output, indent=2))

# First rows of the matching patients
display(
    api.patients(constraint=all_with_malignant_lymphoma).dataframe.head()
)

# First rows of the matching observations
display(
    api.observations(constraint=all_with_malignant_lymphoma).dataframe.head()
)

In [None]:
# Look up the concept code of the Tissue concept by its path in `concepts`.
tissue_concept_path = '\\Public Studies\\Tumor Samples\\03. Biosource information\\03. Tissue\\'
tissue_concept_code = concepts[concepts['conceptPath']==tissue_concept_path]['conceptCode'].unique()[0]
# print(tissue_concept_code)
# Constraint on the concept only (no value filter), used to list the distinct
# string values that occur for it.
tissue_constraint = api.new_constraint(concept=tissue_concept_code)
# print(tissue_constraint.json())
print('Tissue types:', api.observations(constraint=tissue_constraint).dataframe.stringValue.unique())
# Constraint for the value 'eye', with the subselection set to the
# 'Biosource ID' dimension.
eye_tissue_constraint = api.new_constraint(concept=tissue_concept_code, value_list = ['eye'])
eye_tissue_constraint.subselection = 'Biosource ID'

# Retrieve and print the counts for observations and patients for eye tissue biosources
output = api.observations.counts(constraint=eye_tissue_constraint)
print(json.dumps(output, indent=2))

# Show the JSON form of the constraint that was just used.
print(json.dumps(eye_tissue_constraint.json(), indent=2))

# Use subselection to select biomaterials related to the eye tissue biosources
# (this leaves out the observations on patient, diagnosis and biosource level)
# Note that this is a pure Python dictionary, not a constraint class from the transmart library.
# The dictionary wraps the JSON form of eye_tissue_constraint in a further
# subselection on the 'Biomaterial ID' dimension.
eye_tissue_biomaterials_constraint = {
    'type': 'subselection',
    'dimension': 'Biomaterial ID',
    'constraint': eye_tissue_constraint.json()
}
output = api.observations.counts(constraint=eye_tissue_biomaterials_constraint)
print(json.dumps(output, indent=2))


# First rows of the observations matching the biomaterial-level constraint
display(api.observations(constraint=eye_tissue_biomaterials_constraint).dataframe.head())