In [53]:
import copy
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


In [62]:
# Directory holding the VDJServer JSON exports read by this notebook.
VDJSERVER_DATA_DIR = '/mnt/md0/Projects/vdjserver'

# Tabular views of the job-related exports.
job_events_df = pd.read_json(f'{VDJSERVER_DATA_DIR}/vdjserverJobEvents.json')
job_permissions_df = pd.read_json(f'{VDJSERVER_DATA_DIR}/vdjserverJobPermissions.json')
jobs_all_df = pd.read_json(f'{VDJSERVER_DATA_DIR}/vdjserverJobs_all.json')
# `parameters` is decoded with json.loads per row; pull out its 'Creator'
# entry, using NaN when the cell is empty or the key is absent.
jobs_all_df['parameters.Creator'] = jobs_all_df['parameters'].apply(lambda x: json.loads(x).get('Creator', np.nan) if x else np.nan)
metadata_perms_df = pd.read_json(f'{VDJSERVER_DATA_DIR}/vdjserverMetadataPermissions.json')

# Raw metadata array, kept as plain Python objects (list of dicts).
# It is loaded once with json.load; an earlier pd.read_json of the same export
# from a different mount was dropped because its result was overwritten below
# before ever being used.
with open(f'{VDJSERVER_DATA_DIR}/vdjserverJsonArrayFeb042025.json', 'r') as f:
    jsonarray = json.load(f)

# All 'projectJob' records, plus a lookup keyed by the project UUID stored in
# each record's value (a later record for the same project replaces an earlier one).
jsonarray_projectJob = [ obj for obj in jsonarray if obj['name'] == 'projectJob' ]
dict_projectJob = { obj['value']['projectUuid'] : obj for obj in jsonarray_projectJob }

# Raw job records as plain dicts (same file that backs jobs_all_df).
with open(f'{VDJSERVER_DATA_DIR}/vdjserverJobs_all.json', 'r') as f:
    jsonarray_jobs = json.load(f)

## Look at Public Project Data

In [55]:
# Collect one summary row per 'public_project' entry of the metadata array.
# Missing fields come through as None; the key order fixes the column order.
public_project_list = [
    {
        'uuid': entry.get('uuid'),
        'study_id': entry.get('value', {}).get('study_id'),
        'associationIds': entry.get('associationIds'),
        'study_title': entry.get('value', {}).get('study_title'),
        'lastUpdated': entry.get('lastUpdated'),
    }
    for entry in jsonarray
    if entry['name'] == 'public_project'
]

# Tabulate the extracted rows and preview the first few.
df_public_project = pd.DataFrame(public_project_list)
df_public_project.head()

Unnamed: 0,uuid,study_id,associationIds,study_title,lastUpdated
0,4505707319090933270-242ac113-0001-012,4505707319090933270-242ac113-0001-012,[],Outcome and Immune Correlates of a Phase II Tr...,2022-12-18T01:14:48.159-06:00
1,2034535426280329706-242ac113-0001-012,PRJNA300878,[],Individual heritable differences result in uni...,2022-12-18T01:14:48.306-06:00
2,5350423756993719830-242ac113-0001-012,1371444213709729305-242ac11c-0001-012,[],T cell receptor repertoires after adoptive tra...,2022-12-18T01:14:48.028-06:00
3,1570295022599213546-242ac113-0001-012,3276777473314001386-242ac116-0001-012,[],Biophysicochemical Motifs in T cell Receptor S...,2022-12-18T01:14:47.900-06:00
4,54655627105407466-242ac113-0001-012,PRJNA248475,[],B cells populating the multiple sclerosis brai...,2022-12-18T01:14:48.453-06:00


## V2 Job Data for Public Projects

In [56]:
# Inspect the jobs table: available columns and the job UUID column.
print(jobs_all_df.keys())
print(jobs_all_df['uuid'])

# Example job and project used for the spot checks below.
job_id = '38a47767-343d-4d2c-9374-29c3be77905c-007'
project_id = '1002552565004824085-242ac117-0001-012'
print(jobs_all_df[jobs_all_df['uuid'] == job_id]['archive_path'])

# jsonarray_projectJob and dict_projectJob are built in the data-loading cell.
# They are intentionally NOT rebuilt here: the previous copy re-keyed
# dict_projectJob by record uuid, silently replacing the projectUuid-keyed
# mapping that the rest of the notebook relies on.
# NOTE(review): the slice [-5:-1] shows four records and omits the very last
# one; use [-5:] if the last five were intended.
print(json.dumps(jsonarray_projectJob[-5:-1], indent=2))

# Show the shape of a single raw job record.
print(json.dumps(jsonarray_jobs[0], indent=2))

# Jobs whose archive path mentions the example project.
job = [ obj for obj in jsonarray_jobs if obj.get('archive_path') is not None and project_id in obj.get('archive_path')]
print(json.dumps(job, indent=2))

Index(['id', 'name', 'tenant_id', 'tenant_queue', 'owner', 'roles',
       'system_id', 'app_id', 'app_uuid', 'status', 'last_message', 'accepted',
       'created', 'ended', 'last_updated', 'uuid', 'work_path', 'archive',
       'archive_on_app_error', 'archive_path', 'archive_system_id',
       'node_count', 'processor_count', 'memory_gb', 'max_hours', 'inputs',
       'parameters', 'remote_job_id', 'remote_sched_id', 'remote_queue',
       'remote_submitted', 'remote_started', 'remote_ended', 'remote_outcome',
       'remote_submit_retries', 'remote_status_checks', 'failed_status_checks',
       'last_status_check', 'blocked_count', 'visible', 'update_token',
       'parameters.Creator'],
      dtype='object')
0        c7cd08ad-a560-4574-a363-b9cc4c5e051d-007
1        9188bf80-e868-4e05-a6b4-308c044108d7-007
2        773a5cb7-b369-4517-a221-83d57e3899e5-007
3        c0ab5f4a-97b0-4dc3-93e8-0908c95cb3a4-007
4        ad02cb34-250e-48cb-a06e-973e431b62ee-007
                           

## Convert public_project data and output into JSONL files

In [17]:
# Record types that are never exported.
exclusion_names = [ 'projectLoad', 'rearrangementLoad' ]
# Old record name -> new record name.
name_map = { "projectFile": "project_file" }
# Permission block attached to every migrated public_project record.
permission = [ { "username": "vdjserver.curation@gmail.com", "permission": { "read":True, "write":True } } ]

# Fields copied into every exported record (missing ones are written as null).
col_list = ['uuid', 'owner', 'associationIds', 'created', 'lastUpdated', 'name', 'value']
obj_list = {}
data_dir = 'Metadata_public_project/'


def _migrate_item(item, project_uuid):
    """Return a migrated copy of one metadata record for ``project_uuid``.

    The input record is left untouched: all changes are made on a deep copy,
    so the shared ``jsonarray`` stays pristine and this cell can be re-run
    (or a record exported for several projects) with identical results.

    Migration steps:
      * rename the record according to ``name_map``;
      * for ``public_project`` records, move ``value.vdjserver_keywords`` to
        ``value.vdjserver.keywords``, attach ``permission`` and drop the old
        ``showArchivedJobs`` / ``owner`` fields from ``value``;
      * for ``project_file`` records, replace ``associationIds`` with just
        the project UUID (eliminates the old file UUID).
    """
    migrated = copy.deepcopy(item)
    item_type = name_map.get(migrated['name'], migrated['name'])
    migrated['name'] = item_type

    if item_type == 'public_project':
        value = migrated['value']
        # move old keywords
        if value.get('vdjserver_keywords') is not None:
            if value.get('vdjserver') is None:
                value['vdjserver'] = {}
            value['vdjserver']['keywords'] = value['vdjserver_keywords']
            del value['vdjserver_keywords']
        # add permissions (copied so records do not share one list object)
        migrated['permission'] = copy.deepcopy(permission)
        # old fields
        for old_field in ('showArchivedJobs', 'owner'):
            if value.get(old_field) is not None:
                del value[old_field]

    if item_type == 'project_file':
        # eliminate old file UUID
        migrated['associationIds'] = [ project_uuid ]

    return migrated


# Make sure the output directory exists before writing into it.
os.makedirs(data_dir, exist_ok=True)

for project_uuid in df_public_project.uuid:
    # One JSONL file per public project.
    with open(f'{data_dir}{project_uuid}_metadata.jsonl', 'w') as file:
        for item in jsonarray:
            if item['name'] in exclusion_names:
                continue

            # A record belongs to the project when it IS the project record
            # (exact uuid match, not a substring test) or lists the project
            # among its associations. Missing/None fields simply do not match.
            is_project_record = item.get('uuid') == project_uuid
            is_associated = project_uuid in (item.get('associationIds') or [])
            if not (is_project_record or is_associated):
                continue

            migrated = _migrate_item(item, project_uuid)

            obj = { col_name: migrated.get(col_name, None) for col_name in col_list }
            if migrated.get('permission') is not None:
                obj['permission'] = migrated['permission']
            json.dump(obj, file)
            file.write('\n')  # Add a newline after each JSON object

## Convert Tapis V2 Job data into meta record

In [74]:
col_list = ['uuid', 'owner', 'associationIds', 'created', 'lastUpdated', 'name', 'value']
obj_list = {}
data_dir = 'Metadata_public_project_jobs/'  # Output directory for the per-project job files
# Make sure the output directory exists before writing into it.
os.makedirs(data_dir, exist_ok=True)

for project_uuid in df_public_project.uuid:
    # Jobs are attributed to a project when the project UUID occurs in the
    # job's archive path.
    # NOTE(review): this is a substring test; confirm no project UUID can be
    # a suffix of another one appearing in an archive path.
    jobs = [ obj for obj in jsonarray_jobs if obj.get('archive_path') is not None and project_uuid in obj.get('archive_path')]
    print(len(jobs))
    if len(jobs) > 0:
        # One JSONL file per project that has at least one job.
        with open(f'{data_dir}{project_uuid}_metadata.jsonl', 'w') as file:
            for j in jobs:
                # Wrap the raw job as a metadata record. The job's uuid is
                # promoted to the record level and left out of `value`; the
                # value is a filtered copy so the shared job dict in
                # jsonarray_jobs is not modified (keeps the cell re-runnable).
                obj = {
                    'name': 'tapis_v2_job',
                    'associationIds': [ project_uuid ],
                    'value': { key: val for key, val in j.items() if key != 'uuid' },
                    'uuid': j['uuid'],
                    'created': j['created'],
                    'lastUpdated': j['last_updated'],
                }

                json.dump(obj, file)
                file.write('\n')  # Add a newline after each JSON object
    #print(dict_projectJob.get(project_uuid))

    # Open a file in write mode
    #with open(f'{data_dir}{project_uuid}_metadata.jsonl', 'w') as file:


0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
22
0
2
95
10
2
30
9
20
55
14
0


In [69]:
# Show the first key of dict_projectJob and its record (nothing if empty).
first_entry = next(iter(dict_projectJob.items()), None)
if first_entry is not None:
    first_key, first_record = first_entry
    print(first_key)
    print(first_record)


0001399309581559-5056a550b8-0001-012
{'_id': {'$oid': '53753c4ae4b0df13310ccbbe'}, 'uuid': '0001400192074855-5056a550b8-0001-012', 'owner': 'vdj', 'tenantId': 'vdjserver.org', 'schemaId': None, 'internalUsername': None, 'associationIds': [], 'lastUpdated': '2014-05-15T17:14:34.855-05:00', 'name': 'projectJob', 'value': {'projectUuid': '0001399309581559-5056a550b8-0001-012', 'jobUuid': '0001399315558601-5056a550b8-0001-007'}, 'created': '2014-05-15T17:14:34.855-05:00'}
