In [8]:
import copy
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns


In [9]:
# Directory holding the VDJServer JSON exports; change it here rather than in every path below.
VDJSERVER_DATA_DIR = '/mnt/md0/Projects/vdjserver'


def _extract_creator(raw_parameters):
    """Return the 'Creator' entry of a job's ``parameters`` value, or NaN.

    ``raw_parameters`` is normally a JSON-encoded string. Empty/missing values
    (``''``, ``None``, NaN), values that are not a JSON object, and objects
    without a 'Creator' key all yield ``np.nan`` instead of raising.
    An already-decoded dict is accepted as-is.
    """
    if isinstance(raw_parameters, str):
        if not raw_parameters:
            return np.nan
        raw_parameters = json.loads(raw_parameters)
    if isinstance(raw_parameters, dict):
        return raw_parameters.get('Creator', np.nan)
    return np.nan


job_events_df = pd.read_json(f'{VDJSERVER_DATA_DIR}/vdjserverJobEvents.json')
job_permissions_df = pd.read_json(f'{VDJSERVER_DATA_DIR}/vdjserverJobPermissions.json')
jobs_all_df = pd.read_json(f'{VDJSERVER_DATA_DIR}/vdjserverJobs_all.json')
# Pull the job creator out of the per-job parameters into its own column.
jobs_all_df['parameters.Creator'] = jobs_all_df['parameters'].apply(_extract_creator)
metadata_perms_df = pd.read_json(f'{VDJSERVER_DATA_DIR}/vdjserverMetadataPermissions.json')

# The metadata dump is kept as plain Python objects (a list of dicts) because
# the cells below walk it item by item.
with open(f'{VDJSERVER_DATA_DIR}/vdjserverJsonArrayFeb042025.json', 'r', encoding='utf-8') as f:
    jsonarray = json.load(f)

## Look at Public Project Data

In [10]:
# Keep only the metadata entries whose name marks them as a public project.
project_entries = [entry for entry in jsonarray if entry['name'] == 'public_project']

# One flat summary record per public project; study fields live under 'value'.
public_project_list = [
    {
        'uuid': entry.get('uuid', None),
        'study_id': entry.get('value', {}).get('study_id', None),
        'associationIds': entry.get('associationIds', None),
        'study_title': entry.get('value', {}).get('study_title', None),
        'lastUpdated': entry.get('lastUpdated', None),
    }
    for entry in project_entries
]

# Tabulate the summaries and preview the first rows.
df_public_project = pd.DataFrame(public_project_list)
df_public_project.head()

Unnamed: 0,uuid,study_id,associationIds,study_title,lastUpdated
0,4505707319090933270-242ac113-0001-012,4505707319090933270-242ac113-0001-012,[],Outcome and Immune Correlates of a Phase II Tr...,2022-12-18T01:14:48.159-06:00
1,2034535426280329706-242ac113-0001-012,PRJNA300878,[],Individual heritable differences result in uni...,2022-12-18T01:14:48.306-06:00
2,5350423756993719830-242ac113-0001-012,1371444213709729305-242ac11c-0001-012,[],T cell receptor repertoires after adoptive tra...,2022-12-18T01:14:48.028-06:00
3,1570295022599213546-242ac113-0001-012,3276777473314001386-242ac116-0001-012,[],Biophysicochemical Motifs in T cell Receptor S...,2022-12-18T01:14:47.900-06:00
4,54655627105407466-242ac113-0001-012,PRJNA248475,[],B cells populating the multiple sclerosis brai...,2022-12-18T01:14:48.453-06:00


## Convert public_project data and output into JSONL files

In [17]:
# Metadata object names that are skipped entirely during the export.
exclusion_names = [ 'projectLoad', 'rearrangementLoad' ]
# Old object name -> new object name.
name_map = { "projectFile": "project_file" }
# Permission list attached to every exported public_project object.
permission = [ { "username": "vdjserver.curation@gmail.com", "permission": { "read":True, "write":True } } ]

# Top-level keys copied into each exported record, in this order.
col_list = ['uuid', 'owner', 'associationIds', 'created', 'lastUpdated', 'name', 'value']
obj_list = {}  # not used by this cell
data_dir = 'Metadata_public_project/'


def _belongs_to_project(item, project_uuid):
    """Return True if ``item`` is the project itself or lists it in ``associationIds``.

    The UUID is compared for equality (not substring containment), and a
    missing or null ``uuid`` / ``associationIds`` simply means "no match".
    """
    if item.get('uuid') == project_uuid:
        return True
    return project_uuid in (item.get('associationIds') or [])


def _migrate_item(item, project_uuid):
    """Return a migrated deep copy of ``item``; ``item`` itself is left untouched.

    Working on a copy keeps ``jsonarray`` pristine, so the cell can be re-run
    and an object associated with several projects is exported for each one.
    """
    migrated = copy.deepcopy(item)
    migrated['name'] = name_map.get(migrated['name'], migrated['name'])

    if migrated['name'] == 'public_project':
        value = migrated['value']
        # Move old keywords under value['vdjserver']['keywords'].
        keywords = value.get('vdjserver_keywords')
        if keywords is not None:
            if value.get('vdjserver') is None:
                value['vdjserver'] = {}
            value['vdjserver']['keywords'] = keywords
            del value['vdjserver_keywords']
        # Add permissions.
        migrated['permission'] = permission
        # Drop old fields.
        for old_field in ('showArchivedJobs', 'owner'):
            if value.get(old_field) is not None:
                del value[old_field]

    if migrated['name'] == 'project_file':
        # Eliminate the old file UUID: associate the file with the project only.
        migrated['associationIds'] = [ project_uuid ]

    return migrated


def _to_record(item):
    """Project ``item`` onto ``col_list`` (missing keys -> None), plus 'permission' if set."""
    record = {col_name: item.get(col_name, None) for col_name in col_list}
    if item.get('permission') is not None:
        record['permission'] = item['permission']
    return record


# Make sure the output directory exists before any file is opened for writing.
os.makedirs(data_dir, exist_ok=True)

# One JSONL file per public project; rows without a UUID cannot name a file and are skipped.
for project_uuid in df_public_project.uuid.dropna():
    with open(f'{data_dir}{project_uuid}_metadata.jsonl', 'w', encoding='utf-8') as file:
        for item in jsonarray:
            if item['name'] in exclusion_names:
                continue
            if not _belongs_to_project(item, project_uuid):
                continue
            json.dump(_to_record(_migrate_item(item, project_uuid)), file)
            file.write('\n')  # one JSON object per line

In [10]:
## Write a single public_project's data to a JSONL file

In [11]:
# Top-level keys copied into each exported record, in this order.
col_list = ['uuid', 'owner', 'associationIds', 'created', 'lastUpdated', 'name', 'value']
obj_list = {}  # not used by this cell
project_uuid = '2034535426280329706-242ac113-0001-012'
data_dir = 'Metadata_public_project/'  # Create/Change the directory

# Make sure the output directory exists before the file is opened for writing.
os.makedirs(data_dir, exist_ok=True)

# NOTE(review): this writes the objects unmodified to '<project_uuid>_metadata.jsonl'
# inside data_dir. If another cell writes a file with the same name there, whichever
# runs last wins -- confirm that is intended.
with open(f'{data_dir}{project_uuid}_metadata.jsonl', 'w', encoding='utf-8') as file:
    for item in jsonarray:
        # Exact UUID match (not substring); a missing/null uuid or associationIds
        # simply means the item does not belong to the project.
        is_project = item.get('uuid') == project_uuid
        is_associated = project_uuid in (item.get('associationIds') or [])
        if not (is_project or is_associated):
            continue
        obj = {col_name: item.get(col_name, None) for col_name in col_list}
        json.dump(obj, file)
        file.write('\n')  # one JSON object per line