In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from collections import defaultdict

In [2]:
job_events_df = pd.read_json('/mnt/data2/Projects/vdjserver/vdjserverJobEvents.json')
job_permissions_df = pd.read_json('/mnt/data2/Projects/vdjserver/vdjserverJobPermissions.json')
jobs_all_df = pd.read_json('/mnt/data2/Projects/vdjserver/vdjserverJobs_all.json')
jobs_all_df['parameters.Creator'] = jobs_all_df['parameters'].apply(lambda x: json.loads(x).get('Creator', np.nan) if x else np.nan)
metadata_perms_df = pd.read_json('/mnt/data2/Projects/vdjserver/vdjserverMetadataPermissions.json')

with open('/mnt/data2/Projects/vdjserver/vdjserverJsonArrayFeb042025.json', 'r') as f:
    jsonarray = json.load(f)

In [3]:
def json_print(item):
    print(json.dumps(item, indent = 4))

In [4]:
mailing_list = pd.read_csv("/mnt/data2/Projects/vdjserver/VDJServer_mailing_list.txt", sep = ';', skiprows = 8)
mailing_list.columns = ['Email']
mailing_list = mailing_list.iloc[:-5]
mailing_list['updated_email'] = mailing_list['Email'].apply(lambda row: row.replace(" at ", "@"))

# mailing_list['updated_email'].to_csv('email_list.txt', index = False)
print(mailing_list)

                                Email                  updated_email
0              12ysliu2 at stu.edu.cn            12ysliu2@stu.edu.cn
1               18982180702 at msn.cn             18982180702@msn.cn
2    2008110020 at alumni.sjtu.edu.cn  2008110020@alumni.sjtu.edu.cn
3                2383920158 at qq.com              2383920158@qq.com
4              2deepayan at gmail.com            2deepayan@gmail.com
..                                ...                            ...
571               zhanxw at gmail.com               zhanxw@gmail.com
572             zhe.sang at gmail.com             zhe.sang@gmail.com
573             zicheng at utexas.edu             zicheng@utexas.edu
574              zluo819 at gmail.com              zluo819@gmail.com
575            zyf950619 at gmail.com            zyf950619@gmail.com

[576 rows x 2 columns]


In [5]:
item_types = set()
for item in jsonarray:
    item_type = item['name']
    item_types.add(item_type)
print(sorted(item_types))
print(len(item_types))

['adc_cache', 'adc_cache_repertoire', 'adc_cache_study', 'adc_system_repositories', 'archive_project', 'async_query', 'bioProcessing', 'bioProcessingColumns', 'cellProcessing', 'cellProcessingColumns', 'communityDataSRA', 'data_processing', 'deletedProject', 'diagnosis', 'diagnosisColumns', 'feedback', 'garbage', 'irplus_analysis', 'job', 'nucleicAcidProcessing', 'nucleicAcidProcessingColumns', 'passwordReset', 'private_project', 'processMetadata', 'profile', 'project', 'projectFile', 'projectJob', 'projectJobArchive', 'projectJobFile', 'projectLoad', 'projectPublishInProcess', 'projectUnpublishInProcess', 'publicProject', 'public_project', 'rearrangementLoad', 'repertoire', 'sample', 'sampleColumns', 'sampleGroup', 'sample_processing', 'statistics_cache', 'statistics_cache_repertoire', 'statistics_cache_study', 'subject', 'subjectColumns', 'test', 'testMetadata', 'testmetadata', 'testmetadatamp', 'userVerification', 'vdjpipeWorkflow']
52


In [8]:
for item in jsonarray:
    item_type = item['name']
    if item_type == 'project':
        print(json.dumps(item, indent = 4))
        break
        

{
    "_id": {
        "$oid": "52e95a8de4b057612fefcc16"
    },
    "uuid": "0001391024781787-5056a550b8-0001-012",
    "owner": "tester_account2000",
    "tenantId": "vdjserver.org",
    "schemaId": null,
    "internalUsername": null,
    "associationIds": [],
    "lastUpdated": "2014-01-29T13:46:21.787-06:00",
    "name": "project",
    "value": {
        "name": "Hello World"
    },
    "created": "2014-01-29T13:46:21.787-06:00"
}


## Look at Profile Metadata

In [47]:
profile_list = []
for item in jsonarray:
    item_type = item['name']
    if item_type == 'profile':
        uuid = item.get('uuid', None)
        owner = item.get('owner', None)
        first_name = item.get('value', {}).get('firstName', None)
        last_name = item.get('value', {}).get('lastName', None)
        email = item.get('value', {}).get('email', None)
        city = item.get('value', {}).get('city', None)
        state = item.get('value', {}).get('state', None)
        country = item.get('value', {}).get('country', None)
        created = item.get('created', None)
        last_updated = item.get('lastUpdated', None)
            # Append the extracted data as a dictionary to the list
        profile_list.append({
            'username': owner,
            'firstName': first_name,
            'lastName': last_name,
            'email': email,
            
            'lastUpdated': last_updated
        })
# Create a DataFrame from the list of extracted data
df_profile = pd.DataFrame(profile_list)
# Print the DataFrame
df_profile.tail()

Unnamed: 0,username,firstName,lastName,email,lastUpdated
1817,rgarcia,Rodrigo,García Valiente,r.garciavaliente@amsterdamumc.nl,2025-01-02T11:11:52.894-06:00
1818,rgarciav,Rodrigo,García Valiente,r.garciavaliente@amsterdamumc.nl,2025-01-02T11:41:52.070-06:00
1819,erichardson,Eve,Richardson,erichardson@lji.org,2025-01-07T18:01:25.657-06:00
1820,samwol,,,samuel.wollenburg@utsouthwestern.edu,2025-01-07T20:24:59.390-06:00
1821,chrisjames1992,Chinweike Christopher,Udoye,chinweikechristopher.udoye@uksh.de,2025-01-17T07:54:02.133-06:00


## Look at project Metadata

In [17]:
project_list = []
for item in jsonarray:
    item_type = item.get('name')
    if item_type == 'project':  # filter for project items
        uuid = item.get('uuid', None)
        owner = item.get('owner', None)
        last_updated = item.get('lastUpdated', None)
        # Nested 'name' inside 'value'
        project_name = item.get('value', {}).get('name', None)
        project_list.append({
            'projectUuid': uuid,
            'owner': owner,
            'project_name': project_name,
            'last_updated': last_updated
        })

import pandas as pd
df_projects = pd.DataFrame(project_list)
df_projects.head()


Unnamed: 0,projectUuid,owner,project_name,last_updated
0,0001391024781787-5056a550b8-0001-012,tester_account2000,Hello World,2014-01-29T13:46:21.787-06:00
1,0001391025968832-5056a550b8-0001-012,tester_account2000,Project 2,2014-01-29T14:06:08.832-06:00
2,0001391628100698-5056a550b8-0001-012,admin,Demo1,2014-02-05T13:21:40.698-06:00
3,0001392911686649-5056a550b8-0001-012,test51,testProj,2014-02-20T09:54:46.649-06:00
4,0001392912386049-5056a550b8-0001-012,test51,testProj2,2014-02-20T10:06:26.048-06:00


## Look at ProjectFile data
  - Contains only ProjectUUID
  - Contains file upload information for the project


In [15]:
projectFiles_list = []
for item in jsonarray:
    item_type = item['name']
    if item_type == 'projectFile':
        uuid = item.get('uuid', None)
        associationIds = item.get('associationIds', None)
        projectUuid = item.get('value', {}).get('projectUuid', None)
        owner = item.get('owner', None)
        task_type= item.get('value', {}).get('type', None)
        file_name = item.get('value', {}).get('name', None)
        mimeType = item.get('value', {}).get('mimeType', None)
        last_updated = item.get('lastUpdated', None)
        if associationIds:
            associationIds_1 = associationIds[0]
            if len(associationIds)>1:
                associationIds_2 = associationIds[1]
                if len(associationIds) > 2:
                    print("Length associationIds: ", len(associationIds))
            else:
                associationIds_2 = None
        else:
            associationIds_1 = None
            associationIds_2 = None
        # Append the extracted data as a dictionary to the list
        projectFiles_list.append({
            'uuid': uuid,
            'projectUuid': projectUuid,
            'associationIds_1': associationIds_1,
            'associationIds_2': associationIds_2,
            'owner': owner,
            'task_type': task_type,
            'file_name': file_name,
            'mimeType': mimeType,
            'last_updated': last_updated
        })

# Create a DataFrame from the list of extracted data
df_projectFiles = pd.DataFrame(projectFiles_list)
# Print the DataFrame
df_projectFiles.tail()

Unnamed: 0,uuid,projectUuid,associationIds_1,associationIds_2,owner,task_type,file_name,mimeType,last_updated
35943,5338423137409494545-242ac118-0001-012,5456400192359305711-242ac118-0001-012,6793987554023894545-242ac112-0001-002,5456400192359305711-242ac118-0001-012,vdj,,4468_S24_L001_R1_001.fastq.gz,,2025-01-13T16:40:40.230-06:00
35944,1335427718191574545-242ac118-0001-012,5456400192359305711-242ac118-0001-012,2833383462017494545-242ac112-0001-002,5456400192359305711-242ac118-0001-012,vdj,,4468_S24_L001_R2_001.fastq.gz,,2025-01-13T16:40:40.230-06:00
35945,1840700597200490991-242ac118-0001-012,5456400192359305711-242ac118-0001-012,366925519251050991-242ac112-0001-002,5456400192359305711-242ac118-0001-012,vdj,,6634_S25_L001_R1_001.fastq.gz,,2025-01-13T16:40:43.277-06:00
35946,5023614960920170991-242ac118-0001-012,5456400192359305711-242ac118-0001-012,3549539235260010991-242ac112-0001-002,5456400192359305711-242ac118-0001-012,vdj,,6634_S25_L001_R2_001.fastq.gz,,2025-01-13T16:40:43.281-06:00
35947,7830832104257678865-242ac118-0001-012,5456400192359305711-242ac118-0001-012,8017190735231118865-242ac112-0001-002,5456400192359305711-242ac118-0001-012,vdj,,primers.fasta,,2025-01-13T16:41:49.035-06:00


## Look at projectJob data
  - Contains ProjectUUID and JobUUID

In [18]:
projectJob_list = []
for item in jsonarray:
    item_type = item['name']
    if item_type == 'projectJob':
        # json_print(item)
        uuid = item.get('uuid', None)
        owner = item.get('owner', None)
        projectUuid = item.get('value', {}).get('projectUuid', None)
        jobUuid = item.get('value', {}).get('jobUuid', None)
        last_updated = item.get('lastUpdated', None)
        projectJob_list.append({
            'uuid': uuid,
            'owner': owner,
            'projectUuid': projectUuid,
            'jobUuid': jobUuid,
            'lastUpdated': last_updated
        })

# Create a DataFrame from the list of extracted data
df_projectJob = pd.DataFrame(projectJob_list)
df_projectJob.head()

Unnamed: 0,uuid,owner,projectUuid,jobUuid,lastUpdated
0,0001400192074855-5056a550b8-0001-012,vdj,0001399309581559-5056a550b8-0001-012,0001399315558601-5056a550b8-0001-007,2014-05-15T17:14:34.855-05:00
1,0001400254373114-5056a550b8-0001-012,vdj,0001400250478554-5056a550b8-0001-012,0001400254372814-5056a550b8-0001-007,2014-05-16T10:32:53.114-05:00
2,0001400273862423-5056a550b8-0001-012,vdj,0001400250478554-5056a550b8-0001-012,0001400273862119-5056a550b8-0001-007,2014-05-16T15:57:42.423-05:00
3,0001400274448495-5056a550b8-0001-012,vdj,0001400250478554-5056a550b8-0001-012,0001400274448320-5056a550b8-0001-007,2014-05-16T16:07:28.494-05:00
4,0001400274714655-5056a550b8-0001-012,vdj,0001400250478554-5056a550b8-0001-012,0001400274714490-5056a550b8-0001-007,2014-05-16T16:11:54.655-05:00


## Filter Metadata file by removing users with only READ Acess and Usernames that contains test

In [19]:
metadata_perms_df
#filter metadata keeping only items that has permission for both READ_WRITE and ALL
filtered_metadata_perms_df = metadata_perms_df[~(metadata_perms_df.permission == 'READ')]
# Filter out usernames containing 'test'
filtered_metadata_perms_df = filtered_metadata_perms_df[~filtered_metadata_perms_df['username'].str.contains('test', case=False)]
filtered_metadata_perms_df.head()

Unnamed: 0,id,last_updated,permission,username,uuid,tenant_id
3,14,2014-01-29 10:28:16,READ_WRITE,jfonner,0001389977207738-5056a550b8-0001-012,vdjserver.org
4,17,2014-01-29 14:06:38,READ_WRITE,adshkl;dasfhkdf,0001391025968832-5056a550b8-0001-012,vdjserver.org
5,18,2014-02-20 10:07:51,READ_WRITE,VDJAuth,0001392912471365-5056a550b8-0001-012,vdjserver.org
6,19,2014-02-20 10:14:20,READ_WRITE,VDJAuth,0001392912860303-5056a550b8-0001-012,vdjserver.org
7,21,2014-02-20 11:10:54,READ_WRITE,wscarbor,0001392914178983-5056a550b8-0001-012,vdjserver.org


## Look at all the Jobs 

In [21]:
jobs_all_df.head()
## Filter AllJobs columns
#### Keeping only important ones
keep_columns = ['system_id', 'owner', 'app_id', 'status', 'last_updated', 'uuid', 'archive_path', 'remote_outcome', 'update_token', 'parameters.Creator']
jobs_all_df = jobs_all_df[keep_columns]
jobs_all_df.head()

Unnamed: 0,system_id,owner,app_id,status,last_updated,uuid,archive_path,remote_outcome,update_token,parameters.Creator
0,ls6.tacc.utexas.edu,vdj,repcalc-ls6-2.0u8,FINISHED,2025-01-25 15:43:51.678,c7cd08ad-a560-4574-a363-b9cc4c5e051d-007,/projects/5456400192359305711-242ac118-0001-01...,FINISHED,eb27e311-4a37-4aeb-b649-056704dd2711,schristley
1,ls6.tacc.utexas.edu,vdj,igblast-ls6-1.20u6,FINISHED,2025-01-24 04:20:37.891,9188bf80-e868-4e05-a6b4-308c044108d7-007,/projects/5456400192359305711-242ac118-0001-01...,FINISHED,5e2528fd-25d6-4473-9287-6a67a8de8391,schristley
2,ls6.tacc.utexas.edu,vdj,igblast-ls6-1.20u6,FAILED,2025-01-22 15:04:46.891,773a5cb7-b369-4517-a221-83d57e3899e5-007,/projects/5199144433477554666-242ac116-0001-01...,FAILED_SKIP_ARCHIVE,78b89c14-3dec-4aa8-acf8-d2592064e3a4,scott_public
3,ls6.tacc.utexas.edu,vdj,vdj_pipe-ls6-0.1.7u2,FINISHED,2025-01-14 22:31:02.980,c0ab5f4a-97b0-4dc3-93e8-0908c95cb3a4-007,/projects/5456400192359305711-242ac118-0001-01...,FINISHED,1e2f122d-5e5b-4f14-931f-ca55803115ff,schristley
4,ls6.tacc.utexas.edu,vdj,vdj_pipe-ls6-0.1.7u2,FINISHED,2025-01-09 04:21:12.476,ad02cb34-250e-48cb-a06e-973e431b62ee-007,/projects/6589143665654501871-242ac118-0001-01...,FINISHED,1069949d-1d9a-453f-80b8-7372019aba31,schristley


In [45]:
df_profile

Unnamed: 0,uuid,username,firstName,lastName,email,lastUpdated
0,0001389976523746-5056a550b8-0001-012,wscarbor,Walter,Scarborough,wscarbor@tacc.utexas.edu,2016-04-27T15:07:26.261-05:00
1,0001391029872321-5056a550b8-0001-012,test19,Test,19,test19@test.com,2014-01-29T15:12:33.955-06:00
2,0001391717057917-5056a550b8-0001-012,test31,,,test31@test.com,2014-02-06T14:04:17.917-06:00
3,0001391719926131-5056a550b8-0001-012,test33,,,test33@test.com,2014-02-06T14:52:06.131-06:00
4,0001391720404124-5056a550b8-0001-012,test34,Ned,Flanders,test34@test.com,2014-02-06T15:00:46.376-06:00
...,...,...,...,...,...,...
1817,6242932598575984145-242ac118-0001-012,rgarcia,Rodrigo,García Valiente,r.garciavaliente@amsterdamumc.nl,2025-01-02T11:11:52.894-06:00
1818,2755888095932968465-242ac118-0001-012,rgarciav,Rodrigo,García Valiente,r.garciavaliente@amsterdamumc.nl,2025-01-02T11:41:52.070-06:00
1819,5481029658171207185-242ac118-0001-012,erichardson,Eve,Richardson,erichardson@lji.org,2025-01-07T18:01:25.657-06:00
1820,4458895817601248785-242ac118-0001-012,samwol,,,samuel.wollenburg@utsouthwestern.edu,2025-01-07T20:24:59.390-06:00


In [85]:
filtered_job_metadata_perms_df = filtered_metadata_perms_df[filtered_metadata_perms_df.uuid.isin(df_projects.projectUuid)]
user = 'lliu'
user_info = filtered_job_metadata_perms_df[filtered_job_metadata_perms_df.username == user]
total_projects = user_info.shape[0]
user_info = pd.merge(user_info, df_profile,  on='username', how='inner')
email_id = user_info.email.iloc[0]
print(f"Email ID: {email_id}\n\n")
all_project_info = {}
for project_uuid in user_info.uuid:
    project_name = df_projects[df_projects.projectUuid == project_uuid].project_name.iloc[0]
    all_names = {}
    all_names['project_name'] = project_name
    for item in jsonarray:
        if project_uuid in item.get('associationIds', None):
            if item['name'] in all_names:
                all_names[item['name']] += 1
            else:
                all_names[item['name']] = 0
    all_project_info[project_uuid] = all_names
    
text = f'Dear {user}, \n\nYou have total {total_projects} projects in VDJ server.'

for k, v in all_project_info.items():
    project_name = v.get('project_name', 'Not Available')
    n_project_file = v.get('projectFile', 0)
    n_job_file = v.get('projectJob', 0)
    text += f' For {project_name} you have {n_project_file} project files and {n_job_file} job files,'
text += f' avalibale on our database. If you want them to be transferred over to our VDJ server V2 then please let us know.\n\nThanks\nVDJServer Teams.'

print(text)
all_project_info

Email ID: leyu.liu@staidsonbio.com


Dear lliu, 

You have total 2 projects in VDJ server. For My 1st NGS you have 0 project files and 5 job files, For Rabbit IG you have 0 project files and 0 job files, avalibale on our database. If you want them to be transferred over to our VDJ server V2 then please let us know.

Thanks
VDJServer Teams.


{'6656727206927929831-242ac11c-0001-012': {'project_name': 'My 1st NGS',
  'projectFile': 0,
  'projectJob': 5,
  'processMetadata': 5,
  'projectJobFile': 44,
  'subject': 0,
  'sample': 0,
  'sampleGroup': 0},
 '5877004408142163475-242ac11b-0001-012': {'project_name': 'Rabbit IG'}}