In [1]:
!pwd

/Users/yuanshi/Library/CloudStorage/Box-Box/Sequence Analysis/2025/D1D2_organize/script_DNA


# Purpose of this notebook is to collect DNA sequence analysis spanning the course of 2024-2025, check the quality, and extract meta data

## input files
metadata is downloaded from [TCGB_deepseq_sample_list#D1D2DNABarcode](https://docs.google.com/spreadsheets/d/1XexFJ8xlxE9KBj_rNrgdMQyB6rheCKy9y-WvgEqsWmM/edit?gid=219270850#gid=219270850)


In [2]:
import pandas as pd
import hashlib
import json,os
from pathlib import Path
import shutil

In [3]:
input_path = '../csvfiles/TCGB_deepseq_sample_list - D1D2DNABarcode.csv'
reffile = pd.read_csv(input_path)

In [4]:
#check md5 and store
def get_file_md5(file_path):
    md5_hash = hashlib.md5()
    with open(file_path, "rb") as file:
        for chunk in iter(lambda: file.read(4096), b""):
            md5_hash.update(chunk)
    return md5_hash.hexdigest()

def store_md5_in_json(file_paths, json_output):
    md5_dict = {}
    for file_path in file_paths:
        md5_hash = get_file_md5(file_path)
        md5_dict[file_path] = md5_hash

    with open(json_output, "w") as json_file:
        json.dump(md5_dict, json_file, indent=4)

In [5]:
with open('../jsonfiles/notebook1_input.json','w') as f:
    json.dump({input_path:get_file_md5(input_path)},f)

In [6]:
!cat ../jsonfiles/notebook1_input.json

{"../csvfiles/TCGB_deepseq_sample_list - D1D2DNABarcode.csv": "d489b1ef4a850c43a161223c3e6fd3f1"}

In [7]:
reffile = reffile[['SeqID','Index','Mouse_Cohort','MouseID','Organ','Treatment']]
reffile

Unnamed: 0,SeqID,Index,Mouse_Cohort,MouseID,Organ,Treatment
0,NovaSeq022125,Sample14,JCMMG32,#44,LIV,D1D2-NK
1,NovaSeq022125,Sample3,JCMMG32,#44,SP,D1D2-NK
2,NovaSeq022125,Sample6,JCMMG32,#44,BM,D1D2-NK
3,NovaSeq022625,UDP0003,JCMMG32,#44,SP,D1D2-NK
4,NovaSeq022625,UDP0006,JCMMG32,#44,BM,D1D2-NK
...,...,...,...,...,...,...
262,NovaSeq061325,UDP0043,JCMMG32,#26,BM,GFP-NK
263,NovaSeq061325,UDP0049,JMGA36,#30,BM,No NK
264,NovaSeq061325,UDP0053,JMGA36,#38,BM,GFP-NK
265,NovaSeq061325,UDP0098,JMGA36,#37,LIV,No NK


In [10]:
#take a look at the file
print ("DNA barcode has {} entries.".format(len(reffile)))
print ("That's from {} animals,".format(len(reffile.groupby(['Mouse_Cohort','MouseID']).count())),end='')
print ("{} organ samples.".format(len(reffile.groupby(['Mouse_Cohort','MouseID','Organ']).count())))

DNA barcode has 267 entries.
That's from 40 animals,117 organ samples.


# Check sequence linkage data

In [11]:
SeqIDs = reffile['SeqID'].unique()

In [12]:
SeqIDs

array(['NovaSeq022125', 'NovaSeq022625', 'NovaSeq121424', 'NovaSeq043024',
       'NovaSeq052924', 'NovaSeq061424', 'NovaSeq081324', 'NovaSeq100224',
       'NovaSeq101624', 'NovaSeq061325'], dtype=object)

In [13]:
pathfile = dict()
mainpath = '/Users/yuanshi/Library/CloudStorage/Box-Box/Sequence Analysis'

for s in SeqIDs:
    year = '20{}'.format(s[-2:])
    path = os.path.join(mainpath,year,s,'ISS_linkage')
    if os.path.exists(path):
        print ("Found: {}".format(path))
        pathfile[s] = path
    else:
        path = os.path.join(mainpath,year,s,'ISS','ISS_linkage')
        if os.path.exists(path):
            print ("Found: {}".format(path))
            pathfile[s] = path
        else:
            print ("Data not found! {}".format(s))

Found: /Users/yuanshi/Library/CloudStorage/Box-Box/Sequence Analysis/2025/NovaSeq022125/ISS_linkage
Found: /Users/yuanshi/Library/CloudStorage/Box-Box/Sequence Analysis/2025/NovaSeq022625/ISS/ISS_linkage
Found: /Users/yuanshi/Library/CloudStorage/Box-Box/Sequence Analysis/2024/NovaSeq121424/ISS/ISS_linkage
Found: /Users/yuanshi/Library/CloudStorage/Box-Box/Sequence Analysis/2024/NovaSeq043024/ISS_linkage
Found: /Users/yuanshi/Library/CloudStorage/Box-Box/Sequence Analysis/2024/NovaSeq052924/ISS_linkage
Found: /Users/yuanshi/Library/CloudStorage/Box-Box/Sequence Analysis/2024/NovaSeq061424/ISS_linkage
Found: /Users/yuanshi/Library/CloudStorage/Box-Box/Sequence Analysis/2024/NovaSeq081324/ISS/ISS_linkage
Found: /Users/yuanshi/Library/CloudStorage/Box-Box/Sequence Analysis/2024/NovaSeq100224/ISS/ISS_linkage
Found: /Users/yuanshi/Library/CloudStorage/Box-Box/Sequence Analysis/2024/NovaSeq101624/ISS/ISS_linkage
Found: /Users/yuanshi/Library/CloudStorage/Box-Box/Sequence Analysis/2025/NovaSe

In [14]:
with open('../jsonfiles/pathfile.json','w') as f:
    json.dump(pathfile,f)

# Copy Linkage Files

In [15]:
Path('../linkage_files').mkdir(parents=True,exist_ok=True)

In [19]:
for _,entry in reffile.iterrows():
    SeqID = entry['SeqID']
    Index = entry['Index']
    linkage = os.path.join(pathfile[SeqID],'linkage_{}.txt'.format(Index))
    if os.path.exists(linkage):
        shutil.copy2(linkage, os.path.join('../linkage_files','{}_{}.txt'.format(SeqID,Index)))
    else:
        print("Not Found",entry)

In [20]:
#check md5
store_md5_in_json([os.path.join('../linkage_files',f) for f in os.listdir('../linkage_files') if f.endswith('txt')], '../jsonfiles/linkagefiles.json')

In [21]:
!cat ../jsonfiles/linkagefiles.json

{
    "../linkage_files/NovaSeq061325_UDP0034.txt": "9345f214b76a50fe738df0edde6cf318",
    "../linkage_files/NovaSeq022125_Sample8.txt": "e8f73688e3b4e7511aa5a4db02b28493",
    "../linkage_files/NovaSeq052924_UDP0032.txt": "d386a63e20445c97b20a3ac5a17f9612",
    "../linkage_files/NovaSeq052924_UDP0026.txt": "526f7eb8f89c1386f7800874bdc49707",
    "../linkage_files/NovaSeq022125_Sample38.txt": "ffd3a42307ebf34197af6c712fbb9d39",
    "../linkage_files/NovaSeq022125_Sample10.txt": "124971608854a8d97c43bf9e3d7b3027",
    "../linkage_files/NovaSeq121424_JK1.txt": "a748e1b385c59d54237ae45591a2912c",
    "../linkage_files/NovaSeq061424_UDP0062.txt": "c487d34e204bc83feae332c784ee1c70",
    "../linkage_files/NovaSeq081324_UDP0029.txt": "a19155b7ab3dc3aa7a61a54c6402c8d9",
    "../linkage_files/NovaSeq101624_UDP0081.txt": "c0b0f8b2eadddcc4c4bc6a42018f6dc3",
    "../linkage_files/NovaSeq043024_UDP0025.txt": "d26e0c1958886e56ba0fa852fa91a9ba",
    "../linkage_files/NovaSeq043024_UDP0030.txt": "f7e

In [28]:
import pickle
with open('../picklefiles/notebook1.pkl','wb') as f:
    pickle.dump({'reffile':reffile},f)