# <p align="center"> Retrieve Specific Sessions' SoakDBs for XChem Team </p>

This notebook extracts the SoakDB SQLite files held in the object store for a specific set of sessions requested by the XChem team.

In [1]:
import sys
# Add the parent directory to the import path. Later cells import from the
# `scripts` package, which presumably lives there — TODO confirm.
sys.path.append("..")

### <ins>Task</ins>: Obtain List of Pickle Tree files to Search Through

In [23]:
from pathlib import Path
import re
# Inputs: the list of requested sessions and the folder holding the per-session pickle files.
lstTargetSeshPath = Path( "../data/targetSessions.txt" )
dataPath = Path( "../output/ObjectStore/XChem/PerProj/data" )

# Session identifier as embedded in the pickle file names, e.g. "lb18145-122".
seshPattern = re.compile( "[a-z]+[0-9]+-[0-9]+" )


def iterChunks( items, size ):
    """Yield consecutive slices of `items` holding `size` elements; the last slice may be shorter."""
    for start in range( 0, len( items ), size ):
        yield items[start: start + size]


# Sessions Requested: one identifier per line, lower-cased for comparison.
# Surrounding whitespace is stripped and blank lines are skipped so that a trailing
# empty line (or "\r\n" line endings) cannot show up as a bogus requested session.
with open( lstTargetSeshPath, "r" ) as f:
    lstTargetSeshS = sorted( line.strip().lower() for line in f if line.strip() )

print( "List of Target Sessions: lstTargetSeshS" )
for targetSesh in lstTargetSeshS: print( f"\t\t {targetSesh}" )
print( f"Number of Requested Sessions: {len(lstTargetSeshS )}" )

# Sessions that I managed to Identify in Object Store.
# The directory is scanned once; files whose name carries no session id are skipped
# instead of raising AttributeError on a failed regex match.
seshByFileName = {}
for pklFile in dataPath.iterdir():
    if not pklFile.is_file():
        continue
    seshMatch = seshPattern.search( pklFile.name )
    if seshMatch is None:
        continue
    seshByFileName[pklFile.name] = seshMatch.group()

lstObtainSeshS = sorted( seshByFileName.values() )

showFiles = 10
print( "\nList of Obtained Sessions: lstObtainSeshS" )
# iterChunks also emits the final, shorter chunk, so no session is left out of the listing.
for chunk in iterChunks( lstObtainSeshS, showFiles ): print( "\t", chunk )

# Full file names of obtained sessions that match the requested sessions.
targetSeshSet = set( lstTargetSeshS )
lstMatchedSeshS = sorted( fileName for fileName, sesh in seshByFileName.items() if sesh in targetSeshSet )

showFiles = 3
print( "\nList of Obtained Sessions that Match the Request: lstMatchedSeshS" )
for chunk in iterChunks( lstMatchedSeshS, showFiles ): print( chunk )
print( f"Number of matched Sessions: {len( lstMatchedSeshS)}" )
print( f"Number of Requested Sessions: {len(lstTargetSeshS )}" )

# Tree pickle file names corresponding to the matched sessions (consumed by the download cell).
lstTreeMatchSeshS = [ f"tree_{seshFileName}" for seshFileName in lstMatchedSeshS ]
print( "\nList of Matched Tree Files: lstTreeMatchSeshS" )
print( f"\t {lstTreeMatchSeshS}" )

# Requested sessions that were not found among the obtained ones.
# lstObtainSeshS is deliberately left untouched here so it keeps meaning "obtained sessions".
obtainedSeshSet = set( lstObtainSeshS )
lstNotMatchedSeshS = sorted( targetSesh for targetSesh in lstTargetSeshS if targetSesh not in obtainedSeshSet )
print( f"\nList of Requested Sessions Not Obtained: {lstNotMatchedSeshS}" )

# Final side-by-side summary.
print( f"\nList of Requested Sessions:\t\t\t {lstTargetSeshS}" )
print( f"List of Obtained Sessions that Match the Request: {lstMatchedSeshS}" )
print( f"List of Requested Sessions Not Obtained: {lstNotMatchedSeshS}" )


List of Target Sessions: lstTargetSeshS
		 lb18145-111
		 lb18145-112
		 lb18145-121
		 lb18145-122
		 lb18145-125
		 lb18145-126
		 lb18145-43
		 lb18145-49
		 lb18145-55
		 lb18145-60
		 lb18145-71
		 lb18145-73
		 lb18145-74
		 lb18145-75
		 lb18145-81
		 lb18145-86
		 lb18145-88
		 lb19758-150
		 lb19758-35
		 lb19758-38
		 lb19758-50
		 lb19758-51
		 lb19758-8
		 lb22717-10
Number of Requested Sessions: 24

List of Obtained Sessions: lstObtainSeshS
	 ['lb13308-1', 'lb13308-2', 'lb13320-1', 'lb13320-12', 'lb13320-14', 'lb13320-16', 'lb13320-3', 'lb13320-5', 'lb13320-8', 'lb13379-1']
	 ['lb13380-1', 'lb13381-1', 'lb13385-1', 'lb13385-100', 'lb13385-101', 'lb13385-102', 'lb13385-103', 'lb13385-104', 'lb13385-105', 'lb13385-106']
	 ['lb13385-107', 'lb13385-108', 'lb13385-109', 'lb13385-110', 'lb13385-111', 'lb13385-113', 'lb13385-114', 'lb13385-115', 'lb13385-116', 'lb13385-117']
	 ['lb13385-118', 'lb13385-13', 'lb13385-2', 'lb13385-21', 'lb13385-25', 'lb13385-30', 'lb13385-31', 'lb13

### <ins>Task</ins>: Extract and Save all sqlite files found in the `database` and `DataFileBackups` folders of each session

In [None]:
import re
from pathlib import Path

from scripts.filesUtils import loadPickle
from scripts.s3Utils import initialize
from scripts.objFileSys import completeTraceBackPath, getFiles

# Location of the per-session tree pickles and of the local download target.
treeDir = Path( "../output/ObjectStore/XChem/PerProj/data/tree" )
metaDir = Path( "..", "data", "meta" )  # Helios -> | Local -> Path( "D:", "Documents", "XAIDA", "data", "meta" )
bucket = "xchem"
targetFolders = ["database", "DataFileBackups"]

# Number of matched tree files to process. 1 reproduces the previous `[:1]` slice;
# set to None to process every matched session.
maxTreeFiles = 1

# One client is enough for all downloads; it used to be re-created for every single file.
client = initialize( "XChem" )


def downloadTo( key, storeDir, fileName ):
    """Download object `key` from `bucket` to `storeDir / fileName`.

    The directory is created only at download time, so a session/folder directory
    exists locally only if at least one object was fetched into it.
    """
    storeDir.mkdir( parents=True, exist_ok=True )
    client.download_file( bucket, key, ( storeDir / fileName ).as_posix() )


for targetTreeFile in lstTreeMatchSeshS[:maxTreeFiles]:

    treePickle = loadPickle( treeDir / targetTreeFile )

    # Drop the 5-character "tree_" prefix and the last 4 characters
    # (presumably a ".pkl"-style extension — TODO confirm).
    seshName = targetTreeFile[5:-4]

    for targetFolder in targetFolders:
        storeDir = metaDir / seshName / targetFolder
        results = completeTraceBackPath( targetFolder, treePickle["fileTree"], treePickle["foldersCount"], treePickle["folderTreeMaxDepth"] )

        if results != []:
            # Only the first returned match is used as the folder path.
            path = results[0][0]

            files = getFiles( treePickle["fileTree"], treePickle["foldersCount"], folderPath = path )
            # Keys look like "data/2015/lb13308-1/processing/database/soakDBDataFile.sqlite".
            for key in files[1]:
                if re.search( "sqlite$", key ):
                    downloadTo( key, storeDir, key.split("/")[-1] )

        elif len( treePickle ) == 1:
            # NOTE(review): treePickle is indexed with string keys above, so a length of 1
            # together with `treePickle[0]` looks unreachable/inconsistent — confirm the
            # intended object (e.g. the file tree) before relying on this fallback.
            key = treePickle[0] # "data/2019/lb18145-122"
            downloadTo( key, storeDir, key.split("/")[-1] + ".content" )

### <ins>Task</ins>: Identify Sessions that do not have data

In [37]:
# Classify every downloaded session directory by whether it contains one of the data folders.
storeDir = Path( "..", "data", "meta" )
dataFolders = {"database", "DataFileBackups"}

# Session id at the end of a directory name, e.g. "lb18145-122".
seshIdPattern = re.compile( "[a-z]+[0-9]+-[0-9]+$" )

record = {"NoData": [], "Data": []}
for session in storeDir.iterdir():
    # Stray files (e.g. OS metadata files) are not sessions; iterdir() on them would raise.
    if not session.is_dir():
        continue
    seshContent = { content.name for content in session.iterdir() }
    label = "Data" if seshContent & dataFolders else "NoData"
    record[label].append( session.name )


def seshIds( dirNames ):
    """Return the sorted session ids extracted from `dirNames`.

    A name without a trailing session id is kept as-is instead of raising
    AttributeError on the failed regex match.
    """
    ids = []
    for dirName in dirNames:
        seshMatch = seshIdPattern.search( dirName )
        ids.append( seshMatch.group() if seshMatch else dirName )
    return sorted( ids )


# Values are computed outside the f-strings: nesting the same quote type inside an
# f-string is only valid syntax from Python 3.12 onwards.
noDataSeshS = seshIds( record["NoData"] )
dataSeshS = seshIds( record["Data"] )

print( f"List of Files without Data: {noDataSeshS}" )
print( f"List of Files with Data: {dataSeshS}" )

print( f"Number of Sessions without data: {len( noDataSeshS )}" )
print( f"Number of Sessions with data: {len( dataSeshS )}" )



List of Files without Data: ['lb18145-121', 'lb18145-122', 'lb18145-125', 'lb18145-60', 'lb18145-73', 'lb18145-74', 'lb18145-81', 'lb18145-86', 'lb18145-88', 'lb22717-10']
List of Files with Data: ['lb18145-111', 'lb18145-112', 'lb18145-126', 'lb18145-43', 'lb18145-49', 'lb18145-55', 'lb18145-71', 'lb18145-75', 'lb19758-35', 'lb19758-38', 'lb19758-50', 'lb19758-51', 'lb19758-8']
Number of Sessions without data: 10
Number of Sessions with data: 13
