# <p align="center">  *objFileSys.py* Script Development </p>

This notebook keeps a record of the key stages taken to develop the different tools found within objFileSys.
<details open>
<summary>Chapters:</summary>

- [Test Dataset](#testdata)
- [View Tree w List Paths](#viewlstpaths)
- [Sort and Get Folder Paths](#folderpaths)
- [Ways to Open Nested Lists](#openlist)
- [Create a Tree of Folders](#folderstree)
- [Create a Tree of Files](#filestree)
    - [Count items in Each Folder](#folderscount)
    - [Calculate Gamma](#gamma)
    - [Create File Tree](#createfiletree)
- [Extra Tools](#extratools)
- [Traverse Tree](#traversetree)
- [Visualize Tree](#visualizetree)
    - [Visualize Whole Tree](#viswholetree)
    - [Visualize Tree Subset](#vissubtree)
- [Filter Files](#filterfiles)
- [Find Files and Folders](#finditems)
</detailts>


In [None]:
import sys
sys.path.append("..")

## Test Datasets 

<a id="testdata"></a>

In [4]:
# testDir1, testTree, testFileTree, testFolderTree, testLabels, testLabelsDic, test_sortLst
# Includes paths of directories as well 
testDir1 = ["root/file11.txt", "root/subDir11/file21.txt","root/subDir11/subDir21",
            "root/subDir11/subDir22", "root/subDir11/subDir23",
            "root/subDir11/subDir21/file31.txt", "root/subDir11/subDir21/file32.txt",
            "root/subDir11/subDir21/file33.txt","root/subDir11/subDir22/file34.txt",
            "root/subDir11/subDir22/subDir31", "root/subDir11/subDir22/subDir31/file41.txt",
            "root/subDir11/subDir22/subDir31/file42.txt", "root/subDir11/subDir22/subDir31/file43.txt",
            "root/subDir11/subDir23/file35.txt","root/subDir11/subDir23/subDir32",
            "root/subDir11/subDir23/subDir32/file44.txt", "root/subDir11/subDir23/subDir32/file45.txt",
            "root/subDir11/subDir23/subDir32/subDir41", "root/subDir11/subDir23/subDir32/subDir41/file51.txt",
            "root/subDir12", "root/subDir12/file22.txt", "root/subDir12/subDir24",
            "root/subDir12/subDir24/file36.txt", "root/subDir12/subDir24/file37.txt",
            "root/subDir12/subDir25", "root/subDir12/subDir25/file38.txt",
            "root/subDir12/subDir25/subDir33", "root/subDir12/subDir25/subDir33/file46.txt",
            "root/subDir12/subDir25/subDir33/file47.txt",
            ]

# Order: first files then folders
testTree = ["root/file11.txt", 
            "root/subDir11/file21.txt",
            "root/subDir11/subDir21/file31.txt", "root/subDir11/subDir21/file32.txt","root/subDir11/subDir21/file33.txt",
            
            "root/subDir11/subDir22/file34.txt",
            "root/subDir11/subDir22/subDir31/file41.txt", "root/subDir11/subDir22/subDir31/file42.txt", "root/subDir11/subDir22/subDir31/file43.txt",
            
            "root/subDir11/subDir23/file35.txt",
            "root/subDir11/subDir23/subDir32/file44.txt", "root/subDir11/subDir23/subDir32/file45.txt",
            "root/subDir11/subDir23/subDir32/subDir41/file51.txt",
            
            "root/subDir12/file22.txt",
            "root/subDir12/subDir24/file36.txt", "root/subDir12/subDir24/file37.txt",
            
            "root/subDir12/subDir25/file38.txt",
            "root/subDir12/subDir25/subDir33/file46.txt",
            "root/subDir12/subDir25/subDir33/file47.txt",
            ]

# Only Contains directories
testFolderTree = ["root", 
            "root/subDir11",
            "root/subDir11/subDir21",
            
            "root/subDir11/subDir22",
            "root/subDir11/subDir22/subDir31",
            
            "root/subDir11/subDir23",
            "root/subDir11/subDir23/subDir32",
            "root/subDir11/subDir23/subDir32/subDir41",
            
            "root/subDir12",
            "root/subDir12/subDir24",
            
            "root/subDir12/subDir25",
            "root/subDir12/subDir25/subDir33",
            ]

# Order: First folders then files
testFileTree = [           
            "root/subDir11/subDir21/file31.txt", "root/subDir11/subDir21/file32.txt","root/subDir11/subDir21/file33.txt",    

            "root/subDir11/subDir22/subDir31/file41.txt", "root/subDir11/subDir22/subDir31/file42.txt", "root/subDir11/subDir22/subDir31/file43.txt",
            "root/subDir11/subDir22/file34.txt",

            "root/subDir11/subDir23/subDir32/subDir41/file51.txt",
            "root/subDir11/subDir23/subDir32/file44.txt", "root/subDir11/subDir23/subDir32/file45.txt",
            "root/subDir11/subDir23/file35.txt",

            "root/subDir11/file21.txt",


            "root/subDir12/subDir24/file36.txt", "root/subDir12/subDir24/file37.txt",
            

            "root/subDir12/subDir25/subDir33/file46.txt",
            "root/subDir12/subDir25/subDir33/file47.txt",
            "root/subDir12/subDir25/file38.txt",

            "root/subDir12/file22.txt",

            "root/file11.txt", 
            ]



# Note that the first element in the ID does NOT represent the root. 
# This type of ID is used in getGamma() function, but the rest of functions
# use IDs that take [0, ] always at the start which represents the root.
testLabels = [ ["fileID", "Gamma"], # Gamma is only based on folders
            # Subdir11
                # Subdir21 
                [ [0, 0, 0, 0], [ 0 ] ], [ [0, 0, 0, 1 ], [0] ],  [ [0, 0, 0, 2], [0] ], 
                # subDir22 
                    #subDir31
                    [ [0, 0, 1, 0, 0], [0] ], [ [0, 0, 1, 0, 1], [0] ], [ [0, 0, 1, 0, 2], [0] ],
                [ [0, 0, 1, 1], [1]], 
                # subDir23
                    #subDir32
                        #subDir41
                        [ [0, 0, 2, 0, 0, 0], [0] ],
                    [ [0, 0, 2, 0, 1], [1] ], [ [0, 0, 2, 0, 2], [1] ],  
                [ [0, 0, 2, 1], [2] ],
            [ [0, 0, 3], [0] ],

            #subDir12
                #subDir24
                [ [0, 1,0,0], [3]], [ [0, 1,0,1], [3]],
                #subDir25
                    #subDir33
                    [ [0, 1,1,0,0], [2] ], [ [0, 1, 1, 0, 1], [2] ],                 
                [ [0, 1,1,1], [4] ],
            [ [0, 1, 2], [1]],
            
            # File11.txt
            [ [0, 2], [0] ],              

               ]

testLabelsDic = { key: value for key, value in zip( ["path"]+ testFileTree, testLabels )}
print(f"Test Label Dic Keys: {list(testLabelsDic.keys())}" )
print(f"Test Label Dic Values: {list(testLabelsDic.values())}" )

# List that represents an unordered list of file paths, to be ordered alphabetically
test_sortLst = ["A/A/A/A/.file", "A/A/A/B/.file", "A/A/A/C/.file", "A/A/A/D/.file",
                 "A/A/B/.file",
                 "A/A/C/A/.file", "A/A/C/B/.file", "A/A/C/C/.file",
                 
                 "A/B/A/A/.file","A/B/A/B/.file","A/B/A/C/.file",
                 "A/B/B/A/.file", "A/B/B/B/.file",
                  "A/B/C/.file", 
                  
                  "A/C/.file",
                  
                  "B/C/D/.file", "B/C/C/.file", "B/C/B/.file", "B/C/A/.file",
                  "B/A/A/A/.file", "B/A/A/C/.file", "B/A/B/.file", "B/A/A/B/.file",
                   "B/B B/.file", ]

Test Label Dic Keys: ['path', 'root/subDir11/subDir21/file31.txt', 'root/subDir11/subDir21/file32.txt', 'root/subDir11/subDir21/file33.txt', 'root/subDir11/subDir22/subDir31/file41.txt', 'root/subDir11/subDir22/subDir31/file42.txt', 'root/subDir11/subDir22/subDir31/file43.txt', 'root/subDir11/subDir22/file34.txt', 'root/subDir11/subDir23/subDir32/subDir41/file51.txt', 'root/subDir11/subDir23/subDir32/file44.txt', 'root/subDir11/subDir23/subDir32/file45.txt', 'root/subDir11/subDir23/file35.txt', 'root/subDir11/file21.txt', 'root/subDir12/subDir24/file36.txt', 'root/subDir12/subDir24/file37.txt', 'root/subDir12/subDir25/subDir33/file46.txt', 'root/subDir12/subDir25/subDir33/file47.txt', 'root/subDir12/subDir25/file38.txt', 'root/subDir12/file22.txt', 'root/file11.txt']
Test Label Dic Values: [['fileID', 'Gamma'], [[0, 0, 0, 0], [0]], [[0, 0, 0, 1], [0]], [[0, 0, 0, 2], [0]], [[0, 0, 1, 0, 0], [0]], [[0, 0, 1, 0, 1], [0]], [[0, 0, 1, 0, 2], [0]], [[0, 0, 1, 1], [1]], [[0, 0, 2, 0, 0, 0], 

---

<a id="viewlstpaths"></a>
## View List Paths

Visualize File System w List of Paths. This will give the ground truth to compare other file trees.

In [169]:
# def viewLstPathS(paths, maxlevel = 10):

# Written by chatGPT

from anytree import Node, RenderTree
def viewLstPathS(paths, maxlevel = 10):

    # Root node
    root = Node("root")

    # List of paths (simulating file system paths)
    # paths = lstObjectKeys

    # Dictionary to track created nodes
    nodes = {"": root}

    # Loop through paths to build the tree
    for path in paths:
        parts = path.split("/")
        current_path = ""

        for part in parts:
            current_path = f"{current_path}/{part}".strip("/")
            if current_path not in nodes:
                parent_path = "/".join(current_path.split("/")[:-1])
                nodes[current_path] = Node(part, parent=nodes[parent_path])

    # Display tree
    for pre, fill, node in RenderTree(root, maxlevel = maxlevel): 
        print(f"{pre}{node.name}")

In [170]:
# Run Function: Visualize File Tree
viewLstPathS(testFileTree)

root
└── root
    ├── subDir11
    │   ├── subDir21
    │   │   ├── file31.txt
    │   │   ├── file32.txt
    │   │   └── file33.txt
    │   ├── subDir22
    │   │   ├── subDir31
    │   │   │   ├── file41.txt
    │   │   │   ├── file42.txt
    │   │   │   └── file43.txt
    │   │   └── file34.txt
    │   ├── subDir23
    │   │   ├── subDir32
    │   │   │   ├── subDir41
    │   │   │   │   └── file51.txt
    │   │   │   ├── file44.txt
    │   │   │   └── file45.txt
    │   │   └── file35.txt
    │   └── file21.txt
    ├── subDir12
    │   ├── subDir24
    │   │   ├── file36.txt
    │   │   └── file37.txt
    │   ├── subDir25
    │   │   ├── subDir33
    │   │   │   ├── file46.txt
    │   │   │   └── file47.txt
    │   │   └── file38.txt
    │   └── file22.txt
    └── file11.txt


---

<a id="folderpaths"></a>
## Process paths into ordered folder paths

In [6]:
# def sortPaths(lstOfPath)
def sortPaths(pathsLst):
    """
    Sorts a list of paths in a hierarchical manner, ensuring that parent directories are ordered first before their subdirectories.
    Args:
    - pathsLst (list): Each element corresponds to a string path of format "root/Dir1/SubDir2/...".
    Note:
    Make sure the formating of each path is so that it is divided by "/" and that there are no "/" at the beginning or end
    """

    # Split into steps of the path
    foldersLst = [ ( len( path.split("/")) , path.split("/") ) for path in pathsLst ]

    # Sort by size so that the larger paths come first
    foldersLst.sort( reverse = True, key= lambda x: x[0])

    # Get a Dict with the sizes of paths and number of corresponding paths in descending order
    # to match 
    foldersSizes = [ path[0] for path in foldersLst ]
    foldersUniqSizes = list(set(foldersSizes))
    foldersUniqSizes.sort(reverse=True)
    foldersSizesDic = { size : foldersSizes.count( size ) for size in foldersUniqSizes }
    foldersLst = [ path[1] for path in foldersLst ]


    largestPath = list( foldersSizesDic.keys() )[ 0 ]
    orderedFolderLst = foldersLst
    currentNumberofPaths = 0 
    lstAllPathSizes = list( foldersSizesDic.keys() )
    for size in range( largestPath, 0, -1 ):
        if size in lstAllPathSizes:
            currentNumberofPaths += foldersSizesDic[size] 

        startSubList = orderedFolderLst[ : currentNumberofPaths]
        endSubList = orderedFolderLst[currentNumberofPaths: ]

        startSubList.sort(key = lambda x: x[ size - 1] )

        orderedFolderLst = startSubList + endSubList

    orderedFolderLst = [  "/".join( path ) for path in orderedFolderLst ]

    return orderedFolderLst


In [172]:
# Test 1 -> test_sortLst
lstOfPath = test_sortLst
foldersLst = list( set( [ "/".join( path.split("/")[ :-1 ] ) for path in lstOfPath ] ) ) 
orderedFolderLst = sortPaths( foldersLst )
for folderPath in orderedFolderLst: print( folderPath )

A/A/A/A
A/A/A/B
A/A/A/C
A/A/A/D
A/A/B
A/A/C/A
A/A/C/B
A/A/C/C
A/B/A/A
A/B/A/B
A/B/A/C
A/B/B/A
A/B/B/B
A/B/C
A/C
B/A/A/A
B/A/A/B
B/A/A/C
B/A/B
B/B B
B/C/A
B/C/B
B/C/C
B/C/D


In [173]:
# Test 2  -> testTree
lstOfPath = testTree
foldersLst = list( set( [ "/".join( path.split("/")[ :-1 ] ) for path in lstOfPath ] ) ) 
orderedFoldersLst = sortPaths( foldersLst )
orderedFilesLst = sortPaths( lstOfPath )
print("Sort Folders:\n------------------------")
for folderPath in orderedFoldersLst: print( folderPath )
print("Sort Files:\n------------------------")
for filePath in orderedFilesLst: print( filePath )

Sort Folders:
------------------------
root/subDir11/subDir21
root/subDir11/subDir22/subDir31
root/subDir11/subDir22
root/subDir11/subDir23/subDir32/subDir41
root/subDir11/subDir23/subDir32
root/subDir11/subDir23
root/subDir11
root/subDir12/subDir24
root/subDir12/subDir25/subDir33
root/subDir12/subDir25
root/subDir12
root
Sort Files:
------------------------
root/file11.txt
root/subDir11/file21.txt
root/subDir11/subDir21/file31.txt
root/subDir11/subDir21/file32.txt
root/subDir11/subDir21/file33.txt
root/subDir11/subDir22/file34.txt
root/subDir11/subDir22/subDir31/file41.txt
root/subDir11/subDir22/subDir31/file42.txt
root/subDir11/subDir22/subDir31/file43.txt
root/subDir11/subDir23/file35.txt
root/subDir11/subDir23/subDir32/file44.txt
root/subDir11/subDir23/subDir32/file45.txt
root/subDir11/subDir23/subDir32/subDir41/file51.txt
root/subDir12/file22.txt
root/subDir12/subDir24/file36.txt
root/subDir12/subDir24/file37.txt
root/subDir12/subDir25/file38.txt
root/subDir12/subDir25/subDir33/fi

### Test if all paths respect assumptions to be processed properly
- To be done


---

<a id=openlist></a>
## Ways to Open Nested Lists

- pinchLevel(tree, depth, flat = False)

In [None]:
# Iteratively
def get_item_depth( lst, depth):
    """
    Gets all the items in a specific tree depth
    Args:
    - Depth (int): Represents the non-zero indexed level
    """
    for _ in range( depth - 1): # -1 bc it already start at level one, and at the return it opens one extra level
        lst = lst[-1]
    return lst[:-1]

In [None]:
# Recursively
def openNestedLst(lst, maxDepth , depth = 0  ):
    """
    Tool used to open nested lists in the form: [ ..., [nested list] ]
    """
    depth += 1
    if depth == maxDepth:
        return lst[:-1]
    return openNestedLst( lst[-1], maxDepth, depth = depth )

In [None]:
# Uses get_item_depth
def pinchLevel(tree, depth : int, flat = False ):
    """
    Get the names of all the folders and files present in a specific level of the file system
    Args:
    - tree:
    - depth: Level to extract folders from
    - flat (bool): If True - gives you a list of all folders, If False - gives you a list of lists, each list
                    representing a folder from the preceding level
    Note:
    - This is not zero indexed, to the first depth = 1, which represents the root, or the first tree[:-1] without going down a level (tree[-1])
    """
    def flatten_list( lstLst):
        """Flatten a list of lists"""
        return [item for lst in lstLst for item in lst ]

    if flat: return flatten_list( get_item_depth( tree, depth  ) )
    else: return get_item_depth( tree, depth  ) 

In [None]:
# Test pinchLevel()
print( pinchLevel( fileTree, 2, flat = False ))

---

<a id="folderstree"></a>
## Convert list of Folders into Nested List of Lists

In [2]:
# def createFolderTree(lstFolderPaths, delimiter = "/")
def createFolderTree( lstFolderPaths, delimiter = "/"): #
    """
    Args:
    - treeMaxDepth: Max number of directories to tresspass to reach a file
    """
    folderTree = [ [], [ [] ] ] 
    treeMaxDepth = 0
    folderIDS = []
    for path in lstFolderPaths:
        parts = path.split( delimiter)   #  "root/subDir11/subDir21".split("/")
        if len( parts ) > treeMaxDepth: # Get Tree Size Dynamically
            treeMaxDepth = len( parts )

        tree = folderTree
        folderID = []
        for part in  parts :     
            if part not in tree[ -2 ]:  # Since it is in alphabetical order, it will either exist in the last folder or a new folder must be created
                tree[-2].append( part)
                tree[ -1 ].insert( -1, [] )
                if tree[ -1 ][-1] == []: # This means that this level ( tree[-1] )have not yet been reached
                    tree[ -1 ][-1].append( [] ) # Dynamically increase the tree
            else: pass
            folderID.append( len( tree[-2] ) - 1 )
            tree =  tree[ -1 ] # Peel one layer of the nested list
        
        folderIDS.append( folderID ) #( folderID ) -> accounts for root | (folderID[1:]) -> Does not take root representation

    return folderTree, treeMaxDepth, folderIDS

In [7]:
# Test function - createFolderTree() - to see if it creates tree - folderTree
from scripts.filesUtils import loadPickle
import os

# Test 1
foldersLst =list( set(  [  "/".join( filePath.split("/")[ :-1 ]) for filePath in testTree ] ))# Use testTree

# # Test 2
# lstObjectKeys = loadPickle( os.path.join( "sandbox", "Boto3", "XChemFiles", "frag1.pkl" ) )
# lstOfPaths = lstObjectKeys[:100000]
# foldersLst =list( set(  [  "/".join( filePath.split("/")[ :-1 ]) for filePath in lstOfPaths ] ))


orderedFoldersLst = sortPaths( foldersLst ) 
folderTree, folderTreeMaxDepth, folderIDS = createFolderTree( orderedFoldersLst )

print("Resulting Folder Tree:\n")
tree = folderTree
switch = True
count = 0
while switch:
    count += 1
    print(count, " ", tree[:-1])
    tree= tree[-1]
    if tree == []:
        switch = False

# Expected resulting list: [ ["root"], 
#                         [ ["subDir11", "subDir12", files], 
#                       [ ["subDir21", "subDir22", "subDir23", files], ["subDir24", "subDir25", files],
#                      [ [], ["subDir31", files ] , ["subDir32", files], [], ["subDir33", files], 
#               [ [], ["subDir41", files], [], 
#               [ [] ,  
#               [ [] ]  ]    ]     ]   ]     ]        ]

Resulting Folder Tree:

1   [['root']]
2   [['subDir11', 'subDir12']]
3   [['subDir21', 'subDir22', 'subDir23'], ['subDir24', 'subDir25']]
4   [[], ['subDir31'], ['subDir32'], [], ['subDir33']]
5   [[], ['subDir41'], []]
6   [[]]
7   []


In [178]:
# Test 2 -> see if calculates folder IDs and tree Depths properly

# Get Dictionary with known folder IDS
folderIDDic = {  "/".join( key.split("/")[ :-1 ]) : value  for key, value in testLabelsDic.items() }

print( folderIDDic )
orderedFolderDic = { key: [ folderIDDic[key][0][:-1],  folderIDDic[key][1] ]  for key in orderedFoldersLst}
print(f"Ground Truth orderedFolderDic Keys:", orderedFolderDic.keys())
print("Ground Truth orderedFolderDic Values:", orderedFolderDic.values())
print("----------------------------")


print( f"Maximum Tree Depth: {folderTreeMaxDepth}\n")
print(f"Folder IDs: {folderIDS}")
print(f"Ground Truth: {[ pair[0] for pair in orderedFolderDic.values()]}")
print(f"The generated folder IDs match the ground truth: {folderIDS == [ pair[0] for pair in orderedFolderDic.values()] } ")



{'': ['fileID', 'Gamma'], 'root/subDir11/subDir21': [[0, 0, 0, 2], [0]], 'root/subDir11/subDir22/subDir31': [[0, 0, 1, 0, 2], [0]], 'root/subDir11/subDir22': [[0, 0, 1, 1], [1]], 'root/subDir11/subDir23/subDir32/subDir41': [[0, 0, 2, 0, 0, 0], [0]], 'root/subDir11/subDir23/subDir32': [[0, 0, 2, 0, 2], [1]], 'root/subDir11/subDir23': [[0, 0, 2, 1], [2]], 'root/subDir11': [[0, 0, 3], [0]], 'root/subDir12/subDir24': [[0, 1, 0, 1], [3]], 'root/subDir12/subDir25/subDir33': [[0, 1, 1, 0, 1], [2]], 'root/subDir12/subDir25': [[0, 1, 1, 1], [4]], 'root/subDir12': [[0, 1, 2], [1]], 'root': [[0, 2], [0]]}
Ground Truth orderedFolderDic Keys: dict_keys(['root/subDir11/subDir21', 'root/subDir11/subDir22/subDir31', 'root/subDir11/subDir22', 'root/subDir11/subDir23/subDir32/subDir41', 'root/subDir11/subDir23/subDir32', 'root/subDir11/subDir23', 'root/subDir11', 'root/subDir12/subDir24', 'root/subDir12/subDir25/subDir33', 'root/subDir12/subDir25', 'root/subDir12', 'root'])
Ground Truth orderedFolderDic

<a id ="filestree"></a>
## Populate List of Folders with Files

<a id="folderscount"></a>

- ### Get Folders Count from folders Tree

In [None]:
# def countFolders(folderTree)

def countFolders(folderTree, folderTreeMaxDepth):
    foldersCount = []

    tree = folderTree
    for _ in range( folderTreeMaxDepth + 1 ):
        foldersCount.append( [ len( lst) for lst in tree[ :-1] ] )
        tree = tree[-1] # Peel one layer of the nested list
    return foldersCount # foldersCount[1:] -> does NOT take the root representation

In [15]:
# Test function - countFolders()

foldersCountGroundTruth = [ [1], [2], [3, 2], [0, 1, 1, 0, 1], [0, 1, 0, ], [0] ]
foldersCount = countFolders( folderTree, folderTreeMaxDepth )
print(f"Folders Count: { foldersCount }")
print( f"Ground Truth:{foldersCountGroundTruth}")

Folders Count: [[1], [2], [3, 2], [0, 1, 1, 0, 1], [0, 1, 0], [0]]
Ground Truth:[[1], [2], [3, 2], [0, 1, 1, 0, 1], [0, 1, 0], [0]]


- ### Open the level

In [19]:
def pinchLevel(tree, depth : int, flat = False ):
    """
    Get the names of all the folders and files present in a specific level of the file system
    Args:
    - tree:
    - depth: Level to extract folders from
    - flat (bool): If True - gives you a list of all folders, If False - gives you a list of lists, each list
                    representing a folder from the preceding level
    Note:
    - This is not zero indexed, to the first depth = 1, which represents the root, or the first tree[:-1] without going down a level (tree[-1])
    """
    def flatten_list( lstLst):
        """Flatten a list of lists"""
        return [item for lst in lstLst for item in lst ]

    def get_item_depth( lst, depth):
        """
        Gets all the items in a specific tree depth
        Args:
        - Depth (int): Represents the non-zero indexed level
        """
        for _ in range( depth - 1): # -2 bc it already start at level one, and at the return it opens one extra level
            lst = lst[-1]
        return lst[:-1]

    if flat: return flatten_list( get_item_depth( tree, depth  ) )
    else: return get_item_depth( tree, depth  ) 

In [30]:
def openNestedLst(lst, maxDepth , depth = 0  ):
    """
    Tool used to open nested lists in the form: [ ..., [nested list] ]
    """
    depth += 1
    if depth == maxDepth:
        return lst[:-1]
    return openNestedLst( lst[-1], maxDepth,  depth = depth)

level = 6
print( openNestedLst( folderTree, level) )
pinchLevel( folderTree, level)

[[]]


[[]]

<a id="gamma"></a>

- ### Get Gamma - w help of folders count


#### Introduction to the concept of Gamma and folder IDs
<details>
<summary>  </summary>

There are two ways a file and folder can be represented in a list of lists: "folder / file ID" and "folder / file treeList index".

- Folder / File ID
    - Each number represents the index of the folder to open in that level. Note that python does zero indexing.
    - Example: ( 0, 1, 0, 2 )
        - Select the <ins>1st folder</ins> (**0**) in the root node,
        - then the <ins>2nd subfolder</ins> (**1**) in it, 
        - the <ins>1st subsubfolder</ins> (**0**) inside it, 
        - Lastly the <ins>3rd subsubsubfolder / file</ins> (**2**). 
- File Tree Index
    - In the file tree we get access to all the folders found in a level. However, there is no direct way to connect a current subfolder with their previous folder.
    - Example: How to access the <ins>2nd</ins> (1) folder in the <ins>3rd</ins> level ([-1][-1]):
        -  treelist[-1][-1][1] 

Now it is hard to know exactly which subfolder we want based on its index at its level. There should be a way to convert the folder ID to a way to access it:

### **Formula for folder and file IDs**

$$ 
File ID = ( 1, 2, 1)\\
X_N = (X_1, X_2, X_3, ... , X_n),\\ 
$$

$$
Folder Counter = ( (1), (1), (3), (1,1,1), ...)\\
Y_{N,K} = ( (Y_{1,1}) , ( Y_{2,1} ), ( Y_{3,1}), ( Y_{4,1}, Y_{4,2}, Y_{4,3}), ..., Y_{n,k}) \\
$$

where:<br/>
- File ID (zero indexed), represents which directory to go through 
- n represents the level in the file system <br/>
- k represents the folder a specific file level (zero indexed) <br/>

<ins> Formula </ins>

$$File \: in \: nested \: Files\:  =  File[-1]...[-1]_{n-2}[\gamma_{n-1}][\omega]$$

$$\omega = X_n$$

$$  \gamma_{n-1} = \sum_{i=1}^{n-1} \gamma_i \qquad where:\\
 \gamma_1 = X_1 \: and \: \gamma_n = \sum_{i=0}^{\gamma_{n-1} -1} Y_{n,i} + X_n
 $$ 

</details>

In [17]:
# def getGamma(foldersCount, folderID):

def getGamma(foldersCount, folderID): 
    folderID = folderID #[1:]-> does not take the root representation # [] = 0 (supra root), [0] = 0 (root), [0, ]
    gamma = 0
    for count, ID in zip( foldersCount, folderID):
        gamma = sum( count[ : gamma] ) + ID
    return gamma

# Gamma Represents the index of the folder to open at the level of the folder
# At level 1 -> this represents a supra root with its content as the root. So you need an index (gamma) that selects the supra root
# At level 2 -> this represents the root so you need an index (gamma) that represents the root
# At level 3 ->  

print( "Supra Root Content - Level 1:\t" , pinchLevel( folderTree, 1, flat = False) )
print( "Root Content - Level 2:\t\t", pinchLevel( folderTree, 2, flat = False) )
print("Content found in Level 3:\t",  pinchLevel( folderTree, 3, flat = False) )

Supra Root Content - Level 1:	 [['root']]
Root Content - Level 2:		 [['subDir11', 'subDir12']]
Content found in Level 3:	 [['subDir21', 'subDir22', 'subDir23'], ['subDir24', 'subDir25']]


In [18]:
print(f"Ground Truth orderedFolderDic Keys:", orderedFolderDic.keys())
print("Ground Truth orderedFolderDic Values:", orderedFolderDic.values())

Ground Truth orderedFolderDic Keys: dict_keys(['root/subDir11/subDir21', 'root/subDir11/subDir22/subDir31', 'root/subDir11/subDir22', 'root/subDir11/subDir23/subDir32/subDir41', 'root/subDir11/subDir23/subDir32', 'root/subDir11/subDir23', 'root/subDir11', 'root/subDir12/subDir24', 'root/subDir12/subDir25/subDir33', 'root/subDir12/subDir25', 'root/subDir12', 'root'])
Ground Truth orderedFolderDic Values: dict_values([[[0, 0, 0], [0]], [[0, 0, 1, 0], [0]], [[0, 0, 1], [1]], [[0, 0, 2, 0, 0], [0]], [[0, 0, 2, 0], [1]], [[0, 0, 2], [2]], [[0, 0], [0]], [[0, 1, 0], [3]], [[0, 1, 1, 0], [2]], [[0, 1, 1], [4]], [[0, 1], [1]], [[0], [0]]])


In [19]:
# Test function - getGamma() 
folderID = [ 0, 1, 1]

testResult = [ getGamma( foldersCount, ID ) for ID in folderIDS]#[1:] ]
print(f"Gamma Results:{testResult}")
print(f"Ground Truth:  { [ pair[1][0] for pair in orderedFolderDic.values() if pair[1] != []]}")
print( getGamma( foldersCount, folderID ))

Gamma Results:[0, 0, 1, 0, 1, 2, 0, 3, 2, 4, 1, 0]
Ground Truth:  [0, 0, 1, 0, 1, 2, 0, 3, 2, 4, 1, 0]
4


- ### Get folder content - from its gamma and tree depth

In [180]:
rabbitHole(folderTree, folderTreeMaxDepth )

[['root']]
[['subDir11', 'subDir12']]
[['subDir21', 'subDir22', 'subDir23'], ['subDir24', 'subDir25']]
[[], ['subDir31'], ['subDir32'], [], ['subDir33']]
[[], ['subDir41'], []]


In [20]:
folderID = [ 0  ] # Try interactively to alter ID starting on [ ]
level = len( folderID )
gamma = getGamma( foldersCount, folderID)#[1:])
print( pinchLevel( folderTree, level + 1, flat = False )[gamma] )

# Test if can alter tree
# pinchLevel( folderTree, level + 1, flat = False )[gamma].append( "test" )
# print( pinchLevel( folderTree, level + 1, flat = False )[gamma] )

['subDir11', 'subDir12']


In [53]:
#  getFolderContent( folderTree, gamma, fileDepth = 10, count = -1)
def getFolderContent( folderTree, gamma, fileDepth = 10, count = -1):
    count += 1
    # print( folderTree)
    if count == fileDepth:
        return folderTree[ gamma ]
    return getFolderContent( folderTree[-1], gamma, fileDepth = fileDepth, count = count )


<a id="createfiletree"></a>

- ### Populate Tree with files based on Gamma

In [45]:
# def createFileTree(lstOfPath, foldersLst, folderTree, folderIDS, foldersCount)

from collections import defaultdict
from copy import deepcopy

def createFileTree( lstOfPath, orderedFoldersLst, folderTree, folderIDS, foldersCount ):
    """
    Args:
    - lstOfPath (list): list of path strings with the right formatting
    - orderedFoldersLst (list): list of folders ordered in alphabetical order hierarchically
    - folderTree( list ): Nested list representing a file system expressed in the lstOfPaths
    - folderIDS ( list ): ID locating each folder in the file system (w/o root representation )
    - foldersCount ( list): List represent the number of folders found inside each parent folder for each level

    """
    # Get a sorted folderID : list( Files )  Dictionary
    folderFileDict = defaultdict( list )
        # Get Folder Path for each File:
    for folderPath, fileName in zip(  [ "/".join( path.split("/")[:-1])  for path in lstOfPath], [path.split("/")[-1] for path in lstOfPath ] ):
        folderFileDict[ folderPath].append( fileName)
        # Sort the files according to an already sorted orderedFolderLst
    sortedFolderFileDict = { key: folderFileDict[key] for key in orderedFoldersLst }
        # Assign the Folder IDs to the ordered Folders with corresponding files
    folderIDFileDict = { tuple( ID ) : path for ID, path in zip( folderIDS, sortedFolderFileDict.values( ) )}

    fileTree = deepcopy( folderTree )
    for folderID, fileNameS in folderIDFileDict.items():
        gamma = getGamma( foldersCount, folderID ) #[1:] )
        for fileName in fileNameS:
            # file = path.split("/")[-1]
            pinchLevel( fileTree, len(folderID) + 1 , flat = False )[gamma].append( fileName ) # level + 1
            # getFolderContent( fileTree, gamma, fileDepth = len(folderID) + 1  ).append( fileName)
    return fileTree

In [46]:
# def rabbitHole( tree, treeMaxDepth, count = 0) - to print the content of the nested list at each level
def rabbitHole( tree, treeMaxDepth, count = 0):
    count += 1
    print(tree[:-1])
    if count == treeMaxDepth:
        return None
    return rabbitHole( tree[-1], treeMaxDepth,  count = count)


In [47]:
# Reset Tree
foldersLst = list( set( [ "/".join( path.split("/")[ :-1 ] ) for path in testTree ] ) ) 
orderedFolderLst = sortPaths( foldersLst )
# print("success")
folderTree, folderTreeMaxDepth, foldersIDS = createFolderTree( orderedFolderLst ) # 1 min
# print("success")
foldersCount = countFolders( folderTree, folderTreeMaxDepth ) 

In [48]:
# Test createFileTree() w rabbitHole()
fileTree = createFileTree( lstOfPath, orderedFoldersLst, folderTree, folderIDS, foldersCount )
rabbitHole(fileTree, folderTreeMaxDepth + 1)


[['root']]
[['subDir11', 'subDir12', 'file11.txt']]
[['subDir21', 'subDir22', 'subDir23', 'file21.txt'], ['subDir24', 'subDir25', 'file22.txt']]
[['file31.txt', 'file32.txt', 'file33.txt'], ['subDir31', 'file34.txt'], ['subDir32', 'file35.txt'], ['file36.txt', 'file37.txt'], ['subDir33', 'file38.txt']]
[['file41.txt', 'file42.txt', 'file43.txt'], ['subDir41', 'file44.txt', 'file45.txt'], ['file46.txt', 'file47.txt']]
[['file51.txt']]


In [23]:
# Test internal working of createFileTree()

from collections import defaultdict
from copy import deepcopy

lstOfPath = testTree

folderFileDict = defaultdict( list )
for folderPath, filePath in zip(  [ "/".join( path.split("/")[:-1])  for path in lstOfPath], [path for path in lstOfPath ] ):
    folderFileDict[ folderPath].append( filePath)
sortedFolderFileDict = { key: folderFileDict[key] for key in orderedFoldersLst }
folderIDFileDict = { tuple( ID ) : path for ID, path in zip( folderIDS, sortedFolderFileDict.values( ) )}



print("Sorted Folder File Dictionary: \n")
for item in sortedFolderFileDict.items(): print( item )
print(f"\nSordted FolderID File Dictionary: \n")
for item in folderIDFileDict.items(): print( item )



# Test gamma and get folderID
folderID = [1,1] # Choose interactively different values, start with []
print(f"Folder ID: {folderID}")
gamma = getGamma( foldersCount,folderID )
print( f"Gamma: {gamma}" )
print(f"Folder Content: {getFolderContent( folderTree, gamma, fileDepth = len(folderID) + 1  )}")




Sorted Folder File Dictionary: 

('root/subDir11/subDir21', ['root/subDir11/subDir21/file31.txt', 'root/subDir11/subDir21/file32.txt', 'root/subDir11/subDir21/file33.txt'])
('root/subDir11/subDir22/subDir31', ['root/subDir11/subDir22/subDir31/file41.txt', 'root/subDir11/subDir22/subDir31/file42.txt', 'root/subDir11/subDir22/subDir31/file43.txt'])
('root/subDir11/subDir22', ['root/subDir11/subDir22/file34.txt'])
('root/subDir11/subDir23/subDir32/subDir41', ['root/subDir11/subDir23/subDir32/subDir41/file51.txt'])
('root/subDir11/subDir23/subDir32', ['root/subDir11/subDir23/subDir32/file44.txt', 'root/subDir11/subDir23/subDir32/file45.txt'])
('root/subDir11/subDir23', ['root/subDir11/subDir23/file35.txt'])
('root/subDir11', ['root/subDir11/file21.txt'])
('root/subDir12/subDir24', ['root/subDir12/subDir24/file36.txt', 'root/subDir12/subDir24/file37.txt'])
('root/subDir12/subDir25/subDir33', ['root/subDir12/subDir25/subDir33/file46.txt', 'root/subDir12/subDir25/subDir33/file47.txt'])
('root

## Put all functions together to create a file Tree from a list of paths:

In [179]:
# def createTree(lstOfPath)


def createTree(lstOfPathS ):
    # lstOfPath  # Input
    # print("success")
    foldersLst = list( set( [ "/".join( path.split("/")[ :-1 ] ) for path in lstOfPathS ] ) ) 
    orderedFolderLst = sortPaths( foldersLst )
    # print("success")
    folderTree, folderTreeMaxDepth, foldersIDS = createFolderTree( orderedFolderLst ) # 1 min
    # print("success")
    foldersCount = countFolders( folderTree, folderTreeMaxDepth ) 
    # print("success")
    fileTree = createFileTree(lstOfPathS, orderedFolderLst, folderTree, foldersIDS, foldersCount) # 7 min 
    # print("success")
    return fileTree, folderTree, folderTreeMaxDepth,  foldersIDS, foldersCount, foldersLst

# 8m 43 for 10 000 877 paths
# lstObjectKeys = loadPickle( os.path.join( "sandbox", "Boto3", "XChemFiles", "frag1.pkl" ) )[1:]
# fileTree, folderTree, folderTreeMaxDepth,  foldersIDS, foldersCount, foldersLst = createTree( lstObjectKeys )


In [181]:
# Test createTree()
from scripts.filesUtils import loadPickle
import os

# lstOfPaths = testTree
lstObjectKeys = loadPickle( os.path.join( "sandbox", "Boto3", "XChemFiles", "frag1.pkl" ) )
lstOfPaths = lstObjectKeys[:100000]

fileTree, folderTree, folderTreeMaxDepth,  foldersIDS, foldersCount, foldersLst = createTree( lstOfPaths )
rabbitHole(fileTree, folderTreeMaxDepth +1  )

# Times
# 100 000       6 secs
# 1 000 000     41 secs
# 10 000 000    523 secs
# Bottleneck -> createFileTree()

[['data']]
[['2015']]
[['lb13308-1', 'lb13308-2', 'lb13320-1']]
[['processing'], ['processing'], ['processing']]
[['old_processing_prior_to_20160520', 'reference', 'transfer_pandda_data.done'], ['README.reprocessing'], ['AR_compounds_pdb', 'TeXRank', 'allDatasets', 'analysis', '2018072014.txt', '2018072014.txt.info.csv', '2018081517.txt', '2018081517.txt.info.csv', '2019100110.txt', '2019100110.txt.info.csv', 'README.reprocessing', 'STDIN.e18931570', 'STDIN.o18931570', 'XChemExplorer_settings.pkl']]
[['analysis', 'database', 'lab36_old', 'pandda', 'pandda-original.eff', 'pandda.eff', 'pandda.eff~'], ['3GFT.pdb', '3GFTEdit.pdb'], ['A01_P1_acedrg1.cif', 'A01_P1_acedrg1.pdb', 'A01_P2_acedrg1.cif', 'A01_P2_acedrg1.pdb', 'A02_P1_acedrg1.cif', 'A02_P1_acedrg1.pdb', 'A02_P2_acedrg1.cif', 'A02_P2_acedrg1.pdb', 'A03_P1_acedrg1.cif', 'A03_P1_acedrg1.pdb', 'A03_P2_acedrg1.cif', 'A03_P2_acedrg1.pdb', 'A04_P1_acedrg1.cif', 'A04_P1_acedrg1.pdb', 'A04_P2_acedrg1.cif', 'A04_P2_acedrg1.pdb', 'A05_P1_ac

---

### Get Folder Index (Incomplete)

In [None]:
path = "root/subDir12/subDir25/subDir33/file46.txt"
parts = path.split( "/" )
print( len( parts) )
countdown = len( parts )
tree = folderTree
currentFolder = 0
for idx, part in enumerate(  len(parts) ):
    currentFolder = tree[0].index( part )

---


<a id=extratools></a>
## Extra tools
### Tools to browse with a path

- getGammawPath(folderPath, tree, foldersCount, targetDepth = None)


In [None]:
def getGammawPath(path, tree, foldersCount, ):

    pathParts = path.split("/") 

    pointer = {"gamma": 0, "id": 0 }  # gamma represents the index location of which supra folder does the target folder live in | id represents the index of where the folder is within the supra folder

    for levelFoldersCount, currentDir in zip( foldersCount,  pathParts ):
        
        if currentDir in tree[:-1][ pointer["gamma"] ]:
            pointer["id"] = tree[:-1][ pointer["gamma"] ].index( currentDir ) # Find id for current folder
            pointer["gamma"] = sum( levelFoldersCount[ : pointer[ "gamma" ]  ] ) + pointer["id"] # Update gamma for next level based on foldersCount
            
            tree = tree[-1]
        else:
            print( f"Could not find \"{currentDir}\" directory.")
            
            return None


    return  pointer["gamma"] # , folderID


## V1 ############ Does not work if there are two folders of the same name in different suprafolders at the same level (e.g. [ [compounds, proteins, dataset1.log ], [compounds, proteins, dataset2.log ] ])

# def getGammawPath(folderPath, tree, foldersCount, targetDepth = None):
#     """
#     Function allows to obtain the gamma of a folder based on the base of said folder
#     Args:
#     - folderPath (str): Can either be an absolute path or the name of the folder to open (see target Depth) 
#     - targetDepth: If None: folderPath must correspond to an absolute path. 
#                    If int: folderPath corresponds to directory name and depth its level        
#     """
#     if not targetDepth:
#         pathSplit = folderPath.split( "/" )
#         targetDepth = len( pathSplit )
#         targetFolder = pathSplit[-1]
#     else: targetFolder = folderPath
    
#     directory = pinchLevel( tree, targetDepth, flat = False) 
    
#     if targetDepth < 2:
#         return print( "This directory is too shallow to calculate gamma")

#     gamma = 0
#     for x, supraFolder in enumerate( directory):
#         for y, subFolders in enumerate( supraFolder):
#             if subFolders == targetFolder:
#                 gamma = sum( foldersCount[ targetDepth - 2 ][:x]) + y
#                 return gamma

In [7]:
# Test
import sys
sys.path.append("..")

from tests.testFileSys import testTree, testLabelsDic
from scripts.objFileSys import createTree
### Input
# targetFolder = "root"
# targetDepth = None # 
# fileTree = fileTree
# foldersCount = foldersCount
###

fileTree, folderTree, folderTreeMaxDepth,  foldersIDS, foldersCount, foldersLst = createTree( testTree )

print(testLabelsDic)
input = list( testLabelsDic.keys() )[1:]
inputFolders = [ "/".join( path.split("/")[ :-1] ) for path in input ]
expectedOutput = [ gamma[1] for  gamma in list( testLabelsDic.values() )[1:]  ]
print( "Calc Gamma:    ", [ [ getGammawPath( folderPath, fileTree, foldersCount ) ] for folderPath in inputFolders ] )
print( "Expected Result:", expectedOutput)

# print("Gamma ",gamma)
# print( "Directory", directory)
# print( "Count", foldersCount[1])
# var = getFolder( fileTree, gamma, targetDepth )

# pinchLevel( fileTree, 4, flat = False)[gamma]

{'path': ['fileID', 'Gamma'], 'root/subDir11/subDir21/file31.txt': [[0, 0, 0], [0]], 'root/subDir11/subDir21/file32.txt': [[0, 0, 1], [0]], 'root/subDir11/subDir21/file33.txt': [[0, 0, 2], [0]], 'root/subDir11/subDir22/subDir31/file41.txt': [[0, 1, 0, 0], [0]], 'root/subDir11/subDir22/subDir31/file42.txt': [[0, 1, 0, 1], [0]], 'root/subDir11/subDir22/subDir31/file43.txt': [[0, 1, 0, 2], [0]], 'root/subDir11/subDir22/file34.txt': [[0, 1, 1], [1]], 'root/subDir11/subDir23/subDir32/subDir41/file51.txt': [[0, 2, 0, 0, 0], [0]], 'root/subDir11/subDir23/subDir32/file44.txt': [[0, 2, 0, 1], [1]], 'root/subDir11/subDir23/subDir32/file45.txt': [[0, 2, 0, 2], [1]], 'root/subDir11/subDir23/file35.txt': [[0, 2, 1], [2]], 'root/subDir11/file21.txt': [[0, 3], [0]], 'root/subDir12/subDir24/file36.txt': [[1, 0, 0], [3]], 'root/subDir12/subDir24/file37.txt': [[1, 0, 1], [3]], 'root/subDir12/subDir25/subDir33/file46.txt': [[1, 1, 0, 0], [2]], 'root/subDir12/subDir25/subDir33/file47.txt': [[1, 1, 0, 1], 

In [102]:
print( foldersCount ) # root, subDir11 subDir12, 
for level in range( 1, len( foldersCount ) +1 ): print( foldersCount[ level - 1 ], "\t\t", pinchLevel( folderTree, level ) )
print("")
print( pinchLevel( folderTree, 1 ) )
print( pinchLevel( folderTree, 2 ))
print( pinchLevel( folderTree, 3 ) )

[[1], [2], [3, 2], [0, 1, 1, 0, 1], [0, 1, 0], [0]]
[1] 		 [['root']]
[2] 		 [['subDir11', 'subDir12']]
[3, 2] 		 [['subDir21', 'subDir22', 'subDir23'], ['subDir24', 'subDir25']]
[0, 1, 1, 0, 1] 		 [[], ['subDir31'], ['subDir32'], [], ['subDir33']]
[0, 1, 0] 		 [[], ['subDir41'], []]
[0] 		 [[]]

[['root']]
[['subDir11', 'subDir12']]
[['subDir21', 'subDir22', 'subDir23'], ['subDir24', 'subDir25']]


- convertPathtoID(tree, foldersCount, path)

In [112]:
def convertPathtoID(tree, foldersCount, path):

    pathParts = path.split("/") 

    pointer = {"gamma": 0, "id": 0 }  # gamma represents the index location of which supra folder does the target folder live in | id represents the index of where the folder is within the supra folder
    folderID = [ ]

    for levelFoldersCount, currentDir in zip( foldersCount,  pathParts ):
        
        if currentDir in tree[:-1][ pointer["gamma"] ]:
            
            pointer["id"] = tree[:-1][ pointer["gamma"] ].index( currentDir ) # Find id for current folder
            pointer["gamma"] = sum( levelFoldersCount[ : pointer[ "gamma" ]  ] ) + pointer["id"] # Update gamma for next level based on foldersCount
            
            tree = tree[-1]
            folderID.append( pointer["id"] )

        else:
            print( f"Could not find \"{currentDir}\" directory.")
            
            return None

    return folderID

## V1 ##############

# def origConvertPathtoID( tree, foldersCount, path ):
#     """
#     Args
#     - path: must be of the form root/ ... / lastSubfolder
#     """
#     pathSplit = path.split( "/" )
#     targetDepth = len( pathSplit ) 
#     fullID = [ ]
#     if path == "":
#         fullID = [0]
#     else:
#         for idx in range( targetDepth ):
#             currentTarget = pathSplit[idx]
#             gamma = getGamma( foldersCount, fullID )
#             currentDirectory = pinchLevel( tree, idx + 1, flat = False)
#             subfolderS = currentDirectory[ gamma ]
#             if currentTarget not in subfolderS:
#                 print("Path does not exist")
#                 return None
#             for id, subfolder in enumerate( subfolderS):
#                 if subfolder == currentTarget:
#                     fullID.append( id )

#     return fullID

In [113]:
# Test convertIDtoPath

from pathlib import Path
import sys
sys.path.append( "..")

from scripts.objFileSys import  pinchLevel , viewSubtree, convertIDtoPath, getGamma
from scripts.filesUtils import loadPickle

treePkl = loadPickle( Path( "../output/ObjectStore/XChem/PerProj/Data/tree/tree_2017_lb17578-4.pkl") )

folderID = [0, 0, 0, 0, 3, 5, 6, 200, 3 ]
fileID = folderID + []
print(f"True ID: {fileID}")

path = convertIDtoPath( treePkl["fileTree"], treePkl["foldersCount"], fileID )
####
folderID = [0,0,0,0, 3, 0, 520, 0]
viewSubtree( treePkl["fileTree"], treePkl["foldersCount"], 1, len(folderID) + 2, 
            folderID = folderID, showIndex = True )
path = "data/2017/lb17578-4/processing/analysis/initial_model/TMEMAB-x0560/compound"
convertedID = convertPathtoID( treePkl["fileTree"], treePkl["foldersCount"], path )
folderID = convertedID

print( f"Converted ID: {convertedID }" )

oriConvID = origConvertPathtoID( treePkl["fileTree"], treePkl["foldersCount"], path )
print( f"Ori Converted ID: {oriConvID}")


True ID: [0, 0, 0, 0, 3, 5, 6, 200, 3]
data
└── 2017
    └── lb17578-4
        └── processing
            └── analysis
                └── initial_model
                    └── TMEMAB-x0560
                        └── compound
                            ├── [0-f] OX-119.cif
                            ├── [1-f] OX-119.pdb
                            ├── [2-f] OX-119.png
                            ├── [3-f] OX-119.smiles
                            └── [4-f] OX-119_with_H.pdb
Converted ID: [0, 0, 0, 0, 3, 0, 520, 0]
Ori Converted ID: [0, 0, 0, 0, 3, 0, 520, 0]


- convertIDtoPath( tree, foldersCount, ID)

In [None]:
def convertIDtoPath( tree, foldersCount, ID  ):
    """
    This takes any  folder or file ID (where first element represents root - 0), 
    and outputs the path (which matches the object key)
    """
    path = ""
    gamma = 0
    for currentLevelID, currentLevelCount in zip( ID, foldersCount):

        path = path + "/" + tree[:-1][gamma][ currentLevelID ]
        gamma = sum( currentLevelCount[ : gamma] ) + currentLevelID
        tree = tree[-1]

    return path[1:] # remove initial "/"


## V1 #################

# # def convertIDtoPath( tree, foldersCount, ID  ): 
# def convertIDtoPath( tree, foldersCount, ID  ):
#     """
#     This takes any  folder or file ID (where first element represents root - 0), 
#     and outputs the path (which matches the object key)
#     """
#     path = ""
#     for idx in range( len(ID) ):
#         gammaFolderID = ID[ : idx] 
#         gamma = getGamma( foldersCount, gammaFolderID ) 
#         path = path + "/" + pinchLevel(tree, idx + 1)[gamma][ ID[idx] ]

#     return path[1:] # remove initial "/"

In [24]:
# Test convertIDtoPath

from pathlib import Path
import sys
sys.path.append( "..")

from scripts.objFileSys import  pinchLevel , viewSubtree
from scripts.filesUtils import loadPickle

treePkl = loadPickle( Path( "../output/ObjectStore/XChem/PerProj/Data/tree/tree_2017_lb17578-4.pkl") )

folderID = [0, 0, 0, 0, 3 , 5,  ]

fileID = folderID + []

print( convertIDtoPath( treePkl["fileTree"], treePkl["foldersCount"], fileID ) )

viewSubtree( treePkl["fileTree"], treePkl["foldersCount"], 1, len(folderID) + 2, folderID = folderID, showIndex = True )

data/2017/lb17578-4/processing/analysis/panddas
data
└── 2017
    └── lb17578-4
        └── processing
            └── analysis
                └── panddas
                    ├── [0] aligned_structures
                    ├── [1] analyses
                    ├── [2] analyses-2018-03-01-0433
                    ├── [3] coot-backup
                    ├── [4] logs
                    ├── [5] pickled_data
                    ├── [6] processed_datasets
                    ├── [7] reference
                    ├── [8-f] pandda.done
                    ├── [9-f] pandda.sh
                    ├── [10-f] pandda.sh.e22863360
                    └── [11-f] pandda.sh.o22863360


---

<a id="traversetree"></a>

## Traverse Tree

- openFolder()

In [None]:
# def openFolder( treeLst, foldersCount, folderID: list ) -> Similar to pinchLevel but opens based on a folderID
def openFolder( treeLst, foldersCount, folderID: list ):
    """
    Args:
    - folderID: 
    """
    targetDepth = len( folderID ) + 1 # bc [] shows the root, and [0] opens root and shows its content
    directory = pinchLevel( treeLst, targetDepth, flat = False)
    gamma = getGamma( foldersCount, folderID) 
    return directory[gamma]



In [None]:
# Test openFolder()
print( openFolder( fileTree, foldersCount, [ 0, 0, 2, 0, 3, 2, 0, 0, 0, 0, 0,  ] ) )

input = [ fileID[0][:-1] for fileID in list( testLabelsDic.values() )[1:] ]
output = list( testLabelsDic.keys() )[1:]
# print( [ openFolder( fileTree, foldersCount, folderID) for folderID in input ] )
print( output)

['01-rwcontents.log', '02-pointless.log', '03-refmac5_rigid.log', '04-unique.log', '05-freerflag.log', '06-cad.log', '07-refmac5_jelly.log', '08-refmac5_restr.log', '09-find-blobs.log', 'coot.sh', 'dimple.log', 'final.mtz', 'final.pdb', 'ini.pdb', 'run-coot.py', 'screen.log', 'workflow.pickle']
['root/subDir11/subDir21/file31.txt', 'root/subDir11/subDir21/file32.txt', 'root/subDir11/subDir21/file33.txt', 'root/subDir11/subDir22/subDir31/file41.txt', 'root/subDir11/subDir22/subDir31/file42.txt', 'root/subDir11/subDir22/subDir31/file43.txt', 'root/subDir11/subDir22/file34.txt', 'root/subDir11/subDir23/subDir32/subDir41/file51.txt', 'root/subDir11/subDir23/subDir32/file44.txt', 'root/subDir11/subDir23/subDir32/file45.txt', 'root/subDir11/subDir23/file35.txt', 'root/subDir11/file21.txt', 'root/subDir12/subDir24/file36.txt', 'root/subDir12/subDir24/file37.txt', 'root/subDir12/subDir25/subDir33/file46.txt', 'root/subDir12/subDir25/subDir33/file47.txt', 'root/subDir12/subDir25/file38.txt', 'r

- openFolderwPath( tree, foldersCount, folderPath , folderContentDepth = None)

In [None]:
# def openFolderwPath - Open the content of a folder path  

def openFolderwPath(tree, foldersCount, path):

    pathParts = path.split("/") 

    pointer = {"gamma": 0, "id": 0 }  # gamma represents the index location of which supra folder does the target folder live in | id represents the index of where the folder is within the supra folder

    # folderID = [0] # If uncommented, allows to output the folder ID as well

    for levelFoldersCount, currentDir in zip( foldersCount,  pathParts ):
        
        pointer["id"] = tree[:-1][ pointer["gamma"] ].index( currentDir ) # Find id for current folder
        pointer["gamma"] = sum( levelFoldersCount[ : pointer[ "gamma" ]  ] ) + pointer["id"] # Update gamma for next level based on foldersCount
        
        tree = tree[-1]
        
        # folderID.append( pointer[ "id" ] ) 

        print( f"Pinch Directory of interest: \t{tree[ pointer["gamma"] ] }") # open in case of 

    return tree[ pointer["gamma"] ] # , folderID






## V2 ######################## Does not work if there are two folders of the same name in different suprafolders at the same level (e.g. [ [compounds, proteins, dataset1.log ], [compounds, proteins, dataset2.log ] ])

# def openFolderwPath( tree, foldersCount, folderPath , targetFolderDepth = None):
#     """
#     Open the content of a folder based on the folder path, OR the folder name with its depth.
#     Note: path cannot start nor end with "/". -> root/folder/subfolder
#     Args:
#     - folderPath (str): Can either be an absolute path or the name of the folder to open (see target Depth) 
#     - targetFolderDepth: If None: folderPath must correspond to an absolute path. 
#                    If int: folderPath corresponds to directory name and depth of the content that lives inside of it.    
#     """
#     if not targetFolderDepth:
#         pathSplit = folderPath.split( "/" )
#         targetDepth = len( pathSplit )
#         targetFolder = pathSplit[-1]
#     else:
#         targetFolder = folderPath
#         targetDepth = targetFolderDepth


#     if folderPath == "":
#         directory = pinchLevel( fileTree, 1, flat = False)[0]
#         return directory
#     else:
#         # Perform typo check
#         if targetFolder not in pinchLevel( tree, targetDepth, flat = True ):
#             print( "Wrong path or Wrong Depth")
#             return None

#         gamma = 0
#         folderDirectory = pinchLevel( tree, targetDepth, flat = False) 
#         for x, supraFolder in enumerate( folderDirectory ):
#             for y, subFolders in enumerate( supraFolder):
#                 if subFolders == targetFolder:
#                     gamma = sum( foldersCount[ targetDepth - 1 ][:x] ) + y # foldersCount[] zero-indexed, hence targetDepth - 1
#                     break

#         folderContent = pinchLevel( tree, targetDepth + 1 )[ gamma ] # targetDepth + 1 to get folders that exist in the targetFolder content depth, and select the right one with gamma
#         return folderContent


## V1 #########################

# def openFolderwPath( tree, foldersCount, path ):
#     """
#     Visualize folder content based on folder path.
#     Note: 
#     - path cannot start nor end with "/". -> root/folder/subfolder
#     - Works with both file and folder Trees
#     """
#     pathSplit = path.split( "/" )
#     targetDepth = len( pathSplit )
#     targetFolder = pathSplit[-1]

#     directory = pinchLevel( tree, targetDepth, flat = False) 

#     if path == "":
#         directory = pinchLevel( fileTree, 1, flat = False)
#         return directory
#     elif targetDepth == 1:
#         gamma = 0
#     else:
#         gamma = 0
#         for x, supraFolder in enumerate( directory):
#             for y, subFolders in enumerate( supraFolder):
#                 if subFolders == targetFolder:
#                     gamma = sum( foldersCount[ targetDepth - 2 ][:x]) + y
#                     break

#     directory = getFolderContent( tree, gamma, fileDepth = targetDepth  )
#     return directory




    # directory = getFolderContent( tree, gamma, fileDepth = targetDepth + 1) # getFolder
    # print( directory)
    # return directory




In [None]:
# Test 

# ### Input
# fileTree, f
# targetFolderPath = "root/subDir11/subDir21/subDir33"
# fileTree = fileTree
# foldersCount = foldersCount
# ########

# pathSplit = targetFolderPath.split( "/" )
# targetDepth = len( pathSplit )

# directory = pinchLevel( fileTree, targetDepth, flat = False) 

# gamma = 0
# for x, supraFolder in enumerate( directory):
#     for y, subFolders in enumerate( supraFolder):
#         if subFolders == targetFolder:
#             gamma = sum( foldersCount[ targetDepth - 2 ][:x]) + y
#             break
        
    
# # print("Gamma ",gamma)
# # print( "Directory", directory)
# # print( "Count", foldersCount[1])
# var = getFolder( fileTree, gamma, targetDepth )
# print( var)

#Interactive selection of path and Depth
path = "subDir11" # root/subDir11/subDir22/ # "subDir21"
depth = 2 #None # 3


openFolderwPath(fileTree, foldersCount, path, targetFolderDepth = depth  )

['subDir21', 'subDir22', 'subDir23', 'file21.txt']

---

<a id="visualizetree"></a>

## Visualize

<a id="viswholetree"></a>

### Visualize Whole Tree

In [65]:
# Help Visualize specific depths
print( foldersCount )
pinchLevel( fileTree,  3, flat = False) 

[[2], [3, 2], [0, 1, 1, 0, 1], [0, 1, 0], [0]]


[['subDir21', 'subDir22', 'subDir23', 'file21.txt'],
 ['subDir24', 'subDir25', 'file22.txt']]

In [34]:
# def viewTree() -> observe the whole tree or up to a certain depth, with or without files
from anytree import Node, RenderTree
def viewTree(tree, treeDepth, foldersCount,  viewFiles = True  ):
    """
    Get a tree diagram
    Args:
    - Tree depth: can select how deep to open the tree
    """
    rootName = pinchLevel( tree,  1, flat = False)[0][0]

    root = Node(rootName)
    lstParentNodes = [ root ]
    foldersCount = [ None ] + [None] + foldersCount # So that can use level to access the right 
    for level in range( treeDepth - 1 ):
        level += 2      # Level starts at 2 and ends at treeDepth + 2 - 1 (not included in range [) ]) - 1 (inside range)
        childNodes = []
        fileNodes = []
        for parentFolderID, parentFolder in enumerate( pinchLevel( tree,  level, flat = False) ):                    
            for childFolderIdx, childFolder in enumerate( parentFolder):
                if childFolderIdx <  foldersCount[ level ][parentFolderID]:
                    childNodes.append( Node( childFolder, parent = lstParentNodes[ parentFolderID ]) )
                elif childFolderIdx >= foldersCount[ level ][parentFolderID]:                               # This filters folders that only contain files or the sections of a folder that are files
                    if viewFiles: fileNodes.append( Node( childFolder, parent = lstParentNodes[ parentFolderID ] ) )
                    elif not viewFiles: pass

    
        lstParentNodes = childNodes


    for pre, fill, node in RenderTree(root):
        print(f"{pre}{node.name}")


In [None]:
# Test
fileTree, folderTree, folderTreeMaxDepth,  foldersIDS, foldersCount, foldersLst = createTree( testFileTree )

# File Tree
### New Function
print( "new function")
viewTree( fileTree, folderTreeMaxDepth , foldersCount, viewFiles = False )
### Ground Truth
print("Ground Truth")
viewLstPathS( testFileTree )

# # Folder Tree
# ### New Function
# viewTree( folderTree, folderTreeMaxDepth + 1, foldersCount )
# ### Ground Truth
# viewLstPathS( testFolderTree )

new function
root
├── subDir11
│   ├── subDir21
│   ├── subDir22
│   │   └── subDir31
│   └── subDir23
│       └── subDir32
│           └── subDir41
└── subDir12
    ├── subDir24
    └── subDir25
        └── subDir33
Ground Truth
root
└── root
    ├── subDir11
    │   ├── subDir21
    │   │   ├── file31.txt
    │   │   ├── file32.txt
    │   │   └── file33.txt
    │   ├── subDir22
    │   │   ├── subDir31
    │   │   │   ├── file41.txt
    │   │   │   ├── file42.txt
    │   │   │   └── file43.txt
    │   │   └── file34.txt
    │   ├── subDir23
    │   │   ├── subDir32
    │   │   │   ├── subDir41
    │   │   │   │   └── file51.txt
    │   │   │   ├── file44.txt
    │   │   │   └── file45.txt
    │   │   └── file35.txt
    │   └── file21.txt
    ├── subDir12
    │   ├── subDir24
    │   │   ├── file36.txt
    │   │   └── file37.txt
    │   ├── subDir25
    │   │   ├── subDir33
    │   │   │   ├── file46.txt
    │   │   │   └── file47.txt
    │   │   └── file38.txt
    │   └── file22.txt

<a id="vissubtree"></a>

### Visualize Subset of the Tree

In [71]:
# Help Visualize specific depths
level = 2
print(f"Level {level}")
print( pinchLevel( fileTree,  level , flat = False) )
print( "Folders Count:", foldersCount[ level - 1] )
print("---------------------")
level = level + 1
print(f"Level {level}")
print( pinchLevel( fileTree,  level , flat = False) )
print( "Folders Count:", foldersCount[level - 1] ) # zero indexed
print("---------------------")
print("Folders Count")
print( foldersCount)
print("---------------------")
folderID = [0,1]
print(f"Open Folder: {folderID}")
print( openFolder( fileTree, foldersCount, folderID ) )
print("---------------------")
level = 1
folderID = [0, 1] 
parentID = folderID[ :level]
print( parentID)
gamma = getGamma( foldersCount, parentID[1:] )
print("Gamma", gamma )
pinchLevel( fileTree,  level + 1, flat = False)[gamma]#[ folderID[level] ]

Level 2
[['subDir11', 'subDir12', 'file11.txt']]
Folders Count: [3, 2]
---------------------
Level 3
[['subDir21', 'subDir22', 'subDir23', 'file21.txt'], ['subDir24', 'subDir25', 'file22.txt']]
Folders Count: [0, 1, 1, 0, 1]
---------------------
Folders Count
[[2], [3, 2], [0, 1, 1, 0, 1], [0, 1, 0], [0]]
---------------------
Open Folder: [0, 1]
['subDir24', 'subDir25', 'file22.txt']
---------------------
[0]
Gamma 0


['subDir11', 'subDir12', 'file11.txt']

In [66]:
from tests.testFileSys import testTree
from scripts.objFileSys import createTree, pinchLevel, getGamma

fileTree, folderTree, folderTreeMaxDepth,  foldersIDS, foldersCount, foldersLst = createTree( testTree )
id = []
gamma = getGamma( foldersCount, id )
print(gamma)
print( pinchLevel( fileTree, len( id) + 1)[gamma] )


0
['root']


In [154]:
# Test Obtaining the root name
folderID = [0, 1, 1, 0 ] # Play by swictching the values of the folder ID
startDepth = len(folderID) # 1 # play by switching the valyes of the start Depth

parentFolderIDS = [ folderID[ : startDepth - 1  ] ] # Ignors the last element of folderID
print( "Parent Folder ID:", parentFolderIDS[0])
parentFolderGammas =  [ getGamma( foldersCount, parentFolderIDS[0] ) ]
print("Parent Folder Gamma:", parentFolderGammas )
print( "Level Output: ",pinchLevel( fileTree,  startDepth , flat = False) )
print( "Level with Gamma:", pinchLevel( fileTree,  startDepth, flat = False)[ parentFolderGammas[0] ] )
rootName = pinchLevel( fileTree,  startDepth,flat = False)[ parentFolderGammas[0] ][ folderID[ startDepth - 1  ] ]
print( "Root Name:",rootName )

print("\nFolder ID:\t\t", folderID)
print("Folder ID Name:\t\t", pinchLevel( fileTree, len(folderID) )[ getGamma( foldersCount, folderID[:-1] ) ][ folderID[-1] ] )
print( "Folder ID Content:\t", pinchLevel( fileTree, len(folderID) + 1)[getGamma(foldersCount, folderID) ])


Parent Folder ID: [0, 1, 1]
Parent Folder Gamma: [4]
Level Output:  [['file31.txt', 'file32.txt', 'file33.txt'], ['subDir31', 'file34.txt'], ['subDir32', 'file35.txt'], ['file36.txt', 'file37.txt'], ['subDir33', 'file38.txt']]
Level with Gamma: ['subDir33', 'file38.txt']
Root Name: subDir33

Folder ID:		 [0, 1, 1, 0]
Folder ID Name:		 subDir33
Folder ID Content:	 ['file46.txt', 'file47.txt']


In [160]:
# Test going over the folders that are mentioned by the folderID
folderID = [0, 1, 1, 0 ]
startDepth = 1

# Get Root Name
parentFoldersIDS = [ folderID[ : startDepth - 1 ] ] 
parentFolderGammas =  [ getGamma( foldersCount, parentFoldersIDS[0] ) ]
rootName = pinchLevel( fileTree,  startDepth, flat = False)[ parentFolderGammas[0] ][ folderID[ startDepth - 1  ] ] # folderID is zero-indexed 

# Initalize for loop
parentFoldersIDS = [ folderID[ : startDepth ] ] 
parentFolderGammas =  [ getGamma( foldersCount, parentFoldersIDS[0] ) ]

root = Node(rootName)
lstParentNodes = [ root ]

print( f"Parent Folder ID: {parentFoldersIDS}")
print(f"Parent Folder Gamma: {parentFolderGammas }")
# Level 2
parentFolderLevel = 2
print( f"Parent Folder Level: {parentFolderLevel}")
childNodes = []
childFolderIndex = folderID[ parentFolderLevel ]
print( f"Child Folder Index: {childFolderIndex}")
childName = pinchLevel( fileTree,  parentFolderLevel , flat = False)[ parentFolderGammas[0] ][ childFolderIndex ]  
print(f"Child Name: {childName}")     
childNodes.append( Node( childName, parent = lstParentNodes[-1] ) )

parentFoldersIDS = [ folderID[ : parentFolderLevel ] ] 
print( f"Parent Folder ID: {parentFoldersIDS}")
parentFolderGammas =  [ getGamma( foldersCount, parentFoldersIDS[0] ) ] # Use [1:]  in parentFoldersIDS[0][1:] bc getGamma does not require root index element
lstParentNodes = childNodes  




Parent Folder ID: [[0]]
Parent Folder Gamma: [0]
Parent Folder Level: 2
Child Folder Index: 1
Child Name: subDir12
Parent Folder ID: [[0, 1]]


In [None]:
from anytree import Node, RenderTree

def viewSubtree(tree,  foldersCount, startDepth, treeDepth, folderID = None, folderPath = None, 
                viewFiles  = True, getFileIDS = False, getFilePathS = False, showIndex = True):
    """
    Args:
    - startDepth: Minimum is 1
    - folderID: Smallest is [ 0 ] -> root 
    """
    if folderPath != None:
        folderID =  convertPathtoID( tree, foldersCount,  folderPath)
        if folderID == None: 
            return None

    # Get Root Name
    rootFolderID = [ folderID[ : startDepth - 1 ] ] # zero index
    rootFolderGamma =  [ getGamma( foldersCount, rootFolderID[0] ) ]
    rootName = pinchLevel( tree,  startDepth, flat = False)[ rootFolderGamma[0] ][ folderID[ startDepth - 1  ] ] # folderID is zero-indexed 
    root = Node(rootName)

    # Initalize for loop
    parentFoldersIDS = [ folderID[ : startDepth ] ] 
    parentFolderGammas =  [ getGamma( foldersCount, parentFoldersIDS[0] ) ]

    lstParentNodes = [ root ]
    fileIDS =[]
    for parentFolderLevel in range( startDepth + 1, treeDepth ): # Bc startDepth was already taken by root Node
        if parentFolderLevel <= len( folderID ):
        
            childNodes = []
            childName = pinchLevel( tree,  parentFolderLevel , flat = False)[ parentFolderGammas[0] ][ folderID[ parentFolderLevel - 1 ] ]       
            childNodes.append( Node( childName, parent = lstParentNodes[ 0 ] ) )

            parentFoldersIDS = [ folderID[ : parentFolderLevel  ] ] # Parent for new level
            parentFolderGammas =  [ getGamma( foldersCount, parentFoldersIDS[0] ) ] # Use [1:]  in parentFoldersIDS[0][1:] bc getGamma does not require root index element
            lstParentNodes = childNodes        
                    
        else:
            # Find how many child folders in each parent folder
            childFoldersIDS = []
            childFoldersGammaS = []
            childNodeS = []


            fileNodeS = []
            for parentFolderID, parentFolderGamma, parentNode in  zip( parentFoldersIDS, parentFolderGammas, lstParentNodes ): # parentfolderID > childFolderID > childGamma & new level > newParentGamma | parentFolderGamma > childfolders and files names
                childFoldersNameS = [] 
                filesNameS = []


                numberChildrenFolderS = foldersCount[ parentFolderLevel  - 1 ][parentFolderGamma] # zero-index, Number of children subfolders in parent folder. IF 0, only get folders in fileNames,
                childFoldersNameS.extend( pinchLevel( tree,  parentFolderLevel , flat = False)[parentFolderGamma][ :numberChildrenFolderS] ) # Get IDS for each subfolder | levelFolderLives + 1 = level child folders lives
                childFoldersIDS.extend( [parentFolderID + [ childID ] for childID in range( numberChildrenFolderS ) ] )
                               
                for idx, childFolderName in enumerate( childFoldersNameS):
                    if showIndex:
                        childNodeS.append( Node( f"[{idx}] {childFolderName}", parent = parentNode ) )
                    else:
                        childNodeS.append( Node( childFolderName, parent = parentNode ) )


                directory = pinchLevel( tree,  parentFolderLevel , flat = False)[parentFolderGamma]
                numberFiles = len( directory[ numberChildrenFolderS: ] ) 
                fileIDS.extend( [parentFolderID + [ fileID ] for fileID in range( numberChildrenFolderS, numberChildrenFolderS + numberFiles ) ] )
                
                filesNameS.extend( pinchLevel( tree,  parentFolderLevel , flat = False)[parentFolderGamma][ numberChildrenFolderS: ] )   
                if viewFiles: 
                    for idx, fileName in enumerate( filesNameS):
                        if showIndex:
                            fileNodeS.append( Node( f"[{numberChildrenFolderS + idx}-f] {fileName}", parent = parentNode ) ) # In this case, child folder is the name of a file
                        else:
                            fileNodeS.append( Node( fileName, parent = parentNode ) )
                elif not viewFiles: pass

            childFoldersGammaS = [ getGamma( foldersCount, folderID ) for folderID in  childFoldersIDS ]
            parentFoldersIDS, parentFolderGammas, lstParentNodes = childFoldersIDS, childFoldersGammaS, childNodeS

    for pre, fill, node in RenderTree(root):
        print(f"{pre}{node.name}")

    if getFileIDS:
        if getFilePathS:
            return fileIDS, [ convertIDtoPath(tree, foldersCount, fileID ) for fileID in fileIDS]
        else:
            return fileIDS
    elif getFilePathS: return [ convertIDtoPath(tree, foldersCount, fileID ) for fileID in fileIDS]
    elif not getFileIDS: return None

In [None]:
from tests.testFileSys import testTree
from scripts.objFileSys import createTree

fileTree, folderTree, folderTreeMaxDepth,  foldersIDS, foldersCount, foldersLst = createTree( testTree )
viewSubtree(fileTree,  foldersCount, 1 , folderTreeMaxDepth + 1 , 
            folderID = None,  # Play with folder id from [0] on
            folderPath = "root/subDir11/subDir22/subDir31", # Play with folderPath from "" on
            viewFiles = True, getFilePathS = True, getFileIDS=False)
# convertIDtoPath( )

root
└── subDir11
    └── subDir22
        └── subDir31
            ├── file41.txt
            ├── file42.txt
            └── file43.txt


['root/subDir11/subDir22/subDir31/file41.txt',
 'root/subDir11/subDir22/subDir31/file42.txt',
 'root/subDir11/subDir22/subDir31/file43.txt']

In [None]:
fileIDS = viewSubtree( fileTree, folderID, foldersCount, len( folderID ) , len( folderID ) +  4 ,  viewFiles = True , getFileIDS = True)
for ID in fileIDS:
    print( convertIDtoPath( fileTree, foldersCount, ID) )

In [None]:
# Test convertIDtoPath
fileIDS = viewSubtree( fileTree, folderID, foldersCount, len( folderID ) , len( folderID ) +  4 ,  viewFiles = True , getFileIDS = True)
for ID in fileIDS:
    print( convertIDtoPath( fileTree, foldersCount, ID) )

root/subDir11/subDir21/file31.txt
root/subDir11/subDir21/file32.txt
root/subDir11/subDir21/file33.txt


In [318]:
# Test
convertPathtoID( fileTree, foldersCount,  "root")

[0]

In [None]:
# Test viewSubTree

# Test creation of file IDS
lstObjectKeys = testTree
fileTree, folderTree, folderTreeMaxDepth,  foldersIDS, foldersCount, foldersLst = createTree( lstObjectKeys )

fileIDS = folderID =  [ 0, 0, 0] # first element represent the root 
fileIDS = viewSubtree( fileTree, folderID, foldersCount, len( folderID ) , len( folderID ) +  4 ,  viewFiles = True , getFileIDS = True)
print( fileIDS)

# Test creation of file paths
lstObjectKeys = loadPickle( os.path.join( "sandbox", "Boto3", "XChemFiles", "frag1.pkl" ) )
lstObjectKeys = lstObjectKeys[:100000]
fileTree, folderTree, folderTreeMaxDepth,  foldersIDS, foldersCount, foldersLst = createTree( lstObjectKeys )

fileIDS = folderID =  [ 0, 0, 2, 0 , 0 ] # first element represent the root 
filePaths = viewSubtree( fileTree, folderID, foldersCount, len(folderID) , len(folderID) + 2 ,  viewFiles = True , getFileIDS=False, getFilePathS=True)

numFiles = 3
for idx in range(  ( len( filePaths ) // numFiles) + 1 ):
    print( filePaths[idx*numFiles: (idx+1)*numFiles ] )

subDir21
├── file31.txt
├── file32.txt
└── file33.txt
[[0, 0, 0, 0], [0, 0, 0, 1], [0, 0, 0, 2]]
AR_compounds_pdb
├── A01_P1_acedrg1.cif
├── A01_P1_acedrg1.pdb
├── A01_P2_acedrg1.cif
├── A01_P2_acedrg1.pdb
├── A02_P1_acedrg1.cif
├── A02_P1_acedrg1.pdb
├── A02_P2_acedrg1.cif
├── A02_P2_acedrg1.pdb
├── A03_P1_acedrg1.cif
├── A03_P1_acedrg1.pdb
├── A03_P2_acedrg1.cif
├── A03_P2_acedrg1.pdb
├── A04_P1_acedrg1.cif
├── A04_P1_acedrg1.pdb
├── A04_P2_acedrg1.cif
├── A04_P2_acedrg1.pdb
├── A05_P1_acedrg1.cif
├── A05_P1_acedrg1.pdb
├── A05_P2_acedrg1.cif
├── A05_P2_acedrg1.pdb
├── A06_P1_acedrg1.cif
├── A06_P1_acedrg1.pdb
├── A06_P2_acedrg1.cif
├── A06_P2_acedrg1.pdb
├── A07_P1_acedrg1.cif
├── A07_P1_acedrg1.pdb
├── A07_P2_acedrg1.cif
├── A07_P2_acedrg1.pdb
├── A08_P1_acedrg1.cif
├── A08_P1_acedrg1.pdb
├── A08_P2_acedrg1.cif
├── A08_P2_acedrg1.pdb
├── A09_P1_acedrg1.cif
├── A09_P1_acedrg1.pdb
├── A09_P2_acedrg1.cif
├── A09_P2_acedrg1.pdb
├── A10_P1_acedrg1.cif
├── A10_P1_acedrg1.pdb
├── A10_P2_a

Convert path to id, so that can then viewSubTree based on path

In [1]:
# Currently being developed
def getGammawDepth( parentFolderLocation, level):
    pass

---



<a id="filterfiles"></a>

## Filter and Find wanted files

In [None]:
from scripts.objFileSys import createFolderTree, countFolders, createFileTree
from scripts.filesUtils import loadPickle
import os

lstObjectKeys = loadPickle( os.path.join( "sandbox", "Boto3", "XChemFiles", "frag1.pkl" ) )[1:100000]

In [None]:
lstSplitPaths = [ path.split("/") for path in lstObjectKeys]

- ### File Types

List All existing files

In [None]:
# def getFileTypes( lstFilePaths, regexFilterS = None):
import re
def getFileTypes( lstFilePaths, regexFilterS = None):
    """
    Args:
    - lstFilePaths:
    - regexFilterS (None / list(lists) ): If list, each element corresponds to one regexFilter to apply. If none, it does not
        - I.e. ["not", regex] -> accept files that do NOT match regex | ["match", regex] -> accept files that MATCH regex
    """
    regex = "\\.\\w*$"
    lstSplitPaths = [ path.split("/") for path in lstFilePaths]
    lstAllFilesTypes = [ path[-1][re.search( regex,  path[-1] ).start() :   ]  if 
                    re.search( regex, path[-1] ) else idx for idx, path in enumerate( lstSplitPaths )  ]
        
    failedFileTypes = [ lstSplitPaths[  idx ][-1]   for idx in lstAllFilesTypes if type(idx) == int ]
    failedFileTypes = list( set( failedFileTypes) )
    failedFileTypes.sort()
    failedFilesDict = {f"Failed_{regex}": failedFileTypes}

    lstFileTypes = [  fileType  for fileType in lstAllFilesTypes if type(fileType) != int ]
    lstFileTypes = list( set( lstFileTypes ) )
    lstFileTypes.sort( )
    filteredFilesDict = {f"Pass_{regex}": lstFileTypes }
    
    if type(regexFilterS) == list:
        for match, regex in regexFilterS:
            matchFileS = [ file for file in lstFileTypes if re.search( "[0-9][0-9][0-9][0-9]$", file)]
            matchFileS.sort()
            notMatchFileS = [ file for file in lstFileTypes if not re.search( "[0-9][0-9][0-9][0-9]$", file)]
            notMatchFileS.sort()
            if match == "match": filteredFilesDict[f"Pass_{regex}"], failedFilesDict[f"Failed_{regex}"] =  matchFileS, notMatchFileS
            elif match == "not": filteredFilesDict[f"Pass_{regex}"], failedFilesDict[f"Failed_{regex}"] =  notMatchFileS, matchFileS
            else: print( "Match must either equal match or not")
            lstFileTypes = filteredFilesDict[f"Pass_{regex}"]

    return filteredFilesDict, failedFilesDict

In [None]:
# Testing and creating function
lstAllFilesTypes = [ path[-1][re.search( "\\.\\w*$", path[-1] ).start() :   ]  if re.search( "\\.\\w*$", path[-1] ) else idx for idx, path in enumerate( lstSplitPaths )  ]
failedFileTypes = [ lstSplitPaths[  idx ][-1]   for idx in lstAllFilesTypes if type(idx) == int ]
failedFileTypes = list( set( failedFileTypes) )
failedFileTypes.sort()
lstFileTypes = [  fileType  for fileType in lstAllFilesTypes if type(fileType) != int ]
lstFileTypes = list( set( lstFileTypes ) )
lstFileTypes.sort( )
print(f"First Round of Filerting: {lstFileTypes}" )
print(f"Failed Files: {failedFileTypes}") # Described as objects in object store but do not adopt a conventional file notation (.fileName)

lstFileTypes_2 = [ file for file in lstFileTypes if not re.search( "[0-9][0-9][0-9][0-9]$", file)]
lstFileTypes_2.sort()
failedFileTypes_2 = [ file for file in lstFileTypes if re.search( "[0-9][0-9][0-9][0-9]$", file)]
failedFileTypes_2.sort()

print(f"Second Rounf of Filtering: {lstFileTypes_2}")
print(f"Failed Files: {failedFileTypes_2}")

First Round of Filerting: ['.HKL', '.INP', '.LP', '.XDS', '.bak', '.bib', '.cbf', '.ccp4', '.cif', '.csh', '.csv', '.dat', '.dc_id', '.debug', '.done', '.e15502012', '.e17903026', '.e17925888', '.e17929302', '.e17931625', '.e17931767', '.e17931776', '.e17931777', '.e17931779', '.e17931780', '.e17931784', '.e17931786', '.e18931570', '.e3370966', '.e3371347', '.e3371629', '.e3372374', '.e3372931', '.e3569614', '.e3655641', '.e3656015', '.e3809591', '.e3813110', '.e3813269', '.e4017668', '.e4017793', '.e4018365', '.e4018412', '.e4018841', '.e4018979', '.e4019001', '.e4019264', '.e4019266', '.e4019513', '.e4020330', '.e4020379', '.e4021040', '.e4021202', '.e4021215', '.e4021356', '.e4021516', '.e4021642', '.e4021803', '.e4021964', '.e4022048', '.e4022162', '.e4022205', '.e4022243', '.e4022461', '.e4022463', '.e4022877', '.e4022878', '.e4023551', '.e4023611', '.e4024511', '.e4024903', '.e4025005', '.e4025026', '.e4025052', '.e4025725', '.e4025733', '.e4025909', '.e4026532', '.e4026596', '.e

In [None]:
# Final Test function - getFileTypes()
filteredFiles, failedFiles = getFileTypes( lstObjectKeys, regexFilterS = [ ["not", "[0-9][0-9][0-9][0-9]$"] ] )
print( list( filteredFiles.values())[-1] )

['.HKL', '.INP', '.LP', '.XDS', '.bak', '.bib', '.cbf', '.ccp4', '.cif', '.csh', '.csv', '.dat', '.dc_id', '.debug', '.done', '.eff', '.epr', '.err', '.error', '.exe', '.gz', '.html', '.jar', '.jpg', '.json', '.log', '.map', '.mol', '.mtz', '.out', '.params', '.pck', '.pdb', '.phil', '.pickle', '.pkl', '.pml', '.png', '.py', '.r3d', '.reprocessing', '.rst', '.running', '.sca', '.scm', '.sh', '.smi', '.smiles', '.sol', '.sqlite', '.sum', '.txt', '.xinfo', '.xlsm', '.xml']


- Chose files of interest

In [None]:
# Filter based on ending
lstFileTypes = [".mtz", ".ccp4", ".pdb", ".smiles", ".cif", ".sqlite"]


---

<a id="finditems"></a>

## Find Files and Folders

In [2]:
import sys
sys.path.append( ".." )

from tests.testFileSys import testTree
from scripts.objFileSys import createTree, pinchLevel, viewSubtree

fileTree, folderTree, folderTreeMaxDepth,  foldersIDS, foldersCount, foldersLst = createTree(testTree)


- ###  Get Target Item Identifiers: depth, index, suprafolder index

In [18]:
"""
Input:
- targetFolder
- tree: tree object (folder or file - folder is recommended) used to identify where the folder of interest lives
- startDepth: Can be used to narrow the search scope
- endDepth: Can be used to narrow the search scope
- regexpression: To be added in the future, to allow more complex searches
"""

# Test Input
targetItem = "file47.txt" # "subDir41" #"subDir33" # "subDir33"# "database"
tree = fileTree
startDepth = 1
endDepth = folderTreeMaxDepth + 2


import re

def findTargetIdxs(targetItem, tree,startDepth = 1, endDepth = 10, regexpression = None  ):
    
    """
    Args:
    - targetFolder
    - tree: tree object (folder or file - folder is recommended) used to identify where the folder of interest lives
    - startDepth: Can be used to narrow the search scope
    - endDepth: Can be used to narrow the search scope
    - regexpression: To be added in the future, to allow more complex searches
    
    Output:
    - supraFolderIdx: Zero-Index of suprafolder which target folder / file lives in (aka gamma)
    - folderIdx: zero-Index of where the folder / file lives
    - targetLevel: Level (in non-zero index) that identified folder / live  lives in
    """

    targetLevel, supraFolderIdx, folderIdx = None, None, None
    for depth in range( 1, endDepth): # startDepth,
        if depth >= startDepth:
            flatFoldersList = [ item for lst in tree[:-1]  for item in lst ]
            if targetItem in flatFoldersList and not regexpression:                              # Use re.search() for regex
                foldersList = tree[:-1] # pinchLevel( tree, depth, flat = False )  # List of list representing the folders with subfolders in the current level
                IDSlst = [ ( idx, listOfFolders.index( targetItem) ) for idx, listOfFolders in enumerate( foldersList ) if targetItem in  listOfFolders ] # listOfFolders.index( targetFolder) -> Get the zero-index for the target folder inside its suprafolder | 
                supraFolderIdx, folderIdx = IDSlst[0] # 
                targetLevel = depth
                # print(f"Depth: {depth}")
                break
            elif regexpression:
                matchRegex = [ True if re.search( regexpression, item ) else False for item in flatFoldersList  ]
                if any( matchRegex):
                    foldersList = tree[:-1]
                    targetItem = flatFoldersList[ matchRegex.index( True) ]
                    print(f"Target Item: {targetItem}")
                    IDSlst = [ ( idx, listOfFolders.index( targetItem) ) for idx, listOfFolders in enumerate( foldersList ) if targetItem in  listOfFolders ]
                    supraFolderIdx, folderIdx = IDSlst[0]
                    targetLevel = depth
                    break                                  
                
        if depth == endDepth - 1: 
            print(f"Could not find \"{targetItem}\"")
            return None
        
        tree = tree[-1]
    return supraFolderIdx, folderIdx, targetLevel 

## V1 ###########################

# def findTargetIdxs(targetItem, tree,startDepth = 1, endDepth = folderTreeMaxDepth + 2, regexpression = None  ):
#     targetLevel, supraFolderIdx, folderIdx = None, None, None
#     for depth in range( startDepth, endDepth):
#         flatFoldersList = pinchLevel( tree, depth, flat = True)
#         if targetItem in flatFoldersList:                              # Use re.search() for regex
#             foldersList = pinchLevel( tree, depth, flat = False )  # List of list representing the folders with subfolders in the current level
#             IDSlst = [ ( idx, listOfFolders.index( targetItem) ) for idx, listOfFolders in enumerate( foldersList ) if targetItem in  listOfFolders ] # listOfFolders.index( targetFolder) -> Get the zero-index for the target folder inside its suprafolder | 
#             supraFolderIdx, folderIdx = IDSlst[0] # 
#             targetLevel = depth
#             print(f"Depth: {depth}")
#             break
#         if depth == endDepth - 1: print("Could not find target item")

#     return supraFolderIdx, folderIdx, targetLevel

"""
Output:
- supraFolderIdx: Zero-Index of suprafolder which target folder / file lives in (aka gamma)
- folderIdx: zero-Index of where the folder / file lives
- targetLevel: Level (in non-zero index) that identified folder / live  lives in
"""

supraFolderIdx, folderIdx, targetLevel = findTargetIdxs(targetItem,tree, startDepth=startDepth, endDepth=endDepth  )


In [None]:
# Test

targetItem =  "subDir1" # "subDir41" #"subDir33" # "subDir33"# "database"
tree = fileTree
startDepth = 1
endDepth = folderTreeMaxDepth + 2

output = findTargetIdxs(targetItem,tree, startDepth=startDepth, endDepth=endDepth, regexpression= None)  # "4.txt$"  )

if output: supraFolderIdx, folderIdx, targetLevel = output
else: supraFolderIdx, folderIdx, targetLevel = None, None, None

print("\nView Results:")
print(supraFolderIdx, folderIdx, targetLevel  )
print( "\nView Actual Tree:")
print( f"Level 5: {pinchLevel( fileTree, 5, flat = False)}" )
print( f"Level 4: {pinchLevel( fileTree, 4, flat = False)}" )
print( f"Level 3: {pinchLevel( fileTree, 3, flat = False)}" )
print( f"Level 2: {pinchLevel( fileTree, 2, flat = False)}" )
print( f"Level 1: {pinchLevel( fileTree, 1, flat = False)}" )

Could not find "subDir1"

View Results:
None None None

View Actual Tree:
Level 5: [['file41.txt', 'file42.txt', 'file43.txt'], ['subDir41', 'file44.txt', 'file45.txt'], ['file46.txt', 'file47.txt']]
Level 4: [['file31.txt', 'file32.txt', 'file33.txt'], ['subDir31', 'file34.txt'], ['subDir32', 'file35.txt'], ['file36.txt', 'file37.txt'], ['subDir33', 'file38.txt']]
Level 3: [['subDir21', 'subDir22', 'subDir23', 'file21.txt'], ['subDir24', 'subDir25', 'file22.txt']]
Level 2: [['subDir11', 'subDir12', 'file11.txt']]
Level 1: [['root']]


In [42]:
# Advanced Code: findAllTargetIdxs 

import re


def findAllTargetIdxs(targetItem, tree,startDepth = 1, endDepth = 10, regexpression = None  ):

    allTargetIdxs = []
    targetLevel, supraFolderIdx, folderIdx = None, None, None
    for depth in range( 1, endDepth): # startDepth,
        if depth >= startDepth:
            
            flatFoldersList = [ item for lst in tree[:-1]  for item in lst ]

            if targetItem in flatFoldersList and not regexpression:                              # Use re.search() for regex
                foldersList = tree[:-1] # pinchLevel( tree, depth, flat = False )  # List of list representing the folders with subfolders in the current level
                IDSlst = [ ( idx, listOfFolders.index( targetItem ) ) for idx, listOfFolders in enumerate( foldersList ) if targetItem in  listOfFolders ] # listOfFolders.index( targetFolder) -> Get the zero-index for the target folder inside its suprafolder | 
                for IDS in IDSlst:
                    supraFolderIdx, folderIdx = IDS # 
                    targetLevel = depth
                    allTargetIdxs.append( (supraFolderIdx, folderIdx, targetLevel ) )
                    
            elif regexpression:
                
                matchRegex = [ item if re.search( regexpression, item ) else False for item in flatFoldersList  ]
                lstRegexMatches = list( set(  filter( lambda x: x, matchRegex ) ) )

                for matchItem in lstRegexMatches:
                
                    foldersList = tree[:-1]
                    targetItem = matchItem
                    IDSlst = [ ( idx, listOfFolders.index( targetItem) ) for idx, listOfFolders in enumerate( foldersList ) if targetItem in  listOfFolders ]
            
                    for IDS in IDSlst:
                        supraFolderIdx, folderIdx = IDS # 
                        targetLevel = depth
                        allTargetIdxs.append( (supraFolderIdx, folderIdx, targetLevel ) )

        tree = tree[-1]


    return allTargetIdxs



# ## V1 ################

# def findAllTargetIdxs(targetItem, tree,startDepth = 1, endDepth = 10, regexpression = None  ):
#     allTargetIdxs = []
#     targetLevel, supraFolderIdx, folderIdx = None, None, None
#     for depth in range( startDepth, endDepth):
#         flatFoldersList = pinchLevel( tree, depth, flat = True)
#         if targetItem in flatFoldersList:                              # Use re.search() for regex
#             foldersList = pinchLevel( tree, depth, flat = False )  # List of list representing the folders with subfolders in the current level
#             IDSlst = [ ( idx, listOfFolders.index( targetItem) ) for idx, listOfFolders in enumerate( foldersList ) if targetItem in  listOfFolders ] # listOfFolders.index( targetFolder) -> Get the zero-index for the target folder inside its suprafolder | 
#             for IDS in IDSlst:
#                 supraFolderIdx, folderIdx = IDS # 
#                 targetLevel = depth
#                 allTargetIdxs.append( (supraFolderIdx, folderIdx, targetLevel ) )
#                 print(f"Depth: {depth}")

#     return allTargetIdxs

In [73]:
targetItem =  "subDir11" # "subDir41" #"subDir33" # "subDir33"# "database"
tree = fileTree
startDepth = 1
endDepth = folderTreeMaxDepth + 2

output = findAllTargetIdxs(targetItem,tree, startDepth=startDepth, endDepth=endDepth, regexpression=  ".txt$"  ) # None)  #

idx = 0

In [92]:
if output != []: supraFolderIdx, folderIdx, targetLevel = output[idx]
else: supraFolderIdx, folderIdx, targetLevel = None, None, None

print("\nView Results:")
print(supraFolderIdx, folderIdx, targetLevel  )
print( "\nView Actual Tree:")
print( f"Level 6: {pinchLevel( fileTree, 6, flat = False)}" )
print( f"Level 5: {pinchLevel( fileTree, 5, flat = False)}" )
print( f"Level 4: {pinchLevel( fileTree, 4, flat = False)}" )
print( f"Level 3: {pinchLevel( fileTree, 3, flat = False)}" )
print( f"Level 2: {pinchLevel( fileTree, 2, flat = False)}" )
print( f"Level 1: {pinchLevel( fileTree, 1, flat = False)}" )

idx += 1


View Results:
0 0 6

View Actual Tree:
Level 6: [['file51.txt']]
Level 5: [['file41.txt', 'file42.txt', 'file43.txt'], ['subDir41', 'file44.txt', 'file45.txt'], ['file46.txt', 'file47.txt']]
Level 4: [['file31.txt', 'file32.txt', 'file33.txt'], ['subDir31', 'file34.txt'], ['subDir32', 'file35.txt'], ['file36.txt', 'file37.txt'], ['subDir33', 'file38.txt']]
Level 3: [['subDir21', 'subDir22', 'subDir23', 'file21.txt'], ['subDir24', 'subDir25', 'file22.txt']]
Level 2: [['subDir11', 'subDir12', 'file11.txt']]
Level 1: [['root']]


- ### Cumulative Count from folders Count

In [None]:
def cumulativeCount( foldersCount):
    finalCount = []
    for levelCount in foldersCount:
        levelRecord = [0]
        for idx, value in enumerate( levelCount ):
            levelRecord.append( levelRecord[idx] + value )
        finalCount.append( levelRecord[1:])
    return finalCount

folderSumCount = cumulativeCount( foldersCount )

In [None]:
# Test 1:
print("Test1")
print(foldersCount)
print( cumulativeCount( foldersCount ) )
print()

# Test 2:
print("Test2")
print( [[0,1,1,2,0,0,2,0,3,5]] )
print( cumulativeCount( [[0,1,1,2,0,0,2,0,3,5]] ) )

- ### Convert Item Identifiers into an Item Gamma ID

    - Each element represents the gamma of the right folder at each specific level

In [None]:
"""
Input:
- folderSumCount
- supraFolderIdx
- folderIdx
- targetLevel
"""
def getGammaID(supraFolderIdx, folderIdx, targetLevel, folderSumCount ):
    gammaFolderID = [ supraFolderIdx, folderIdx] # It represents the gamma of which folder to open in each level with pinchLevel with exception of the last element which identifies the folder / file


    for _level in range( targetLevel - 1, 0, -1): # level - 1 matches the suprafolder level w/o zero indexing OR the folder level with zero indexing 
        for idx, sumCount in enumerate( folderSumCount[ _level - 1] ):
            if  gammaFolderID[0] + 1 <= sumCount: # If True, it means that we reached the level
                gammaFolderID.insert( 0, idx)
                break
    return gammaFolderID

gammaFolderID = getGammaID(supraFolderIdx, folderIdx, targetLevel, folderSumCount )


"""
output: gammaFolderID
"""


In [None]:
# Test
from scripts.objFileSys import viewSubtree
print( gammaFolderID )
print( foldersCount )

viewSubtree( fileTree, foldersCount, 1, folderTreeMaxDepth + 2, folderID = [0], viewFiles = True)


- ### Convert Gamma ID to Folder ID

In [None]:

"""
Input:
- gammaFolderID
"""

def convertGammaIDtoFolderID(gammaFolderID, foldersCount):
    folderID = []
    foldersBefore = 0 # Initialize for the root which has o folders
    for idx in range( len( gammaFolderID[:-2] ) ) : # Note that final 2 level is not accounted bc the last element is not a gamma but the actual index, and the one before is accounted with idx + 1
        _level = idx + 1 # Make it non-zero index

        currentGamma = gammaFolderID[idx]
        nextGamma = gammaFolderID[idx+1]

        foldersBefore = sum( foldersCount[ _level - 1][:currentGamma] ) # Get the number of folders before the folder of interest at the specific level
        folderID.append( nextGamma - foldersBefore )

    folderID = folderID + gammaFolderID[-1:]
    return folderID

"""
Output:
- folderID
"""
folderID = convertGammaIDtoFolderID( gammaFolderID, foldersCount )

- ### Join all in one function: backTrackFile()

In [None]:
def traceBackPath(targetItem, tree, foldersCount, treeMaxDepth):
    supraFolderIdx, folderIdx, targetLevel = findTargetIdxs(targetItem, tree, startDepth = 1, endDepth= treeMaxDepth+2  )
    folderSumCount = cumulativeCount( foldersCount )
    gammaFolderID = getGammaID(supraFolderIdx, folderIdx, targetLevel, folderSumCount )
    folderID = convertGammaIDtoFolderID( gammaFolderID, foldersCount )
    folderPath = convertIDtoPath( tree, foldersCount, folderID)
    return folderPath, folderID



In [None]:
# Test
fileTree, folderTree, folderTreeMaxDepth,  foldersIDS, foldersCount, foldersLst = createTree(testTree)
targetItem =  "subDir41" #"file47.txt" #"subDir33" # "subDir33"# "database"
tree = fileTree

print( traceBackPath(targetItem, tree, foldersCount, folderTreeMaxDepth ) )

In [None]:
# Advanced function:

def completeTraceBackPath(targetItem, tree, foldersCount, treeMaxDepth):
    resultPaths = []
    allTargetIdxs = findAllTargetIdxs(targetItem, tree, startDepth = 1, endDepth= treeMaxDepth+2  )

    for targetIdx in allTargetIdxs:
        supraFolderIdx, folderIdx, targetLevel = targetIdx
        folderSumCount = cumulativeCount( foldersCount )
        gammaFolderID = getGammaID(supraFolderIdx, folderIdx, targetLevel, folderSumCount )
        folderID = convertGammaIDtoFolderID( gammaFolderID, foldersCount )
        folderPath = convertIDtoPath( tree, foldersCount, folderID)
        resultPaths.append( ( folderPath, folderID ) )
    return resultPaths

In [None]:
# Test
from pathlib import Path
from scripts.filesUtils import loadPickle
from scripts.objFileSys import viewSubtree
treePickle = loadPickle( Path("D:/Documents/XAIDA/output/ObjectStore/XChem/PerProj/data/tree/tree_2015_lb13320-1.pkl" ))
fileTree, folderTree, folderTreeMaxDepth,  foldersIDS, foldersCount, foldersLst = createTree(testTree)
targetItem =  "subDir41" #"file47.txt" #"subDir33" # "subDir33"# "database"
tree = fileTree

folderID = [0,0,0,0, 12]
viewSubtree( treePickle["fileTree"], treePickle["foldersCount"], 1, len( folderID) + 2, folderID = folderID, viewFiles = False)

- ### Get a list of all files that live in a specific directory

In [None]:
from scripts.objFileSys import convertPathtoID, getGamma

def getFiles(tree,  foldersCount, folderID = None, folderPath = None):
    """
    Args:
    - startDepth: Minimum is 1
    - folderID: Smallest is [ 0 ] -> root 
    """
    if folderPath != None:
        folderID =  convertPathtoID( tree, foldersCount,  folderPath)

    fileIDS = []
    filesNameS = []

    fileLevel = len( folderID ) + 1 
    folderGamma = getGamma( foldersCount, folderID )
    directory = pinchLevel( tree,  fileLevel , flat = False)[folderGamma]
    
    numberChildrenFolderS = foldersCount[ fileLevel  - 1 ][folderGamma]
    numberFiles = len( directory[ numberChildrenFolderS: ] )     
    fileIDS.extend( [folderID + [ fileID ] for fileID in range( numberChildrenFolderS, numberChildrenFolderS + numberFiles ) ] )
    if folderPath != None:
        filesNameS.extend( pinchLevel( tree,  fileLevel , flat = False)[folderGamma][ numberChildrenFolderS: ] )   
        filePaths = [ folderPath + "/" + fileName for fileName in filesNameS ]

    if folderPath != None:
        return fileIDS, filePaths
    else:
        return fileIDS, [ convertIDtoPath(tree, foldersCount, fileID ) for fileID in fileIDS]
   

In [None]:
# Test

fileTree, folderTree, folderTreeMaxDepth,  foldersIDS, foldersCount, foldersLst = createTree(testTree)

getFiles( fileTree, foldersCount, folderPath = "root/subDir11/subDir21"  )

In [None]:
viewSubtree(fileTree, foldersCount, 1, 5, folderPath = "root/subDir12/subDir25/subDir33"  )

## Full Workflow

In [None]:
# Full Example

# Test
from tests.testFileSys import testTree

# from pathlib import Path
# from scripts.filesUtils import loadPickle
# from scripts.objFileSys import viewSubtree


# treePickle = loadPickle( Path("D:/Documents/XAIDA/output/ObjectStore/XChem/PerProj/data/tree/tree_2015_lb13320-1.pkl" ))

fileTree, folderTree, folderTreeMaxDepth,  foldersIDS, foldersCount, foldersLst = createTree(testTree)

# Get Path for subDir21
result = completeTraceBackPath("subDir21", fileTree, foldersCount, folderTreeMaxDepth )

path = result[0][0]
print( path )


files = getFiles( fileTree, foldersCount, folderPath = path  )
print( files[1] )