### Imports and Preparation

In [None]:
#!pip install tqdm
import time
import pandas as pd                                     # Store and process tabular data in dataframes
import requests                                         # Execute HTTP requests. GET requests in this file.
from requests.adapters import HTTPAdapter               # Customize the HTTP adapter used by the requests session
from requests.packages.urllib3.util.retry import Retry  # Retry method to define custom retry strategy
#from tqdm.notebook import trange, tqdm                 # Library for progress bar

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

In [None]:
### GLOBAL VARIABLES
# Table names in the catalog.
# One destination table per scanned folder; they are referenced by the
# "database" entries of the `folders` list in the Main section.
table_name_in_catalog1 = "dev.dept.acc_folder1"
table_name_in_catalog2 = "dev.dept.acc_folder2"

# URL to request access token.
# getAccessToken() sends a GET request to this URL and reads "access_token"
# from the JSON reply.
# NOTE(review): the value below looks like a placeholder -- set the real endpoint before running.
ACCESS_TOKEN_URL = "OAuth2 bearer token URL"

# ACC project id, interpolated into the folder-contents URL built by getAPIResponse().
# NOTE(review): the value below looks like a placeholder -- set the real project id before running.
PROJECT_ID = "project_id"

### Function Definitions

In [None]:
def getAccessToken():
    """Get access token for authorization.

    Sends a GET request to ``ACCESS_TOKEN_URL`` and reads the ``access_token``
    field from the JSON body of the reply.

    Returns:
        str: access token string

    Raises:
        requests.HTTPError: if the token endpoint answers with a 4xx/5xx status.
        KeyError: if the JSON reply does not contain ``access_token``.
    """
    token_response = requests.get(ACCESS_TOKEN_URL)
    # Surface the real HTTP failure instead of an obscure KeyError / JSON decode
    # error when the token endpoint rejects the request.
    token_response.raise_for_status()
    return token_response.json()["access_token"]


def getAPIResponse(folder_id, page_num=0, max_retries=None, retry_delay=1):
    """Get one page of a folder's contents from the API, retrying while the request fails.

    Args:
        folder_id (str): id of the current folder
        page_num (int, optional): page number for a paginated result. Defaults to 0, in which
            case the URL is built without any paging query parameters.
        max_retries (int, optional): maximum number of retries after the first failed attempt.
            Defaults to None, which keeps retrying until a 200 response is received.
        retry_delay (float, optional): seconds to wait before each retry. Defaults to 1.

    Returns:
        requests.Response: response object of the GET request. Its status code is 200 unless
        ``max_retries`` was given and exhausted, in which case the last failed response is returned.
    """
    url = f"https://developer.api.autodesk.com/data/v1/projects/{PROJECT_ID}/folders/{folder_id}/contents"
    if page_num != 0:
        url += f"?page%5Bnumber%5D={page_num}&page%5Blimit%5D=200"

    retries = 0
    while True:
        # A fresh access token is requested for every attempt
        header = {"Authorization": f"Bearer {getAccessToken()}"}
        response = requests.get(url, headers=header)
        if response.status_code == 200:
            return response

        # Log every failed attempt so that a stuck retry loop is visible in the output
        print(response.status_code)
        if max_retries is not None and retries >= max_retries:
            # Retry budget exhausted: hand the failed response back to the caller
            return response

        retries += 1
        # Wait before every retry (including the first one) to avoid hammering the API
        time.sleep(retry_delay)


def getItemAttr(currData, currFolderPath):
    """Extract item attributes and metadata

    Args:
        currData (dict): json data of a single item in dict format. Must contain
            "attributes", "links" -> "webView" -> "href" and
            "relationships" -> "tip" -> "data" -> "id".
        currFolderPath (str): current folder path

    Returns:
        list: list of attributes of an item, in the order
        [folder_path, file_displayName, file_url, file_version, file_last_modified,
        file_last_modified_by, file_created, file_created_by]

    Raises:
        KeyError: if any of the expected keys is missing from currData.
    """
    attributes = currData["attributes"]
    tip_version_id = currData["relationships"]["tip"]["data"]["id"]

    # The version number is everything after the last "version=" in the tip id.
    # Taking only the final character would truncate multi-digit versions (12 -> "2").
    _, marker, file_version = tip_version_id.rpartition("version=")
    if not marker:
        # No "version=" marker in the id: keep the previous last-character behaviour
        file_version = tip_version_id[-1]

    return [
        f"{currFolderPath}/",
        attributes["displayName"],
        currData["links"]["webView"]["href"],
        file_version,
        attributes["lastModifiedTime"],
        attributes["lastModifiedUserName"],
        attributes["createTime"],
        attributes["createUserName"],
    ]


def get_all_files(folder_id, folderPath):
    """Primary function to recursively extract all the files from the target folder and subfolders within

    Every page of the folder's contents is processed in order. The first response is
    reused as page 0, so a paginated folder is not requested twice for its first page.

    Args:
        folder_id (str): current id of the target folder
        folderPath (str): current path of the folder in storage system

    Returns:
        list: list of lists containing attributes/metadata of extracted files. Empty if the
        first request fails; if a later page fails, the files collected so far are returned.
    """
    files = []
    page_num = 0

    # get response obj from API for folder with folder id 'folder_id' (first page)
    response = getAPIResponse(folder_id)

    while True:
        # Check if the GET request was successful
        if response.status_code != 200:
            print(f"Error: {response.status_code} - {response.text}")
            break

        page = response.json()

        # Iterate through all the items in the page
        for item in page["data"]:
            if item["type"] == "items":
                # type:item (file) -> extract file attributes
                files.append(getItemAttr(item, folderPath))
            elif item["type"] == "folders":
                # type:folder -> recurse into the subfolder with its own path
                subFolderPath = f"{folderPath}/{item['attributes']['displayName']}"
                files.extend(get_all_files(item["id"], subFolderPath))

        # "next" is only present in the links when the current page was not the last
        if "next" not in page["links"]:
            break

        page_num += 1
        response = getAPIResponse(folder_id, page_num)

    return files
    

### Main

In [None]:
# Target folders to crawl. Each entry holds the folder id (urn), the folder path that
# prefixes every extracted file path, and the catalog table the result is written to
folders = [
    dict(id="urn of acc folder1", path="Folder/Path/1", database=table_name_in_catalog1),
    dict(id="urn of acc folder2", path="Folder/Path/2", database=table_name_in_catalog2),
]

# Dataframe Schema for Spark: every column is a nullable string, listed in the same
# order as the attribute list returned by getItemAttr
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "folder_path",
        "file_displayName",
        "file_url",
        "file_version",
        "file_last_modified",
        "file_last_modified_by",
        "file_created",
        "file_created_by",
    )
])

In [None]:
# Process each target folder: crawl it, build a Spark dataframe and store it as a table
for folder in folders:
    # Collect the attributes of every file in the folder and all of its subfolders
    files_list = get_all_files(folder["id"], folder["path"])
    leaf_folder = folder["path"].rsplit("/", 1)[-1]
    print(f"{leaf_folder}: {len(files_list)}")

    # Build the dataframe from the collected rows using the predefined schema
    files_df = spark.createDataFrame(files_list, schema=schema)
    print("DataFrame Created")

    # Overwrite the destination databricks table with the fresh data
    table_writer = files_df.write.option("mergeSchema", "true").mode("overwrite")
    table_writer.saveAsTable(folder["database"])
    print("Table saved!")