# Aconex Workflow Extraction

### Summary
- This notebook uses the Aconex API to extract all ONxpress workflows for analytics & reporting purposes.

### Input
> _No external input required._

### API
- API Vendor: Oracle Aconex
- Authentication: Basic (API key and credentials)

### The Execution Flow
1. Find out the number of pages.
2. Loop through the pages while storing the data on each page in a dataframe.
3. Convert the pandas DataFrame to a Spark DataFrame, one batch at a time.
4. Overwrite the Delta table with the new data (the first batch overwrites, the remaining batches are appended).

### Output
- `dev.dept.aconex_workflows`

### Installs & Imports

In [None]:
%pip install xmltodict
%pip install tqdm

In [None]:
from requests.auth import HTTPBasicAuth
from cryptography.fernet import Fernet
from tqdm.notebook import trange, tqdm

import xmltodict, keyring, requests
import pandas as pd
import numpy as np
import time
import ast

from delta.tables import DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

### Global Variables

In [None]:
# Delta table path (name of the target table, passed to saveAsTable)
table_name_in_catalog = "dev.dept.aconex_workflows"

# Credentials
# KEY is the Fernet key used to decrypt TOKEN (the encrypted Basic-auth password);
# API_KEY is sent with every request as the "X-Application-Key" header.
KEY = "KEY"
TOKEN = "TOKEN"
API_KEY = "API KEY"

# Delta table schema: eighteen nullable string columns named "column1" .. "column18"
SCHEMA = StructType([
    StructField(f"column{position}", StringType(), True)
    for position in range(1, 19)
])

### Function Definitions

In [None]:
# Function to fetch a single page of the Workflow module data from Aconex
def get_response(page_number = 1, max_page_size = 500, timeout = (10, 120)):
    """Fetch one page of workflow search results from the Aconex API.

    Parameters
    ----------
    page_number : int
        Page to request; sent as the ``page_number`` query parameter.
    max_page_size : int
        Number of items per page; sent as the ``page_size`` query parameter.
    timeout : float or (float, float)
        Passed to ``requests.get``. A tuple is ``(connect, read)`` seconds.
        Without a timeout a stalled connection would block the notebook forever.

    Returns
    -------
    tuple
        ``(parsed_body, status_code)``. ``parsed_body`` is the dictionary
        produced by ``xmltodict.parse``. If the body cannot be parsed as XML
        and the status is not 200, ``parsed_body`` is the raw response text
        instead, so the caller can still inspect the status code and retry.

    Raises
    ------
    xml.parsers.expat.ExpatError
        If a 200 response carries a body that is not well-formed XML.
    requests.exceptions.RequestException
        On connection failures or when ``timeout`` is exceeded.
    """
    from xml.parsers.expat import ExpatError

    url = "https://ca1.aconex.com/api/projects/1879053393/workflows"
    auth = HTTPBasicAuth("mpatel", Fernet(KEY).decrypt(TOKEN).decode())

    headers = {
        "X-Application-Key": API_KEY
    }

    params = {
        "page_size": max_page_size,
        "page_number": page_number,
        #updated_after: 
    }

    response_raw = requests.get(url, headers=headers, auth=auth, params=params, timeout=timeout)

    # The \x02 control character is replaced by a space before parsing
    # (presumably because it is not a legal XML 1.0 character -- confirm).
    body = response_raw.text.replace("\x02"," ")

    try:
        response_parsed = xmltodict.parse(body, encoding='utf-8')
    except ExpatError:
        # Error responses may have an empty or non-XML body. Hand back the raw
        # text so the status code reaches the caller; a malformed *successful*
        # response is still a hard error.
        if response_raw.status_code == 200:
            raise
        response_parsed = body

    return response_parsed, response_raw.status_code


# Function to split a dataframe into list of dataframe batches
def df_batcher(df, batch_size):
    """Slice ``df`` into consecutive chunks of at most ``batch_size`` rows.

    Returns a list of slices in original order; the last slice holds the
    remainder when ``len(df)`` is not a multiple of ``batch_size``.
    """
    # Ceiling division: number of chunks needed to cover every row
    chunk_count = -(-len(df) // batch_size)
    return [
        df[chunk_no * batch_size:(chunk_no + 1) * batch_size]
        for chunk_no in range(chunk_count)
    ]


# Function to parse through the assignees dictionary and return a string of concatenated assignee names
def parse_assignees(dict_value):
    """Build a display string from a workflow's assignees entry.

    Parameters
    ----------
    dict_value : dict or None
        Mapping with an ``'Assignee'`` key whose value is either a single
        assignee mapping or a list of assignee mappings, each having
        ``'Name'`` and ``'OrganizationName'`` keys. Missing values
        (``None``, NaN, or anything that is not a mapping) are tolerated.

    Returns
    -------
    str or None
        ``"Name - OrganizationName"`` for each assignee, joined with ``", "``.
        ``None`` when there is no usable assignee information.
    """
    # Rows without assignees arrive as None/NaN rather than a mapping
    if not isinstance(dict_value, dict):
        return None

    assignees = dict_value.get('Assignee')

    # A single assignee is a mapping, several are a list of mappings.
    # isinstance (not type ==) so dict subclasses such as OrderedDict also match.
    if isinstance(assignees, dict):
        assignees = [assignees]
    elif not isinstance(assignees, list):
        return None

    # Concat each Assignee Name and Organization Name, and join with comma
    return ', '.join(f"{person['Name']} - {person['OrganizationName']}" for person in assignees)

### Main()

#### Initializations

In [None]:
# Get total number of pages for the 'for' loop in next code block

response, status_code = get_response()

if status_code != 200:
    # Fail loudly: merely printing would leave total_pages undefined (or stale
    # from an earlier run of this notebook) for the fetch loop that follows.
    raise RuntimeError(f"Aconex workflow request failed with status {status_code}: {response}")

# '@TotalPages' and '@TotalResults' are attributes of the WorkflowSearch element
total_pages = int(response['WorkflowSearch']['@TotalPages'])
print(f"Total Pages: {total_pages}\nTotal Items: {response['WorkflowSearch']['@TotalResults']}")

#### Fetch Data

In [None]:
# Define an empty dataframe that fixes the leading column order of the result
df = pd.DataFrame(columns=['column1','column2','column3','column4','column5','column6','column7','column8','column9','column10','column11','column12','column13','column14','column15','column16','column17','column18'])

# Retry policy for pages that do not come back with HTTP 200
MAX_ATTEMPTS_PER_PAGE = 12
RETRY_DELAY_SECONDS = 5

# Collect one dataframe per page and concatenate once at the end
# (concatenating inside the loop copies all previous rows on every page).
page_frames = [df]

# Parse through each page of the Workflows module and collect its data
for i in trange(1,total_pages+1):
    response, status_code = get_response(page_number=i)
    attempts = 1
    while status_code != 200:
        # Give up eventually instead of looping forever on a persistent error
        if attempts >= MAX_ATTEMPTS_PER_PAGE:
            raise RuntimeError(
                f"Page {i} still failing with status {status_code} after {attempts} attempts"
            )
        time.sleep(RETRY_DELAY_SECONDS)
        response, status_code = get_response(page_number=i)
        attempts += 1

    # An empty <SearchResults/> element is parsed as None
    search_results = response['WorkflowSearch']['SearchResults'] or {}
    page_content = search_results.get('Workflow', [])

    # xmltodict yields a single mapping (not a list) when the page holds one workflow
    if isinstance(page_content, dict):
        page_content = [page_content]

    page_frames.append(pd.DataFrame(page_content))

df = pd.concat(page_frames, ignore_index=True)

#### Data Prep

In [None]:
# Keep only the columns needed downstream
df = df[['column1','column2','column3','column4','column5','column6','column7','column8','column9','column10','column11','column12','column13','column14','column15','column16','column17','column18']]

# Derive the AssignedTo column from the raw Assignees entries
df['AssignedTo'] = df['Assignees'].apply(parse_assignees)

# Drop the raw Assignees column, cast every value to string, then rename columns
df = (
    df.drop(columns='Assignees')
      .astype(str)
      .rename(columns={"old_column": "NewColumn"})
)

# Removal of true duplicate rows is currently disabled
#df.drop_duplicates(inplace=True)

In [None]:
# Check for duplicate Workflow IDs (True means every WorkflowID occurs exactly once)
df["WorkflowID"].nunique(dropna=False) == len(df)

In [None]:
# Display the prepared dataframe for visual inspection
df

#### Data Save

In [None]:
# Break the dataframe into batches of at most 25,000 rows each
df_batches = df_batcher(df, batch_size=25000)

# Show how many rows landed in each batch
list(map(len, df_batches))

In [None]:
# Write the batches to the delta table one after another:
# the first batch replaces the table contents, every later batch is appended.
for batch_index in trange(len(df_batches)):
    write_mode = "overwrite" if batch_index == 0 else "append"
    (
        spark.createDataFrame(df_batches[batch_index], schema=SCHEMA)
        .write.format("delta")
        .option("delta.columnMapping.mode", "name")
        .option("mergeSchema", "true")
        .mode(write_mode)
        .saveAsTable(table_name_in_catalog)
    )
    # Progress marker: index of the batch that was just written
    print(batch_index)