# Aconex Documents Extraction

### Summary
- This notebook uses the Aconex API to extract the list of documents hosted on the Aconex document register for analytics and reporting purposes.

### Input
> _No external input required._

### API
- API Vendor: Oracle Aconex
- Authentication: Basic (API key and credentials)

### The Execution Flow
1. Find out the number of pages.
2. Loop through the pages while storing the data on each page in a dataframe.
3. Convert the dataframe to a Spark dataframe.
4. Replace the existing data in the Delta table with the new data (written in batches).

### Output
- `dev.dept.aconex_docs`

### Installs & Imports

In [None]:
%pip install xmltodict
%pip install tqdm

In [None]:
from requests.auth import HTTPBasicAuth
from cryptography.fernet import Fernet
from tqdm.notebook import trange, tqdm

import xmltodict, keyring, requests
import pandas as pd
import numpy as np
import time

from delta.tables import DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

### Global Variables

In [None]:
# Delta table path
table_name_in_catalog = "dev.dept.aconex_docs"

# Credentials
#   KEY     - Fernet key used to decrypt TOKEN
#   TOKEN   - Fernet-encrypted password used for HTTP basic auth
#   API_KEY - value sent in the "X-Application-Key" request header
KEY = "KEY"
TOKEN = "TOKEN"
API_KEY = "API KEY"

# Delta table schema: nullable string columns Column1 .. Column38, in that order
SCHEMA = StructType(
    [StructField(f"Column{n}", StringType(), True) for n in range(1, 39)]
)

### Function Definitions

In [None]:
def get_response(page_number=1, max_page_size=500, timeout=(10, 300)):
    """Fetch a single page of the document register from Aconex.

    Args:
        page_number: Page of the register to request (sent as ``page_number``).
        max_page_size: Number of documents requested per page (sent as
            ``page_size``).
        timeout: ``(connect, read)`` timeout in seconds handed to
            ``requests.get`` so a stalled connection cannot hang the notebook.

    Returns:
        A ``(parsed, status_code)`` tuple. ``parsed`` is the dictionary built
        by ``xmltodict.parse`` from the response body. When the request did not
        return HTTP 200 and the body is not well-formed XML, ``parsed`` is the
        body text instead, so the caller can still inspect the status code.

    Raises:
        xml.parsers.expat.ExpatError: If an HTTP 200 response is not
            well-formed XML.
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    from xml.parsers.expat import ExpatError

    url = "https://ca1.aconex.com/api/projects/<project id>/register"
    auth = HTTPBasicAuth("username", Fernet(KEY).decrypt(TOKEN).decode())

    headers = {
        "X-Application-Key": API_KEY
    }

    params = {
        "return_fields": "identifier1,identifier2,identifier3,identifier4,identifier5,identifier6,identifier7,identifier8,identifier9,identifier10,identifier11,identifier12,identifier13,identifier14,identifier15,identifier16,identifier17,identifier18,identifier19,identifier20,identifier21,identifier22,identifier23,identifier24,identifier25,identifier26,identifier27,identifier28,identifier29,identifier30,identifier31,identifier32,identifier33,identifier34,identifier35,identifier36,identifier38,identifier37",
        "content_search":"true",
        "search_type": "PAGED",
        "page_size": max_page_size,
        "page_number": page_number,
        "show_document_history": True
    }

    response_raw = requests.get(url, headers=headers, auth=auth, params=params, timeout=timeout)
    status_code = response_raw.status_code

    # STX (\x02) control characters are not valid in XML 1.0; swap them for
    # spaces so the parser does not reject the document.
    body = response_raw.text.replace("\x02"," ")

    try:
        response_parsed = xmltodict.parse(body, encoding='utf-8')
    except ExpatError:
        # A malformed *successful* response is a genuine error. An error
        # response may not carry XML at all, so return its text and let the
        # caller act on the status code.
        if status_code == 200:
            raise
        response_parsed = body

    return response_parsed, status_code


def df_batcher(df, batch_size):
    """Split a dataframe into a list of consecutive row batches.

    Args:
        df: Object supporting ``len()`` and positional slicing, e.g. a pandas
            DataFrame.
        batch_size: Maximum number of rows per batch; must be positive.

    Returns:
        List of slices of ``df`` in their original order. Every batch holds
        ``batch_size`` rows except possibly the last one. The list is empty
        when ``df`` has no rows.

    Raises:
        ValueError: If ``batch_size`` is zero or negative.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Stepping the range by batch_size yields the start offset of every batch,
    # including a shorter trailing one, without manual ceil-division.
    return [df[start:start + batch_size] for start in range(0, len(df), batch_size)]


### Main()

#### Initializations

In [None]:
# Fetch the first page to learn the total number of pages for the 'for' loop in the next code block

response, status_code = get_response()

if status_code != 200:
    # Fail fast: carrying on would leave total_pages undefined, or stale from
    # an earlier run of this notebook, and the fetch loop would misbehave.
    raise RuntimeError(f"{status_code}: {response}")

total_pages = int(response['RegisterSearch']['@TotalPages'])
print(f"Total Pages: {total_pages}\nTotal Items: {response['RegisterSearch']['@TotalResults']}")

#### Fetch Data

In [None]:
# Define an empty dataframe; placed first in the final concat, its columns lead the combined result
df = pd.DataFrame(columns=['Column1','Column2','Column3','Column4','Column5','Column6','Column7','Column8','Column9','Column10','Column11','Column12','Column13','Column14','Column15','Column16','Column17','Column18','Column19','Column20','Column21','Column22','Column23','Column24','Column25','Column26','Column27','Column28','Column29','Column30','Column31','Column32','Column33','Column34','Column35','Column36','Column38','Column37'])

# Upper bound on attempts per page so a persistent failure (e.g. rejected credentials) cannot loop forever
max_attempts = 10

# Collect one dataframe per page and concatenate once at the end,
# instead of re-copying every accumulated row on each iteration
page_frames = [df]

# Fetch every page of the document register
for i in trange(1, total_pages + 1):
    response, status_code = get_response(page_number=i)
    attempts = 1
    while status_code != 200:
        if attempts >= max_attempts:
            raise RuntimeError(f"Page {i}: giving up after {attempts} attempts (last status {status_code})")
        time.sleep(5)
        response, status_code = get_response(page_number=i)
        attempts += 1

    # An empty <SearchResults/> element parses to None
    search_results = response['RegisterSearch']['SearchResults'] or {}
    documents = search_results.get('Document', [])
    # xmltodict returns a single dict, not a list, when the page holds exactly one <Document>
    if isinstance(documents, dict):
        documents = [documents]
    page_frames.append(pd.DataFrame(documents))

df = pd.concat(page_frames, ignore_index=True)

#### Data Prep

In [None]:
# Prepare the dataframe in one pass:
#   1. cast every value to string
#   2. rename columns
#   3. drop rows that are exact duplicates of an earlier row
df = (
    df.astype(str)
    .rename(columns={"old_column":"NewColumn"})
    .drop_duplicates()
)

In [None]:
# True when no DocumentID value occurs more than once
df["DocumentID"].is_unique

#### Data Save

In [None]:
# Break the dataframe into batches of at most 25000 rows
df_batches = df_batcher(df, batch_size=25000)

# Show how many rows landed in each batch
list(map(len, df_batches))

In [None]:
# Save the batches to the Delta table: the first batch overwrites the table, every later batch appends to it
# NOTE(review): SCHEMA is presumably applied to the pandas columns by position - confirm that each
# batch's column order matches the field order in SCHEMA.
for count, batch in enumerate(tqdm(df_batches)):
    spark_df = spark.createDataFrame(batch, schema=SCHEMA)
    write_mode = "overwrite" if count == 0 else "append"
    (
        spark_df.write.format("delta")
        .option("delta.columnMapping.mode", "name")
        .option("mergeSchema", "true")
        .mode(write_mode)
        .saveAsTable(table_name_in_catalog)
    )
    # Progress marker: index of the batch that was just written
    print(count)

### Test

In [None]:
%sql
--SELECT COUNT(DocumentID) AS Count, COUNT(DISTINCT(DocumentID)) AS DistinctCount, COUNT(DocumentID) - COUNT(DISTINCT(DocumentID)) AS Diff
--FROM dev.dept.aconex_docs;

In [None]:
# Not executed: the statements below are wrapped in a string literal.
# They sketch a Delta MERGE into the target table that updates rows whose
# DocumentID matches a source row and inserts the source rows that do not match.
'''
spark_df = spark.createDataFrame(df, schema=SCHEMA)
delta_table = DeltaTable.forName(spark, table_name_in_catalog)

delta_table.alias("target").merge(
    spark_df.alias("source"),
    "target.DocumentID = source.DocumentID"
).whenMatchedUpdateAll(
).whenNotMatchedInsertAll(
).execute()
'''