In [None]:
# -------------------------------------------------------------------------------
# Data Fetcher Script for ETL Pipeline
# -------------------------------------------------------------------------------
# This script forms the core of an ETL pipeline designed to automate the extraction of data from
# multiple external APIs, including educational assessments, user engagement, and program
# performance metrics.
# 
# This workbook covers the first step of the pipeline, the Extract phase, which involves fetching data from the API:
# 1. Pipeline: API Call
#    - Purpose: Retrieve data from the API, including information on users, 
#      groups, risk scores, phishing tests, and training campaigns.
#    - Description: This stage uses API endpoints to gather raw data, transforming it 
#      into a structured format for ingestion.
# 
# The main functionalities of this script include:
#
# 1. Securely fetching data from diverse API endpoints, each related to specific aspects of educational
#    programs and participant activities.
# 2. Normalizing and transforming this data into a structured format suitable for analytical purposes,
#    ensuring consistency and readiness for SQL database integration.
# 3. Preparing the data for merging and further processing by cleaning, renaming, and converting data types
#    to ensure optimal alignment with the database schema used in downstream analytics.
# 4. Loading the processed data into dedicated Pandas DataFrames, which are then processed further in the Transform step.
#    
#
# The script is built with robust error handling and logging to track the data flow and catch issues early in
# the process, enhancing the reliability and maintainability of the data pipeline.
# -------------------------------------------------------------------------------
# End of Summary
# -------------------------------------------------------------------------------


In [None]:
# Step 0: Set up logging configuration, and import libraries
import logging
import requests
import pandas as pd
import time
import os
from requests.exceptions import RequestException
from datetime import datetime, timedelta
from pandas import json_normalize
from dotenv import load_dotenv

# Route every record at INFO level or above to two destinations at once:
# the "api_script.log" file and the console stream.
logging.basicConfig(
    handlers=(
        logging.FileHandler("api_script.log"),
        logging.StreamHandler(),
    ),
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

logging.info("Logging setup initialized with a single file and console output.")



# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Step 1: Define the API key, headers, and create functions to handle the API request with retry logic for rate limits

# Load environment variables (API_KEY is read from the process environment,
# which load_dotenv() may populate from a .env file)
load_dotenv()
api_key = os.getenv('API_KEY')

# Fail fast when the key is missing: otherwise the f-string below would build
# the literal header "Bearer None" and every later request would be sent with it.
if not api_key:
    logging.error("API_KEY is not set. Define it in the environment or in a .env file.")
    raise RuntimeError("API_KEY environment variable is not set.")

# Set up headers for API requests
headers = {'Authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'}

# Rate limiting notes from the API documentation:
# - The API is limited to 2,000 requests per day plus the number of licensed users on your account.
# - The API may only be accessed four times per second.
# - The API burst limit is 50 requests per minute.
# - Burst limits start around five (5) minutes, and the daily limit around twenty-four (24) hours,
#   from the first API request.

# Module-level state shared with fetch_data() to track time intervals for rate limiting
last_request_time = None            # timestamp of the most recent request (None before the first)
requests_in_last_minute = 0         # requests issued in the current one-minute window
minute_start_time = datetime.now()  # start of the current one-minute window

def _retry_after_seconds(header_value, default=5):
    """Return the wait time, in whole seconds, taken from a 'Retry-After' header value.

    Falls back to ``default`` when the value is missing or is not a plain
    integer (for example an HTTP-date), and never returns a negative number.
    """
    try:
        return max(int(header_value), 0)
    except (TypeError, ValueError):
        return default


def fetch_data(url, headers, params, max_retries=5, timeout=30):
    """
    Fetch JSON from a URL with a GET request while throttling to the API rate limits.

    Before each request the function enforces, client-side, at most 4 requests per
    second and 50 requests per minute, using the module-level counters
    ``last_request_time``, ``requests_in_last_minute`` and ``minute_start_time``.
    A 429 response is retried after the delay given by the 'Retry-After' header
    (5 seconds when the header is missing or not an integer), up to ``max_retries``
    times. Any other non-200 status, a request-level exception, or an undecodable
    body is logged and ends the call immediately without a retry.

    NOTE(review): only a single response is fetched. Callers in this file pass
    ``per_page=500`` but no paging is done here -- confirm whether the API paginates.

    Args:
        url (str): The URL to send the GET request to.
        headers (dict): The headers to include in the GET request.
        params (dict): The query parameters to include in the GET request.
        max_retries (int, optional): Maximum number of attempts after 429 responses. Defaults to 5.
        timeout (float, optional): Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the pipeline. Defaults to 30.

    Returns:
        dict, list or None: The decoded JSON body on success, None on any failure.
        Failures are logged; no exception is raised for them.
    """
    global last_request_time, requests_in_last_minute, minute_start_time
    retries = 0
    while retries < max_retries:
        current_time = datetime.now()

        # Start a fresh one-minute window once the current one has elapsed
        if (current_time - minute_start_time).total_seconds() >= 60:
            requests_in_last_minute = 0
            minute_start_time = current_time

        # Pause until the window ends if 50 requests were already sent in it
        if requests_in_last_minute >= 50:
            sleep_time = 60 - (current_time - minute_start_time).total_seconds()
            logging.info(f"Reached burst limit. Pausing for {sleep_time:.2f} seconds to comply with rate limits.")
            time.sleep(sleep_time)
            requests_in_last_minute = 0
            minute_start_time = datetime.now()

        # Enforce 4 requests per second. The clock is re-read here because the
        # burst pause above may already have consumed the required gap.
        if last_request_time is not None:
            since_last_request = (datetime.now() - last_request_time).total_seconds()
            if since_last_request < 0.25:
                time.sleep(0.25 - since_last_request)

        try:
            response = requests.get(url, params=params, headers=headers, timeout=timeout)
            last_request_time = datetime.now()
            requests_in_last_minute += 1

            if response.status_code == 200:
                logging.info("Data fetched successfully.")
                return response.json()

            if response.status_code == 429:
                retry_after = _retry_after_seconds(response.headers.get('Retry-After'))
                logging.warning(f"Rate limit hit. Retrying after {retry_after} seconds...")
                time.sleep(retry_after)
                retries += 1
                continue

            # Any other status is treated as non-retryable
            logging.error(f"Error {response.status_code}: {response.text}")
            return None
        except RequestException as e:
            logging.error(f"RequestException: {str(e)}")
            return None
        except ValueError as e:
            # Older versions of requests raise a plain ValueError for a body that is not valid JSON
            logging.error(f"Invalid JSON in response: {str(e)}")
            return None

    # Only reached when every attempt ended in a 429 response
    logging.error("Max retries reached. Failed to fetch data.")
    return None


# Function to apply date conversion to a list of columns
def convert_dates(df, columns):
    """
    Parse the named DataFrame columns as timezone-naive datetimes, in place.

    Each name in ``columns`` that is present in ``df`` is run through
    ``pd.to_datetime`` with ``errors='coerce'`` (values that cannot be parsed
    become NaT) and then has its timezone information stripped. Names that
    are not columns of ``df`` are ignored.

    Args:
        df (pd.DataFrame): The DataFrame whose columns are converted.
        columns (list of str): Names of the columns to convert.

    Returns:
        None: ``df`` is modified in place.

    Logs:
        One info message per converted column.
    """
    present = (name for name in columns if name in df.columns)
    for name in present:
        parsed = pd.to_datetime(df[name], errors='coerce')
        # Drop the timezone so the column holds naive timestamps
        df[name] = parsed.dt.tz_localize(None)
        logging.info(f"Converted {name} to datetime.")

def rename_and_cast_columns(df, column_mappings):
    """
    Rename columns of a DataFrame and cast the renamed targets to int64.

    The rename is applied in place. Afterwards every target name from
    ``column_mappings`` that exists in the DataFrame is converted with
    ``pd.to_numeric(errors='coerce')``; values that cannot be parsed, and
    missing values, become 0 before the cast to int64.

    Parameters:
    df (pd.DataFrame): The DataFrame containing columns to rename and cast.
    column_mappings (dict): Maps current column names to their new names.

    Returns:
    pd.DataFrame: The same DataFrame object, renamed and cast.
    """
    df.rename(columns=column_mappings, inplace=True)

    for target in column_mappings.values():
        if target not in df.columns:
            continue
        numeric = pd.to_numeric(df[target], errors='coerce')
        df[target] = numeric.fillna(0).astype('int64')
        logging.info(f"Column '{target}' renamed and cast to int64.")

    return df

# Define the helper function to convert ID columns
def convert_id_columns_to_int64(dataframes, id_columns):
    """
    Cast the listed ID columns to int64 in every DataFrame of a dictionary.

    For each DataFrame, each name in ``id_columns`` that is present is converted
    with ``pd.to_numeric(errors='coerce')``; unparseable and missing values are
    replaced by 0 before the cast. A failure on one column is logged and does
    not stop the remaining conversions.

    Parameters:
    dataframes (dict): Maps table names to the DataFrames to process.
    id_columns (list): Column names to convert wherever they are present.

    Returns:
    dict: The same dictionary, with its DataFrames modified in place.
    """
    for table_name in dataframes:
        frame = dataframes[table_name]
        for id_col in (name for name in id_columns if name in frame.columns):
            try:
                as_number = pd.to_numeric(frame[id_col], errors='coerce')
                frame[id_col] = as_number.fillna(0).astype('int64')
                logging.info(f"Converted column '{id_col}' in DataFrame '{table_name}' to int64.")
            except Exception as e:
                logging.error(f"Error converting column '{id_col}' in DataFrame '{table_name}': {e}")
    return dataframes



In [None]:
# Step 2: API Calls and Data Processing

# Log the start of the script
today = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
logging.info(f"API script started: {today}")


# -------- API 1: Data Retrieval Endpoint --------
url_data = 'https://api.example.com/v99/data_history'
params_data = {
    'include_all': 'true'  # Retrieve comprehensive historical data. Defaults to six months if not specified.
}

data_history = fetch_data(url_data, headers, params_data)
if data_history:
    df_data_history = pd.json_normalize(data_history, sep='_')
    # Convert dates to datetime
    date_cols = ['date']
    convert_dates(df_data_history, date_cols)

    # Rename date to date_recorded for insertion into SQL database
    df_data_history.rename(columns={'date': 'date_recorded'}, inplace=True)

    logging.info("Data History DataFrame created successfully.")
    logging.info("Completed call: API 1.")
else:
    # Bind the empty frame under the same name the success path uses, so later
    # code can reference df_data_history without a NameError. The blank_* name
    # is kept as an alias for anything that already relies on it.
    df_data_history = blank_df_data_history = pd.DataFrame()
    logging.warning("No data history fetched.")



# -------- API 2: Get a List of All Users Endpoint --------
url_users = 'https://api.example.com/v99/user_data'
params_users = {
    'status': 'active',  # Returns a list of all active users.
    'expand': 'details',   # Expands additional details related to each user.
    'per_page': 500  # Number of records per page (maximum allowed).
}

user_list = fetch_data(url_users, headers, params_users)
if user_list:
    df_user_list = pd.json_normalize(user_list, sep='_')

    # Create full name column
    df_user_list['full_name'] = df_user_list['first_name'] + ' ' + df_user_list['last_name']

    # Clean up department names
    df_user_list['department'] = df_user_list['department'].str.replace('_', ' ')

    # Convert dates to datetime (columns absent from the response are skipped)
    date_cols = ['last_sign_in', 'employee_start_date', 'archived_at', 'custom_date_1', 'custom_date_2']
    convert_dates(df_user_list, date_cols)

    # Rename columns for appropriate ID references
    df_user_list.rename(columns={'id': 'user_id', 'employee_number': 'internal_id'}, inplace=True)

    logging.info("Users DataFrame created successfully.")
    logging.info("Completed call: API 2.")

else:
    # Bind the empty frame under the same name the success path uses, so later
    # code can reference df_user_list without a NameError. The blank_* name is
    # kept as an alias for anything that already relies on it.
    df_user_list = blank_df_user_list = pd.DataFrame()
    logging.warning("No user data fetched.")





# -------- API 3: Get a Specific User's Historical Data Endpoint --------
# This will be done in a loop for each user, fetching their historical data separately as one API call per user.
# Method: Loop through each unique user ID, fetch their historical data, and store it in a list of DataFrames.

# Define unique user IDs from the dataframe.
# Guarded so that a failed or empty users call skips the loop instead of raising.
if 'df_user_list' in locals() and 'user_id' in df_user_list.columns:
    user_ids = df_user_list['user_id'].unique()
else:
    logging.error("DataFrame `df_user_list` is either not defined or missing the 'user_id' column.")
    user_ids = []  # Set to empty to avoid further errors in looping

params_user_history = {
    'complete_history': 'true'  # Include the entire historical data of the user
}

# List to store individual user historical data
user_history_dfs = []

for user in user_ids:
    # Construct URL specific to each user
    url_user_history = f'https://api.example.com/v99/users/{user}/history'
    try:
        # Make API call for each user
        user_history_data = fetch_data(url_user_history, headers, params_user_history)

        # Process the data if it's available
        if user_history_data:
            df_user_history = pd.json_normalize(user_history_data, sep='_')

            # Convert dates to datetime if 'date' column exists
            date_cols = ['date']
            convert_dates(df_user_history, date_cols)

            # Add 'user_id' column to enable proper merging
            df_user_history['user_id'] = user

            # Append processed DataFrame to the list
            user_history_dfs.append(df_user_history)
        else:
            logging.warning(f"No historical data fetched for user ID: {user}")
    except Exception as e:
        # One failing user must not abort the remaining users
        logging.error(f"Error fetching data for user ID {user}: {e}")

# Concatenate all user historical data DataFrames, if any
if user_history_dfs:
    combined_user_history_df = pd.concat(user_history_dfs, ignore_index=True)

    # Select only the necessary columns from the user list DataFrame for the merge
    df_users_subset = df_user_list[['user_id', 'internal_id', 'first_name', 'last_name',
                                    'full_name', 'department', 'job_title']]

    # Merge the combined history data with the subset of the original users listing DataFrame
    combined_user_history_df = pd.merge(combined_user_history_df, df_users_subset,
                                        how='left', on='user_id')

    logging.info("Combined Historical Data DataFrame created successfully.")
    logging.info("Completed call: API 3.")
else:
    # Bind the empty frame under the same name the success path uses; the
    # blank_* name is kept as an alias for anything that already relies on it.
    combined_user_history_df = blank_df_combined_history = pd.DataFrame()
    logging.warning("No historical data available to merge.")




# -------- API 4: Get a List of All Groups Endpoint --------
group_list_url = 'https://api.example.com/v99/group_data'
params_groups = {
    'status': 'active',  # Returns a list of all active groups.
    'per_page': 500  # Number of records per page (maximum allowed).
}
list_of_groups = fetch_data(group_list_url, headers, params_groups)

# Process list of groups data
if list_of_groups:
    df_groups = pd.json_normalize(list_of_groups, sep='_')

    # Rename columns for appropriate ID references
    df_groups.rename(columns={'id': 'group_id', 'name':'group_name'}, inplace=True)

    logging.info("Group ID column renamed successfully.")
    logging.info("Groups DataFrame created successfully.")
    logging.info("Completed call: API 4.")
else:
    # Bind the empty frame under the same name the success path uses, so later
    # code can reference df_groups without a NameError. The blank_* name is
    # kept as an alias for anything that already relies on it.
    df_groups = blank_df_groups = pd.DataFrame()
    logging.warning("No group data fetched.")





# -------- API 5: Get a List of All Users in All Groups Endpoint --------
# Ensure df_groups exists and contains the 'group_id' column before proceeding
if 'df_groups' in locals() and 'group_id' in df_groups.columns:
    group_ids = df_groups['group_id'].unique()  # Get unique group IDs
else:
    logging.error("DataFrame `df_groups` is either not defined or missing the 'group_id' column.")
    group_ids = []  # Set to empty to avoid further errors in looping

# This will be done in a loop for each group
params_group_users = {
    'include_full_details': 'true',  # Include all user details
    'per_page': 500  # Number of records per page (max 500)
}
# group_ids is deliberately NOT recomputed here: an unconditional
# df_groups['group_id'] lookup would bypass the guard above.

all_users_all_groups_dfs = []
for group_id in group_ids:
    # Construct URL specific to each group
    url_all_users_all_groups = f"https://api.example.com/v99/groups/{group_id}/users"
    try:
        # Make API call for each group
        all_users_in_group = fetch_data(url_all_users_all_groups, headers, params_group_users)

        # Process the data if it's available
        if all_users_in_group:
            df = pd.DataFrame(all_users_in_group)

            # Add 'group_id' column for group association
            df['group_id'] = group_id

            # Append processed DataFrame to the list
            all_users_all_groups_dfs.append(df)
        else:
            logging.warning(f"No data fetched for group ID: {group_id}")
    except Exception as e:
        # One failing group must not abort the remaining groups
        logging.error(f"Error fetching data for group ID {group_id}: {e}")

# Concatenate all user data across groups if any data was collected
if all_users_all_groups_dfs:
    combined_df_all_users_all_groups = pd.concat(all_users_all_groups_dfs, ignore_index=True)

    # Convert dates to datetime where applicable
    date_cols = ['joined_on', 'last_sign_in', 'employee_start_date', 'custom_date_1', 'custom_date_2']
    convert_dates(combined_df_all_users_all_groups, date_cols)
    # Rename ID columns for clarity
    column_mappings = {'id': 'user_id', 'employee_number': 'internal_id'}
    combined_df_all_users_all_groups = rename_and_cast_columns(combined_df_all_users_all_groups, column_mappings)

    logging.info("Combined all users across groups DataFrame created successfully.")
    logging.info("Completed call: API 5.")
else:
    # Bind the empty frame under the same name the success path uses; the
    # blank_* name is kept as an alias for anything that already relies on it.
    combined_df_all_users_all_groups = blank_df_all_users_all_groups = pd.DataFrame()
    logging.warning("No user data available to merge across groups.")




# -------- API 6: Get a Specific Group's Historical Data Endpoint --------
# This will also be done in a looped manner for each group similar to users.
# Guarded so that a failed or empty groups call skips the loop instead of raising.
if 'df_groups' in locals() and 'group_id' in df_groups.columns:
    group_ids = df_groups['group_id'].unique()
else:
    logging.error("DataFrame `df_groups` is either not defined or missing the 'group_id' column.")
    group_ids = []  # Set to empty to avoid further errors in looping

params_group_history = {
    'include_complete_history': 'true'  # Include the entire historical data of the group
}

# List to store individual group historical data
group_history_dfs = []
for group in group_ids:
    # Construct URL specific to each group
    url_group_history = f'https://api.example.com/v99/groups/{group}/history'
    try:
        # Make API call for each group
        group_history = fetch_data(url_group_history, headers, params_group_history)

        # Process the data if it's available
        if group_history:
            df_group_history = pd.json_normalize(group_history, sep='_')

            # Convert dates to datetime if 'date' column exists
            date_cols = ['date']
            convert_dates(df_group_history, date_cols)

            # Add 'group_id' column to enable proper merging
            df_group_history['group_id'] = group

            # Append processed DataFrame to the list
            group_history_dfs.append(df_group_history)
        else:
            logging.warning(f"No historical data fetched for group ID: {group}")
    except Exception as e:
        # One failing group must not abort the remaining groups
        logging.error(f"Error fetching data for group ID {group}: {e}")

# Concatenate all group historical data DataFrames, if any
if group_history_dfs:
    combined_df_group_history = pd.concat(group_history_dfs, ignore_index=True)

    # Select only the necessary columns from df_groups for the merge
    df_group_subset = df_groups[['group_id', 'group_name', 'member_count']]

    # Merge the combined history data with the subset of the original groups DataFrame
    combined_df_group_history = pd.merge(combined_df_group_history, df_group_subset,
                                         how='left', on='group_id')
    logging.info("Combined Historical Data DataFrame created successfully.")
    logging.info("Completed call: API 6.")
else:
    # Bind the empty frame under the same name the success path uses; the
    # blank_* name is kept as an alias for anything that already relies on it.
    combined_df_group_history = blank_df_combined_group_history = pd.DataFrame()
    logging.warning("No historical data available to merge.")





# -------- API 7: Get All Security Assessments Endpoint --------
url_security_assessments = 'https://api.example.com/v99/security/assessments'
params_security_assessments = {
    'assessment_type': 'callback',  # Returns information related to callback type assessments.
    'per_page': 500  # Number of records per page (maximum allowed).
}

data_security_assessments = fetch_data(url_security_assessments, headers, params_security_assessments)
if data_security_assessments:
    df_security_assessments = pd.json_normalize(data_security_assessments, sep='_')

    # Convert dates to datetime
    date_cols = ['start_date', 'end_date']
    convert_dates(df_security_assessments, date_cols)

    logging.info("Security Assessments DataFrame created successfully.")
    logging.info("Completed call: API 7.")
else:
    # Bind the empty frame under the same name the success path uses, so later
    # code can reference df_security_assessments without a NameError. The
    # blank_* name is kept as an alias for anything that already relies on it.
    df_security_assessments = blank_df_security_assessments = pd.DataFrame()
    logging.warning("No security assessment data fetched.")




    
# -------- API 8: Get Specific Campaign Data Endpoint --------
# Define unique campaign IDs from the security assessments dataframe.
# Guarded so that a failed or empty assessments call skips the loop instead of raising.
if 'df_security_assessments' in locals() and 'campaign_id' in df_security_assessments.columns:
    campaign_ids = df_security_assessments['campaign_id'].unique()
else:
    logging.error("DataFrame `df_security_assessments` is either not defined or missing the 'campaign_id' column.")
    campaign_ids = []  # Set to empty to avoid further errors in looping

params_campaign = {
    'per_page': 500  # Number of records per page (maximum allowed)
}

# Initialize lists to store campaign data and related group data
campaign_data_dfs = []
group_bridge_dfs = []

for campaign in campaign_ids:
    # Construct URL specific to each campaign
    url_campaigns = f'https://api.example.com/v99/campaigns/{campaign}'
    try:
        # Make API call for each campaign
        campaigns_data = fetch_data(url_campaigns, headers, params_campaign)

        # Process the data if available
        if campaigns_data:
            # Normalize the main campaign data, extracting nested records separately
            df_campaign = pd.json_normalize(
                campaigns_data,
                record_path='details',
                meta=['campaign_id', 'name', 'phish_prone_percentage', 'last_run', 'status',
                      'send_duration', 'track_duration', 'frequency', 'create_date'],
                record_prefix='detail_',
                errors='ignore'
            )

            df_groups_bridge = pd.json_normalize(
                campaigns_data,
                record_path='groups',
                meta=['campaign_id', 'name'],
                record_prefix='group_',
                errors='ignore'
            )

            # Add each normalized DataFrame to the respective lists.
            # Both are appended only after both normalizations succeeded.
            campaign_data_dfs.append(df_campaign)
            group_bridge_dfs.append(df_groups_bridge)

            # Log success message
            logging.info(f"Data for Campaign ID {campaign} processed successfully.")
        else:
            logging.warning(f"No data fetched for Campaign ID: {campaign}")
    except Exception as e:
        # One failing campaign must not abort the remaining campaigns
        logging.error(f"Error fetching data for Campaign ID {campaign}: {e}")

# Concatenate all campaign and group bridge dataframes if any data was collected
if campaign_data_dfs:
    combined_df_campaigns = pd.concat(campaign_data_dfs, ignore_index=True)

    # Define columns that need datetime conversion
    date_columns = ['detail_start_date', 'last_run', 'create_date']
    convert_dates(combined_df_campaigns, date_columns)

    # Calculate 'campaign_end_date' by adding duration to 'last_run', if both columns exist.
    # NOTE(review): the normalization above prefixes record columns with 'detail_' and the
    # meta list has no 'duration_days', so this branch may never run -- confirm the column name.
    if 'last_run' in combined_df_campaigns.columns and 'duration_days' in combined_df_campaigns.columns:
        combined_df_campaigns['campaign_end_date'] = combined_df_campaigns['last_run'] + pd.to_timedelta(combined_df_campaigns['duration_days'], unit='D')
        convert_dates(combined_df_campaigns, ['campaign_end_date'])

    logging.info("Combined Campaign Data DataFrame created successfully.")
else:
    # Bind the empty frame under the same name the success path uses; the
    # blank_* name is kept as an alias for anything that already relies on it.
    combined_df_campaigns = blank_df_campaigns = pd.DataFrame()
    logging.warning("No campaign data available to merge.")

if group_bridge_dfs:
    combined_df_groups_bridge = pd.concat(group_bridge_dfs, ignore_index=True)

    # Define column mappings for renaming
    column_mappings = {
        'group_group_id': 'group_id',
    }
    combined_df_groups_bridge = rename_and_cast_columns(combined_df_groups_bridge, column_mappings)
    logging.info("Combined Groups-to-Campaign Bridge DataFrame created successfully.")
    logging.info("Completed call: API 8.")
else:
    # Same naming rule as above, so combined_df_groups_bridge always exists afterwards
    combined_df_groups_bridge = blank_df_groups_bridge = pd.DataFrame()
    logging.warning("No group bridge data available to merge.")


# -------- API 9: Get All Assessment Results Endpoint --------
# Looped approach for each security assessment.
# Guarded so that a failed or empty assessments call skips the loop instead of raising.
if 'df_security_assessments' in locals() and 'assessment_id' in df_security_assessments.columns:
    assessment_ids = df_security_assessments['assessment_id'].unique()
else:
    logging.error("DataFrame `df_security_assessments` is either not defined or missing the 'assessment_id' column.")
    assessment_ids = []  # Set to empty to avoid further errors in looping

params_assessment_results = {
    'per_page': 500  # Number of records per page (maximum allowed)
}

assessment_results_dfs = []  # Initialize list to store each assessment results DataFrame

for assessment_id in assessment_ids:
    # Construct URL specific to each assessment
    url_assessment_results = f'https://api.example.com/v99/assessments/{assessment_id}/results'
    try:
        # Make API call for each assessment
        assessment_data = fetch_data(url_assessment_results, headers, params_assessment_results)

        # Process the data if available
        if assessment_data:
            # Convert JSON data to a DataFrame
            df = pd.DataFrame(assessment_data)

            # Add a column for the Assessment ID to relate recipients back to the assessment
            df['assessment_id'] = assessment_id

            # Append the data to the results list
            assessment_results_dfs.append(df)

            logging.info(f"Data for Assessment ID {assessment_id} processed successfully.")
        else:
            logging.warning(f"No data fetched for Assessment ID: {assessment_id}")
    except Exception as e:
        # One failing assessment must not abort the remaining assessments
        logging.error(f"Error fetching data for Assessment ID {assessment_id}: {e}")

# Concatenate all Assessment DataFrames if any data was collected
if assessment_results_dfs:
    combined_df_assessment_results = pd.concat(assessment_results_dfs, ignore_index=True)

    # Normalize the DataFrame: round-trip through records so nested dict values
    # are flattened into '_'-separated columns
    combined_df_assessment_results = json_normalize(combined_df_assessment_results.to_dict(orient='records'), sep='_')

    # Date columns to convert, if any (columns absent from the data are skipped).
    # NOTE(review): 'qr_code_scanned' is the only name without an '_at' suffix -- confirm it.
    date_columns = ['scheduled_at', 'delivered_at', 'opened_at',
       'clicked_at', 'replied_at', 'attachment_opened_at', 'macro_enabled_at',
       'data_entered_at', 'qr_code_scanned', 'reported_at', 'bounced_at']
    convert_dates(combined_df_assessment_results, date_columns)

    logging.info("Combined Assessment Results Data DataFrame created successfully.")
    logging.info("Completed call: API 9.")
else:
    logging.warning("No Assessment data available to merge.")
    # Bind the empty frame under the same name the success path uses; the
    # blank_* name is kept as an alias for anything that already relies on it.
    combined_df_assessment_results = blank_assessment_results = pd.DataFrame()




# -------- API 10: Get All Training Registrations Endpoint --------
url_registrations = 'https://api.example.com/v99/training/registrations'
params_registrations = {
    'exclude_archived': 'false',
    'include_id_details': 'true',
    'include_purchase_info': 'true',
    'per_page': 500,  # Number of records per page (maximum allowed)
}

data_registrations = fetch_data(url_registrations, headers, params_registrations)
if data_registrations:
    df_registrations = pd.json_normalize(data_registrations, sep='_')

    # Convert dates to datetime
    date_cols = ['enrollment_date', 'start_date', 'completion_date']
    convert_dates(df_registrations, date_cols)

    # Rename ID columns for clarity
    column_mappings = {'user_id': 'registration_id'}
    df_registrations = rename_and_cast_columns(df_registrations, column_mappings)

    # Feature Engineering
    # (the three date columns above must be present in the response; a missing one raises KeyError)
    df_registrations['duration_hours'] = (df_registrations['completion_date'] - df_registrations['start_date']).dt.total_seconds() / 3600
    df_registrations['duration_minutes'] = (df_registrations['completion_date'] - df_registrations['start_date']).dt.total_seconds() / 60
    df_registrations['ongoing_flag'] = df_registrations['start_date'].notna() & df_registrations['completion_date'].isna()
    df_registrations['completed_flag'] = df_registrations['completion_date'].notna().astype(int)
    df_registrations['enrollment_to_start_days'] = (df_registrations['start_date'] - df_registrations['enrollment_date']).dt.days
    df_registrations['start_to_completion_days'] = (df_registrations['completion_date'] - df_registrations['start_date']).dt.days
    df_registrations['enrollment_to_completion_days'] = (df_registrations['completion_date'] - df_registrations['enrollment_date']).dt.days
    current_date = pd.Timestamp.now().tz_localize(None)
    df_registrations['enrollment_duration_days'] = (current_date - df_registrations['enrollment_date']).dt.days

    logging.info("Training Registrations DataFrame created successfully.")
    logging.info("Completed call: API 10.")
else:
    # Bind the empty frame under the same name the success path uses, so later
    # code can reference df_registrations without a NameError. The blank_* name
    # is kept as an alias for anything that already relies on it.
    df_registrations = blank_df_registrations = pd.DataFrame()
    logging.warning("No registration data fetched.")


# -------- API 11: Get All Educational Programs Endpoint --------
url_programs = 'https://api.example.com/v99/educational/programs'
params_programs = {
    'include_details': 'true',
    'per_page': 500  # Number of records per page (maximum allowed)
}

data_programs = fetch_data(url_programs, headers, params_programs)

if data_programs:
    # Initial flattening of the programs data
    df_programs = pd.json_normalize(data_programs, sep='_')

    # Flatten group and content fields (before applying datetime conversion)
    df_program_details = pd.json_normalize(
        data_programs,
        record_path=['groups'],
        meta=['program_id', 'name', 'status', 'duration_type', 'start_date', 'end_date', 'auto_enroll', 'completion_rate'],
        record_prefix='group_',
        errors='ignore'
    )

    df_program_content = pd.json_normalize(
        data_programs,
        record_path=['content'],
        meta=['program_id', 'name', 'status', 'duration_type', 'start_date', 'end_date', 'auto_enroll', 'completion_rate'],
        record_prefix='content_',
        errors='ignore'
    )

    # Convert dates to datetime after all json_normalize operations.
    # convert_dates skips columns a frame does not have, so a single call per
    # frame is enough; no per-column loop is needed.
    date_cols = ['start_date', 'end_date']
    convert_dates(df_programs, date_cols)
    convert_dates(df_program_details, date_cols)
    convert_dates(df_program_content, date_cols)

    # Rename the group_id column for clarity
    df_program_details.rename(columns={'group_group_id': 'group_id'}, inplace=True)

    logging.info("Educational Programs DataFrames created successfully.")
    logging.info("Completed call: API 11.")
else:
    # Bind the empty frames under the same names the success path uses, so later
    # code can reference them without a NameError. The blank_* names are kept as
    # aliases for anything that already relies on them.
    df_programs = blank_df_programs = pd.DataFrame()
    df_program_details = blank_df_program_details = pd.DataFrame()
    df_program_content = blank_df_program_content = pd.DataFrame()
    logging.warning("No educational program data fetched.")



In [None]:
# Step 3: Final DataFrames for Power BI; Merging and Conversion

# For df_registrations, merge with df_programs to include program id, start, and end dates
# Merge by first selecting the columns to comprise the right-hand dataset
df_right = df_programs[['program_id', 'start_date', 'end_date']]
df_registrations = pd.merge(df_registrations, df_right, how='left', on='program_id')

# Rename the newly merged columns to avoid conflicts.
# 'start_date' exists on both sides, so the merge suffixes it with _x (registrations) and _y (programs).
df_registrations.rename(columns={'start_date_x': 'start_date',
                                 'start_date_y': 'program_start_date',
                                 'end_date': 'program_end_date'},
                        inplace=True)

logging.info("Merged Registrations with Programs DataFrame created successfully.")

# Perform the merge to include 'program_id' in combined_df_assessment_results
combined_df_assessment_results_merged = combined_df_assessment_results.merge(
    df_security_assessments[['assessment_id', 'program_id']],
    on='assessment_id',
    how='left'
)

logging.info("Merged Assessment Results with Programs DataFrame created successfully.")

# Log the start of the script that converts ID columns to int64
logging.info("Starting conversion of ID columns to int64.")

# Dictionary of your DataFrames; the keys are the table names that will be inserted into database, and the values are the DataFrames generated from the API calls.
# The entries follow the order of the API calls above.
# NOTE(review): combined_df_campaigns (API 8) has no entry here -- confirm whether that is intended.
dataframes = {
    # API 1 result (not df_group_history, which only holds the last group's frame from the API 6 loop)
    'scia_api_educational_account_history': df_data_history,
    'scia_api_educational_users': df_user_list,
    # API 3 result (not the group history, which has its own entry below)
    'scia_api_educational_user_history': combined_user_history_df,
    'scia_api_educational_groups': df_groups,
    'scia_api_educational_users_in_groups': combined_df_all_users_all_groups,
    'scia_api_educational_group_history': combined_df_group_history,
    'scia_api_educational_assessments': df_security_assessments,
    'scia_api_educational_programs': df_programs,
    'scia_api_educational_program_details': df_program_details,
    'scia_api_educational_content': df_program_content,
    'scia_api_educational_registrations': df_registrations,
    'scia_api_educational_assessment_results': combined_df_assessment_results,
    'scia_api_educational_groups_bridge': combined_df_groups_bridge,
    'scia_api_educational_assessment_results_merged': combined_df_assessment_results_merged
}

id_columns = [
    'program_id',
    'content_purchase_id',
    'registration_id',
    'group_id',
    'internal_id',
    'user_id',
    'detail_id',
    'recipient_id'
]

# Apply the helper function to the dataframes dictionary
convert_id_columns_to_int64(dataframes, id_columns)

# Verify the ID conversion and stop early if any column failed to convert.
# An explicit check is used instead of `assert` (which is skipped under -O), and
# SystemExit(1) instead of exit() so the failure ends with a non-zero status.
for table_name, df in dataframes.items():
    for col in id_columns:
        if col in df.columns and df[col].dtype != 'int64':
            logging.error(f"Column '{col}' in DataFrame '{table_name}' is not of type int64.")
            raise SystemExit(1)

logging.info("Completed conversion of ID columns to int64.")
# Log the end of the script and signal that we are moving to the next step
logging.info(f"API script completed successfully; returned back are {len(dataframes)} dataframes processed.")
