In [1]:
#Setup adapted from the other example notebooks (03-05-2024)
#Install the prerequisites
import sys
! {sys.executable} -m pip install --upgrade 'xdmod-data>=1.0.0,<2.0.0' python-dotenv tabulate
#Configure formatting
import sys
def exception_handler(exception_type, exception, traceback):
    """Print a one-line ``ExceptionType: message`` summary to stderr.

    Args:
        exception_type: Class of the raised exception.
        exception: The exception instance (formatted with ``%s``).
        traceback: Unused here; accepted so the signature matches the hook
            this function is installed into.
    """
    print("%s: %s" % (exception_type.__name__, exception), file=sys.stderr)


# Install the handler once on the running shell. `_showtraceback` is a private
# IPython attribute, so this may need revisiting when IPython is upgraded.
get_ipython()._showtraceback = exception_handler
from IPython.display import display, Markdown
def display_df_md_table(df):
    """Render ``df`` in the notebook as a Markdown table.

    Newlines inside cell values are replaced with ``<br/>`` first so that a
    multi-line value stays inside a single table cell.
    """
    single_line_df = df.replace('\n', '<br/>', regex=True)
    table_md = single_line_df.to_markdown()
    return display(Markdown(table_md))
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# Imported only for its side effect; nothing from the module is referenced by
# name. NOTE(review): presumably this is what registers the "timeseries"
# Plotly template selected on the next line -- confirm against xdmod-data docs.
import xdmod_data.themes
# Make "timeseries" the default template for every Plotly figure created below.
pio.templates.default = "timeseries"



In [32]:
#Set up environment and connection
from pathlib import Path
from os.path import expanduser
# Location of the dotenv file that holds the XDMoD API token.
xdmod_data_env_path = Path(expanduser('~/xdmod-data.env'))
try:
    # Opening for reading is only an existence probe; the contents are ignored.
    open(xdmod_data_env_path).close()
except FileNotFoundError:
    # First run: write a template with an empty token, then restrict the file
    # to owner read/write since it is meant to hold a secret.
    with open(xdmod_data_env_path, 'w') as xdmod_data_env_file:
        xdmod_data_env_file.write('XDMOD_API_TOKEN=')
    xdmod_data_env_path.chmod(0o600)
    
#Load the variables from the env file (e.g. XDMOD_API_TOKEN) into the process
#environment; override=True lets values from the file replace ones already set.
from dotenv import load_dotenv
load_dotenv(xdmod_data_env_path, override=True)

#Connect to the server
#NOTE: This points at the test server; update it later on so the output is
#generated from the latest data.
from xdmod_data.warehouse import DataWarehouse
dw = DataWarehouse('https://xdmod-dev.ccr.xdmod.org/')

In [33]:
#Create a dictionary of all possible callable date ranges
#Adapted from xdmod-data/xdmod_data/_validator.py (03-05-2024)
from datetime import date,timedelta

def __date_add_years(old_date, year_delta):
    """Return ``old_date`` shifted by ``year_delta`` calendar years.

    Dates behave like Ext.JS: if the day does not exist in the target month,
    the overflow days are added to the last valid day of that month, e.g.
    2023-02-31 becomes 2023-03-03 (so 2024-02-29 minus one year is 2023-03-01).

    Args:
        old_date: The ``datetime.date`` to shift.
        year_delta: Number of years to add (negative to go back).

    Returns:
        The shifted ``datetime.date``.

    Raises:
        ValueError: If the target year is outside the range ``date`` supports.
    """
    new_date_year = old_date.year + year_delta
    # Without this guard the retry loop below would never terminate, because
    # every attempt with an out-of-range year raises ValueError.
    if not date.min.year <= new_date_year <= date.max.year:
        raise ValueError(
            'year %d is out of range for a date' % new_date_year
        )
    days_above = 0
    while True:
        try:
            new_date = date(
                new_date_year,
                old_date.month,
                old_date.day - days_above,
            )
            break
        except ValueError:
            # Day does not exist in that month/year: step back one day and
            # remember how many days have to be added back afterwards.
            days_above += 1
    return new_date + timedelta(days=days_above)


def get_dates_from_duration(today=None):
    """Build the mapping of duration labels to ``(start_date, end_date)``.

    Args:
        today: Reference ``datetime.date`` the ranges are computed from.
            Defaults to ``date.today()``; pass a fixed date for reproducible
            results.

    Returns:
        dict mapping each duration label (e.g. ``'Yesterday'``, ``'30 day'``,
        ``'Previous quarter'``, ``'5 year'``) and each of the seven most
        recent calendar years (as strings) to a ``(start, end)`` tuple of
        ``datetime.date`` objects.

    Raises:
        ValueError: If a range would need a year outside what ``date``
            supports.
    """
    if today is None:
        today = date.today()
    one_day = timedelta(days=1)

    yesterday = today - one_day
    last_week = today - timedelta(days=7)
    last_month = today - timedelta(days=30)
    last_quarter = today - timedelta(days=90)

    # Previous full month: it ends the day before this month starts.
    this_month_start = date(today.year, today.month, 1)
    last_full_month_end = this_month_start - one_day
    last_full_month_start = last_full_month_end.replace(day=1)

    # Quarters start in months 1, 4, 7 and 10.
    this_quarter_start = date(
        today.year,
        ((today.month - 1) // 3) * 3 + 1,
        1,
    )
    # The previous quarter ends the day before this one starts (month 3, 6,
    # 9 or 12), so it starts two months before its end month.
    last_quarter_end = this_quarter_start - one_day
    last_quarter_start = date(
        last_quarter_end.year,
        last_quarter_end.month - 2,
        1,
    )

    this_year_start = date(today.year, 1, 1)
    previous_year_start = date(today.year - 1, 1, 1)
    previous_year_end = date(today.year - 1, 12, 31)

    durations_to_dates = {
        'Yesterday': (yesterday, yesterday),
        '7 day': (last_week, today),
        '30 day': (last_month, today),
        '90 day': (last_quarter, today),
        'Month to date': (this_month_start, today),
        'Previous month': (last_full_month_start, last_full_month_end),
        'Quarter to date': (this_quarter_start, today),
        'Previous quarter': (last_quarter_start, last_quarter_end),
        'Year to date': (this_year_start, today),
        'Previous year': (previous_year_start, previous_year_end),
        '1 year': (__date_add_years(today, -1), today),
        '2 year': (__date_add_years(today, -2), today),
        '3 year': (__date_add_years(today, -3), today),
        '5 year': (__date_add_years(today, -5), today),
        '10 year': (__date_add_years(today, -10), today),
    }
    # Full calendar years: the current year followed by the six before it.
    for num_years in range(7):
        year = today.year - num_years
        durations_to_dates[str(year)] = (
            date(year, 1, 1),
            date(year, 12, 31),
        )
    return durations_to_dates

#Dictionary mapping every callable duration label to its (start, end) dates.
#It is computed once here, so the dates are relative to the day this cell ran.
possible_dates = get_dates_from_duration()

In [34]:
#Store a local copy of the warehouse metadata in memory so later cells do not
#have to call the server again (reduces network traffic)

def _label_description_rows(frame):
    """Turn a describe_* result into a list of [id, label, description] rows."""
    return [
        [row_id, row['label'], row['description']]
        for row_id, row in frame.iterrows()
    ]

metrics = {}      # realm label -> rows describing that realm's metrics
dimensions = {}   # realm label -> rows describing that realm's dimensions

with dw:
    realms = dw.describe_realms()
    # The variable name (sic) is kept as-is; it holds the aggregation units.
    aggression_units = dw.get_aggregation_units()
    durations = dw.get_durations()

    for realm in realms['label']:
        metrics[realm] = _label_description_rows(dw.describe_metrics(realm))
        dimensions[realm] = _label_description_rows(dw.describe_dimensions(realm))

In [35]:
import csv
import base64
import sys
import itertools
from io import StringIO

# User Query | Context (Chart Config, Chart Link, Chart Description) | Expected Answer  <- try to format data accordingly + Should I generate sample answers as well as questions?

def gen_filenames(realms):
    """Map each realm label to the name of the TSV file generated for it."""
    return {realm: realm + '_data.tsv' for realm in realms['label']}

output_files = gen_filenames(realms)
# Column names written as the first row of every output file.
file_header = ["ID", "URL", "Encoding String", "Sample Query", "Duration", "realm", "Metric", "Dimension", "Data Set Type", "Aggregation", "Description"]  # TODO - Finalize format later
# Running number of generated rows, shared by all realms.
count = 0

# Rows are accumulated in this in-memory buffer and flushed to disk in batches
buffer = StringIO()
writer = csv.writer(buffer, delimiter='\t')

def write_to_file(file):
    """Append the pending rows in the shared ``buffer`` to ``file``, then empty it.

    Args:
        file: Path of the output file; it is opened in append mode.
    """
    with open(file, mode='a', newline='') as out_handle:
        pending_rows = buffer.getvalue()
        out_handle.write(pending_rows)
        # Reset the buffer so the same rows are not written twice.
        buffer.truncate(0)
        buffer.seek(0)

# Takes in query values and adds a row to the TSV buffer with the chart config settings
def add_query(dur, realm, metric, dimension, data_set_type, ID, graph=None):
    """Build one chart config and sample question and buffer them as a TSV row.

    The chart config is serialized to JSON, Base64-encoded and embedded in a
    Metric Explorer URL. One row matching ``file_header`` is written through
    the shared ``writer`` and the global ``count`` is incremented.

    Args:
        dur: Duration label; must be a key of ``possible_dates``.
        realm: Realm label.
        metric: ``[id, label, description]`` of the metric.
        dimension: ``[id, label, description]`` of the dimension.
        data_set_type: ``'timeseries'`` or ``'aggregate'``.
        ID: Integer row id; zero-padded to 10 digits in the config and the row.
        graph: Chart display type (e.g. ``'line'``). When omitted, the
            module-level variable ``graph`` is used, as older callers relied
            on that global.

    Raises:
        KeyError: If ``dur`` is not a key of ``possible_dates``.
        NameError: If ``graph`` is omitted and no global ``graph`` exists.
    """
    import json  # local import so this block does not depend on other edits

    global count
    if graph is None:
        try:
            graph = globals()['graph']
        except KeyError:
            raise NameError("name 'graph' is not defined") from None

    # Newlines and tabs are stripped so the text cannot break the TSV layout.
    description = (
        "Description: " + metric[1] + "-" + metric[2] + " "
        + dimension[1] + "-" + dimension[2] + " "
    ).replace("\n", "").replace("\t", "")

    start, end = possible_dates[dur]
    start_date, end_date = start.strftime("%Y-%m-%d"), end.strftime("%Y-%m-%d")

    group_label = dimension[1] if dimension[1].lower() != 'none' else 'all'
    span_phrase = 'totaled' if data_set_type != 'timeseries' else 'over time'
    aggregation_unit = "Auto"
    row_id = f'{ID:010d}'

    # Building a dict and serializing it with json.dumps escapes quotes,
    # backslashes and non-ASCII characters in labels, which hand-formatted
    # JSON does not.
    config = {
        "featured": False,
        "trend_line": False,
        "x_axis": {},
        "y_axis": {},
        "legend": {},
        "defaultDatasetConfig": {},
        "swap_xy": False,
        "share_y_axis": False,
        "hide_tooltip": False,
        "show_remainder": False,
        "timeseries": data_set_type.lower().startswith('t'),
        "title": f"{metric[1]} per {group_label} {span_phrase}",
        "legend_type": "bottom_center",
        "font_size": 3,
        "show_filters": True,
        "show_warnings": True,
        "data_series": {
            "data": [
                {
                    "group_by": str(dimension[0]),
                    "color": "auto",
                    "log_scale": False,
                    "std_err": False,
                    "value_labels": False,
                    "display_type": str(graph),
                    "combine_type": "side",
                    "sort_type": "value_desc",
                    "ignore_global": False,
                    "long_legend": True,
                    "x_axis": False,
                    "has_std_err": True,
                    "trend_line": False,
                    "line_type": "Solid",
                    "line_width": 2,
                    "shadow": False,
                    "filters": {
                        "data": [],
                        "total": 0,
                    },
                    "z_index": 0,
                    "visibility": None,
                    "enabled": True,
                    "metric": str(metric[0]),
                    "realm": realm.replace(" ", ""),
                    "category": str(realm),
                    "id": row_id,
                },
            ],
            "total": 1,
        },
        "aggregation_unit": aggregation_unit,
        "global_filters": {
            "data": [],
            "total": 0,
        },
        "timeframe_label": str(dur),
        "start_date": start_date,
        "end_date": end_date,
        "start": 0,
        "limit": 10,
    }
    # Compact separators keep the encoded URL as short as possible.
    config_json = json.dumps(config, separators=(',', ':'))
    # Encode the JSON string in Base64
    config_encoded = base64.b64encode(config_json.encode('utf-8')).decode('utf-8')
    URL = f"https://xdmod-dev.ccr.xdmod.org/#main_tab_panel:metric_explorer?config={config_encoded}"

    # Generate Question
    question = f"Can you show me {metric[1]} per {group_label} from {realm} {span_phrase} for {dur} as a {graph} chart?"
    # One value per column of file_header, in the same order.
    writer.writerow([
        row_id,
        URL,
        "User Query:" + question + "|" + description,
        question,
        dur,
        realm,
        metric[1],
        dimension[1],
        data_set_type,
        aggregation_unit,
        description,
    ])
    count += 1

# Loop through every combination of metric, data set type, chart type, duration
# and dimension for each realm and write the resulting rows to that realm's TSV.
# No warehouse calls are made here (everything comes from the cached metadata),
# so the loop does not need to run inside `with dw:`.

# These values are hard coded -> may cause error if new types are later introduced
data_set_types = ["timeseries", "aggregate"]
graph_types = ['line', 'bar', 'area', 'areaspline', 'scatter', 'spline', 'pie']
# Valid (data set type, chart type) pairs, decided once up front: pie charts
# are not generated for timeseries data.
chart_variants = [
    (variant_type, variant_graph)
    for variant_type in data_set_types
    for variant_graph in graph_types
    if not (variant_graph == 'pie' and variant_type == 'timeseries')
]

for realm, filename in output_files.items():
    print(realm)
    print(filename)
    # Start from an empty file; write_to_file() only ever appends to it.
    open(filename, mode='w', newline='').close()
    writer.writerow(file_header)
    # product() walks the combinations in the same order as nested loops would.
    # The loop variables `graph` and `dimention` keep these exact names because
    # add_query may read them as module-level globals.
    for metric, (data_set_type, graph), dur, dimention in itertools.product(
        metrics[realm],      # [code_name, human name, description]
        chart_variants,
        durations,
        dimensions[realm],   # [code_name, human name, description]
    ):
        add_query(dur, realm, metric, dimention, data_set_type, count)
        # Flush to disk every 500 rows so the in-memory buffer stays small.
        if not count % 500:
            write_to_file(filename)

    # Flush whatever is left for this realm.
    write_to_file(filename)


Accounts
Accounts_data.tsv
log
log
log
Allocations
Allocations_data.tsv
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
Cloud
Cloud_data.tsv
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
Gateways
Gateways_data.tsv
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log


log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
log
