## IMPORTS

In [None]:
import requests
import time
import pandas as pd
from pyspark.sql.functions import *
from pyspark.sql.types import *

## URIs + HEADERS for requests

In [None]:
# Resource identifier handed to the token provider in get_token().
pbi_resource = "https://analysis.windows.net/powerbi/api"
# Base URL that the request helpers prepend to their endpoint paths.
# It ends with a trailing slash, so appended paths should not start with "/".
pbi_Uri = 'https://api.powerbi.com/v1.0/myorg/'

## Unified functions

In [None]:
def get_token():
    """Return an access token for the resource named by ``pbi_resource``.

    The token provider (``mssparkutils.credentials.getToken``) is called on
    every invocation; nothing is cached here.
    """
    access_token = mssparkutils.credentials.getToken(pbi_resource)
    return access_token

In [None]:
def get_powerbiAPIclusterURI():
    """Build the cluster-specific base URL for the ``groups`` endpoints.

    Requests the ``datasets`` endpoint, reads the ``@odata.context`` URL from
    the JSON response and reuses its host.

    Returns:
        str: ``https://<host>/beta/myorg/groups`` where ``<host>`` is the host
        part of the ``@odata.context`` URL.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
        requests.RequestException: on connection failure or timeout.
        KeyError: if the response has no ``@odata.context`` entry.
    """
    fullurl = pbi_Uri+'datasets'
    pbi_access_token = get_token()
    headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {pbi_access_token}'}
    # Same timeout as the other API helpers so a stalled connection cannot hang the notebook.
    response = requests.get(fullurl, headers=headers, timeout=60)
    # Surface HTTP errors (e.g. a rejected token) directly instead of as a KeyError below.
    response.raise_for_status()
    odata_context = response.json()['@odata.context']
    # '<scheme>://<host>/...'.split('/') -> ['<scheme>:', '', '<host>', ...]
    cluster_host = odata_context.split('/')[2]
    return f'https://{cluster_host}/beta/myorg/groups'

# Resolved once when this cell runs; get_WorkspaceUsageMetricsId builds its request URLs from it.
clusteredURI = get_powerbiAPIclusterURI()

### Function definitions

In [None]:
def get_AccessibleWorkspaces():
    """Return the entries of the ``groups`` endpoint filtered to type 'Workspace'.

    Returns:
        list: the ``value`` array of the JSON response (one dict per workspace).

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
        requests.RequestException: on connection failure or timeout.
        KeyError: if the response has no ``value`` entry.
    """
    # pbi_Uri already ends with '/'; strip it so the URL does not contain 'myorg//groups'.
    base_url = pbi_Uri.rstrip('/')
    fullUrl = f"{base_url}/groups?$filter=type eq 'Workspace'"
    pbi_access_token = get_token()
    headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {pbi_access_token}'}
    # Same timeout as the other API helpers so a stalled connection cannot hang the notebook.
    response = requests.get(fullUrl, headers=headers, timeout=60)
    # Fail with the HTTP error itself rather than a KeyError on 'value'.
    response.raise_for_status()
    return response.json()['value']

def get_WorkspaceUsageMetricsId(wsId):
    """Look up the ``dbName`` of a workspace's usage metrics model.

    Calls ``<clusteredURI>/<wsId>/usageMetricsReportV2?experience=power-bi``
    and reads ``dbName`` from the first entry of ``models`` in the response.
    The request is tried up to four times, waiting 30 seconds between attempts.

    Args:
        wsId: workspace id inserted into the request URL.

    Returns:
        The ``dbName`` value, or ``None`` when every attempt failed (request
        error, non-JSON body, or a body without the expected structure).
    """
    fullurl = f'{clusteredURI}/{wsId}/usageMetricsReportV2?experience=power-bi'
    pbi_access_token = get_token()
    headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {pbi_access_token}'}

    max_attempts = 4
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(fullurl, headers=headers, timeout=60)
            return response.json()['models'][0]['dbName']
        # Only request/parsing failures are retried; a bare except would also
        # swallow KeyboardInterrupt and make the cell impossible to cancel.
        except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
            # No point in waiting once the last attempt has failed.
            if attempt < max_attempts:
                time.sleep(30)
    # Best-effort lookup: callers receive None for workspaces that could not be resolved.
    return None

def post_ExecuteQuery(wsId, dsId, daxQ):
    """Run a DAX query through the ``executeQueries`` endpoint of a dataset.

    The request is tried up to four times, waiting 30 seconds between attempts.

    Args:
        wsId: workspace (group) id used in the URL.
        dsId: dataset id used in the URL.
        daxQ: DAX query text sent as the single query of the request.

    Returns:
        The first table of the first result in the JSON response, or ``None``
        when every attempt failed (request error, non-JSON body, or a body
        without the expected structure).
    """
    # pbi_Uri already ends with '/'; strip it so the URL does not contain 'myorg//groups'.
    base_url = pbi_Uri.rstrip('/')
    fullurl = f'{base_url}/groups/{wsId}/datasets/{dsId}/executeQueries'
    pbi_access_token = get_token()
    headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {pbi_access_token}'}
    content = {"queries": [{"query": daxQ}],"serializerSettings": {"includeNulls": True}}

    max_attempts = 4
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.post(fullurl, json=content, headers=headers, timeout=60)
            return response.json()['results'][0]['tables'][0]
        # Only request/parsing failures are retried; a bare except would also
        # swallow KeyboardInterrupt and make the cell impossible to cancel.
        except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
            # No point in waiting once the last attempt has failed.
            if attempt < max_attempts:
                time.sleep(30)
    # Best-effort query: callers receive None when no attempt produced a table.
    return None

def replace_ColumnNames(frame, replacingName):
    """Return ``frame`` with cleaned-up column names.

    Every occurrence of ``replacingName`` and every ``[`` / ``]`` character is
    removed from each column name; the renamed frame is produced with
    ``frame.toDF``.
    """
    def _clean(column_name):
        without_prefix = column_name.replace(replacingName, '')
        return without_prefix.replace('[', '').replace(']', '')

    cleaned_names = [_clean(column_name) for column_name in frame.columns]
    return frame.toDF(*cleaned_names)

def extract_DataFrame(response_list):
    """Flatten the ``rows`` of every response into a single Spark DataFrame.

    ``response_list`` is first loaded into Spark, its ``rows`` column is
    exploded to one record per element, the records are collected back to the
    driver, and a new DataFrame is created from them.
    """
    tables_df = spark.createDataFrame(response_list)
    exploded_rows = tables_df.select(explode(tables_df.rows)).select("col")
    # Each exploded Row holds a single field; flatMap unwraps it to the bare value.
    row_records = exploded_rows.rdd.flatMap(lambda record: record).collect()
    return spark.createDataFrame(row_records)

### Tables

In [None]:
# Catalog tables as a Spark DataFrame so later cells can filter on the 'name' column.
tblsInLK = spark.catalog.listTables()
if tblsInLK:
    tblsInLK = spark.createDataFrame(pd.DataFrame(tblsInLK))
else:
    # An empty catalog gives pandas nothing to derive columns from, so build an
    # empty frame that still exposes the 'name' column.
    tblsInLK = spark.createDataFrame([], "name string")

## Workspace and Dataset ID retrieval

In [None]:
# Workspace entries returned by the API; the cells below read each entry's 'id'.
wsList = get_AccessibleWorkspaces()

In [None]:
# Number of catalog tables named 'UsageMetricsDatasets'; 0 means the mapping table has not been created yet.
# NOTE(review): the comparison is case-sensitive - confirm the catalog reports the name with this exact casing.
doesTableUMDExists = tblsInLK.filter(col('name') == 'UsageMetricsDatasets').count()

In [None]:
# Column layout of the UsageMetricsDatasets records. Declared explicitly (in the
# alphabetical order Spark uses when inferring from dicts) so the DataFrame can
# still be built when the record list is empty or every DatasetId is None.
usgDatasetSchema = "DatasetId string, WorkspaceId string"


def _lookupUsageDatasetIds(workspaceIds):
    """Return one {'WorkspaceId', 'DatasetId'} record per workspace id.

    DatasetId is whatever get_WorkspaceUsageMetricsId returns for the
    workspace, including None when the lookup failed.
    """
    return [
        {'WorkspaceId': wsId, 'DatasetId': get_WorkspaceUsageMetricsId(wsId)}
        for wsId in workspaceIds
    ]


response_list = []
if doesTableUMDExists == 0:
    # No mapping table yet: resolve every accessible workspace and create the table.
    response_list = _lookupUsageDatasetIds(ws['id'] for ws in wsList)
    usgDatasetIds = spark.createDataFrame(response_list, schema=usgDatasetSchema)
    usgDatasetIds.write.mode("overwrite").format("delta").save("Tables/UsageMetricsDatasets")
else:
    # Stored mappings that already carry a dataset id.
    sqlPrepare = spark.sql("SELECT * FROM usageMetrics.UsageMetricsDatasets WHERE DatasetId IS NOT NULL")
    existingWsIds = sqlPrepare.select(col('WorkspaceId'))
    # Only the id is needed, so build a one-column frame instead of inferring a
    # schema from the full workspace dicts.
    receivedListOfWs = spark.createDataFrame([(ws['id'],) for ws in wsList], schema="WorkspaceId string")
    # Workspaces that are both stored and still accessible.
    listOfUsableWs = existingWsIds.intersectAll(receivedListOfWs)
    # Accessible workspaces without a stored dataset id.
    # NOTE(review): a workspace whose stored row has a NULL DatasetId lands here on
    # every run and gets another row appended below - consider de-duplicating the table.
    listOfNewWs = receivedListOfWs.exceptAll(existingWsIds)
    # Filter with a column expression instead of concatenating ids into a SQL string.
    usableWsIds = [row['WorkspaceId'] for row in listOfUsableWs.collect()]
    existedRecords = sqlPrepare.filter(col('WorkspaceId').isin(usableWsIds))
    collectionOfNewWs = listOfNewWs.collect()
    # The rows are already on the driver; no second Spark job is needed to test for emptiness.
    if collectionOfNewWs:
        response_list = _lookupUsageDatasetIds(row['WorkspaceId'] for row in collectionOfNewWs)
        # Build the DataFrame once, after all lookups, rather than on every iteration.
        newRecords = spark.createDataFrame(response_list, schema=usgDatasetSchema)
        newRecords.write.mode("append").format("delta").save("Tables/UsageMetricsDatasets")
        # Match columns by name so the result does not depend on the stored column order.
        usgDatasetIds = existedRecords.unionByName(newRecords)
    else:
        usgDatasetIds = existedRecords

### Data Extraction

#### Preparation

In [None]:
# Rebinds wsList: from here on it holds the collected rows of usgDatasetIds, not the API workspace entries.
wsList = usgDatasetIds.collect()

#### Basic Tables

In [None]:
# Basic tables to extract. Names containing spaces are wrapped in single quotes
# because each entry is inserted verbatim into the DAX query 'EVALUATE <table>'.
basicListOfTables = ['Reports', 'Users', "'Report pages'", "'Workspace views'", "'Report views'", "'Report page views'", "'Report load times'"]
for bsTbl in basicListOfTables:
    response_list = []
    for ws in wsList:
        # Read fields by name so the loop does not depend on the column order of the rows.
        dsId = ws['DatasetId']
        wsId = ws['WorkspaceId']
        if dsId is None:
            # No dataset was resolved for this workspace; a query could only fail after all retries.
            continue
        response = post_ExecuteQuery(wsId, dsId, f'EVALUATE {bsTbl}')
        # post_ExecuteQuery returns None when every attempt failed; tables without rows
        # contribute nothing after the explode in extract_DataFrame, so both are skipped.
        if response is not None and response.get('rows'):
            response_list.append(response)
        time.sleep(0.5)
    if not response_list:
        # Nothing was retrieved for this table; keep whatever is already stored in the lake.
        continue
    reports = extract_DataFrame(response_list)
    # NOTE(review): bsTbl still carries its quotes here - confirm the returned column
    # names contain the quoted table name, otherwise the prefix is not stripped.
    reports = replace_ColumnNames(reports,bsTbl)
    updatedText = bsTbl.replace("'","")
    reports.write.mode("overwrite").format("delta").save(f"Tables/{updatedText}")