# Expand groups to users

## Config and Imports

In [11]:
# Set the Spark session option "spark.sql.parquet.vorder.enabled" to "true".
# NOTE(review): presumably this turns on V-Order for Parquet writes in Fabric - confirm.
spark.conf.set("spark.sql.parquet.vorder.enabled", "true")

# Imports used throughout this notebook
from notebookutils import mssparkutils

from pyspark.sql.types import StringType

from pyspark.sql.types import StructType
from pyspark.sql.types import StructField

from pyspark.sql.functions import when

import msal
import requests

StatementMeta(, 9c6bab1e-fc46-4536-ab40-6d3b9449435b, 13, Finished, Available, Finished)

## Read secret from Azure Key Vault

In [2]:
# The Azure Key Vault that holds the client secret (passed to getSecret below).
# Empty in this notebook; it has to be filled in before running.
key_vault = ""

# The tenant id (placeholder value; replace with the real tenant id)
tenant_id = "<here goes your tenant id>"

# The Application Id (Client Id) of the service principal account (placeholder value)
client_id = "<here goes the client id of the SPN>"

# Fetch the client secret of the service principal account with permissions on the Graph API.
# The second argument is the secret's name in the key vault ("..." is a placeholder).
client_secret = mssparkutils.credentials.getSecret(key_vault , "...")

StatementMeta(, 9c6bab1e-fc46-4536-ab40-6d3b9449435b, 4, Finished, Available, Finished)

## Create MSAL client application

In [3]:
# Sign-in authority of the tenant
authority_url = f"https://login.microsoftonline.com/{tenant_id}"

# Confidential MSAL client that authenticates as the service principal
app = msal.ConfidentialClientApplication(
    client_id=client_id,
    client_credential=client_secret,
    authority=authority_url,
)

StatementMeta(, 9c6bab1e-fc46-4536-ab40-6d3b9449435b, 5, Finished, Available, Finished)

## Get the groups from the 'workspaces_scanned_users' table

In [4]:
# Distinct Graph ids of all principals of type 'Group'
df = spark.sql("SELECT GraphId FROM FUAM_Lakehouse.workspaces_scanned_users WHERE PrincipalType = 'Group'").distinct()

# Bring the single GraphId column to the driver as a plain Python list
group_ids_list = [row[0] for row in df.collect()]

StatementMeta(, 9c6bab1e-fc46-4536-ab40-6d3b9449435b, 6, Finished, Available, Finished)

In [13]:
# Base URL of the Microsoft Graph v1.0 REST API; request URLs are built from it
graph_api_url = "https://graph.microsoft.com/v1.0"

# Function to request a token for the Graph API
def get_access_token():
    """Request a Microsoft Graph token through the MSAL client ``app``.

    Returns:
        The complete result of ``acquire_token_for_client`` (not only the
        token string); the caller looks for an "access_token" entry in it.
    """
    graph_scopes = ["https://graph.microsoft.com/.default"]
    return app.acquire_token_for_client(scopes=graph_scopes)

# Recursive function to get members of a specific group
def get_group_members_recursion(group_id_start, group_id_path, group_id, access_token):
    """Collect the users of a group, descending into nested groups.

    Every user found is appended to the module-level list
    ``users_from_group`` as a dict with the keys ``graphId_Start``,
    ``graphId_Path``, ``graphId_Parent`` and ``graphId_UserPrincipalName``.

    Args:
        group_id_start: Id of the top-level group the expansion started from.
        group_id_path: "|"-separated ids from the start group down to
            ``group_id`` (inclusive).
        group_id: Id of the group whose direct members are read in this call.
        access_token: Bearer token sent to the Graph API.

    Returns:
        None. Results are delivered through ``users_from_group``.
    """
    headers = {"Authorization": f"Bearer {access_token}"}

    # Ids already on the path to this group; a nested group that shows up
    # here again would otherwise recurse until RecursionError.
    ancestor_ids = set(group_id_path.split("|")) if group_id_path else set()

    members_url = f"{graph_api_url}/groups/{group_id}/members"

    # The members endpoint is paged: keep following "@odata.nextLink" until
    # the last page, otherwise members beyond the first page are lost.
    while members_url:
        response = requests.get(members_url, headers=headers, timeout=30)

        # Stay best-effort on HTTP errors, but say so instead of silently
        # treating the group as empty.
        if not response.ok:
            print(f"Could not read members of group {group_id}: HTTP {response.status_code}")
            return

        payload = response.json()

        for member in payload.get("value", []):

            # check if the member is a user or a group
            member_type = member.get("@odata.type")

            # if the member is of type user, a dictionary entry is created
            # and added to the outer list object
            if member_type == "#microsoft.graph.user":
                user_info = {
                    "graphId_Start": group_id_start,
                    "graphId_Path": group_id_path,
                    "graphId_Parent": group_id,
                    "graphId_UserPrincipalName": member.get('userPrincipalName')
                }
                users_from_group.append(user_info)
            elif member_type == "#microsoft.graph.group":
                member_id = member.get('id')
                if member_id in ancestor_ids:
                    print(f"Skipping cyclic group membership: {member_id} (path: {group_id_path})")
                    continue
                # Build the child's path in a separate value so siblings
                # handled later still see this group's own path.
                child_path = f"{group_id_path}|{member_id}"
                get_group_members_recursion(group_id_start, child_path, member_id, access_token)
            else:
                print(f"Other Type: {member_type} (ID: {member.get('id')})")

        members_url = payload.get("@odata.nextLink")

# Main - iterate over the initial list of groups
token_response = get_access_token()

# Always start from a fresh list, so a failed run can neither leave the name
# undefined nor let a later cell pick up rows from a previous run.
users_from_group = []

# Without a token no group can be expanded; fail here with the reason
# reported by the token endpoint instead of silently doing nothing.
if "access_token" not in token_response:
    raise RuntimeError(
        "Could not acquire a Microsoft Graph access token: "
        f"{token_response.get('error')} - {token_response.get('error_description')}"
    )

access_token = token_response["access_token"]

for group_id in group_ids_list:
    # Each group is its own start, its initial path and the first group read.
    get_group_members_recursion(group_id, group_id, group_id, access_token)

StatementMeta(, 9c6bab1e-fc46-4536-ab40-6d3b9449435b, 15, Finished, Available, Finished)

### Joining 'workspaces_scanned_users' and 'df_expandedGroups'

In [None]:

# All columns of the expanded-group rows are nullable strings
expanded_group_columns = [
    "graphId_Parent",
    "graphId_Path",
    "graphId_Start",
    "graphId_UserPrincipalName",
]
schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in expanded_group_columns]
)

# Turn the collected list of user dicts into a DataFrame
df_expandedGroups = spark.createDataFrame(users_from_group, schema)

# Full content of the scanned-users table
df_workspaces_scanned_users = spark.sql("SELECT * FROM FUAM_Lakehouse.workspaces_scanned_users")

# Keep every scanned principal; group rows are multiplied by their expanded users
join_condition = df_workspaces_scanned_users["GraphId"] == df_expandedGroups["graphId_Start"]
df_joined_raw = df_workspaces_scanned_users.join(
    df_expandedGroups,
    on=join_condition,
    how="leftouter",
)

# Users keep their own identifier; other rows take the expanded user principal name
expanded_upn = when(
    df_joined_raw["PrincipalType"] == 'User', df_joined_raw["Identifier"]
).otherwise(df_joined_raw["graphId_UserPrincipalName"])
df_join = df_joined_raw.withColumn("UserPrincipalName_expanded", expanded_upn)

### Writing the dataframe 'df_join' as delta table 'workspaces_scanned_users_expandedGroups'

In [7]:
# Persist the joined result as a Delta table, replacing the previous contents
(
    df_join.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable("workspaces_scanned_users_expandedGroups")
)

StatementMeta(, 9c6bab1e-fc46-4536-ab40-6d3b9449435b, 9, Finished, Available, Finished)