2024-03-04

This does no additional finetuning but only topic modeling.

In [None]:
# imports
import umap.umap_ as UMAP # also had to change UMAP(n_... to UMAP.UMAP(n_... everywhere in BERTopic files
import pandas as pd
import transformers
import accelerate
import ast
import os
import numpy as np
from transformers import AutoTokenizer, AutoModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments, AutoModelForMaskedLM, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, models
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
import joblib  # save the label encoder
from openai import OpenAI
# from umap import umap_ as UMAP
import time
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from bertopic import BERTopic
from hdbscan import HDBSCAN
from tenacity import (  # for retries to openai
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

In [2]:
# display tweaks
pd.set_option("display.max_colwidth", 200)  # how much text is showing within a cell
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)
# warnings.filterwarnings("ignore")

In [3]:
# read api key
def read_key_from_file(filename=r"key.txt"):  # replace with your key
    with open(filename, 'r') as file:
        return file.read().strip()

In [4]:
# import data
df = pd.read_excel(r'data\df_2024-02-28.xlsx')

# get rid of needless columns
l = ['Unnamed: 0.1', 'extended_fipps_label', 'Unnamed: 1', 'Unnamed: 0','BERTembeddings', 'BERTlargeembeddings', 'ada2_embeddings', 'all-mpnet-base-v2_embeddings', 'ada2_embeddings_new', 'topic', 'probs']

for x in l:
    del df[x]

# Topic modeling

In [None]:
# Generate Embeddings Using the Fine-tuned Model
# Load fine-tuned sentence-transformers model
model_path = "outputs/sentence_transformers_compatible_model"
finetuned_model = SentenceTransformer(model_path)

# Generate embeddings for each document
embeddings = finetuned_model.encode(docs, show_progress_bar=True)

# Add the embeddings to DataFrame
df['finetune_embeddings'] = embeddings.tolist()

In [None]:
# Add the embeddings to DataFrame
df['finetune_embeddings'] = embeddings.tolist()

In [5]:
# getting saved model and skipping rows above
df = pd.read_excel(f'data/finalized/df_2024-03-04.xlsx', index_col=0)
df['finetune_embeddings'] = df['finetune_embeddings'].apply(ast.literal_eval)

print(df.shape)

(1125, 9)


In [None]:
# check data
df.tail(1)

# Topic modeling
See [BERTopic documentation](https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html#min_topic_size "More info on minimum topic size and other parameters") for more on min_topic_size and other parameter choices.

In [22]:
# topic modeling function - requires setting minimum topic size ()
def topic_modeling(min_topic_size):
    # Load fine-tuned sentence-transformers model
    # finetuned_model = SentenceTransformer(r"outputs\fine_tuned_model")

    # Load pre-generated embeddings
    pre_generated_embeddings = np.array(list(df['finetune_embeddings'].values))

    # # specifying dimensionality reduction
    umap_model = UMAP.UMAP(n_neighbors=15, n_components=5, metric='cosine', low_memory=False)  # may need to tweak

    # # specifying cluster model
    hdbscan_model = HDBSCAN(min_cluster_size=2, metric='euclidean', prediction_data=True) 

    # Create a representation model, 3 parts (except 2 because openai one doesn't work :(
    keybert_model = KeyBERTInspired(random_state=42)
    mmr_model = MaximalMarginalRelevance(diversity=0.3)
    representation_model = {
        "KeyBERT": keybert_model,
        # "OpenAI": openai_model,
        "MMR": mmr_model
    }

    # Instantiate BERTopic with fine-tuned model's embeddings and the representation model
    topic_model = BERTopic(embedding_model='all-MiniLM-L6-v2',  # TODO
                        verbose=False,
                        n_gram_range=(1, 3),
                        min_topic_size=min_topic_size,
                        calculate_probabilities=True,
                        representation_model=representation_model)

    > Note: setting embedding_model=finetuned_model doesn't remake embeddings. However, we must set the parameter or the code won't work.

    
See [discussion](https://github.com/MaartenGr/BERTopic/issues/1601 "further parameter and hyperparameter discussion").

In [None]:
topics, probs = topic_model.fit_transform(docs, embeddings=pre_generated_embeddings)
    return topics, probs, topic_model

### Run topic modeling

In [23]:
# 

# run topic modeling using pre-generated finetuned embeddings
topics_2, probs_2, topic_model_2 = topic_modeling(2)  # min topic size 2
topics_5, probs_5, topic_model_5 = topic_modeling(5)  # min topic size 5

In [8]:
# saving progress
# embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
# topic_model_2.save("outputs/topic_model_2", serialization="pytorch", save_ctfidf=True, save_embedding_model=embedding_model)
# topic_model_5.save("outputs/topic_model_5", serialization="pytorch", save_ctfidf=True, save_embedding_model=embedding_model)

In [6]:
# # loading topic models
# topic_model_5 = BERTopic.load("outputs/topic_model_5")
# topic_model_2 = BERTopic.load("outputs/topic_model_2")

In [26]:
# it didn't save the representative docs :(  why 316 topic -1 count?
topic_model_5.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,309,-1_the_and_of_to,"[the, and, of, to, information, or, system, for, in, personally]","[cloud personally identifiable, personally identifiable information, identifiable information, cloud service, privacy, personally identifiable, policies, security, identifiable, public cloud]","[the, and, of, to, information, or, system, for, in, personally]",[Policies for information security. Control 5.1.1 and the associated implementation guidance and other information specified in ISO/IEC 27002 apply. The following sector-specific guidance also app...
1,0,132,0_the_criterion_of_cloud,"[the, criterion, of, cloud, the cloud, of the, and, service, the cloud service, cloud service]","[cloud service provider, cloud service, the cloud service, cloud customers, cloud customer, service provider, cloud, policies and instructions, security, customer criterion notes]","[the, criterion, of, cloud, the cloud, of the, and, service, the cloud service, cloud service]",[Product safety and security (pss). Guidelines and recommendations for cloud customers. Basic criterion: The cloud service provider provides cloud customers with guidelines and recommendations for...
2,1,49,1_and_should_management_procedures,"[and, should, management, procedures, chain, supply chain, policies, and procedures, supply, risk]","[policies and procedures, audit and assurance, compliance, audit and, policies and, policies, management transparency and, audit, management transparency, accountability]","[and, should, management, procedures, chain, supply chain, policies, and procedures, supply, risk]","[Audit & assurance. Audit and assurance policy and procedures. Establish, document, approve, communicate, apply, evaluate, and maintain audit and assurance policies and procedures and standards. R..."
3,2,45,2_cloud_cloud service_service_the cloud,"[cloud, cloud service, service, the cloud, the cloud service, service customer, cloud service customer, customer, cloud service provider, service provider]","[cloud service provider, guidance for cloud, information for cloud, cloud service customer, cloud services cloud, cloud service, services cloud service, cloud services, services cloud, provider th...","[cloud, cloud service, service, the cloud, the cloud service, service customer, cloud service customer, customer, cloud service provider, service provider]",[Clock synchronization. Control 12.4.4 and the associated implementation guidance and other information specified in ISO/IEC 27002 apply. The following sector-specific guidance also applies. Imple...
4,3,42,3_data_access_and_should,"[data, access, and, should, should be, management, be, to, production, implement]","[identity access management, identity access, access management, privileged access roles, data security and, privileged access, access roles, data security, security, security and privacy]","[data, access, and, should, should be, management, be, to, production, implement]","[Identity & access management. Identity inventory. Manage, store, and review the information of system identities and the level of access. Organizations should maintain a database of all system id..."
5,4,38,4_system_software_and_protection,"[system, software, and, protection, of, network, to, or, boundary, integrity]","[communications protection, communications protection boundary, security functions, protection boundary protection, boundary protection, security, and communications protection, protection boundar...","[system, software, and, protection, of, network, to, or, boundary, integrity]",[System and communications protection. Security function isolation. Isolate security functions from non-security functions. Security functions are isolated from non-security functions by means of ...
6,5,32,5_personally identifiable information_personally identifiable_identifiable information_personally,"[personally identifiable information, personally identifiable, identifiable information, personally, identifiable, the organization shall, organization shall, shall, information, the organization]","[identifiable information principals, identifiable information control, personally identifiable information, information principals, identifiable information to, identifiable information, identifi...","[personally identifiable information, personally identifiable, identifiable information, personally, identifiable, the organization shall, organization shall, shall, information, the organization]",[Obligations to personally identifiable information principals objective:. To ensure that personally identifiable information (PII) principals are provided with the appropriate information about t...
7,6,32,6_customer_cloud service provider_service provider_provider,"[customer, cloud service provider, service provider, provider, cloud service, cloud, service, the customer, the cloud, shall]","[cloud service provider, cloud service providers, cloud service, cloud services, cloud services agreement, the cloud service, gdpr cloud service, the cloud services, service provider shall, custom...","[customer, cloud service provider, service provider, provider, cloud service, cloud, service, the customer, the cloud, shall]","[Cloud service provider shall establish documented procedures, that enables customer to ac-cess relevant information to comply with its obligations and duties under gdpr.. Cloud service provider s..."
8,7,24,7_alternate_contingency_site_sites,"[alternate, contingency, site, sites, planning, alternate processing, telecommunications, primary, contingency planning, processing]","[contingency planning alternate, contingency planning, contingency planning contingency, contingency plans, planning contingency, contingency plan, alternate processing sites, planning alternate, ...","[alternate, contingency, site, sites, planning, alternate processing, telecommunications, primary, contingency planning, processing]","[Contingency planning. Telecommunications services. Establish alternate telecommunications services, including necessary agreements to permit the resumption of [assignment: organization-defined sy..."
9,8,21,8_and_system_the_security,"[and, system, the, security, to, of, or, privacy, for, requirements]","[privacy requirements, security and privacy, security and, security, and privacy requirements, privacy plans, requirements, vulnerability monitoring, the security, privacy]","[and, system, the, security, to, of, or, privacy, for, requirements]",[Planning. Security and privacy architectures. a. Develop security and privacy architectures for the system that: \n1. Describe the requirements and approach to be taken for protecting the confide...


### Add outcomes to df

In [9]:
# df['topics_2'] = topics_2
# df['probs_2'] = probs_2.tolist()
# df['topics_5'] = topics_5
# df['probs_5'] = probs_5.tolist()

In [15]:
# the outputs are accessible like this, and are what we feed to openai
# topic_model_5.get_topic_info().head(1)

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,316,-1_the_and_of_to,"[the, and, of, to, or, information, for, system, in, is]","[personally identifiable information, identifiable information, cloud service provider, the cloud service, cloud service, security, public cloud, service provider, personally identifiable, identif...","[the, and, of, to, or, information, for, system, in, is]","[Response to information security incidents. The control implementation guidance and other information stated in ISO/IEC 27002:2013, 16.1.s and the following additional guidance applies:\n\nAdditi..."


# OpenAI labels

In [None]:
# THIS IS A BACKUP, DON'T TOUCH THE CODE
# function to get labels from OpenAI - input is a single topic model

    return client.completions.create(**kwargs)

def get_openai_lab(topic_model):

  # must manually add openai representation because the bertopic utility doesn't work
  client = OpenAI(api_key=read_key_from_file())
  # openai_model = OpenAI(client, model="gpt-4", exponential_backoff=True, chat=False)

  # main loop - get the topic names from OpenAI
  rows_to_append = []

  # client = openai.OpenAI(api_key=read_key_from_file())  # NOTE: this is the old and now defunct way of calling OPenAI

  for x, i in enumerate(topic_model.get_topics()):
    # skip -1, outliers
    if x == 0 and i == -1:
          continue
    # store keybert and mmr representation values
    keybertrep = topic_model.get_topic_info()[topic_model.get_topic_info()['Topic']==x]['KeyBERT']
    mmrrep = topic_model.get_topic_info()[topic_model.get_topic_info()['Topic']==x]['MMR']

    completion = client.chat.completions.create(
      model="gpt-4",
      messages = [
  {"role": "system", "content": "You are a helpful assistant, knowledgeable on data privacy and security standards and controls, that helps in topic modeling tasks."},
  # rep docs
  {"role": "user", "content": f"I have a topic that contains the following documents among others: {topic_model.get_representative_docs()[i]}."},
  # keybert representation
  {"role": "user", "content": f"The topic is described by the following keywords: {keybertrep}."},
  # MMR representation
  {"role": "user", "content": f"The topic is also described by the following keywords: {mmrrep}."},
  # ask
  {"role": "user", "content": """Based on the information above, extract a short but highly descriptive topic label of at most 5 words. 
  
      Make sure it is in the following format:
      
      topic: <topic label>."""},
  ]
    )

    # Extract the topic label from the completion response
    topic_label = completion.choices[0].message.content.split("topic: ")[1].strip()

    rows_to_append.append({
      "topic_num": i,
      "representative_docs": topic_model.get_representative_docs()[i],  
      "top_words": topic_model.get_topics()[i], 
      "topic_label": topic_label
      })

    print(f"Appended topic {i}: {topic_label}")

    time.sleep(2)

  return rows_to_append

In [None]:
# function to get labels from OpenAI - input is a single topic model, with backoff

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def completion_with_backoff(client, **kwargs):
    return client.chat.completions.create(**kwargs)

def get_openai_lab(topic_model):

  # must manually add openai representation because the bertopic utility doesn't work
  client = OpenAI(api_key=read_key_from_file())
  # openai_model = OpenAI(client, model="gpt-4", exponential_backoff=True, chat=False)

  # main loop - get the topic names from OpenAI
  rows_to_append = []

  # client = openai.OpenAI(api_key=read_key_from_file())  # NOTE: this is the old and now defunct way of calling OPenAI

  for x, i in enumerate(topic_model.get_topics()):
    # skip -1, outliers
    if x == 0 and i == -1:
          continue
    # store keybert and mmr representation values
    keybertrep = topic_model.get_topic_info()[topic_model.get_topic_info()['Topic']==x]['KeyBERT']
    mmrrep = topic_model.get_topic_info()[topic_model.get_topic_info()['Topic']==x]['MMR']

    completion = completion_with_backoff(client,
      model="gpt-4",
      messages = [
  {"role": "system", "content": "You are a helpful assistant, knowledgeable on data privacy and security standards and controls, that helps in topic modeling tasks."},
  # rep docs
  {"role": "user", "content": f"I have a topic that contains the following documents among others: {topic_model.get_representative_docs()[i]}."},
  # keybert representation
  {"role": "user", "content": f"The topic is described by the following keywords: {keybertrep}."},
  # MMR representation
  {"role": "user", "content": f"The topic is also described by the following keywords: {mmrrep}."},
  # ask
  {"role": "user", "content": """Based on the information above, extract a short but highly descriptive topic label of at most 5 words. 
  
      Make sure it is in the following format:
      
      topic: <topic label>."""},
  ]
    )

    # Extract the topic label from the completion response
    topic_label = completion.choices[0].message.content.split("topic: ")[1].strip()

    rows_to_append.append({
      "topic_num": i,
      "representative_docs": topic_model.get_representative_docs()[i],  
      "top_words": topic_model.get_topics()[i], 
      "topic_label": topic_label
      })

    # print(f"Appended topic {i}: {topic_label}")  # shows results verbose

    time.sleep(1)

  return rows_to_append

In [None]:
# get topic modeling results for cluster min size == 5
results_5 = get_openai_lab(topic_model=topic_model_5)

In [None]:
# Add OpenAI labels to df for 5 topic min

# Initial DataFrame
temp_df = pd.DataFrame({"topic_num": [],
                       "representative_docs": [],
                       "top_words": [],
                       "topic_label": []})
temp_df['topic_label'] = temp_df['topic_label'].astype(int)

# Concatenate collected rows with the original DataFrame
temp_df = pd.concat([temp_df, pd.DataFrame(results_5)], ignore_index=True)
# drop unneeded columns
temp_df = temp_df[['topic_num', 'topic_label']]
# merge   
df = pd.merge(df, temp_df, left_on='topics_5', right_on='topic_num', how='left')

In [None]:
# get topic modeling results for cluster min size == 2
results_2 = get_openai_lab(topic_model=topic_model_2)

In [None]:
# Add OpenAI labels to df for 2 topic min

# Initial DataFrame
temp_df = pd.DataFrame({"topic_num": [],
                       "representative_docs": [],
                       "top_words": [],
                       "topic_label": []})
temp_df['topic_label'] = temp_df['topic_label'].astype(int)

# Concatenate collected rows with the original DataFrame
temp_df = pd.concat([temp_df, pd.DataFrame(results_2)], ignore_index=True)
# drop unneeded columns
temp_df = temp_df[['topic_num', 'topic_label']]
temp_df['topic_label_2'] = temp_df['topic_label']
del temp_df['topic_label']
# merge   
df = pd.merge(df, temp_df, left_on='topics_2', right_on='topic_num', how='left')

In [None]:
# rename column for clarity distinguishing 2 and 5 minimum results
df = df.rename(columns={'topic_label': 'topic_label_5'})
del df['topic_num_y']  # same as topics_2
del df['topic_num_x']  # same as 'topics_5'

In [None]:
# check results
df.tail(3)

In [None]:
# save checkpoint
# df.to_excel(r'data\finalized\df_final.xlsx')

In [16]:
# reopen data from checkpoint
df = pd.read_excel(r'data\finalized\df_final.xlsx')
df['finetune_embeddings'] = df['finetune_embeddings'].apply(ast.literal_eval)
print(df.shape)

(1125, 16)


In [27]:
# how many topics? 
# 0-150 which is correct, and matches modified_tree2.txt
z = df['topics_2'].unique().tolist()
z.sort()
z[-5:]

[146, 147, 148, 149, 150]

In [28]:
# how many topics? 
# 0-39 which is??
z = df['topics_5'].unique().tolist()
z.sort()
z[-5:]

[35, 36, 37, 38, 39]

# Pivot table: 2 document min

In [35]:
df.columns

Index(['Unnamed: 0', 'control_category', 'control_code', 'control_name',
       'control_text', 'page', 'document', 'control_text_corrected',
       'full_control_text', 'finetune_embeddings', 'topics_2', 'probs_2',
       'topics_5', 'probs_5', 'topic_label_5', 'topic_label_2'],
      dtype='object')

In [34]:
df['topic_label_5'].value_counts()

topic_label_5
Cloud Service Provider Control                                 131
Cloud Service Security Standards                               111
Audit, Assurance, and Access Management                         67
Identity & Access Management Control                            46
Obligations in Processing Personal Identifiable Information     46
Cloud Service Provider Compliance                               44
Contingency Planning in Telecommunications                      27
System Security and Privacy Management                          24
Managing Vulnerabilities, Privacy, Resilience.                  23
Personal Data Privacy and Consent                               21
Temporary File and PII Management                               20
Cryptographic Key Management Processes                          20
Audit Record Security and Analysis                              18
Incident Response and Handling Management                       16
Incident Response and Security Functions        

In [18]:
# TODO: why NAN at far right of pivot table?
# Pivot table: two document min - now sorted by columns, displaying topic #, for easy navigation

# First, adapt df for clarity
tempdf = df.copy()
tempdf['topic_label_num'] = tempdf['topics_2'].astype(str) + ': ' + tempdf['topic_label_2'].astype(str)

# Add a helper column for counting if needed
tempdf['count'] = 1

# Create a MultiIndex for columns from topics_2 (for sorting) and topic_label_num (for display)
tempdf.set_index(['document', 'topics_2', 'topic_label_num'], inplace=True)

# Step 2: Create the pivot table with MultiIndex
pivot_table_2 = tempdf.pivot_table(index='document', columns=['topics_2', 'topic_label_num'], values='count', aggfunc='count', fill_value=0)

# Sort the pivot table columns by topics_2 first (for correct numeric order), then by topic_label_num
pivot_table_2.columns = pivot_table_2.columns.droplevel(0)  # Drop the 'topics_2' level, leaving 'topic_label_num'

# Optional: Sort columns alphanumerically if needed, but should already be in correct order from the MultiIndex sorting
# pivot_table_2.sort_index(axis=1, inplace=True)

# Step 3: Optionally add sums
pivot_table_2.loc['Total'] = pivot_table_2.sum()

# cleanup
del tempdf

# Show pivot_table
pivot_table_2

topic_label_num,-1: nan,0: Cloud Service Provider Responsibilities,1: Cloud Services Information Security,2: Privacy and Security Architectures Development,3: Data Privacy and Consent Management,4: Cryptographic Key Management Practices,5: Data Protection and Security Policies,6: Policy Development in Information Security,7: Public Cloud PII Protection,8: Personally Identifiable Information Management,9: Data Privacy and Control Implementation,10: Public Cloud PII Protection,11: Cryptographic Protection Mechanisms,12: Business Continuity and Resilience Management,13: Organizational Policy and Procedures Management,14: Software Configuration and Integrity Management,15: Contingency Planning and Telecommunications,16: Information Security and Privacy Policies,17: Organizational Security and Privacy Policies,18: ISO IEC 27002 PII Sharing.,19: Security and Personal Data Protection,20: System and Data Security Management,21: System Configuration and Vulnerability Management.,22: Threat & Vulnerability Management Practices.,23: Network Boundary and Remote Access Control,24: Managing Personally Identifiable Information Access,25: Identity & Privileged Access Management,26: Personnel Security and Audit Accountability,27: System Security and Function Isolation,28: Public Cloud Data Privacy and Security,29: Identification and Multi-factor Authentication,30: Emergency Power and Protection Measures,31: Incident Response and Management Training,32: Access Control and Account Management,33: Identification and Authentication Security,34: Wireless Access and Physical Control,35: Wireless Access Control Measures,36: Audit Record and Logging Management,37: Audit Record Analysis and Accountability,38: Personal Identifiable Information Control,39: Vulnerability Management and Mitigation,40: Contingency Planning and Recovery,41: Government Agency Investigation Requests Handling,42: Personnel Security and System Maintenance,43: System Integrity and Contingency Planning.,44: Privileged Access Control Management,45: Internal Control and Risk Management,46: Risk and Vulnerability Management.,47: Supply Chain Risk Management,48: Media Protection and Control,49: Personnel Security and Fire Protection,50: Data Security and Privacy Management,51: Protecting Personally Identifiable Information,52: Data Privacy and Security Controls,53: Human Resources Information Security,54: PII Principals Processing Obligations.,55: Cloud Service User Access Management,56: Public Cloud Information Security,57: Audit Records and Access Control,58: Secure Datacenter Policies and Procedures,59: Datacenter Security and Management Procedures,60: Data Security and Management Procedures,61: Personally Identifiable Information Controls,62: Access Control and Authorization Processes,63: Disaster Response and Continuity Planning.,64: Data Privacy and Authentication Standards,65: Cloud User ID Management,66: Personally Identifiable Information Control,67: Personally Identifiable Information Management,68: Personally Identifiable Information Management,69: System Maintenance and Mobile Code Security,70: Secure Erasure of Temporary Files,71: Cloud Service Security Auditing,72: Cloud Service Security Protocols,73: Security Controls and Access Authorizations,74: PII Transmission and Control,75: Cloud Service PII Protection,76: Cloud Service Provider Responsibilities.,77: PII Disclosure Records Management,78: System Security and Protocol Management,79: System Information Integrity and Protection,80: Personal Identifiable Information Protection Practices,81: Data Privacy and Equipment Security,82: Environmental Threat and Data Security Management,83: Identity Proofing and Authentication,84: Contingency Planning and System Security.,85: System Development and Security Management.,86: Data Security and System Protection,"87: Interoperability, Portability and Data Security",88: Social Media and System Access Control,89: Cloud Service Security Management,90: Cloud Service Security Management,91: Cloud Security Risk Management.,92: Human Resources Information Security,93: System Clock Synchronization and Security,94: Privileged User Access Control,95: Processing Integrity and Security Assessments,96: Continuous Security Assessment and Authorization,97: Information Security and Risk Management.,98: Security Assessment and Management.,99: Configuration Management and Automation,100: Public Cloud Information Security Compliance,101: Secure Data Disposal and Backup.,102: PII Backup and Restoration Compliance,103: Audit Management and Accountability,104: Supply Chain Management Standards,105: Data Privacy and Security Controls,106: Contingency and Incident Response Planning,107: Cloud Service and Security Incidents,108: Cloud Customer Obligations Control,109: Customer Compliance and Data Obligations.,110: Personal Identifiable Information Security,111: Cloud Security and Monitoring Procedures,112: Automated Security Configuration Management,113: Security Systems Operations and Procedures,114: Compliance and Information Security Management,115: Cloud Operations Logging & Monitoring,116: Configuration & Data Security Management,117: Configuration Management and Maintenance,118: Encryption and Access Control,119: Cloud Service Provider Responsibilities,120: Automated Access and System Monitoring.,121: Physical and Environmental Protection,122: PIV Credentials Verification and Acceptance,123: Access Control and Device Security.,124: Insider Threat and Security Measures.,125: Network Security and Authentication Controls,126: Authenticator and Password Management,127: Remote Access and Privileged Commands,128: Unique User ID Allocation,129: Automated Incident Reporting & Handling,130: GDPR Compliance in Data Transfers,131: Cloud Service Confidentiality Management.,132: Cloud Service Operational Security.,133: System Security and Functionality Management,134: Access Control and User Authentication,135: Privacy Regulations and Data Portability,"136: Interoperability, Data Security, Privacy Management.",137: Security Incident and Penetration Testing.,138: Cloud Service Access Management,139: Cloud Service Security Management.,140: Segregated Environment & Access Management.,141: Information Systems Development and Auditing,142: Cloud Service Security and Development,143: Cloud Service Security Standards.,144: ISO Compliance in Information Security,145: Cloud Service Event Logging,146: Change Control and Configuration Management,147: Data Security and Privacy Management,148: System Integrity and Information Monitoring,149: Cloud Service Provider Management,150: nan
document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1
c5,53,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,3,0,0,0,0,0,0,3,0,0,2,3,0,0,3,0,0,0,0,0,0,0,1,0,0,3,0,3,0,3,0,0,0,3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0,0
ccm,46,5,0,20,0,0,17,0,0,1,0,0,0,0,10,0,0,11,0,0,0,0,0,0,0,8,8,0,0,7,0,0,5,0,1,0,3,0,0,0,5,0,0,0,0,0,0,0,0,5,2,1,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3,4,0,0,0,0,0,0,1,0,0,1,4,0,0,0,4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,3,1,3,0,0,0,0,3,2,0,0,0,0,0,0,2,0,0,2,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0
eu_coc,17,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,5,1,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,3,0,0,3,3,0,3,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
fedramp,84,18,23,0,0,17,0,0,0,13,14,0,0,0,0,0,11,0,10,0,0,9,9,9,9,0,0,8,0,0,7,7,0,2,6,7,4,7,0,0,2,6,6,0,0,0,0,0,0,1,4,5,5,5,0,0,5,5,0,0,5,5,0,2,5,5,5,0,0,0,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,3,4,1,3,0,0,4,0,0,0,3,0,1,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,3,0,3,0,0,0,0,0,3,0,3,0,1,1,2,3,3,3,0,0,0,0,0,2,2,0,2,2,2,2,0,0,0,0,0,0,0,2
iso_27002,6,0,0,0,0,0,0,5,0,0,0,0,0,4,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,1,0,0,1,2,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0
iso_27017,11,0,0,0,18,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0
iso_27018,6,0,0,0,0,0,0,5,0,0,0,0,0,4,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3,1,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
iso_27701,32,0,0,0,0,0,0,6,0,0,0,13,12,3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,7,7,0,0,0,2,2,6,6,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,5,0,2,0,0,0,5,0,2,2,1,0,0,0,0,0,4,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,1,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,1,0,0,0,0,1,2,0
soc2,13,2,0,0,0,0,0,0,0,0,0,0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,6,0,0,0,0,0,5,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Total,268,25,23,20,19,17,17,16,14,14,14,13,12,12,11,11,11,11,10,10,10,9,9,9,9,8,8,8,8,8,7,7,7,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6,6,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2


In [19]:
# normalized pivot table

# Normalize the counts by row to get the percentage and then round to two decimals
pivot_table_percentage_rounded_2 = (pivot_table_2.div(pivot_table_2.sum(axis=1), axis=0) * 100).round(1)

# Display the pivot table with percentages rounded to two decimals
pivot_table_percentage_rounded_2

topic_label_num,-1: nan,0: Cloud Service Provider Responsibilities,1: Cloud Services Information Security,2: Privacy and Security Architectures Development,3: Data Privacy and Consent Management,4: Cryptographic Key Management Practices,5: Data Protection and Security Policies,6: Policy Development in Information Security,7: Public Cloud PII Protection,8: Personally Identifiable Information Management,9: Data Privacy and Control Implementation,10: Public Cloud PII Protection,11: Cryptographic Protection Mechanisms,12: Business Continuity and Resilience Management,13: Organizational Policy and Procedures Management,14: Software Configuration and Integrity Management,15: Contingency Planning and Telecommunications,16: Information Security and Privacy Policies,17: Organizational Security and Privacy Policies,18: ISO IEC 27002 PII Sharing.,19: Security and Personal Data Protection,20: System and Data Security Management,21: System Configuration and Vulnerability Management.,22: Threat & Vulnerability Management Practices.,23: Network Boundary and Remote Access Control,24: Managing Personally Identifiable Information Access,25: Identity & Privileged Access Management,26: Personnel Security and Audit Accountability,27: System Security and Function Isolation,28: Public Cloud Data Privacy and Security,29: Identification and Multi-factor Authentication,30: Emergency Power and Protection Measures,31: Incident Response and Management Training,32: Access Control and Account Management,33: Identification and Authentication Security,34: Wireless Access and Physical Control,35: Wireless Access Control Measures,36: Audit Record and Logging Management,37: Audit Record Analysis and Accountability,38: Personal Identifiable Information Control,39: Vulnerability Management and Mitigation,40: Contingency Planning and Recovery,41: Government Agency Investigation Requests Handling,42: Personnel Security and System Maintenance,43: System Integrity and Contingency Planning.,44: Privileged Access Control Management,45: Internal Control and Risk Management,46: Risk and Vulnerability Management.,47: Supply Chain Risk Management,48: Media Protection and Control,49: Personnel Security and Fire Protection,50: Data Security and Privacy Management,51: Protecting Personally Identifiable Information,52: Data Privacy and Security Controls,53: Human Resources Information Security,54: PII Principals Processing Obligations.,55: Cloud Service User Access Management,56: Public Cloud Information Security,57: Audit Records and Access Control,58: Secure Datacenter Policies and Procedures,59: Datacenter Security and Management Procedures,60: Data Security and Management Procedures,61: Personally Identifiable Information Controls,62: Access Control and Authorization Processes,63: Disaster Response and Continuity Planning.,64: Data Privacy and Authentication Standards,65: Cloud User ID Management,66: Personally Identifiable Information Control,67: Personally Identifiable Information Management,68: Personally Identifiable Information Management,69: System Maintenance and Mobile Code Security,70: Secure Erasure of Temporary Files,71: Cloud Service Security Auditing,72: Cloud Service Security Protocols,73: Security Controls and Access Authorizations,74: PII Transmission and Control,75: Cloud Service PII Protection,76: Cloud Service Provider Responsibilities.,77: PII Disclosure Records Management,78: System Security and Protocol Management,79: System Information Integrity and Protection,80: Personal Identifiable Information Protection Practices,81: Data Privacy and Equipment Security,82: Environmental Threat and Data Security Management,83: Identity Proofing and Authentication,84: Contingency Planning and System Security.,85: System Development and Security Management.,86: Data Security and System Protection,"87: Interoperability, Portability and Data Security",88: Social Media and System Access Control,89: Cloud Service Security Management,90: Cloud Service Security Management,91: Cloud Security Risk Management.,92: Human Resources Information Security,93: System Clock Synchronization and Security,94: Privileged User Access Control,95: Processing Integrity and Security Assessments,96: Continuous Security Assessment and Authorization,97: Information Security and Risk Management.,98: Security Assessment and Management.,99: Configuration Management and Automation,100: Public Cloud Information Security Compliance,101: Secure Data Disposal and Backup.,102: PII Backup and Restoration Compliance,103: Audit Management and Accountability,104: Supply Chain Management Standards,105: Data Privacy and Security Controls,106: Contingency and Incident Response Planning,107: Cloud Service and Security Incidents,108: Cloud Customer Obligations Control,109: Customer Compliance and Data Obligations.,110: Personal Identifiable Information Security,111: Cloud Security and Monitoring Procedures,112: Automated Security Configuration Management,113: Security Systems Operations and Procedures,114: Compliance and Information Security Management,115: Cloud Operations Logging & Monitoring,116: Configuration & Data Security Management,117: Configuration Management and Maintenance,118: Encryption and Access Control,119: Cloud Service Provider Responsibilities,120: Automated Access and System Monitoring.,121: Physical and Environmental Protection,122: PIV Credentials Verification and Acceptance,123: Access Control and Device Security.,124: Insider Threat and Security Measures.,125: Network Security and Authentication Controls,126: Authenticator and Password Management,127: Remote Access and Privileged Commands,128: Unique User ID Allocation,129: Automated Incident Reporting & Handling,130: GDPR Compliance in Data Transfers,131: Cloud Service Confidentiality Management.,132: Cloud Service Operational Security.,133: System Security and Functionality Management,134: Access Control and User Authentication,135: Privacy Regulations and Data Portability,"136: Interoperability, Data Security, Privacy Management.",137: Security Incident and Penetration Testing.,138: Cloud Service Access Management,139: Cloud Service Security Management.,140: Segregated Environment & Access Management.,141: Information Systems Development and Auditing,142: Cloud Service Security and Development,143: Cloud Service Security Standards.,144: ISO Compliance in Information Security,145: Cloud Service Event Logging,146: Change Control and Configuration Management,147: Data Security and Privacy Management,148: System Integrity and Information Monitoring,149: Cloud Service Provider Management,150: nan
document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1
c5,43.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.3,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,1.7,2.5,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.0,2.5,0.0,2.5,0.0,2.5,0.0,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.7,1.7,0.0,0.0,0.0,0.0,0.0
ccm,23.4,2.5,0.0,10.2,0.0,0.0,8.6,0.0,0.0,0.5,0.0,0.0,0.0,0.0,5.1,0.0,0.0,5.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.1,4.1,0.0,0.0,3.6,0.0,0.0,2.5,0.0,0.5,0.0,1.5,0.0,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,1.5,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5,0.5,1.5,0.0,0.0,0.0,0.0,1.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
eu_coc,27.0,0.0,0.0,0.0,1.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.6,0.0,0.0,0.0,0.0,0.0,15.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.9,0.0,0.0,0.0,7.9,1.6,0.0,0.0,7.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.8,0.0,0.0,4.8,4.8,0.0,4.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fedramp,20.5,4.4,5.6,0.0,0.0,4.1,0.0,0.0,0.0,3.2,3.4,0.0,0.0,0.0,0.0,0.0,2.7,0.0,2.4,0.0,0.0,2.2,2.2,2.2,2.2,0.0,0.0,2.0,0.0,0.0,1.7,1.7,0.0,0.5,1.5,1.7,1.0,1.7,0.0,0.0,0.5,1.5,1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.2,1.0,1.2,1.2,1.2,0.0,0.0,1.2,1.2,0.0,0.0,1.2,1.2,0.0,0.5,1.2,1.2,1.2,0.0,0.0,0.0,0.2,0.0,0.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7,1.0,0.2,0.7,0.0,0.0,1.0,0.0,0.0,0.0,0.7,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7,0.0,0.7,0.0,0.0,0.0,0.0,0.0,0.7,0.0,0.7,0.0,0.2,0.2,0.5,0.7,0.7,0.7,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.5,0.5,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
iso_27002,14.6,0.0,0.0,0.0,0.0,0.0,0.0,12.2,0.0,0.0,0.0,0.0,0.0,9.8,0.0,0.0,0.0,0.0,0.0,9.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.4,0.0,0.0,2.4,4.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.9,4.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.4,0.0,0.0,0.0,0.0,0.0,0.0,2.4,0.0,0.0,0.0,0.0,0.0,4.9,0.0,2.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.9,0.0,0.0,2.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.4,0.0,0.0,2.4,0.0,0.0,0.0,0.0
iso_27017,23.4,0.0,0.0,0.0,38.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.3,0.0,0.0,0.0
iso_27018,13.6,0.0,0.0,0.0,0.0,0.0,0.0,11.4,0.0,0.0,0.0,0.0,0.0,9.1,0.0,0.0,0.0,0.0,0.0,11.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.3,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.3,0.0,0.0,0.0,0.0,0.0,6.8,2.3,0.0,0.0,0.0,0.0,0.0,4.5,0.0,2.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.3,0.0,0.0,2.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.3,0.0,2.3,0.0,0.0
iso_27701,22.7,0.0,0.0,0.0,0.0,0.0,0.0,4.3,0.0,0.0,0.0,9.2,8.5,2.1,0.0,0.0,0.0,0.0,0.0,0.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7,0.0,0.0,0.0,0.0,5.0,5.0,0.0,0.0,0.0,1.4,1.4,4.3,4.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,1.4,0.0,0.0,0.0,3.5,0.0,1.4,1.4,0.7,0.0,0.0,0.0,0.0,0.0,2.8,1.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.1,0.0,0.0,0.0,0.0,0.0,0.0,0.7,0.0,0.0,2.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.4,1.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7,0.0,0.0,0.0,0.0,0.7,1.4,0.0
soc2,21.3,3.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.8,9.8,0.0,0.0,0.0,0.0,0.0,8.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Total,23.8,2.2,2.0,1.8,1.7,1.5,1.5,1.4,1.2,1.2,1.2,1.2,1.1,1.1,1.0,1.0,1.0,1.0,0.9,0.9,0.9,0.8,0.8,0.8,0.8,0.7,0.7,0.7,0.7,0.7,0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.6,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2


# Pivot table: 5 document min

In [20]:
# Pivot table: two document min

# Pivot table: two document min - now sorted by columns, displaying topic #, for easy navigation

# First, adapt df for clarity
tempdf = df.copy()
tempdf['topic_label_num'] = tempdf['topics_5'].astype(str) + ': ' + tempdf['topic_label_5'].astype(str)

# Add a helper column for counting if needed
tempdf['count'] = 1

# Create a MultiIndex for columns from topics_2 (for sorting) and topic_label_num (for display)
tempdf.set_index(['document', 'topics_5', 'topic_label_num'], inplace=True)

# Step 2: Create the pivot table with MultiIndex
pivot_table_5 = tempdf.pivot_table(index='document', columns=['topics_5', 'topic_label_num'], values='count', aggfunc='count', fill_value=0)

# Sort the pivot table columns by topics_2 first (for correct numeric order), then by topic_label_num
pivot_table_5.columns = pivot_table_5.columns.droplevel(0)  # Drop the 'topics_2' level, leaving 'topic_label_num'

# Optional: Sort columns alphanumerically if needed, but should already be in correct order from the MultiIndex sorting
# pivot_table_5.sort_index(axis=1, inplace=True)

# Step 3: Optionally add sums
pivot_table_5.loc['Total'] = pivot_table_5.sum()

# cleanup
del tempdf

# Show pivot_table
pivot_table_5

topic_label_num,-1: nan,0: Cloud Service Provider Control,1: Cloud Service Security Standards,"2: Audit, Assurance, and Access Management",3: Identity & Access Management Control,4: Obligations in Processing Personal Identifiable Information,5: Cloud Service Provider Compliance,6: Contingency Planning in Telecommunications,7: System Security and Privacy Management,"8: Managing Vulnerabilities, Privacy, Resilience.",9: Personal Data Privacy and Consent,10: Temporary File and PII Management,11: Cryptographic Key Management Processes,12: Audit Record Security and Analysis,13: Incident Response and Handling Management,14: Incident Response and Security Functions,15: System Security and Protection Measures,16: Personally Identifiable Information Controls,17: Organizational Information Security Alerts,18: Information Security and Data Protection,19: Cloud PII Protection Policies,20: Information Protection and Security Procedures,21: Cloud PII Processing & Breach Notification,22: Physical and Environmental Protection,23: Organizational User Identification and Authentication,24: Access Control and Physical Monitoring,25: System Maintenance and User Authorization,26: ISO/IEC 27002 PII Protection,27: Cryptographic Mechanisms for Information Protection,28: Internal Control and Privacy Policies,29: Organizational Policy and Procedure Management,30: Media Protection and Control,31: Secure Network and System Configuration.,32: Automated Tools in Data Security,33: Personal Identity Verification Methods,34: Cloud PII Security & Incident Management,35: Personnel Security and Data Retention,36: Cloud Service Data Protection,37: Cloud Service Provider Responsibilities,38: Managing Personally Identifiable Information.,39: Access Control and Privileged Accounts
document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
c5,2,119,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
ccm,48,3,0,3,1,43,40,0,2,5,19,0,20,0,3,0,0,0,0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,3,2,0
eu_coc,1,0,1,59,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
fedramp,159,5,0,0,0,2,1,27,0,16,2,0,0,18,13,16,0,14,0,9,10,11,11,10,7,9,0,9,1,9,0,0,7,7,7,6,7,6,3,3,5
iso_27002,12,0,16,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,1,0,0,0,2,0,2,3,0,0,0,0,0,0,0,0,0
iso_27017,2,0,0,0,44,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
iso_27018,13,0,15,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,1,0,0,0,2,0,2,3,0,0,0,0,0,0,0,0,0
iso_27701,22,1,79,2,0,0,1,0,0,0,0,0,0,0,0,0,14,0,3,0,0,0,0,0,0,0,9,0,4,0,4,2,0,0,0,0,0,0,0,0,0
soc2,13,3,0,0,0,0,0,0,22,2,0,20,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Total,272,131,111,67,46,46,44,27,24,23,21,20,20,18,16,16,14,14,12,12,11,11,11,10,10,9,9,9,9,9,8,8,7,7,7,7,7,6,6,5,5


In [21]:
# normalized pivot table

# Normalize the counts by row to get the percentage and then round to two decimals
pivot_table_percentage_rounded_5 = (pivot_table_5.div(pivot_table_5.sum(axis=1), axis=0) * 100).round(1)

# Display the pivot table with percentages rounded to two decimals
pivot_table_percentage_rounded_5

topic_label_num,-1: nan,0: Cloud Service Provider Control,1: Cloud Service Security Standards,"2: Audit, Assurance, and Access Management",3: Identity & Access Management Control,4: Obligations in Processing Personal Identifiable Information,5: Cloud Service Provider Compliance,6: Contingency Planning in Telecommunications,7: System Security and Privacy Management,"8: Managing Vulnerabilities, Privacy, Resilience.",9: Personal Data Privacy and Consent,10: Temporary File and PII Management,11: Cryptographic Key Management Processes,12: Audit Record Security and Analysis,13: Incident Response and Handling Management,14: Incident Response and Security Functions,15: System Security and Protection Measures,16: Personally Identifiable Information Controls,17: Organizational Information Security Alerts,18: Information Security and Data Protection,19: Cloud PII Protection Policies,20: Information Protection and Security Procedures,21: Cloud PII Processing & Breach Notification,22: Physical and Environmental Protection,23: Organizational User Identification and Authentication,24: Access Control and Physical Monitoring,25: System Maintenance and User Authorization,26: ISO/IEC 27002 PII Protection,27: Cryptographic Mechanisms for Information Protection,28: Internal Control and Privacy Policies,29: Organizational Policy and Procedure Management,30: Media Protection and Control,31: Secure Network and System Configuration.,32: Automated Tools in Data Security,33: Personal Identity Verification Methods,34: Cloud PII Security & Incident Management,35: Personnel Security and Data Retention,36: Cloud Service Data Protection,37: Cloud Service Provider Responsibilities,38: Managing Personally Identifiable Information.,39: Access Control and Privileged Accounts
document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
c5,1.7,98.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ccm,24.4,1.5,0.0,1.5,0.5,21.8,20.3,0.0,1.0,2.5,9.6,0.0,10.2,0.0,1.5,0.0,0.0,0.0,0.0,1.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,1.5,1.0,0.0
eu_coc,1.6,0.0,1.6,93.7,1.6,1.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fedramp,38.8,1.2,0.0,0.0,0.0,0.5,0.2,6.6,0.0,3.9,0.5,0.0,0.0,4.4,3.2,3.9,0.0,3.4,0.0,2.2,2.4,2.7,2.7,2.4,1.7,2.2,0.0,2.2,0.2,2.2,0.0,0.0,1.7,1.7,1.7,1.5,1.7,1.5,0.7,0.7,1.2
iso_27002,29.3,0.0,39.0,0.0,0.0,0.0,2.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.8,0.0,0.0,0.0,0.0,0.0,2.4,0.0,0.0,0.0,4.9,0.0,4.9,7.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
iso_27017,4.3,0.0,0.0,0.0,93.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
iso_27018,29.5,0.0,34.1,6.8,0.0,0.0,2.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.1,0.0,0.0,0.0,0.0,0.0,2.3,0.0,0.0,0.0,4.5,0.0,4.5,6.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
iso_27701,15.6,0.7,56.0,1.4,0.0,0.0,0.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.9,0.0,2.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.4,0.0,2.8,0.0,2.8,1.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
soc2,21.3,4.9,0.0,0.0,0.0,0.0,0.0,0.0,36.1,3.3,0.0,32.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Total,24.2,11.6,9.9,6.0,4.1,4.1,3.9,2.4,2.1,2.0,1.9,1.8,1.8,1.6,1.4,1.4,1.2,1.2,1.1,1.1,1.0,1.0,1.0,0.9,0.9,0.8,0.8,0.8,0.8,0.8,0.7,0.7,0.6,0.6,0.6,0.6,0.6,0.5,0.5,0.4,0.4


# Manual merging

In [25]:
# First we view topic trees

# use fitted BERTopic model to extract possible hierarchies from our c-TF-IDF matrix
# https://maartengr.github.io/BERTopic/getting_started/hierarchicaltopics/hierarchicaltopics.html#merge-topics
hierarchical_topics_2 = topic_model_2.hierarchical_topics(docs)  
hierarchical_topics_5 = topic_model_5.hierarchical_topics(docs)  

100%|██████████| 149/149 [00:01<00:00, 148.13it/s]
100%|██████████| 35/35 [00:00<00:00, 140.43it/s]


In [39]:
# view representative docs
topic_model_5.get_topic_info(18)['Representative_Docs']

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,18,12,18_authentication_device_identification_or,"[authentication, device, identification, or, identification and authentication, and authentication, access, identification and, local, users]","[authentication mechanisms, authentication for, identification and authentication, authentication, multi factor authentication, identification or authentication, factor authentication, authenticat...","[authentication, device, identification, or, identification and authentication, and authentication, access, identification and, local, users]",[Identification and authentication. Identification and authentication (organizational users). Uniquely identify and authenticate organizational users and associate that unique identification with ...


: 

In [26]:
# min cluster size 5
topic_model_5.visualize_hierarchy(hierarchical_topics=hierarchical_topics_5)

In [32]:
# print whole tree: min cluster size 5
tree_5 = topic_model_5.get_topic_tree(hierarchical_topics_5)
print(tree_5)

# Save the tree in a file
file_path = 'outputs/tree5.txt'
with open(file_path, 'w', encoding='utf8') as file:
    file.write(tree_5)

.
├─and_system_or_the_of
│    ├─authentication_identification_identification and authentication_and authentication_identification an
│    │    ├─authentication_identification_device_identification and authentication_and authentication
│    │    │    ├─■──authentication_authenticators_identification_identification and authentication_and authentication ── Topic: 29
│    │    │    └─■──authentication_device_identification_or_identification and authentication ── Topic: 18
│    │    └─■──identity_piv_evidence_credentials_identification and authentication ── Topic: 23
│    └─and_system_or_the_of
│         ├─procedures_and_policy_and procedures_policies
│         │    ├─■──procedures_policy_and_and procedures_policies ── Topic: 21
│         │    └─■──and_procedures_policy_and procedures_or ── Topic: 16
│         └─and_system_the_of_to
│              ├─audit_incident_response_records_incident response
│              │    ├─■──configuration_to unauthorized_spam protection_spam_unauthorized ── T

In [28]:
# min cluster size 2

# hierarchical_topics is a dataframe in which merged topics are described. 
# For example, if you would merge two topics, what would the topic representation of the new topic be? 
print(hierarchical_topics_2.shape)
hierarchical_topics_2.tail(3) 

(149, 8)


Unnamed: 0,Parent_ID,Parent_Name,Topics,Child_Left_ID,Child_Left_Name,Child_Right_ID,Child_Right_Name,Distance
2,152,policy_procedures_and procedures_policy and_policies,"[10, 22, 124]",150,policy_procedures_and procedures_policy and_security and privacy,124,accounts_account_group_policy_procedures,0.454026
1,151,cloud service_service customer_cloud service customer_cloud_service,"[23, 53]",53,cloud service customer_service customer_cloud service_cloud_the cloud service,23,cloud service_cloud service customer_service customer_cloud_service,0.369916
0,150,policy_procedures_and procedures_policy and_security and privacy,"[10, 22]",10,policy_procedures_and procedures_policies_policy and,22,policy_and procedures_procedures_policy and_security and,0.268803


In [33]:
# print whole tree: min cluster size 2
tree_2 = topic_model_2.get_topic_tree(hierarchical_topics_2)
print(tree_2)

# Save the trree in a file
file_path = 'outputs/tree2.txt'
with open(file_path, 'w', encoding='utf8') as file:
    file.write(tree_2)

.
├─and_system_or_to_of
│    ├─entity_the entity_personal information_objectives_of focus
│    │    ├─entity_the entity_personal information_objectives_of focus
│    │    │    ├─personal information_personal_the entity_entity_criteria for
│    │    │    │    ├─■──capacity_availability_recovery_criteria for availability_for availability ── Topic: 85
│    │    │    │    └─personal information_personal_the entity_entity_data subjects
│    │    │    │         ├─■──personal information_personal_the entity_entity_data subjects ── Topic: 2
│    │    │    │         └─■──completely_processing_output_criteria for processing_accurately ── Topic: 89
│    │    │    └─entity_the entity_objectives_focus_of focus
│    │    │         ├─■──entity_the entity_communicates_objectives_business partners ── Topic: 46
│    │    │         └─■──control activities_board_board of directors_of directors_board of ── Topic: 12
│    │    └─the entity_entity_software_access_protected information assets
│    │         ├

In [30]:
# min cluster size 2
topic_model_2.visualize_hierarchy(hierarchical_topics=hierarchical_topics_2)

In [36]:
# view representative docs
topic_model_2.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,-1,256,-1_the_of_and_cloud,"[the, of, and, cloud, service, cloud service, to, the cloud, in, for]","[cloud service provider, cloud service customer, cloud service, the cloud service, security, cloud, service provider, protection, identifiable information, service customer]","[the, of, and, cloud, service, cloud service, to, the cloud, in, for]",[Policy on the use of cryptographic controls. Control 10.1.1 and the associated implementation guidance and other information specified in ISO/IEC 27002 apply. The following sector-specific guidan...
1,0,25,0_and_system_security_systems,"[and, system, security, systems, privacy, monitoring, or, the system, security and privacy, and privacy]","[privacy requirements, vulnerability monitoring, security and privacy, security, security and, and privacy requirements, the security and, privacy plans, requirements, the security]","[and, system, security, systems, privacy, monitoring, or, the system, security and privacy, and privacy]",[Planning. Security and privacy architectures. a. Develop security and privacy architectures for the system that: \n1. Describe the requirements and approach to be taken for protecting the confide...
2,1,22,1_alternate_contingency_site_sites,"[alternate, contingency, site, sites, alternate processing, telecommunications, primary, planning, contingency planning, processing site]","[contingency planning alternate, contingency planning, contingency planning telecommunications, contingency planning contingency, contingency plans, planning contingency, contingency plan, plannin...","[alternate, contingency, site, sites, alternate processing, telecommunications, primary, planning, contingency planning, processing site]",[Contingency planning. Alternate processing site | separation from primary site. Identify an alternate processing site that is sufficiently separated from the primary processing site to reduce sus...
3,2,21,2_personal information_personal_the entity_entity,"[personal information, personal, the entity, entity, data subjects, subjects, privacy, criteria for privacy, criteria for, for privacy]","[criteria for privacy, privacy the entity, related to privacy, of personal information, privacy the following, privacy, personal information, to privacy, confidential information, personal informa...","[personal information, personal, the entity, entity, data subjects, subjects, privacy, criteria for privacy, criteria for, for privacy]","[Additional criteria for privacy. The entity communicates choices available regarding the collection, use, retention, disclosure, and disposal of personal information to the data subjects and the ..."
4,3,18,3_key_keys_key management_encryption,"[key, keys, key management, encryption, ckms, management, cryptographic, compromised, encryption key, cryptography encryption]","[encryption key management, key management, key management key, cryptographic keys, management key, key information, compromised keys, cryptographic key, ckms cryptography, system ckms cryptography]","[key, keys, key management, encryption, ckms, management, cryptographic, compromised, encryption key, cryptography encryption]","[Cryptography, encryption & key management. Key archival. Define, implement, and evaluate processes, procedures, and technical measures to manage archived keys in a secure repository requiring lea..."
...,...,...,...,...,...,...,...
146,145,2,145_classification_manager system_convention for asset_convention for,"[classification, manager system, convention for asset, convention for, facility management should, data and system, follow communicated and, for asset classification, concepts user manager, concep...","[management authorization mechanisms, authorization concepts, authorization mechanisms define, security assets classification, multiple authorization concepts, management authorization, access man...","[classification, manager system, convention for asset, convention for, facility management should, data and system, follow communicated and, for asset classification, concepts user manager, concep...","[Datacenter security. Assets classification. Classify and document the physical and logical assets (e.g., applications) based on the organizational business risk. The facility management should de..."
147,146,2,146_threats and non_and non conformance_non conformance_conformance,"[threats and non, and non conformance, non conformance, conformance, detection, threats and, and non, mitigation, and procedures, policies and procedures]","[management detection updates, vulnerability management detection, mitigation of threats, detection updates define, to update detection, detection updates, detection tools threat, monitoring and o...","[threats and non, and non conformance, non conformance, conformance, detection, threats and, and non, mitigation, and procedures, policies and procedures]","[Security incident management, e-discovery, & cloud forensics. Service management policy and procedures. Establish, document, approve, communicate, apply, evaluate, and maintain policies and proce..."
148,147,2,147_name_resolution_address resolution_dns,"[name, resolution, address resolution, dns, authoritative, address, dns servers, servers, clients, child]","[system dns servers, system dns, dns servers, name system dns, dns servers with, domain name system, the dns, address resolution services, address resolution service, dns]","[name, resolution, address resolution, dns, authoritative, address, dns servers, servers, clients, child]",[System and communications protection. Architecture and provisioning for name/address resolution service. Ensure the systems that collectively provide name/address resolution service for an organi...
149,148,2,148_error_error messages_discoverable_information that,"[error, error messages, discoverable, information that, discoverable information, messages, corrective actions, numbers, designated information, corrective]","[exploitable information, exploitable information includes, integrity error handling, information that adversaries, intentionally discoverable information, information determine information, infor...","[error, error messages, discoverable, information that, discoverable information, messages, corrective actions, numbers, designated information, corrective]",[Risk assessment. Vulnerability monitoring and scanning | discoverable information. Determine information about the system that is discoverable and take [assignment: organization-defined correctiv...


## Results visualization

In [None]:
df.head(1)

In [None]:
# to make trees better, we can replace the values using the OpenAI generated ones

def replace_lines_with_labels(input_file, df):
    topic_numbers = set(df['topics_2'])
    
    with open(input_file, 'r', encoding='utf8') as file:
        lines = file.readlines()

    for i, line in enumerate(lines):
        if '─ Topic: ' in line:
            try:
                topic_number = int(line.strip().split('─ Topic: ')[-1])
                if topic_number in topic_numbers:
                    label = df.loc[df['topics_2'] == topic_number, 'topic_label_2'].values[0]
                    # Ensure label is a string and handle NaN or float conversion
                    label_str = str(label) if not pd.isnull(label) else 'No Label'
                    # Include the topic number at the end of the line
                    lines[i] = line.rsplit('─', 1)[0] + '─ ' + label_str + ' ── Topic: ' + str(topic_number) + '\n'
            except ValueError:
                continue

    with open('outputs/modified_tree2.txt', 'w', encoding='utf8') as file:
        file.writelines(lines)

replace_lines_with_labels('outputs/tree2.txt', df)

In [None]:
# for merge, ensure we make a new topic model and not rewrite the old one

In [None]:
topic_model_2.get_topic_info(3)

In [None]:
topic_model_2.get_topic_info(3)['Represen']

In [None]:
#TODO: need to add representative docs to the main df and save, otherwise I can't complete this merging process.

# Optional: Q&A with our finetuned model

In [None]:
# Getting names for topics with our domain fine-tuned model
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

model_name_or_path = "/finetuned_model"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForQuestionAnswering.from_pretrained(model_name_or_path)


In [None]:
# Simple Q&A Function: Define a function that takes a question and a context as inputs, and returns the answer
# This involves encoding the inputs, performing inference to get start and end token positions, and then decoding the answer
def answer_question(question, context):
    # Tokenize the inputs (question and context)
    inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt")
    
    # Perform inference
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    
    # Extract the start and end positions of the answer
    answer_start_scores, answer_end_scores = outputs.start_logits, outputs.end_logits
    answer_start = answer_start_scores.argmax()
    answer_end = answer_end_scores.argmax() + 1  # Add 1 because end position is exclusive
    
    # Decode the answer
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[0][answer_start:answer_end]))
    
    return answer


In [None]:
# Use Q&A function
question = "What is the capital of France?"
context = "France is a country in Europe. Its capital is Paris."

answer = answer_question(question, context)
print(answer)
