# Finetunes an LLM to improve topic modeling results.

2024-03-01

Zachary Kilhoffer

Requirements:
- 'data\df_2024-02-28.xlsx'
- 'data\df_ada_2024-02-28.xlsx'

Outputs:
- fine_tuned_model
- domain_adapted_model

In [147]:
import pandas as pd
import transformers
import accelerate
import ast
import os
import numpy as np
from transformers import AutoTokenizer, AutoModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments, AutoModelForMaskedLM, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, models
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
import joblib  # save the label encoder
from openai import OpenAI
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from bertopic import BERTopic

from umap import UMAP
from hdbscan import HDBSCAN  # https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html#umap


In [121]:
import sys
print(sys.executable)


c:\analysis 2024-02-19\privacyvenvv2\Scripts\python.exe


In [120]:
!where pip

c:\analysis 2024-02-19\privacyvenvv2\Scripts\pip.exe
C:\Program Files\Python311\Scripts\pip.exe
C:\Users\zakki\AppData\Roaming\Python\Python311\Scripts\pip.exe


In [119]:
!where python

c:\analysis 2024-02-19\privacyvenvv2\Scripts\python.exe
C:\Program Files\Python311\python.exe
C:\Users\zakki\AppData\Local\Microsoft\WindowsApps\python.exe


In [3]:
# display tweaks
pd.set_option("display.max_colwidth", 200)  # how much text is showing within a cell
pd.set_option("display.max_columns", False)
pd.set_option("display.max_rows", False)
# warnings.filterwarnings("ignore")

In [111]:
# read api key
def read_key_from_file(filename=r"key.txt"):  # replace with yours
    with open(filename, 'r') as file:
        return file.read().strip()

In [4]:
# getting updated dfs
df = pd.read_excel(r'data\df_2024-02-28.xlsx')
df_ada = pd.read_excel(r'data\df_ada_2024-02-28.xlsx')

In [5]:
# Merge df with df_ada on df['topic'] and df_ada['topic_num']
df_merged = pd.merge(df, df_ada, left_on='topic', right_on='topic_num', how='left')

# drop the 'topic_num' column if it's redundant
df_merged.drop('topic_num', axis=1, inplace=True)

In [6]:
# get rid of needless columns
l = ['Unnamed: 0.1', 'Unnamed: 0_x', 'Unnamed: 0_y', 'Unnamed: 1']
for x in l:
    del df_merged[x]

In [7]:
# getting a way to say if ALL rows in the df are representative
df_temp = df_ada[['topic_num', 'representative_docs']]
df_temp

Unnamed: 0,topic_num,representative_docs
0,-1,['Event logging. Control 12.4.1 and the associated implementation guidance and other information specified in ISO/IEC 27002 apply. The following sector-specific guidance also applies: Public cloud...
1,0,['Control and monitoring of service providers and suppliers (sso). Policies and instructions for controlling and monitoring third parties. Basic criterion: Policies and instructions for controllin...
2,1,['Obligations to personally identifiable information principals. Objective: To ensure that personally identifiable information principals are provided with the appropriate information about the pr...
3,2,['Cloud service provider shall process customer personal data according to customer\'s in-structions. the scope of customer\'s in-structions for the processing of customer personal data shall be d...
4,3,"['Logical and physical access controls. The entity restricts physical access to facilities and protected information assets (for example, data center facilities, backup media storage, and other se..."
5,4,['Configuration management. Configuration settings. A. Establish and document configuration settings for components employed within the system that reflect the most restrictive mode consistent wit...
6,5,['Information access restriction. Control 9.4.1 and the associated implementation guidance and other information specified in ISO/IEC 27002 apply. The following sector-specific guidance also appli...
7,6,['Audit and accountability. Audit record generation. a. Provide audit record generation capability for the event types the system is capable of auditing as defined in au-2a on [assignment: organiz...
8,7,"['Additional criteria for privacy. The entity retains personal information consistent with the entity’s objectives related to privacy.. The following points of focus, which apply only to an engage..."
...,...,...


In [8]:
# read in strings as lists as they should be
df_temp['representative_docs'] = df_temp['representative_docs'].apply(ast.literal_eval)
df_temp

Unnamed: 0,topic_num,representative_docs
0,-1,[Event logging. Control 12.4.1 and the associated implementation guidance and other information specified in ISO/IEC 27002 apply. The following sector-specific guidance also applies: Public cloud ...
1,0,[Control and monitoring of service providers and suppliers (sso). Policies and instructions for controlling and monitoring third parties. Basic criterion: Policies and instructions for controlling...
2,1,[Obligations to personally identifiable information principals. Objective: To ensure that personally identifiable information principals are provided with the appropriate information about the pro...
3,2,[Cloud service provider shall process customer personal data according to customer's in-structions. the scope of customer's in-structions for the processing of customer personal data shall be defi...
4,3,"[Logical and physical access controls. The entity restricts physical access to facilities and protected information assets (for example, data center facilities, backup media storage, and other sen..."
5,4,[Configuration management. Configuration settings. A. Establish and document configuration settings for components employed within the system that reflect the most restrictive mode consistent with...
6,5,[Information access restriction. Control 9.4.1 and the associated implementation guidance and other information specified in ISO/IEC 27002 apply. The following sector-specific guidance also applie...
7,6,[Audit and accountability. Audit record generation. a. Provide audit record generation capability for the event types the system is capable of auditing as defined in au-2a on [assignment: organiza...
8,7,"[Additional criteria for privacy. The entity retains personal information consistent with the entity’s objectives related to privacy.. The following points of focus, which apply only to an engagem..."
...,...,...


In [9]:
# explode to get items in list out
df_temp = df_temp.explode(['representative_docs'])
df_temp.head(5)

Unnamed: 0,topic_num,representative_docs
0,-1,Event logging. Control 12.4.1 and the associated implementation guidance and other information specified in ISO/IEC 27002 apply. The following sector-specific guidance also applies: Public cloud p...
0,-1,Event logging. Control 12.4.1 and the associated implementation guidance and other information specified in ISO/IEC 27002 apply. The following sector-specific guidance also applies.\n\nPublic clou...
0,-1,Information backup. Control 12.3.1 and the associated implementation guidance specified in ISO/IEC 27002 apply. The following sector-specific guidance also applies. Public cloud personally identif...
1,0,Control and monitoring of service providers and suppliers (sso). Policies and instructions for controlling and monitoring third parties. Basic criterion: Policies and instructions for controlling ...
1,0,"Portability and interoperability (pi). Contractual agreements for the provision of data. Basic Criterion: In contractual agreements, the following aspects are defined with regard to the terminatio..."


In [10]:
# Create a set for faster lookup times of representative_docs
representative_docs_set = set(df_temp['representative_docs'])

# Use apply to check each row in df for a match in representative_docs_set
df_merged['is_representative'] = df_merged['full_control_text'].apply(lambda x: x in representative_docs_set)

In [11]:
df_merged['is_representative'].value_counts()

is_representative
False    1032
True       93
Name: count, dtype: int64

In [None]:
# # to visually/manually inspect representative docs. Do they pass the smell test?
# df_merged[df_merged['is_representative'] == True].to_excel(r'data\finalized\df to inspect 2024-02-28.xlsx')

### TBD: in df_merged.topic_label, add 'NA' if df_merged.topic == -1
That will make the pivot table more informative
Rather, just change the label "Cloud data privacy and security" into "Unlabeled"

In [12]:
# Pivot table: Document summary

# Step 1: Add a helper column for counting if needed
df_merged['count'] = 1

# Step 2: Create the pivot table using 'docs' as index, 'topic' as columns, and the new 'count' column for values
pivot_table = df_merged.pivot_table(index='document', columns='topic_label', values='count', aggfunc='count', fill_value=0)

pivot_table.loc['Total'] = pivot_table.sum()

# Now, pivot_table should have documents as rows and topics as columns with counts
pivot_table

topic_label,Audit Accountability and Logging Procedures.,Auditing Vulnerabilities in Cloud Security,Cloud Data Privacy and Security,Cloud Data Privacy and Security.,Cloud Service Security and Virtualization,Communication and COSO Process Assessments.,Cryptographic Key Management Processes.,GDPR Compliance in Cloud Services,ISO Standards for PII Data Protection.,Impartial Security Assessment and Authorization.,Incident Response and Organization Security.,Media Sanitization and Protection.,Network Security and Unauthorized Access,Organization's Group Account Security Management.,Organization's Security and Access Control,Organizational Configuration Management Processes.,Organizational Contingency and Reconstitution Planning,Organizational Security Policies and Procedures.,Organizational Security Process and Documentation,Organizational Security and Employee Training.,Organizational User Identification and Authentication.,PII Handling and Lawful Processing.,Personal Data Security Lifecycle Management,Physical Access Control Measures.,Privacy Control and Data Management,Public Cloud PII Protection Guidance.,SSRM in Cloud Service Management.,Supply Chain Risk Mitigation.,Unauthorized Malicious System Intrusion Alerts.,Universal Endpoint Management and Security,Vulnerability Management and Remediation Strategies
document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
c5,0,121,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
ccm,11,21,37,0,0,0,22,1,0,1,7,2,0,3,1,8,0,2,0,13,3,0,16,10,0,0,16,0,0,13,10
eu_coc,0,0,2,0,0,0,0,59,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
fedramp,27,0,40,0,0,0,4,0,0,12,22,10,28,23,12,32,34,18,21,13,31,0,0,26,0,0,0,13,34,0,10
iso_27002,0,0,18,12,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,2,0,0,0,6,0,0,0,0,0
iso_27017,0,4,2,0,38,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
iso_27018,0,0,19,11,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,3,0,0,0,7,0,0,0,0,0
iso_27701,0,0,23,0,0,0,0,0,24,0,0,5,0,0,0,0,0,0,0,0,0,86,0,0,0,3,0,0,0,0,0
soc2,0,0,0,0,0,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,36,0,0,0,0,0,0
Total,38,146,141,23,38,17,26,61,24,13,29,26,28,26,13,40,34,20,21,27,34,91,16,44,36,17,16,13,34,13,20


In [13]:
# normalized pivot table

# Normalize the counts by row to get the percentage and then round to two decimals
pivot_table_percentage_rounded = (pivot_table.div(pivot_table.sum(axis=1), axis=0) * 100).round(1)

# Display the pivot table with percentages rounded to two decimals
pivot_table_percentage_rounded

topic_label,Audit Accountability and Logging Procedures.,Auditing Vulnerabilities in Cloud Security,Cloud Data Privacy and Security,Cloud Data Privacy and Security.,Cloud Service Security and Virtualization,Communication and COSO Process Assessments.,Cryptographic Key Management Processes.,GDPR Compliance in Cloud Services,ISO Standards for PII Data Protection.,Impartial Security Assessment and Authorization.,Incident Response and Organization Security.,Media Sanitization and Protection.,Network Security and Unauthorized Access,Organization's Group Account Security Management.,Organization's Security and Access Control,Organizational Configuration Management Processes.,Organizational Contingency and Reconstitution Planning,Organizational Security Policies and Procedures.,Organizational Security Process and Documentation,Organizational Security and Employee Training.,Organizational User Identification and Authentication.,PII Handling and Lawful Processing.,Personal Data Security Lifecycle Management,Physical Access Control Measures.,Privacy Control and Data Management,Public Cloud PII Protection Guidance.,SSRM in Cloud Service Management.,Supply Chain Risk Mitigation.,Unauthorized Malicious System Intrusion Alerts.,Universal Endpoint Management and Security,Vulnerability Management and Remediation Strategies
document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
c5,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ccm,5.6,10.7,18.8,0.0,0.0,0.0,11.2,0.5,0.0,0.5,3.6,1.0,0.0,1.5,0.5,4.1,0.0,1.0,0.0,6.6,1.5,0.0,8.1,5.1,0.0,0.0,8.1,0.0,0.0,6.6,5.1
eu_coc,0.0,0.0,3.2,0.0,0.0,0.0,0.0,93.7,0.0,0.0,0.0,1.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fedramp,6.6,0.0,9.8,0.0,0.0,0.0,1.0,0.0,0.0,2.9,5.4,2.4,6.8,5.6,2.9,7.8,8.3,4.4,5.1,3.2,7.6,0.0,0.0,6.3,0.0,0.0,0.0,3.2,8.3,0.0,2.4
iso_27002,0.0,0.0,43.9,29.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.9,0.0,0.0,0.0,14.6,0.0,0.0,0.0,0.0,0.0
iso_27017,0.0,8.5,4.3,0.0,80.9,0.0,0.0,2.1,0.0,0.0,0.0,2.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.1,0.0,0.0,0.0,0.0,0.0
iso_27018,0.0,0.0,43.2,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.8,0.0,0.0,0.0,15.9,0.0,0.0,0.0,0.0,0.0
iso_27701,0.0,0.0,16.3,0.0,0.0,0.0,0.0,0.0,17.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61.0,0.0,0.0,0.0,2.1,0.0,0.0,0.0,0.0,0.0
soc2,0.0,0.0,0.0,0.0,0.0,27.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.1,59.0,0.0,0.0,0.0,0.0,0.0,0.0
Total,3.4,13.0,12.5,2.0,3.4,1.5,2.3,5.4,2.1,1.2,2.6,2.3,2.5,2.3,1.2,3.6,3.0,1.8,1.9,2.4,3.0,8.1,1.4,3.9,3.2,1.5,1.4,1.2,3.0,1.2,1.8


In [14]:
# Sum the values for each column and add as a new row called "Total"
pivot_table_percentage_rounded.loc['Total'] = pivot_table_percentage_rounded.sum()

# Display the updated pivot table with the "Total" row
pivot_table_percentage_rounded

topic_label,Audit Accountability and Logging Procedures.,Auditing Vulnerabilities in Cloud Security,Cloud Data Privacy and Security,Cloud Data Privacy and Security.,Cloud Service Security and Virtualization,Communication and COSO Process Assessments.,Cryptographic Key Management Processes.,GDPR Compliance in Cloud Services,ISO Standards for PII Data Protection.,Impartial Security Assessment and Authorization.,Incident Response and Organization Security.,Media Sanitization and Protection.,Network Security and Unauthorized Access,Organization's Group Account Security Management.,Organization's Security and Access Control,Organizational Configuration Management Processes.,Organizational Contingency and Reconstitution Planning,Organizational Security Policies and Procedures.,Organizational Security Process and Documentation,Organizational Security and Employee Training.,Organizational User Identification and Authentication.,PII Handling and Lawful Processing.,Personal Data Security Lifecycle Management,Physical Access Control Measures.,Privacy Control and Data Management,Public Cloud PII Protection Guidance.,SSRM in Cloud Service Management.,Supply Chain Risk Mitigation.,Unauthorized Malicious System Intrusion Alerts.,Universal Endpoint Management and Security,Vulnerability Management and Remediation Strategies
document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
c5,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ccm,5.6,10.7,18.8,0.0,0.0,0.0,11.2,0.5,0.0,0.5,3.6,1.0,0.0,1.5,0.5,4.1,0.0,1.0,0.0,6.6,1.5,0.0,8.1,5.1,0.0,0.0,8.1,0.0,0.0,6.6,5.1
eu_coc,0.0,0.0,3.2,0.0,0.0,0.0,0.0,93.7,0.0,0.0,0.0,1.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fedramp,6.6,0.0,9.8,0.0,0.0,0.0,1.0,0.0,0.0,2.9,5.4,2.4,6.8,5.6,2.9,7.8,8.3,4.4,5.1,3.2,7.6,0.0,0.0,6.3,0.0,0.0,0.0,3.2,8.3,0.0,2.4
iso_27002,0.0,0.0,43.9,29.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.9,0.0,0.0,0.0,14.6,0.0,0.0,0.0,0.0,0.0
iso_27017,0.0,8.5,4.3,0.0,80.9,0.0,0.0,2.1,0.0,0.0,0.0,2.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.1,0.0,0.0,0.0,0.0,0.0
iso_27018,0.0,0.0,43.2,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.8,0.0,0.0,0.0,15.9,0.0,0.0,0.0,0.0,0.0
iso_27701,0.0,0.0,16.3,0.0,0.0,0.0,0.0,0.0,17.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61.0,0.0,0.0,0.0,2.1,0.0,0.0,0.0,0.0,0.0
soc2,0.0,0.0,0.0,0.0,0.0,27.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.1,59.0,0.0,0.0,0.0,0.0,0.0,0.0
Total,15.6,132.2,152.0,56.3,84.3,29.4,14.5,101.7,19.1,4.6,11.6,29.3,9.3,9.4,4.6,15.5,11.3,7.2,7.0,13.8,12.1,80.8,9.5,28.4,62.2,36.2,9.5,4.4,11.3,7.8,9.3


# Training domain-customized finetuning model

In [15]:
# 1. Prepare dataset
texts = list(df_merged['full_control_text'].values)

In [None]:
# 2. Tokenize Your Dataset
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

# Create a Dataset object from your texts
dataset = Dataset.from_dict({"text": texts})

# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# 3. Create a Data Collator for MLM
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [None]:
model = AutoModelForMaskedLM.from_pretrained("bert-base-cased")

training_args = TrainingArguments(
    output_dir="./pretrained_model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,  # Directly use the tokenized dataset here
)

trainer.train()

In [None]:
# 5. Saving the Further Pretrained Model
model.save_pretrained("outputs\domain_adapted_model")
tokenizer.save_pretrained("outputs\domain_adapted_model")

# TBD
finetune with labeled data

In [None]:
# importing the model already trained 
domain_adapted_model = 'outputs\domain_adapted_model'

## Prepping training data

In [None]:
# importing training data not yet in proper format
df_training = pd.read_excel(r'data\labels\df_labels_consensus.xlsx', sheet_name='Data')
temp_labels = pd.read_excel(r'data\labels\df_labels_consensus.xlsx', sheet_name='Label Choices')

In [None]:
# inspecting
df_training.head()

In [None]:
df_training['66_label'][2]

In [None]:
# inspecting
temp_labels.head()

In [None]:
# must strip these strings
str(temp_labels['Abbreviation'][22])

In [None]:
temp_labels['Abbreviation'] = temp_labels['Abbreviation'].apply(lambda x: str(x).strip())

In [None]:
# confirming it worked
str(temp_labels['Abbreviation'][22])

In [None]:
# removing all but necessary columns for simplicity
temp_labels = temp_labels[['Category', 'Definition', 'Abbreviation']]
temp_labels.head()

In [None]:
# only taking the majority opinion (66_label) because I"m not sure about weighting

# Merge df_training with temp_labels on df_training['33_label'] and df_ada['topic_num']
df_training = pd.merge(df_training, temp_labels, left_on='66_label', right_on='Abbreviation', how='left')

# drop unneeded columns
to_drop = ['33_label', '66_label', 'page', 'document', 'Abbreviation', 'indexmaybe', 'control_category', 'control_text']

for x in to_drop:
    del df_training[x]

df_training.head()

In [None]:
# concatenate the category and definition as we need one column of labels
df_training['label'] = df_training['Category'] + ': ' + df_training['Definition']
df_training['label']
del df_training['Category']
del df_training['Definition']

# check results
df_training.head()

## Tokenizing training data

In [None]:
# df_training is DataFrame with labeled data
texts = df_training['control_text_corrected'].values
labels = df_training['label'].values

# Convert text labels to integers
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Create a Dataset object from your texts and encoded labels
labeled_dataset = Dataset.from_dict({"text": texts, "label": encoded_labels})

# Tokenize the labeled dataset
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    # Adjust the function to handle the mapping correctly for sequence classification
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized_labeled_dataset = labeled_dataset.map(tokenize_function, batched=True)

# Ensure model is suited for sequence classification
model = AutoModelForSequenceClassification.from_pretrained(domain_adapted_model, num_labels=len(label_encoder.classes_))

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="outputs/fine_tuned_model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_labeled_dataset,
)

# Train the model
trainer.train()

# Optionally, save the model and the tokenizer
model.save_pretrained("outputs/fine_tuned_model")
tokenizer.save_pretrained("outputs/fine_tuned_model")

In [None]:
# Also, save the label encoder for later use in inference to decode the predicted labels
joblib.dump(label_encoder, "outputs/fine_tuned_model/label_encoder.joblib")

# Retry Topic Modeling w/Fine-tuned model

In [19]:
# convert model to something BERTopic can use

# Load fine-tuned model and its tokenizer
model_path = 'outputs/fine_tuned_model'  # Adjust path as necessary
transformer_model = AutoModel.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Create a mean pooling layer
pooling_model = models.Pooling(
    transformer_model.config.hidden_size,
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False
)

# Combine the models into a SentenceTransformers model
model = SentenceTransformer(modules=[models.Transformer(model_path), pooling_model])

# Save the SentenceTransformers model for future use
model.save('outputs/sentence_transformers_compatible_model')


In [20]:
# Block 1: Generate Embeddings Using the Fine-tuned Model

# make list of texts for embeddings
docs = list(df_merged['full_control_text'].values)

# Load your fine-tuned sentence-transformers model
model_path = "outputs/sentence_transformers_compatible_model"
finetuned_model = SentenceTransformer(model_path)

# Generate embeddings for each document
embeddings = finetuned_model.encode(docs, show_progress_bar=True)

# Add the embeddings to DataFrame
df_merged['finetune_embeddings'] = embeddings.tolist()

Batches: 100%|██████████| 36/36 [04:49<00:00,  8.04s/it]


In [None]:
# # Block 2: Use Pre-generated Embeddings for Topic Modeling with BERTopic, KeyBERTInspired

# # Load pre-generated embeddings
# pre_generated_embeddings = list(df_merged['finetune_embeddings'].values)
# pre_generated_embeddings = np.array(pre_generated_embeddings)

# # Create a representation model
# representation_model = KeyBERTInspired(random_state=42)

# # Instantiate BERTopic with your fine-tuned model's embeddings and the representation model
# topic_model = BERTopic(embedding_model='all-MiniLM-L6-v2',  # needed, but doesn't regenerate embeddings. see https://github.com/MaartenGr/BERTopic/issues/1601
#                        verbose=True,
#                        n_gram_range=(1, 3),
#                        min_topic_size=2,
#                        calculate_probabilities=True,
#                        representation_model=representation_model)

# # Fit BERTopic to your documents using pre-generated embeddings
# topics, probs = topic_model.fit_transform(docs, embeddings=pre_generated_embeddings)


In [None]:
# # Optional: Print top terms for each topic
# space = " "*10
# for k, v in topic_model.get_topics().items():
    
#     if k != -1:  # Exclude outlier topic if present
#         print(f'Cluster {k: 3}: {",     ".join([term for term, _ in v[:3]])}')

## Try 2 showing documents per class (to abandon)

In [None]:
# # Block 2: Use Pre-generated Embeddings for Topic Modeling with BERTopic, KeyBERTInspired

# # Load pre-generated embeddings
# pre_generated_embeddings = list(df_merged['finetune_embeddings'].values)
# pre_generated_embeddings = np.array(pre_generated_embeddings)

# # Create a representation model
# representation_model = KeyBERTInspired(random_state=42)

# # prepare classes - experimental, trying to visualize topics by class (document) easier! https://maartengr.github.io/BERTopic/getting_started/visualization/visualize_topics.html#visualize-topics-per-class
# classes = df_merged['document']

# # Instantiate BERTopic with your fine-tuned model's embeddings and the representation model
# topic_model = BERTopic(embedding_model='all-MiniLM-L6-v2',  # needed, but doesn't regenerate embeddings. see https://github.com/MaartenGr/BERTopic/issues/1601
#                        verbose=True,
#                        n_gram_range=(1, 3),
#                        min_topic_size=2,
#                        calculate_probabilities=True,
#                        representation_model=representation_model)

# # Fit BERTopic to your documents using pre-generated embeddings
# topics, probs = topic_model.fit_transform(docs, embeddings=pre_generated_embeddings)
# topics_per_class = topic_model.topics_per_class(docs, classes=classes)

In [None]:
# topic_model.visualize_topics_per_class(topics_per_class)

In [None]:
# df_merged['topics_new'] = topics

In [None]:
# # Pivot table: Document summary

# # Step 1: Add a helper column for counting if needed
# df_merged['count'] = 1

# # Step 2: Create the pivot table using 'docs' as index, 'topic' as columns, and the new 'count' column for values
# pivot_table = df_merged.pivot_table(index='document', columns='topics_new', values='count', aggfunc='count', fill_value=0)

# pivot_table.loc['Total'] = pivot_table.sum()


# # Now, pivot_table should have documents as rows and topics as columns with counts
# pivot_table

In [None]:
# # normalized pivot table

# # Normalize the counts by row to get the percentage and then round to two decimals
# pivot_table_percentage_rounded = (pivot_table.div(pivot_table.sum(axis=1), axis=0) * 100).round(1)

# # Display the pivot table with percentages rounded to two decimals
# pivot_table_percentage_rounded

# Try 3

### TO DO: 

- Set min topic size == 2, Get the topics to be done hierarchically, then merge manually as needed

In [None]:
# # Block 2: Use Pre-generated Embeddings for Topic Modeling with BERTopic, KeyBERTInspired

# # Load pre-generated embeddings
# pre_generated_embeddings = list(df_merged['finetune_embeddings'].values)
# pre_generated_embeddings = np.array(pre_generated_embeddings)

# # Create a representation model
# representation_model = KeyBERTInspired(random_state=42)

# # Instantiate BERTopic with your fine-tuned model's embeddings and the representation model
# topic_model = BERTopic(embedding_model=finetuned_model,  # needed, but doesn't regenerate embeddings. see https://github.com/MaartenGr/BERTopic/issues/1601
#                        verbose=True,
#                        n_gram_range=(1, 3),
#                        min_topic_size=2,
#                        calculate_probabilities=True,
#                        representation_model=representation_model)

# # Fit BERTopic to your documents using pre-generated embeddings
# topics, probs = topic_model.fit_transform(docs, embeddings=pre_generated_embeddings)

In [None]:
# # Optional: Print top terms for each topic
# space = " "*10
# for k, v in topic_model.get_topics().items():
#     if k != -1:  # Exclude outlier topic if present
#         print(f'Cluster {k: 3}: {",     ".join([term for term, _ in v[:3]])}')

# 2024-03-04

MaximalMarginalRelevance¶

When we calculate the weights of keywords, we typically do not consider whether we already have similar keywords in our topic. Words like "car" and "cars" essentially represent the same information and often redundant.



To decrease this redundancy and improve the diversity of keywords, we can use an algorithm called Maximal Marginal Relevance (MMR).

***

In [142]:
# instantiate topic modeling

# Load fine-tuned sentence-transformers model
model_path = "outputs/sentence_transformers_compatible_model"
finetuned_model = SentenceTransformer(model_path)

# Load pre-generated embeddings
pre_generated_embeddings = list(df_merged['finetune_embeddings'].values)
pre_generated_embeddings = np.array(pre_generated_embeddings)

# specifying dimensionality reduction
umap_model = UMAP(n_neighbors=15, n_components=5, metric='cosine', low_memory=False)  # may need to tweak

# specifying cluster model
hdbscan_model = HDBSCAN(min_cluster_size=2, metric='euclidean', prediction_data=True)  # To Do: check with new min cluster size

# Create a representation model, 3 parts
keybert_model = KeyBERTInspired(random_state=42)
mmr_model = MaximalMarginalRelevance(diversity=0.3)
representation_model = {
    "KeyBERT": keybert_model,
    # "OpenAI": openai_model,  # Uncomment if you will use OpenAI
    "MMR": mmr_model
}

# Instantiate BERTopic with fine-tuned model's embeddings and the representation model
topic_model = BERTopic(embedding_model=finetuned_model,  
                       verbose=True,
                       n_gram_range=(1, 3),
                       min_topic_size=2,
                       calculate_probabilities=True,
                       representation_model=representation_model)

# note that embedding_model=finetuned_model doesn't remake embeddings. see https://github.com/MaartenGr/BERTopic/issues/1601                   

In [143]:
# run topic modeling
topics, probs = topic_model.fit_transform(docs, embeddings=pre_generated_embeddings)

2024-03-04 15:27:47,800 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-04 15:27:50,888 - BERTopic - Dimensionality - Completed ✓
2024-03-04 15:27:50,890 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-04 15:27:51,681 - BERTopic - Cluster - Completed ✓
2024-03-04 15:27:51,684 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-04 15:32:04,502 - BERTopic - Representation - Completed ✓


In [160]:
df_merged['document'].value_counts()

document
fedramp      410
ccm          197
iso_27701    141
c5           121
eu_coc        63
soc2          61
iso_27017     47
iso_27018     44
iso_27002     41
Name: count, dtype: int64

In [159]:
df_merged.head(3)

Unnamed: 0,control_category,control_code,control_name,control_text,page,document,control_text_corrected,full_control_text,finetune_embeddings,count
0,organisation of information security (ois),OIS-01,information security management system (isms),basic criterion: the cloud service provider operates an information security management system (isms) in accordance with iso/iec 27001. the scope of the isms covers the cloud service provider’s or...,36.0,c5,Basic criterion: The cloud service provider operates an Information Security Management System (ISMS) in accordance with ISO/IEC 27001. The scope of the ISMS covers the cloud service provider's or...,Organisation of information security (ois). Information security management system (isms). Basic criterion: The cloud service provider operates an Information Security Management System (ISMS) in ...,"[0.26916763186454773, -0.17895491421222687, 0.05843820422887802, 0.10326196253299713, 0.41640037298202515, -0.26914629340171814, 0.020081838592886925, 0.2376757264137268, 0.32093170285224915, 0.48...",1
1,organisation of information security (ois),OIS-02,information security policy,basic criterion: the top management of the cloud service provider has adopted an information security policy and communicated it to internal and external employees as well as cloud customers. the ...,36.0,c5,"Basic criterion: The top management of the cloud service provider has adopted an information security policy and communicated it to internal and external employees, as well as cloud customers. The...",Organisation of information security (ois). Information security policy. Basic criterion: The top management of the cloud service provider has adopted an information security policy and communicat...,"[0.30318355560302734, -0.28403574228286743, 0.011182880960404873, 0.11411823332309723, 0.5018870830535889, -0.15271033346652985, 0.13693717122077942, 0.2578427791595459, 0.3121120035648346, 0.2964...",1
2,organisation of information security (ois),OIS-03,interfaces and dependencies,basic criterion: interfaces and dependencies between cloud service delivery activities performed by the cloud service provider and activities performed by third parties are documented and communic...,37.0,c5,Basic criterion: Interfaces and dependencies between cloud service delivery activities performed by the cloud service provider and activities performed by third parties are documented and communic...,Organisation of information security (ois). Interfaces and dependencies. Basic criterion: Interfaces and dependencies between cloud service delivery activities performed by the cloud service provi...,"[0.3901818096637726, -0.3114907145500183, 0.045606814324855804, 0.10600920766592026, 0.46738895773887634, -0.3043411374092102, 0.04243272915482521, 0.09063242375850677, 0.15170085430145264, 0.3258...",1


In [158]:
# must manually add openai representation because the bertopic utility doesn't work
client = OpenAI(api_key=read_key_from_file())
# openai_model = OpenAI(client, model="gpt-4", exponential_backoff=True, chat=False)

# main loop - get the topic names from OpenAI
import time
rows_to_append = []

# client = openai.OpenAI(api_key=read_key_from_file())  # NOTE: this is the old and now defunct way of calling OPenAI

for x, i in enumerate(topic_model.get_topics()):
  # skip -1, outliers
  if x == 0 and i == -1:
        continue
  completion = client.chat.completions.create(
    model="gpt-4",
    messages = [
{"role": "system", "content": "You are a helpful assistant, knowledgeable on data privacy and security standards and controls, that helps in topic modeling tasks."},
{"role": "user", "content": f"I have a topic that contains the following documents: {topic_model.get_representative_docs()[i]}."},
{"role": "user", "content": f"The topic is described by the following keyword-probability pairs: {topic_model.get_topics()[i]}."},
{"role": "user", "content": """Based on the information above, extract a short but highly descriptive topic label of at most 5 words. 
 
    Make sure it is in the following format:
    
    topic: <topic label>."""},
]
  )

  # Extract the topic label from the completion response
  topic_label = completion.choices[0].message.content.split("topic: ")[1].strip()

  rows_to_append.append({
    "topic_num": i,
    "representative_docs": topic_model.get_representative_docs()[i],  
    "top_words": topic_model.get_topics()[i], 
    "topic_label": topic_label
    })

  print(f"Appended topic {i}: {topic_label}")

  time.sleep(.1)

Appended topic 0: Cloud Service Security and Privacy
Appended topic 1: Cloud Service Provider Criteria Compliance.
Appended topic 2: Cloud Service Provider Data Security.
Appended topic 3: Contingency Planning for Alternate Sites.
Appended topic 4: Privacy & Personal Information Handling
Appended topic 5: Cryptography and Key Management Processes.
Appended topic 6: Public Cloud Personal Identifiable Information Control
Appended topic 7: ISO Standards in Information Security
Appended topic 8: Security and Privacy System Requirements
Appended topic 9: Identification and Multi-Factor Authentication
Appended topic 10: Organizational Control over Personal Information.
Appended topic 11: Maintenance and Configuration Management Tools
Appended topic 12: Cloud Service Physical Security Requirements.
Appended topic 13: Personally Identifiable Information Control
Appended topic 14: System Component Configuration Management
Appended topic 15: Cloud Service Provider Data Management.
Appended topic

In [None]:
# Initial DataFrame
df_ada = pd.DataFrame({"topic_num": [],
                       "representative_docs": [],
                       "top_words": [],
                       "topic_label": []})
# Concatenate collected rows with the original DataFrame
df_ada = pd.concat([df_ada, pd.DataFrame(rows_to_append)], ignore_index=True)

***

In [82]:
# use fitted BERTopic model to extract possible hierarchies from our c-TF-IDF matrix
hierarchical_topics = topic_model.hierarchical_topics(docs)  # https://maartengr.github.io/BERTopic/getting_started/hierarchicaltopics/hierarchicaltopics.html#merge-topics

100%|██████████| 82/82 [00:23<00:00,  3.47it/s]


In [83]:
hierarchical_topics.tail(20)

Unnamed: 0,Parent_ID,Parent_Name,Topics,Child_Left_ID,Child_Left_Name,Child_Right_ID,Child_Right_Name,Distance
19,102,authentication_identification and authentication_and authentication_authenticators_multi factor authentication,"[21, 40]",21,authentication_identification and authentication_and authentication_multi factor authentication_factor authentication,40,authentication_identification and authentication_cached authenticators_authenticators identification and_cryptographic module,0.746193
18,101,data subjects_criteria for privacy_of personal information_the entity objectives_objectives related to,"[7, 76]",76,criteria for processing_for processing integrity_processing integrity_for processing_completely and accurately,7,data subjects_criteria for privacy_of personal information_objectives related to_to privacy,0.744382
17,100,the cloud_cloud service_the cloud service_provider_service provider,"[0, 4, 13, 61]",98,cloud service provider_service provider_cloud service_the customer_the cloud service,0,the cloud service_continuous_cloud service_provider_service provider,0.721467
16,99,and privacy_security and privacy_security and_development_life cycle,"[8, 36, 57]",36,system component or_system component_supply chain_system system_system system component,94,security and privacy_and privacy_security and_development_systems,0.700403
15,98,cloud service provider_service provider_cloud service_the customer_the cloud service,"[4, 13, 61]",61,the cloud service_cloud service provider_cloud service_provide the customer_subcontractors or,85,cloud service provider_service provider_cloud service_the customer_the cloud service,0.699175
14,97,iec 27002 2013_iec 27002_iso iec 27002_personally identifiable information_personally identifiable,"[16, 31, 78]",89,iec 27002 2013_iec 27002_iso iec 27002_personally identifiable_personally identifiable information,78,technical compliance review_27002 2013 18_27002 2013_iec 27002 2013_in iso iec,0.666756
13,96,personally identifiable_personally identifiable information_identifiable information_cloud personally identifiable_public cloud,"[23, 26, 63, 72]",87,personally identifiable information_personally identifiable_identifiable information_cloud personally identifiable_public cloud,92,personally identifiable information_personally identifiable_identifiable information_identifiable_public cloud,0.638196
12,95,of focus_points of focus_points of_board of_the board of,"[19, 48]",19,the board of_of directors_board of directors_board of_of focus,48,communicates_business partners_and business_and business partners_vendors and business,0.635446
11,94,security and privacy_and privacy_security and_development_systems,"[8, 57]",8,monitoring_security and privacy_and privacy_security and_the system,57,and privacy_security and privacy_system development life_development life cycle_system development,0.608296
...,...,...,...,...,...,...,...,...


In [84]:
# TBC: is this the same as below, or do we have different topics/topic representations here?
# Print top terms for each topic
space = " "*10
for k, v in topic_model.get_topics().items():
    if k != -1:  # Exclude outlier topic if present
        print(f'Cluster {k: 3}: {",     ".join([term for term, _ in v[:3]])}')

Cluster   0: the cloud service,     continuous,     cloud service
Cluster   1: supply chain,     and procedures,     policies and procedures
Cluster   2: privileged access,     access management,     identity access
Cluster   3: personally identifiable,     personally identifiable information,     identifiable information
Cluster   4: cloud service provider,     service provider,     cloud service
Cluster   5: business continuity,     resilience,     and procedures
Cluster   6: contingency,     alternate processing,     contingency planning
Cluster   7: data subjects,     criteria for privacy,     of personal information
Cluster   8: monitoring,     security and privacy,     and privacy
Cluster   9: key management,     cryptography encryption,     encryption key management
Cluster  10: incident response,     incident response incident,     response training
Cluster  11: cloud service,     the cloud service,     cloud service customer
Cluster  12: security and privacy,     security and,

- check here: https://maartengr.github.io/BERTopic/index.html#visualizations

In [85]:
# visualize these results
# NOTE: the differences between topics aren't that clear from the keywords. I want to try with the min-max distance thing
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

***

In [88]:
# inspect topics per document (e.g., FedRAMP). not sure this is displaying right? can check with pivot tables
# topics per class
classes = np.array(df_merged['document'])

# Create topic model and calculate topics per class
# topic_model = BERTopic()
# topic_model = BERTopic(embedding_model=finetuned_model,  
#                        verbose=True,
#                        n_gram_range=(1, 3),
#                        min_topic_size=5,
#                        calculate_probabilities=True,
#                        representation_model=representation_model)
# topics, probs = topic_model.fit_transform(docs)
topics_per_class = topic_model.topics_per_class(docs, classes=classes)
topic_model.visualize_topics_per_class(topics_per_class)


0it [00:00, ?it/s]

9it [00:46,  5.11s/it]


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

## Merge Topics

In [89]:
# merge topics

"""
In BERTopic, you can use .merge_topics to manually select and merge those topics. Doing so will update their topic representation which in turn updates the entire model

topics_to_merge = [1, 2]
topic_model.merge_topics(docs, topics_to_merge)
"""

# If you have several groups of topics you want to merge, create a list of lists instead:

# NOTE: there are more merges possible but these are a good starting point
topics_to_merge = [[56, 41, 75, 34, 44, 82, 51, 26, 72, 23, 63, 74, 16, 31, 78, 43, 3, 22, 17, 66],  # PII management
                    [15, 11, 79],  # CSP general? unclear but really closely related topics
                    [10, 71, 6],  # incident response and contingency plan 
                    [8, 57],
                    [25, 36, 1],  # supply chain. note that 1 isn't in same hierarchy, could impact results after merge?
                    [37, 70],  # mobile
                    [6, 71, 10, 54],  # contingency planning and incident response
                    [45, 30, 46]]  # network isolation
            
# to merge:
topic_model.merge_topics(docs, topics_to_merge)

In [94]:
# Print top terms for each topic

for k, v in topic_model.get_topics().items():
    if k != -1:
        print(f'Cluster {k : >2}:  {v[0][0]: >35} {v[1][0]: >35} {v[2][0]: >35}')

Cluster  0:  personally identifiable information             personally identifiable            identifiable information
Cluster  1:                    the cloud service                       cloud service                          continuous
Cluster  2:                         supply chain                            security                                  of
Cluster  3:                          contingency                   incident response                alternate processing
Cluster  4:                            should be                   privileged access                   access management
Cluster  5:                        cloud service                   the cloud service              cloud service customer
Cluster  6:               cloud service provider                    service provider                       cloud service
Cluster  7:                 security and privacy                         and privacy                        security and
Cluster  8:                  bus

In [95]:
# printing tree
hierarchical_topics = topic_model.hierarchical_topics(docs)
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)

100%|██████████| 53/53 [00:13<00:00,  3.82it/s]

.
├─cloud service_the cloud_the cloud service_customer_service provider
│    ├─cloud service_the cloud_the cloud service_provider_service provider
│    │    ├─customer_cloud service_cloud service provider_service provider_the customer
│    │    │    ├─■──the cloud service_cloud service_cloud service provider_provide the customer_subcontractors or ── Topic: 44
│    │    │    └─customer_cloud service_cloud service provider_service provider_the customer
│    │    │         ├─■──cloud service provider_service provider_cloud service_service provider shall_customer personal data ── Topic: 6
│    │    │         └─■──cloud service_cloud service provider_service provider_the customer_the cloud service ── Topic: 13
│    │    └─the cloud_cloud service_the cloud service_criterion_provider
│    │         ├─■──cloud service_the cloud service_cloud service customer_cloud service provider_service provider ── Topic: 5
│    │         └─■──the cloud service_cloud service_continuous_provider_service provi




## pivot table to check how the topics now align to documents


In [99]:
# first we trim crap from df_merged
to_delete = ['extended_fipps_label', 'BERTembeddings', 'BERTlargeembeddings', 'ada2_embeddings', 'all-mpnet-base-v2_embeddings', 'ada2_embeddings_new', 'topic', 'probs', 'representative_docs', 'top_words', 'topic_label', 'is_representative', 'count']

for col in to_delete:
    del df_merged[col]

df_merged.head(2)

Unnamed: 0,control_category,control_code,control_name,control_text,page,document,control_text_corrected,full_control_text,finetune_embeddings
0,organisation of information security (ois),OIS-01,information security management system (isms),basic criterion: the cloud service provider operates an information security management system (isms) in accordance with iso/iec 27001. the scope of the isms covers the cloud service provider’s or...,36.0,c5,Basic criterion: The cloud service provider operates an Information Security Management System (ISMS) in accordance with ISO/IEC 27001. The scope of the ISMS covers the cloud service provider's or...,Organisation of information security (ois). Information security management system (isms). Basic criterion: The cloud service provider operates an Information Security Management System (ISMS) in ...,"[0.26916763186454773, -0.17895491421222687, 0.05843820422887802, 0.10326196253299713, 0.41640037298202515, -0.26914629340171814, 0.020081838592886925, 0.2376757264137268, 0.32093170285224915, 0.48..."
1,organisation of information security (ois),OIS-02,information security policy,basic criterion: the top management of the cloud service provider has adopted an information security policy and communicated it to internal and external employees as well as cloud customers. the ...,36.0,c5,"Basic criterion: The top management of the cloud service provider has adopted an information security policy and communicated it to internal and external employees, as well as cloud customers. The...",Organisation of information security (ois). Information security policy. Basic criterion: The top management of the cloud service provider has adopted an information security policy and communicat...,"[0.30318355560302734, -0.28403574228286743, 0.011182880960404873, 0.11411823332309723, 0.5018870830535889, -0.15271033346652985, 0.13693717122077942, 0.2578427791595459, 0.3121120035648346, 0.2964..."


In [97]:
# next we make a df from the topic modeling
df_temp = topic_model.get_document_info(docs)
df_temp.rename(columns={'Document': 'temp_full_control_text'}, inplace = True)
df_temp.head(2)

Unnamed: 0,temp_full_control_text,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,Organisation of information security (ois). Information security management system (isms). Basic criterion: The cloud service provider operates an Information Security Management System (ISMS) in ...,1,1_the cloud service_cloud service_continuous_provider,"[the cloud service, cloud service, continuous, provider, service provider, cloud service provider, auditing, continuous auditing, of the cloud, basic criterion]",[Product safety and security (pss). Guidelines and recommendations for cloud customers. Basic criterion: The cloud service provider provides cloud customers with guidelines and recommendations for...,the cloud service - cloud service - continuous - provider - service provider - cloud service provider - auditing - continuous auditing - of the cloud - basic criterion,0.164448,False
1,Organisation of information security (ois). Information security policy. Basic criterion: The top management of the cloud service provider has adopted an information security policy and communicat...,1,1_the cloud service_cloud service_continuous_provider,"[the cloud service, cloud service, continuous, provider, service provider, cloud service provider, auditing, continuous auditing, of the cloud, basic criterion]",[Product safety and security (pss). Guidelines and recommendations for cloud customers. Basic criterion: The cloud service provider provides cloud customers with guidelines and recommendations for...,the cloud service - cloud service - continuous - provider - service provider - cloud service provider - auditing - continuous auditing - of the cloud - basic criterion,0.328016,False


In [100]:
# concat the dfs
df_for_inspectionv2 = pd.concat([df, df_temp], axis=1)

In [161]:
df_for_inspectionv2.head(4)

Unnamed: 0.2,Unnamed: 0.1,extended_fipps_label,Unnamed: 1,Unnamed: 0,control_category,control_code,control_name,control_text,page,document,control_text_corrected,full_control_text,BERTembeddings,BERTlargeembeddings,ada2_embeddings,all-mpnet-base-v2_embeddings,ada2_embeddings_new,topic,probs,temp_full_control_text,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,0,,0,0.0,organisation of information security (ois),OIS-01,information security management system (isms),basic criterion: the cloud service provider operates an information security management system (isms) in accordance with iso/iec 27001. the scope of the isms covers the cloud service provider’s or...,36.0,c5,Basic criterion: The cloud service provider operates an Information Security Management System (ISMS) in accordance with ISO/IEC 27001. The scope of the ISMS covers the cloud service provider's or...,Organisation of information security (ois). Information security management system (isms). Basic criterion: The cloud service provider operates an Information Security Management System (ISMS) in ...,[-2.34765545e-01 -2.21934378e-01 -9.93741989e-01 3.67022187e-01\n 8.49737585e-01 -1.64016023e-01 -6.38136506e-01 -1.12917721e-01\n -9.66692090e-01 -9.99156654e-01 -7.52763391e-01 8.85846198e-01...,[-0.99997824 -0.9996931 0.9999947 ... -0.99998593 0.9952955\n -0.9961389 ],"[0.017099885269999504, -0.01439644955098629, 0.00117865193169564, -0.0317063108086586, -0.01916027069091797, 0.03131260722875595, -0.025590771809220314, -0.03850426897406578, 0.0047671012580394745...","[0.025447199121117592, -0.047563113272190094, -0.0010698908008635044, -0.02945668436586857, -0.05244108662009239, 0.0008069048053584993, 0.042670611292123795, 0.04808742180466652, -0.0093944948166...","[0.017112663, -0.0144224055, 0.001129416, -0.031731915, -0.019304238, 0.031469453, -0.025629625, -0.03850349, 0.004901518, -0.020209739, 0.021745155, 0.0050229076, -0.011312205, -0.00075540465, -0...",0,1.0,Organisation of information security (ois). Information security management system (isms). Basic criterion: The cloud service provider operates an Information Security Management System (ISMS) in ...,1,1_the cloud service_cloud service_continuous_provider,"[the cloud service, cloud service, continuous, provider, service provider, cloud service provider, auditing, continuous auditing, of the cloud, basic criterion]",[Product safety and security (pss). Guidelines and recommendations for cloud customers. Basic criterion: The cloud service provider provides cloud customers with guidelines and recommendations for...,the cloud service - cloud service - continuous - provider - service provider - cloud service provider - auditing - continuous auditing - of the cloud - basic criterion,0.164448,False
1,1,,1,1.0,organisation of information security (ois),OIS-02,information security policy,basic criterion: the top management of the cloud service provider has adopted an information security policy and communicated it to internal and external employees as well as cloud customers. the ...,36.0,c5,"Basic criterion: The top management of the cloud service provider has adopted an information security policy and communicated it to internal and external employees, as well as cloud customers. The...",Organisation of information security (ois). Information security policy. Basic criterion: The top management of the cloud service provider has adopted an information security policy and communicat...,[-6.86250806e-01 -4.82106537e-01 -9.97918665e-01 6.63456619e-01\n 8.98568869e-01 -3.34756434e-01 1.12952009e-01 2.46998787e-01\n -9.75181043e-01 -9.99981463e-01 -7.59770513e-01 9.54410553e-01...,[-0.9995211 -0.998107 0.9999736 ... -0.99775356 0.9597814\n -0.98308253],"[0.01658015325665474, -0.006132795009762049, 0.003490208415314555, -0.04595108702778816, -0.0068142167292535305, 0.02606022357940674, -0.009393647313117981, -0.023387720808386803, 0.00281377276405...","[0.04575734958052635, -0.019889475777745247, -0.00013527192641049623, -0.029681429266929626, -0.07318992912769318, -0.021198689937591553, 0.035301703959703445, 0.044299498200416565, 0.001085930271...","[0.016641092, -0.006172235, 0.0033654645, -0.04578628, -0.006897207, 0.026138885, -0.009265004, -0.023478437, 0.0030578503, -0.03195196, 0.020618457, -0.004482852, 0.0057864706, 0.007921479, -0.00...",0,1.0,Organisation of information security (ois). Information security policy. Basic criterion: The top management of the cloud service provider has adopted an information security policy and communicat...,1,1_the cloud service_cloud service_continuous_provider,"[the cloud service, cloud service, continuous, provider, service provider, cloud service provider, auditing, continuous auditing, of the cloud, basic criterion]",[Product safety and security (pss). Guidelines and recommendations for cloud customers. Basic criterion: The cloud service provider provides cloud customers with guidelines and recommendations for...,the cloud service - cloud service - continuous - provider - service provider - cloud service provider - auditing - continuous auditing - of the cloud - basic criterion,0.328016,False
2,2,,2,2.0,organisation of information security (ois),OIS-03,interfaces and dependencies,basic criterion: interfaces and dependencies between cloud service delivery activities performed by the cloud service provider and activities performed by third parties are documented and communic...,37.0,c5,Basic criterion: Interfaces and dependencies between cloud service delivery activities performed by the cloud service provider and activities performed by third parties are documented and communic...,Organisation of information security (ois). Interfaces and dependencies. Basic criterion: Interfaces and dependencies between cloud service delivery activities performed by the cloud service provi...,[-0.31162736 -0.3930869 -0.9984547 0.36828983 0.9144251 -0.33310673\n -0.6419001 0.16103128 -0.9918847 -0.9995398 -0.8787695 0.941377\n 0.09024994 0.94268304 -0.5306556 -0.5521342 -...,[-0.9999912 -0.99972457 0.999996 ... -0.9999959 0.99831015\n -0.9972545 ],"[0.007835658267140388, -0.0041145686991512775, 0.017787961289286613, -0.014626561664044857, 0.002270983997732401, 0.030338583514094353, -0.007794953417032957, -0.0182221457362175, 0.00386355607770...","[0.031252745538949966, -0.0051760380156338215, 0.004831104073673487, -0.03803708404302597, -0.08529587835073471, -0.011238569393754005, 0.0484217032790184, 0.01367255300283432, 0.04232234135270119...","[0.00785735, -0.004132233, 0.017587436, -0.0146154845, 0.0023629724, 0.030316614, -0.0077623557, -0.018510234, 0.003952423, -0.039436024, 0.015958969, 0.0066699265, 0.0059947916, -0.0014749494, -0...",0,1.0,Organisation of information security (ois). Interfaces and dependencies. Basic criterion: Interfaces and dependencies between cloud service delivery activities performed by the cloud service provi...,1,1_the cloud service_cloud service_continuous_provider,"[the cloud service, cloud service, continuous, provider, service provider, cloud service provider, auditing, continuous auditing, of the cloud, basic criterion]",[Product safety and security (pss). Guidelines and recommendations for cloud customers. Basic criterion: The cloud service provider provides cloud customers with guidelines and recommendations for...,the cloud service - cloud service - continuous - provider - service provider - cloud service provider - auditing - continuous auditing - of the cloud - basic criterion,0.24826,False
3,3,risk assessment,3,3.0,organisation of information security (ois),OIS-04,segregation of duties,basic criterion: conflicting tasks and responsibilities are separated based on an ois-06 risk assessment to reduce the risk of unauthorised or unintended changes or misuse of cloud customer data p...,38.0,c5,Basic criterion: Conflicting tasks and responsibilities are separated based on an OIS-06 risk assessment to reduce the risk of unauthorized or unintended changes or misuse of cloud customer data p...,Organisation of information security (ois). Segregation of duties. Basic criterion: Conflicting tasks and responsibilities are separated based on an OIS-06 risk assessment to reduce the risk of un...,[-1.26607642e-01 1.74753163e-02 -8.93225491e-01 -1.01690978e-01\n 5.05746186e-01 -1.56378970e-01 -8.32797289e-01 -1.71892509e-01\n -6.36364758e-01 -9.94106650e-01 -3.80956352e-01 2.64591187e-01...,[-0.9999812 -0.99968517 0.99999386 ... -0.9999936 0.9962414\n -0.99698454],"[0.011924328282475471, -0.02276644855737686, 0.007795905228704214, -0.0424332395195961, -0.020201215520501137, 0.03396262601017952, -0.009666387923061848, -0.015765497460961342, -0.002267960458993...","[0.04159489646553993, 0.02648993209004402, -0.00977355893701315, -0.015823548659682274, -0.060550931841135025, 0.00342398788779974, 0.03466673940420151, -0.0167149156332016, 0.002295789774507284, ...","[0.011999014, -0.02294126, 0.007658234, -0.042351037, -0.020346155, 0.03411091, -0.009684823, -0.015717773, -0.0022590118, -0.024278942, 0.023850884, -0.013289878, 0.008581235, -0.00031477347, -0....",0,1.0,Organisation of information security (ois). Segregation of duties. Basic criterion: Conflicting tasks and responsibilities are separated based on an OIS-06 risk assessment to reduce the risk of un...,1,1_the cloud service_cloud service_continuous_provider,"[the cloud service, cloud service, continuous, provider, service provider, cloud service provider, auditing, continuous auditing, of the cloud, basic criterion]",[Product safety and security (pss). Guidelines and recommendations for cloud customers. Basic criterion: The cloud service provider provides cloud customers with guidelines and recommendations for...,the cloud service - cloud service - continuous - provider - service provider - cloud service provider - auditing - continuous auditing - of the cloud - basic criterion,0.186898,False


In [109]:
# for i in range(1, max(topic_model.get_topics().keys())+1):
#     print(f"{i} {topic_model.get_topic(i, full=True)['OpenAI'][0][0]}: {topic_model.get_topic_freq(i)}")

KeyError: 'OpenAI'

In [105]:
# Step 1: Add a helper column for counting if needed
df_merged['count'] = 1

# Step 2: Create the pivot table using 'docs' as index, 'topic' as columns, and the new 'count' column for values
pivot_table = df_merged.pivot_table(index='document', columns='topic_label', values='count', aggfunc='count', fill_value=0)

pivot_table.loc['Total'] = pivot_table.sum()

KeyError: 'topic_label'

In [None]:
# Now, pivot_table should have documents as rows and topics as columns with counts
pivot_table


In [None]:
# # Block 2: Use Pre-generated Embeddings for Topic Modeling with BERTopic, min-max

# # Load pre-generated embeddings
# pre_generated_embeddings = list(df_merged['finetune_embeddings'].values)
# pre_generated_embeddings = np.array(pre_generated_embeddings)

# # Create a representation model
# representation_model = KeyBERTInspired(random_state=42)

# # Instantiate BERTopic with your fine-tuned model's embeddings and the representation model
# topic_model = BERTopic(embedding_model='all-MiniLM-L6-v2',  # needed, but doesn't regenerate embeddings. see https://github.com/MaartenGr/BERTopic/issues/1601
#                        verbose=True,
#                        n_gram_range=(1, 3),
#                        min_topic_size=2,
#                        calculate_probabilities=True,
#                        representation_model=representation_model)

# # Fit BERTopic to your documents using pre-generated embeddings
# topics, probs = topic_model.fit_transform(docs, embeddings=pre_generated_embeddings)


In [None]:


# # Optional: Print top terms for each topic
# space = " "*10
# for k, v in topic_model.get_topics().items():
    
#     if k != -1:  # Exclude outlier topic if present
#         print(f'Cluster {k: 3}: {",     ".join([term for term, _ in v[:3]])}')