# Finetunes an LLM to improve topic modeling results.

2024-06-17

Zachary Kilhoffer

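Requirements:
- '../data/df_embeddings_publicdomain.xlsx' (control texts used for the domain-adaptation step)
- 'data\labels\df_labels_consensus.xlsx' (sheets 'Data' and 'Label Choices', used for the labeled fine-tuning step)
- 'data\df_2024-02-28.xlsx' (only referenced by commented-out cells)
- 'data\df_ada_2024-02-28.xlsx' (only referenced by commented-out cells)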

Outputs:
- fine_tuned_model
- domain_adapted_model

# https://maartengr.github.io/BERTopic/getting_started/parameter%20tuning/parametertuning.html#umap


In [44]:
# import sys
# print(os.path.dirname(sys.executable))

# next step: try to install an earlier version of python, like 3.8.5

I'm in Python 3.8.5. It sounds too simple to be real, but I had this same issue and all I did was reinstall numpy. Gone.

    > pip install --upgrade numpy

In [45]:
import accelerate
import joblib  # save the label encoder
import pandas as pd
import transformers
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from transformers import (
    AutoModel,
    AutoModelForMaskedLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# currently unused imports, kept for reference
# import ast
# import os
# import numpy as np
# from openai import OpenAI
# from umap import UMAP
# # from hdbscan import (HDBSCAN)
# from sentence_transformers import SentenceTransformer, models


In [46]:
# display tweaks
pd.set_option("display.max_colwidth", 200)  # how much text is showing within a cell
# None is the value pandas documents as "unlimited". False compares equal to 0,
# which pandas documents as terminal auto-detection rather than "no limit".
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
# warnings.filterwarnings("ignore")

# getting data

In [47]:
# # read api key
# def read_key_from_file(filename=r"key.txt"):  # replace with yours
#     with open(filename, 'r') as file:
#         return file.read().strip()

In [48]:
# get data
# df = pd.read_excel(r'data\df_2024-02-28.xlsx')
# df_ada = pd.read_excel(r'data\df_ada_2024-02-28.xlsx')

In [49]:
# load data
# the documents we want to embed must be in their own rows of this workbook
data = "../data/df_embeddings_publicdomain.xlsx"
df = pd.read_excel(io=data)

In [50]:
# inspect df: list the column labels read from the workbook
df.columns

Index(['extended_fipps_label', 'Unnamed: 1', 'Unnamed: 0', 'control_category',
       'control_code', 'control_name', 'document', 'control_text_corrected',
       'full_control_text', 'BERTembeddings', 'BERTlargeembeddings',
       'ada2_embeddings', 'all-mpnet-base-v2_embeddings'],
      dtype='object')

In [51]:
# # Merge df with df_ada on df['topic'] and df_ada['topic_num']
# df_merged = pd.merge(df, df_ada, left_on='topic', right_on='topic_num', how='left')

# # drop the 'topic_num' column if it's redundant
# df_merged.drop('topic_num', axis=1, inplace=True)

In [52]:
# # getting a way to say if ALL rows in the df are representative
# df = df[['topic_num', 'representative_docs']]
# df

In [53]:
# # read in strings as lists as they should be
# df_temp['representative_docs'] = df_temp['representative_docs'].apply(ast.literal_eval)
# df_temp

In [54]:
# # explode to get items in list out
# df_temp = df_temp.explode(['representative_docs'])
# df_temp.head(5)

In [55]:
# # Create a set for faster lookup times of representative_docs
# representative_docs_set = set(df_temp['representative_docs'])

# # Use apply to check each row in df for a match in representative_docs_set
# df_merged['is_representative'] = df_merged['full_control_text'].apply(lambda x: x in representative_docs_set)

In [56]:
# df_merged['is_representative'].value_counts()

In [57]:
# # to visually/manually inspect representative docs. Do they pass the smell test?
# df_merged[df_merged['is_representative'] == True].to_excel(r'data\finalized\df to inspect 2024-02-28.xlsx')

### TBD: in df_merged.topic_label, add 'NA' if df_merged.topic == -1
That will make the pivot table more informative
Rather, just change the label "Cloud data privacy and security" into "Unlabeled"

In [58]:
# # Pivot table: Document summary

# # Step 1: Add a helper column for counting if needed
# df_merged['count'] = 1

# # Step 2: Create the pivot table using 'docs' as index, 'topic' as columns, and the new 'count' column for values
# pivot_table = df_merged.pivot_table(index='document', columns='topic_label', values='count', aggfunc='count', fill_value=0)

# pivot_table.loc['Total'] = pivot_table.sum()

# # Now, pivot_table should have documents as rows and topics as columns with counts
# pivot_table

In [59]:
# # normalized pivot table

# # Normalize the counts by row to get the percentage and then round to two decimals
# pivot_table_percentage_rounded = (pivot_table.div(pivot_table.sum(axis=1), axis=0) * 100).round(1)

# # Display the pivot table with percentages rounded to two decimals
# pivot_table_percentage_rounded

In [60]:
# # Sum the values for each column and add as a new row called "Total"
# pivot_table_percentage_rounded.loc['Total'] = pivot_table_percentage_rounded.sum()

# # Display the updated pivot table with the "Total" row
# pivot_table_percentage_rounded

# Training domain-customized finetuning model

In [61]:
# 1. Prepare dataset
# every value of the 'full_control_text' column becomes one training document
texts = df['full_control_text'].tolist()

# Finetuning with domain-specific lexicon

    > The lexicon is just all of the control texts.

In [62]:
# 2. Tokenize Your Dataset
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the "text" field of a batch, truncating and padding every sequence to 512 tokens."""
    encoded = tokenizer(
        examples["text"],
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    return encoded


# Wrap the raw texts in a Dataset with a single "text" column, then tokenize it in batches
dataset = Dataset.from_dict({"text": texts})
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# 3. Create a Data Collator for MLM
data_collator = DataCollatorForLanguageModeling(
    mlm_probability=0.15,
    mlm=True,
    tokenizer=tokenizer,
)

Map: 100%|██████████| 670/670 [00:00<00:00, 2945.24 examples/s]


In [63]:
# Diagnostic: print the accelerate version visible to this kernel.
# accelerate is already imported in the imports cell, so this re-import is redundant.
# NOTE(review): the Trainer cell below reports that accelerate>=0.21.0 is required even
# though 0.31.0 is printed here - presumably the kernel must be restarted after
# installing/upgrading accelerate; confirm.
import accelerate
print(accelerate.__version__)


0.31.0


In [64]:
# initialize model settings
# start from the stock bert-base-cased checkpoint, loaded for masked language modeling
model = AutoModelForMaskedLM.from_pretrained("bert-base-cased")

# run configuration for the domain-adaptation (MLM) pass
training_args = TrainingArguments(
    output_dir="./outputs/test_pretrained_model",
    overwrite_output_dir=True,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    save_total_limit=2,
    save_steps=10_000,
)

# the tokenized dataset is used directly; the MLM collator built earlier prepares each batch
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# train the model
trainer.train()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [None]:
# 5. Save the further pretrained Model
# Raw strings keep the backslash literal: "\d" in a normal string is an invalid escape
# sequence that newer Python versions warn about. The path value itself is unchanged.
model.save_pretrained(r"outputs\domain_adapted_model")
tokenizer.save_pretrained(r"outputs\domain_adapted_model")

# Finetune with labeled data

In [None]:
# path of the domain-adapted model saved by the MLM section
# (raw string: "\d" would otherwise be an invalid escape sequence; the value is unchanged)
domain_adapted_model = r'outputs\domain_adapted_model'

## Prepping training data

In [None]:
# importing training data not yet in proper format
# Both sheets live in the same workbook, so open and parse it once: passing a list of
# sheet names makes read_excel return a dict of DataFrames keyed by sheet name.
# NOTE(review): this path is relative to the working directory, whereas the embeddings
# workbook is loaded from '../data/...' - confirm both resolve from the same place.
label_sheets = pd.read_excel(
    r'data\labels\df_labels_consensus.xlsx',
    sheet_name=['Data', 'Label Choices'],
)
df_training = label_sheets['Data']
temp_labels = label_sheets['Label Choices']

In [None]:
# inspecting the first rows of the 'Data' sheet
df_training.head()

In [None]:
# spot-check one raw value of the '66_label' column (row label 2)
df_training['66_label'][2]

In [None]:
# inspecting the first rows of the 'Label Choices' sheet
temp_labels.head()

In [None]:
# must strip these strings
# (shows the 'Abbreviation' value at row label 22 as a string, before cleaning)
str(temp_labels['Abbreviation'][22])

In [None]:
# cast every abbreviation to str and trim surrounding whitespace
temp_labels['Abbreviation'] = temp_labels['Abbreviation'].map(lambda abbr: str(abbr).strip())

In [None]:
# confirming it worked (same 'Abbreviation' cell, row label 22, after the strip)
str(temp_labels['Abbreviation'][22])

In [None]:
# removing all but necessary columns for simplicity
keep_columns = ['Category', 'Definition', 'Abbreviation']
temp_labels = temp_labels[keep_columns]
temp_labels.head()

In [None]:
# only taking the majority opinion (66_label) because I'm not sure about weighting

# Left-merge the label definitions onto the training rows:
# df_training['66_label'] is matched against temp_labels['Abbreviation'].
# NOTE(review): 'Abbreviation' was whitespace-stripped above but '66_label' is used as-is;
# rows whose key does not match exactly end up with NaN Category/Definition - confirm.
df_training = pd.merge(df_training, temp_labels, left_on='66_label', right_on='Abbreviation', how='left')

# drop unneeded columns in a single call; like `del`, this raises KeyError if a column
# is missing, but it cannot leave the frame half-modified
to_drop = ['33_label', '66_label', 'page', 'document', 'Abbreviation', 'indexmaybe', 'control_category', 'control_text']
df_training = df_training.drop(columns=to_drop)

df_training.head()

In [None]:
# concatenate the category and definition as we need one column of labels
df_training['label'] = df_training['Category'] + ': ' + df_training['Definition']
# the two source columns are no longer needed once the combined label exists
df_training = df_training.drop(columns=['Category', 'Definition'])

# check results
df_training.head()

## Tokenizing training data

In [None]:
# df_training is DataFrame with labeled data
# The labels come from a left merge, so rows without a matching definition can hold NaN
# in 'label'. NaN is neither a usable class nor valid tokenizer input, so only rows with
# both text and label are used. df_training itself is left untouched.
labeled_rows = df_training.dropna(subset=['control_text_corrected', 'label'])
texts = labeled_rows['control_text_corrected'].astype(str).tolist()
labels = labeled_rows['label'].tolist()

# Convert text labels to integers
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Create a Dataset object from your texts and encoded labels
labeled_dataset = Dataset.from_dict({"text": texts, "label": encoded_labels})

# Tokenize the labeled dataset
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    """Tokenize the "text" field of a batch, truncating and padding every sequence to 512 tokens.

    Same body as the definition in the MLM section; it is declared again here so this
    section also runs when the MLM cells were skipped, and it uses the tokenizer bound
    in this cell.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized_labeled_dataset = labeled_dataset.map(tokenize_function, batched=True)

# Ensure model is suited for sequence classification:
# load the domain-adapted weights with one output per distinct label
model = AutoModelForSequenceClassification.from_pretrained(domain_adapted_model, num_labels=len(label_encoder.classes_))

In [None]:
# Define training arguments
# run configuration for the supervised fine-tuning pass
training_args = TrainingArguments(
    output_dir="outputs/fine_tuned_model",
    overwrite_output_dir=True,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    save_total_limit=2,
    save_steps=10_000,
)

# Initialize the Trainer
# no data collator is passed here; the tokenized, labeled dataset is used as-is
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_labeled_dataset,
)

In [None]:
# Train the model
# (runs the Trainer configured in the previous cell on the labeled dataset)
trainer.train()

In [None]:
# Optionally, save the model and the tokenizer
# both artifacts go to the same directory
fine_tuned_dir = "outputs/fine_tuned_model"
model.save_pretrained(fine_tuned_dir)
tokenizer.save_pretrained(fine_tuned_dir)

In [None]:
# Also, save the label encoder for later use in inference to decode the predicted labels
# (written into the same directory as the fine-tuned model and tokenizer)
joblib.dump(label_encoder, "outputs/fine_tuned_model/label_encoder.joblib")