In [None]:
import pandas as pd
from tqdm.notebook import tqdm
import pickle
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# CSV with the sentences/paragraphs to model; it is loaded with pd.read_csv and its
# 'text' column is used as the corpus.
SENTENCES_FILE = './data/<YOUR_FILE_HERE>.csv'
# Table of pre-computed embeddings used for the stand-alone UMAP plot.
# NOTE(review): the placeholder ends in .npy, but this path is passed to pd.read_csv and the
# columns 'doc', 'page', 'text' and 'bias' are dropped from the result, so a CSV file seems
# to be expected -- confirm the intended format.
EMBEDDINGS_FILE = './data/<YOUR_EMBEDDINGS_HERE>.npy'

# Pickled DataFrame with the LLM explanations (loaded with pd.read_pickle).
EXPLANATIONS_FILE = './data/<YOUR_EXPLANATIONS_FILE_HERE>.pickle'

In [None]:
# Load the sentences table and show its dimensions plus the first rows as a sanity check.
# Its 'text' column is what the topic model is fitted on.
df_texts = pd.read_csv(SENTENCES_FILE)
print(df_texts.shape)
df_texts.head()

In [None]:
from collections import Counter

def filter_matching_explanations(df, target_configuration):
    """Keep the LLM responses whose predicted label agrees with the gold label.

    Args:
        df: DataFrame with a 'label' column and a `target_configuration` column.
            Each cell of the latter is unpacked as a (predicted_label, explanation) pair.
        target_configuration: name of the column holding the LLM responses.

    Returns:
        dict with keys 0 and 1, each mapping to the list of explanations of the
        agreeing responses predicted as that class (in row order).
    """
    gold = df['label'].tolist()
    print("  Original labels:", len(gold), Counter(gold))
    responses = df[target_configuration].tolist()

    # A response agrees when its first element equals the gold label of the same row.
    agreeing = []
    for truth, response in zip(gold, responses):
        if truth == response[0]:
            agreeing.append(response)

    agreeing_labels = [response[0] for response in agreeing]
    print("  Matching labels:", len(agreeing_labels), Counter(agreeing_labels))

    # Group the explanations by predicted class; any other label value is ignored.
    by_class = {0: [], 1: []}
    for predicted, explanation in agreeing:
        if predicted == 1:
            by_class[1].append(explanation)
        elif predicted == 0:
            by_class[0].append(explanation)

    return by_class

In [None]:
# Load the LLM explanations that can be fed to the topic model instead of the sentences.
# The frame is expected to carry a 'label' column plus one column per LLM configuration
# (both are selected in the following cell).
# NOTE: pd.read_pickle executes arbitrary code embedded in the file -- only load pickles
# from a trusted source.
llm_explanations_df = pd.read_pickle(EXPLANATIONS_FILE)
print(llm_explanations_df.columns)
llm_explanations_df.head()

In [None]:
# Column of llm_explanations_df holding the responses of the LLM configuration to analyse.
target_configuration = 'responses_bert_FS_expli_gpt-3.5-turbo_dynamic_all_cosine_False_4'

print("llm:", target_configuration)
# Rows without a response for this configuration are discarded.
df = llm_explanations_df[['label', target_configuration]].dropna()
corpus_dict = filter_matching_explanations(df, target_configuration)

# Removing spurious strings (if any).
# Each explanation is paired with its class *before* filtering so that `labels` stays
# aligned, element by element, with `explanations_corpus` (negatives first, then positives).
labelled_explanations = [(s, cls) for cls in (0, 1) for s in corpus_dict[cls]]
print("Before:", len(labelled_explanations))
labelled_explanations = [
    (s, cls) for s, cls in labelled_explanations if (len(s) > 0) and not(s.isspace())
]
explanations_corpus = [s for s, _ in labelled_explanations]
print("After:", len(explanations_corpus))

labels = [cls for _, cls in labelled_explanations]
assert len(labels) == len(explanations_corpus)

corpus_dict[1] # Positive class

----
## **BERTopic**

In [None]:
# Choose the documents to model: the sentences (default) or the LLM explanations.
# column = 'text'
corpus = df_texts['text'].tolist() # For sentences
# corpus = explanations_corpus # For explanations
print("Before:", len(corpus))

# Removing spurious entries (if any): empty / whitespace-only strings and non-string values.
# Missing cells come out of read_csv as float NaN, on which len() would raise a TypeError.
corpus = [s for s in corpus if isinstance(s, str) and (len(s) > 0) and not(s.isspace())]
print("After:", len(corpus))

In [None]:
from umap import UMAP
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer

In [None]:
import tiktoken
import openai
from bertopic.representation import OpenAI
import os
import sys

In [None]:
# Configuring OpenAI (GPT)
# Prefer the OPENAI_API_KEY environment variable so the secret never has to be written
# into the notebook; the placeholder is only a fallback for local experiments.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "YOUR_API_KEY_HERE")
# Sets the TOKENIZERS_PARALLELISM environment variable for the rest of the session.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Tokenizer
# tiktoken encoding for the same chat model that is requested below; it is handed to the
# OpenAI representation through its `tokenizer` argument.
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Spanish prompt sent to the chat model. It contains the [DOCUMENTS] and [KEYWORDS]
# placeholders (presumably substituted by BERTopic for each topic -- see the BERTopic
# OpenAI representation docs).
prompt = """
Usted es un asistente de un abogado que debe realizar un analisis de parrafos de una sentencia judicial.
En este contexto, hay un topico común que afecta a los siguientes parrafos, denominados DOCUMENTS:
[DOCUMENTS]

El topico se describe con las siguientes palabras clave, denominadas KEYWORDS: 
[KEYWORDS]

Brinde una descripcion breve que resuma e identifique al topico en cuestion.
"""

# Create your representation model
# The OpenAI client is built from OPENAI_API_KEY, defined in the configuration cell.
client = openai.OpenAI(api_key=OPENAI_API_KEY)
# LLM-based representation. The keyword arguments are BERTopic options; their exact
# semantics (delay, number/length of documents per prompt) are documented by BERTopic.
representation_model2 = OpenAI(
    client,
    model="gpt-3.5-turbo", 
    delay_in_seconds=2, 
    chat=True,
    nr_docs=4,
    doc_length=100,
    tokenizer=tokenizer,
    prompt=prompt
)

# Keyword-based representation.
representation_model1 = KeyBERTInspired()

# Both representations are registered under the names "Main" and "OpenAI"; these are the
# names later selected through `representation_col`.
representation_model = {
    "Main": representation_model1, # KeyBERT
    "OpenAI": representation_model2,
}

In [None]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

# Pre-calculate embeddings
# One embedding per document of `corpus`, computed with the sentence-transformers model
# named below. The commented-out line is an alternative model.
# predefined_embedding_model = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
predefined_embedding_model = 'espejelomar/sentece-embeddings-BETO' # Or any other embedding model
embedding_model = SentenceTransformer(predefined_embedding_model) 
embeddings = embedding_model.encode(corpus, show_progress_bar=True)

# Pre-reduce embeddings for visualization purposes
# 2-D projection with a fixed random_state; `reduced_embeddings` is what the datamapplot
# figure at the end of the notebook draws. This UMAP instance is not passed to BERTopic.
umap_model = UMAP(n_neighbors=15, n_components=2, metric='cosine', random_state=42)
reduced_embeddings = umap_model.fit_transform(embeddings)

In [None]:
import umap.plot

# UMAP settings for the stand-alone 2-D plot of the embeddings stored on disk.
umap_args1 = {'n_neighbors': 15,
             'n_components': 2,
             'metric': 'cosine'}

# NOTE(review): this rebinds the name `df`, which an earlier cell uses for the filtered
# explanations frame; re-run that cell if the explanations frame is needed afterwards.
# NOTE(review): EMBEDDINGS_FILE is read as CSV here although its placeholder ends in .npy.
df = pd.read_csv(EMBEDDINGS_FILE)
# Every column except the four dropped ones is treated as an embedding dimension.
embeddings1 = df.drop(['doc','page','text','bias'], axis=1).values
umap_model2D = umap.UMAP(**umap_args1, random_state=42).fit(embeddings1)
# Scatter plot of the projection, without labels.
umap.plot.points(umap_model2D)
plt.show()

In [None]:
# Configure BERTopic model
# NOTE(review): vectorizer_model and ctfidf_model are created here but are not passed to
# BERTopic(...) below, so as written they have no effect on the fitted model.
# NOTE(review): scikit-learn documents 'english' as the only built-in string value for
# stop_words; "spanish" will presumably be rejected once this vectorizer is fitted --
# confirm, and supply an explicit Spanish stop-word list before wiring it in.
vectorizer_model = CountVectorizer(stop_words="spanish", ngram_range=(1, 3))
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# The topic model uses the sentence-transformers model loaded earlier and the
# {"Main", "OpenAI"} representation dictionary; per-topic probabilities are requested.
model = BERTopic(language="spanish", 
                 embedding_model=embedding_model, 
                 representation_model=representation_model,
                 calculate_probabilities=True,
                 verbose=True
                 )

In [None]:
# Fit the topic model. The embeddings were already computed for this exact `corpus` with
# the same embedding model, so they are handed over instead of encoding every document a
# second time inside fit_transform.
assert len(embeddings) == len(corpus), "embeddings are stale: re-run the embedding cell"
topics, probs = model.fit_transform(corpus, embeddings=embeddings)
n = len(model.get_topics())
print("Topics:", n)

In [None]:
# Summary table of the fitted topics, one row per topic. Uncomment the to_csv line to
# persist it. Its 'Name' column is used later as the list of topic labels.
topics_df = model.get_topic_info()
# topics_df.to_csv('topics_dataframe.csv', index=False)
topics_df

### **Inspection of topics**

In [None]:
# Same topic summary as in the previous cell, recomputed here so that this section can
# be run on its own.
topics_df = model.get_topic_info()
topics_df

In [None]:
# Words and weights of topic -1, the id this notebook treats as the outlier/noise topic.
model.get_topic(-1) # Outliers

In [None]:
# Words and weights of one example topic. The id 3 is hard-coded, so this assumes the
# fitted model has a topic with that id.
model.get_topic(3)

In [None]:
# BERTopic's built-in hierarchy figure for the fitted topics.
model.visualize_hierarchy()

In [None]:
# BERTopic's built-in bar-chart figure, limited to 9 topics via top_n_topics.
model.visualize_barchart(top_n_topics=9)

In [None]:
# BERTopic's built-in heatmap figure, drawn at 800x800.
model.visualize_heatmap(width=800, height=800)

In [None]:
# BERTopic's built-in topic overview figure.
model.visualize_topics()

### **Wordclouds and metrics**

In [None]:
from wordcloud import WordCloud
from wordcloud import get_single_color_func

def create_wordcloud(model, topic):
    """Draw one topic as a word cloud, sizing each word by its weight in the topic.

    Args:
        model: topic model whose get_topic(topic) yields (word, weight) pairs.
        topic: id of the topic to draw.
    """
    weight_by_word = dict(model.get_topic(topic))
    cloud = WordCloud(background_color="white", max_words=1000)
    cloud.generate_from_frequencies(weight_by_word)

    # Show the rendered cloud without axes.
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

def show_topics_as_wordcloud(model, topic_labels, size=(5,4), title=None, noise=False, stopwords=()):
  """Draw one word cloud per topic on a grid of subplots.

  The topics are taken from the ids the model actually reports, so the function works
  whether or not the model has an outlier topic (-1).

  Args:
      model: topic model exposing get_topics() (dict keyed by topic id) and
          get_topic(id) (iterable of (word, weight) pairs).
      topic_labels: currently unused; each subplot is titled with its topic id.
      size: (rows, columns) of the subplot grid; it must have room for every topic drawn.
      title: optional figure title. When given, tight_layout() is applied as well.
      noise: when True the outlier topic (-1) is drawn too, if the model has one.
      stopwords: words to leave out of every cloud.
  """
  all_topics = model.get_topics()
  # One colour per topic id, shifted by one so that topic -1 maps to the first colour.
  custom_palette = sns.color_palette("tab10", len(all_topics)+1).as_hex()

  # Only ids that exist in the model are requested; -1 is skipped unless noise is set.
  topic_ids = sorted(t for t in all_topics if noise or t != -1)

  # Create subplots for each topic
  for position, topic_id in enumerate(topic_ids, start=1):
      text = {word: value for word, value in model.get_topic(topic_id) if word not in stopwords}
      if not text:
          # WordCloud cannot render an empty vocabulary; leave this grid slot blank.
          continue
      color = custom_palette[(topic_id + 1) % len(custom_palette)]
      wc = WordCloud(background_color="white", max_font_size=150, random_state=42, color_func=get_single_color_func(color))
      wc.generate_from_frequencies(text)

      plt.subplot(size[0], size[1], position)
      plt.imshow(wc, interpolation="bilinear")
      plt.axis("off")
      plt.title(topic_id)

  if title is not None:
    plt.suptitle(title, fontsize=20)
    plt.tight_layout()
  plt.show()


# Test: Show wordcloud for a given topic
# The id 1 is hard-coded, so this assumes the fitted model has a topic with that id.
create_wordcloud(model, topic=1)

In [None]:
# set the figure size
# (this changes matplotlib's global default, so it also affects the figures drawn later)
plt.rcParams['figure.figsize'] = [6, 6]

# Topic names from the summary table; the grid below has 6 x 4 = 24 slots, so at most
# 24 topics (including the outlier topic, requested via noise=True) can be drawn.
topic_labels = topics_df['Name'].tolist()
show_topics_as_wordcloud(model, topic_labels, size=(6,4), noise=True) 

### **Inspection of documents**

In [None]:
# Representative documents reported by the model for its topics.
model.get_representative_docs()

In [None]:
# Per-document information for the documents the model was fitted on.
# NOTE: this displays the whole result; append .head() to keep the output short.
model.get_document_info(corpus)

### **Reducing the topics**

In [None]:
def reduce_topics(topic_model, corpus, apply_outlier_reduction=False):
  """Merge similar topics automatically and print the resulting topic labels.

  The previous version called reduce_outliers() and discarded its result, so the outlier
  reassignment never reached the model. That step is now explicit and opt-in.

  Args:
      topic_model: fitted BERTopic model; it is reduced via its own reduce_topics().
      corpus: the documents the model was fitted on.
      apply_outlier_reduction: when True, documents in the outlier topic are reassigned
          with reduce_outliers() after the topics were merged, and the new assignment is
          written back with update_topics(). The model's current vectorizer, c-TF-IDF and
          representation models are passed along explicitly so they are kept; with an
          LLM-based representation this presumably triggers new API calls -- check the
          BERTopic docs before enabling. Default False keeps the previous results.

  Returns:
      The object returned by topic_model.reduce_topics().
  """
  new_model = topic_model.reduce_topics(corpus, nr_topics='auto')

  if apply_outlier_reduction:
    # Reassigning topics is done last, after the topic reduction.
    new_topics = new_model.reduce_outliers(corpus, new_model.topics_)
    new_model.update_topics(
        corpus,
        topics=new_topics,
        vectorizer_model=new_model.vectorizer_model,
        ctfidf_model=new_model.ctfidf_model,
        representation_model=new_model.representation_model,
    )

  new_topic_labels = new_model.generate_topic_labels()
  print("New topic labels:", new_topic_labels)

  return new_model

In [None]:
# Reduce the topics of the fitted model and report how many remain.
# NOTE(review): the count is read from `model`, not from `new_model`. That is only the
# reduced count if reduce_topics() modifies the model in place (presumably the case, with
# `new_model` being the same object) -- confirm against the BERTopic docs.
new_model = reduce_topics(model, corpus)
n = len(model.get_topics())
print("Topics:", n)

### **Cluster visualization with UMAP**

In [None]:
import umap.plot
import matplotlib

# UMAP settings for the 2-D cluster plot.
umap_args2 = {'n_neighbors': 15,
             'n_components': 2,
             'metric': 'cosine'}

def show_clusters_with_umap(topic_model, topics, embeddings):
  """Project the embeddings to 2-D with UMAP and plot them coloured by topic.

  Args:
      topic_model: fitted topic model; its topics_ attribute supplies one label per point.
      topics: kept for backward compatibility and not used. The labels always come from
          topic_model.topics_, so the plot reflects the model's current assignment.
      embeddings: one embedding per document, in the same order as topic_model.topics_.
  """
  # The palette / colour dictionary that used to be built here was never used (and
  # allocated one colour per document), so it has been removed.
  umap_model2D = umap.UMAP(**umap_args2, random_state=42).fit(embeddings)
  umap.plot.points(umap_model2D, labels=np.array(topic_model.topics_), color_key_cmap='tab10')

  plt.show()

In [None]:
# Plot the documents in 2-D, coloured by topic.
# NOTE: show_clusters_with_umap reads the labels from model.topics_; the `topics`
# argument passed here is not used by the function.
show_clusters_with_umap(model, topics, embeddings)

In [None]:
import datamapplot
from matplotlib.colors import rgb2hex
import re

def get_palette_mapping(single_labels, palette='tab10'):
    """Assign one colour of a seaborn palette to every distinct label.

    Args:
        single_labels: non-empty sequence of label strings; its first element is
            treated as the noise label.
        palette: name of the seaborn palette to draw the colours from.

    Returns:
        dict mapping each distinct label to a hex colour, plus an "Unlabelled" entry
        that reuses the colour of single_labels[0].
    """
    # De-duplicate while keeping first-seen order. Zipping the raw list against a palette
    # sized by the number of distinct labels would leave trailing labels without a colour
    # whenever a label is repeated.
    distinct_labels = list(dict.fromkeys(single_labels))
    colors = sns.color_palette(palette, len(distinct_labels))
    custom_color_map = dict(zip(distinct_labels, map(rgb2hex, colors)))

    print(single_labels[0], custom_color_map[single_labels[0]])
    custom_color_map["Unlabelled"] = custom_color_map[single_labels[0]] # Noise
    return custom_color_map

def get_all_labels(representation, model, topics, n_labels=1, stopwords=()):
    """Build a text label per topic and the corresponding label for every document.

    Args:
        representation: name of the topic representation to read from
            model.get_topics(full=True), e.g. "Main" or "OpenAI".
        model: fitted topic model. model._outliers is used as the offset between a topic
            id and its position in the label list (1 when topic -1 exists, else 0).
        topics: topic id of every document; -1 marks a document as unlabelled.
        n_labels: number of leading words of each topic to join into its label.
        stopwords: words to leave out of the labels.

    Returns:
        (all_labels, single_labels): the label of every document in `topics`, and the
        list of per-topic labels in the order the model reports the topics.
    """
    # Create a label for each document
    initial_labels = [label[0:n_labels] for label in model.get_topics(full=True)[representation].values()]
    # Keep only the first line of each word, without double quotes or non-word characters.
    initial_labels = [[re.sub(r'\W+', ' ', x.split("\n")[0].replace('"', '')) for x,_ in lb] for lb in initial_labels]
    # NOTE(review): topics with an empty word list are dropped here, which would shift the
    # positions of the following labels -- confirm this cannot happen for a fitted model.
    llm_labels = [','.join([w for w in x if (w not in stopwords)]) for x in initial_labels if (len(x) > 0)]

    # Append the topic id to every label except the outlier topic's. The id is derived
    # from the position through model._outliers, so ids are also right when the model
    # has no outlier topic (previously the first topic then lost its id and every other
    # id was off by one).
    offset = model._outliers
    llm_labels = [x if (idx - offset) == -1 else x+" ("+str(idx-offset)+")" for idx, x in enumerate(llm_labels)]
    print(initial_labels)
    single_labels = llm_labels #["Unlabelled"] + llm_labels
    print(single_labels)

    llm_labels = [label if label else "Unlabelled" for label in llm_labels]
    all_labels = [llm_labels[topic+offset] if topic != -1 else "Unlabelled" for topic in topics]
    return all_labels, single_labels


# Representation used for the labels: one of the names registered in the
# representation_model dictionary ("Main" or "OpenAI").
representation_col = "Main" #"Main" #"OpenAI"
# The document-topic assignment is read from model.topics_ rather than from the `topics`
# returned by fit_transform: once the model has been reduced, the old assignment can
# refer to topic ids that no longer exist in the model.
all_labels, single_labels = get_all_labels(representation_col, model, model.topics_, n_labels=5)
custom_palette = get_palette_mapping(single_labels)

# Run the visualization
# reduced_embeddings holds one 2-D point per document, in the same order as all_labels.
datamapplot.create_plot(
    reduced_embeddings,
    all_labels,
    label_font_size=11,
    title="Distribucion de Topicos en Sentencias (parrafos)",
    sub_title="BETO Embeddings (BERTopic + UMAP)", 
    label_wrap_width=20,
    use_medoids=True,
    label_color_map=custom_palette,
    noise_color=custom_palette["Unlabelled"],
    logo_width=0.16
)
plt.show()

In [None]:
# Inspect the label -> colour mapping used in the figure above, and display the
# matplotlib "tab10" colormap for visual comparison.
print(custom_palette)
plt.colormaps["tab10"]

---