# Parameter settings - EU Daten und KI

Rodar a seguinte rotina: um loop `for` que calcula as matrizes de similaridade para ```umap::n_neighbors [5, 7, 10, 20]```, ```umap::min_dist [0.0, 0.1, 0.25]``` e ```hdbscan::min_cluster_size``` com ```min_samples {10: 2.5, 15: 3.75, 20: 5.0, 25: 6.25, 30: 7.5}```. Esses parâmetros baseiam-se nas documentações do BERTopic, HDBSCAN e UMAP. Decidimos permanecer com os valores default para outros parâmetros importantes, como ```umap::n_components (5)```, ```CountVectorizer::ngram_range (1,1)```, ```BERTopic::min_topic_size (10)``` e ```BERTopic::nr_topics``` automático. Vamos olhar a similaridade média (AVG similarity) e o número de tópicos para cada uma dessas combinações; a partir disso, vamos decidir quais visualizações fazer.

In [1]:
# Importando tudo que já está fixo:

# Data processing
import json
import numpy as np
import pandas as pd
import re


# Topic model
import umap
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN

# Dimension reduction
from umap import UMAP


# Countvectorizer (para stopwords)
from sklearn.feature_extraction.text import CountVectorizer


# Customised German stop-word list:
# read the JSON file and parse its content into a Python object.
stopwords_file_path = "C:\\Users\\opc\\Documents\\Python_Scripts\\champions_2.0\\01_gesetze_sammlung\\german_stopwords.json"
with open(stopwords_file_path, "r") as f:
    german_stopwords_alterada = json.loads(f.read())


## For UMAP Analisys:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
%matplotlib inline
sns.set()


from sentence_transformers import SentenceTransformer

# para montar as combinacoes:
from itertools import product

# Para a matrix 
from sklearn.metrics.pairwise import cosine_similarity

# Topic Coherence:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

## Functions to clean data:

# Matches every character that is NOT an ASCII letter, whitespace, or one of
# the lowercase German special letters (ü, ö, ä, ß).  Written as a raw string
# so that ``\s`` is not treated as an (invalid) string escape sequence.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\süöäß]')


def replace_char(x):
    """Return *x* with every non-letter, non-whitespace character removed.

    Kept characters: ``a-z``, ``A-Z``, whitespace and the lowercase German
    letters ``ü``, ``ö``, ``ä``, ``ß``.  Uppercase umlauts are removed, so
    callers are expected to lowercase the text first.

    Args:
        x: The string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    return _NON_LETTER_RE.sub('', x)


def clean_corpus(s):
    """Normalise a pandas Series of raw documents.

    Steps, in order:
      1. discard every entry that is not a ``str``;
      2. lowercase the remaining texts;
      3. strip unwanted characters with ``replace_char``;
      4. discard texts shorter than two characters (this also removes
         texts that became empty after step 3).

    Rows are selected with boolean masks rather than dropped by index label,
    so a Series whose index contains duplicate labels does not lose valid
    rows, and an empty Series is handled without error.

    Args:
        s: pandas Series holding the documents (entries of any type).

    Returns:
        A pandas Series with the cleaned texts; surviving rows keep their
        original index labels.
    """
    # Keep only real text entries.
    is_text = s.map(lambda value: isinstance(value, str)).astype(bool)
    texts = s[is_text]

    # Lowercase first: replace_char only keeps the lowercase German letters.
    cleaned = texts.map(lambda text: replace_char(text.lower()))

    # Texts with fewer than two characters carry no usable content.
    long_enough = cleaned.map(len) >= 2
    return cleaned[long_enough]

def calculuate_coherence_score(topic_model):
    """Compute the ``c_v`` topic coherence of a fitted topic model.

    NOTE: besides ``topic_model`` this function reads two module-level
    variables: ``topics`` (the topic assigned to each document by the last
    ``fit_transform`` call) and ``docs`` (the list of cleaned documents).
    Both must refer to the same run that produced ``topic_model``.

    Args:
        topic_model: Fitted model exposing ``get_topic(topic_id)``, which
            yields ``(word, weight)`` pairs.

    Returns:
        The coherence value returned by gensim's
        ``CoherenceModel.get_coherence()``.
    """
    # Every real topic id; -1 is the outlier topic and is skipped.  Taking
    # the ids explicitly (instead of range(n_unique - 1)) keeps the last
    # topic when a run happens to produce no outlier topic at all.
    topic_ids = sorted(topic_id for topic_id in set(topics) if topic_id != -1)

    # Top words per topic, skipping empty-string entries.
    topic_words = [
        [word for word, _ in topic_model.get_topic(topic_id) if word != '']
        for topic_id in topic_ids
    ]

    # Whitespace-tokenised corpus and its gensim dictionary / bag-of-words.
    tokens = [doc.split() for doc in docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]

    coherence_model = CoherenceModel(topics=topic_words,
                                     texts=tokens,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     coherence='c_v')
    return coherence_model.get_coherence()


# Load the corpus (one document per row) from disk.
data_ki_csv_path = "C:\\Users\\opc\\Documents\\Python_Scripts\\champions_2.0\\01_gesetze_sammlung\\df_data_ki.csv"
df_data_ki = pd.read_csv(data_ki_csv_path)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Build the parameter grid to explore.

n = [5, 7, 10, 20]          # UMAP n_neighbors
d = [0.0, 0.1, 0.25]        # UMAP min_dist
c = [10, 15, 20, 25, 30]    # HDBSCAN min_cluster_size
s = [c_i/4 for c_i in c]    # HDBSCAN min_samples: one quarter of min_cluster_size

# Every (n_neighbors, min_dist) pair is combined with every
# (min_cluster_size, min_samples) pair; min_samples is always tied to its
# own min_cluster_size, never mixed with another one.
combinations1 = [
    (n_i, d_i, c_i, s_i)
    for n_i, d_i in product(n, d)
    for c_i, s_i in zip(c, s)
]

# For a quick test run only the first two combinations are used.
combinations = combinations1[:2]

In [None]:
# Daten and KI: fit one BERTopic model per parameter combination and record
# topic coherence, number of topics and average topic similarity.

# Data
docs = df_data_ki['text']
docs = clean_corpus(docs).to_list()

result_columns = ['parameters', 'n_neighbors', 'min_dist', 'min_cluster',
                  'min_sample', 'topic_coherence', 'nr_topics',
                  'avg_similarity', 'topics']

# One dict per fitted model; the DataFrame is built once after the loop
# instead of being re-concatenated on every iteration.
result_rows = []

for n_neighbors, min_dist, min_cluster, raw_min_sample in combinations:
    # The grid holds min_cluster / 4, which may be fractional; it is
    # truncated to an integer before being handed to HDBSCAN.
    min_sample = int(raw_min_sample)

    umap_model = UMAP(n_neighbors=n_neighbors,
                      min_dist=min_dist,
                      n_components=5,
                      metric='cosine', random_state=42)

    hdbscan_model = HDBSCAN(min_cluster_size=min_cluster,
                            min_samples=min_sample,
                            metric='euclidean',
                            prediction_data=True)

    # Customised German stop-word list.
    vectorizer_model = CountVectorizer(stop_words=german_stopwords_alterada)

    topic_model = BERTopic(language="german",
                           vectorizer_model=vectorizer_model,
                           umap_model=umap_model,
                           hdbscan_model=hdbscan_model)

    # Train exactly once: fit_transform already fits the model, so a
    # preceding .fit(docs) would only double the training time.
    # `topics` stays a module-level name because
    # calculuate_coherence_score reads it.
    topics, probs = topic_model.fit_transform(docs)

    # Similarity matrix between topic embeddings.
    # NOTE(review): the first row is skipped on the assumption that it
    # belongs to the outlier topic -1 - confirm for runs without outliers.
    # The average is taken over the full matrix, diagonal included.
    matrixA = cosine_similarity(np.array(topic_model.topic_embeddings_)[1:, :])
    avg_similarity = np.average(matrixA)

    # Topic names ordered by topic id, first entry skipped (same assumption
    # as above).
    labels = (topic_model.get_topic_info().sort_values("Topic", ascending=True).Name)[1:]
    nr_topics = len(labels)
    labels = labels.str.cat(sep='; ')
    title = (f'n_neighbors = {n_neighbors} min_dist = {min_dist}'
             f' min_cluster = {min_cluster} min_sample = {min_sample}')

    # Coherence score:
    cs = calculuate_coherence_score(topic_model)

    result_rows.append({'parameters': title,
                        'n_neighbors': n_neighbors,
                        'min_dist': min_dist,
                        'min_cluster': min_cluster,
                        'min_sample': min_sample,
                        'topic_coherence': cs,
                        'nr_topics': nr_topics,
                        'avg_similarity': avg_similarity,
                        'topics': labels})

# Passing `columns` keeps the expected layout even when no combination ran.
df = pd.DataFrame(result_rows, columns=result_columns)

In [None]:
#df.to_csv("df_avg_similarity_coerence_coeficient_daten_ki_eu.csv", sep = ';', index = False, )

In [None]:
# Reload the stored grid-search results and preview the rows with the
# lowest average topic similarity.
df = pd.read_csv("df_avg_similarity_coerence_coeficient_daten_ki_eu.csv", sep=";")
df1 = df.sort_values('avg_similarity')
df1.head()

Topic coherence is a way to judge the quality of topics via a single quantitative, scalar value. There are many ways to compute the coherence score. For the u_mass and c_v options, a higher value is always better. Note that u_mass is between -14 and 14 and c_v is between 0 and 1.

https://datascience.oneoffcoder.com/topic-modeling-gensim.html

In [None]:
# 2 x 2 grid of scatter plots: topic coherence against four other columns.
fig, axs = plt.subplots(2, 2, figsize=(10, 10))

# (x-axis column, panel title), listed row by row to match axs.flat.
scatter_panels = [
    ("nr_topics", "NR_Topics VS Topic Coherence"),
    ("avg_similarity", "Avg_similarity VS Topic Coherence"),
    ("n_neighbors", "N_neighbors VS Topic Coherence"),
    ("min_dist", "Min_dist VS Topic Coherence"),
]

for panel_ax, (x_column, panel_title) in zip(axs.flat, scatter_panels):
    sns.scatterplot(x=x_column, y="topic_coherence", data=df, ax=panel_ax)
    panel_ax.set_title(panel_title)

plt.show()


In [None]:
# Inspect the spread of coherence against average similarity.

import seaborn as sns
import matplotlib.pyplot as plt

# Scatter plot, coloured by the number of topics of each model.
scatter_ax = sns.scatterplot(data=df1, x='avg_similarity', y='topic_coherence', hue='nr_topics')

# Axis labels and title are set on the axes the plot was drawn on.
scatter_ax.set_xlabel('avg_similarity')
scatter_ax.set_ylabel('topic_coherence')
scatter_ax.set_title("EU Daten Und KI: coherence Vs avg_similarity")

# Display the figure.
plt.show()

### Segunda parte: pegando exemplos dentro dos modelos escolhidos

In [2]:
# Reload the grid-search results and keep only the models worth inspecting.
df = pd.read_csv("df_avg_similarity_coerence_coeficient_daten_ki_eu.csv", sep = ";")

# Selection rule: topic_coherence > 0.55 and avg_similarity < 0.85
df1 = df.query('topic_coherence > 0.55 & avg_similarity < 0.85')
df1 = df1.reset_index()

# Rebuild the (n_neighbors, min_dist, min_cluster, min_sample) tuples of the
# selected models.  Columns are read by name rather than by position, and
# min_dist is kept as a float: converting it with int() would turn 0.1 and
# 0.25 into 0 and re-train every model with min_dist = 0.
combinations = [
    (int(row.n_neighbors), float(row.min_dist), int(row.min_cluster), int(row.min_sample))
    for row in df1.itertuples(index=False)
]

# For a quick test, restrict the run to the first two combinations:
# combinations = combinations[:2]

In [3]:
# Daten and KI: for every selected parameter combination, collect the
# representative documents of each topic and export two interactive plots
# (topic similarity heatmap and document map) to one HTML file.

# Data
docs = df_data_ki['text']
docs = clean_corpus(docs).to_list()

# Loop-invariant work, done once instead of once per combination:
# document embeddings and their fixed 2-D projection used for plotting.
# NOTE(review): per the BERTopic documentation this sentence-transformers
# model is also the one BERTopic selects by default for non-English
# languages, so passing these embeddings to fit_transform is expected to
# train the same model as letting BERTopic embed the documents itself -
# confirm against the installed BERTopic version.
sentence_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")   # suitable score function: cosine
embeddings = sentence_model.encode(docs, show_progress_bar=False)
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)

# Seed frame fixes the leading column order of the final table; the
# per-model frames are collected and concatenated once after the loop.
example_frames = [pd.DataFrame({'Document': [], 'Topic': [], 'Name': [], 'Top_n_words': [], 'Probability': [], 'Representative_document': [], 'model': []})]

# enumerate gives every combination its own position, even if the same
# combination appears more than once in the list.
for position, comb in enumerate(combinations):
    n_neighbors = comb[0]
    min_dist = comb[1]
    min_cluster = comb[2]
    min_sample = int(comb[3])

    umap_model = UMAP(n_neighbors=n_neighbors,
                      min_dist=min_dist,
                      n_components=5,
                      metric='cosine', random_state=42)

    hdbscan_model = HDBSCAN(min_cluster_size=min_cluster,
                            min_samples=min_sample,
                            metric='euclidean',
                            prediction_data=True)

    # Customised German stop-word list.
    vectorizer_model = CountVectorizer(stop_words=german_stopwords_alterada)

    topic_model = BERTopic(language="german",
                           vectorizer_model=vectorizer_model,
                           umap_model=umap_model,
                           hdbscan_model=hdbscan_model)

    # Single training run on the precomputed embeddings; the same fitted
    # model feeds the example table and both figures.
    topics, probs = topic_model.fit_transform(docs, embeddings)

    # Representative examples of every topic.  The explicit copy makes the
    # column assignment below act on an independent frame, not on a slice.
    df_temp = topic_model.get_document_info(docs)
    df_temp = df_temp.query('Representative_document == True').copy()

    # Tag the rows with the model they come from.
    df_temp['model'] = 'model ' + str(position)
    example_frames.append(df_temp)

    ##### Visualisation of the results

    title = (f'EU_Daten_ki_n_neighbors = {n_neighbors} min_dist = {min_dist}'
             f' min_cluster = {min_cluster} min_sample = {min_sample}')

    # Topic similarity heatmap.
    fig_1 = topic_model.visualize_heatmap()
    fig_1.update_layout(title=title)

    # Document distribution on the shared 2-D projection.
    fig_2 = topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)
    fig_2.update_layout(title=title)

    html_file_name = re.sub(r'\s', '_', title) + '.html'

    # 'w' so that re-running the cell replaces the file instead of appending
    # duplicate figures; UTF-8 so non-ASCII characters in the documents do
    # not depend on the platform's default encoding.
    with open(html_file_name, 'w', encoding='utf-8') as f:
        f.write(fig_1.to_html(full_html=False, include_plotlyjs='cdn'))
        f.write(fig_2.to_html(full_html=False, include_plotlyjs='cdn'))

df = pd.concat(example_frames, axis=0)

In [4]:
# Store the representative examples of all selected models.
df.to_csv(
    "df_examples_models_daten_ki_eu.csv",
    sep=';',
    index=False,
)