# Lista 09 - Agrupamento

> Yanna Torres Gonçalves
> 
> Matrícula: 587299
> 
> Mestrado em Ciências da Computação

## Setup: limpeza e pré-processamento dos dados

In [None]:
!pip install nltk

In [None]:
!pip install pyLDAvis

In [2]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
tqdm.pandas()

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [18]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

pyLDAvis.enable_notebook()

In [3]:
data_url = "https://media.githubusercontent.com/media/yanna-torres/CKP9011-ciencia-de-dados/refs/heads/lista-01/data/fakeTelegram.BR_2022.csv"

df = pd.read_csv(data_url)

In [4]:
df.describe()

Unnamed: 0,dataset_info_id,score_sentiment,score_misinformation,id_message
count,557586.0,444157.0,167238.0,557586.0
mean,5.0,0.01733,0.312245,445061.7
std,0.0,0.464165,0.293699,486021.1
min,5.0,-1.0,3e-06,2.0
25%,5.0,-0.1779,0.078454,21275.0
50%,5.0,0.0,0.197577,121093.5
75%,5.0,0.3182,0.490351,972604.5
max,5.0,0.9992,1.0,1516436.0


In [5]:
def has_trava_zap(message):
    if not isinstance(message, str):
        return False  # Garante que só analisa strings

    # Regra 1: muito longo
    if len(message) > 10000:
        return True

    # Regra 2: caracteres invisíveis
    invisible = ''.join(chr(c) for c in range(0x200B, 0x200F + 1)) + '\uFEFF'
    if sum(message.count(c) for c in invisible) > 100:
        return True

    # Regra 3: emojis repetidos
    emoji_pattern = re.compile("[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
                              "\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]+", flags=re.UNICODE)
    emojis = emoji_pattern.findall(message)
    if any(len(e) > 20 for e in emojis):
        return True

    # Regra 4: baixa diversidade
    unique = set(message)
    if len(unique) < 10 and len(message) > 500:
        return True

    return False

In [6]:
df['trava_zap'] = df['text_content_anonymous'].apply(has_trava_zap)
df_clean = df[df['trava_zap'] == False]
df_clean.drop(columns=['trava_zap'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean.drop(columns=['trava_zap'], inplace=True)


In [7]:
df_clean = df_clean.drop_duplicates()

In [8]:
df_clean = df_clean[df_clean['text_content_anonymous'].str.split().str.len() >= 5]

In [9]:
df_clean.columns

Index(['date_message', 'id_member_anonymous', 'id_group_anonymous', 'media',
       'media_type', 'media_url', 'has_media', 'has_media_url',
       'text_content_anonymous', 'dataset_info_id', 'date_system',
       'score_sentiment', 'score_misinformation', 'id_message', 'message_type',
       'messenger', 'media_name', 'media_md5'],
      dtype='object')

## Pré-processar o texto

In [11]:
texts = df_clean['text_content_anonymous'].dropna().astype(str).tolist()

In [12]:
stop_words = set(stopwords.words('portuguese', 'english'))

def preprocess(text):
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # remove URLs
    text = re.sub(r'[^a-záéíóúâêîôûãõç\s]', '', text)  # remove punctuation
    tokens = word_tokenize(text, language='portuguese')
    return [word for word in tokens if word not in stop_words and len(word) > 2]

texts_preprocessed = [preprocess(text) for text in texts]

## Avaliação Experimental

### Latent Dirichlet Allocation (LDA)

In [None]:
!pip install gensim

In [15]:
from gensim import corpora, models

dictionary = corpora.Dictionary(texts_preprocessed)
corpus = [dictionary.doc2bow(text) for text in texts_preprocessed]

lda_model = models.LdaModel(corpus=corpus, num_topics=5, id2word=dictionary, passes=10)

for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx}: {topic}")

Topic 0: 0.007*"brasil" + 0.007*"deus" + 0.006*"sobre" + 0.005*"contra" + 0.005*"dia" + 0.005*"todos" + 0.004*"dias" + 0.004*"governo" + 0.004*"sistema" + 0.004*"mundo"
Topic 1: 0.012*"bolsonaro" + 0.012*"agora" + 0.012*"todos" + 0.011*"povo" + 0.010*"vai" + 0.010*"pra" + 0.010*"brasil" + 0.009*"vamos" + 0.007*"fazer" + 0.006*"tudo"
Topic 2: 0.088*"user" + 0.055*"canal" + 0.050*"grupo" + 0.029*"link" + 0.022*"telegram" + 0.018*"youtube" + 0.017*"verdade" + 0.013*"inscreva" + 0.013*"envie" + 0.012*"compartilhe"
Topic 3: 0.040*"the" + 0.033*"brazil" + 0.030*"this" + 0.029*"tse" + 0.027*"superior" + 0.027*"was" + 0.027*"court" + 0.027*"decision" + 0.027*"electoral" + 0.027*"following"
Topic 4: 0.027*"lula" + 0.017*"bolsonaro" + 0.012*"tse" + 0.009*"brasil" + 0.008*"presidente" + 0.007*"militar" + 0.007*"diz" + 0.007*"veja" + 0.006*"eleições" + 0.006*"sobre"


In [19]:
lda_vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(lda_vis)

### Gibbs Sampling Dirichlet Multinomial Mixture (GSDMM)

In [None]:
!pip install git+https://github.com/rwalk/gsdmm.git

In [23]:
from gsdmm import MovieGroupProcess

mgp = MovieGroupProcess(K=5, alpha=0.1, beta=0.3, n_iters=30)
vocab = set(word for doc in texts_preprocessed for word in doc)
vocab_size = len(vocab)

mgp.fit(texts_preprocessed, vocab_size)

In stage 0: transferred 264788 clusters with 5 clusters populated
In stage 1: transferred 132169 clusters with 5 clusters populated
In stage 2: transferred 46830 clusters with 5 clusters populated
In stage 3: transferred 28008 clusters with 5 clusters populated
In stage 4: transferred 23187 clusters with 5 clusters populated
In stage 5: transferred 21683 clusters with 5 clusters populated
In stage 6: transferred 21137 clusters with 5 clusters populated
In stage 7: transferred 20659 clusters with 5 clusters populated
In stage 8: transferred 20357 clusters with 5 clusters populated
In stage 9: transferred 19973 clusters with 5 clusters populated
In stage 10: transferred 19916 clusters with 5 clusters populated
In stage 11: transferred 19454 clusters with 5 clusters populated
In stage 12: transferred 19464 clusters with 5 clusters populated
In stage 13: transferred 19459 clusters with 5 clusters populated
In stage 14: transferred 19574 clusters with 5 clusters populated
In stage 15: trans

AttributeError: 'MovieGroupProcess' object has no attribute 'get_top_words'

### Pseudo-document based Topic Model (PTM)

### BERTopic

In [None]:
!pip install bertopic

In [None]:
from bertopic import BERTopic

# Flatten texts back into sentences
documents = [' '.join(text) for text in texts_preprocessed]

topic_model = BERTopic(language="portuguese")
topics, probs = topic_model.fit_transform(documents)
topic_model.get_topic_info()