# Lista 06 - Classificação Binária

>Yanna Torres Gonçalves
>
>Matrícula: 587299
>
>Mestrado em Ciências da Computação


## Parte 01: limpeza e pré-processamento dos dados

In [1]:
# !pip install faiss-gpu-cu12
# !pip install scikit-learn fuzzywuzzy python-Levenshtein spacy nltk sentence_transformers emoji unidecode

In [2]:
!python -m spacy download pt_core_news_sm

Collecting pt-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.8.0/pt_core_news_sm-3.8.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')


In [3]:
import pandas as pd
import numpy as np
import re

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz
import spacy
from tqdm import tqdm

nltk.download('punkt')
nltk.download('stopwords')

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
tqdm.pandas()

### a) Ler o dataset fakeTelegram.BR_2022.csv

Utilizamos o mesmo dataset da lista 01, acessando pelo mesmo link.

[https://github.com/yanna-torres/CKP9011-ciencia-de-dados/blob/lista-01/data/fakeTelegram.BR_2022.csv](https://github.com/yanna-torres/CKP9011-ciencia-de-dados/blob/lista-01/data/fakeTelegram.BR_2022.csv)
   

In [5]:
data_url = "https://media.githubusercontent.com/media/yanna-torres/CKP9011-ciencia-de-dados/refs/heads/lista-01/data/fakeTelegram.BR_2022.csv"

df = pd.read_csv(data_url)

In [6]:
df.describe()

Unnamed: 0,dataset_info_id,score_sentiment,score_misinformation,id_message
count,557586.0,444157.0,167238.0,557586.0
mean,5.0,0.01733,0.312245,445061.7
std,0.0,0.464165,0.293699,486021.1
min,5.0,-1.0,3e-06,2.0
25%,5.0,-0.1779,0.078454,21275.0
50%,5.0,0.0,0.197577,121093.5
75%,5.0,0.3182,0.490351,972604.5
max,5.0,0.9992,1.0,1516436.0


### b) Remova os trava-zaps, as linhas repetidas e textos com menos de 5 palavras

#### Removendo os trava-zaps

In [7]:
def has_trava_zap(message):
    if not isinstance(message, str):
        return False  # Garante que só analisa strings

    # Regra 1: muito longo
    if len(message) > 10000:
        return True

    # Regra 2: caracteres invisíveis
    invisible = ''.join(chr(c) for c in range(0x200B, 0x200F + 1)) + '\uFEFF'
    if sum(message.count(c) for c in invisible) > 100:
        return True

    # Regra 3: emojis repetidos
    emoji_pattern = re.compile("[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
                              "\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]+", flags=re.UNICODE)
    emojis = emoji_pattern.findall(message)
    if any(len(e) > 20 for e in emojis):
        return True

    # Regra 4: baixa diversidade
    unique = set(message)
    if len(unique) < 10 and len(message) > 500:
        return True

    return False

In [8]:
df['trava_zap'] = df['text_content_anonymous'].apply(has_trava_zap)

In [9]:
df_clean = df[df['trava_zap'] == False]

In [10]:
df_clean.drop(columns=['trava_zap'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean.drop(columns=['trava_zap'], inplace=True)


In [11]:
df_clean.describe()

Unnamed: 0,dataset_info_id,score_sentiment,score_misinformation,id_message
count,556305.0,442876.0,166416.0,556305.0
mean,5.0,0.017676,0.311496,444851.4
std,0.0,0.463547,0.293146,485954.4
min,5.0,-1.0,3e-06,2.0
25%,5.0,-0.1779,0.078334,21251.0
50%,5.0,0.0,0.196892,120869.0
75%,5.0,0.3182,0.488192,972422.0
max,5.0,0.9992,1.0,1516436.0


#### Removendo as linhas repetidas

In [12]:
df_clean = df_clean.drop_duplicates()

In [13]:
df_clean.describe()

Unnamed: 0,dataset_info_id,score_sentiment,score_misinformation,id_message
count,556305.0,442876.0,166416.0,556305.0
mean,5.0,0.017676,0.311496,444851.4
std,0.0,0.463547,0.293146,485954.4
min,5.0,-1.0,3e-06,2.0
25%,5.0,-0.1779,0.078334,21251.0
50%,5.0,0.0,0.196892,120869.0
75%,5.0,0.3182,0.488192,972422.0
max,5.0,0.9992,1.0,1516436.0


#### Removendo textos com menos de 5 palavras

In [14]:
df_clean = df_clean[df_clean['text_content_anonymous'].str.split().str.len() >= 5]

In [15]:
df_clean.describe()

Unnamed: 0,dataset_info_id,score_sentiment,score_misinformation,id_message
count,338045.0,337998.0,166402.0,338045.0
mean,5.0,0.022497,0.311514,421487.5
std,0.0,0.529214,0.29315,484234.6
min,5.0,-1.0,3e-06,2.0
25%,5.0,-0.3818,0.07834,19888.0
50%,5.0,0.0,0.196925,84735.0
75%,5.0,0.5053,0.488286,972697.0
max,5.0,0.9992,1.0,1516436.0


### c) Agrupe as linhas com postagens iguais ou semelhantes

In [16]:
df_clean.columns

Index(['date_message', 'id_member_anonymous', 'id_group_anonymous', 'media',
       'media_type', 'media_url', 'has_media', 'has_media_url',
       'text_content_anonymous', 'dataset_info_id', 'date_system',
       'score_sentiment', 'score_misinformation', 'id_message', 'message_type',
       'messenger', 'media_name', 'media_md5'],
      dtype='object')

In [17]:
import emoji
import unidecode

def normalize_text(text):
    text = str(text).lower()
    text = re.sub(r"https?://\S+|www\.\S+", " ", text)
    text = emoji.replace_emoji(text, " ")
    text = re.sub(r"[^\w\sÀ-ÿ]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [18]:
df_clean['text_normalized'] = df_clean['text_content_anonymous'].progress_apply(normalize_text)

100%|██████████| 338045/338045 [00:59<00:00, 5673.81it/s]


In [19]:
df_clean = df_clean[df_clean['text_normalized'].notnull()]
df_clean = df_clean[df_clean['text_normalized'] != ""]

In [20]:
len(df_clean)

338034

In [21]:
df_clean = df_clean[df_clean['text_normalized'].str.split().str.len() >= 5]

In [22]:
len(df_clean)

334780

In [23]:
# Remove exact duplicates and count occurrences as "shares"
df_grouped = (
    df_clean.groupby('text_normalized', as_index=False)
      .agg({
            'date_message': 'min',
            'id_member_anonymous': 'first',
            **{col: 'first' for col in df_clean.columns if col != 'text_normalized' and col != 'date_message'}
      })
)

In [24]:
share_counts = df_clean['text_content_anonymous'].value_counts().to_dict()
df_grouped['shares'] = df_grouped['text_content_anonymous'].map(share_counts)

In [25]:
len(df_grouped)

213677

In [26]:
df_grouped.head(5)

Unnamed: 0,text_normalized,date_message,id_member_anonymous,id_group_anonymous,media,media_type,media_url,has_media,has_media_url,text_content_anonymous,dataset_info_id,date_system,score_sentiment,score_misinformation,id_message,message_type,messenger,media_name,media_md5,shares
0,0 31 dos votos contados,2022-10-02 20:25:52,,ca8d6cc94923d0967c15917812bb71bb,5db8d948c7921ad7ea5db4d54b1551f5.jpg,image/jpg,,True,False,"🗳📃 — 0,31% dos votos contados",5,2022-10-03 05:10:41.408186,0.0,,12118,Imagem,telegram,,5db8d948c7921ad7ea5db4d54b1551f5,1
1,0 48 dos votos contados,2022-10-02 20:27:10,,ca8d6cc94923d0967c15917812bb71bb,b5911be07ab3e1d22ec41c9d4c3c02d2.jpg,image/jpg,,True,False,"📃 — 0,48% dos votos contados",5,2022-10-03 05:10:43.398983,0.0,,12120,Imagem,telegram,,b5911be07ab3e1d22ec41c9d4c3c02d2,1
2,0 75 em alta o payroll de setembro 263 mil vei...,2022-10-07 12:51:35,,2ff252ad4422e11a6a8abfaa747abb55,,,t.me/alexeconomia,False,True,"0,75 EM ALTA - O payroll de setembro (263 mil)...",5,2022-10-07 12:51:46.727848,-0.3753,0.005815,49312,Texto,telegram,,,1
3,0 800 n a n a alaia azzedine paris france 011 ...,2022-10-02 02:53:42,,6e4192de8ce3464c5263b0a937e53529,,,,False,False,",,""0 800"",""N/A"",""N/A"",\n\n""Alaia, Azzedine"",""P...",5,2022-10-03 04:54:51.879906,0.0,0.003349,19900,Texto,telegram,,,1
4,0 a 10 vai ter 2o turno o que vocês acham,2022-10-03 00:11:39,,b11f2df64ac19aad47a50accf32052d6,,,,False,False,"0 a 10 vai ter 2o turno, o que vocês acham ?",5,2022-10-03 05:18:48.986706,0.0,,154118,Texto,telegram,,,1


In [27]:
from sentence_transformers import SentenceTransformer
import torch

# Load a multilingual model (Portuguese-friendly) and move to GPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2', device=device)

2025-06-20 22:42:20.254453: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750459340.273280    2354 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750459340.277770    2354 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1750459340.290434    2354 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750459340.290447    2354 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1750459340.290450    2354 computation_placer.cc:177] computation placer alr

cuda


In [28]:
# Get embeddings (batch-processed on GPU)
texts = df_grouped['text_normalized'].tolist()
print("Encoding texts into embeddings...")
embeddings = model.encode(texts, batch_size=64, show_progress_bar=True, device=device)
df_grouped['embedding'] = embeddings.tolist()

Encoding texts into embeddings...


Batches:   0%|          | 0/3339 [00:00<?, ?it/s]

In [29]:
df_grouped = df_grouped.reset_index(drop=True)
embeddings_array = np.array(df_grouped['embedding'].tolist())

In [30]:
import faiss

faiss.normalize_L2(embeddings)
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)

In [32]:
from collections import defaultdict

similarity_threshold = 0.9
k = 1000

clusters = defaultdict(list)
removed_indices = set()

for i in tqdm(range(len(df_grouped))):
    if i in removed_indices:
        continue
    vec = embeddings[i].reshape(1, -1)
    D, I = index.search(vec, k)
    for sim, j in zip(D[0][1:], I[0][1:]):
        if j == -1 or j == i or j in removed_indices:
            continue
        if sim >= similarity_threshold:
            clusters[i].append(j)
            removed_indices.add(j)

100%|██████████| 213677/213677 [2:06:24<00:00, 28.17it/s]  


In [33]:
results = []
used = set()

for base_idx, sim_indices in tqdm(clusters.items()):
    if base_idx in used:
        continue
    total_count = df_grouped.iloc[base_idx]['shares']
    used.add(base_idx)
    for sim_idx in sim_indices:
        total_count += df_grouped.iloc[sim_idx]['shares']
        used.add(sim_idx)
    row = df_grouped.iloc[base_idx].copy()
    row['shares'] = total_count
    results.append(row)

# Add remaining unmatched
remaining = df_grouped.loc[~df_grouped.index.isin(used)].copy()
final_df = pd.concat([pd.DataFrame(results), remaining], ignore_index=True)

100%|██████████| 10353/10353 [00:04<00:00, 2570.24it/s]


In [34]:
len(final_df)

178404

In [35]:
final_df.head()

Unnamed: 0,text_normalized,date_message,id_member_anonymous,id_group_anonymous,media,media_type,media_url,has_media,has_media_url,text_content_anonymous,...,date_system,score_sentiment,score_misinformation,id_message,message_type,messenger,media_name,media_md5,shares,embedding
0,0 31 dos votos contados,2022-10-02 20:25:52,,ca8d6cc94923d0967c15917812bb71bb,5db8d948c7921ad7ea5db4d54b1551f5.jpg,image/jpg,,True,False,"🗳📃 — 0,31% dos votos contados",...,2022-10-03 05:10:41.408186,0.0,,12118,Imagem,telegram,,5db8d948c7921ad7ea5db4d54b1551f5,2,"[0.027809733524918556, 0.09836164861917496, -0..."
1,01 11 2022 08 00 ásia pacífico fechado s p asx...,2022-11-01 11:01:54,,2ff252ad4422e11a6a8abfaa747abb55,,,youtube.com/AlexEconomia,False,True,▪️ 01/11/2022 - 08:00 \n\nÁsia-Pacífico (fecha...,...,2022-11-01 11:02:13.43606,0.0,0.065628,52387,Texto,telegram,,,106,"[0.045012831687927246, -0.08233073353767395, 0..."
2,02 10 2022 tse governo sp às 18h 59m 00s,2022-10-02 22:05:53,56b8359fd127312651b80b8ed8030085,ef6bcfbd08be365e9b208c22f1d6ad36,a9dff1444485730d391ca9720108cdb7.jpg,image/jpg,,True,False,*02.10.2022 TSE - Governo-SP às 18h 59m 00s*,...,2022-10-03 05:13:14.67546,0.0,,43651,Imagem,telegram,,a9dff1444485730d391ca9720108cdb7,12,"[-0.038795433938503265, 0.20785652101039886, 0..."
3,02 10 2022 tse presidente às 18h 55m 31s,2022-10-02 21:59:55,56b8359fd127312651b80b8ed8030085,ef6bcfbd08be365e9b208c22f1d6ad36,b18ed539a09a227d1fa8de9ad82d7214.jpg,image/jpg,,True,False,02.10.2022 TSE - Presidente - às 18h 55m 31s,...,2022-10-03 05:13:05.88844,0.0,,43650,Imagem,telegram,,b18ed539a09a227d1fa8de9ad82d7214,15,"[-0.014751886017620564, 0.32362258434295654, 0..."
4,02 10 2022 tse prévia das 17h 44m 10s,2022-10-02 20:53:12,56b8359fd127312651b80b8ed8030085,ef6bcfbd08be365e9b208c22f1d6ad36,31559a0c6a512e4531296377b981c7c3.jpg,image/jpg,,True,False,02.10.2022 TSE - Prévia das 17h 44m 10s,...,2022-10-03 05:11:20.651301,0.0,,43648,Imagem,telegram,,31559a0c6a512e4531296377b981c7c3,2,"[-0.08622369170188904, 0.4251839518547058, 0.0..."


In [36]:
final_df.drop(columns=['embedding']).to_csv("grouped_by_similarity.csv")

In [37]:
final_df[['text_content_anonymous', 'shares']].sort_values(by='shares', ascending=False).head(10)

Unnamed: 0,text_content_anonymous,shares
5088,Grupo a diretoria bloqueado:\n\nEstá comunidad...,17423
2627,Comentarios de Anderson Martinho de 29 años. B...,1607
46,"1, pressione o botão abaixo dentro de 3 Minuto...",1425
9308,"Welcome, 00\n\n🔸 [USER] — professional tool fo...",1282
7095,Olá 1...2...3...5...8...13...21...34...55...89...,1256
7165,🚨Olá SEJA bem vindo(a) 1- ao grupo Especulando...,1090
177931,سکس مردان ازبک با زن انگلیسی با این vpn از سای...,1019
7098,Olá 182 (-) \n\nSeja bem vindo(a) ao grupo DIR...,724
4573,"Feedback de Afonso Rico 27 ​​anos. Brasil, Li...",698
177964,فیلم سوپر با زیرنویس فارسی ببین😍😍\nبا این فیلت...,632


---

### d) Colunas Auxiliares

Para facilitar a recuperação de dados em alguns casos, vamos criar colunas auxiliares.

In [None]:
stop_words = set(stopwords.words('portuguese', 'english'))

tqdm.pandas()

def clean_text(text):
    if not isinstance(text, str):
        return ''
    text_no_punct = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text_no_punct)
    filtered = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(filtered)

final_df['text_no_stopwords'] = final_df['text_content_anonymous'].progress_apply(clean_text)

In [None]:
final_df['word_count'] = final_df['text_content_anonymous'].progress_apply(lambda x: len(x.split()) if pd.notnull(x) else 0)
final_df['character_count'] = final_df['text_content_anonymous'].progress_apply(lambda x: len(x) if pd.notnull(x) else 0)

In [None]:
def classify_misinformation(score):
    if score >= 0.66:
        return 'Misinformation'
    elif score >= 0.33:
        return 'Neutral'
    else:
        return 'Non-misinformation'

# Criar coluna de categoria
final_df['misinformation_category'] = final_df['score_misinformation'].progress_apply(classify_misinformation)

In [None]:
final_df['sentiment'] = final_df['score_sentiment'].progress_apply(lambda x: "Positive" if x >= 0.05 else ("Negative" if x <= -0.05 else "Neutral"))

In [None]:
final_df.info()

In [None]:
final_df.describe()

In [None]:
final_df.drop(columns=['embedding']).to_csv("final_with_columns.csv")

## Parte 02: Modelo Preditivo

Utilizando os dados referente a postagens no Telegram, crie um modelo preditivo
(regressor) para, dado os dados de uma postagem, prever a quantidade de
compartilhamentos dessa mensagem, o que é denominado potencial de “viralização”.

### Pré-processamento

Removendo mensagens com caracteres 

In [None]:
final_df.columns

In [None]:
def clean_text(text):
    if not isinstance(text, str):
        return ""
    # Remove spans e tags
    text = re.sub(r"<[^>]+>", "", text)
    # Remove caracteres não latinos
    text = re.sub(r"[^\x00-\x7F]+", "", text)
    return text

In [None]:
final_df['text_clean'] = final_df['text_no_stopwords'].progress_apply(clean_text)

In [None]:
final_df = final_df.dropna(subset=['shares', 'text_clean'])

In [None]:
len(final_df)

In [None]:
def is_latin_only(text):
    if not isinstance(text, str):
        return False
    # This regex matches text containing **any** non-Latin characters
    return not re.search(r'[^\u0000-\u007F]', text)

In [None]:
final_df = final_df[final_df['text_no_stopwords'].apply(is_latin_only)]

In [None]:
len(final_df)

In [None]:
final_df[["shares", "text_content_anonymous"]].sort_values(by='shares', ascending=False).head(10)

In [None]:
# Mensagem do Governo
final_df = final_df.drop(index=162719)

In [None]:
final_df[["shares", "text_content_anonymous"]].sort_values(by='shares', ascending=False).head(10)

In [None]:
final_df.drop(columns=['embedding']).to_csv("final_with_columns_clean.csv")

In [None]:
final_df.head()

### 