# Lista 05 - Regressão

>Yanna Torres Gonçalves
>
>Matrícula: 587299
>
>Mestrado em Ciências da Computação


## Parte 01: limpeza e pré-processamento dos dados

In [None]:
!pip install pandas scikit-learn fuzzywuzzy python-Levenshtein spacy nltk sentence_transformers emoji unidecode torch
!python -m spacy download pt_core_news_sm

In [None]:
# Sanity check: report how many GPU devices TensorFlow can see in this runtime.
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
import pandas as pd
import numpy as np
import re

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz
import spacy
from tqdm import tqdm

# Download the NLTK resources used by this notebook
# (presumably the data behind `word_tokenize` and `stopwords` imported above).
nltk.download('punkt')
nltk.download('stopwords')

nltk.download('punkt_tab')

### a) Ler o dataset fakeTelegram.BR_2022.csv

Utilizamos o mesmo dataset da lista 01, acessando pelo mesmo link.

[https://github.com/yanna-torres/CKP9011-ciencia-de-dados/blob/lista-01/data/fakeTelegram.BR_2022.csv](https://github.com/yanna-torres/CKP9011-ciencia-de-dados/blob/lista-01/data/fakeTelegram.BR_2022.csv)
   

In [None]:
# Direct download URL of the CSV (file hosted in the course repository).
data_url = "https://media.githubusercontent.com/media/yanna-torres/CKP9011-ciencia-de-dados/refs/heads/lista-01/data/fakeTelegram.BR_2022.csv"

# Load the whole dataset into memory as a DataFrame.
df = pd.read_csv(data_url)

In [None]:
# Summary statistics of the raw dataset.
df.describe()

### b) Remova os trava-zaps, as linhas repetidas e textos com menos de 5 palavras

#### Removendo os trava-zaps

In [None]:
def has_trava_zap(message):
    """Heuristically decide whether a message looks like a "trava-zap".

    Non-string values are never flagged. A string is flagged as soon as one
    of these rules matches:

    1. it has more than 10000 characters;
    2. it contains more than 100 invisible characters
       (U+200B..U+200F or U+FEFF);
    3. it contains a run of more than 20 consecutive emoji code points;
    4. it is longer than 500 characters but uses fewer than 10 distinct
       characters.

    Returns:
        bool: True when any rule matches, False otherwise.
    """
    if not isinstance(message, str):
        return False

    size = len(message)

    # Rule 1: excessive length.
    if size > 10000:
        return True

    # Rule 2: too many invisible characters.
    hidden_chars = {chr(code) for code in range(0x200B, 0x200F + 1)} | {'\uFEFF'}
    hidden_total = sum(1 for ch in message if ch in hidden_chars)
    if hidden_total > 100:
        return True

    # Rule 3: a long uninterrupted run of emoji.
    emoji_runs = re.findall(
        "[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
        "\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]+",
        message,
        flags=re.UNICODE,
    )
    longest_run = max(map(len, emoji_runs), default=0)
    if longest_run > 20:
        return True

    # Rule 4: long text built from very few distinct characters.
    return size > 500 and len(set(message)) < 10

In [None]:
# Flag every row whose text matches one of the trava-zap heuristics.
df['trava_zap'] = df['text_content_anonymous'].apply(has_trava_zap)

In [None]:
# Keep only the rows that were not flagged as trava-zap.
# `.copy()` makes `df_clean` an independent frame, so the column drops and
# assignments performed on it later do not act on a slice of `df`.
df_clean = df[~df['trava_zap']].copy()

In [None]:
# The helper flag is no longer needed once the filter has been applied.
df_clean.drop(columns=['trava_zap'], inplace=True)

In [None]:
# Summary statistics after removing the trava-zap rows.
df_clean.describe()

#### Removendo as linhas repetidas

In [None]:
# Drop rows that are exact duplicates of another row (all columns compared).
df_clean = df_clean.drop_duplicates()

In [None]:
# Summary statistics after removing duplicated rows.
df_clean.describe()

#### Removendo textos com menos de 5 palavras

In [None]:
# Keep only posts with at least 5 whitespace-separated words; rows without
# text yield NaN, compare as False and are dropped as well.
# `.copy()` detaches the result so new columns can be assigned to it safely.
df_clean = df_clean[df_clean['text_content_anonymous'].str.split().str.len() >= 5].copy()

In [None]:
# Summary statistics after removing texts with fewer than 5 words.
df_clean.describe()

### c) Agrupe as linhas com postagens iguais ou semelhantes

In [None]:
# List the available columns before building the grouping key.
df_clean.columns

In [None]:
import emoji
import unidecode

def normalize_text(text):
    """Return a canonical form of *text* used to detect identical posts.

    Steps, in order: lowercase, strip URLs, strip emoji, transliterate to
    ASCII, drop everything that is not a letter, digit or whitespace, and
    collapse runs of whitespace.

    Args:
        text: Any value; it is converted with ``str()`` first.

    Returns:
        str: The normalized text (possibly empty).
    """
    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)
    text = emoji.replace_emoji(text, "")
    # Transliterate BEFORE removing non-ASCII characters: otherwise accented
    # letters are deleted ("ação" -> "ao") instead of mapped ("ação" -> "acao").
    # Lowercase again because transliteration may emit uppercase letters.
    text = unidecode.unidecode(text).lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [None]:
# Normalized version of each post, used below as the grouping key.
df_clean['text_normalized'] = df_clean['text_content_anonymous'].apply(normalize_text)

In [None]:
# Row count before discarding empty normalized texts.
len(df_clean)

In [None]:
# Discard rows whose normalized text is missing or empty (e.g. posts made only
# of URLs, emoji or punctuation). A single mask avoids filtering twice, and
# `.copy()` detaches the result so later column assignments are safe.
has_text = df_clean['text_normalized'].notnull() & (df_clean['text_normalized'] != "")
df_clean = df_clean[has_text].copy()

In [None]:
# Row count after discarding empty normalized texts.
len(df_clean)

In [None]:
# Remove exact duplicates and count occurrences as "shares"
df_grouped = (
    df_clean.groupby('text_normalized', as_index=False)
      .agg({
            'date_message': 'min',
            'id_member_anonymous': 'first',
            **{col: 'first' for col in df_clean.columns if col != 'text_normalized' and col != 'date_message'}
      })
)

In [None]:
# Each row of `df_grouped` stands for every post that shares its normalized
# text, so its share count is the size of that group. Counting the raw text
# instead would miss variants that only differ in case, accents, emoji, etc.
share_counts = df_clean.groupby('text_normalized').size()
# Stored as `share_count`, the column name read by the cluster-merging cells below.
df_grouped['share_count'] = df_grouped['text_normalized'].map(share_counts).astype(int)

In [None]:
# Number of distinct normalized texts.
len(df_grouped)

In [None]:
# Preview the first grouped rows.
df_grouped.head(5)

In [None]:
from sentence_transformers import SentenceTransformer
import torch

# Load a multilingual model (Portuguese-friendly) and move to GPU
# Falls back to the CPU when CUDA is not available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2', device=device)

In [None]:
# Get embeddings (batch-processed on GPU)
# One embedding is computed per row of `df_grouped`, from its normalized text.
texts = df_grouped['text_normalized'].tolist()
print("Encoding texts into embeddings...")
embeddings = model.encode(texts, batch_size=128, show_progress_bar=True, device=device)
# Keep a per-row copy of each vector (as a Python list) in the DataFrame.
df_grouped['embedding'] = embeddings.tolist()

In [None]:
import faiss  # used below but never imported in the cells above

# FAISS operates on C-contiguous float32 matrices; enforce that before indexing.
embeddings = np.ascontiguousarray(embeddings, dtype='float32')

# Normalize in place so that the inner product equals the cosine similarity.
faiss.normalize_L2(embeddings)
dimension = embeddings.shape[1]
# Exact (brute-force) inner-product index over all embeddings.
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)

In [None]:
from collections import defaultdict  # used below but never imported in the cells above

# Minimum cosine similarity for two posts to be treated as the same message.
similarity_threshold = 0.85
# Number of nearest neighbours retrieved per post (the post itself included).
k = 10

# clusters[base] -> positions of the rows merged into `base`.
clusters = defaultdict(list)
# Positions already absorbed by some cluster; they never start their own.
removed_indices = set()

for i in tqdm(range(len(df_grouped))):
    if i in removed_indices:
        continue
    vec = embeddings[i].reshape(1, -1)
    D, I = index.search(vec, k)
    # Scan every hit and skip the query explicitly: when scores tie (identical
    # embeddings) the query is not guaranteed to be the first result, so
    # blindly dropping position 0 could discard a real neighbour.
    for sim, j in zip(D[0], I[0]):
        j = int(j)
        if j == -1 or j == i or j in removed_indices:
            continue
        # A row that already represents a cluster must not be absorbed into
        # another one, otherwise its shares would be counted twice.
        if j in clusters:
            continue
        if sim >= similarity_threshold:
            clusters[i].append(j)
            removed_indices.add(j)

In [None]:
# Build one output row per cluster, summing the share counts of its members.
results = []
used = set()  # positions of `df_grouped` already placed in some output row

for base_idx, sim_indices in clusters.items():
    if base_idx in used:
        continue
    used.add(base_idx)
    total_count = df_grouped.iloc[base_idx]['share_count']
    for sim_idx in sim_indices:
        # Never count the same row twice, even if it appears in several clusters.
        if sim_idx in used:
            continue
        total_count += df_grouped.iloc[sim_idx]['share_count']
        used.add(sim_idx)
    row = df_grouped.iloc[base_idx].copy()
    row['share_count'] = total_count
    results.append(row)

# Add remaining unmatched
# `used` holds positions (it is filled from `.iloc` lookups), so select the
# leftover rows by position too instead of by index label.
is_used = np.isin(np.arange(len(df_grouped)), list(used))
remaining = df_grouped.iloc[~is_used].copy()
final_df = pd.concat([pd.DataFrame(results), remaining], ignore_index=True)

In [None]:
# Number of rows left after merging similar posts.
len(final_df)

In [None]:
# Preview the text and the share count of the first rows.
final_df[['text_content_anonymous', 'share_count']].head()

---

### d) Colunas Auxiliares

Para facilitar a recuperação de dados em alguns casos, vamos criar colunas auxiliares.

In [None]:
# Union of the Portuguese and English stop-word lists. `stopwords.words()` takes
# the language as its first argument only (the second positional argument is
# `ignore_lines_startswith`), so each language has to be loaded separately.
stop_words = set(stopwords.words('portuguese')) | set(stopwords.words('english'))

def clean_text(text):
    """Return *text* without ASCII punctuation and without stop words.

    Non-string input yields an empty string. Stop-word matching is
    case-insensitive; the surviving tokens keep their original casing and
    are joined with single spaces.
    """
    if not isinstance(text, str):
        return ''
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    kept_words = []
    for token in word_tokenize(text.translate(punctuation_stripper)):
        if token.lower() in stop_words:
            continue
        kept_words.append(token)
    return ' '.join(kept_words)

# Auxiliary column: the original text with punctuation and stop words removed.
df_clean['text_no_stopwords'] = df_clean['text_content_anonymous'].apply(clean_text)

In [None]:
# Auxiliary length features; rows without text get 0 for both.
posts = df_clean['text_content_anonymous']
# Number of whitespace-separated words in each post.
df_clean['word_count'] = posts.map(lambda post: 0 if pd.isnull(post) else len(post.split()))
# Number of characters in each post.
df_clean['character_count'] = posts.map(lambda post: 0 if pd.isnull(post) else len(post))

In [None]:
# Count how many rows carry each exact text.
text_counts = df_clean['text_content_anonymous'].value_counts()
# A post is "viral" when its text appears more than once. `map` resolves every
# text in one vectorised pass instead of one Python-level lookup per row;
# missing texts map to NaN, and `NaN > 1` is False.
df_clean['viral'] = df_clean['text_content_anonymous'].map(text_counts) > 1

In [None]:
def classify_misinformation(score):
    """Map a misinformation score to a category label.

    Returns 'Misinformation' for scores >= 0.66, 'Neutral' for scores
    >= 0.33, and 'Non-misinformation' for anything else (including NaN,
    which fails both comparisons).
    """
    thresholds = ((0.66, 'Misinformation'), (0.33, 'Neutral'))
    for lower_bound, label in thresholds:
        if score >= lower_bound:
            return label
    return 'Non-misinformation'

# Create the category column from the misinformation score.
df_clean['misinformation_category'] = df_clean['score_misinformation'].apply(classify_misinformation)

In [None]:
def label_sentiment(score):
    """Label a sentiment score: >= 0.05 is Positive, <= -0.05 is Negative, else Neutral."""
    if score >= 0.05:
        return "Positive"
    if score <= -0.05:
        return "Negative"
    return "Neutral"


# Auxiliary column with the sentiment label of each post.
df_clean['sentiment'] = df_clean['score_sentiment'].apply(label_sentiment)

In [None]:
# Column types and non-null counts after adding the auxiliary columns.
df_clean.info()

In [None]:
# Summary statistics after adding the auxiliary columns.
df_clean.describe()

## Parte 02: Modelo Preditivo

Utilizando os dados referentes a postagens no Telegram, crie um modelo preditivo
(regressor) para, a partir dos dados de uma postagem, prever a quantidade de
compartilhamentos dessa mensagem, o que é denominado potencial de “viralização”.