# Análise do Sentimento no Stack Overflow

Este notebook explora o sentimento das interações no Stack Overflow, analisando a evolução temporal do sentimento em diferentes tipos de posts e a relação entre o sentimento e características dos usuários.

## Configuração Inicial e Carregamento de Dados

In [None]:
# Dependências já carregadas
from google.colab import drive
import os
import zipfile
import json

# Mount Google Drive inside the Colab VM; force_remount refreshes a mount left
# over from a previous run.
drive.mount('/content/drive', force_remount=True)

# Placeholder for the analysed posts; a later cell rebinds it to the loaded list.
json_data = None

# Locations of the input data on the mounted drive.
folder_path = '/content/drive/My Drive/'
zip_path = folder_path + 'analyzed.zip'

## Descompactando e Carregando os Dados

In [None]:
zip_file_path = '/content/drive/My Drive/analyzed.zip'
!cp "{zip_file_path}" /content/analyzed.zip
!unzip /content/analyzed.zip -d /content/extracted_analyzed
!ls /content/extracted_analyzed

In [None]:
# Copy the user dump from the mounted Drive to local Colab storage; the local
# name uses an underscore (dump_users.jsonl) while the Drive file uses a hyphen.
!cp "/content/drive/My Drive/dump-users.jsonl" /content/dump_users.jsonl

## Carregando Dados de Usuários

In [None]:
import json

# Path of the local user dump that is opened and parsed further down in this cell.
input_users = '/content/dump_users.jsonl'

def get_ids(post, ids):
    """Collect the user ids referenced by a post into ``ids`` (mutated in place).

    Adds the owner of ``post``, recurses into every entry of ``post['answers']``
    and adds the author of every entry of ``post['comments']``.

    Missing ``owner_user_id`` / ``answers`` / ``comments`` keys are tolerated and
    ``None`` ids are skipped, matching the ``.get(...)`` access used by the
    other cells of this notebook.

    Args:
        post: dict describing a question or an answer.
        ids: set that receives the collected ids.
    """
    owner_id = post.get('owner_user_id')
    if owner_id is not None:
        ids.add(owner_id)

    # Answers have the same layout as posts (owner + comments), hence the recursion.
    for answer in post.get('answers', []):
        get_ids(answer, ids)

    for comment in post.get('comments', []):
        commenter_id = comment.get('user_id')
        if commenter_id is not None:
            ids.add(commenter_id)

# Parse the whole dump as one JSON document. Later cells index the result with
# string user ids (e.g. users['6069']).
# NOTE(review): the file carries a .jsonl extension, but a single JSON value is
# parsed here — confirm the dump really is one JSON object rather than one
# record per line.
users = {}
with open(input_users, 'r') as users_file:
    users = json.loads(users_file.read())

## Carregando Dados de Posts Analisados

In [None]:
import json

import os  # imported here so the cell also runs when executed on its own

# Extracted JSONL file: one analysed post (JSON object) per line.
json_file_path = '/content/extracted_analyzed/analyzed.jsonl'

# Parsed posts, in file order.
json_data = []

# Check that the file really exists; testing the truthiness of a constant path
# could never reach the "not found" branch.
if os.path.isfile(json_file_path):
    print(f"Opening and reading JSONL file: {json_file_path}")
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            for line_number, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    # Blank lines are not records; skip them instead of
                    # reporting a decode error.
                    continue
                try:
                    json_data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    # Report only the start of the offending line to keep the
                    # cell output readable.
                    print(f"Error decoding JSON in line {line_number}: {line[:200]}. Error: {e}")
        print("JSONL data loaded successfully.")
    except (OSError, UnicodeDecodeError) as e:
        print(f"An error occurred: {e}")
else:
    print("No .jsonl file found in the extracted directory.")

# Show the first two records as a sanity check.
print(json_data[:2])

## Explorando a Estrutura dos Dados

In [None]:
import pandas as pd

# Inspect the first record to see where sentiment scores and timestamps live.
first_post = json_data[0]
print(first_post)

# Sentiment and timestamp of the post itself
print(first_post['body_sentiment'])
print(first_post['creation_date'])

# Sentiment and timestamp of the first comment on the post
first_comment = first_post['comments'][0]
print(first_comment['text_sentiment'])
print(first_comment['creation_date'])

# Sentiment and timestamp of the first answer
first_answer = first_post['answers'][0]
print(first_answer['body_sentiment'])
print(first_answer['creation_date'])

# Sentiment and timestamp of the first comment on the first answer
first_answer_comment = first_answer['comments'][0]
print(first_answer_comment['text_sentiment'])
print(first_answer_comment['creation_date'])

## Análise Temporal do Sentimento por Tipo de Post

Esta seção analisa a evolução do sentimento médio para diferentes tipos de interações (posts, respostas, comentários) ao longo dos anos.

In [None]:
import json
from tqdm import tqdm
from collections import defaultdict
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np

# Função para extrair o ano
from dateutil.parser import parse

def extract_year(date_str):
    """Return the calendar year of ``date_str`` as parsed by ``dateutil.parser.parse``."""
    parsed = parse(date_str)
    return parsed.year

# year -> one list of scores per interaction kind. Every score is stored minus 1
# (presumably mapping 0/1/2 labels onto -1/0/1 — confirm against the annotation step).
sentiments_by_year = defaultdict(
    lambda: {'posts': [], 'answers': [], 'post_comments': [], 'answer_comments': []}
)


def _collect(entry, kind, score_key):
    """File ``entry[score_key] - 1`` under the year of the entry's own creation date."""
    entry_year = extract_year(entry['creation_date'])
    sentiments_by_year[entry_year][kind].append(entry[score_key] - 1)


for item in tqdm(json_data, desc="Processing posts"):
    _collect(item, 'posts', 'body_sentiment')

    for post_comment in item.get('comments', []):
        _collect(post_comment, 'post_comments', 'text_sentiment')

    for answer in item.get('answers', []):
        _collect(answer, 'answers', 'body_sentiment')

        for answer_comment in answer.get('comments', []):
            _collect(answer_comment, 'answer_comments', 'text_sentiment')

def avg(values):
    """Return the arithmetic mean of ``values``, or 0 when ``values`` is empty.

    Note that an empty list therefore shows up as 0 in the plots that use this
    helper, not as a gap.
    """
    if not values:
        return 0
    return np.mean(values)

# Years on the x axis, in ascending order.
years = sorted(sentiments_by_year)

# One list of yearly means per interaction kind, aligned with `years`.
avg_post_sent, avg_answer_sent, avg_post_comm_sent, avg_answer_comm_sent = (
    [avg(sentiments_by_year[y][kind]) for y in years]
    for kind in ('posts', 'answers', 'post_comments', 'answer_comments')
)

# Yearly mean over all four kinds pooled together (each item weighs the same).
avg_total_sent = [
    avg(bucket['posts'] + bucket['answers'] + bucket['post_comments'] + bucket['answer_comments'])
    for bucket in (sentiments_by_year[y] for y in years)
]

## Visualização da Evolução Temporal do Sentimento

In [None]:
# One line per interaction kind plus the pooled total, all on the same axes.
plt.figure(figsize=(12, 6))
for series, series_label in (
    (avg_post_sent, 'Posts'),
    (avg_answer_sent, 'Respostas'),
    (avg_post_comm_sent, 'Comentários em posts'),
    (avg_answer_comm_sent, 'Comentários em respostas'),
    (avg_total_sent, 'Total'),
):
    plt.plot(years, series, label=series_label)

plt.xlabel('Ano')
plt.ylabel('Média de Sentimento')
plt.title('Evolução Temporal do Sentimento no Stack Overflow')
plt.legend()
plt.grid(True)
plt.tight_layout()
# Saved before show() so the written file contains the finished figure.
plt.savefig('sentimentos_por_ano.png')
plt.show()

## Evolução Temporal do Sentimento com Intervalo de Confiança de 95%

In [None]:
from scipy.stats import sem
import matplotlib.pyplot as plt
import numpy as np

def confidence_interval_95(values):
    """Return 1.96 times the standard error of the mean of ``values``.

    With fewer than two values the standard error is not computed and 0 is
    returned instead.
    """
    return 1.96 * sem(values) if len(values) > 1 else 0

# Years on the x axis, in ascending order.
years = sorted(sentiments_by_year)


def compute_avg_and_ci(key):
    """Return ``(means, half_widths)`` for interaction kind ``key``, one entry per year.

    ``means`` comes from ``avg`` and ``half_widths`` from
    ``confidence_interval_95``; both lists are aligned with ``years``.
    """
    per_year = [sentiments_by_year[y][key] for y in years]
    return (
        [avg(scores) for scores in per_year],
        [confidence_interval_95(scores) for scores in per_year],
    )


avg_post_sent, ci_post = compute_avg_and_ci('posts')
avg_answer_sent, ci_answer = compute_avg_and_ci('answers')
avg_post_comm_sent, ci_post_comm = compute_avg_and_ci('post_comments')
avg_answer_comm_sent, ci_answer_comm = compute_avg_and_ci('answer_comments')

# Pooled series: all four kinds concatenated per year.
avg_total_sent, ci_total = [], []
for y in years:
    bucket = sentiments_by_year[y]
    pooled = (
        bucket['posts'] + bucket['answers'] +
        bucket['post_comments'] + bucket['answer_comments']
    )
    avg_total_sent.append(avg(pooled))
    ci_total.append(confidence_interval_95(pooled))

plt.figure(figsize=(12, 6))

for series, half_width, series_label in (
    (avg_post_sent, ci_post, 'Posts'),
    (avg_answer_sent, ci_answer, 'Respostas'),
    (avg_post_comm_sent, ci_post_comm, 'Comentários em posts'),
    (avg_answer_comm_sent, ci_answer_comm, 'Comentários em respostas'),
):
    plt.errorbar(years, series, yerr=half_width, label=series_label, fmt='-o')
# The pooled total is drawn last, in black and thicker, so it stands out.
plt.errorbar(years, avg_total_sent, yerr=ci_total, label='Total', fmt='-o', color='black', linewidth=2)

plt.xlabel('Ano')
plt.ylabel('Média de Sentimento')
plt.title('Evolução Temporal do Sentimento no Stack Overflow (com 95% CI)')
plt.legend()
plt.grid(True)
plt.tight_layout()
# NOTE(review): same file name as the plot without error bars, so that image
# is overwritten — confirm this is intended.
plt.savefig('sentimentos_por_ano.png')
plt.show()

## Análise de Rede e Sentimento por Usuário

Esta seção foca na análise do sentimento médio dos usuários e na visualização de uma sub-rede com base no sentimento.

In [None]:
import json
from collections import defaultdict
from datetime import datetime
import networkx as nx
from dateutil.parser import parse
import matplotlib.pyplot as plt
from tqdm import tqdm

# Minimum value of active_years() a user needs to be kept by the filter below.
MIN_YEARS_ACTIVE = 3
# Read from the clock at run time, so the filter depends on when the cell is run.
CURRENT_YEAR = datetime.now().year


def extract_year(date_str):
    """Return the calendar year of ``date_str`` as parsed by ``dateutil.parser.parse``."""
    parsed = parse(date_str)
    return parsed.year


def active_years(start_year):
    """Return the number of years between ``start_year`` and ``CURRENT_YEAR``."""
    elapsed = CURRENT_YEAR - start_year
    return elapsed

# Per-user state filled while walking the posts.
user_first_activity = {}             # user id -> earliest year in which the user appears
user_sentiments = defaultdict(list)  # user id -> sentiment of each interaction the user initiated

# Directed interaction graph: edges point from the author of a comment/answer to
# the owner of the item being commented on or answered. A DiGraph keeps a single
# edge per ordered pair, so a later interaction between the same two users
# overwrites the attributes of the earlier one.
G = nx.DiGraph()


def _register_activity(user_id, year):
    """Keep the earliest year seen so far for ``user_id``.

    A plain ``setdefault`` would keep the year of whichever record happens to
    be read first, which is not necessarily the user's earliest one.
    """
    user_first_activity[user_id] = min(year, user_first_activity.get(user_id, year))


for item in tqdm(json_data, desc="Processing posts"):
    post_owner = item.get('owner_user_id')
    post_year = extract_year(item['creation_date'])
    if post_owner:
        _register_activity(post_owner, post_year)

    for comment in item.get('comments', []):
        commenter = comment.get('user_id')
        # Comments store their score under 'text_sentiment'; a plain 'sentiment'
        # key does not exist in the records, so reading it made every weight -1.
        # NOTE(review): a missing score still defaults to 0, i.e. -1 after the
        # shift — confirm that is the desired fallback.
        sentiment = comment.get('text_sentiment', 0) - 1
        if commenter:
            _register_activity(commenter, extract_year(comment['creation_date']))
            if post_owner and commenter != post_owner:
                G.add_edge(commenter, post_owner, weight=sentiment, type='comment')
                user_sentiments[commenter].append(sentiment)

    for answer in item.get('answers', []):
        answer_owner = answer.get('owner_user_id')
        # Answers (like posts) store their score under 'body_sentiment'.
        sentiment = answer.get('body_sentiment', 0) - 1
        ayear = extract_year(answer['creation_date'])
        if answer_owner:
            _register_activity(answer_owner, ayear)
            if post_owner and answer_owner != post_owner:
                G.add_edge(answer_owner, post_owner, weight=sentiment, type='answer')
                user_sentiments[answer_owner].append(sentiment)

        for a_comment in answer.get('comments', []):
            ac_user = a_comment.get('user_id')
            sentiment = a_comment.get('text_sentiment', 0) - 1
            if ac_user:
                _register_activity(ac_user, extract_year(a_comment['creation_date']))
                # Comments on an answer point at the answer's owner, not the post's.
                if answer_owner and ac_user != answer_owner:
                    G.add_edge(ac_user, answer_owner, weight=sentiment, type='comment')
                    user_sentiments[ac_user].append(sentiment)

# Keep only users whose first activity is at least MIN_YEARS_ACTIVE years old.
eligible_users = {u for u, y in user_first_activity.items() if active_years(y) >= MIN_YEARS_ACTIVE}
G_filtered = G.subgraph(eligible_users).copy()

# Annotate every node with its mean sentiment (0 when the user initiated no
# interaction) and with the reputation found in `users` (0 when unknown).
for node in G_filtered.nodes():
    node_scores = user_sentiments.get(node, [])
    G_filtered.nodes[node]['avg_sentiment'] = sum(node_scores) / len(node_scores) if node_scores else 0
    user_info = users.get(str(node), {})  # `users` is keyed by the id as a string
    G_filtered.nodes[node]['reputation'] = user_info.get('reputation', 0)

# Example of per-year metrics. Each edge is attributed to the earlier of its
# two endpoints' first-activity years (not to the year of the interaction).
activity_by_year = defaultdict(lambda: {'nodes': set(), 'edges': 0})

for u, v, d in G_filtered.edges(data=True):
    year = min(user_first_activity.get(u), user_first_activity.get(v))
    activity_by_year[year]['nodes'].update([u, v])
    activity_by_year[year]['edges'] += 1

print("Ano\tUsuários ativos\tInterações")
for year in sorted(activity_by_year.keys()):
    data = activity_by_year[year]
    print(f"{year}\t{len(data['nodes'])}\t\t{data['edges']}")

# Optional export of the filtered graph:
# nx.write_graphml(G_filtered, "rede_usuarios_filtrada.graphml")

# Sub-graph induced by the 1000 nodes with the highest total degree.
top_nodes = sorted(G_filtered.degree, key=lambda x: x[1], reverse=True)[:1000]
top_node_ids = set(n for n, _ in top_nodes)
G_top = G_filtered.subgraph(top_node_ids).copy()

pos = nx.spring_layout(G_top, seed=42)
sentiments = [G_top.nodes[n]['avg_sentiment'] for n in G_top.nodes()]

# Nodes and colorbar share one fixed [-1, 1] scale; without it the nodes are
# auto-scaled to the data range while the colorbar shows the default 0..1.
sentiment_norm = plt.Normalize(vmin=-1, vmax=1)
nx.draw_networkx_nodes(G_top, pos, node_size=80, node_color=sentiments,
                       cmap=plt.cm.coolwarm, vmin=-1, vmax=1)
nx.draw_networkx_edges(G_top, pos, alpha=0.3)
plt.title("Top 1000 Usuários - Cor por Sentimento Médio")
sentiment_mappable = plt.cm.ScalarMappable(cmap=plt.cm.coolwarm, norm=sentiment_norm)
sentiment_mappable.set_array([])
# The mappable is not attached to any Axes, so the Axes to take space from
# must be passed explicitly.
plt.colorbar(sentiment_mappable, ax=plt.gca(), label="Sentimento Médio")
plt.axis('off')
plt.show()

## Visualização da Rede de Usuários por Sentimento e Participação

Este gráfico exibe a rede dos 1000 usuários com maior grau (os mais conectados), colorindo os nós pelo sentimento médio e dimensionando-os pelo grau de saída, isto é, o número de usuários distintos com quem cada um iniciou interações. As arestas são coloridas pelo sentimento da interação.

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors

# Sub-graph induced by the 1000 nodes with the highest total degree (in + out).
top_nodes = sorted(G_filtered.degree, key=lambda x: x[1], reverse=True)[:1000]
top_node_ids = set(n for n, _ in top_nodes)
G_top = G_filtered.subgraph(top_node_ids).copy()

# Network layout (fixed seed for a reproducible picture).
pos = nx.spring_layout(G_top, seed=150, k=1.2, scale=10.0)

# Node colour: mean sentiment mapped onto a fixed [-1, 1] scale.
sentiments = [G_top.nodes[n]['avg_sentiment'] for n in G_top.nodes()]
cmap = plt.cm.coolwarm
norm = mcolors.Normalize(vmin=-1, vmax=1)
node_colors = [cmap(norm(s)) for s in sentiments]

# Node size grows with the out-degree inside the sub-graph, i.e. the number of
# distinct users the node points to.
node_sizes = [10 + 30 * G_top.out_degree(n) for n in G_top.nodes()]

# Edge colour from the sentiment stored on the edge; any other value is black.
colour_by_sentiment = {-1: 'red', 0: 'gray', 1: 'green'}
edge_colors = [
    colour_by_sentiment.get(G_top[u][v].get('weight', 0), 'black')
    for u, v in G_top.edges()
]

# Plot
plt.figure(figsize=(12, 10))
nx.draw_networkx_nodes(G_top, pos, node_size=node_sizes, node_color=node_colors)
nx.draw_networkx_edges(G_top, pos, edge_color=edge_colors, alpha=0.4)
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
# `sm` is not attached to any Axes, so the Axes to take space from must be
# passed explicitly.
plt.colorbar(sm, ax=plt.gca(), label="Sentimento Médio")
plt.title("Top 1000 Usuários - Cor por Sentimento Médio, Tamanho por Participação")
plt.axis('off')
plt.show()

## Análise do Sentimento por Grupos de Usuários

Esta seção analisa o sentimento médio de usuários agrupados por percentis de participação (quantidade total de posts, respostas e comentários de cada usuário).

In [None]:
import json
from collections import defaultdict
from datetime import datetime
from dateutil.parser import parse
import networkx as nx
import pandas as pd
from tqdm import tqdm

# Minimum value of active_years() a user needs to be kept by the filter below.
MIN_YEARS_ACTIVE = 3
# Read from the clock at run time, so the filter depends on when the cell is run.
CURRENT_YEAR = datetime.now().year


def extract_year(date_str):
    """Return the calendar year of ``date_str`` as parsed by ``dateutil.parser.parse``."""
    parsed = parse(date_str)
    return parsed.year


def active_years(start_year):
    """Return the number of years between ``start_year`` and ``CURRENT_YEAR``."""
    elapsed = CURRENT_YEAR - start_year
    return elapsed

# Per-user state filled while walking the posts.
user_first_activity = {}              # user id -> earliest year in which the user appears
user_sentiments = defaultdict(list)   # user id -> sentiment of each interaction the user initiated
user_participation = defaultdict(int) # user id -> number of posts, answers and comments authored

# Directed interaction graph (author -> owner of the item replied to). A DiGraph
# keeps one edge per ordered pair, so repeated interactions overwrite each other.
G = nx.DiGraph()


def _register_activity(user_id, year):
    """Keep the earliest year seen so far for ``user_id``.

    A plain ``setdefault`` would keep the year of whichever record happens to
    be read first, which is not necessarily the user's earliest one.
    """
    user_first_activity[user_id] = min(year, user_first_activity.get(user_id, year))


for item in tqdm(json_data, desc="Processing posts"):
    post_owner = item.get('owner_user_id')
    post_year = extract_year(item['creation_date'])
    if post_owner:
        _register_activity(post_owner, post_year)
        user_participation[post_owner] += 1

    for comment in item.get('comments', []):
        commenter = comment.get('user_id')
        sentiment = comment.get('text_sentiment', 0) - 1
        if commenter:
            _register_activity(commenter, extract_year(comment['creation_date']))
            user_participation[commenter] += 1
            # Sentiment is only recorded for interactions with someone else.
            if post_owner and commenter != post_owner:
                G.add_edge(commenter, post_owner, weight=sentiment, type='comment')
                user_sentiments[commenter].append(sentiment)

    for answer in item.get('answers', []):
        answer_owner = answer.get('owner_user_id')
        sentiment = answer.get('body_sentiment', 0) - 1
        ayear = extract_year(answer['creation_date'])
        if answer_owner:
            _register_activity(answer_owner, ayear)
            user_participation[answer_owner] += 1
            if post_owner and answer_owner != post_owner:
                G.add_edge(answer_owner, post_owner, weight=sentiment, type='answer')
                user_sentiments[answer_owner].append(sentiment)

        for a_comment in answer.get('comments', []):
            ac_user = a_comment.get('user_id')
            sentiment = a_comment.get('text_sentiment', 0) - 1
            if ac_user:
                _register_activity(ac_user, extract_year(a_comment['creation_date']))
                user_participation[ac_user] += 1
                if answer_owner and ac_user != answer_owner:
                    G.add_edge(ac_user, answer_owner, weight=sentiment, type='comment')
                    user_sentiments[ac_user].append(sentiment)

# Keep only users whose first activity is at least MIN_YEARS_ACTIVE years old.
eligible_users = {u for u, y in user_first_activity.items() if active_years(y) >= MIN_YEARS_ACTIVE}
G_filtered = G.subgraph(eligible_users).copy()

# Mean sentiment per node; 0 when the user initiated no interaction.
for node in G_filtered.nodes():
    node_scores = user_sentiments.get(node, [])
    G_filtered.nodes[node]['avg_sentiment'] = sum(node_scores) / len(node_scores) if node_scores else 0

# One row per user in the filtered graph. The 'posts' column holds the total
# participation count (posts + answers + comments).
user_data = [
    {
        'user_id': n,
        'posts': user_participation[n],
        'avg_sentiment': G_filtered.nodes[n]['avg_sentiment'],
    }
    for n in G_filtered.nodes()
]

# Most active users first; the cells below slice this frame from the top.
df = pd.DataFrame(user_data)
df = df.sort_values(by='posts', ascending=False).reset_index(drop=True)

# Cumulative "top p" groups: each group contains all the smaller ones.
percentiles = [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 1.00]

results = []
total_users = len(df)
for p in percentiles:
    cutoff = max(1, int(total_users * p))  # always keep at least one user
    subset = df.iloc[:cutoff]
    mean_sentiment = subset['avg_sentiment'].mean()
    results.append({
        'Faixa': f'Top {int(p * 100)}%',
        'Usuários': cutoff,
        'Sentimento Médio': round(mean_sentiment, 3)
    })

tabela_resultados = pd.DataFrame(results)
print(tabela_resultados.to_string(index=False))

## Tabela de Sentimento Médio por Percentil de Usuários

In [None]:
# Cumulative "top p" groups over `df`, which is sorted by participation in
# descending order. The list used to contain 10% twice (0.1 and 0.10), which
# produced a duplicated row in the table.
percentiles = [0.0001, 0.001, 0.01, 0.10, 0.25, 1.00]

results = []
total_users = len(df)
for p in percentiles:
    cutoff = max(1, int(total_users * p))  # always keep at least one user
    subset = df.iloc[:cutoff]
    mean_sentiment = subset['avg_sentiment'].mean()
    results.append({
        'Faixa': f'Top {(p * 100):.2f}%',
        'Usuários': cutoff,
        'Sentimento Médio': round(mean_sentiment, 3)
    })

tabela_resultados = pd.DataFrame(results)
print(tabela_resultados.to_string(index=False))
# Same table as LaTeX source.
print(tabela_resultados.to_latex(index=False))

## Visualização do Sentimento por Percentil de Usuários

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Cumulative "top p" groups over `df` (sorted by participation, descending).
percentiles = [0.0001, 0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1.00]
results = []

total_users = len(df)
for p in percentiles:
    cutoff = max(1, int(total_users * p))  # always keep at least one user
    subset = df.iloc[:cutoff]
    mean_sentiment = subset['avg_sentiment'].mean()
    results.append({
        'percentile': p,
        'cutoff': cutoff,
        'mean_sentiment': mean_sentiment
    })

result_df = pd.DataFrame(results)

# Only groups from the top 1% upwards are plotted in the second figure.
result_df = result_df[result_df['percentile'] >= 0.01]

# Tick positions for the first figure, on its own x scale (see below).
percentis = [0.01, 0.2, 0.4, 0.6, 0.8, 1]

# Plot 1: smoothed sentiment inside the top 1%.
# 'percentile' is rank / size of the top 1%, so it runs up to 1.0, where 1.0
# corresponds to 1% of all users.
top_1_cutoff = max(1, int(total_users * 0.01))
top_1_df = df.iloc[:top_1_cutoff].copy()
top_1_df['rank'] = np.arange(1, len(top_1_df) + 1)
top_1_df['percentile'] = top_1_df['rank'] / top_1_cutoff
# NOTE(review): a centred window of 200 rows yields only NaN when the top 1%
# has fewer than 200 users — confirm the dataset is large enough.
top_1_df['smoothed_sentiment'] = top_1_df['avg_sentiment'].rolling(window=200, center=True).mean()

sns.lineplot(data=top_1_df, x='percentile', y='smoothed_sentiment')
plt.title('Sentimento dos Usuários por Percentil (Top 1%)')
plt.xlabel('Percentil (Top 1%)')
plt.ylabel('Sentimento Médio')
# Ticks and labels both come from `percentis`. Taking the positions from
# result_df (0.01, 0.1, 0.25, ...) put e.g. the label "0.20%" at x = 0.1.
plt.xticks(percentis, [f'{p:.2f}%' for p in percentis])
plt.grid(True)
plt.show()

# Plot 2: mean sentiment of each cumulative "top p" group.
sns.lineplot(data=result_df, x='percentile', y='mean_sentiment', marker='o')
plt.title('Sentimento Médio Dos Usuários por Percentil')
plt.xlabel('Percentil')
plt.ylabel('Sentimento Médio')
plt.xticks(result_df['percentile'], [f'{p*100:.0f}%' for p in result_df['percentile']])
plt.grid(True)
plt.show()

## Análise Temporal do Sentimento para o Top 1% de Usuários

In [None]:
import json
from collections import defaultdict
from datetime import datetime
from dateutil.parser import parse
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# Minimum value of active_years() a user needs to be kept by the yearly filter below.
MIN_YEARS_ACTIVE = 3
# Read from the clock at run time, so the filter depends on when the cell is run.
CURRENT_YEAR = datetime.now().year


def extract_year(date_str):
    """Return the calendar year of ``date_str`` as parsed by ``dateutil.parser.parse``."""
    parsed = parse(date_str)
    return parsed.year


def active_years(start_year):
    """Return the number of years between ``start_year`` and ``CURRENT_YEAR``."""
    elapsed = CURRENT_YEAR - start_year
    return elapsed

# State filled while walking the posts.
user_first_activity = {}                                          # user id -> earliest year seen
user_sentiments_by_year = defaultdict(lambda: defaultdict(list))  # year -> user id -> sentiments
user_posts_by_year = defaultdict(lambda: defaultdict(int))        # year -> user id -> items authored

# Interaction graph (author -> owner of the item replied to). It is built here
# but not read again in this cell.
G = nx.DiGraph()


def _register_activity(user_id, year):
    """Keep the earliest year seen so far for ``user_id``.

    A plain ``setdefault`` would keep the year of whichever record happens to
    be read first, which is not necessarily the user's earliest one.
    """
    user_first_activity[user_id] = min(year, user_first_activity.get(user_id, year))


for item in tqdm(json_data, desc="Processing posts"):
    post_owner = item.get('owner_user_id')
    post_year = extract_year(item['creation_date'])
    if post_owner:
        _register_activity(post_owner, post_year)
        # Posts count towards participation, but their own sentiment is not recorded.
        user_posts_by_year[post_year][post_owner] += 1

    for comment in item.get('comments', []):
        commenter = comment.get('user_id')
        sentiment = comment.get('text_sentiment', 0) - 1
        if commenter:
            cyear = extract_year(comment['creation_date'])
            _register_activity(commenter, cyear)
            user_posts_by_year[cyear][commenter] += 1
            user_sentiments_by_year[cyear][commenter].append(sentiment)
            if post_owner and commenter != post_owner:
                G.add_edge(commenter, post_owner, weight=sentiment, type='comment')

    for answer in item.get('answers', []):
        answer_owner = answer.get('owner_user_id')
        sentiment = answer.get('body_sentiment', 0) - 1
        ayear = extract_year(answer['creation_date'])
        if answer_owner:
            _register_activity(answer_owner, ayear)
            user_posts_by_year[ayear][answer_owner] += 1
            user_sentiments_by_year[ayear][answer_owner].append(sentiment)
            if post_owner and answer_owner != post_owner:
                G.add_edge(answer_owner, post_owner, weight=sentiment, type='answer')

        for a_comment in answer.get('comments', []):
            ac_user = a_comment.get('user_id')
            sentiment = a_comment.get('text_sentiment', 0) - 1
            if ac_user:
                ac_year = extract_year(a_comment['creation_date'])
                _register_activity(ac_user, ac_year)
                user_posts_by_year[ac_year][ac_user] += 1
                user_sentiments_by_year[ac_year][ac_user].append(sentiment)
                if answer_owner and ac_user != answer_owner:
                    G.add_edge(ac_user, answer_owner, weight=sentiment, type='comment')

# Per-year analysis: mean sentiment of that year's 1% most active users.
yearly_sentiment = []

for year in sorted(user_posts_by_year.keys()):
    posts = user_posts_by_year[year]
    sentiments = user_sentiments_by_year[year]

    # Users active in this year who pass the age filter and have at least one
    # recorded sentiment in this year. The name is rebound on every iteration,
    # so after the loop it only holds the last year's users.
    eligible_users = {
        u for u in posts
        if active_years(user_first_activity.get(u, year)) >= MIN_YEARS_ACTIVE and sentiments[u]
    }

    user_data = [{
        'user_id': u,
        'posts': posts[u],
        'avg_sentiment': sum(sentiments[u]) / len(sentiments[u])
    } for u in eligible_users]

    if not user_data:
        continue

    df_year = pd.DataFrame(user_data)
    df_year = df_year.sort_values(by='posts', ascending=False).reset_index(drop=True)

    cutoff = max(1, int(len(df_year) * 0.01))  # always keep at least one user
    top_users = df_year.iloc[:cutoff]
    mean_sentiment = top_users['avg_sentiment'].mean()

    yearly_sentiment.append({
        'year': year,
        'mean_sentiment': mean_sentiment,
        'n_users': cutoff
    })

# Plot (years after 2022 are left out by a hard-coded cutoff).
df_result = pd.DataFrame(yearly_sentiment)
df_result = df_result[df_result['year'] <= 2022]
plt.plot(df_result['year'], df_result['mean_sentiment'], marker='o')
plt.title('Sentimento Médio dos Top 1% por Ano')
plt.xlabel('Ano')
plt.ylabel('Sentimento Médio')
plt.grid(True)
plt.tight_layout()
plt.show()

## Análise de Sentimento por Atributos de Usuário

Esta seção explora a relação entre o sentimento médio dos usuários e seus atributos, como reputação, visualizações, upvotes e downvotes.

In [None]:
# Spot-check one user record (keys of `users` are user ids as strings) to see
# which profile fields are available; raises KeyError if this id is absent.
users['6069']

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from dateutil.parser import parse
from collections import defaultdict

# Helper functions
def extract_year(date_str):
    """Return the calendar year of ``date_str`` as parsed by ``dateutil.parser.parse``."""
    parsed = parse(date_str)
    return parsed.year


def extract_country(location):
    """Return the text after the last comma of ``location``, stripped.

    Returns 'Unknown' when ``location`` is empty/None or contains no comma.
    """
    if not location or ',' not in location:
        return 'Unknown'
    return location.rsplit(',', 1)[1].strip()


def get_bucket(value, bins):
    """Return the label of the first bin edge that ``value`` does not exceed.

    ``bins`` is scanned in the given order. The label is ``"<= <edge>"`` for
    the first edge with ``value <= edge``; otherwise ``"> <last edge>"``.
    """
    label = next((f"<= {edge}" for edge in bins if value <= edge), None)
    if label is None:
        label = f"> {bins[-1]}"
    return label

# Minimum number of authored items (posts, answers, comments) a user needs in
# order to be included in this analysis.
MIN_INTERACTIONS = 10


def compute_bins(values, q=(0.1, 0.25, 0.5, 0.75)):
    """Return the sorted, de-duplicated integer percentiles of ``values``.

    Args:
        values: non-empty sequence of numbers.
        q: fractions in [0, 1] at which the percentiles are taken.

    Returns:
        Sorted list of distinct bin edges (percentiles truncated to int). With
        the default four fractions this gives at most four edges, i.e. five bins.

    Raises:
        ValueError: if ``values`` is empty (the percentiles would be NaN and
            turn into meaningless integers).
    """
    if len(values) == 0:
        raise ValueError("compute_bins() needs at least one value")
    return sorted(set(np.percentile(values, [v * 100 for v in q]).astype(int)))


def _as_user_key(raw_id):
    """Return ``raw_id`` as a string (the key type of ``users``), or None if it is missing.

    Converting first and testing afterwards would turn a missing id into the
    truthy string 'None'.
    """
    return str(raw_id) if raw_id else None


# 1. Count the items authored by each user.
user_interaction_counts = defaultdict(int)


def _count_interaction(raw_id):
    """Add one interaction for the user behind ``raw_id``, if there is one."""
    key = _as_user_key(raw_id)
    if key is not None:
        user_interaction_counts[key] += 1


for item in json_data:
    _count_interaction(item.get('owner_user_id'))

    for c in item.get('comments', []):
        _count_interaction(c.get('user_id'))

    for ans in item.get('answers', []):
        _count_interaction(ans.get('owner_user_id'))
        for ac in ans.get('comments', []):
            _count_interaction(ac.get('user_id'))

# 2. Select users with at least MIN_INTERACTIONS interactions. This has to
#    happen before the bins are computed: otherwise the bins are derived from
#    whatever `eligible_users` an earlier cell left behind.
eligible_users = {uid for uid, count in user_interaction_counts.items() if count >= MIN_INTERACTIONS}

# 3. Collect the profile attributes of the eligible users found in `users`
#    and derive the bin edges from their percentiles.
reputations, views, upvotes, downvotes = [], [], [], []

for uid in eligible_users:
    if uid in users:
        user = users[uid]
        reputations.append(user.get('reputation', 0))
        views.append(user.get('views', 0))
        upvotes.append(user.get('up_votes', 0))
        downvotes.append(user.get('down_votes', 0))

reputation_bins = compute_bins(reputations)
views_bins = compute_bins(views)
upvote_bins = compute_bins(upvotes)
downvote_bins = compute_bins(downvotes)

# 4. Build the dataset: one record per post, answer or comment written by an
#    eligible user with a known profile.
records = []


def _append_record(raw_id, entry, sentiment_key):
    """Append the record for ``entry`` if its author is eligible and has a profile.

    The record holds the entry's year, its sentiment minus 1 and the author's
    reputation / views / upvotes / downvotes bin labels.
    """
    uid = _as_user_key(raw_id)
    if uid in eligible_users and uid in users:
        user = users[uid]
        records.append({
            'year': extract_year(entry['creation_date']),
            'sentiment': entry[sentiment_key] - 1,
            'reputation_bin': get_bucket(user.get('reputation', 0), reputation_bins),
            'views_bin': get_bucket(user.get('views', 0), views_bins),
            'upvotes_bin': get_bucket(user.get('up_votes', 0), upvote_bins),
            'downvotes_bin': get_bucket(user.get('down_votes', 0), downvote_bins),
        })


for item in tqdm(json_data, desc="Processando dados"):
    _append_record(item.get('owner_user_id'), item, 'body_sentiment')

    for c in item.get('comments', []):
        _append_record(c.get('user_id'), c, 'text_sentiment')

    for ans in item.get('answers', []):
        _append_record(ans.get('owner_user_id'), ans, 'body_sentiment')

        for ac in ans.get('comments', []):
            _append_record(ac.get('user_id'), ac, 'text_sentiment')

## Visualização do Sentimento por Atributos de Usuário ao Longo do Tempo

In [None]:
# 4. Build the DataFrame and keep rows up to 2022 whose views bin is neither
#    '> 5000' nor '<= 10'.
# NOTE(review): these two labels are hard-coded while the bin edges are
# computed from percentiles, and the filter also removes those rows from the
# reputation / upvotes / downvotes plots — confirm both are intended.
df = pd.DataFrame(records)
rows_to_keep = (
    (df['year'] <= 2022)
    & (df['views_bin'] != '> 5000')
    & (df['views_bin'] != '<= 10')
)
df = df[rows_to_keep]

# 5. Plotting helper
def plot_grouped_sentiment(df, group_col, title):
    """Plot the yearly mean sentiment with one line per value of ``group_col``.

    Args:
        df: frame with 'year', 'sentiment' and ``group_col`` columns.
        group_col: name of the column whose values become separate lines.
        title: figure title.
    """
    # Mean sentiment per (group, year), reshaped to years x groups.
    pivot = (
        df.groupby([group_col, 'year'])['sentiment']
        .mean()
        .reset_index()
        .pivot(index='year', columns=group_col, values='sentiment')
    )

    ax = pivot.plot(figsize=(10, 6), marker='o')
    ax.set_title(title)
    ax.set_xlabel("Ano")
    ax.set_ylabel("Sentimento Médio")
    ax.grid(True)
    ax.legend(title=group_col, fontsize='small')
    plt.tight_layout()
    plt.show()

# 6. One figure per user attribute
for bin_column, figure_title in (
    ('reputation_bin', "Sentimento por Reputação (Faixas) ao Longo do Tempo"),
    ('views_bin', "Sentimento por Visualizações ao Longo do Tempo"),
    ('upvotes_bin', "Sentimento por Upvotes ao Longo do Tempo"),
    ('downvotes_bin', "Sentimento por Downvotes ao Longo do Tempo"),
):
    plot_grouped_sentiment(df, bin_column, figure_title)