In [1]:
# Standard libraries
import ast
import numpy as np
import pandas as pd

# Visualization (add if needed)
# import matplotlib.pyplot as plt
# import seaborn as sns

# Machine Learning & Processing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Network Analysis
import networkx as nx

# PyTorch & PyTorch Geometric
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
EMBEDDING_DIM = 768 
COMPETENCIAS = ['comp1', 'comp2', 'comp3', 'comp4', 'comp5']

In [3]:
def load_and_prepare_data():
    df = pd.read_csv('C:/Users/RWP/Documents/Estudos/Cogna/02_output/train_embeddings.csv')
    
    features = [
        'first_person_total', 'enclisis_count', 'demonstrative_pronouns',
        'tokens_count', 'first_person_per_token', 'enclisis_per_token',
        'demonstrative_per_token'
    ]
    
    scaler = MinMaxScaler()
    df[features] = scaler.fit_transform(df[features])
    
    df['node_features'] = df.apply(lambda row: np.concatenate([
        row[features].values,
        row['essayfull_embedding'][:EMBEDDING_DIM]  # Usar apenas parte do embedding se necessário
    ]), axis=1)
    
    return df

In [4]:
def build_graph(df):
    import ast
    def parse_embedding(embedding_str):
        try:
            if isinstance(embedding_str, str):
                return np.array(ast.literal_eval(embedding_str))
            return embedding_str
        except:
            return np.zeros(768)

    for col in ["prompt_embedding", "essay_full_embedding", "essay_conclusion_embedding"]:
        df[col] = df[col].apply(parse_embedding)

    df["essay"] = df["essay"].apply(
        lambda x: ([parse_embedding(e) for e in ast.literal_eval(x)] if isinstance(x, str) else x)
    )

    G = nx.Graph()

    for _, row in df.iterrows():
        prompt_node = f"prompt_{row['prompt_id']}"
        full_essay_node = f"full_essay_{row['prompt_id']}"

        G.add_node(prompt_node, type="prompt", embedding=row["prompt_embedding"])

        G.add_node(
            full_essay_node,
            type="full_essay",
            embedding=row["essay_full_embedding"],
            features={f: row[f] for f in [
                "first_person_total", "enclisis_count", "demonstrative_pronouns",
                "tokens_count", "first_person_per_token", "enclisis_per_token",
                "demonstrative_per_token"
            ]}
        )

        for comp in ["comp1", "comp2", "comp3", "comp4", "comp5"]:
            G.nodes[full_essay_node][comp] = row[comp]

        G.add_edge(prompt_node, full_essay_node, relation="prompt_to_essay")

        for i, para_embedding in enumerate(row["essay"]):
            para_node = f"para_{row['prompt_id']}_{i}"
            G.add_node(para_node, type="paragraph", paragraph_idx=i, embedding=para_embedding)
            G.add_edge(full_essay_node, para_node, relation="essay_to_paragraph")
            if i > 0:
                prev_para_node = f"para_{row['prompt_id']}_{i-1}"
                G.add_edge(prev_para_node, para_node, relation="paragraph_flow")

        conclusion_node = f"conclusion_{row['prompt_id']}"
        G.add_node(conclusion_node, type="conclusion", embedding=row["essay_conclusion_embedding"])
        last_para_node = f"para_{row['prompt_id']}_{len(row['essay']) - 1}"
        G.add_edge(last_para_node, conclusion_node, relation="paragraph_to_conclusion")
        G.add_edge(full_essay_node, conclusion_node, relation="essay_to_conclusion")

    return G


In [5]:
# NOTE Exemplo de uso:
G_train = build_graph(pd.read_csv('../02_output/train_embeddings.csv'))
G_valid = build_graph(pd.read_csv('../02_output/valid_embeddings.csv'))
G_test = build_graph(pd.read_csv('../02_output/test_embeddings.csv'))

In [6]:
print(f"Número de nós: {len(G_train.nodes())}")
print(f"Número de arestas: {len(G_train.edges())}")

# Exemplo de acesso aos dados
for node in list(G_train.nodes())[:3]:  # Primeiros 3 nós
    print(f"\nNó: {node}")
    print(f"Tipo: {G_train.nodes[node]['type']}")
    if 'features' in G_train.nodes[node]:
        print(f"Features: {G_train.nodes[node]['features']}")

Número de nós: 344
Número de arestas: 344

Nó: prompt_60
Tipo: prompt

Nó: full_essay_60
Tipo: full_essay
Features: {'first_person_total': 0, 'enclisis_count': 0, 'demonstrative_pronouns': 5, 'tokens_count': 397, 'first_person_per_token': 0.0, 'enclisis_per_token': 0.0, 'demonstrative_per_token': 0.0125944584382871}

Nó: para_60_0
Tipo: paragraph


In [None]:
class GCNMultiTaskImproved(nn.Module):
    def __init__(self, num_features, hidden_dim=128, num_outputs=5):
        super().__init__()
        self.conv1 = GCNConv(num_features, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        self.dropout = nn.Dropout(0.3)

        self.proj = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, num_outputs)
        )

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.dropout(x)

        x = self.conv2(x, edge_index)
        x = self.bn2(x)
        x = F.relu(x)
        x = self.dropout(x)

        return self.proj(x)

In [16]:
from torch_geometric.nn import GATConv

class GATMultiTask(nn.Module):
    def __init__(self, num_features, hidden_dim=128, num_outputs=5, heads=4):
        super().__init__()
        self.gat1 = GATConv(num_features, hidden_dim, heads=heads, dropout=0.3)
        self.gat2 = GATConv(hidden_dim * heads, hidden_dim, heads=1, concat=False, dropout=0.3)
        
        self.dropout = nn.Dropout(0.3)

        self.out_proj = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, num_outputs)
        )

    def forward(self, x, edge_index):
        x = self.gat1(x, edge_index)
        x = F.elu(x)
        x = self.dropout(x)

        x = self.gat2(x, edge_index)
        x = F.elu(x)
        x = self.dropout(x)

        return self.out_proj(x)


In [17]:
from torch_geometric.data import Data

def nx_to_pyg(G, competencias=["comp1", "comp2", "comp3", "comp4", "comp5"]):
    node_features = []
    node_mapping = {}
    labels = []
    mask = []

    for i, node in enumerate(G.nodes()):
        node_mapping[node] = i
        embedding = G.nodes[node].get("embedding", np.zeros(768))
        feature_vector = torch.tensor(embedding, dtype=torch.float32)
        node_features.append(feature_vector)

        if G.nodes[node]['type'] == 'full_essay':
            y = [G.nodes[node].get(c, 0.0) for c in competencias]
            labels.append(torch.tensor(y, dtype=torch.float32))
            mask.append(True)
        else:
            labels.append(torch.zeros(len(competencias)))
            mask.append(False)

    x = torch.stack(node_features)
    y = torch.stack(labels)
    mask = torch.tensor(mask, dtype=torch.bool)

    edges = [(node_mapping[u], node_mapping[v]) for u, v in G.edges()]
    edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()

    data = Data(x=x, edge_index=edge_index, y=y)
    data.mask = mask
    return data


In [18]:
data_train = nx_to_pyg(G_train, competencias=COMPETENCIAS)
data_valid = nx_to_pyg(G_valid, competencias=COMPETENCIAS)
data_test = nx_to_pyg(G_test, competencias=COMPETENCIAS)

In [11]:
def train(model, data, optimizer, criterion):
    model.train()
    optimizer.zero_grad()
    out = model(data.x.to(DEVICE), data.edge_index.to(DEVICE))
    loss = criterion(out[data.mask], data.y[data.mask].to(DEVICE))
    loss.backward()
    optimizer.step()
    return loss.item()

def evaluate(model, data, criterion):
    model.eval()
    with torch.no_grad():
        out = model(data.x.to(DEVICE), data.edge_index.to(DEVICE))
        loss = criterion(out[data.mask], data.y[data.mask].to(DEVICE))
        return loss.item(), out[data.mask].cpu()


In [19]:
model = GATMultiTask(num_features=EMBEDDING_DIM).to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

EPOCHS = 6000

for epoch in range(1, EPOCHS + 1):
    loss = train(model, data_train, optimizer, criterion)
    val_loss, val_pred = evaluate(model, data_valid, criterion)
    print(f"Epoch {epoch:03d} | Train Loss: {loss:.4f} | Val Loss: {val_loss:.4f}")


Epoch 001 | Train Loss: 15133.3604 | Val Loss: 15061.0186
Epoch 002 | Train Loss: 15131.4746 | Val Loss: 15058.7402
Epoch 003 | Train Loss: 15129.2422 | Val Loss: 15056.1875
Epoch 004 | Train Loss: 15126.5059 | Val Loss: 15053.2773
Epoch 005 | Train Loss: 15123.4219 | Val Loss: 15049.7979
Epoch 006 | Train Loss: 15120.4014 | Val Loss: 15045.7637
Epoch 007 | Train Loss: 15116.7617 | Val Loss: 15041.3340
Epoch 008 | Train Loss: 15111.4951 | Val Loss: 15036.1504
Epoch 009 | Train Loss: 15109.0654 | Val Loss: 15030.0137
Epoch 010 | Train Loss: 15101.0605 | Val Loss: 15022.6445
Epoch 011 | Train Loss: 15092.1016 | Val Loss: 15013.6631
Epoch 012 | Train Loss: 15083.8096 | Val Loss: 15002.6797
Epoch 013 | Train Loss: 15069.3906 | Val Loss: 14989.2158
Epoch 014 | Train Loss: 15056.9902 | Val Loss: 14972.9131
Epoch 015 | Train Loss: 15039.5625 | Val Loss: 14953.1807
Epoch 016 | Train Loss: 15016.4678 | Val Loss: 14929.4932
Epoch 017 | Train Loss: 15003.4229 | Val Loss: 14901.0439
Epoch 018 | Tr

In [20]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

def evaluate_on_test(model, data_test, competencias=["comp1", "comp2", "comp3", "comp4", "comp5"]):
    model.eval()
    with torch.no_grad():
        out = model(data_test.x.to(DEVICE), data_test.edge_index.to(DEVICE))
        y_true = data_test.y[data_test.mask].cpu().numpy()
        y_pred = out[data_test.mask].cpu().numpy()

    metrics = {}
    for i, comp in enumerate(competencias):
        y_true_i = y_true[:, i]
        y_pred_i = y_pred[:, i]

        metrics[comp] = {
            "R2": r2_score(y_true_i, y_pred_i),
            "MSE": mean_squared_error(y_true_i, y_pred_i),
            "MAPE": mean_absolute_percentage_error(y_true_i, y_pred_i),
        }

    return metrics, y_true, y_pred


In [21]:
metrics, y_true, y_pred = evaluate_on_test(model, data_test)

for comp, m in metrics.items():
    print(f"\nCompetência: {comp}")
    print(f"  R²:   {m['R2']:.4f}")
    print(f"  MSE:  {m['MSE']:.4f}")



Competência: comp1
  R²:   -0.0089
  MSE:  2458.7092

Competência: comp2
  R²:   -0.0049
  MSE:  2987.8801

Competência: comp3
  R²:   -0.0112
  MSE:  2428.4570

Competência: comp4
  R²:   -0.0018
  MSE:  3064.7681

Competência: comp5
  R²:   -0.0000
  MSE:  3151.5491
