<a href="https://colab.research.google.com/github/vnavya2004/BTP/blob/main/GNN_WITH_EDGES.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch

!pip uninstall torch-scatter torch-sparse torch-geometric torch-cluster  --y
!pip install torch-scatter -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch-cluster -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install git+https://github.com/pyg-team/pytorch_geometric.git

[0mLooking in links: https://data.pyg.org/whl/torch-2.4.0+cu121.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.4.0%2Bcu121/torch_scatter-2.1.2%2Bpt24cu121-cp310-cp310-linux_x86_64.whl (10.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt24cu121
Looking in links: https://data.pyg.org/whl/torch-2.4.0+cu121.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.4.0%2Bcu121/torch_sparse-0.6.18%2Bpt24cu121-cp310-cp310-linux_x86_64.whl (5.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18+pt24cu121
Looking in links: https://data.pyg.org/whl/torch-2.4.0+cu121.html
Collecting torch-cluster
  Downloading https://da

In [2]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import GCNConv
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import pandas as pd
import numpy as np
from tqdm import tqdm
from google.colab import files
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import random

# Load dataset
uploaded = files.upload()
df = pd.read_excel(pd.ExcelFile(list(uploaded.keys())[0]), header=0)
df = df.sample(frac=0.1, random_state=42)

# Graph Preparation
tweets_column = 'tweet'
labels_column = 'label'
NUM_LABELS = len(df[labels_column].unique())
possible_labels = df[labels_column].unique()
label_dict = {possible_label: index for index, possible_label in enumerate(possible_labels)}
df['labels'] = df[labels_column].map(label_dict)

# Split the dataset
df_labeled, df_temp = train_test_split(df, stratify=df[labels_column], test_size=0.8)
df_unlabeled, df_test = train_test_split(df_temp, stratify=df_temp[labels_column], test_size=0.25)

# Load the XLM-RoBERTa model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
model = AutoModel.from_pretrained('xlm-roberta-base')

def extract_embeddings(texts):
    """Extract embeddings from XLM-RoBERTa model for a list of texts."""
    encoded_inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**encoded_inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Take the mean of token embeddings for each input
    return embeddings

def create_graph_data(df, feature_dim=768, top_k=5):
    """
    Creates a graph data object with edges based on similarity of node features.

    Args:
        df (pd.DataFrame): The input data containing tweets and labels.
        feature_dim (int): Dimensionality of node features.
        top_k (int): Number of most similar nodes to connect.

    Returns:
        Data: Graph data object containing node features, edges, and labels.
    """
    # Extract embeddings from XLM-RoBERTa
    texts = df[tweets_column].tolist()
    embeddings = extract_embeddings(texts)

    # Convert embeddings to numpy for similarity computation
    embeddings_np = embeddings.cpu().numpy()

    # Compute cosine similarity between all node embeddings
    similarity_matrix = cosine_similarity(embeddings_np)

    # Create edges based on top_k most similar nodes
    edge_index = []
    for i in range(len(similarity_matrix)):
        # Get indices of the top_k most similar nodes (excluding self)
        similar_nodes = np.argsort(-similarity_matrix[i])[1:top_k+1]
        for j in similar_nodes:
            edge_index.append([i, j])

    # Convert edge index to torch tensor
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

    # Create graph data
    data = Data(x=embeddings, edge_index=edge_index, y=torch.tensor(df['labels'].values, dtype=torch.long))
    return data

# Create graph data with feature embeddings and similarity-based edges
graph_data_labeled = create_graph_data(df_labeled, feature_dim=768, top_k=5)
graph_data_unlabeled = create_graph_data(df_unlabeled, feature_dim=768, top_k=5)
graph_data_test = create_graph_data(df_test, feature_dim=768, top_k=5)

def split_data(data, num_subgraphs=10, min_nodes_per_subgraph=10):
    """
    Splits a large graph data object into multiple smaller subgraphs.
    """
    node_indices = list(range(data.num_nodes))
    random.shuffle(node_indices)

    subgraphs = []
    nodes_per_subgraph = max(min_nodes_per_subgraph, len(node_indices) // num_subgraphs)

    for i in range(0, len(node_indices), nodes_per_subgraph):
        subgraph_node_indices = node_indices[i:i + nodes_per_subgraph]
        subgraph_node_indices = torch.tensor(subgraph_node_indices, dtype=torch.long)

        # Extract features and labels for the subgraph nodes
        subgraph_x = data.x[subgraph_node_indices]
        subgraph_y = data.y[subgraph_node_indices]

        # Filter edges that connect the nodes within this subgraph
        mask = torch.isin(data.edge_index[0], subgraph_node_indices) & torch.isin(data.edge_index[1], subgraph_node_indices)
        subgraph_edge_index = data.edge_index[:, mask]

        # Reindex edges for the new subgraph
        node_mapping = {old_idx: new_idx for new_idx, old_idx in enumerate(subgraph_node_indices.tolist())}
        subgraph_edge_index = torch.tensor(
            [[node_mapping[src.item()], node_mapping[dst.item()]] for src, dst in subgraph_edge_index.t()],
            dtype=torch.long
        ).t().contiguous()

        # Create subgraph Data object
        subgraph = Data(x=subgraph_x, edge_index=subgraph_edge_index, y=subgraph_y)
        subgraphs.append(subgraph)

    return subgraphs

# Example usage:
subgraphs_labeled = split_data(graph_data_labeled, num_subgraphs=10, min_nodes_per_subgraph=10)
subgraphs_unlabeled = split_data(graph_data_unlabeled, num_subgraphs=10, min_nodes_per_subgraph=10)
subgraphs_test = split_data(graph_data_test, num_subgraphs=10, min_nodes_per_subgraph=10)
batch_size = 4

# DataLoaders for batched subgraphs
dataloader_train = DataLoader(subgraphs_labeled, batch_size=batch_size, shuffle=True)
dataloader_unlabeled = DataLoader(subgraphs_unlabeled, batch_size=batch_size, shuffle=True)
dataloader_test = DataLoader(subgraphs_test, batch_size=batch_size, shuffle=False)

# Define GNN Model with correct input dimensions
class GNNModel(torch.nn.Module):
    def __init__(self, num_node_features, hidden_channels, num_classes):
        super(GNNModel, self).__init__()
        self.conv1 = GCNConv(num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return x

# Initialize the models with the corrected input feature size
student_model = GNNModel(num_node_features=768, hidden_channels=64, num_classes=NUM_LABELS)
teacher_model = GNNModel(num_node_features=768, hidden_channels=64, num_classes=NUM_LABELS)

# Set up the device for training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
student_model.to(device)
teacher_model.to(device)

# Copy student model parameters to the teacher model
teacher_model.load_state_dict(student_model.state_dict())

# Set up optimizer
optimizer = torch.optim.Adam(student_model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)
epochs = 10
alpha = 0.999  # EMA decay rate

# Define evaluation metrics
def compute_metrics(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    accuracy = accuracy_score(labels_flat, preds_flat)
    f1 = f1_score(labels_flat, preds_flat, average='weighted')
    precision = precision_score(labels_flat, preds_flat, average='weighted')
    recall = recall_score(labels_flat, preds_flat, average='weighted')
    return accuracy, f1, precision, recall

# Update teacher model using EMA of student model
def update_teacher(student_model, teacher_model, alpha):
    for student_param, teacher_param in zip(student_model.parameters(), teacher_model.parameters()):
        teacher_param.data = alpha * teacher_param.data + (1 - alpha) * student_param.data

# Training loop
for epoch in range(1, epochs + 1):
    student_model.train()
    teacher_model.eval()
    loss_train_total = 0
    progress_bar = tqdm(dataloader_train, desc=f'Epoch {epoch}', leave=False, disable=False)

    # Train the student on labeled data
    for batch in progress_bar:
        student_model.zero_grad()
        batch = batch.to(device)

        # Forward pass through the student model
        logits_student = student_model(batch.x, batch.edge_index)
        loss = F.cross_entropy(logits_student, batch.y)  # Supervised loss on labeled data

        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(student_model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

    # Consistency Loss on Unlabeled Data
    loss_consistency_total = 0
    for batch in DataLoader([graph_data_unlabeled], batch_size=batch_size, shuffle=True):
        student_model.zero_grad()
        batch = batch.to(device)

        with torch.no_grad():
            # Predictions from the teacher model
            logits_teacher = teacher_model(batch.x, batch.edge_index)

        # Predictions from the student model
        logits_student = student_model(batch.x, batch.edge_index)

        # Consistency loss between teacher and student predictions
        loss_consistency = F.mse_loss(logits_student, logits_teacher)
        loss_consistency_total += loss_consistency.item()

        # Backpropagation
        loss_consistency.backward()
        torch.nn.utils.clip_grad_norm_(student_model.parameters(), 1.0)
        optimizer.step()

    # Update the teacher model
    update_teacher(student_model, teacher_model, alpha)

    # Evaluation on the test set
    teacher_model.eval()
    predictions, true_labels = [], []
    loss_test_total = 0

    for batch in dataloader_test:
        batch = batch.to(device)
        with torch.no_grad():
            logits = student_model(batch.x, batch.edge_index)
            loss_test = F.cross_entropy(logits, batch.y)  # Supervised loss on test data
            loss_test_total += loss_test.item()
            logits = logits.detach().cpu().numpy()
            label_ids = batch.y.cpu().numpy()
            predictions.append(logits)
            true_labels.append(label_ids)

    predictions = np.concatenate(predictions, axis=0)
    true_labels = np.concatenate(true_labels, axis=0)
    accuracy, f1, precision, recall = compute_metrics(predictions, true_labels)

    # Print evaluation metrics
    print(
        f'\nEpoch {epoch}\n'
        f'Training Loss: {loss_train_total / len(dataloader_train):.3f}\n'
        f'Consistency Loss: {loss_consistency_total / len(dataloader_unlabeled):.3f}\n'
        f'Test Loss: {loss_test_total / len(dataloader_test):.3f}\n'
        f'Accuracy: {accuracy:.3f}\n'
        f'F1 Score: {f1:.3f}\n'
        f'Precision: {precision:.3f}\n'
        f'Recall: {recall:.3f}\n'
    )


Saving Arabic_Depression_10.000_Tweets.xlsx to Arabic_Depression_10.000_Tweets.xlsx


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]




Epoch 1
Training Loss: 0.752
Consistency Loss: 0.245
Test Loss: 0.670
Accuracy: 0.500
F1 Score: 0.398
Precision: 0.476
Recall: 0.500






Epoch 2
Training Loss: 0.669
Consistency Loss: 0.275
Test Loss: 0.656
Accuracy: 0.560
F1 Score: 0.502
Precision: 0.598
Recall: 0.560






Epoch 3
Training Loss: 0.660
Consistency Loss: 0.178
Test Loss: 0.648
Accuracy: 0.660
F1 Score: 0.650
Precision: 0.686
Recall: 0.660






Epoch 4
Training Loss: 0.646
Consistency Loss: 0.108
Test Loss: 0.659
Accuracy: 0.595
F1 Score: 0.551
Precision: 0.670
Recall: 0.595






Epoch 5
Training Loss: 0.653
Consistency Loss: 0.087
Test Loss: 0.664
Accuracy: 0.605
F1 Score: 0.559
Precision: 0.696
Recall: 0.605






Epoch 6
Training Loss: 0.665
Consistency Loss: 0.077
Test Loss: 0.666
Accuracy: 0.600
F1 Score: 0.548
Precision: 0.702
Recall: 0.600






Epoch 7
Training Loss: 0.650
Consistency Loss: 0.076
Test Loss: 0.662
Accuracy: 0.605
F1 Score: 0.555
Precision: 0.706
Recall: 0.605






Epoch 8
Training Loss: 0.646
Consistency Loss: 0.082
Test Loss: 0.658
Accuracy: 0.610
F1 Score: 0.563
Precision: 0.710
Recall: 0.610






Epoch 9
Training Loss: 0.644
Consistency Loss: 0.087
Test Loss: 0.654
Accuracy: 0.605
F1 Score: 0.559
Precision: 0.696
Recall: 0.605






Epoch 10
Training Loss: 0.647
Consistency Loss: 0.091
Test Loss: 0.653
Accuracy: 0.605
F1 Score: 0.559
Precision: 0.696
Recall: 0.605

