<a href="https://colab.research.google.com/github/vnavya2004/BTP/blob/main/Initial_Graph_MeanTeacher.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import torch

!pip uninstall torch-scatter torch-sparse torch-geometric torch-cluster  --y
!pip install torch-scatter -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch-cluster -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install git+https://github.com/pyg-team/pytorch_geometric.git

[0mLooking in links: https://data.pyg.org/whl/torch-2.4.0+cu121.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.4.0%2Bcu121/torch_scatter-2.1.2%2Bpt24cu121-cp310-cp310-linux_x86_64.whl (10.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt24cu121
Looking in links: https://data.pyg.org/whl/torch-2.4.0+cu121.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.4.0%2Bcu121/torch_sparse-0.6.18%2Bpt24cu121-cp310-cp310-linux_x86_64.whl (5.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m58.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18+pt24cu121
Looking in links: https://data.pyg.org/whl/torch-2.4.0+cu121.html
Collecting torch-cluster
  Downloading https://da

In [2]:
import torch
import pandas as pd
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from torch_geometric.loader import DataLoader
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from tqdm import tqdm
from google.colab import files

# Load the XLM-RoBERTa tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
xlm_model = AutoModel.from_pretrained("xlm-roberta-base")

# Upload and read data
uploaded = files.upload()
df = pd.read_excel(pd.ExcelFile(list(uploaded.keys())[0]), header=0)
df = df.sample(frac=0.1, random_state=42)

# Define columns and create label mappings
tweets_column = 'tweet'
labels_column = 'label'
NUM_LABELS = len(df[labels_column].unique())
possible_labels = df[labels_column].unique()
label_dict = {possible_label: index for index, possible_label in enumerate(possible_labels)}
df['labels'] = df[labels_column].map(label_dict)

# Split data into labeled, unlabeled, and test sets
df_labeled, df_temp = train_test_split(df, stratify=df[labels_column], test_size=0.8)
df_unlabeled, df_test = train_test_split(df_temp, stratify=df_temp[labels_column], test_size=0.25)

# Tokenize data and generate embeddings using XLM-RoBERTa
def generate_embeddings(df, tweets_column):
    inputs = tokenizer(df[tweets_column].tolist(), return_tensors='pt', padding=True, truncation=True, max_length=256)
    with torch.no_grad():
        outputs = xlm_model(**inputs)
    # Use mean pooling on the last hidden state
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings

# Generate embeddings for labeled, unlabeled, and test data
embeddings_labeled = generate_embeddings(df_labeled, tweets_column)
embeddings_unlabeled = generate_embeddings(df_unlabeled, tweets_column)
embeddings_test = generate_embeddings(df_test, tweets_column)

# Create PyTorch Geometric data object
def create_graph_data(embeddings, labels=None):
    num_nodes = embeddings.size(0)
    edge_index = torch.combinations(torch.arange(num_nodes), r=2).t()  # Full connectivity
    data = Data(x=embeddings, edge_index=edge_index)
    if labels is not None:
        data.y = labels
    return data

# Create graph data for labeled, unlabeled, and test datasets
data_labeled = create_graph_data(embeddings_labeled, torch.tensor(df_labeled['labels'].values))
data_unlabeled = create_graph_data(embeddings_unlabeled)
data_test = create_graph_data(embeddings_test, torch.tensor(df_test['labels'].values))

# Define GraphSAGE model
class GraphSAGE(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# Instantiate model, optimizer, and loss function
model = GraphSAGE(in_channels=embeddings_labeled.size(1), hidden_channels=128, out_channels=NUM_LABELS)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.CrossEntropyLoss()

# Set up training and evaluation data loaders
batch_size = 32
train_loader = DataLoader([data_labeled], batch_size=batch_size, shuffle=True)
unlabeled_loader = DataLoader([data_unlabeled], batch_size=batch_size)
test_loader = DataLoader([data_test], batch_size=batch_size)






Saving Arabic_Depression_10.000_Tweets.xlsx to Arabic_Depression_10.000_Tweets (4).xlsx


AttributeError: 'GlobalStorage' object has no attribute 'train_mask'

In [5]:
# Define training function
def train(model, loader):
    model.train()
    total_loss = 0
    for data in loader:
        optimizer.zero_grad()
        out = model(data.x, data.edge_index)
        # Access all labels for training since a train_mask is not defined
        loss = loss_fn(out, data.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

# Define evaluation function
def evaluate(model, loader):
    model.eval()
    preds, labels = [], []
    for data in loader:
        out = model(data.x, data.edge_index)
        pred = out.argmax(dim=1)
        preds.append(pred.cpu().numpy())
        labels.append(data.y.cpu().numpy())
    preds = np.concatenate(preds)
    labels = np.concatenate(labels)
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    return accuracy, f1, precision, recall

# Training loop
epochs = 50
for epoch in range(epochs):
    loss = train(model, train_loader)
    print(f"Epoch {epoch+1}, Loss: {loss:.4f}")

# Evaluation on test data
test_accuracy, test_f1, test_precision, test_recall = evaluate(model, test_loader)
print(f"Testing Accuracy: {test_accuracy}")
print(f"Testing F1 Score: {test_f1}")
print(f"Testing Precision: {test_precision}")
print(f"Testing Recall: {test_recall}")

Epoch 1, Loss: 0.5610
Epoch 2, Loss: 0.5659
Epoch 3, Loss: 0.5446
Epoch 4, Loss: 0.5338
Epoch 5, Loss: 0.5303
Epoch 6, Loss: 0.5118
Epoch 7, Loss: 0.5039
Epoch 8, Loss: 0.4939
Epoch 9, Loss: 0.4764
Epoch 10, Loss: 0.4699
Epoch 11, Loss: 0.4527
Epoch 12, Loss: 0.4416
Epoch 13, Loss: 0.4306
Epoch 14, Loss: 0.4153
Epoch 15, Loss: 0.4077
Epoch 16, Loss: 0.3916
Epoch 17, Loss: 0.3842
Epoch 18, Loss: 0.3691
Epoch 19, Loss: 0.3613
Epoch 20, Loss: 0.3472
Epoch 21, Loss: 0.3397
Epoch 22, Loss: 0.3267
Epoch 23, Loss: 0.3194
Epoch 24, Loss: 0.3071
Epoch 25, Loss: 0.3000
Epoch 26, Loss: 0.2886
Epoch 27, Loss: 0.2814
Epoch 28, Loss: 0.2717
Epoch 29, Loss: 0.2636
Epoch 30, Loss: 0.2561
Epoch 31, Loss: 0.2469
Epoch 32, Loss: 0.2406
Epoch 33, Loss: 0.2320
Epoch 34, Loss: 0.2254
Epoch 35, Loss: 0.2187
Epoch 36, Loss: 0.2111
Epoch 37, Loss: 0.2055
Epoch 38, Loss: 0.1988
Epoch 39, Loss: 0.1923
Epoch 40, Loss: 0.1872
Epoch 41, Loss: 0.1810
Epoch 42, Loss: 0.1754
Epoch 43, Loss: 0.1706
Epoch 44, Loss: 0.16