In [1]:
import os
import sys
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
import networkx as nx
from pathlib import Path

import torch.nn.functional as F
from torchmetrics import Accuracy
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from torch_geometric.utils import to_networkx
import matplotlib.pyplot as plt
from torch_geometric.data import DataLoader
from sklearn.model_selection import train_test_split



In [2]:
# path = f"data\processed\balance_16k\preprocessing"
path = Path("./../../data/processed/balance_16k/preprocessing")
input_paths = [
    os.path.join(path, value) for i, value in enumerate(sorted(os.listdir(path)))
]
print(input_paths)
dataframes = [pd.read_csv(file, header=None) for file in input_paths]
int_codes, str_label = pd.factorize(dataframes[1][0])

# dataframes[0] = dataframes[0].iloc[:500]
# int_codes = int_codes[:500]

X_train, X_test, y_train, y_test = train_test_split(
    dataframes[0], int_codes, test_size=0.15, random_state=22
)

['..\\..\\data\\processed\\balance_16k\\preprocessing\\x_train.csv', '..\\..\\data\\processed\\balance_16k\\preprocessing\\y_train.csv']


In [3]:
station_list = list(pd.unique(dataframes[1][0]))
print(station_list)

['S', 'Pn', 'Lg', 'Pg', 'Sn', 'P']


In [4]:
def create_graph_for_gnn(X, y, num_segments=20):
    graphs = []
    segment_size = X.shape[1] // num_segments

    for row in range(X.shape[0]):
        G = nx.Graph()

        for node_idx in range(num_segments):
            start = node_idx * segment_size
            end = (node_idx + 1) * segment_size
            node_features = X.iloc[row][start:end]
            G.add_node(node_idx, features=node_features)

        for i in range(num_segments):
            for j in range(i + 1, num_segments):
                G.add_edge(i, j)

        edge_index = torch.tensor(list(G.edges)).t().contiguous()
        x = torch.tensor(
            [G.nodes[node]["features"] for node in G.nodes()], dtype=torch.float
        )

        graph_data = Data(x=x, edge_index=edge_index)

        graph_data.y = torch.tensor(y[row], dtype=torch.long)

        graphs.append(graph_data)

    return graphs


# s = dataframes[2]#.iloc[:200]
# y = int_codes#[:200]
graph_train_data = create_graph_for_gnn(X_train, y_train)
graph_test_data = create_graph_for_gnn(X_test, y_test)

In [5]:
graph_train_data[0]

Data(x=[20, 360], edge_index=[2, 190], y=4)

## Aggreration type as (aggr)

epochs = 10, training = 425 samples
| Aggregation types| Accuracy|
|-------------------|---------|
|    'mean'         | 36      |
|    'sum' or 'add' | 36      |
|   'max'           | 23      |
|   'min'           | 23      |
| 'median'          | 35      |
|   'mul'           | 24      |
|   'std'           | 36      |
|  'var'            | 36      |
| 'softmax'         | 36      |

When run on all the sample except testing accuracy is around 20% on average for all of these aggregation.

In [9]:
class GCN(torch.nn.Module):
    def __init__(self, aggr='add'):
        super(GCN, self, ).__init__()
        self.conv1 = GCNConv(360, 64, aggr)
        self.conv2 = GCNConv(64, 128, aggr)

        self.fc = torch.nn.Linear(2560, 6)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        x = x.view(data.num_graphs, -1)
        x = self.fc(x)
        return x


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GCN().to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

train_loader = DataLoader(graph_train_data, shuffle=True)
test_loader = DataLoader(graph_test_data, shuffle=False)


def train():
    model.train()
    total_loss = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, data.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)


for epoch in range(1, 10):
    train_loss = train()
    print(f"Epoch {epoch}, Train Loss: {train_loss:.4f},")

def evaluate(train_loader):
    model.eval()
    correct = 0
    total = 0
    acc = 0
    accuracy_metric = Accuracy(task="multiclass", num_classes=6).to(device)
    accuracy_metric.reset()
    with torch.no_grad():
        for data in train_loader:
            data = data.to(device)
            output = model(data)
            _, predicted = torch.max(output, dim=1)
            # print(f"Predicted: {predicted}, Actual: {data.y}")
            correct += (predicted == data.y).sum().item()
            total += data.y.size(0)
            acc += accuracy_metric(predicted, data.y)

    return correct / total, acc / len(train_loader)


correct, accu = evaluate(train_loader)
print(correct)
print(accu)

Epoch 1, Train Loss: 5806.1659,
Epoch 2, Train Loss: 63.9690,
Epoch 3, Train Loss: 79.0357,
Epoch 4, Train Loss: 1.7754,
Epoch 5, Train Loss: 1.7647,
Epoch 6, Train Loss: 1.7642,
Epoch 7, Train Loss: 1.7653,
Epoch 8, Train Loss: 1.7654,
Epoch 9, Train Loss: 1.7654,
0.18400923654206958
tensor(0.1840)


In [7]:
# def plot_graph(graph_data, title="Graph Visualization"):
#     """
#     Plots the graph created for the GNN.

#     Parameters:
#     - graph_data: A PyTorch Geometric Data object representing the graph.
#     - title: Title of the plot.
#     """
#     # Convert back to networkx for visualization
#     G = nx.Graph()

#     # Add nodes with features directly from the PyTorch tensor
#     node_features = graph_data.x  # Stay as a tensor
#     for i, features in enumerate(node_features):
#         G.add_node(i, features=features.tolist())  # Convert each feature to list


#     edge_index = graph_data.edge_index.t()
#     for edge in edge_index:
#         G.add_edge(int(edge[0]), int(edge[1]))

#     pos = nx.spring_layout(G)

#     plt.figure(figsize=(8, 6))

#     nx.draw(G, pos, with_labels=True, node_size=500, node_color="lightblue", font_size=10, font_weight="bold")

#     node_labels = {i: f"{i}" for i in G.nodes()}
#     nx.draw_networkx_labels(G, pos, labels=node_labels)

#     plt.title(title)
#     plt.show()

# plot_graph(graph_train_data[20], title="Sample Graph Visualization")