In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import sys
import pickle
import pandas as pd
import argparse
import os

sys.path.append("../source/")
from datasets import PlanetoidDataset, TadpoleDataset, SyntheticData, MyData 
from torch_geometric.data import Data
from torch_geometric.transforms import KNNGraph
from utils import utils

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, Normalizer

import sklearn
from typing import Callable, Tuple, List, Union, Optional, Dict

path_to_data = "../data/"

# TADPOLE Dataset

In [3]:
# Pickled TADPOLE arrays, located relative to the shared data directory.
path_tadpole_data = os.path.join(
    path_to_data,
    'Tadpole/tadpole_data.pickle',
)

In [4]:
# Read the pickled TADPOLE bundle and unpack its five arrays.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
with open(path_tadpole_data, 'rb') as tadpole_file:
    tadpole_bundle = pickle.load(tadpole_file)

X_, y_, train_mask_, test_mask_, weight_ = tadpole_bundle

In [5]:
# Random-forest baseline on TADPOLE, repeated over several seeds.
#
# The tensors, masks and splits below do not depend on the seed, so they are
# built once instead of once per iteration.

# Only index 0 along the last axis of the pickled arrays is used
# (presumably the first fold/split - confirm against the data preparation).
X = torch.from_numpy(X_[:,:,0]).float()
y = torch.from_numpy(y_[:,:,0]).float()

# The last `n_val` entries of the stored training mask are carved out as a
# validation split: they are zeroed in the training mask and are the only
# entries that can be set in the validation mask.
n_val = 50
fold_train_mask = train_mask_[:,0]
n_nodes = fold_train_mask.shape[0]

train_mask = torch.from_numpy(np.concatenate((fold_train_mask[:-n_val], np.zeros(n_val, dtype=int))))
test_mask = torch.from_numpy(test_mask_[:,0])
# Pad with `n_nodes - n_val` zeros so the mask always has one entry per row;
# this equals the previously hard-coded 514 when the mask has 564 entries.
val_mask = torch.from_numpy(np.concatenate((np.zeros(n_nodes - n_val, dtype=int), fold_train_mask[-n_val:])))

# torch.from_numpy always yields CPU tensors, so no device transfer is needed.
X_train, y_train = X[train_mask.bool()], y[train_mask.bool()]
X_test, y_test = X[test_mask.bool()], y[test_mask.bool()]
X_val, y_val = X[val_mask.bool()], y[val_mask.bool()]

# Collapse the per-class label columns to a single class index per row
# (assumes one-hot / score-per-class labels - TODO confirm).
y_train = y_train.argmax(dim=1)
y_test = y_test.argmax(dim=1)
y_val = y_val.argmax(dim=1)

tadpole_accs = []
for seed in [42,456789,67341,1204,1302]:
    # Only the forest's random state changes between runs.
    rf_model = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=seed)
    rf_model.fit(X_train, y_train)

    y_test_pred = rf_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_test_pred)

    print(f"Test Accuracy: {accuracy:.8f}")
    tadpole_accs.append(accuracy)

# Array form so mean/std can be computed in the next cell.
tadpole_accs = np.array(tadpole_accs)

Test Accuracy: 0.94736842
Test Accuracy: 0.94736842
Test Accuracy: 0.94736842
Test Accuracy: 0.94736842
Test Accuracy: 0.94736842


In [6]:
# Mean and standard deviation (NumPy default, ddof=0) of the per-seed test accuracies.
tadpole_mean, tadpole_std = tadpole_accs.mean(), tadpole_accs.std()
print("Accuracy of baseline on TADPOLE dataset: ", tadpole_mean, " +- ", tadpole_std)

Accuracy of baseline on TADPOLE dataset:  0.9473684210526315  +-  0.0


# ABIDE Dataset

In [7]:
# ABIDE feature table: tab-separated text without a header row.
# `abide_data` is the raw frame; the later visible cells go through read_nodes(abide_path) instead.
abide_path = os.path.join(path_to_data, 'Abide/parisot_features.csv')
abide_data = pd.read_csv(abide_path, header=None, sep="\t")

In [8]:
def read_nodes(
            node_dataset_path: str
        ) -> Tuple[pd.DataFrame, np.ndarray]:
    """Load a node table and split it into standardised features and labels.

    The file is read as tab-separated text without a header row, so columns
    are labelled by integer position. Every column except the last is treated
    as a continuous feature; the last column is the label.

    Args:
        node_dataset_path: Path to the tab-separated node file.

    Returns:
        A tuple ``(node_features_df, labels)``: the feature columns after
        standard scaling, and the label column as an ``np.int8`` array.
    """
    table = pd.read_csv(node_dataset_path, header=None, sep='\t')

    # Last column holds the label; everything before it is a feature.
    label_column = len(table.columns) - 1
    feature_columns = range(label_column)

    # The scaler is fitted on every row of the file (no train/test separation
    # happens at this stage) and the scaled values replace the raw ones.
    feature_names = list(feature_columns)
    table[feature_names] = StandardScaler().fit_transform(table[feature_names])

    node_features_df = table[feature_columns]
    labels = table[label_column].to_numpy(dtype=np.int8)
    return node_features_df, labels

In [9]:
# Scaled ABIDE feature frame and the matching label array.
abide_nodes = read_nodes(abide_path)
x_abide, y_abide = abide_nodes

In [10]:
# Test and validation each receive the same share, (1 - 0.7) / 2 of the data.
# NOTE(review): the unpacking assumes the helper returns the masks in
# (train, test, val) order - confirm against utils.
holdout_fraction = (1-0.7)/2
train_mask, test_mask, val_mask = utils.create_train_test_val_masks_levelled(
    y_abide,
    test_size=holdout_fraction,
    validation_size=holdout_fraction,
    random_state=42,
)

In [65]:
# Random-forest baseline on ABIDE, repeated over several seeds.
# Other baselines that were tried here:
#   LogisticRegression(random_state=seed)
#   sklearn.linear_model.RidgeClassifier(alpha=50, random_state=seed)

# The train/test slices are the same for every seed, so take them once.
x_abide_train, y_abide_train = x_abide[train_mask], y_abide[train_mask]
x_abide_test, y_abide_test = x_abide[test_mask], y_abide[test_mask]

accs_abide = []
for seed in (1204, 1302, 456789, 67341, 42):
    abide_model = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=seed)
    abide_model.fit(x_abide_train, y_abide_train)

    y_test_pred = abide_model.predict(x_abide_test)
    accuracy = accuracy_score(y_abide_test, y_test_pred)

    accs_abide.append(accuracy)
    print(f"Test Accuracy: {accuracy:.8f}")

# Array form so mean/std can be computed in the next cell.
accs_abide = np.array(accs_abide)

Test Accuracy: 0.68778281
Test Accuracy: 0.69230769
Test Accuracy: 0.71493213
Test Accuracy: 0.71493213
Test Accuracy: 0.71040724


In [67]:
# Mean and standard deviation (NumPy default, ddof=0) of the per-seed test accuracies.
abide_mean, abide_std = accs_abide.mean(), accs_abide.std()
print("Accuracy of baseline on ABIDE dataset: ", abide_mean, " +- ", abide_std)

Accuracy of baseline on ABIDE dataset:  0.7040723981900453  +-  0.011659817852239945


# CORA Dataset

In [26]:
# Load Cora through the Planetoid wrapper; NormalizeFeatures is applied as the access-time transform.
dataset = Planetoid(root=path_to_data, name='Cora', transform=NormalizeFeatures())

# Dataset-level summary (the leading empty entry reproduces the blank separator line).
dataset_summary = [
    '',
    f'Dataset: {dataset}:',
    '======================',
    f'Number of graphs: {len(dataset)}',
    f'Number of features: {dataset.num_features}',
    f'Number of classes: {dataset.num_classes}',
]
print('\n'.join(dataset_summary))

data = dataset[0]  

# Gather some statistics about the graph.
graph_summary = [
    '',
    str(data),
    '===========================================================================================================',
    f'Number of nodes: {data.num_nodes}',
    f'Number of edges: {data.num_edges}',
    f'Average node degree: {data.num_edges / data.num_nodes:.2f}',
    f'Has isolated nodes: {data.has_isolated_nodes()}',
    f'Has self-loops: {data.has_self_loops()}',
    f'Is undirected: {data.is_undirected()}',
]
print('\n'.join(graph_summary))


Dataset: Cora():
Number of graphs: 1
Number of features: 1433
Number of classes: 7

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])
Number of nodes: 2708
Number of edges: 10556
Average node degree: 3.90
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [20]:
# Use the commonly used Cora setup: 1708 nodes for training, 500 for validation, and 500 for testing
# (train = everything before the last 1000 nodes, then 500 validation nodes, then 500 test nodes).
#
# The masks are edited in place through `data`, the object the following cells
# read from. Going through `dataset[0]` for every statement performs a new
# dataset look-up (re-applying the NormalizeFeatures transform) each time and
# only reaches `data` if the returned objects happen to share mask storage.
n_nodes = data.num_nodes
val_start, test_start = n_nodes - 1000, n_nodes - 500

data.train_mask.fill_(False)
data.train_mask[:val_start] = True
data.val_mask.fill_(False)
data.val_mask[val_start:test_start] = True
data.test_mask.fill_(False)
data.test_mask[test_start:] = True

In [21]:
# Slice Cora's node features and labels with each of the three split masks.
x_train_cora, y_train_cora = data.x[data.train_mask], data.y[data.train_mask]
x_test_cora, y_test_cora = data.x[data.test_mask], data.y[data.test_mask]
x_val_cora, y_val_cora = data.x[data.val_mask], data.y[data.val_mask]

In [39]:
# Ridge-classifier baseline on Cora, repeated over several seeds.
# Other baselines that were tried here:
#   RandomForestClassifier(n_estimators=800, max_depth=80, random_state=seed)
#   LogisticRegression()
# NOTE(review): the recorded accuracies are identical for every seed, which is
# consistent with random_state only mattering for RidgeClassifier's stochastic
# ('sag'/'saga') solvers - confirm whether a seed sweep is meaningful here.
cora_accs = []

for seed in (42, 456789, 67341, 1204, 1302):
    model_cora = sklearn.linear_model.RidgeClassifier(alpha=0.1, random_state=seed)

    # fit() returns the estimator itself, so training and prediction can be chained.
    y_test_pred_cora = model_cora.fit(x_train_cora, y_train_cora).predict(x_test_cora)
    accuracy = accuracy_score(y_test_cora, y_test_pred_cora)

    cora_accs.append(accuracy)
    print(f"Test Accuracy: {accuracy:.8f}")

# Array form so mean/std can be computed in the next cell.
cora_accs = np.array(cora_accs)

Test Accuracy: 0.78600000
Test Accuracy: 0.78600000
Test Accuracy: 0.78600000
Test Accuracy: 0.78600000
Test Accuracy: 0.78600000


In [40]:
# Mean and standard deviation (NumPy default, ddof=0) of the per-seed test accuracies.
cora_mean, cora_std = cora_accs.mean(), cora_accs.std()
print("Accuracy of baseline on CORA dataset: ", cora_mean, " +- ", cora_std)

Accuracy of baseline on CORA dataset:  0.786  +-  0.0


# CiteSeer Dataset

In [27]:
# Load CiteSeer through the Planetoid wrapper; NormalizeFeatures is applied as the access-time transform.
dataset = Planetoid(root=path_to_data, name='CiteSeer', transform=NormalizeFeatures())

# Dataset-level summary (the leading empty entry reproduces the blank separator line).
dataset_summary = [
    '',
    f'Dataset: {dataset}:',
    '======================',
    f'Number of graphs: {len(dataset)}',
    f'Number of features: {dataset.num_features}',
    f'Number of classes: {dataset.num_classes}',
]
print('\n'.join(dataset_summary))

data = dataset[0]  

# Gather some statistics about the graph.
graph_summary = [
    '',
    str(data),
    '===========================================================================================================',
    f'Number of nodes: {data.num_nodes}',
    f'Number of edges: {data.num_edges}',
    f'Average node degree: {data.num_edges / data.num_nodes:.2f}',
    f'Has isolated nodes: {data.has_isolated_nodes()}',
    f'Has self-loops: {data.has_self_loops()}',
    f'Is undirected: {data.is_undirected()}',
]
print('\n'.join(graph_summary))

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.test.index
Processing...



Dataset: CiteSeer():
Number of graphs: 1
Number of features: 3703
Number of classes: 6

Data(x=[3327, 3703], edge_index=[2, 9104], y=[3327], train_mask=[3327], val_mask=[3327], test_mask=[3327])
Number of nodes: 3327
Number of edges: 9104
Average node degree: 2.74
Has isolated nodes: True
Has self-loops: False
Is undirected: True


Done!


In [35]:
# Use the commonly used CiteSeer setup: 2327 nodes for training, 500 for validation, and 500 for testing
# (train = everything before the last 1000 nodes, then 500 validation nodes, then 500 test nodes).
#
# The masks are edited in place through `data`, the object the following cells
# read from. Going through `dataset[0]` for every statement performs a new
# dataset look-up (re-applying the NormalizeFeatures transform) each time and
# only reaches `data` if the returned objects happen to share mask storage.
n_nodes = data.num_nodes
val_start, test_start = n_nodes - 1000, n_nodes - 500

data.train_mask.fill_(False)
data.train_mask[:val_start] = True
data.val_mask.fill_(False)
data.val_mask[val_start:test_start] = True
data.test_mask.fill_(False)
data.test_mask[test_start:] = True

In [37]:
# Slice CiteSeer's node features and labels with each of the three split masks.
x_train_citeseer, y_train_citeseer = data.x[data.train_mask], data.y[data.train_mask]
x_test_citeseer, y_test_citeseer = data.x[data.test_mask], data.y[data.test_mask]
x_val_citeseer, y_val_citeseer = data.x[data.val_mask], data.y[data.val_mask]

In [38]:
# Ridge-classifier baseline on CiteSeer, repeated over several seeds.
# Other baselines that were tried here:
#   RandomForestClassifier(n_estimators=800, max_depth=80, random_state=seed)
#   LogisticRegression()
citeseer_accs = []

for seed in [42,456789,67341,1204,1302]:
    model_citeseer = sklearn.linear_model.RidgeClassifier(alpha=0.1, random_state=seed)
    # Fit the model created on the line above. The call used to go to the
    # misspelled name `model_citesser`, which raises NameError on a fresh
    # kernel (or fits a stale object and leaves `model_citeseer` unfitted).
    model_citeseer.fit(x_train_citeseer, y_train_citeseer)

    y_test_pred_citeseer = model_citeseer.predict(x_test_citeseer)
    accuracy = accuracy_score(y_test_citeseer, y_test_pred_citeseer)
    citeseer_accs.append(accuracy)
    print(f"Test Accuracy: {accuracy:.8f}")

# Array form so mean/std can be computed in the next cell.
citeseer_accs = np.array(citeseer_accs)

Test Accuracy: 0.77200000
Test Accuracy: 0.77200000
Test Accuracy: 0.77200000
Test Accuracy: 0.77200000
Test Accuracy: 0.77200000


In [41]:
# Mean and standard deviation (NumPy default, ddof=0) of the per-seed test accuracies.
citeseer_mean, citeseer_std = citeseer_accs.mean(), citeseer_accs.std()
print("Accuracy of baseline on CiteSeer dataset: ", citeseer_mean, " +- ", citeseer_std)

Accuracy of baseline on CiteSeer dataset:  0.772  +-  0.0


# PubMed Dataset

In [42]:
# Load PubMed through the Planetoid wrapper; NormalizeFeatures is applied as the access-time transform.
dataset = Planetoid(root=path_to_data, name='PubMed', transform=NormalizeFeatures())

# Dataset-level summary (the leading empty entry reproduces the blank separator line).
dataset_summary = [
    '',
    f'Dataset: {dataset}:',
    '======================',
    f'Number of graphs: {len(dataset)}',
    f'Number of features: {dataset.num_features}',
    f'Number of classes: {dataset.num_classes}',
]
print('\n'.join(dataset_summary))

data = dataset[0]  

# Gather some statistics about the graph.
graph_summary = [
    '',
    str(data),
    '===========================================================================================================',
    f'Number of nodes: {data.num_nodes}',
    f'Number of edges: {data.num_edges}',
    f'Average node degree: {data.num_edges / data.num_nodes:.2f}',
    f'Has isolated nodes: {data.has_isolated_nodes()}',
    f'Has self-loops: {data.has_self_loops()}',
    f'Is undirected: {data.is_undirected()}',
]
print('\n'.join(graph_summary))

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.test.index
Processing...



Dataset: PubMed():
Number of graphs: 1
Number of features: 500
Number of classes: 3

Data(x=[19717, 500], edge_index=[2, 88648], y=[19717], train_mask=[19717], val_mask=[19717], test_mask=[19717])
Number of nodes: 19717
Number of edges: 88648
Average node degree: 4.50
Has isolated nodes: False
Has self-loops: False
Is undirected: True


Done!


In [45]:
# Use the commonly used PubMed setup: 18717 nodes for training, 500 for validation, and 500 for testing
# (train = everything before the last 1000 nodes, then 500 validation nodes, then 500 test nodes).
#
# The masks are edited in place through `data`, the object the following cells
# read from. Going through `dataset[0]` for every statement performs a new
# dataset look-up (re-applying the NormalizeFeatures transform) each time and
# only reaches `data` if the returned objects happen to share mask storage.
n_nodes = data.num_nodes
val_start, test_start = n_nodes - 1000, n_nodes - 500

data.train_mask.fill_(False)
data.train_mask[:val_start] = True
data.val_mask.fill_(False)
data.val_mask[val_start:test_start] = True
data.test_mask.fill_(False)
data.test_mask[test_start:] = True

In [48]:
# Slice PubMed's node features and labels with each of the three split masks.
x_train_pubmed, y_train_pubmed = data.x[data.train_mask], data.y[data.train_mask]
x_test_pubmed, y_test_pubmed = data.x[data.test_mask], data.y[data.test_mask]
x_val_pubmed, y_val_pubmed = data.x[data.val_mask], data.y[data.val_mask]

In [50]:
# Ridge-classifier baseline on PubMed, repeated over several seeds.
# Other baselines that were tried here:
#   RandomForestClassifier(n_estimators=800, max_depth=80, random_state=seed)
#   LogisticRegression()
# NOTE(review): the recorded accuracies are identical for every seed, which is
# consistent with random_state only mattering for RidgeClassifier's stochastic
# ('sag'/'saga') solvers - confirm whether a seed sweep is meaningful here.
pubmed_accs = []

for seed in (42, 456789, 67341, 1204, 1302):
    model_pubmed = sklearn.linear_model.RidgeClassifier(alpha=0.1, random_state=seed)

    # fit() returns the estimator itself, so training and prediction can be chained.
    y_test_pred_pubmed = model_pubmed.fit(x_train_pubmed, y_train_pubmed).predict(x_test_pubmed)
    accuracy = accuracy_score(y_test_pubmed, y_test_pred_pubmed)

    pubmed_accs.append(accuracy)
    print(f"Test Accuracy: {accuracy:.8f}")

# Array form so mean/std can be computed in the next cell.
pubmed_accs = np.array(pubmed_accs)

Test Accuracy: 0.88400000
Test Accuracy: 0.88400000
Test Accuracy: 0.88400000
Test Accuracy: 0.88400000
Test Accuracy: 0.88400000


In [51]:
# Mean and standard deviation (NumPy default, ddof=0) of the per-seed test accuracies.
pubmed_mean, pubmed_std = pubmed_accs.mean(), pubmed_accs.std()
print("Accuracy of baseline on PubMed dataset: ", pubmed_mean, " +- ", pubmed_std)

Accuracy of baseline on PubMed dataset:  0.884  +-  0.0
