In [22]:
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [23]:
CLFS = ["kpr", "ktr", "lpr", "ltr", "sfr", "stmk", "xfr", "xpr", "xtr", "kfr", "ktmk", "lfr", "ltmk", "spr", "str", "xlnet_softmax", "xtmk", "rep_bert"]

y_train = []
y_test = []
X_test = []
X_train = []

FX_test = np.load("/home/welton/data/representations/webkb/10_folds/tmk/0/test.npz", allow_pickle=True)["X_test"].tolist().toarray()
FX_train = np.load("/home/welton/data/representations/webkb/10_folds/tmk/0/train.npz", allow_pickle=True)["X_train"].tolist().toarray()

map_labels = lambda x: 1 if x == 0 else 0
vecf = np.vectorize(map_labels)

y_train = vecf(np.load(f"/home/welton/data/oracle/upper_bound/webkb/ktmk/0/train.npz")['y'])
y_test = vecf(np.load(f"/home/welton/data/oracle/upper_bound/webkb/ktmk/0/test.npz")['y'])

X_train = np.load(f"/home/welton/data/clfs_output/split_10/webkb/10_folds/ktmk/0/train.npz")["X_train"]
X_test = np.load(f"/home/welton/data/clfs_output/split_10/webkb/10_folds/ktmk/0/test.npz")["X_test"]

#X_train = np.hstack([X_train, FX_train])
#X_test = np.hstack([X_test, FX_test])


#for clf in CLFS:
#
#    y_train.append(np.load(f"/home/welton/data/oracle/upper_bound/webkb/{clf}/0/train.npz")['y'])
#    y_test.append(np.load(f"/home/welton/data/oracle/upper_bound/webkb/{clf}/0/test.npz")['y'])
#    
#    X_test.append(np.load(f"/home/welton/data/clfs_output/split_10/webkb/10_folds/{clf}/0/test.npz")["X_test"])
#    X_train.append(np.load(f"/home/welton/data/clfs_output/split_10/webkb/10_folds/{clf}/0/train.npz")["X_train"])
#
#y_train = np.hstack(y_train)
#y_test = np.hstack(y_test)
#X_test = np.vstack(X_test)
#X_train = np.vstack(X_train)
#

In [24]:
X_train.shape, y_train.shape

((7376, 7), (7376,))

In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [44]:
class NN(nn.Module):

    def __init__(self, input_size: int) -> None:
        super().__init__()

        self.input_size = input_size

        self.hidden_layers = nn.Sequential(
            nn.Linear(self.input_size, self.input_size),
            nn.ReLU(),
            nn.Linear(self.input_size, self.input_size),
            nn.ReLU()
        )

        self.output_layer = nn.Sequential(
            nn.Linear(self.input_size, self.input_size),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        
        x = self.hidden_layers(x)
        return self.output_layer(x)

class Data(Dataset):

    def __init__(self, data: tuple):
        super().__init__()

        x = data[0]
        y = data[1]
        self.x = torch.from_numpy(x)
        self.y = torch.from_numpy(y)
        self.n_samples = x.shape[0]
    
    def __getitem__(self, index: int):
        return self.x[index], self.y[index]
    
    def __len__(self):
        return self.n_samples

In [45]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [46]:
model = NN(X_train.shape[1]).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [47]:
data = Data((X_train, y_train))
train_loader = DataLoader(dataset = data, batch_size = 32, shuffle=True)

In [53]:
n_total_steps = len(train_loader)
epochs = 50
for epoch in range(epochs):
    rloss = 0.0
    for i, (x, y)  in enumerate(train_loader):

        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        
        # Passing data throug the net.
        outputs = model(x.float())
        loss = criterion(outputs, y)

        # Backward and optmize.
        #optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print(
            f'\t\tEpoch [{epoch+1}/{epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}', end="\r")



		Epoch [50/50], Step [200/231], Loss: 1.3453

In [54]:
X = torch.from_numpy(X_test)
y = torch.from_numpy(y_test)

In [55]:
with torch.no_grad():
    preds = []
    for x in X.to(device):
        preds.append(model(x.float()))

preds = np.array(torch.stack(preds).cpu().numpy())

In [56]:
preds.argmax(axis=1)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [265]:
np.unique(y_train, return_counts=True)

(array([0, 1]), array([5693, 1683]))

In [266]:
y_train.shape, X_train.shape, y_test.shape, X_test.shape

((7376,), (7376, 7), (823,), (823, 7))

In [267]:
#y_train.shape, FX_train.shape, y_test.shape, FX_test.shape

In [272]:
#clf = LogisticRegression(max_iter=200)
clf = KNeighborsClassifier(n_neighbors=5, metric="euclidean")
#clf = GradientBoostingClassifier()
#clf = MLPClassifier(hidden_layer_sizes=(10, 10, 10), activation="relu", n_iter_no_change=15)
#clf = GaussianNB()

In [273]:
clf.fit(X_train, y_train)

In [274]:
y_pred = clf.predict(X_test)

In [275]:
precision_score(y_test, y_pred, pos_label=1), recall_score(y_test, y_pred, pos_label=1)

(0.22033898305084745, 0.0718232044198895)

In [259]:
np.unique(y_pred, return_counts=True)

(array([0]), array([823]))