In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd /content/drive/MyDrive

/content/drive/MyDrive


In [None]:
import pickle as pkl
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import warnings; warnings.filterwarnings('ignore')
import json
from datetime import datetime, timedelta
from collections import OrderedDict

In [None]:
model_name = 'mlp'
dataset = 'arx' # 'citeseer', 'cora', 'pubmed', 'arx'
task = 'classification' # 'classification', 'link_prediction'
feat_norm = False
n_iter = 10

In [None]:
with open('data/'+dataset+'.feature', 'rb') as f:
    X = pkl.load(f).todense()
with open('data/'+dataset+'.labels', 'rb') as f:
    y = pkl.load(f)

In [None]:
test_size = 0.45
val_size = 0.05
epochs = 200
if model_name == 'logreg':
    multi_class = 'ovr'
    setting_order = 'test_size, epochs, multi_class'
    settings = test_size, val_size, epochs, multi_class
if model_name == 'mlp':
    learning_rate = 0.05
    hidden_units = 32
    hidden_layer_num = 1
    setting_order = 'test_size, epochs, learning_rate, hidden_units, hidden_layer_num'
    settings = test_size, val_size, epochs, learning_rate, hidden_units, hidden_layer_num

### Logistic Regression

In [None]:
# results= []
# for i in range(n_iter):
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, shuffle=True)
#     X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.1, shuffle=True)
#     clf = LogisticRegression(multi_class=multi_class, max_iter=epochs).fit(X_train, y_train)
#     val_results = np.round(clf.score(X_val, y_val) , 6)
#     test_results =  np.round(clf.score(X_test, y_test), 6)
#     results.append(test_results)
#     print('Val: {:.2f} ± {:.2f}'.format(np.mean(val_results)*100, np.std(val_results)*100), 'Test: {:.2f} ± {:.2f}'.format(np.mean(test_results)*100, np.std(test_results)*100))
#     # print('\n)
# print('Model: ', clf, '\n')
# print(results)
# print('\nAverage performance: {:.2f} ± {:.2f}'.format(np.mean(results)*100, np.std(results)*100))

### MLP

In [None]:
results= []
for i in range(n_iter):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, shuffle=True)
    X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.1, shuffle=True)
    clf2 = MLPClassifier(max_iter=epochs, learning_rate_init=learning_rate, hidden_layer_sizes=(32,)).fit(X_train, y_train)
    val_results = np.round(clf2.score(X_val, y_val) , 6)
    test_results =  np.round(clf2.score(X_test, y_test), 6)
    results.append(test_results)
    print('Val: {:.2f} ± {:.2f}'.format(np.mean(val_results)*100, np.std(val_results)*100), 'Test: {:.2f} ± {:.2f}'.format(np.mean(test_results)*100, np.std(test_results)*100))
print('Model: ', clf2, '\n')
print(results)
print('\nAverage performance: {:.2f} ± {:.2f}'.format(np.mean(results)*100, np.std(results)*100))

Val: 64.80 ± 0.00 Test: 63.33 ± 0.00
Val: 67.20 ± 0.00 Test: 63.60 ± 0.00
Val: 61.40 ± 0.00 Test: 64.38 ± 0.00
Val: 61.60 ± 0.00 Test: 63.11 ± 0.00
Val: 64.00 ± 0.00 Test: 63.38 ± 0.00
Val: 62.80 ± 0.00 Test: 63.09 ± 0.00
Val: 62.20 ± 0.00 Test: 63.64 ± 0.00
Val: 61.60 ± 0.00 Test: 63.73 ± 0.00
Val: 62.80 ± 0.00 Test: 64.18 ± 0.00
Val: 65.40 ± 0.00 Test: 63.40 ± 0.00
Model:  MLPClassifier(hidden_layer_sizes=(32,), learning_rate_init=0.05) 

[0.633333, 0.636, 0.643778, 0.631111, 0.633778, 0.630889, 0.636444, 0.637333, 0.641778, 0.634]

Average performance: 63.58 ± 0.40


In [None]:
date = (datetime.now() + timedelta(hours=9)).strftime("%Y-%m-%d %H:%M:%S")

In [None]:
log_dict= OrderedDict()

In [104]:
log_dict['model'] = model_name
log_dict['dataset'] = dataset
log_dict['datetime'] = date
log_dict['iteration'] = n_iter
log_dict['setting_order_{}'.format(model_name)] = setting_order
log_dict['settings'] = settings
log_dict['acc'] = results
log_dict['acc_mean'] = np.mean(results)
log_dict['acc_std'] = np.std(results)
log_dict
pd.DataFrame(log_dict.items(), columns=['key', 'value'])

Unnamed: 0,key,value
0,model,mlp
1,dataset,arx
2,datetime,2021-11-30 23:27:22
3,iteration,10
4,setting_order_mlp,"test_size, epochs, learning_rate, hidden_units..."
5,settings,"(0.45, 0.05, 200, 0.05, 32, 1)"
6,acc,"[0.633333, 0.636, 0.643778, 0.631111, 0.633778..."
7,acc_mean,0.635844
8,acc_std,0.00402106


In [102]:
data = json.load(open('results_classification_baseline_v2.json'))
data.append(log_dict)
with open('results_classification_baseline_v2.json', 'w') as f:
    json.dump(data, f)

In [103]:
print("Last data saved at: {}".format((datetime.now() + timedelta(hours=9)).strftime("%Y-%m-%d %H:%M:%S")))
print("Total data num: {}".format(len(data)))

Last data saved at: 2021-11-30 23:28:31
Total data num: 8


In [None]:
# if wanna clear results(be careful!)
# empty = []
# with open('results_classification_baseline_v2.json', 'w') as f:
#     json.dump(empty, f)