In [1]:
# Mount Google Drive at /content/drive so files stored there can be read and written from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive

/content/drive/MyDrive


In [1]:
import json
import pickle as pkl
from collections import OrderedDict
from datetime import datetime, timedelta, timezone

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

import warnings; warnings.filterwarnings('ignore')

In [2]:
import sklearn

In [3]:
# Show the installed scikit-learn version so the results below can be tied to a library version.
sklearn.__version__

'0.24.2'

In [None]:
# Experiment configuration read by the cells below.
model_name = 'mlp'  # selects the hyperparameter branch; 'logreg' and 'mlp' are the values handled below
dataset = 'arx' # file stem under data/ — one of 'citeseer', 'cora', 'pubmed', 'arx'
task = 'classification' # 'classification', 'link_prediction'
feat_norm = False  # NOTE(review): not read by any cell visible here — confirm whether it is still needed
n_iter = 10  # number of independent random train/test splits to train and score

In [None]:
# Load the feature matrix X and the labels y for the selected dataset.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file —
# only load files from a trusted source.
with open('data/' + dataset + '.feature', 'rb') as f:
    # The pickled features are presumably a SciPy sparse matrix (confirm against the
    # data files). .toarray() returns a plain ndarray, whereas .todense() returned the
    # legacy np.matrix type, which newer scikit-learn releases refuse as estimator input.
    X = pkl.load(f).toarray()
with open('data/' + dataset + '.labels', 'rb') as f:
    y = pkl.load(f)

In [None]:
# Settings shared by every model.
test_size = 0.5  # fraction of the samples held out for testing in each split
epochs = 200     # handed to the estimators as max_iter

# Model-specific hyperparameters. `setting_order` names the fields of `settings`
# in order, so a logged record stays interpretable on its own.
if model_name == 'logreg':
    multi_class = 'ovr'
    setting_order = 'test_size, epochs, multi_class'
    settings = test_size, epochs, multi_class
elif model_name == 'mlp':
    learning_rate = 0.05
    hidden_units = 32
    hidden_layer_num = 1
    setting_order = 'test_size, epochs, learning_rate, hidden_units, hidden_layer_num'
    settings = test_size, epochs, learning_rate, hidden_units, hidden_layer_num
else:
    # Fail here rather than later with a NameError, or worse, by logging
    # settings left over from an earlier run in the same kernel.
    raise ValueError(
        "Unsupported model_name {!r}; expected 'logreg' or 'mlp'".format(model_name)
    )

### Logistic Regression

In [None]:
# Disabled logistic-regression baseline, kept for reference. It needs `multi_class`,
# which the settings cell only defines when model_name == 'logreg'.
# results= []
# for i in range(n_iter):
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=True)
#     clf = LogisticRegression(multi_class=multi_class, max_iter=epochs).fit(X_train, y_train)
#     results.append(np.round(clf.score(X_test, y_test), 6))
# print('Model: ', clf, '\n')
# print(results)
# print('\nAverage performance: {:.2f} ± {:.2f}'.format(np.mean(results)*100, np.std(results)*100))

### MLP

In [None]:
# Train and score an MLP on n_iter independent random train/test splits and
# report the mean and standard deviation of the test accuracy.
# NOTE: no random_state is fixed, so both the splits and the weight
# initialisation (and therefore the numbers) vary from run to run.

# Build the architecture from the configured hyperparameters so the model that is
# trained always matches the `settings` tuple that gets logged (previously (32,)
# was hard-coded and hidden_units / hidden_layer_num were ignored).
hidden_layer_sizes = (hidden_units,) * hidden_layer_num

results = []
for i in range(n_iter):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=True)
    clf2 = MLPClassifier(
        max_iter=epochs,
        learning_rate_init=learning_rate,
        hidden_layer_sizes=hidden_layer_sizes,
    ).fit(X_train, y_train)
    # score() on a classifier is the mean accuracy on the held-out split.
    results.append(np.round(clf2.score(X_test, y_test), 6))
print('Model: ', clf2, '\n')
print(results)
print('\nAverage performance: {:.2f} ± {:.2f}'.format(np.mean(results)*100, np.std(results)*100))

Model:  MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(32,), learning_rate='constant',
              learning_rate_init=0.05, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False) 

[0.6356, 0.6404, 0.635, 0.636, 0.632, 0.6426, 0.6392, 0.624, 0.6392, 0.6388]

Average performance: 63.63 ± 0.50


In [None]:
# Timestamp of this run on a fixed UTC+9 clock (presumably KST — confirm).
# Using an explicit timezone keeps the value correct on any host; adding 9 hours
# to the naive local time was only right when the machine itself ran in UTC.
date = datetime.now(timezone(timedelta(hours=9))).strftime("%Y-%m-%d %H:%M:%S")

In [None]:
# Ordered record that will hold this run's metadata and accuracy figures.
log_dict= OrderedDict()

In [None]:
# Fill the run record: what was run, when, with which settings, and how it scored.
# The settings-order key is suffixed with the model name because the meaning of
# the `settings` tuple differs per model.
log_dict.update({
    'model': model_name,
    'dataset': dataset,
    'datetime': date,
    'iteration': n_iter,
    'setting_order_{}'.format(model_name): setting_order,
    'settings': settings,
    'acc': results,
    'acc_mean': np.mean(results),
    'acc_std': np.std(results),
})
# Last expression of the cell: render the record as a key/value table.
pd.DataFrame(log_dict.items(), columns=['key', 'value'])

Unnamed: 0,key,value
0,model,mlp
1,dataset,arx
2,datetime,2021-09-23 12:38:42
3,iteration,10
4,setting_order_mlp,"test_size, epochs, learning_rate, hidden_units..."
5,settings,"(0.5, 200, 0.05, 32, 1)"
6,acc,"[0.6356, 0.6404, 0.635, 0.636, 0.632, 0.6426, ..."
7,acc_mean,0.63628
8,acc_std,0.00501613


In [None]:
# Append this run's record to the cumulative JSON results file (a list of records).
# NOTE: running this cell again appends the same record a second time.
results_path = 'results_classification_baseline.json'
try:
    # Context manager so the read handle is closed before the file is rewritten.
    with open(results_path) as f:
        data = json.load(f)
except FileNotFoundError:
    # First run: start a fresh list instead of requiring a manually created file.
    data = []
data.append(log_dict)
with open(results_path, 'w') as f:
    json.dump(data, f)

In [None]:
# Print the current time on a fixed UTC+9 clock (presumably KST — confirm) and the
# number of records now held in `data`. The explicit timezone makes the time
# independent of the host's local zone.
saved_at = datetime.now(timezone(timedelta(hours=9))).strftime("%Y-%m-%d %H:%M:%S")
print("Last data saved at: {}".format(saved_at))
print("Total data num: {}".format(len(data)))

Last data saved at: 2021-09-23 12:38:42
Total data num: 8


In [None]:
# # To wipe the results file, uncomment and run the lines below.
# # Be careful: this overwrites the file with an empty list and discards every saved run.
# empty = []
# with open('results_classification_baseline.json', 'w') as f:
#     json.dump(empty, f)