In [1]:
import json
import numpy as np
import matplotlib.pyplot as plt
import os
import itertools
import sklearn
import pandas as pd
import scipy.stats as stats

#from keras import Sequential
#from keras.layers import Dense
#from keras.wrappers.scikit_learn import KerasRegressor

from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import mean_squared_error



In [2]:
neuron_data = {}

files = sorted(os.listdir('../neuron_logs/train_data'))


In [3]:
def reduce_to_statistics(activations, labels, debug=False):
    sorted_data = []
    for i in range(10): #hardcoded MOFO
        sorted_data += [[]]
    for i, a in zip(labels, activations):
        sorted_data[i] += [a]
    if debug:
        return(sorted_data)
    statistics = []
    for ar in sorted_data:
        curr_stats = stats.describe(ar)
        statistics += [curr_stats.mean, curr_stats.variance, curr_stats.skewness, curr_stats.kurtosis, curr_stats.minmax[0],
                       curr_stats.minmax[1], curr_stats.nobs]
        #print(statistics)
    return(statistics)

In [8]:
def extract_data(filename, fin = 10, activations_no = 1000, target = 'usefulness_loss', shuffle = True):
    
    features, labels = [], []
    with open(os.path.join('..', 'neuron_logs', 'train_data', filename), 'r') as f:
        neuron_data = json.load(f)
        for e in neuron_data.keys():
            if e == '0' or int(e) > fin:
                continue
            for neuron in neuron_data[e]:
                if ' ' not in neuron:
                    continue
                current_data = neuron_data[e][neuron]
                important_features = []
                important_features = current_data['activations'][:activations_no]
                important_features += reduce_to_statistics(current_data['activations'], neuron_data[e]['original_labels'])
                important_features += [current_data['depth']]
                important_features += [current_data['inverse_depth']]
                important_features += [current_data['width']]
                # important_features += [current_data['input_weights']]
                # important_features += [current_data['output_weights']]
                important_features += [current_data['reg_loss_in_layer']]
                important_features += [e]
                features += [important_features]
                #labels += [current_data[target]]
                labels += [current_data[target]]
    
    if shuffle:
        shuffled = sklearn.utils.shuffle(np.concatenate([np.array(features), np.array(labels).reshape(-1, 1)], axis=1))
        features, labels = shuffled[:, :-1], shuffled[:, -1]
    
    return(np.array(features, dtype=np.float32), np.array(labels, dtype=np.float32))


In [74]:
# features, labels = extract_data(files[0])

In [75]:
# print(features.shape, labels.shape)

(1200, 1075) (1200,)


In [14]:
reg = sklearn.linear_model.SGDRegressor(
    loss='huber',
    penalty='l2',
    alpha=0.0001,
    l1_ratio=0.15,
    fit_intercept=True,
    max_iter=1000,
    tol=0.001,
    shuffle=False, #ain't doin' nothin' on partial_fit
    verbose=0,
    epsilon=0.1,
    random_state=0,
    learning_rate='optimal',
    eta0=0.03,
    power_t=0.25,
    early_stopping=False, #nono
    validation_fraction=0.1,
    n_iter_no_change=5,
    warm_start=True, #must
    average=False,
)

In [19]:
X_test, y_test = extract_data(files[0])
scaler = StandardScaler()
scaler.fit(X_test)
X_test = scaler.transform(X_test)
valids, tests = [], []

# train
for epoch in range(5):
    for i, filename in enumerate(files[1:]):
        print(f'Opening file {filename}')
        features, labels = extract_data(filename)
        scaler = StandardScaler()
        scaler.partial_fit(features)
        features = scaler.transform(features)

        X_train, X_valid, y_train, y_valid = train_test_split(features, labels, test_size=0.2, random_state=0)

        for epoch in range(20):
            reg.partial_fit(X_train, y_train)

        y_pred = reg.predict(X_valid)
        valids += [mean_squared_error(y_pred, y_valid)]
        print(f'MSE on valid: {mean_squared_error(y_pred, y_valid)}')
        y_pred = reg.predict(X_test)
        tests += [mean_squared_error(y_pred, y_test)]
        print(f'MSE on unknown network: {mean_squared_error(y_pred, y_test)}')

Opening file output_20191015-203714.json
MSE on valid: 148.4371444240196
MSE on unknown network: 129.0130452786891
Opening file output_20191015-215448.json
MSE on valid: 32.421837575864295
MSE on unknown network: 33.615723433833466
Opening file output_20191015-231129.json
MSE on valid: 50.19379461953599
MSE on unknown network: 56.88544377909865
Opening file output_20191016-002746.json
MSE on valid: 22.48122584171933
MSE on unknown network: 30.86201792058809
Opening file output_20191016-014510.json
MSE on valid: 4.9084298281556045
MSE on unknown network: 8.023102462167005
Opening file output_20191016-030245.json
MSE on valid: 14.675524306786942
MSE on unknown network: 15.18198149554988
Opening file output_20191016-042123.json
MSE on valid: 54.07517394698289
MSE on unknown network: 46.785981185321276
Opening file output_20191016-053821.json
MSE on valid: 11.517327718992293
MSE on unknown network: 8.496186061116298
Opening file output_20191016-065412.json
MSE on valid: 3.6378881251339816


KeyboardInterrupt: 

In [18]:
X_valid.shape

(240, 1075)

In [None]:
from joblib import dump, load
dump(clf, 'epic_regressor.joblib') 