# Intro from SciKeras

Example to set up a TensorFlow neural network using SciKeras.

Adapted from: https://adriangb.com/scikeras/refs/heads/master/notebooks/Basic_Usage.html#3.-Training-a-regressor

In [None]:
import numpy as np
from tensorflow import keras
from scikeras.wrappers import KerasRegressor
import pandas as pd
import multiprocessing
# Report the CPU count as seen by the multiprocessing module.
print("num of cpus:", multiprocessing.cpu_count())

In [None]:
# Build a reproducible synthetic regression problem:
# 1000 samples, 20 features, 10 of which are informative.

from sklearn.datasets import make_regression

X_regr, y_regr = make_regression(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    random_state=0,
)

# Quick sanity check: array shapes and the range of the target.
X_regr.shape, y_regr.shape, y_regr.min(), y_regr.max()

In [None]:
#Functionalize the creation of the neural network

def get_reg(meta, hidden_layer_sizes, dropout):
    """Build an uncompiled feed-forward Keras regressor.

    Args:
        meta: Mapping that must contain ``"n_features_in_"``; that value is
            used as the width of the input layer.
        hidden_layer_sizes: Iterable of unit counts, one per hidden layer.
        dropout: Rate passed to the Dropout layer placed after every hidden
            layer.

    Returns:
        A ``keras.models.Sequential`` whose last layer is ``Dense(1)``.
    """
    layers = keras.layers
    net = keras.models.Sequential()
    net.add(layers.Input(shape=(meta["n_features_in_"],)))
    for units in hidden_layer_sizes:
        # Each hidden block is a ReLU dense layer followed by dropout.
        net.add(layers.Dense(units, activation="relu"))
        net.add(layers.Dropout(dropout))
    net.add(layers.Dense(1))
    return net

In [None]:
from scikeras.wrappers import KerasRegressor

# Wrap the get_reg builder in a scikit-learn style regressor.
# get_reg declares hidden_layer_sizes and dropout without defaults, so both
# must be supplied here; otherwise the model cannot be built when fit() runs.

reg = KerasRegressor(
    model=get_reg,
    loss="mse",
    metrics=[KerasRegressor.r_squared],
    hidden_layer_sizes=(100,),
    dropout=0.5,
)

In [None]:
# Train on the full synthetic set, then predict the first five samples.
reg.fit(X_regr, y_regr)
y_pred = reg.predict(X_regr[:5])

# Show the five predictions as the cell output.
y_pred

# With GridSearchCV

In [None]:
import numpy as np
from tensorflow import keras
from scikeras.wrappers import KerasRegressor
import pandas as pd
import multiprocessing
# Report the CPU count as seen by the multiprocessing module
# (the grid search further down runs with n_jobs=-1).
print("num of cpus:", multiprocessing.cpu_count())

In [None]:
# Read the three CSV tables into DataFrames; in each file the first column
# is used as the row index.
data = pd.read_csv('../data/topFeatures_v2.csv', index_col=0)
ground_truth = pd.read_csv(
    '../data/ground_truth_featurized/ground_truth_topFeatures_v2.csv',
    index_col=0,
)

# Targets are modelled in log10 space (predictions are mapped back with
# 10 ** y at the end of the notebook).
targets = np.log10(pd.read_csv('../data/def_param/def_param_v2.csv', index_col=0))

display(data)
display(ground_truth)
display(targets)

In [None]:
# Summary statistics of the feature table.
data.describe()

In [None]:
# Summary statistics of the featurized ground-truth table.
ground_truth.describe()

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from scikeras.wrappers import KerasRegressor

# Baseline: standardize the features and fit an ordinary linear regression.

# Hold out 20% of the rows for testing. Features and targets are split in a
# single call so the two stay row-aligned by construction (and a length
# mismatch between them raises instead of silently mis-pairing rows).
X_train, X_test, y_train, y_test = train_test_split(
    data, targets, test_size=0.2, random_state=42
)

# Define the data preprocessing steps and the model
preprocessor = make_pipeline(StandardScaler())
model = LinearRegression()
pipeline = make_pipeline(preprocessor, model)

# 5-fold cross-validation on the training set; scores are negated MSE,
# so values closer to zero are better.
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print(f"Cross-validation score: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")

# Fit the model on the entire training set and evaluate on the test set.
# NOTE: pipeline.score() uses the final estimator's default scorer (R^2 for
# LinearRegression), so this number is not on the same scale as the
# negated-MSE values printed above.
pipeline.fit(X_train, y_train)
test_score = pipeline.score(X_test, y_test)
print(f"Test set score : {test_score}")

In [None]:
# Normalize the data
#
# The scaler is fitted on the training features only; the same mean/std are
# then applied to the test features. Re-fitting on the test set would scale
# the two sets with different statistics and leak test information.

scaler = StandardScaler(with_mean=True, with_std=True)
X_train_norm = scaler.fit_transform(X_train.values)
X_test_norm = scaler.transform(X_test.values)
#gt_norm = scaler.transform(ground_truth.values)

# Put the scaled arrays back into DataFrames with the original labels.
X_train_transform = pd.DataFrame(X_train_norm, index=X_train.index, columns=X_train.columns)
X_test_transform = pd.DataFrame(X_test_norm, index=X_test.index, columns=X_test.columns)
#gt_transform = pd.DataFrame(gt_norm, index=ground_truth.index, columns=ground_truth.columns)

#display(X_train_transform)
#display(X_test_transform)
# Only the last expression of a cell is rendered automatically, so the
# training summary is displayed explicitly.
display(X_train_transform.describe())
X_test_transform.describe()
#gt_transform.describe()

In [None]:
# Measures alpha, to be used as activity_regularizer=l2(alpha)
#
# Scans 100 ridge penalties between 10 and 100 (log-spaced), scores each with
# shuffled 5-fold cross-validation, and keeps the alpha with the lowest MSE
# in `rbest_alpha`.

from sklearn import linear_model
from sklearn.model_selection import cross_validate, KFold
import matplotlib.pyplot as plt

features = list(X_train_transform.columns)
alphas = np.logspace(1, 2, 100)
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# y_train holds one column per target, so ridge.coef_ has one row per target
# (each row is that target's coefficient vector over all features). The
# coefficient columns are therefore named after the targets rather than after
# a hard-coded slice of feature names.
coef_columns = [f"coef_{name}" for name in y_train.columns]

cv_results = []
for alpha in alphas:
    ridge = linear_model.Ridge(alpha=alpha, max_iter=10000)
    # Fit on the whole training set to record the coefficients for this alpha.
    ridge.fit(X_train_transform, y_train)
    # cross_validate works on clones, so the fit above does not affect it.
    scores = cross_validate(ridge, X_train_transform, y_train, cv=kfold, scoring="neg_mean_squared_error")
    # Scores are negated MSE; flip the sign to store a plain MSE.
    cv_results.append([alpha, -np.mean(scores["test_score"])] + list(ridge.coef_))

cv_results = pd.DataFrame(cv_results, columns=["alpha", "score"] + coef_columns)

f, ax = plt.subplots(figsize=(6, 6))
plt.plot(cv_results["alpha"], cv_results["score"], "-x")
plt.xscale(r"log")
plt.xlabel(r"$\alpha$")
plt.ylabel(r"MSE")
plt.title(r"Ridge regression")

# Alpha with the smallest cross-validated MSE; reused later as the L2
# activity-regularization strength of the neural network.
rbest_alpha = cv_results.loc[cv_results["score"].idxmin(), "alpha"]
plt.annotate(
    r"Best $\alpha$ = %.3f" % rbest_alpha,
    (rbest_alpha, cv_results["score"].min()),
    fontsize=16,
);

In [None]:
from keras.regularizers import l2

# Functionalizes the neural network.
# hidden_layer_sizes may hold any number of layers.
# The output layer must have 8 neurons to fit the size of defect_param.

def get_reg(meta, hidden_layer_sizes, dropout):
    """Build an uncompiled, L2-activity-regularized network with 8 outputs.

    Args:
        meta: Mapping that must contain ``"n_features_in_"``; that value is
            used as the width of the input layer.
        hidden_layer_sizes: Iterable of unit counts, one per hidden layer.
        dropout: Rate passed to the Dropout layer placed after every hidden
            layer.

    Returns:
        A ``keras.models.Sequential`` whose last layer is ``Dense(8)``.

    Notes:
        Reads the module-level ``rbest_alpha`` (set by the ridge scan) as the
        regularization strength, and prints ``model.summary()`` on every call.
    """
    n_features_in_ = meta["n_features_in_"]
    model = keras.models.Sequential()
    model.add(keras.layers.Input(shape=(n_features_in_,)))
    for hidden_layer_size in hidden_layer_sizes:
        model.add(
            keras.layers.Dense(
                hidden_layer_size,
                activation="relu",
                # Take the regularizer from the same `keras` namespace as the
                # layers, so standalone keras and tensorflow.keras objects
                # are never mixed in one model.
                activity_regularizer=keras.regularizers.l2(rbest_alpha),
            )
        )
        model.add(keras.layers.Dropout(dropout))
    model.add(keras.layers.Dense(8))
    model.summary()
    return model

In [None]:
# Base regressor handed to the grid search below. The model__/optimizer__
# values set here are starting points; replace them with the values the
# search reports when building the final model.
#
# NOTE(review): the learning rate is passed as `lr`. Newer Keras releases
# name this argument `learning_rate` - confirm `lr` is still honored by the
# installed version before trusting a search over it.

reg = KerasRegressor(
    model=get_reg,
    model__hidden_layer_sizes=(100,),
    model__dropout=0.05,
    optimizer='adam',
    optimizer__lr=0.001,
    loss="mse",
    metrics=[KerasRegressor.r_squared],
    random_state=42,
    verbose=False,
)

In [None]:
# Build the candidate lists for the hyperparameter search.
#
# neurons_list is a FLAT list of architecture tuples, e.g. (32,), (32, 64),
# (32, 64, 512), because each grid candidate must itself be one tuple of
# layer widths. Only non-decreasing width orderings are generated:
# (32, 64, 512) is included but (64, 32, 512) is not.

from itertools import combinations_with_replacement

neurons = list(range(32, 544, 32))  # 32, 64, ..., 512
neurons_list = []
for layers in range(1, 4):  # 1, 2 and 3 hidden layers
    # extend (not append) keeps the result one level deep.
    neurons_list.extend(combinations_with_replacement(neurons, layers))
print(neurons_list)

# Learning rates 1e-4 ... 1e-1 (stay below 1.0)
lr_list = [10**exponent for exponent in range(-4, 0)]
print(lr_list)

# Dropout rates 0.0 ... 0.4 (entries must be below 1)
dropout_list = list(np.linspace(0, 0.5, 5, endpoint=False))
print(dropout_list)

In [None]:
# The 1-layer grid search picked (512,). Keep that first layer fixed and
# pair it with every candidate width for a second layer: (512, w).

first_layer = 512
_2dim_layers = [(first_layer, width) for width in neurons]
print(_2dim_layers)

In [None]:
%%time
from sklearn.model_selection import GridSearchCV

# Log of [best_score, best_params] pairs; reset every time this cell runs.
best_params_log = []

# Search space: architecture, dropout rate and learning rate.
# NOTE(review): the learning rate is routed as `optimizer__lr`. Newer Keras
# releases name this argument `learning_rate` - confirm `lr` is still honored
# by the installed version.
params = {
    'model__hidden_layer_sizes': neurons_list,
    'model__dropout': dropout_list,
    'optimizer__lr': lr_list,
}

# 3-fold search over every combination, on all available cores.
# refit=False: only the scores are needed here; the final model is rebuilt
# by hand afterwards.
gs = GridSearchCV(
    reg,
    params,
    scoring='neg_mean_squared_error',
    cv=3,
    refit=False,
    n_jobs=-1,
    verbose=2,
)

gs.fit(X_train_transform, y_train)
print(gs.best_score_, gs.best_params_)

best_params_log.append([gs.best_score_, gs.best_params_])

In [None]:
# Final regressor with the hyperparameters filled in by hand (presumably the
# values reported by the grid search - verify against its output).
#
# NOTE(review): the learning rate is passed as `lr`. Newer Keras releases
# name this argument `learning_rate` - confirm `lr` is still honored by the
# installed version.

reg = KerasRegressor(
    model=get_reg,
    model__hidden_layer_sizes=(512,32),
    model__dropout=0,
    optimizer='adam',
    optimizer__lr=0.0001,
    loss="mse",
    metrics=[KerasRegressor.r_squared],
    random_state=42,
    verbose=False,
)

In [None]:
# Train the final regressor on the scaled training features.
# The trailing semicolon keeps the notebook from echoing the return value.
reg.fit(X_train_transform, y_train);

In [None]:
# Predict on the scaled test features. The network was trained on
# log10(targets), so 10 ** y maps the predictions back to the original scale.
y_pred = reg.predict(X_test_transform)

# Keep the test-set row labels so each prediction can be matched to the
# corresponding row of y_test (consistent with how the scaled feature
# frames preserve their index).
y_pred = pd.DataFrame(
    10 ** y_pred,
    index=X_test_transform.index,
    columns=targets.columns,
)
display(y_pred)
y_pred.describe()