## MLP with Grid Search CV

In [1]:
import pandas as pd
from cbfv.composition import generate_features
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupKFold

from keras.models import Sequential, Model
from keras.layers import Dense, Input

from keras.wrappers.scikit_learn import KerasRegressor

import pandas as pd
from cbfv.composition import generate_features


import tensorflow
from sklearn.model_selection import GridSearchCV



2022-12-17 16:58:33.354723: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Input Data Loading 

In [2]:
df_train = pd.read_csv('data/model_input/train_yieldstrength.csv')
df_test = pd.read_csv('data/model_input/test_yieldstrength.csv')
print('size of train data', len(df_train))
print('size of test data',len(df_test))


size of train data 115
size of test data 25


## Featurization Oliynyk

In [3]:
# FEATURIZATION 
X_train_unscaled, y_train, formulae_train, skipped_train = generate_features(df_train,elem_prop='oliynyk',drop_duplicates=False,extend_features=True)
X_test_unscaled, y_test, formulae_test, skipped_test = generate_features(df_test,elem_prop='oliynyk',drop_duplicates=False,extend_features=True)


scaler = StandardScaler()

X_train = scaler.fit_transform(X_train_unscaled)
X_test = scaler.transform(X_test_unscaled)
print(X_train.shape)
print(y_train.shape)


Processing Input Data: 100%|███████████████| 115/115 [00:00<00:00, 14104.89it/s]


	Featurizing Compositions...


Assigning Features...: 100%|████████████████| 115/115 [00:00<00:00, 8061.66it/s]


	Creating Pandas Objects...


Processing Input Data: 100%|██████████████████| 25/25 [00:00<00:00, 9765.10it/s]


	Featurizing Compositions...


Assigning Features...: 100%|██████████████████| 25/25 [00:00<00:00, 4892.34it/s]

	Creating Pandas Objects...
(115, 133)
(115,)





## Helper Function for performance matrix

In [4]:
################# EVALUATION MATRIX #######################################
def model_evaluation(y_actual,y_predict, label = None,model='None'):
    df = pd.DataFrame([{'mae':mean_absolute_error(y_actual,y_predict),
                        'rmse':mean_squared_error(y_actual,y_predict,squared=False),
                        'r2score':r2_score(y_actual,y_predict),
                        'model':model
                        }])
    if label:
        df.index = [label]
        
    return df

def model_evaluation_result(y_Train,y_train_predict,y_Test,y_test_predict,model='None'):
    result = pd.concat([model_evaluation(y_Train,y_train_predict,'Train',model),
                    model_evaluation(y_Test,y_test_predict, 'Test',model)])
    return result


def plot_model_performance(y_train,y_train_predict,y_test,y_test_predict,r2_train,r2_test):
    plt.figure(figsize=(8,8))
    plt.scatter(y_train,y_train_predict, s=160,alpha=0.7, label='Training Data ($r^2$ ={0:.3f})'.format(r2_train))
    plt.scatter(y_test,y_test_predict,s=160, alpha=0.7, label='Test Data ($r^2$ ={0:.3f})'.format(r2_test))
    # plt.plot(y_test,y_test,'--',dashes=(5, 6),linewidth=2,c='grey',label = 'Ideal Fit')
    plt.axline((0, 0), (1200, 1200), linewidth=2, color='k',linestyle='dashed',label='ideal fit')
    plt.xlabel('Actual Ultimate Tensile Strength (MPa) ', size=16)
    plt.ylabel('Predicted Ultimate Tensile Strength (MPa)',size=16)
    plt.tick_params(labelsize=14)
    plt.xlim((0,1200))
    plt.ylim((0,1200))
    plt.legend(prop={'size': 14})
    plt.minorticks_on()
    plt.tick_params(labelsize=16)
    plt.tick_params(direction='in',which='major', length=15, bottom=True, top=True, left=True, right=True,colors='k')
    plt.tick_params(direction='in',which='minor', length=5, bottom=True, top=True, left=True, right=True,colors='k')


## MLP with Grid Search 

In [5]:
tensorflow.keras.utils.set_random_seed(20)

def ANN(optimizer = 'adam',neurons=32,batch_size=32,epochs=50,activation='relu',loss='mse'):
    model = Sequential()
    model.add(Dense(neurons, input_shape=(X_train.shape[1],), activation=activation))
    model.add(Dense(neurons, activation=activation))
    model.add(Dense(1))
    model.compile(optimizer = optimizer, loss=loss)
    return model

ann_params = {
    'optimizer': ['adam','rmsprop'],
    'activation': ['relu'],
    'loss': ['mse','mae'],
    'batch_size': [8,16],
    'neurons':[8,16,32],
    'epochs':[100,150,200,300],
}



gkf=GroupKFold(n_splits=5)

clf = KerasRegressor(build_fn=ANN, verbose=0)
grid = GridSearchCV(clf, ann_params,cv=gkf,scoring='neg_mean_squared_error')


grid.fit(X_train, y_train,groups=formulae_train)

print(grid.best_params_)

print("MSE:"+ str(-grid.best_score_))


best = grid.best_estimator_
tensorflow.keras.utils.set_random_seed(20)

best.fit(X_train,y_train)


  clf = KerasRegressor(build_fn=ANN, verbose=0)
2022-12-17 16:58:39.864960: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


{'activation': 'relu', 'batch_size': 8, 'epochs': 200, 'loss': 'mae', 'neurons': 32, 'optimizer': 'adam'}
MSE:19020.473192867266


<keras.callbacks.History at 0x12a9c41f0>

## Performance Matrix 

In [6]:
y_test_predict = best.predict(X_test)
y_train_predict = best.predict(X_train)
print(model_evaluation(y_train,y_train_predict))
print(model_evaluation(y_test,y_test_predict))


         mae       rmse   r2score model
0  31.881728  50.533851  0.880352  None
         mae        rmse   r2score model
0  99.651962  151.745755  0.538895  None


## Custom Features

In [7]:
# FEATURIZATION 
X_train_unscaled, y_train, formulae_train, skipped_train = generate_features(df_train,elem_prop='f3_revised',drop_duplicates=False,extend_features=True)
X_test_unscaled, y_test, formulae_test, skipped_test = generate_features(df_test,elem_prop='f3_revised',drop_duplicates=False,extend_features=True)


X_train_unscaled_avg = X_train_unscaled[['avg_Atomic_Radius','avg_Pauling_Electronegativity','avg_number_of_valence_electrons','avg_Cohesive_energy_ev_atom',
            'avg_Bulk_modulus_RT_Gpa','avg_Elastic_modulus_RT_Gpa','avg_Shear_modulus_RT_Gpa','avg_Melting_point_(K)','avg_rate_shear_mod_Mpa_perK',
            'avg_Solid_Solubility_atpct','avg_lattice_constant_A','avg_BEC_percm3','avg_Av.Valence_bond_strength_ev','avg_EngelZ_e/a','T']]

X_test_unscaled_avg = X_test_unscaled[['avg_Atomic_Radius','avg_Pauling_Electronegativity','avg_number_of_valence_electrons','avg_Cohesive_energy_ev_atom',
            'avg_Bulk_modulus_RT_Gpa','avg_Elastic_modulus_RT_Gpa','avg_Shear_modulus_RT_Gpa','avg_Melting_point_(K)','avg_rate_shear_mod_Mpa_perK',
            'avg_Solid_Solubility_atpct','avg_lattice_constant_A','avg_BEC_percm3','avg_Av.Valence_bond_strength_ev','avg_EngelZ_e/a','T']]



train_data_avg = X_train_unscaled_avg.join(y_train)
test_data_avg = X_test_unscaled_avg.join(y_test)
total_data_avg = pd.concat([train_data_avg,test_data_avg])
total_data_avg.shape


print('shape of the features space:',X_train_unscaled_avg.shape)


# Instantiate the standard scaler
scaler = StandardScaler()
# zero mean and unit variance
X_train = scaler.fit_transform(X_train_unscaled_avg)
X_test = scaler.transform(X_test_unscaled_avg)


Processing Input Data: 100%|███████████████| 115/115 [00:00<00:00, 24344.87it/s]


	Featurizing Compositions...


Assigning Features...: 100%|███████████████| 115/115 [00:00<00:00, 10212.03it/s]


	Creating Pandas Objects...


Processing Input Data: 100%|█████████████████| 25/25 [00:00<00:00, 12524.80it/s]


	Featurizing Compositions...


Assigning Features...: 100%|██████████████████| 25/25 [00:00<00:00, 5833.20it/s]

	Creating Pandas Objects...
shape of the features space: (115, 15)





## MLP with Grid Search 

In [8]:
tensorflow.keras.utils.set_random_seed(20)

def ANN(optimizer = 'adam',neurons=32,batch_size=32,epochs=50,activation='relu',loss='mse'):
    model = Sequential()
    model.add(Dense(neurons, input_shape=(X_train.shape[1],), activation=activation))
    model.add(Dense(neurons, activation=activation))
  
    model.add(Dense(1))
    model.compile(optimizer = optimizer, loss=loss)
    return model

ann_params = {
    'optimizer': ['adam','rmsprop'],
    'activation': ['relu'],
    'loss': ['mse','mae'],
    'batch_size': [8,16],
    'neurons':[8,16,32],
    'epochs':[100,150,200,300],
}


gkf=GroupKFold(n_splits=5)

clf = KerasRegressor(build_fn=ANN, verbose=0)
grid = GridSearchCV(clf, ann_params,cv=gkf,scoring='neg_mean_squared_error')


grid.fit(X_train, y_train,groups=formulae_train)

print(grid.best_params_)

print("MSE:"+ str(-grid.best_score_))


best = grid.best_estimator_
tensorflow.keras.utils.set_random_seed(20)

best.fit(X_train,y_train)


  clf = KerasRegressor(build_fn=ANN, verbose=0)


{'activation': 'relu', 'batch_size': 8, 'epochs': 150, 'loss': 'mse', 'neurons': 16, 'optimizer': 'adam'}
MSE:18449.614634300076


<keras.callbacks.History at 0x12b323d60>

## Performance Matrix 

In [9]:
y_test_predict = best.predict(X_test)
y_train_predict = best.predict(X_train)
print(model_evaluation(y_train,y_train_predict))
model_evaluation(y_test,y_test_predict)

        mae       rmse   r2score model
0  48.58087  63.587855  0.810552  None


Unnamed: 0,mae,rmse,r2score,model
0,120.553963,154.34053,0.522991,


## Compositions

In [10]:
df_train = pd.read_csv('data/model_input/train_yieldstrength_composition.csv')
df_test = pd.read_csv('data/model_input/test_yieldstrength_composition.csv')
print('size of train data', len(df_train))
print('size of test data',len(df_test))


size of train data 115
size of test data 25


In [11]:
X_train_unscaled = df_train[['Nb','Zr','V','Ta','W','Ti','Mo','C','Hf','T']]
y_train = df_train['target']
X_test_unscaled = df_test[['Nb','Zr','V','Ta','W','Ti','Mo','C','Hf','T']]
y_test = df_test['target']
formulae_train = df_train['formula']

In [12]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train_unscaled)
X_test = scaler.transform(X_test_unscaled)

X_train 

array([[ 1.72486642, -0.31193527, -0.40201513, ..., -0.40108026,
         0.        , -1.68431545],
       [ 1.72486642, -0.31193527, -0.40201513, ..., -0.40108026,
         0.        ,  0.42444523],
       [ 1.72486642, -0.31193527, -0.40201513, ..., -0.40108026,
         0.        ,  0.97284136],
       ...,
       [ 0.07012001, -0.31193527,  2.48746859, ..., -0.40108026,
         0.        ,  0.86434573],
       [ 0.07012001, -0.31193527,  2.48746859, ..., -0.40108026,
         0.        ,  1.08330965],
       [ 0.07012001, -0.31193527,  2.48746859, ..., -0.40108026,
         0.        ,  1.30227357]])

## MLP with Grid Search

In [13]:
tensorflow.keras.utils.set_random_seed(20)

def ANN(optimizer = 'adam',neurons=32,batch_size=32,epochs=50,activation='relu',loss='mse'):
    model = Sequential()
    model.add(Dense(neurons, input_shape=(X_train.shape[1],), activation=activation))
    model.add(Dense(neurons, activation=activation))
    
    model.add(Dense(1))
    model.compile(optimizer = optimizer, loss=loss)
    return model


ann_params = {
    'optimizer': ['adam','rmsprop'],
    'activation': ['relu'],
    'loss': ['mse','mae'],
    'batch_size': [8,16],
    'neurons':[8,16,32],
    'epochs':[100,150,200,300],
}





gkf=GroupKFold(n_splits=5)

clf = KerasRegressor(build_fn=ANN, verbose=0)
grid = GridSearchCV(clf, ann_params,cv=gkf,scoring='neg_mean_squared_error')


grid.fit(X_train, y_train,groups=formulae_train)

print(grid.best_params_)

print("MSE:"+ str(-grid.best_score_))


best = grid.best_estimator_
tensorflow.keras.utils.set_random_seed(20)


  clf = KerasRegressor(build_fn=ANN, verbose=0)


{'activation': 'relu', 'batch_size': 8, 'epochs': 300, 'loss': 'mae', 'neurons': 32, 'optimizer': 'adam'}
MSE:12164.217814621006


## Performance Matrix

In [14]:
y_test_predict = best.predict(X_test)
y_train_predict = best.predict(X_train)
print(model_evaluation(y_train,y_train_predict))
print(model_evaluation(y_test,y_test_predict))

         mae       rmse  r2score model
0  23.376722  38.417175  0.93085  None
          mae        rmse   r2score model
0  144.975154  176.186875  0.378396  None
