In [None]:
# Utilities
# pprint allows us to see current hyperparameter values of our model
from pprint import pprint

import numpy as np
import pandas as pd

# ML
#import sklearn
#sklearn.__version__# use this when reopening previously saved ML models
import matplotlib.pyplot as plt
from joblib import dump, load
from sklearn import linear_model
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import ConstantKernel, RBF, RationalQuadratic, ExpSineSquared
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
# Import data.
# The CSV is parsed once; `fullData` is the working frame that the following
# cell transforms, while `permanent` is an independent deep copy of the raw
# contents (no cell visible here modifies it).
DATA_PATH = "vminStd.csv"
fullData = pd.read_csv(DATA_PATH)
permanent = fullData.copy()
fullData.head()

In [None]:
# Clean the raw frame, encode the categorical features, scale the temperature
# and build the train/test split consumed by the modelling cells.

# Only keep rows with a Vmin value, and delete columns that are not used as features.
fullData = fullData.dropna(subset=["Shmoo Value"])
fullData = fullData.drop(
    columns=["VDD (Range)", "Library #1", "DVDD (Range)", "Period (Range)", "Result", "Test Number"]
)
##########################################################################################
# DROP FILE INDEX COLUMN FOR NOW, ADD THE DATA LATER TO INCLUDE LEAKAGE ETC IN THE MODEL.
fullData = fullData.drop(columns=["File Index"])
##########################################################################################

# Target column: strip the trailing 'V' from the voltage strings and convert to float.
fullData["Shmoo Value"] = fullData["Shmoo Value"].str.rstrip("V").astype("float")

# Use this to decide between one-hot encoding and categorical encoding (or else):
# print("Data Columns and Different Values: ")
# print(fullData.columns)
# print()
# for column in fullData.columns:
#     print(fullData[str(column)].unique())

# One-hot encode categorical values. See bookmarks for why one hot rather than else.
fullData = pd.get_dummies(fullData, columns=["Chip Type", "Library #2"])

# These two columns are replaced by their integer category codes instead.
for code_column in ("Test Item", "Library #3"):
    fullData[code_column] = fullData[code_column].astype("category").cat.codes

###################### Continuous normalized temp#######################################################
# Values that cannot be parsed as numbers are replaced by -40.0 before the
# column is divided by 150.0.
chip_temp = pd.to_numeric(fullData["Chip Temp"], errors="coerce").fillna(-40.0).astype(float)
fullData["Chip Temp"] = chip_temp / 150.0

# Create training and testing sets.
N_TRAIN = 8196  # rows taken from the head of the shuffled frame
N_TEST = 1500   # rows taken from the tail of the shuffled frame

# The two slices overlap (training rows leak into the test set) whenever the
# frame is shorter than N_TRAIN + N_TEST, so fail loudly instead.
if len(fullData) < N_TRAIN + N_TEST:
    raise ValueError(
        "Need at least " + str(N_TRAIN + N_TEST) + " rows for disjoint train/test sets, got "
        + str(len(fullData))
    )

# Fixed seed so the split (and every score derived from it) is reproducible;
# 42 matches the random_state used for the models in this notebook.
shuffled = shuffle(fullData, random_state=42)
y = shuffled["Shmoo Value"]
X = shuffled.drop(columns=["Shmoo Value"])

X_train = X.iloc[:N_TRAIN]
y_train = y.iloc[:N_TRAIN]
X_test = X.iloc[-N_TEST:]
y_test = y.iloc[-N_TEST:]

In [None]:
# Manual sweep over GP kernels, noise levels (alpha) and target normalisation.
# Every combination is fitted on the training set and its test-set MAE printed.

# Show the default hyperparameters of the regressor.
rf = GaussianProcessRegressor()
pprint(rf.get_params())


# Parameters for GP regressor (kernels and the rest).
# The constant factor is fixed at 1.0 in every kernel; the RBF length scale is fixed too.
ker_rbf = ConstantKernel(1.0, constant_value_bounds="fixed") * RBF(1.0, length_scale_bounds="fixed")
ker_rq = ConstantKernel(1.0, constant_value_bounds="fixed") * RationalQuadratic(alpha=0.1, length_scale=1)
ker_expsine = ConstantKernel(1.0, constant_value_bounds="fixed") * ExpSineSquared(1.0, 5.0, periodicity_bounds=(1e-2, 1e1))
kernel_list = [ker_rbf, ker_rq, ker_expsine]
alpha_list = 10.0 ** -np.arange(1, 7)  # 1e-1 down to 1e-6
n_restarts_optimizer = 3
normalize_y = [False, True]

# Search space for a randomized search; built from the lists above so the two
# cannot drift apart.
random_grid = {"kernel": kernel_list,
               "alpha": alpha_list,
               "optimizer": ["fmin_l_bfgs_b"],
               "n_restarts_optimizer": [1, 2, 3, 10],
               "normalize_y": normalize_y,
               }
pprint(random_grid)

# The loop variables deliberately avoid the name `y`, which holds the target
# vector created by the preprocessing cell.
for kernel in kernel_list:
    for alpha in alpha_list:
        for norm_y in normalize_y:
            rf = GaussianProcessRegressor(
                random_state=42,
                kernel=kernel,
                alpha=alpha,
                n_restarts_optimizer=n_restarts_optimizer,
                normalize_y=norm_y,
            )
            rf.fit(X_train, y_train)
            print(str(kernel) + " and " + str(alpha) + " and " + str(norm_y))
            print(mean_absolute_error(y_test, rf.predict(X_test)))
            

In [None]:
# Compare a linear SGD baseline against the best GP found by a randomized
# hyperparameter search, both scored by MAE on the held-out test set.

# Baseline model.
base_model = linear_model.SGDRegressor(random_state=42)
base_model.fit(X_train, y_train)
y_predicted_base = base_model.predict(X_test)
print(mean_absolute_error(y_test, y_predicted_base))

# Randomized search over `random_grid` (defined in the GP parameter cell).
# NOTE(review): the search object was referenced here but never created in the
# notebook; n_iter and cv below are conservative choices to keep the run time
# manageable — adjust them to the intended search budget.
rf_random = RandomizedSearchCV(
    estimator=GaussianProcessRegressor(random_state=42),
    param_distributions=random_grid,
    n_iter=10,
    cv=3,
    scoring="neg_mean_absolute_error",
    random_state=42,
)
rf_random.fit(X_train, y_train)

best_random = rf_random.best_estimator_
y_predicted_random = best_random.predict(X_test)
print(mean_absolute_error(y_test, y_predicted_random))

# Side-by-side look at the first few test predictions of both models.
for i in range(5):
    print("For the base model:")
    print("The test label is " + str(y_test.iloc[i]) + " and the predicted value is " + str(y_predicted_base[i]))
    print("For the randomized search optimal model:")
    print("The test label is still " + str(y_test.iloc[i]) + " and the predicted value is " + str(y_predicted_random[i]))



In [None]:
# Fit a single GP with one hand-picked hyperparameter set and predict on the
# test set.

# Show the default hyperparameters of the regressor.
rf = GaussianProcessRegressor()
pprint(rf.get_params())

ker_rbf = ConstantKernel(1.0, constant_value_bounds="fixed") * RBF(1.0, length_scale_bounds="fixed")
# Every entry is a one-element list so the dict has the same shape as a search
# grid (the kernel and alpha were previously bare scalars).
random_grid = {"kernel": [ker_rbf],
               "alpha": [0.1],
               "optimizer": ["fmin_l_bfgs_b"],
               "n_restarts_optimizer": [3],
               "normalize_y": [False],
               }
pprint(random_grid)

# Build the model from the dict that was just printed, so the displayed
# settings and the fitted model cannot disagree.
# NOTE(review): both kernel hyperparameters are marked "fixed", so the
# optimizer settings presumably have nothing left to tune — confirm.
chosen_params = {param_name: candidates[0] for param_name, candidates in random_grid.items()}
rf = GaussianProcessRegressor(random_state=42, **chosen_params)
rf.fit(X_train, y_train)
y_predicted_random = rf.predict(X_test)

In [None]:
# Disabled experiment: the whole cell body below is a bare string literal, so
# none of it executes. It sketches a GridSearchCV over a RandomForestRegressor;
# note that RandomForestRegressor is not among the imports visible in this
# notebook, so re-enabling the code as-is would require adding that import.
'''
# forget about this
param_grid = {
    'bootstrap': [True],
    'max_depth': [10],
    'max_features': ['auto'],
    'min_samples_leaf': [5],
    'min_samples_split': [4],
    'n_estimators': [200]
}

rfGrid = RandomForestRegressor()
grid_search = GridSearchCV(estimator = rfGrid, param_grid = param_grid, 
                          cv = 6, verbose = 2)

grid_search.fit(X_train, y_train)
grid_search.best_params_
'''

In [None]:
# Visualise predicted vs. measured Vmin, report the error, and persist the model.

# Scatter of test labels against predictions; points on the black diagonal
# are perfect predictions.
fig, ax = plt.subplots()
ax.scatter(y_test, y_predicted_random, color='green', alpha=0.1)
ax.set_xlim([0.2, 0.5])
ax.set_ylim([0.2, 0.5])
ax.plot([0.2, 0.5], [0.2, 0.5], color='black')
ax.set_xlabel("Test Vmin")
ax.set_ylabel("Predicted Vmin")
ax.set_title("Standard Cell Vmin Prediction Error")
plt.show()

# Absolute MAE, then MAE relative to the mean test label.
mae = mean_absolute_error(y_test, y_predicted_random)
print(mae)
print(mae / y_test.mean())

# Save `rf`, the estimator that produced `y_predicted_random` in the fitting
# cell, so the file on disk is the model whose error is plotted and printed here.
dump(rf, 'GPVminBasic.joblib')