In [None]:
# Standard library
# pprint allows us to see current hyperparameter values of our model
from pprint import pprint

# Utilities
import numpy as np
import pandas as pd

# ML
#import sklearn
#sklearn.__version__# use this when reopening previously saved ML models
import matplotlib.pyplot as plt
from joblib import dump, load
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
# Import data.
# The CSV is read from disk once. `permanent` stays as the untouched raw table
# (later cells look individual rows up in it); `fullData` is an independent copy
# that the preprocessing cell is free to transform.
DATA_PATH = "vminStd.csv"
permanent = pd.read_csv(DATA_PATH)
fullData = permanent.copy()
fullData.head()

In [None]:
# Build the model-ready feature table from the raw data and split it into
# training and testing sets.
#
# The table is rebuilt from `permanent` (the untouched raw frame) instead of being
# mutated in place, so this cell can be re-run without failing on columns that an
# earlier run already dropped.

RANDOM_SEED = 42     # fixes the shuffle so the split is identical on every run
TRAIN_ROWS = 8196    # rows taken from the front of the shuffled table
TEST_ROWS = 1500     # rows taken from the back of the shuffled table
TEMP_FILL = -40.0    # value substituted for missing / non-numeric 'Chip Temp'
TEMP_SCALE = 150.0   # divisor used to normalize 'Chip Temp'

unused_columns = ["VDD (Range)", "Library #1", "DVDD (Range)", "Period (Range)", "Result", "Test Number"]

# only keep rows with Vmin value, and delete unuseful columns
##########################################################################################
# DROP FILE INDEX COLUMN FOR NOW, ADD THE DATA LATER TO INCLUDE LEAKAGE ETC IN THE MODEL.
##########################################################################################
fullData = (
    permanent
    .dropna(subset=["Shmoo Value"])
    .drop(columns=unused_columns + ["File Index"])
    .copy()  # explicit copy: the column assignments below must never touch `permanent`
)

# Change strings to numbers: strip the trailing 'V' from the voltages, then parse.
fullData["Shmoo Value"] = fullData["Shmoo Value"].str.rstrip("V").astype("float")

# Use this to decide between one-hot encoding and categorical encoding (or else)
# print("Data Columns and Different Values: ")
# print(fullData.columns)
# print()
# for column in fullData.columns:
#     print(fullData[str(column)].unique())

# one hot encode categorical values. See bookmarks for why one hot rather than else
fullData = pd.get_dummies(fullData, columns=["Chip Type", "Library #2"])

# The remaining categorical columns are replaced by their integer category codes.
for coded_column in ("Test Item", "Library #3"):
    fullData[coded_column] = fullData[coded_column].astype("category").cat.codes

###################### Continuous normalized temp#######################################################
# Anything that does not parse as a number becomes NaN and is then replaced by
# TEMP_FILL before scaling.
# NOTE(review): confirm that TEMP_FILL is the intended stand-in for unknown temperatures.
fullData["Chip Temp"] = (
    pd.to_numeric(fullData["Chip Temp"], errors="coerce").fillna(TEMP_FILL).astype(float) / TEMP_SCALE
)

# create training and testing sets
# shuffle() returns a shuffled copy that keeps the original row labels.
X = shuffle(fullData, random_state=RANDOM_SEED)
y = X["Shmoo Value"]
X = X.drop(columns=["Shmoo Value"])

# The two slices come from opposite ends of the same shuffled table; if the table is
# too short they would share rows and the test score would be contaminated by
# training data, so fail loudly instead.
if TRAIN_ROWS + TEST_ROWS > len(X):
    raise ValueError(
        "Only " + str(len(X)) + " usable rows, but " + str(TRAIN_ROWS) + " training + "
        + str(TEST_ROWS) + " testing rows were requested; the two sets would overlap."
    )

X_train = X.iloc[:TRAIN_ROWS]
y_train = y.iloc[:TRAIN_ROWS]
X_test = X.iloc[-TEST_ROWS:]
y_test = y.iloc[-TEST_ROWS:]

In [None]:
# Row counts, in order: the full shuffled table, the training slice, the testing slice.
for table in (X, X_train, X_test):
    print(len(table))

In [None]:
# Preview the first training rows. Leaving the frame as the cell's last expression
# lets the notebook render it as a table instead of wrapped plain text.
X_train.head()

In [None]:
# Sanity check: show one processed training row next to the raw row it came from.
# Row labels survive the preprocessing and the shuffle, so the raw row is found
# through the label of the chosen training row rather than through a hard-coded
# position that is only valid for one particular shuffle order.
sample_position = 7
raw_label = X_train.index[sample_position]
print(X_train.iloc[sample_position])
print(permanent.loc[raw_label])

In [None]:
# Distinct 'Test Item' values in the raw table.
raw_test_items = permanent["Test Item"]
raw_test_items.unique()

In [None]:
# Print a few raw rows, selected by position in the raw table.
for raw_position in (17765, 2634, 36227, 54953):
    print(permanent.iloc[raw_position])

In [None]:
# randomized search for hyperparameter optimization
# NOTE: everything between the triple quotes below is a single string literal, so none
# of it is executed; it is kept only as a record of the search that was tried.
# NOTE(review): the disabled code never passes n_jobs to RandomizedSearchCV, so its
# "use all available cores" remark would not hold as written — confirm before reviving.
# NOTE(review): the 'auto' entry in max_features may be rejected by newer scikit-learn
# releases — verify against the installed version before reviving.
'''rf = RandomForestRegressor()
pprint(rf.get_params())

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor(random_state=42)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42)
# Fit the random search model
rf_random.fit(X_train, y_train)
rf_random.best_params_
'''

In [None]:
# Comparison of the default-parameter model and the randomized-search result.
# NOTE: everything between the triple quotes below is a single string literal, so none
# of it is executed.
# NOTE(review): the disabled code reads `rf_random`, which is only assigned inside the
# (also disabled) randomized-search cell, so both cells must be revived together.
'''base_model = RandomForestRegressor(random_state = 42)
base_model.fit(X_train, y_train)
y_predicted_base = base_model.predict(X_test)
from sklearn.metrics import mean_squared_error
print(mean_squared_error(y_test, y_predicted_base))

best_random = rf_random.best_estimator_
y_predicted_random = best_random.predict(X_test)
print(mean_squared_error(y_test, y_predicted_random))

for i in range(5):
    print("For the base model:")
    print("The test label is " + str(y_test.iloc[i]) + " and the predicted value is " + str(y_predicted_base[i]))
    print("For the randomized search optimal model:")
    print("The test label is still " + str(y_test.iloc[i]) + " and the predicted value is " + str(y_predicted_random[i]))
'''


In [None]:
# Fit the random-forest configuration that the next cell evaluates and saves.
# The grid contains a single candidate, so GridSearchCV is used here as a 6-fold
# cross-validated fit of that one configuration; its refit estimator
# (best_estimator_) is the model consumed downstream.

RF_SEED = 42  # makes the forest, and therefore the reported metrics, reproducible

param_grid = {
    'bootstrap': [True],
    'max_depth': [10],
    # 1.0 means "consider every feature at each split". That is what 'auto' stood
    # for on RandomForestRegressor; the string itself was deprecated and is no
    # longer accepted by scikit-learn >= 1.3.
    'max_features': [1.0],
    'min_samples_leaf': [5],
    'min_samples_split': [4],
    'n_estimators': [200]
}

rfGrid = RandomForestRegressor(random_state=RF_SEED)
grid_search = GridSearchCV(estimator=rfGrid, param_grid=param_grid,
                           cv=6, verbose=2)

grid_search.fit(X_train, y_train)
grid_search.best_params_


In [None]:
# Evaluate the refit model on the held-out rows, plot diagnostics, then save it.
best_grid = grid_search.best_estimator_
y_predicted_grid = best_grid.predict(X_test)

# Per-feature importance reported by the fitted forest, largest first.
feature_importances = pd.DataFrame(
    best_grid.feature_importances_,
    index=X_train.columns,
    columns=['importance'],
).sort_values('importance', ascending=False)
print(feature_importances)

fig, ax = plt.subplots()
feature_importances.plot.bar(rot=65, ax=ax)
ax.set_title("Feature importance in Random Forest")
plt.show()

# Predicted against measured values; the black diagonal marks a perfect prediction.
# Both axes are clamped to the same window, so points outside it are not drawn.
VMIN_AXIS_RANGE = [0.2, 0.5]
fig, ax = plt.subplots()
ax.scatter(y_test, y_predicted_grid, color='green', alpha=0.1)
ax.plot(VMIN_AXIS_RANGE, VMIN_AXIS_RANGE, color='black')
ax.set_xlim(VMIN_AXIS_RANGE)
ax.set_ylim(VMIN_AXIS_RANGE)
ax.set_xlabel("Test Vmin")
ax.set_ylabel("Predicted Vmin")
ax.set_title("Standard Cell Vmin Prediction Error")
plt.show()

# Absolute error, and the same error relative to the mean of the test labels.
mae = mean_absolute_error(y_test, y_predicted_grid)
print(f"Mean absolute error: {mae}")
print(f"Mean absolute error / mean test label: {mae / y_test.mean()}")

# Persist the fitted model (`dump` is imported from joblib in the imports cell).
dump(best_grid, 'RFVminBasic.joblib')