### SEMICONDUCTOR MANUFACTURING PRODUCT QUALITY PREDICTION

**Highlights:**
 <br> 1. Unlike with machine learning algorithms, 
 <br> 1.A: no dataset outliers treatment as the auto-encoder neural network is insensitive to them 
 <br> 1.B: the multi-collinearity among the descriptors is also not bothered as the rigorous dimentionality reduction happens (quicker compared to PCA)
 <br> 2. Both the keras types of models are illustrated: Sequential API and Functional API
 <br> 3. All of the data is gathered from the sensors in real-time 
 <br> 4. Dataset is labelled as 'Product Quality' where 1: Good 0: Bad
 <br> 5. Auto-encoder neural network is used as a dimentionality reduction technique
 <br> 6. Support vector machine uses the encoded data for the prediction of product quality

Import all the necessary library packages

In [2]:
import pandas as pd 
import numpy as np

from sklearn.svm import SVC  
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix 

import tensorflow as tf
from tensorflow.keras import layers
from keras.layers import Input, Dense
from keras.models import Model, Sequential
from keras import regularizers

Functions used

In [3]:
def getRedundantColumns(X):                                         
    RedundantColumns = set()
    for loc in range(X.shape[1]):
        tocomparecolumn = X.iloc[:, loc]
        for nextloc in range(loc + 1, X.shape[1]):
            comparewithcolumn = X.iloc[:,nextloc]
            if tocomparecolumn.equals(comparewithcolumn):
                RedundantColumns.add(X.columns.values[nextloc])
    return list(RedundantColumns)

#### Data Preparation

In [4]:
dataset = pd.read_csv('SMPQ.csv') 
dataset.isnull().sum()   # identifies the missing columns

Time          0
0             6
1             7
2            14
3            14
             ..
586           1
587           1
588           1
589           1
Pass/Fail     0
Length: 592, dtype: int64

In [5]:
dataset.replace('', np.nan, inplace=True)    # replace miising values across the dataset with NaN
dataset = dataset.fillna(dataset.median())   # fill the NaN values in each column with their respective column median
# dataset.isnull().sum() # to know the number of column missing values afterwards
print('Actual Dataset dimension:',dataset.shape)

  dataset = dataset.fillna(dataset.median())   # fill the NaN values in each column with their respective column median


Actual Dataset dimension: (1567, 592)


In [6]:
RedundantColumns = np.array(getRedundantColumns(dataset))   # Identifying the duplicate columns
RedundantColumns

array(['422', '231', '375', '322', '534', '194', '449', '378', '257',
       '234', '186', '533', '498', '314', '261', '192', '528', '394',
       '504', '259', '462', '450', '264', '399', '233', '189', '369',
       '464', '97', '236', '397', '141', '262', '508', '404', '465',
       '325', '190', '501', '513', '512', '536', '328', '240', '149',
       '396', '515', '241', '329', '403', '481', '330', '505', '461',
       '237', '535', '373', '364', '506', '380', '315', '178', '530',
       '232', '507', '69', '243', '503', '379', '509', '242', '258',
       '226', '256', '260', '402', '327', '326', '395', '179', '374',
       '191', '266', '265', '370', '532', '230', '538', '229', '502',
       '284', '529', '451', '537', '531', '263', '414', '52', '371',
       '381', '458', '401', '514', '193', '276', '400', '463', '235',
       '466', '398', '372', '313'], dtype='<U3')

In [7]:
dataset = dataset.T.drop_duplicates().T    # removes all of the above duplicate columns 
print('Dataset dimension after removing duplicate columns:',dataset.shape) 

Dataset dimension after removing duplicate columns: (1567, 480)


In [78]:
dataset = dataset.drop(['Time'], axis=1)  # Drop the 'Time' column as it is not needed

dataconsistency = dataset.nunique()       # Identify whether the column data is identical for all the rows
inconsistant_columns = dataconsistency[dataconsistency == 1].index
dataset = dataset.drop(inconsistant_columns, axis=1) # drop the columns with no data variation
print('Dataset dimension after removing useless columns:',dataset.shape)   # columns with all the rows having same data
dataset.head(3)

KeyError: "['Time'] not found in axis"

In [None]:
# Seperate the class variable with the descriptors, as the normalization must be performed only with non-class variable
Descriptors = dataset.iloc[:,:-1].values      
Class = pd.DataFrame(dataset.iloc[:,-1].values)
Class = Class.rename(columns={Class.columns[0]: 'Product Quality'})  # Renaming the label
Class['Product Quality'] = Class['Product Quality'].replace([-1,1],[1,0])    # 1: Good, 0: Bad

In [77]:
Descriptors = np.asarray(Descriptors).astype(np.float32)
Descriptors  = pd.DataFrame(Descriptors)
Descriptors.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,464,465,466,467,468,469,470,471,472,473
0,3.626271,3.046981,2.580172,1.616686,-0.132319,-0.012904,-0.133853,-0.132145,-0.133987,-0.134011,...,-0.134001,-0.044322,-0.133386,-0.133992,-0.134002,-0.131075,-0.133981,-0.133988,-0.134001,-0.044804
1,3.706726,2.924332,2.633133,1.681862,-0.132978,-0.007036,-0.133852,-0.13215,-0.134007,-0.134025,...,-0.133999,0.124299,-0.133384,-0.133979,-0.134,-0.128492,-0.133995,-0.133982,-0.133999,0.124299
2,3.504292,3.041944,2.578531,1.972613,-0.132133,-0.015541,-0.133853,-0.132216,-0.134002,-0.134005,...,-0.133988,-0.031207,-0.133391,-0.133987,-0.134002,-0.130068,-0.133934,-0.133947,-0.133988,-0.031207
3,3.573904,2.942644,2.594191,0.994714,-0.132368,-0.004687,-0.133856,-0.13216,-0.134022,-0.134011,...,-0.134001,-0.042394,-0.133388,-0.133994,-0.134004,-0.131458,-0.133982,-0.133988,-0.134001,-0.042394
4,3.627896,2.971141,2.636786,1.511721,-0.132104,-0.009451,-0.133853,-0.132142,-0.13401,-0.134016,...,-0.134001,-0.044322,-0.133411,-0.133415,-0.133877,-0.010808,-0.133982,-0.133988,-0.134001,-0.042394


In [None]:
Descriptors = np.asarray(Descriptors).astype(np.float32)  # Performs the normalization
layer = tf.keras.layers.Normalization(axis=None)
layer.adapt(Descriptors)
Normalised_Data=layer(Descriptors)
Descriptors = pd.DataFrame(Normalised_Data)

In [None]:
dataframes = [Descriptors, Class]              # merge the class variable column with the normalized descriptors columns
modified_dataset = pd.concat(dataframes, axis=1)

#### Model Building

Training the auto-encoder network (Funcitonal API Model), to use the bottleneck layer as the reduced dimentionality for prediction engine

In [64]:
Guiding_layer = Input(shape =(Descriptors.shape[1], ))

en_layer1 = Dense(250,activation ='tanh',activity_regularizer = regularizers.l1(0.01))(Guiding_layer)
en_Layer2 = Dense(125,activation ='tanh',activity_regularizer = regularizers.l1(0.01))(en_layer1)
en_Layer3 = Dense(83,activation ='tanh',activity_regularizer = regularizers.l1(0.01))(en_Layer2)
en_Layer4 = Dense(32,activation ='tanh',activity_regularizer = regularizers.l1(0.01))(en_Layer3)

Bottleneck_layer = Dense(15, activation ='relu')(en_Layer4)       # Compressed dimentionality
  

de_layer1 = Dense(32,activation ='tanh')(Bottleneck_layer)
de_layer2 = Dense(83,activation ='tanh')(de_layer1)
de_layer3 = Dense(125,activation ='tanh')(de_layer2)
de_layer4 = Dense(250,activation ='tanh')(de_layer3)
  
reconstructed_layer = Dense(Descriptors.shape[1], activation ='relu')(de_layer4)

In [68]:
autoencoder = Model(Guiding_layer, reconstructed_layer)
autoencoder.compile(optimizer ="adadelta", loss ="mse")
autoencoder.fit(Descriptors, Descriptors, 
                batch_size = 40, epochs = 12, 
                shuffle = True, validation_split = 0.20)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<keras.callbacks.History at 0x20cc5f67a30>

Dimensionlaity reduction using the encoder section(Sequencial API)

In [69]:
Descriptors_Compressor = Sequential()
Descriptors_Compressor.add(autoencoder.layers[0])
Descriptors_Compressor.add(autoencoder.layers[1])
Descriptors_Compressor.add(autoencoder.layers[2])
Descriptors_Compressor.add(autoencoder.layers[3])
Descriptors_Compressor.add(autoencoder.layers[4])  
Descriptors_Compressor.add(autoencoder.layers[5]) # Bottlenecklayer

In [70]:
Encoded_Descriptors = Descriptors_Compressor.predict(Descriptors)
Encoded_Descriptors = pd.DataFrame(Encoded_Descriptors)



Now that the dataframe for SVM prediction has 15 descriptors(down from 500) and one label

In [72]:
joinedframes = [Encoded_Descriptors, Class]
Compressed_Data = pd.concat(joinedframes, axis=1)
Compressed_Data.head(1)     

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Product Quality
0,0.52613,0.577982,0.101263,0.003088,0.0,0.375904,0.0,0.0,0.260246,0.0,0.0,0.0,0.0,0.013958,0.09806,1


SVM Classifier

In [73]:
X_train, X_test, y_train, y_test = train_test_split(Encoded_Descriptors, Class, test_size = 0.2)  # Train-Test Split
parameter_range = {'C': [0.001,0.1], 'gamma': [5,1],'kernel': ['rbf', 'poly']} # Hyper-tuning to identify the best parameters
tuned_model = GridSearchCV(SVC(),parameter_range,refit=True,verbose=2)
tuned_model.fit(X_train,y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END .......................C=0.001, gamma=5, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.001, gamma=5, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.001, gamma=5, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.001, gamma=5, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.001, gamma=5, kernel=rbf; total time=   0.0s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END ......................C=0.001, gamma=5, kernel=poly; total time=   0.0s
[CV] END ......................C=0.001, gamma=5, kernel=poly; total time=   0.0s
[CV] END ......................C=0.001, gamma=5, kernel=poly; total time=   0.0s
[CV] END ......................C=0.001, gamma=5, kernel=poly; total time=   0.0s
[CV] END ......................C=0.001, gamma=5, kernel=poly; total time=   0.0s
[CV] END .......................C=0.001, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.001, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.001, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.001, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.001, gamma=1, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.001, gamma=1, kernel=poly; total time=   0.0s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END ......................C=0.001, gamma=1, kernel=poly; total time=   0.0s
[CV] END ......................C=0.001, gamma=1, kernel=poly; total time=   0.0s
[CV] END ......................C=0.001, gamma=1, kernel=poly; total time=   0.0s
[CV] END ......................C=0.001, gamma=1, kernel=poly; total time=   0.0s
[CV] END .........................C=0.1, gamma=5, kernel=rbf; total time=   0.0s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END .........................C=0.1, gamma=5, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=5, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=5, kernel=rbf; total time=   0.0s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END .........................C=0.1, gamma=5, kernel=rbf; total time=   0.0s
[CV] END ........................C=0.1, gamma=5, kernel=poly; total time=   0.3s


  y = column_or_1d(y, warn=True)


[CV] END ........................C=0.1, gamma=5, kernel=poly; total time=   0.2s


  y = column_or_1d(y, warn=True)


[CV] END ........................C=0.1, gamma=5, kernel=poly; total time=   0.2s


  y = column_or_1d(y, warn=True)


[CV] END ........................C=0.1, gamma=5, kernel=poly; total time=   0.2s


  y = column_or_1d(y, warn=True)


[CV] END ........................C=0.1, gamma=5, kernel=poly; total time=   0.1s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.0s
[CV] END ........................C=0.1, gamma=1, kernel=poly; total time=   0.0s


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.001, 0.1], 'gamma': [5, 1],
                         'kernel': ['rbf', 'poly']},
             verbose=2)

In [74]:
print(tuned_model.best_params_)

{'C': 0.001, 'gamma': 5, 'kernel': 'rbf'}


In [75]:
predicted_label = tuned_model.predict(X_test)
print('Accuracy : '+str(accuracy_score(y_test_encoded, y_pred_lrclf)))

Accuracy : 0.9394904458598726


### THE END