### SEMICONDUCTOR MANUFACTURING PRODUCT QUALITY PREDICTION

**Highlights:**
 <br> 1. Unlike with machine learning algorithms, 
 <br> 1.A: no dataset outliers treatment as the auto-encoder neural network is insensitive to them 
 <br> 1.B: the multi-collinearity among the descriptors is also not bothered as the rigorous dimentionality reduction happens (quicker compared to PCA)
 <br> 2. Both the keras types of models are illustrated: Sequential API and Functional API
 <br> 3. All of the data is gathered from the sensors in real-time 
 <br> 4. Dataset is labelled as 'Product Quality' where 1: Good 0: Bad
 <br> 5. Auto-encoder neural network is used as a dimentionality reduction technique
 <br> 6. Support vector machine uses the encoded data for the prediction of product quality

Import all the necessary library packages

In [87]:
import pandas as pd 
import numpy as np

from sklearn.svm import SVC  
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix 

import tensorflow as tf
from tensorflow.keras import layers
from keras.layers import Input, Dense
from keras.models import Model, Sequential
from keras import regularizers

Functions used

In [88]:
def getRedundantColumns(X):                                         
    RedundantColumns = set()
    for loc in range(X.shape[1]):
        tocomparecolumn = X.iloc[:, loc]
        for nextloc in range(loc + 1, X.shape[1]):
            comparewithcolumn = X.iloc[:,nextloc]
            if tocomparecolumn.equals(comparewithcolumn):
                RedundantColumns.add(X.columns.values[nextloc])
    return list(RedundantColumns)

#### Data Preparation

In [89]:
dataset = pd.read_csv('SMPQ.csv') 
dataset.isnull().sum()   # identifies the missing columns

Time          0
0             6
1             7
2            14
3            14
             ..
586           1
587           1
588           1
589           1
Pass/Fail     0
Length: 592, dtype: int64

In [None]:
dataset.replace('', np.nan, inplace=True)    # replace miising values across the dataset with NaN
dataset = dataset.fillna(dataset.median())   # fill the NaN values in each column with their respective column median
# dataset.isnull().sum() # to know the number of column missing values afterwards
print('Actual Dataset dimension:',dataset.shape)

In [None]:
RedundantColumns = np.array(getRedundantColumns(dataset))   # Identifying the duplicate columns
RedundantColumns

In [81]:
dataset = dataset.T.drop_duplicates().T    # removes all of the above duplicate columns 
print('Dataset dimension after removing duplicate columns:',dataset.shape) 

Dataset dimension after removing duplicate columns: (1567, 480)


In [82]:
dataset = dataset.drop(['Time'], axis=1)  # Drop the 'Time' column as it is not needed

dataconsistency = dataset.nunique()       # Identify whether the column data is identical for all the rows
inconsistant_columns = dataconsistency[dataconsistency == 1].index
dataset = dataset.drop(inconsistant_columns, axis=1) # drop the columns with no data variation
print('Dataset dimension after removing useless columns:',dataset.shape)   # columns with all the rows having same data
dataset.head(3)

Dataset dimension after removing useless columns: (1567, 475)


Unnamed: 0,0,1,2,3,4,6,7,8,9,10,...,581,582,583,584,585,586,587,588,589,Pass/Fail
0,3030.93,2564.0,2187.7333,1411.1265,1.3602,97.6133,0.1242,1.5005,0.0162,-0.0034,...,72.2889,0.5005,0.0118,0.0035,2.363,0.0205,0.0148,0.0046,71.9005,-1
1,3095.78,2465.14,2230.4222,1463.6606,0.8294,102.3433,0.1247,1.4966,-0.0005,-0.0148,...,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045,-1
2,2932.61,2559.94,2186.4111,1698.0172,1.5102,95.4878,0.1241,1.4436,0.0041,0.0013,...,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602,1


In [83]:
# Seperate the class variable with the descriptors, as the normalization must be performed only with non-class variable
Descriptors = dataset.iloc[:,:-1].values      
Class = pd.DataFrame(dataset.iloc[:,-1].values)
Class = Class.rename(columns={Class.columns[0]: 'Product Quality'})  # Renaming the label
Class['Product Quality'] = Class['Product Quality'].replace([-1,1],[1,0])    # 1: Good, 0: Bad

In [85]:
Descriptors = np.asarray(Descriptors).astype(np.float32)
Descriptors  = pd.DataFrame(Descriptors)
Descriptors.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,464,465,466,467,468,469,470,471,472,473
0,3030.929932,2564.0,2187.733398,1411.126465,1.3602,97.613297,0.1242,1.5005,0.0162,-0.0034,...,0.0047,72.288902,0.5005,0.0118,0.0035,2.363,0.0205,0.0148,0.0046,71.900497
1,3095.780029,2465.139893,2230.422119,1463.660645,0.8294,102.3433,0.1247,1.4966,-0.0005,-0.0148,...,0.006,208.204498,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.204498
2,2932.610107,2559.939941,2186.411133,1698.017212,1.5102,95.487801,0.1241,1.4436,0.0041,0.0013,...,0.0148,82.860199,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.860199


In [None]:
Descriptors = np.asarray(Descriptors).astype(np.float32)  # Performs the normalization
layer = tf.keras.layers.Normalization(axis=None)
layer.adapt(Descriptors)
Normalised_Data=layer(Descriptors)
Descriptors = pd.DataFrame(Normalised_Data)

In [86]:
dataframes = [Descriptors, Class]              # merge the class variable column with the normalized descriptors columns
modified_dataset = pd.concat(dataframes, axis=1)
modified_dataset.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,465,466,467,468,469,470,471,472,473,Product Quality
0,3030.929932,2564.0,2187.733398,1411.126465,1.3602,97.613297,0.1242,1.5005,0.0162,-0.0034,...,72.288902,0.5005,0.0118,0.0035,2.363,0.0205,0.0148,0.0046,71.900497,1
1,3095.780029,2465.139893,2230.422119,1463.660645,0.8294,102.3433,0.1247,1.4966,-0.0005,-0.0148,...,208.204498,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.204498,1
2,2932.610107,2559.939941,2186.411133,1698.017212,1.5102,95.487801,0.1241,1.4436,0.0041,0.0013,...,82.860199,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.860199,0


#### Model Building

Training the auto-encoder network (Funcitonal API Model), to use the bottleneck layer as the reduced dimentionality for prediction engine

In [64]:
Guiding_layer = Input(shape =(Descriptors.shape[1], ))

en_layer1 = Dense(250,activation ='tanh',activity_regularizer = regularizers.l1(0.01))(Guiding_layer)
en_Layer2 = Dense(125,activation ='tanh',activity_regularizer = regularizers.l1(0.01))(en_layer1)
en_Layer3 = Dense(83,activation ='tanh',activity_regularizer = regularizers.l1(0.01))(en_Layer2)
en_Layer4 = Dense(32,activation ='tanh',activity_regularizer = regularizers.l1(0.01))(en_Layer3)

Bottleneck_layer = Dense(15, activation ='relu')(en_Layer4)       # Compressed dimentionality
  

de_layer1 = Dense(32,activation ='tanh')(Bottleneck_layer)
de_layer2 = Dense(83,activation ='tanh')(de_layer1)
de_layer3 = Dense(125,activation ='tanh')(de_layer2)
de_layer4 = Dense(250,activation ='tanh')(de_layer3)
  
reconstructed_layer = Dense(Descriptors.shape[1], activation ='relu')(de_layer4)

In [None]:
autoencoder = Model(Guiding_layer, reconstructed_layer)
autoencoder.compile(optimizer ="adadelta", loss ="mse")
autoencoder.fit(Descriptors, Descriptors, 
                batch_size = 40, epochs = 12, 
                shuffle = True, validation_split = 0.20)

Dimensionlaity reduction using the encoder section(Sequencial API)

In [69]:
Descriptors_Compressor = Sequential()
Descriptors_Compressor.add(autoencoder.layers[0])
Descriptors_Compressor.add(autoencoder.layers[1])
Descriptors_Compressor.add(autoencoder.layers[2])
Descriptors_Compressor.add(autoencoder.layers[3])
Descriptors_Compressor.add(autoencoder.layers[4])  
Descriptors_Compressor.add(autoencoder.layers[5]) # Bottlenecklayer

In [70]:
Encoded_Descriptors = Descriptors_Compressor.predict(Descriptors)
Encoded_Descriptors = pd.DataFrame(Encoded_Descriptors)



Now that the dataframe for SVM prediction has 15 descriptors(down from 500) and one label

In [72]:
joinedframes = [Encoded_Descriptors, Class]
Compressed_Data = pd.concat(joinedframes, axis=1)
Compressed_Data.head(1)     

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,Product Quality
0,0.52613,0.577982,0.101263,0.003088,0.0,0.375904,0.0,0.0,0.260246,0.0,0.0,0.0,0.0,0.013958,0.09806,1


SVM Classifier

In [None]:
X_train, X_test, y_train, y_test = train_test_split(Encoded_Descriptors, Class, test_size = 0.2)  # Train-Test Split
parameter_range = {'C': [0.001,0.004,0.1,0.5,1], 'gamma': [10,5,2,1],'kernel': ['rbf', 'poly','linear','sigmoid']} # Hyper-tuning to identify the best parameters
tuned_model = GridSearchCV(SVC(),parameter_range,refit=True,verbose=2)
tuned_model.fit(X_train,y_train)

In [74]:
print(tuned_model.best_params_)

{'C': 0.001, 'gamma': 5, 'kernel': 'rbf'}


In [75]:
predicted_label = tuned_model.predict(X_test)
print('Accuracy : '+str(accuracy_score(y_test, predicted_label)))

Accuracy : 0.9394904458598726


### THE END