In [1]:
import numpy as np
import pandas as pd 
import cv2
import os 
import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
import time

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Functions

In [3]:
def reshape(diff):
  meanA = diff.mean(axis =3)
  N,m,n = meanA.shape
  return meanA.reshape((N,m*n))

In [4]:
def predict_one_model_one_batch(sequence_part,model):
    y_sample = model.predict(sequence_part[0])
    y_template = model.predict(sequence_part[1])
    diff = np.asarray(y_sample - y_template)
    diff= reshape(diff)
    return diff

In [5]:
def predict_batches_diff(sequence,model1,model2,model3,model4,model5): 
  for i in range(len(sequence)): 
    diff1 = predict_one_model_one_batch(sequence[i],model1)
    diff2 = predict_one_model_one_batch(sequence[i],model2)
    diff3 = predict_one_model_one_batch(sequence[i],model3)
    diff4 = predict_one_model_one_batch(sequence[i],model4)
    diff5 = predict_one_model_one_batch(sequence[i],model5)
    a= np.append(diff1,diff2,axis= 1)
    b= np.append(diff3,diff4,axis= 1)
    c= np.append(b,diff5,axis= 1)
    diff= np.append(a,c,axis= 1)
    sequence.change_batch_diff(diff,i)

In [6]:
def predict_5layers(seuqence,model):
  model_S1 = keras.models.Model(model.input,model.layers[0].output)
  model_S2 = keras.models.Model(model.input,model.layers[3].output)
  model_S3 = keras.models.Model(model.input,model.layers[6].output)
  model_S4 = keras.models.Model(model.input,model.layers[8].output)
  model_S5 = model
  sequence = predict_batches_diff(seuqence,model_S1,model_S2,model_S3,model_S4,model_S5)


In [7]:
#from keras.utils import Sequence
from tensorflow.keras.utils import Sequence
import math
class DataSequence(Sequence):
    """
    Keras Sequence object to train a model on larger-than-memory data.
    """
    def __init__(self, df, batch_size):
        #self.df = df # your pandas dataframe
        self.bsz = batch_size # batch size
        self.diff= np.zeros((len(df),4024))
        # Take labels and a list of image locations in memory
        self.labels = df['label'].values
        self.sample = df["sample"].tolist()
        self.template = df["template"].tolist()
        self.sample_name = df["sample_name"].tolist()
    def __len__(self):
        # compute number of batches to yield
        return int(math.ceil(len(self.diff) / float(self.bsz)))

    def on_epoch_end(self):
        # Shuffles indexes after each epoch if in training mode
        self.indexes = range(len(self.labels))

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return self.labels[idx * self.bsz: (idx + 1) * self.bsz]

    def get_batch_features(self, idx):
        # Fetch a batch of inputs
        return np.array([im for im in self.sample[idx * self.bsz: (1 + idx) * self.bsz]]),np.array([im for im in self.template[idx * self.bsz: (1 + idx) * self.bsz]])
    
    def change_batch_diff(self,new_diff,idx):
        self.diff[idx * self.bsz: (1 + idx) * self.bsz,:]  = new_diff

    def get_batch_diff(self,idx): 
        return self.diff[idx * self.bsz: (1 + idx) * self.bsz]

    def __getitem__(self, idx):
        batch_s, batch_t = self.get_batch_features(idx)
        batch_y = self.get_batch_labels(idx)
        batch_diff = self.get_batch_diff(idx)
        return batch_s,batch_t, batch_y,batch_diff

In [8]:
def match_images(template,sample_0,sample_1) : 
  #test 
    mismatched = sample_0.merge(template, how='inner', on='die_iml')
    mismatched ['label'] = 0
    matched = sample_1.merge(template, how='inner', on='die_iml')
    matched['label']= 1
    df = matched.append(mismatched)#.reset_index()
    return df

In [9]:
def create_matched_missmatched (df_1_template,df_0,df_1_sample):
    # divide df_0 and df_1 sample to test and train 
    df_0_train,df_0_test = train_test_split(df_0, test_size= 0.2, random_state=13)
    df_1_train,df_1_test = train_test_split(df_1_sample, test_size= 0.2, random_state=13)

    # match tables according to die_iml
    df_train = match_images(df_1_template,df_0_train,df_1_train)
    df_train = df_train.sample(frac=1).reset_index(drop=True)
    
    df_test = match_images(df_1_template,df_0_test,df_1_test)
    return df_train,df_test

In [10]:
size = 224
model = keras.models.Sequential([
    keras.layers.Conv2D(filters=64, kernel_size=(11,11), strides=(4,4), activation='relu', input_shape=(size,size,1)),
    keras.layers.BatchNormalization(),
    keras.layers.MaxPool2D(pool_size=(3,3), strides=(2,2)),
    keras.layers.Conv2D(filters=192, kernel_size=(5,5), strides=(1,1),padding="same",  activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.MaxPool2D(pool_size=(3,3), strides=(2,2)),
    keras.layers.Conv2D(filters=384, kernel_size=(3,3), strides=(1,1), activation='relu', padding="same"),
    keras.layers.BatchNormalization(),
    keras.layers.Conv2D(filters=256, kernel_size=(3,3), strides=(1,1), activation='relu', padding="same"),
    keras.layers.BatchNormalization(),
    keras.layers.Conv2D(filters=256, kernel_size=(3,3), strides=(1,1), activation='relu', padding="same"),
    keras.layers.BatchNormalization(),
])

In [11]:
def generate_test_dataset(path_file,df,size):     
    images = []
    dies_imls = []
    noms = []
    dies = []
    dataset = []
    for idx,row in df.iterrows():
                nom_img = row.images
                img = os.path.join(path_file,nom_img) 
                image = cv2.imread(img,cv2.IMREAD_GRAYSCALE)/255
                image = cv2.resize(image,(size,size))
                die_iml = nom_img[18],nom_img[20]
                die = nom_img[18]
                images.append(image)
                dies_imls.append(die_iml)
                noms.append(nom_img)
                dies.append(die)

                if idx%500==0 : 
                   print(idx)
                   
    dataset.append(images)
    dataset.append(noms)
    dataset.append(dies_imls)
    dataset.append(dies)
    df_test = pd.DataFrame(dataset).T
    df_test = df_test.rename(columns={0: "sample",1:"sample_name", 2: "die_iml",3: 'die'})
    df_test ['label'] = df['labels'].astype(int)
    return df_test

# Generating df_0 df_1
This section is dedicated to create df_0 and df_1 dataframes that carry images and informations of deffected and non deffected cards from all dies and that for the train set. These dataframes where generated only once though this project and have been saved as pickle files. If data is already generated, move directly to section "load pickle files".

In [23]:
!unrar x "/content/drive/MyDrive/PFE_2/Data_challenge.rar"



UNRAR 5.50 freeware      Copyright (c) 1993-2017 Alexander Roshal


Extracting from /content/drive/MyDrive/PFE_2/Data_challenge.rar

Creating    Data_challenge                                            OK
Extracting  Data_challenge/Public_Indexes.csv                              0%  OK 
Extracting  Data_challenge/readmefile.txt                                  0%  OK 
Creating    Data_challenge/x_test                                     OK
Extracting  Data_challenge/x_test/AE00008_080949_00_2_2_2001.jpg           0%  OK 
Extracting  Data_challenge/x_test/AE00008_091811_00_2_1_2001.jpg           0%  OK 
Extracting  Data_challenge/x_test/AE00008_113034_00_4_2_2001.jpg           0%  OK 
Extracting  Data_challenge/x_test/AE00018_000155_00_2_4_2001.jpg           0%  OK 
Extracting  Data_challenge/x_test/AE00018_022802_00_1_1_2001.jpg           0%  OK 
Extracting  Data_challenge/x_test/AE00018_023038_00_1_2_2001.jpg         

In [None]:
path = "/content/Data_challenge"
train_file = os.path.join(path,"x_train")
test_file = os.path.join(path, "x_test")

In [None]:
def generate_class_dataset(path_file,file,size):     
    classe = os.path.join(path_file,file)
    images = []
    dies_imls = []
    noms = []
    dies = [] 
    dataset = []
    i=0
    print("classe:",file)
    for nom_img in os.listdir(classe):
                img = os.path.join(classe,nom_img)
                image = cv2.imread(img,cv2.IMREAD_GRAYSCALE)/255
                image = cv2.resize(image,(size,size))
                die_iml = nom_img[18],nom_img[20]
                die = nom_img[18]
                images.append(image)
                dies_imls.append(die_iml)
                noms.append(nom_img)
                dies.append(die)
                i = i+1
                if i %1000 ==0 : 
                   print(i)
    
    dataset.append(images)
    dataset.append(noms)
    dataset.append(dies_imls)
    dataset.append(dies)
    
    print("generation for classe done")
    return dataset 


In [None]:
124*2

248

In [None]:
class_0_dataset = generate_class_dataset(train_file,'1',224)
class_1_dataset = generate_class_dataset(train_file,'2',224)

classe: 1
1000
2000
3000
4000
generation for classe done
classe: 2
1000
2000
3000
4000
5000
6000
generation for classe done


In [None]:
#prep defective cards' list of names according to die_iml ( samples)
df_0 = pd.DataFrame(class_0_dataset).T
df_0 = df_0.rename(columns={0: "sample",1:"sample_name", 2: "die_iml",3: 'die'})

#prep non defective cards' list of names according to die_iml
df_1 = pd.DataFrame(class_1_dataset).T
# divide to samples and templates 
df_1_sample, df_1_template = train_test_split(df_1, test_size=0.5,random_state=13)
df_1_template = df_1_template.rename(columns={0:"template", 1: "template_name", 2: "die_iml",3:'die'})
#df_1_template = df_1_template.drop(columns=3)
df_1_sample = df_1_sample.rename(columns={0: "sample",1:"sample_name", 2: "die_iml",3: 'die'})

In [None]:
#save
with open('/content/drive/MyDrive/PFE_2/df_0_224.pkl', 'wb') as f:
  pickle.dump(df_0, f)
with open('/content/drive/MyDrive/PFE_2/df_1_sample_224.pkl', 'wb') as f:
  pickle.dump(df_1_sample, f)
with open('/content/drive/MyDrive/PFE_2/df_1_template_224.pkl', 'wb') as f:
  pickle.dump(df_1_template, f)

# Load pickle files of df_0 and df_1

In [12]:
import pickle
#load
df_0 = pd.read_pickle("/content/drive/MyDrive/PFE_2/df_0_224.pkl")

df_1_sample = pd.read_pickle("/content/drive/MyDrive/PFE_2/df_1_sample_224.pkl")

In [13]:
import pickle
#load
df_1_template = pd.read_pickle("/content/drive/MyDrive/PFE_2/df_1_template_224.pkl")

In [19]:
df_1_sample.groupby('die').count()

Unnamed: 0_level_0,sample,sample_name,die_iml
die,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,1225,1225,1225


In [17]:
df_0.groupby('die').count()

Unnamed: 0_level_0,sample,sample_name,die_iml
die,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,392,392,392


# Select Die 2

In [16]:
df_1_sample = df_1_sample[df_1_sample["die"]=='2']
df_0 = df_0[df_0["die"]=='2']
df_1_template  = df_1_template[df_1_template["die"]=='2']

In [20]:
df_1_template =df_1_template.sample(n=300,random_state=2)
df_1_sample =df_1_sample.sample(n=300,random_state=2)
df_0 =df_0.sample(n=300,random_state=2)

In [None]:
df_1_template.groupby('die_iml').count()

Unnamed: 0_level_0,template,template_name,die
die_iml,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(2, 1)",105,105,105
"(2, 2)",118,118,118
"(2, 3)",95,95,95
"(2, 4)",82,82,82


In [None]:
del df_1_templateq

In [21]:
df_1_template = df_1_template.drop(columns='die')

#Die 2 models and results

In [22]:
df_train,df_validation = create_matched_missmatched (df_1_template,df_0,df_1_sample)

In [23]:
sequence_train = DataSequence(df_train,164 )
sequence_validation = DataSequence(df_validation, 164)

In [24]:
del df_train,df_validation

In [None]:
del df_1_template,df_1_sample,df_0

In [25]:
predict_5layers(sequence_train,model)


 In case of system failure, you can run the following function and re-run the predict_5layers function. Pay attention to the batch size (124 or 256 )

In [None]:
btsize = 164
def predict_batches_diff(sequence,model1,model2,model3,model4,model5): 
  zeros = np.zeros(4024)
  for i in range(len(sequence)): 
    if all(sequence.diff[i*btsize] == zeros):
        diff1 = predict_one_model_one_batch(sequence[i],model1)
        diff2 = predict_one_model_one_batch(sequence[i],model2)
        diff3 = predict_one_model_one_batch(sequence[i],model3)
        diff4 = predict_one_model_one_batch(sequence[i],model4)
        diff5 = predict_one_model_one_batch(sequence[i],model5)
        a= np.append(diff1,diff2,axis= 1)
        b= np.append(diff3,diff4,axis= 1)
        c= np.append(b,diff5,axis= 1)
        diff= np.append(a,c,axis= 1)
        sequence.change_batch_diff(diff,i)

In [26]:
# This function is responsible for the Multi-layer feature fusion : create sample and template features and perform the simmilarity measure.
predict_5layers(sequence_validation,model)



## Fully Connected block 

In [27]:
model_FC = keras.Sequential(
    [
        keras.layers.Dense(512, activation="relu", name="layer1"),
        keras.layers.Dense(128, activation="relu", name="layer2"),
        keras.layers.Dense(2,  activation="softmax", name="layer3"),
    ]
)

In [28]:
from tensorflow.keras.utils import to_categorical
y= to_categorical(sequence_train.labels)

In [29]:
model_FC.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model_FC.fit(sequence_train.diff,to_categorical(sequence_train.labels),
    batch_size=164,
    epochs=50,
    validation_data=(sequence_validation.diff,to_categorical(sequence_validation.labels)),
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [42]:
# save model
model_FC.save('/content/drive/MyDrive/PFE_2/model_FC_die2_224.h5')
model_FC.save('model_FC_die2_224.h5')

In [30]:
import sklearn.metrics as metrics
dx = pd.DataFrame(sequence_validation.sample_name,columns=['sample_name'])
dx ['label'] = sequence_validation.labels
dx['prediction'] = model_FC.predict(sequence_validation.diff).argmax(axis=1)
x= dx.groupby('sample_name')[['label','prediction']].mean()
x['prediction']= x['prediction'].astype(int)
matrix = metrics.confusion_matrix(x['label'], x['prediction'])

In [31]:
print("test")
[[tn,fp],[fn,tp]]=matrix
l=100
print("confusion matrix: \n ",matrix)
print("accuracy = ",(tn+tp)/matrix.sum())
print("\n precision =  ",(tp)/(tp+fp))
print("\n recall =  ",(tp)/(tp+fn))
print("\n valeo score=  ",(fn+l*fp)/matrix.sum())

test
confusion matrix: 
  [[58  2]
 [ 6 54]]
accuracy =  0.9333333333333333

 precision =   0.9642857142857143

 recall =   0.9

 valeo score=   1.7166666666666666


# Die 2 using test set 

In [17]:
from keras.models import load_model
from tensorflow.keras.utils import to_categorical
import sklearn.metrics as metrics

model_FC =load_model('/content/drive/MyDrive/PFE_2/model_FC_die2_224.h5')

In [33]:
model_FC.load_weights("/content/drive/MyDrive/PFE_2/model_FC_die2_224.h5")

## Generate test set 
this subsection is responsible for the generation of test set for a given image resize. This operation is only realised once during this project. The resulted dataframe is saved as pickle file. If you have already generated test dataframe, move directly to the next subsection.

In [None]:
path = "/content/Data_challenge"
train_file = os.path.join(path,"x_train")
test_file = os.path.join(path, "x_test")

In [21]:
y_benchmark = pd.read_csv('Y_Benchmark.csv')

In [22]:
df_test = generate_test_dataset("/content/Data_challenge/x_test",y_benchmark,224)

TypeError: ignored

In [None]:
#with open('/content/drive/MyDrive/PFE_2/df_test_224.pkl', 'wb') as f:
#  pickle.dump(df_test, f)

## Die 2 test set results

In [43]:
#load picke
df_test = pd.read_pickle("/content/drive/MyDrive/PFE_2/df_test_224.pkl")

In [33]:
df_test = df_test[df_test["die"]=='2']

In [34]:
df_test_t = df_test.merge(df_1_template, how='inner', on='die_iml')

In [35]:
sequence_test = DataSequence(df_test_t, 164)

In [31]:
len(sequence_test)

331

In [36]:
predict_5layers(sequence_test,model)

In [37]:
test_accuracy = model_FC.evaluate(
    sequence_test.diff, to_categorical(sequence_test.labels)
)



In [38]:
dx = pd.DataFrame(sequence_test.sample_name,columns=['sample_name'])
dx ['label'] = sequence_test.labels
dx['prediction'] = model_FC.predict(sequence_test.diff).argmax(axis=1)
x= dx.groupby('sample_name')[['label','prediction']].mean()
x['prediction']= x['prediction'].astype(int)
matrix = metrics.confusion_matrix(x['label'], x['prediction'])

In [40]:
print("test")
[[tn,fp],[fn,tp]]=matrix
l=100
print("confusion matrix: \n ",matrix)
print("accuracy = ",(tn+tp)/matrix.sum())
print("\n precision =  ",(tp)/(tp+fp))
print("\n recall =  ",(tp)/(tp+fn))
print("\n valeo score=  ",(fn+l*fp)/matrix.sum())

test
confusion matrix: 
  [[ 69   4]
 [ 35 428]]
accuracy =  0.9272388059701493

 precision =   0.9907407407407407

 recall =   0.9244060475161987

 valeo score=   0.8115671641791045


In [41]:
df_test.groupby('label').count()

Unnamed: 0_level_0,sample,sample_name,die_iml,die
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,73,73,73,73
1,463,463,463,463


total results 

In [44]:
df_test.groupby('die').count()

Unnamed: 0_level_0,sample,sample_name,die_iml,label
die,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,490,490,490,490
2,536,536,536,536
3,576,576,576,576
4,386,386,386,386


In [45]:
len(df_test)

1988

In [46]:
accuracy_totale = (490*0.9951 + 536*0.9545 + 576*0.9908 + 386*0.9893)/1988

In [47]:
accuracy_totale

0.9817814889336015

In [48]:
valeo_score_total= (490*0.4122 + 536*0.8115 + 576*0.3593 + 386*0.0155)/1988

In [49]:
valeo_score_total

0.4275059356136821