# Load data

In [1]:
from dataloader import DataLoader

dataloader = DataLoader('/data0/semi-conductor-image-classification-first', 224, 224, rgb=True)
train_ratio = 1
data = dataloader.get_data(train_ratio)
print('train data:', data['train_data'].shape)
print('validation data:', data['validation_data'].shape)
print('test data:', data['test_data'].shape)
print('test image paths:', len(data['test_list']))
print('train labels:', data['train_labels'].shape)
print('validation labels:', data['validation_labels'].shape)

number of good images: 27000  number of bad images: 3000  number of test images: 3000
train data: (30000, 224, 224, 3)
validation data: (0, 224, 224, 3)
test data: (3000, 224, 224, 3)
test image paths: 3000
train labels: (30000,)
validation labels: (0,)


In [2]:
print(data['test_list'])

['86495424f0b40c9395c3a27eda3324eb', 'd2d1c0f6b6130ef59d78322698b11d40', '737e64dd90ae3b4c53b00e0a80916a79', '592e45cd940b8d6d595a052dd308492c', '6bd0a1cc9fe127f30894c560aafe3181', 'f130f58bc711b127537ced009f113e38', '0ffd699d6e3f54d2aef02cc48ad70583', 'a2b223ff5d2c434025daab5dbdbdab39', '857e7d9b45791250f4ee79cbc5dfe047', '5c74d2b54c330c26c521d5dabaaf7558', 'f1c73b9672dc7594a881ab6ba7e5ea4a', '8305c7bb4ec5aa7c7793278e90d4b73c', 'ce4f8a0648e8084c77b9fdbe274c3be5', 'b2aa1d62b3bfc436ec08020045b150f4', '7f376a2ae30a95babaa10e0f0afe3ddb', '0b72db0d7f3f9a9151f8c4a511b9c458', '034f7f9ffec4e5b91c89c3b0bf429932', '055c30977e3b90dab29014eef67a2054', 'c39c7988276e95e5110d563fd079402a', '152de8d932b722f9510fc3f2a7c2ac0a', '7d54095838f67c7b2a66196e8d7cdc21', 'ef1c49ec1e9eb68cdae2c49d17da694b', '15a13f7bd9c0dc98fbc29a3934cce528', 'e085522a88a07e6555c245862b2d3e41', 'a7c3295def09063ace43623262fcebaa', 'fd4cd6fb6c6566672bedb217b1ef0a96', 'fe89d953aa9286f06d3cda640f59c732', '21dbba138fe56dfe6b65fcf194

# Extract features using resnet18

In [2]:
import torchvision.models as models
import torch
import numpy as np
import math
from sklearn.cluster import KMeans

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# data_train = data['train_data'].reshape((-1,1,224,224))
data_train = np.transpose(data['train_data'], (0,3,1,2))
num_train = data_train.shape[0]
print('number of training data', num_train, ', shape of training data:', data_train.shape)

number of training data 30000 , shape of training data: (30000, 3, 224, 224)


In [3]:
batch_size = 256
resnet18 = models.resnet18(pretrained=True, progress=True)
resnet18.to(device)

for param in resnet18.parameters():
    param.requires_grad = False

# total squared error if cluster label 0 -> true label 0
total_se00 = 0
# if cluster label 0 -> true label 1
total_se01 = 0

X = np.array([])

for i in range(math.ceil(float(num_train) / float(batch_size))):
    batch = torch.from_numpy(data_train[batch_size*i : batch_size*(i+1)]).float().to(device)
    labels = data['train_labels'][batch_size*i : batch_size*(i+1)]
    # print(batch.shape)
    features = resnet18(batch)
    # print(features.shape)
    
    if X.shape[0] == 0:
        X = features.cpu().numpy()
    else:
        X = np.concatenate((X, features.cpu().numpy()))

print(X.shape)


(30000, 1000)


In [4]:
# split to k folds
from sklearn.model_selection import KFold

kf = KFold(n_splits=10)
kf.get_n_splits(X)
print(kf)

KFold(n_splits=10, random_state=None, shuffle=False)


In [5]:
test_data = np.array([])

data_test = np.transpose(data['test_data'], (0,3,1,2))
num_test = data_test.shape[0]

for i in range(math.ceil(float(num_test) / float(batch_size))):
    batch = torch.from_numpy(data_test[batch_size*i : batch_size*(i+1)]).float().to(device)
    features = resnet18(batch)
    
    if test_data.shape[0] == 0:
        test_data = features.cpu().numpy()
    else:
        test_data = np.concatenate((test_data, features.cpu().numpy()))

print(test_data.shape)

(3000, 1000)


In [6]:
y = data['train_labels']
print(y.shape)

(30000,)


# K-Means

In [19]:
from sklearn.metrics import roc_auc_score

train_accs = list()
test_accs = list()
auc_scores = list()

kmeans = KMeans(n_clusters=2, random_state=0, max_iter=1000)
# reverse = False
for i, (train_index, test_index) in enumerate(kf.split(X)):
#     print("TRAIN:", train_index, len(train_index), "TEST:", test_index, len(test_index))
    X_trainset, X_testset = X[train_index], X[test_index]
    y_trainset, y_testset = y[train_index], y[test_index]
    
    kmeans.fit(X_trainset)
    X_predset = 1-kmeans.labels_
    train_acc = np.sum(abs( 1-X_predset - y_trainset )) / len(X_trainset)

    y_predset = 1-kmeans.predict(X_testset)  
    test_acc = np.sum(abs( 1-y_predset - y_testset )) / len(X_testset)
    
    train_accs.append(train_acc)
    test_accs.append(test_acc)
    
    auc_score = roc_auc_score(y_testset, y_predset)
    auc_scores.append(auc_score)
    
    print('fold {0}/10: train accuracy: {1}, test accuracy: {2}, auc score: {3}.'.format(i, train_acc, test_acc, auc_score))
    
print('average train accuracy: {0}, test accuracy: {1}, auc score: {2}'.format(sum(train_accs)/len(train_accs), sum(test_accs)/len(test_accs), sum(auc_scores)/len(auc_scores)))

fold 0/10: train accuracy: 0.5060370370370371, test accuracy: 0.507, auc score: 0.49084000258778493.
fold 1/10: train accuracy: 0.5048518518518519, test accuracy: 0.5126666666666667, auc score: 0.49348812569966416.
fold 2/10: train accuracy: 0.4968148148148148, test accuracy: 0.487, auc score: 0.5039492430291359.
fold 3/10: train accuracy: 0.4902962962962963, test accuracy: 0.49333333333333335, auc score: 0.4941484839389725.
fold 4/10: train accuracy: 0.5077777777777778, test accuracy: 0.49933333333333335, auc score: 0.48334994187012126.
fold 5/10: train accuracy: 0.492, test accuracy: 0.49833333333333335, auc score: 0.5197813321764412.
fold 6/10: train accuracy: 0.5075185185185185, test accuracy: 0.5136666666666667, auc score: 0.5126628210912354.
fold 7/10: train accuracy: 0.4922222222222222, test accuracy: 0.4856666666666667, auc score: 0.48318641443194815.
fold 8/10: train accuracy: 0.49625925925925923, test accuracy: 0.498, auc score: 0.5219371090073631.
fold 9/10: train accuracy: 

In [20]:
from dataloader import save_csv

kmeans_pred = 1-kmeans.predict(test_data)

save_csv('/data0/semi-conductor-image-classification-first/sample_submission.csv', '/data0/semi-conductor-image-classification-first/kmeans_prediction.csv', data['test_list'], kmeans_pred)

Processed 3001 lines.


# One-class SVM

In [21]:
from sklearn.svm import OneClassSVM

train_accs = list()
test_accs = list()
auc_scores = list()

svm = OneClassSVM(gamma='auto')
# reverse = False
for i, (train_index, test_index) in enumerate(kf.split(X)):
#     print("TRAIN:", train_index, len(train_index), "TEST:", test_index, len(test_index))
    X_trainset, X_testset = X[train_index], X[test_index]
    y_trainset, y_testset = y[train_index], y[test_index]
    
    svm.fit(X_trainset)
    
    X_predset = (1-svm.predict(X_trainset)) / 2
    train_acc = np.sum(abs( 1-X_predset - y_trainset )) / len(X_trainset)

    y_predset = (1-svm.predict(X_testset)) / 2  
    test_acc = np.sum(abs( 1-y_predset - y_testset )) / len(X_testset)
    
    train_accs.append(train_acc)
    test_accs.append(test_acc)
    
    auc_score = roc_auc_score(y_testset, y_predset)
    auc_scores.append(auc_score)
    
    print('fold {0}/10: train accuracy: {1}, test accuracy: {2}, auc score: {3}.'.format(i, train_acc, test_acc, auc_score))
    
print('average train accuracy: {0}, test accuracy: {1}, auc score: {2}'.format(sum(train_accs)/len(train_accs), sum(test_accs)/len(test_accs), sum(auc_scores)/len(auc_scores)))

fold 0/10: train accuracy: 0.522074074074074, test accuracy: 0.5166666666666667, auc score: 0.5437354674947578.
fold 1/10: train accuracy: 0.5206296296296297, test accuracy: 0.5313333333333333, auc score: 0.5775527746681592.
fold 2/10: train accuracy: 0.5217407407407407, test accuracy: 0.526, auc score: 0.5636106022889171.
fold 3/10: train accuracy: 0.521074074074074, test accuracy: 0.5186666666666667, auc score: 0.5637293295658506.
fold 4/10: train accuracy: 0.5221851851851852, test accuracy: 0.5043333333333333, auc score: 0.5431199136356086.
fold 5/10: train accuracy: 0.5215925925925926, test accuracy: 0.514, auc score: 0.551460766517108.
fold 6/10: train accuracy: 0.5228518518518519, test accuracy: 0.512, auc score: 0.5362654535646162.
fold 7/10: train accuracy: 0.5227037037037037, test accuracy: 0.518, auc score: 0.5399120475266848.
fold 8/10: train accuracy: 0.5195185185185185, test accuracy: 0.5323333333333333, auc score: 0.5981495321928806.
fold 9/10: train accuracy: 0.520259259

In [22]:
svm_pred = (1-svm.predict(test_data)) / 2
save_csv('/data0/semi-conductor-image-classification-first/sample_submission.csv', '/data0/semi-conductor-image-classification-first/svm_prediction.csv', data['test_list'], svm_pred)

Processed 3001 lines.


# Isolation Forest

In [15]:
from sklearn.ensemble import IsolationForest

In [17]:
iso_forest = IsolationForest(random_state=0).fit(X)

train_accs = list()
test_accs = list()
auc_scores = list()

iso_forest = IsolationForest(random_state=0)
# reverse = False
for i, (train_index, test_index) in enumerate(kf.split(X)):
    X_trainset, X_testset = X[train_index], X[test_index]
    y_trainset, y_testset = y[train_index], y[test_index]
    
    iso_forest.fit(X_trainset)
    
    X_predset = (1 - iso_forest.predict(X_trainset)) / 2
    train_acc = np.sum(abs((1-X_predset) - y_trainset)) / len(X_trainset)
    
    y_predset = (1 - iso_forest.predict(X_testset)) / 2
    test_acc = np.sum(abs((1-y_predset) - y_testset )) / len(X_testset)
    
    train_accs.append(train_acc)
    test_accs.append(test_acc)
    
    auc_score = roc_auc_score(y_testset, y_predset)
    auc_scores.append(auc_score)
    
    print('fold {0}/10: train accuracy: {1}, test accuracy: {2}, auc score: {3}.'.format(i, train_acc, test_acc, auc_score))
    
print('average train accuracy: {0}, test accuracy: {1}, auc score: {2}'.format(sum(train_accs)/len(train_accs), sum(test_accs)/len(test_accs), sum(auc_scores)/len(auc_scores)))



fold 0/10: train accuracy: 0.8292222222222222, test accuracy: 0.83, auc score: 0.5316604065105623.




fold 1/10: train accuracy: 0.8278518518518518, test accuracy: 0.8306666666666667, auc score: 0.5299956021109868.




fold 2/10: train accuracy: 0.8288888888888889, test accuracy: 0.8303333333333334, auc score: 0.5186268063187889.




fold 3/10: train accuracy: 0.8287777777777777, test accuracy: 0.8223333333333334, auc score: 0.5369183011889572.




fold 4/10: train accuracy: 0.8296296296296296, test accuracy: 0.8176666666666667, auc score: 0.5193697060288989.




fold 5/10: train accuracy: 0.8270740740740741, test accuracy: 0.8333333333333334, auc score: 0.5289051767114582.




fold 6/10: train accuracy: 0.831, test accuracy: 0.8333333333333334, auc score: 0.5161142835025297.




fold 7/10: train accuracy: 0.8261111111111111, test accuracy: 0.8346666666666667, auc score: 0.5051871277034382.




fold 8/10: train accuracy: 0.8286666666666667, test accuracy: 0.8293333333333334, auc score: 0.5394729557659304.




fold 9/10: train accuracy: 0.8272962962962963, test accuracy: 0.8313333333333334, auc score: 0.5287727440890594.
average train accuracy: 0.8284518518518518, test accuracy: 0.8293000000000001, auc score: 0.525502310993061


In [18]:
forest_pred = (1 - iso_forest.predict(test_data)) / 2
save_csv('/data0/semi-conductor-image-classification-first/sample_submission.csv', '/data0/semi-conductor-image-classification-first/isolation_forest_prediction.csv', data['test_list'], forest_pred)



Processed 3001 lines.
