# Load data

In [1]:
from dataloader import DataLoader

In [2]:
dataloader = DataLoader('/data0/semi-conductor-image-classification-first')
train_ratio = 1
data = dataloader.get_data(train_ratio)
print('train data:', data['train_data'].shape)
print('validation data:', data['validation_data'].shape)
print('test data:', data['test_data'].shape)
print('train labels:', data['train_labels'].shape)
print('validation labels:', data['validation_labels'].shape)

number of good images: 27000  number of bad images: 3000  number of test images: 3000
train data: (30000, 224, 224, 3)
validation data: (0, 224, 224, 3)
test data: (3000, 224, 224, 3)
train labels: (30000,)
validation labels: (0,)


# K-means

In [17]:
import torchvision.models as models
import torch
import numpy as np
import math
from sklearn.cluster import KMeans

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
data_train = np.transpose(data['train_data'], (0,3,1,2))

batch_size = 256
num_train = data_train.shape[0]

In [22]:
resnet18 = models.resnet18(pretrained=True, progress=True)
resnet18.to(device)

for param in resnet18.parameters():
    param.requires_grad = False

# total squared error if cluster label 0 -> true label 0
total_se00 = 0
# if cluster label 0 -> true label 1
total_se01 = 0

X = np.array([])

for i in range(math.ceil(float(num_train) / float(batch_size))):
    batch = torch.from_numpy(data_train[batch_size*i : batch_size*(i+1)]).float().to(device)
    labels = data['train_labels'][batch_size*i : batch_size*(i+1)]
    # print(batch.shape)
    features = resnet18(batch)
    # print(features.shape)
    
    if X.shape[0] == 0:
        X = features.cpu().numpy()
    else:
        X = np.concatenate((X, features.cpu().numpy()))
    # X = features.cpu().numpy()
    # kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
    
    # total_se00 += np.sum((kmeans.labels_ - labels)**2)
    # total_se01 += np.sum(((1-kmeans.labels_) - labels)**2)

print(X.shape)


(30000, 1000)
0.49296666666666666 0.5070333333333333 0.49296666666666666


In [23]:
kmeans = KMeans(n_clusters=2, random_state=0, max_iter=1000).fit(X)

total_se00 = np.sum((kmeans.labels_ - data['train_labels'])**2)
total_se01 = np.sum(((1-kmeans.labels_) - data['train_labels'])**2)

mse00 = total_se00 / num_train
mse01 = total_se01 / num_train
mse = min(mse00, mse01)
print(mse00, mse01, mse)

0.49296666666666666 0.5070333333333333 0.49296666666666666


# SVM

In [25]:
from sklearn.svm import OneClassSVM
clf = OneClassSVM(gamma='auto').fit(X)
pred_svm = clf.predict(X)


In [33]:
# total squared error if cluster label 0 -> true label 0
# if cluster label 0 -> true label 1
pred_svm = (pred_svm + 1)/2
svm_se00 = np.sum((pred_svm - data['train_labels'])**2)
svm_se01 = np.sum(((1-pred_svm) - data['train_labels'])**2)

svm_mse00 = svm_se00 / num_train
svm_mse01 = svm_se01 / num_train
svm_mse = min(svm_mse00, svm_mse01)
print(svm_mse00, svm_mse01, svm_mse)

0.521 0.479 0.479


In [34]:
accuracy = pred_svm == data['train_labels']
print(pred_svm)
print(data['train_labels'])
accuracy = np.sum(accuracy) / num_train
print(accuracy)


[1. 1. 1. ... 0. 1. 1.]
[0 0 0 ... 0 0 0]
0.479
