In [None]:
from sklearn.datasets import fetch_openml
import sklearn
import csv
### Load the data ###
# as_frame=False asks fetch_openml for plain NumPy arrays. Recent scikit-learn
# versions otherwise return a pandas DataFrame/Series for this dataset, which
# breaks the positional slicing and reshape calls used later in the notebook.
mnist = fetch_openml('mnist_784', as_frame=False)
#print(sklearn.__version__)
#!pip install scikit-learn==0.24.2

In [None]:

### data 일부만 추출하는 과정 ###
def extract_data(dataset, name = 'mnist',ratio = 0.1):
    """Return the trailing ``ratio`` fraction of a dataset's samples and labels.

    Args:
        dataset: mapping with a 'data' entry (one sample per row) and a
            'target' entry (one label per sample).
        name: unused; kept so existing callers keep working.
        ratio: fraction of the rows, counted from the end, to keep.

    Returns:
        Tuple ``(test_data, test_label)`` of NumPy arrays holding the last
        ``ratio`` share of the rows and their labels.
    """
    # Imported locally: this function is called in a cell that runs before the
    # notebook's own ``import numpy as np``.
    import numpy as np

    # np.asarray makes the positional slices below work for lists and pandas
    # objects as well as for arrays.
    data_list = np.asarray(dataset['data'])
    label_list = np.asarray(dataset['target'])
    head_fraction = 1 - ratio
    test_data = data_list[int(data_list.shape[0]*head_fraction):]
    test_label = label_list[int(label_list.shape[0]*head_fraction):]
    return test_data, test_label

# Work on the default trailing slice of the dataset.
test_data, test_label = extract_data(mnist)

# Tally how many samples carry each label in that slice.
dic = {}
for label in test_label:
  dic[label] = dic.get(label, 0) + 1
print(dic)
print(len(test_data))

In [None]:
import numpy as np
import copy
import matplotlib.pyplot as plt
import sys

### 거리 메소드 ###
### 1번과 다르게 mahalanobis는 너무 오래걸려서 뺐다 ###
def euclidean(x,center):
    """Squared Euclidean distance from every row of ``x`` to ``center``.

    No square root is taken; the per-row sum of squared differences is
    returned as-is.
    """
    offsets = x - center
    squared = offsets**2
    return squared.sum(axis=1)
    
### 초기화 메소드 ###
### 1번과 동일하게 작성 ###
def random_initialize(k, data, dim):
    """Draw ``k`` random integer-valued centres inside the value range of ``data``.

    Args:
        k: number of centres.
        data: array whose global minimum/maximum bound the sampled values.
        dim: dimensionality of each centre.

    Returns:
        Integer array of shape ``(k, dim)`` -- one centre per row, the same
        layout ``eff_initialize`` produces and ``kmeans`` iterates over.
    """
    # randint needs integer bounds; floor/ceil keep the whole data range covered.
    low = int(np.floor(np.min(data)))
    high = int(np.ceil(np.max(data)))
    if high <= low:
        # Constant data: randint rejects an empty [low, high) range.
        high = low + 1
    return np.random.randint(low, high, size=(k, dim))

def eff_initialize(k, data, dim, label):
    """Pick ``k`` initial centres by repeatedly jumping to the farthest sample.

    The first centre is a random sample. Each following centre is the sample
    farthest (squared Euclidean) from the most recently chosen centre, but it
    is only accepted when neither the sample nor its label has been used yet.
    Rejected samples are collapsed onto the current centre so they stop
    winning the arg-max.

    Args:
        k: number of centres to pick.
        data: array of shape ``(dim, n_samples)``.
        dim: feature dimensionality.
        label: per-sample labels, indexable by sample position.

    Returns:
        Float array of shape ``(k, dim)``, one centre per row.

    Raises:
        ValueError: when no admissible sample is left, e.g. fewer than ``k``
            distinct labels (previously this looped forever).
    """
    label = np.asarray(label)        # positional indexing for any sequence type
    temp = np.array(data.T)          # private (n_samples, dim) copy; mutated below
    centers = np.zeros((k,dim))
    visit = set()
    label_visit = set()

    # Scalar draw: calling int() on a 1-element array is deprecated in NumPy.
    i = int(np.random.randint(0, temp.shape[0]))
    visit.add(i)
    centers[0] = temp[i]
    label_visit.add(label[i])
    while len(visit)<k:
        sq_dist = ((temp-temp[i])**2).sum(axis=1)
        _next = int(np.argmax(sq_dist))
        if _next not in visit and label[_next] not in label_visit:
            centers[len(visit)] = temp[_next]
            visit.add(_next)
            label_visit.add(label[_next])
            i = _next
        elif sq_dist[_next] == 0:
            # Every remaining sample already coincides with the current centre
            # and the best one was rejected: the state can no longer change.
            raise ValueError(
                'eff_initialize could only find {} of {} centers with distinct labels'.format(len(visit), k))
        else:
            temp[_next] = temp[i]
    return centers

### 1번과 동일하게 kmeans 작성 ###
def kmeans(k, dim, img, label, init='eff_initialize', distance_method = 'euclidean'):
    """Cluster samples with Lloyd's k-means.

    Args:
        k: number of clusters.
        dim: feature dimensionality; ``img`` is reshaped to ``(-1, dim)``.
        img: array-like holding the samples.
        label: per-sample labels; only forwarded to ``eff_initialize``.
        init: ``'eff_initialize'`` or ``'random_initialize'``.
        distance_method: only ``'euclidean'`` is available.

    Returns:
        ``(centers, outputs)`` where ``centers`` has shape ``(k, dim)`` and
        ``outputs`` maps each cluster index in ``range(k)`` to a one-element
        list holding the array of sample indices assigned to that cluster.

    Raises:
        ValueError: for an unknown ``init`` / ``distance_method`` or when the
            initialiser does not return a ``(k, dim)`` array.
    """
    if init not in ('random_initialize', 'eff_initialize'):
        raise ValueError('unknown init: {!r}'.format(init))
    if distance_method != 'euclidean':
        raise ValueError('unknown distance_method: {!r}'.format(distance_method))
    method = euclidean

    samples = np.asarray(img).reshape(-1,dim) ### -1, dim
    data = samples.T ### dim, -1

    ### initialize ###
    if init=='random_initialize':
        centers = random_initialize(k, data, dim)
    else:
        centers = eff_initialize(k, data, dim, label)
    centers = np.asarray(centers, dtype=float)
    if centers.shape != (k, dim):
        raise ValueError('initial centers have shape {}, expected {}'.format(centers.shape, (k, dim)))

    while True:
        # distances[c, n]: distance from center c to sample n
        distances = np.array([method(samples, center) for center in centers])
        candidates = np.argmin(distances,axis=0)

        outputs = {}
        new_centers = centers.copy()
        for _k in range(k):
            members = np.where(candidates==_k)[0]
            outputs[_k] = [members]
            # An empty cluster keeps its previous center; averaging zero
            # samples would give NaN and the loop would never converge.
            if members.size:
                new_centers[_k] = data[:,members].mean(axis=1)

        # Element-wise comparison: summing signed differences can cancel out
        # and report convergence too early.
        if np.array_equal(new_centers, centers):
            break
        centers = new_centers
    return centers, outputs

### accuracy 메소드 작성 ###
def accuracy(preds, targets):
  """Fraction of predictions that equal their target.

  ``targets`` entries are converted with ``int`` (they may be strings) and
  ``preds`` is cast to an integer array before the comparison.
  """
  target_ints = np.array([int(t) for t in targets]) ### str -> int
  pred_ints = preds.astype('int')
  matches = [1 if p==t else 0 for p, t in zip(pred_ints, target_ints)]
  return sum(matches)/len(matches)
  
# Cluster the raw 784-value pixel vectors into 10 groups; kmeans defaults to
# the label-aware 'eff_initialize' seeding and the 'euclidean' distance.
centers, outputs = kmeans(10, 784, test_data, test_label)

'''
### kmeans 센터마다 아웃풋 출력과정
plt.figure(figsize=(5,5))
for pred, output in outputs.items():
  for idx in output[0]:
    plt.axis('off')
    plt.imshow(test_data[idx].reshape((28,28)).astype('uint8'), cmap='gray')
    plt.show()
'''

In [None]:
import numpy as np
import copy
from tqdm import tqdm

### gradient 계산하는 메소드 ###
def gradient_calculate(img,k):
  """Build a gradient-orientation histogram feature for one 28x28 image.

  The image is zero-padded by one pixel, cut into ``k x k`` patches, and for
  every interior pixel of a patch the Sobel gradient is computed. Its
  magnitude is added to the nearest of 8 orientation bins centred at
  -pi, -3pi/4, ..., 3pi/4.

  The orientation is taken with ``arctan2`` so it covers the full circle;
  ``arctan(dy/dx)`` is limited to (-pi/2, pi/2) and left three of the eight
  bins permanently empty. Angles near +pi wrap around into the -pi bin.

  Args:
    img: array-like with 784 values (reshaped to 28x28).
    k: number of patches per side.

  Returns:
    List of ``8 * k * k`` floats: the 8 bin totals of each patch, patches in
    row-major order, bins in ascending angle order.
  """
  dy_filter = np.array([[-1,-2,-1],[0,0,0],[1,2,1]]) # Sobel filter, vertical derivative
  dx_filter = np.array([[-1,0,1],[-2,0,2],[-1,0,1]]) # Sobel filter, horizontal derivative
  n_bins = 8
  bin_width = 2*np.pi/n_bins

  img = np.asarray(img, dtype=float).reshape((28,28))
  img = np.pad(img, ((1,1),(1,1)),'constant', constant_values=0) # zero padding
  divide_height = img.shape[0]//k # patch height
  divide_width = img.shape[1]//k # patch width
  feature = []
  for h in range(k):
    for w in range(k):
      # Read-only view of the patch; nothing below mutates it.
      divide_img = img[h*divide_height:(h+1)*divide_height, w*divide_width:(w+1)*divide_width]
      histogram = np.zeros(n_bins)
      for i in range(1,divide_img.shape[0]-1):
        for j in range(1, divide_img.shape[1]-1):
          window = divide_img[i-1:i+2, j-1:j+2]
          dy = (window*dy_filter).sum()
          dx = (window*dx_filter).sum()
          magnitude = np.hypot(dx, dy)
          orientation = np.arctan2(dy, dx) # in [-pi, pi]
          # Round to the nearest bin centre (ties go to the larger angle);
          # the modulo folds +pi onto the -pi bin.
          nearest = int(np.floor(orientation/bin_width + 0.5))
          histogram[(nearest + n_bins//2) % n_bins] += magnitude
      feature.extend(histogram.tolist())

  return feature

### Build the feature matrix: one row of 8 x 4 gradient-histogram values per image ###
# tqdm is only there to show progress while the images are processed.
features = np.array([gradient_calculate(img,2) for img in tqdm(test_data)])
'''
### 히스토그램 출력하는 과정이므로 주석처리 ###
for img in test_data:
  feature = gradient_calculate(img,2)
  a,b,c,d = np.array(feature).reshape(4,8)
  plt.bar(np.arange(len(a)),a)
  plt.show()
  plt.bar(np.arange(len(a)),b)
  plt.show()
  plt.bar(np.arange(len(a)),c)
  plt.show()
  plt.bar(np.arange(len(a)),d)
  plt.show()
  features.append(feature)
  break
'''



In [None]:
'''
### 8 x 4 features 로 kmeans 한 결과 출력하는 과정이므로 주석처리 ###
centers, outputs = kmeans(10, 32, np.array(features), test_label)

plt.figure(figsize=(35,30))
for _, output in outputs.items():
  
  for idx in output[0]:
    plt.axis('off')
    plt.imshow(test_data[idx].reshape((28,28)).astype('uint8'), cmap='gray')

    plt.show()
'''

In [None]:
### pytorch 이용해서 Lenet 만드는 과정 ###

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt

class Net(nn.Module):
    """LeNet-style CNN for 1x28x28 inputs producing 10 class logits.

    Three convolutions (tanh, average pooling after the first two) are
    followed by three fully connected layers. There is no dropout: the
    notebook deliberately overfits, so regularisation is not needed.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 3)
        self.conv3 = nn.Conv2d(16, 120, 3)
        # Classifier head
        self.fc1 = nn.Linear(120 * 3 * 3, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
        # Parameter-free layers shared by all stages
        self.pool = nn.AvgPool2d(2)
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Return the raw (un-normalised) class scores for a batch ``x``."""
        feats = x
        for conv in (self.conv1, self.conv2):
            feats = self.pool(self.tanh(conv(feats)))
        feats = self.tanh(self.conv3(feats))

        # Flatten to (batch, 120*3*3) so the linear layers can consume it.
        flat = feats.view(-1, 120*3*3)

        for fc in (self.fc1, self.fc2):
            flat = self.tanh(fc(flat))
        return self.fc3(flat)


def train(model, data, target, epochs=100, batch_size=4, lr=0.1):
  """Train ``model`` with SGD and cross-entropy on 28x28 images.

  Fixes over the earlier version: the ``model`` argument is the network that
  is actually run (not the global ``net``), every sample is used (the old
  batching skipped each 5th sample and the trailing partial batch), the
  logged loss is the mean over batches, and the code falls back to the CPU
  when CUDA is unavailable.

  Args:
    model: ``nn.Module`` mapping ``(batch, 1, 28, 28)`` floats to class logits.
    data: array-like of shape ``(n_samples, 784)`` with pixel values that are
      divided by 255 before being fed to the model.
    target: per-sample labels convertible with ``int``.
    epochs: maximum number of passes over the data.
    batch_size: samples per optimisation step.
    lr: initial SGD learning rate (multiplied by StepLR's default factor
      every 20 epochs).

  Returns:
    The trained model, on the device used for training. Training stops early
    (and the loss/score curves are plotted) once an epoch reaches a training
    score of 1.
  """
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model = model.to(device)
  model.train()
  optimizer = optim.SGD(model.parameters(), lr=lr) # plain SGD
  scheduler = optim.lr_scheduler.StepLR(optimizer,20) # step decay every 20 epochs
  criterion = nn.CrossEntropyLoss() # multi-class classification loss

  # Convert once up front; building tensors from lists of arrays per batch is slow.
  label_ints = [int(label) for label in target]
  samples = np.asarray(data, dtype=np.float32)
  n_samples = min(len(samples), len(label_ints))
  samples = samples[:n_samples].reshape(-1,1,28,28) # add the channel axis for the convolutions

  losses = []
  scores = []
  for epoch in range(epochs):
    running_loss = 0
    n_batches = 0
    preds = []
    targets = []
    for start in range(0, n_samples, batch_size):
      stop = start + batch_size
      inputs = torch.from_numpy(samples[start:stop]).to(device)
      inputs = inputs/255. # scale pixels to [0, 1]
      labels = torch.tensor(label_ints[start:stop]).to(device)

      outputs = model(inputs)
      loss = criterion(outputs, labels)
      optimizer.zero_grad()
      loss.backward() # compute gradients
      optimizer.step() # update the model's weights

      # argmax of the logits equals argmax of their softmax
      preds.extend(torch.argmax(outputs, dim=1).tolist())
      targets.extend(labels.tolist())
      running_loss += loss.item()
      n_batches += 1

    # report loss and training score for this epoch
    preds = np.array(preds)
    targets = np.array(targets)
    score = (preds==targets).sum()/max(len(preds), 1)
    mean_loss = running_loss/max(n_batches, 1)
    losses.append(mean_loss)
    scores.append(score)
    print('epoch : {}, loss : {}, score : {}'.format(epoch, mean_loss, score))
    scheduler.step()
    if score==1:
      x_axis = np.arange(len(losses))
      plt.plot(x_axis,losses)
      x_axis = np.arange(len(scores))
      plt.plot(x_axis,scores)
      break
  return model
# Build a fresh network and train it on the extracted subset; the trained
# model is reused below to name the k-means clusters.
net = Net()
#summary(net, (1,28,28), device='cpu')
model = train(net, test_data, test_label)

In [None]:
### Cluster the raw 784-dim pixels, then name each cluster by the CNN's majority prediction ###
centers, outputs = kmeans(10, 784, test_data, test_label)
device = next(model.parameters()).device # run inference wherever the trained model lives
model.eval()
total_accuracy = 0
x_axis = []
y_axis = []
for pred, output in outputs.items():
  indexes = output[0]
  if len(indexes) == 0:
    continue # an empty cluster has nothing to vote on
  ### prediction -> labeling
  # Scale pixels to [0, 1] exactly as train() does; feeding raw 0-255 values
  # to a network trained on scaled inputs skews its predictions.
  batch = torch.tensor(np.asarray(test_data)[indexes], dtype=torch.float).reshape((-1,1,28,28)).to(device)/255.
  with torch.no_grad():
    numbers = torch.argmax(model(batch), dim=1).cpu().numpy() # argmax of logits == argmax of softmax
  votes = np.bincount(numbers, minlength=10)
  label = int(np.argmax(votes)) # most frequent prediction names the cluster
  # Kept under a separate name so the accuracy() function is not overwritten.
  cluster_accuracy = float(votes[label])/len(indexes)
  print('label : {}, accuracy : {}'.format(label, cluster_accuracy))
  total_accuracy += cluster_accuracy
  x_axis.append(str(label))
  y_axis.append(cluster_accuracy)
print('total score : {}'.format(total_accuracy/max(len(y_axis), 1)))
helper = np.arange(len(x_axis))
plt.bar(helper, y_axis)
plt.xticks(ticks=helper, labels=x_axis)
plt.xlabel('labels')
plt.ylabel('Accuracy')
plt.show()


In [None]:
### Cluster the 32-dim gradient features, then name each cluster by the CNN's majority prediction ###
features = np.array(features)
centers, outputs = kmeans(10, 32, features, test_label)
device = next(model.parameters()).device # run inference wherever the trained model lives
model.eval()
total_accuracy = 0
x_axis = []
y_axis = []
for pred, output in outputs.items():
  indexes = output[0]
  if len(indexes) == 0:
    continue # an empty cluster has nothing to vote on
  ### prediction -> labeling, same procedure as for the 784-dim clustering
  # The CNN still sees the original images; pixels are scaled to [0, 1]
  # exactly as train() does.
  batch = torch.tensor(np.asarray(test_data)[indexes], dtype=torch.float).reshape((-1,1,28,28)).to(device)/255.
  with torch.no_grad():
    numbers = torch.argmax(model(batch), dim=1).cpu().numpy()
  votes = np.bincount(numbers, minlength=10)
  label = int(np.argmax(votes)) # most frequent prediction names the cluster
  # Kept under a separate name so the accuracy() function is not overwritten.
  cluster_accuracy = float(votes[label])/len(indexes)
  print('label : {}, accuracy : {}'.format(label, cluster_accuracy))
  total_accuracy += cluster_accuracy
  x_axis.append(str(label))
  y_axis.append(cluster_accuracy)
print('total score : {}'.format(total_accuracy/max(len(y_axis), 1)))
helper = np.arange(len(x_axis))
plt.bar(helper, y_axis)
plt.xticks(ticks=helper, labels=x_axis)
plt.xlabel('labels')
plt.ylabel('Accuracy')
plt.show()
