# Abalone Age Prediction

![method](https://user-images.githubusercontent.com/28593767/112245885-1e390080-8c95-11eb-9dee-5f66bb94cbf8.png)

회귀 분석을 이용한 전복 나이 예측

## Initializing

In [1]:
import numpy as np
import pandas as pd
import csv

np.random.seed(777)

In [2]:
# Training hyper parameters
RAND_MEAN = 0
RAND_STD = 0.0030

LEARNING_RATE = 0.001

In [3]:
# Implement main function
# epoch number, minibatch size, output report, training rate
def main_exec(epoch_count=10, mb_size=10, report=1, train_rate=0.8):
    load_dataset()   # Load data
    init_model()     # Initialize parameters
    train_and_test(epoch_count, mb_size, report, train_rate)   # Train and test

In [4]:
# Implement loading dataset function
def load_dataset():
    with open('data_nn/abalone.csv') as csv_file:
        csvreader = csv.reader(csv_file)
        next(csvreader, None)    # Skip the first row (column info)
        rows = []
        # Store csv data to empty list, rows
        for row in csvreader:
            rows.append(row)
    # Global variable (전역 변수)
    # Input vector size increases from 8 to 10 (One-hot vector)
    global data, input_cnt, output_cnt
    input_cnt, output_cnt = 10, 1    # Size of independant and dependant variables
    data = np.zeros([len(rows), input_cnt+output_cnt])   # Buffer
    
    # One-hot vector
    # I = [1,0,0], M = [0,1,0], F = [0,0,1]
    for n, row in enumerate(rows):
        if row[0] == 'I': data[n, 0] = 1
        if row[0] == 'M': data[n, 1] = 1
        if row[0] == 'F': data[n, 2] = 1
        data[n, 3:] = row[1:]    # For the rest, store data from enumerate

In [5]:
# Implement parameters initializing function

def init_model():
    global weight, bias, input_cnt, output_cnt
    weight = np.random.normal(RAND_MEAN, RAND_STD, [input_cnt, output_cnt])
    bias = np.zeros([output_cnt])
    # print(weight)
    # print(bias)

In [6]:
# data 출력 형태 확인 (원 핫 벡터 확인)

with open('data_nn/abalone.csv') as csvfile:
    csvreader = csv.reader(csvfile)
    next(csvreader, None)    # Skip the first row (column info)
    rows = []
    for row in csvreader:
        rows.append(row)
        
global data
data = np.zeros([len(rows), 11]) 

for n, row in enumerate(rows):
    if row[0] == 'I': data[n, 0] = 1
    if row[0] == 'M': data[n, 1] = 1
    if row[0] == 'F': data[n, 2] = 1
    data[n, 3:] = row[1:]  
print(data)

[[ 0.      1.      0.     ...  0.101   0.15   15.    ]
 [ 0.      1.      0.     ...  0.0485  0.07    7.    ]
 [ 0.      0.      1.     ...  0.1415  0.21    9.    ]
 ...
 [ 0.      1.      0.     ...  0.2875  0.308   9.    ]
 [ 0.      0.      1.     ...  0.261   0.296  10.    ]
 [ 0.      1.      0.     ...  0.3765  0.495  12.    ]]


In [7]:
# Implement training and testing function & Ouput training result

def train_and_test(epoch_count, mb_size, report, train_rate):
    step_count = arrange_data(mb_size, train_rate)  # Return how many steps in each minibatch
    test_x, test_y = get_test_data()                # Get x and y value from test data
    
    # Nested for-loop
    for epoch in range(epoch_count):
        losses, accs = [], []         # Store loss and accuracy of total minibatch (1 epoch)
        for n in range(step_count):
            # Return x and y value from train data from minibatch size and step count
            train_x, train_y = get_train_data(mb_size, n)
            loss, acc = run_train(train_x, train_y)
            losses.append(loss)
            accs.append(acc)
        
        if report > 0 and (epoch+1) % report == 0:
            acc = run_test(test_x, test_y)
            # format 5.3f : 소수점을 포함한 전체 자릿수.소수점 이하 자릿수
            print("Epoch {} : Train - loss = {:5.3f}. accuracy = {:5.3f} / Test = {:5.3f}".\
                  format(epoch+1, np.mean(losses), np.mean(accs), acc))
            
    final_acc = run_test(test_x, test_y)
    print("\n 최종 테스트 결과 : final accuracy = {:5.3f}.format(final_acc)")

In [8]:
print("총 데이터의 수(행) :", data.shape[0])
mb_size = 100
step_count = int(data.shape[0] * 0.7) // mb_size
print("데이터의 70%의 미니배치 스텝 수 :", step_count)

총 데이터의 수(행) : 4177
데이터의 70%의 미니배치 스텝 수 : 29


## Arranging data

In [9]:
# Implement arranging data function

def arrange_data(mb_size, train_rate):
    global data, shuffle_map, test_begin_index
    # Randomly shuffle the data
    shuffle_map = np.arange(data.shape[0])
    np.random.shuffle(shuffle_map)
    # Get minibatch step count
    step_count = int(data.shape[0] * train_rate) // mb_size
    
    # Get training and testing data boundatry index
    test_begin_index = step_count * mb_size
    
    return step_count

In [13]:
# Implement dividing train data function
def get_train_data(mb_size, nth):
    global data, shuffle_map, test_begin_index, output_cnt
    if nth == 0 :
        np.random.shuffle(shuffle_map[:test_begin_index])
    train_data = data[shuffle_map[mb_size * nth : mb_size * (nth+1)]]
    
    return train_data[:, :-output_cnt], train_data[:, -output_cnt:]

# Implement dividing train data function test data function
def get_test_data():
    global data, shuffle_map, test_begin_index, output_cnt
    test_data = data[shuffle_map[test_begin_index:]]
    
    return test_data[:, :-output_cnt], test_data[:, -output_cnt:]

In [14]:
nth = 0
mb_size = 100

global shuffle_map
train_data = data[shuffle_map[mb_size * nth : mb_size * (nth+1)]] 
print(train_data.shape)

print("---"*20)
for n, i in enumerate(train_data[0:5]):
    print(n,i)

NameError: name 'shuffle_map' is not defined

In [15]:
# Shuffle data
shuffle_map = np.arange(data.shape[0])
# Get minibatch step count
step_count = int(data.shape[0] * 0.8) // mb_size 
# Get training data and testing data boundary index
test_begin_index = step_count * mb_size
# Print boundary index
print("경계 인덱스 생성 : ", test_begin_index) 
# Print regular order
print("일반적인 순서 \n", shuffle_map) 
np.random.shuffle(shuffle_map[:test_begin_index]) 
# Print shuffled order till boundary index
# Print shuffled data from 0 to 3300 and regular data after
print("처음부터 경계선까지의 순서 셔플 \n", shuffle_map) 
# Print shuffled data from 3295 to 3300 and regular data from 3300 to 3395
print("3295번째부터 3305번째까지의 순서 출력 \n", shuffle_map[3295:3305]) 

경계 인덱스 생성 :  3300
일반적인 순서 
 [   0    1    2 ... 4174 4175 4176]
처음부터 경계선까지의 순서 셔플 
 [2852 2325 3253 ... 4174 4175 4176]
3295번째부터 3305번째까지의 순서 출력 
 [1693 2119 2982 2863 2151 3300 3301 3302 3303 3304]


## Training data

run_train()은 학습을 수행하는 과정이다.

1. 순전파 과정을 통해 얻은 예측에 대한 손실 구하기
    * forward_neuralnet(), forward_postproc()
2. 예측된 값을 바탕으로 정확도 산출하기
    * eval_accuracy()
3. 손실이 나오기까지 영향을 미친모든 요소에 대한 기울기 구하기
    * backprop_postproc(), backprop_neuralnet()
    
이렇게 구한 결괏값을 학습률과 곱해 기존 파라미터에서 빼주는 학습단계로 진행된다.

In [16]:
# Implement calculating output from matrix multiplication
def forward_neuralnet(x):
    global weight
    # matmul() : multiply matrices
    output = np.matmul(x, weight) + bias
    # Return output and x. x will be used for backpropagation (aux_nn)
    return output, x

In [17]:
# Implement processing forward propagation and calculating MSE function
def forward_postproc(output, y):
    # Mean value of squared difference
    diff = output - y
    square = np.square(diff)
    loss = np.mean(square)
    
    return loss, diff

In [18]:
# Implement evaluation of neural net(regression) function
def eval_accuracy(output, y):
    mdiff = np.mean(np.abs(output - y) / y)
    return 1 - mdiff

In [19]:
# Implement actual training and actual testing function
def run_train(x, y): 
    output, aux_nn = forward_neuralnet(x)        # Calculate neural net output
    loss, aux_pp = forward_postproc(output, y)   # Calculate loss function
    accuracy = eval_accuracy(output, y)          # First step of training
    
    G_loss = 1.0                                 # Second setep of training
    G_outout = backprop_postproc(G_loss, aux_pp) # Get output gradient from back propagation
    backprop_neuralnet(G_outout, aux_nn)         # Update output gradient 
    
    return loss, accuracy
    
    
def run_test(x, y):
    output, _ = forward_neuralnet(x)
    accuracy = eval_accuracy(output, y)
    
    return accuracy

In [20]:
# Implement processing back propagation function
def backprop_postproc(G_loss, diff):
    shape = diff.shape
    
    g_loss_square = np.ones(shape) / np.prod(shape)
    g_square_diff = diff * 2
    g_diff_output = 1
    
    G_square = g_square_diff * G_loss
    G_diff = g_square_diff * G_square
    G_output = g_diff_output * G_diff
    
    return G_output

In [21]:
# Implement back propagation function
def backprop_neuralnet(G_output, x):
    global weight, bias
    g_output_w = x.transpose()
    
    G_w = np.matmul(g_output_w, G_output) 
    G_b = np.sum(G_output, axis=0)
    
    weight -= LEARNING_RATE * G_w
    bias -= LEARNING_RATE * G_b

In [23]:
main_exec(epoch_count=1000, mb_size=100, report=100, train_rate=0.7)

  """
  # Remove the CWD from sys.path while we load stuff.
  


Epoch 100 : Train - loss =   nan. accuracy =   nan / Test =   nan
Epoch 200 : Train - loss =   nan. accuracy =   nan / Test =   nan
Epoch 300 : Train - loss =   nan. accuracy =   nan / Test =   nan
Epoch 400 : Train - loss =   nan. accuracy =   nan / Test =   nan
Epoch 500 : Train - loss =   nan. accuracy =   nan / Test =   nan
Epoch 600 : Train - loss =   nan. accuracy =   nan / Test =   nan
Epoch 700 : Train - loss =   nan. accuracy =   nan / Test =   nan
Epoch 800 : Train - loss =   nan. accuracy =   nan / Test =   nan
Epoch 900 : Train - loss =   nan. accuracy =   nan / Test =   nan
Epoch 1000 : Train - loss =   nan. accuracy =   nan / Test =   nan

 최종 테스트 결과 : final accuracy = {:5.3f}.format(final_acc)
