[전제조건] 
- 알파벳 26개를 사용하는 언어를 식별할 수 있다.
- 언어마다 사용하는 알파벳 빈도는 다르다

[모델]
- 알파벳 입력 시 해당 언어 출력

[풀이]
- 언어별로 알파벳 빈도를 계산한다.
    - 방법 : 각 파일을 읽어 단어별로 나누고, 각 단어의 알파벳 빈도를 계산
- 입력된 알파벳 빈도와 언어별 빈도를 비교하여 가장 유사한 언어를 출력

[모델]
- 딥러닝을 활용
- 입력 데이터 : 알파벳 빈도
- 입력 라벨 : 4개 언어 - en, fr, id, tl
- 활성화 함수 : softmax
- 손실 함수 : categorical_crossentropy
- 최적화 알고리즘 : adam
- 평가 지표 : accuracy

#### 1. 데이터 준비

In [2]:
# load modules
train_dir = '../DATA/lang_data/train/'
test_dir = '../DATA/lang_data/test/'

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [10]:
# load data
# 1) 한 파일씩 불러오기 for loop
# 2) 단어별로 나누기 split
# 3) 한 글자씩 숫자로 바꾸기 (ord), 예: [92, 93, 99]

for root, dirs, files in os.walk(train_dir):        # copilot은 .walk()를 사용; dirpath, dirnames, filenames
    for file in files:
        f = open(os.path.join(root, file), 'r')
        data = f.read()
        f.close()
        
        data = data.split()
        data_num = [list(map(ord, list(word.lower()))) for word in data if word.isalpha()]
        # word가 알파벳이면 word의 각 글자를 숫자로 바꿈, 소문자 기준
        
        print(file)
        print(data)
        print(data_num)
        print(len(data_num))
        # 가장 긴 단어의 길이
        print(max([len(word) for word in data_num]))    # 14
        break
    break

en-1.txt
['The', 'main', 'Henry', 'Ford', 'Museum', 'building', 'houses', 'some', 'of', 'the', 'classrooms', 'for', 'the', 'Henry', 'Ford', 'Academy', 'Henry', 'Ford', 'Academy', 'is', 'the', 'first', 'charter', 'school', 'in', 'the', 'United', 'States', 'to', 'be', 'developed', 'jointly', 'by', 'a', 'global', 'corporation,', 'public', 'education,', 'and', 'a', 'major', 'nonprofit', 'cultural', 'institution.', 'The', 'school', 'is', 'sponsored', 'by', 'the', 'Ford', 'Motor', 'Company,', 'Wayne', 'County', 'Regional', 'Educational', 'Service', 'Agency', 'and', 'The', 'Henry', 'Ford', 'Museum', 'and', 'admits', 'high', 'school', 'students.', 'It', 'is', 'located', 'in', 'Dearborn,', 'Michigan', 'on', 'the', 'campus', 'of', 'the', 'Henry', 'Ford', 'museum.', 'Enrollment', 'is', 'taken', 'from', 'a', 'lottery', 'in', 'the', 'area', 'and', 'totaled', '467', 'in', '2010.[1]', 'Freshman', 'meet', 'inside', 'the', 'main', 'museum', 'building', 'in', 'glass', 'walled', 'classrooms,', 'while', '

In [31]:
# 이제 위 파일을 전체 파일에 적용
# 1) 파일별로 데이터 불러오기
# 2) 라벨 지정 : {'en' : 0, 'fr' : 1, 'id' : 2, 'tl' : 3}
# 3) 데이터 크기 지정 : 가장 긴 단어의 길이를 컬럼으로 하는 데이터프레임 생성, 0으로 채우기

for root, dirs, files in os.walk(train_dir):
    data_list = []
    label_list = []
    
    for file in files:
        f = open(os.path.join(root, file), 'r')
        data = f.read()
        f.close()
        
        data = data.split()
        data_num = [list(map(ord, list(word.lower()))) for word in data if word.isalpha()]
        # ============== 여기까진 위와 동일 ===================
        data_list.append(data_num)
        
        if 'en' in file:
            label_list.append(0)
        elif 'fr' in file:
            label_list.append(1)
        elif 'id' in file:
            label_list.append(2)
        elif 'tl' in file:
            label_list.append(3)
    
    print(len(data_list))
    print(len(label_list))
    print(data_list[0])
    print(label_list[0])
    

40
40
[[116, 104, 101], [109, 97, 105, 110], [104, 101, 110, 114, 121], [102, 111, 114, 100], [109, 117, 115, 101, 117, 109], [98, 117, 105, 108, 100, 105, 110, 103], [104, 111, 117, 115, 101, 115], [115, 111, 109, 101], [111, 102], [116, 104, 101], [99, 108, 97, 115, 115, 114, 111, 111, 109, 115], [102, 111, 114], [116, 104, 101], [104, 101, 110, 114, 121], [102, 111, 114, 100], [97, 99, 97, 100, 101, 109, 121], [104, 101, 110, 114, 121], [102, 111, 114, 100], [97, 99, 97, 100, 101, 109, 121], [105, 115], [116, 104, 101], [102, 105, 114, 115, 116], [99, 104, 97, 114, 116, 101, 114], [115, 99, 104, 111, 111, 108], [105, 110], [116, 104, 101], [117, 110, 105, 116, 101, 100], [115, 116, 97, 116, 101, 115], [116, 111], [98, 101], [100, 101, 118, 101, 108, 111, 112, 101, 100], [106, 111, 105, 110, 116, 108, 121], [98, 121], [97], [103, 108, 111, 98, 97, 108], [112, 117, 98, 108, 105, 99], [97, 110, 100], [97], [109, 97, 106, 111, 114], [110, 111, 110, 112, 114, 111, 102, 105, 116], [99, 11

### 여기서부터 수정

<hr>


In [4]:
# test 의 shape 확인
for root, dirs, files in os.walk(test_dir):
    data_list = []
    label_list = []
    
    for file in files:
        f = open(os.path.join(root, file), 'r')
        data = f.read()
        f.close()
        
        data = data.split()
        data_num = [list(map(ord, list(word.lower()))) for word in data if word.isalpha()]
        # ============== 여기까진 위와 동일 ===================
        data_list.append(data_num)
        
        if 'en' in file:
            label_list.append(0)
        elif 'fr' in file:
            label_list.append(1)
        elif 'id' in file:
            label_list.append(2)
        elif 'tl' in file:
            label_list.append(3)
    
    print(len(data_list))
    print(len(label_list))
    print(data_list[0])
    print(label_list[0])

16
16
[[114, 101, 100, 105, 114, 101, 99, 116, 115], [102, 111, 114], [116, 104, 101], [112, 97, 114, 116], [111, 102], [97], [115, 101, 101], [102, 111, 114], [111, 116, 104, 101, 114], [115, 101, 101], [119, 105, 110, 100, 111, 119, 115], [109, 105, 99, 114, 111, 115, 111, 102, 116], [119, 105, 110, 100, 111, 119, 115], [115, 99, 114, 101, 101, 110, 115, 104, 111, 116], [111, 102], [119, 105, 110, 100, 111, 119, 115], [115, 104, 111, 119, 105, 110, 103], [116, 104, 101], [97, 99, 116, 105, 111, 110], [99, 101, 110, 116, 101, 114], [97, 110, 100], [115, 116, 97, 114, 116], [109, 101, 110, 117], [100, 101, 118, 101, 108, 111, 112, 101, 114], [109, 105, 99, 114, 111, 115, 111, 102, 116], [119, 114, 105, 116, 116, 101, 110], [105, 110], [119, 111, 114, 107, 105, 110, 103], [115, 116, 97, 116, 101], [112, 117, 98, 108, 105, 99, 108, 121], [114, 101, 108, 101, 97, 115, 101, 100], [115, 111, 117, 114, 99, 101], [109, 111, 100, 101, 108], [99, 108, 111, 115, 101, 100], [115, 104, 97, 114, 10

In [None]:
# max 확인


In [32]:
# dataset 만들기
# 1) data_list를 데이터프레임으로 변환 
#   (1) 가장 긴 단어의 길이를 컬럼, 단어 개수를 행으로 하는, 0으로 채운 데이터프레임 생성
#   (2) data_list의 각 단어를 순서대로 채워넣기

data_frame = pd.DataFrame(0, index = range(len(data_list[0])), columns = range(len(max(data_list[0], key = len))))
# data_frame.shape  # (671, 14)

for idx, word in enumerate(data_list[0]):
    for idx2, i in enumerate(word):
        # print(idx, i)
        data_frame.loc[idx, idx2] = i
print(data_frame.head(), '\n')
print(data_frame.tail())

    0    1    2    3    4    5   6   7   8   9   10  11  12  13
0  116  104  101    0    0    0   0   0   0   0   0   0   0   0
1  109   97  105  110    0    0   0   0   0   0   0   0   0   0
2  104  101  110  114  121    0   0   0   0   0   0   0   0   0
3  102  111  114  100    0    0   0   0   0   0   0   0   0   0
4  109  117  115  101  117  109   0   0   0   0   0   0   0   0 

      0    1    2   3    4    5    6    7    8   9   10  11  12  13
666   98  121    0   0    0    0    0    0    0   0   0   0   0   0
667  101  120  112  97  110  100  105  110  103   0   0   0   0   0
668  118    0    0   0    0    0    0    0    0   0   0   0   0   0
669  116    0    0   0    0    0    0    0    0   0   0   0   0   0
670  101    0    0   0    0    0    0    0    0   0   0   0   0   0


In [36]:
print(data_frame.shape, data_list[0][-1])   # 잘 들어간 것을 확인

(671, 14) [101]


In [37]:
# 2) 이제 전체 데이터를 입력 : for list in data_list / 이름은 data_frame_{idx}

for idx, data in enumerate(data_list):
    data_frame = pd.DataFrame(0, index = range(len(data)), columns = range(len(max(data, key = len))))
    
    for idx2, word in enumerate(data):
        for idx3, i in enumerate(word):
            data_frame.loc[idx2, idx3] = i
    
    globals()['data_frame_{}'.format(idx)] = data_frame   # data_frame_0, data_frame_1, ...
    
print(data_frame_0.head(), '\n')

    0    1    2    3    4    5   6   7   8   9   10  11  12  13
0  116  104  101    0    0    0   0   0   0   0   0   0   0   0
1  109   97  105  110    0    0   0   0   0   0   0   0   0   0
2  104  101  110  114  121    0   0   0   0   0   0   0   0   0
3  102  111  114  100    0    0   0   0   0   0   0   0   0   0
4  109  117  115  101  117  109   0   0   0   0   0   0   0   0 



globals() : 전역 변수를 딕셔너리 형태로 반환

In [38]:
data_frame_11.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,99,101,116,0,0,0,0,0,0,0,0,0,0,0,0
1,97,114,116,105,99,108,101,0,0,0,0,0,0,0,0
2,101,115,116,0,0,0,0,0,0,0,0,0,0,0,0
3,117,110,101,0,0,0,0,0,0,0,0,0,0,0,0
4,233,98,97,117,99,104,101,0,0,0,0,0,0,0,0


df_list 완성 : 파일별 데이터프레임을 리스트로 저장

In [45]:
# 모든 data_frame을 하나로 합치기
df_list = [globals()['data_frame_{}'.format(i)] for i in range(len(data_list))]
df_list[0].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,116,104,101,0,0,0,0,0,0,0,0,0,0,0
1,109,97,105,110,0,0,0,0,0,0,0,0,0,0
2,104,101,110,114,121,0,0,0,0,0,0,0,0,0
3,102,111,114,100,0,0,0,0,0,0,0,0,0,0
4,109,117,115,101,117,109,0,0,0,0,0,0,0,0


In [39]:
# label_list 확인
print(label_list)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]


#### 문제 발생 : 데이터프레임의 shape이 달라 학습이 불가능
- 해결 : 데이터프레임의 shape을 동일하게 맞춰준다.

<hr>

In [5]:
# data_list 내의 가장 긴 단어 개수 확인
print([len(max(data, key = len)) for data in data_list])
print(max([len(max(data, key = len)) for data in data_list]))   # 41

[15, 18, 15, 18, 17, 18, 19, 18, 18, 16, 16, 16, 16, 17, 22, 16]
22


In [6]:
# data_list 중 가장 많은 행 찾기
print([len(data) for data in data_list])
print(max([len(data) for data in data_list]))   # 11467

[6871, 16014, 3585, 1487, 1672, 10625, 4206, 7163, 1391, 958, 800, 3520, 815, 1202, 4081, 239]
16014


In [35]:
# 열의 길이를 41, 행의 길이를 11467로 맞추기
for idx, data in enumerate(data_list):
    data_frame = pd.DataFrame(0, index = range(11467), columns = range(41))
    
    for idx2, word in enumerate(data):
        for idx3, i in enumerate(word):
            data_frame.loc[idx2, idx3] = i
    
    globals()['data_frame_{}'.format(idx)] = data_frame

In [36]:
data_frame_0.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
0,116,104,101,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,109,97,105,110,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,104,101,110,114,121,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,102,111,114,100,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,109,117,115,101,117,109,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# 다시 합치기 : 1차원으로 reshape
df_list = [globals()['data_frame_{}'.format(i)].values.reshape(-1) for i in range(len(data_list))]
df_list[0]

array([116, 104, 101, ...,   0,   0,   0], dtype=int64)

In [38]:
# 

#### 2. Dataset 생성

In [39]:
# 3) dataset class 생성 : __len__, __getitem__ 구현
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class LangDataset(Dataset):
    def __init__(self, x, y):
        super(LangDataset, self).__init__()
        self.data = torch.tensor(np.array(x), dtype = torch.float32)
        self.labels = torch.tensor(y, dtype = torch.long)
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        data = self.data[idx]
        labels = self.labels[idx]
        return data, labels
    
# 4) DataLoader 생성
train_data = LangDataset(df_list, label_list)


In [40]:
# train_data 확인
print(train_data.data.shape)
print(train_data.labels.shape)

torch.Size([40, 470147])
torch.Size([40])


#### 3. DataLoader 생성

In [61]:
# from torch.utils.data import DataLoader
Batchs = 32
train_dl = DataLoader(train_data, batch_size = Batchs, shuffle = True, drop_last = True)
print(len(train_dl.dataset))
print(train_dl.dataset.__getitem__(0))

40
(tensor([116., 104., 101.,  ...,   0.,   0.,   0.]), tensor(0))


#### 4. 모델 생성

In [42]:
# 1. 모델 생성
# - basic하게, relu, linear만 사용
# - hidden_layer 변수로 조절
import torch.nn as nn
import torch.nn.functional as F

class LangModel(nn.Module):
    def __init__(self, IN, Hidden_list, OUT):
        super(LangModel, self).__init__()
        self.in_layer = nn.Linear(IN, Hidden_list[0])
        self.hidden = nn.ModuleList()
        for idx, hidden in enumerate(Hidden_list[:-1]):
            self.hidden.append(nn.Linear(hidden, Hidden_list[idx+1]))
        self.out_layer = nn.Linear(Hidden_list[-1], OUT)
        
    def forward(self, x):
        y = F.relu(self.in_layer(x))   # IN -> Hidden_list[0]
        for hidden in self.hidden:
            y = F.relu(hidden(y)) # Hidden_list[0] -> Hidden_list[1] -> ... -> Hidden_list[-1]
        y = self.out_layer(y)           # Hidden_list[-1] -> OUT
        return y

In [43]:
# 2. 모델 확인
# - IN : train_data.data.shape[1]
IN, Hidden_list, OUT = train_data.data.shape[1], [256, 128, 64], 4
model = LangModel(IN, Hidden_list, OUT)
print(model)

LangModel(
  (in_layer): Linear(in_features=470147, out_features=256, bias=True)
  (hidden): ModuleList(
    (0): Linear(in_features=256, out_features=128, bias=True)
    (1): Linear(in_features=128, out_features=64, bias=True)
  )
  (out_layer): Linear(in_features=64, out_features=4, bias=True)
)


#### 5. 모델 학습

In [58]:
len(train_dl)

1

In [142]:
# 1. 함수 생성
# - CrossEntropyLoss, Adam, ReduceLROnPlateau 사용
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

def training(model, train_dl, Epochs, lr = 0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr)
    
    for epoch in range(1, Epochs+1):
        for inputs, labels in train_dl.dataset:
            outputs = model(inputs)
            outputs = outputs.view(-1, 4)
            labels = labels.view(-1)
            # print(outputs, labels, outputs.shape, labels.shape)
            loss = criterion(outputs, labels)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # if batch % 10 == 0:
            #     print(f'Epoch : {epoch}, Batch : {batch}, Loss : {loss}')
        
        scheduler = ReduceLROnPlateau(optimizer, 'min', factor = 0.1, patience = 5)
        scheduler.step(loss)   # loss가 감소하지 않으면 학습률을 줄임
        # scheduler 정리
        # 1. 구성
            # - optimizer : 최적화할 optimizer
            # - mode : min / max 중 하나 선택, min: 감소, max: 증가
            # - factor : 학습률을 줄일 비율; new_lr = lr * factor
            # - patience : 성능이 증가하지 않는 epoch 수
            # - verbose : True로 설정하면 감소되는 학습률 출력
        # 2. 
        
        print(f'Epoch : {epoch}, batch : {epoch*len(train_dl.dataset)}, Loss : {loss}')
    print('Finished Training')


In [143]:
# 2. training function test
if False:
    Epochs = 50
    IN = train_data.data.shape[1]
    model = LangModel(IN, [64, 32, 16], 4)

    training(model, train_dl, Epochs)

Epoch : 1, batch : 40, Loss : 24.50965118408203
Epoch : 2, batch : 80, Loss : 1.501654863357544
Epoch : 3, batch : 120, Loss : 1.5018951892852783
Epoch : 4, batch : 160, Loss : 1.4979037046432495
Epoch : 5, batch : 200, Loss : 1.493213176727295
Epoch : 6, batch : 240, Loss : 1.4894179105758667
Epoch : 7, batch : 280, Loss : 1.4856727123260498
Epoch : 8, batch : 320, Loss : 1.4819412231445312
Epoch : 9, batch : 360, Loss : 1.478238821029663
Epoch : 10, batch : 400, Loss : 1.4745771884918213
Epoch : 11, batch : 440, Loss : 1.4709645509719849
Epoch : 12, batch : 480, Loss : 1.4674067497253418
Epoch : 13, batch : 520, Loss : 1.4639077186584473
Epoch : 14, batch : 560, Loss : 1.460470199584961
Epoch : 15, batch : 600, Loss : 1.4570962190628052
Epoch : 16, batch : 640, Loss : 1.4537872076034546
Epoch : 17, batch : 680, Loss : 1.4505435228347778
Epoch : 18, batch : 720, Loss : 1.447365641593933
Epoch : 19, batch : 760, Loss : 1.4442534446716309
Epoch : 20, batch : 800, Loss : 1.44120657444000

##### Debug Log<hr>
1. ValueError: expected sequence of length 671 at dim 1 (got 14)
    - 데이터프레임의 shape이 달라 학습이 불가능
    - 데이터프레임의 shape을 동일하게 맞춰준다.
    => data를 1차원으로 변환 : data = data.values.view(-1)

2. RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
    - inputs = inputs.float()
    labels = labels.long()
    해당 코드 삭제

3. Target 4 어쩌구
    - 라벨링 문제... argmax()는 0~3인데 나는 1~4로 라벨링
    - 수정

4. batch가 하나만 작동
    - train_dl.dataset 해야 했었다

5. 학습이 안됨 : loss = 0.0 -> 들쭉날쭉
    - outputs = outputs.view(-1, 4)
    - labels = labels.view(-1)

    => 사실 입력 타입보다는 학습량의 문제로 보임! 에포크를 늘릴 수록 안정적으로 loss가 감소

#### 성공ㅠㅠ 이제 testing 함수 만들기<hr>

#### 5. Testing 함수 생성

In [144]:
# 1. test_df_list
def dataframe_func(file_dir):
    for root, dirs, files in os.walk(file_dir):
        data_list = []
        label_list = []
        
        for file in files:
            f = open(os.path.join(root, file), 'r')
            data = f.read()
            f.close()
            
            data = data.split()
            data_num = [list(map(ord, list(word.lower()))) for word in data if word.isalpha()]
            data_list.append(data_num)
            
            if 'en' in file:
                label_list.append(0)
            elif 'fr' in file:
                label_list.append(1)
            elif 'id' in file:
                label_list.append(2)
            elif 'tl' in file:
                label_list.append(3)

    len_col, len_row = max([len(max(data, key = len)) for data in data_list]), max([len(data) for data in data_list])
    df_list = []
    for idx, data in enumerate(data_list):
        data_frame = pd.DataFrame(0, index = range(len_row), columns = range(len_col))
        
        for idx2, word in enumerate(data):
            for idx3, i in enumerate(word):
                data_frame.loc[idx2, idx3] = i
        
        df_list.append(data_frame.values.reshape(-1))   # 1차원으로 reshape, list에 추가
    
    return df_list, label_list

test_data, test_label = dataframe_func(test_dir)

In [145]:
# test_dl
test_data = LangDataset(test_data, test_label)
test_dl = DataLoader(test_data, batch_size = Batchs, shuffle = True, drop_last = True)
print(len(test_dl.dataset))

16


In [3]:
# 2. class 분포 확인
# - class별로 데이터가 얼마나 있는지 확인

tr_target, te_target = pd.Series(label_list), pd.Series(test_label)
tr_target.value_counts(), te_target.value_counts()

NameError: name 'label_list' is not defined

분포 일정, accuacy 사용 가능

In [None]:
# 2. testing 함수 : 정확도 계산
# - accuracy, f1_score
from sklearn.metrics import accuracy_score, f1_score

def testing(model, test_dl):
    model.eval()    # 평가 모드로 전환
    with torch.no_grad():   # 기울기 계산 비활성화
        for inputs, labels in test_dl.dataset:
            outputs = model(inputs)
            outputs = outputs.view(-1, 4)
            labels = labels.view(-1)
            pred = torch.argmax(outputs, 1)
            acc = accuracy_score(labels, pred)
            f1 = f1_score(labels, pred, average = 'macro')  # 'macro' : 단순 평균
            
    return acc, f1

In [None]:
# 근데 training이랑 testing 합치면 안됨? => model.eval() 차이

def training(model, train_dl, Epochs, lr = 0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr)
    
    for epoch in range(1, Epochs+1):
        for inputs, labels in train_dl.dataset:
            outputs = model(inputs)
            outputs = outputs.view(-1, 4)
            labels = labels.view(-1)
            # print(outputs, labels, outputs.shape, labels.shape)
            loss = criterion(outputs, labels)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # if batch % 10 == 0:
            #     print(f'Epoch : {epoch}, Batch : {batch}, Loss : {loss}')
            
        scheduler = ReduceLROnPlateau(optimizer, 'min', factor = 0.1, patience = 5)
        scheduler.step(loss)   # loss가 감소하지 않으면 학습률을 줄임
        # scheduler 정리
            # - optimizer : 최적화할 optimizer
            # - mode : min / max 중 하나 선택, min: 감소, max: 증가
            # - factor : 학습률을 줄일 비율; new_lr = lr * factor
            # - patience : 성능이 증가하지 않는 epoch 수
            # - verbose : True로 설정하면 감소되는 학습률 출력
            
        print(f'Epoch : {epoch}, batch : {epoch*len(train_dl.dataset)}, Loss : {loss}')
    print('Finished Training')

In [1]:
# 학습 / 평가 종합 함수
# TODO : .train(), .eval() 적용한 함수 구현
# - training 구성 요소 : .train(), for batch

def train_eval(model, train_dl, test_dl, Epochs, lr = 0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr)
    
    for epoch in range(1, Epochs+1):
        # training
        model.train()
        for inputs, labels in train_dl.dataset:
            outputs = model(inputs)
            outputs = outputs.view(-1, 4)   # shape 맞추기
            labels = labels.view(-1)        # shape 맞추기
            # print(outputs, labels, outputs.shape, labels.shape)
            loss = criterion(outputs, labels)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        scheduler = ReduceLROnPlateau(optimizer, 'min', factor = 0.1, patience = 5)
        scheduler.step(loss)   # loss가 감소하지 않으면 학습률을 줄임
        # scheduler 정리
            # - optimizer : 최적화할 optimizer
            # - mode : min / max 중 하나 선택, min: 감소, max: 증가
            # - factor : 학습률을 줄일 비율; new_lr = lr * factor
            # - patience : 성능이 증가하지 않는 epoch 수
            # - verbose : True로 설정하면 감소되는 학습률 출력
            
            # if batch % 10 == 0:
            #     print(f'Epoch : {epoch}, Batch : {batch}, Loss : {loss}')

        if scheduler.num_bad_epochs > scheduler.patience:
            print('Early Stopping : Over patience limit')
            break
            
    # evaluation
    model.eval()
    with torch.no_grad():
        for idx, (inputs, labels) in enumerate(test_dl.dataset):
            outputs = model(inputs)
            outputs = outputs.view(-1, 4)
            labels = labels.view(-1)
            pred = torch.argmax(outputs, 1)
            acc = accuracy_score(labels, pred)
            f1 = f1_score(labels, pred, average = 'macro')

            if idx % 10 == 0:
                print(f'Pred : {pred}, Labels : {labels}')
    
    print('Finished Training')
    print(f'Epoch : {epoch}, Loss : {loss}, Accuracy : {acc}, F1_score : {f1}')

In [None]:
# 

In [None]:
# 3. 
def testing(epoch, kind='validation'):
    model.eval()   # testing mode activate
    
    with torch.no_grad():
        test_report = [[], [], []]
        dataLoader = val_dl if kind == 'validation' else test_dl
        for cnt, (feature, target) in enumerate(dataLoader):
            feature, target = feature.to(DEVICE), target.to(DEVICE)
            target = target.squeeze()
            pred = model(feature)
            
            loss = LOSS_FUNC(pred, target)
            test_report[0].append(loss)
            
            acc = metrics.accuracy(pred, target, task="multiclass", num_classes=CLASSES)
            test_report[1].append(acc)
            
            f1 = metrics.f1_score(pred, target, task="multiclass", num_classes=CLASSES)
            test_report[2].append(f1)
            
            # OPTIMIZER.zero_grad()   # Reset W, b
            # loss.backward()         # Proceed backward to loss values
            # OPTIMIZER.step()        # Update W, b
            
    testing_type = "Validation" if kind == 'valid' else "Test"
    
    loss_score = (sum(test_report[0])/batch).item()
    acc_score = (sum(test_report[1])/batch).item()
    f1_score = (sum(test_report[2])/batch).item()
    print(f'[{epoch+1} {testing_type}] Loss ==> {loss_score:.3f}, Accuracy ==> {acc_score:.3f}, F1 Score ==> {f1_score:.3f}')
    
    return loss_score, acc_score, f1_score