In [1]:
import os
import re
import pandas as pd

def open_list(txt_file_path:str):
    items = []
    with open(txt_file_path, "r") as f:
        for line in f:
            items.append(str(line.strip()))
    return items

version = 'adamw_lambda_0.001'
txt_file_path = os.path.join('tb_logs',version,'rank_0.log')
open_list(txt_file_path)

['colossalai - colossalai - 2022-03-28 01:47:26,292 INFO:',
 "{'BATCH_SIZE': 128, 'NUM_EPOCHS': 30}",
 '',
 'colossalai - colossalai - 2022-03-28 01:47:26,296 INFO: cuDNN benchmark = True, deterministic = False',
 'colossalai - colossalai - 2022-03-28 01:47:26,306 INFO: Using LossHook for training, priority = 0',
 'colossalai - colossalai - 2022-03-28 01:47:26,308 INFO: Using LRSchedulerHook for training, priority = 1',
 'colossalai - colossalai - 2022-03-28 01:47:26,311 INFO: Using LogMetricByEpochHook for training, priority = 10',
 'colossalai - colossalai - 2022-03-28 01:47:26,313 INFO: Using LogMemoryByEpochHook for training, priority = 10',
 'colossalai - colossalai - 2022-03-28 01:47:26,315 INFO: Using LogTimingByEpochHook for training, priority = 10',
 'colossalai - colossalai - 2022-03-28 01:47:26,317 INFO: Using TensorboardHook for training, priority = 10',
 'colossalai - colossalai - 2022-03-28 01:47:26,320 INFO: Lower value means higher priority for calling hook function',
 

In [2]:
data = []

lines = open_list(txt_file_path)
for line in lines:
    loss_value = re.search(" Loss = ([\d|.]*)", line)
    if loss_value is not None:
        loss_value = loss_value.group(1)
    else:
        continue
    
    epoch = re.search("Epoch ([\d]*) ", line).group(1)
    split = re.search("/ (.*)]: ", line).group(1)

    data.append([version, loss_value, epoch, split])

data = pd.DataFrame(data,columns=['version','loss','epoch','split'])
data['loss'] = data['loss'].astype(float)
data['epoch'] = data['epoch'].astype(int)
data = data.sort_values(by=['version','epoch','split']).reset_index(drop=True)
data

Unnamed: 0,version,loss,epoch,split
0,adamw_lambda_0.001,0.711490,0,Test
1,adamw_lambda_0.001,0.204510,0,Test
2,adamw_lambda_0.001,0.833190,0,Train
3,adamw_lambda_0.001,0.579780,0,Train
4,adamw_lambda_0.001,0.641630,1,Test
...,...,...,...,...
115,adamw_lambda_0.001,0.003875,28,Train
116,adamw_lambda_0.001,0.229360,29,Test
117,adamw_lambda_0.001,0.048594,29,Test
118,adamw_lambda_0.001,0.242670,29,Train


In [3]:
# data.to_csv('tmp.csv')

In [4]:
# Run all

# Propose several learning rates for real training.
learning_rates = [0.1,0.05,0.001]
# Choose one optimizer 
optimizer_methods = ['sgd','adamw']
# Choose two learning rate scheduling method 
scheduler_methods = ['lambda','multistep','onecycle']

# Loop
data = []
for learning_rate in learning_rates:
    for optimizer_method in optimizer_methods:
        for scheduler_method in scheduler_methods:
            version = f'{optimizer_method}_{scheduler_method}_{learning_rate}'
            txt_file_path = os.path.join('tb_logs',version,'rank_0.log')
            lines = open_list(txt_file_path)
            for line in lines:
                loss_value = re.search(" Loss = ([\d|.]*)", line)
                if loss_value is not None:
                    loss_value = loss_value.group(1)
                else:
                    continue

                epoch = re.search("Epoch ([\d]*) ", line).group(1)
                split = re.search("/ (.*)]: ", line).group(1)

                data.append([optimizer_method, scheduler_method, learning_rate, loss_value, epoch, split])

data = pd.DataFrame(data,columns=['optimizer','scheduler','learning_rate','loss','epoch','split'])
data['loss'] = data['loss'].astype(float)
data['learning_rate'] = data['learning_rate'].astype(float)
data['epoch'] = data['epoch'].astype(int)
data = data.sort_values(by=['optimizer','scheduler','learning_rate','epoch','split']).reset_index(drop=True)
data

Unnamed: 0,optimizer,scheduler,learning_rate,loss,epoch,split
0,adamw,lambda,0.001,0.711490,0,Test
1,adamw,lambda,0.001,0.204510,0,Test
2,adamw,lambda,0.001,0.833190,0,Train
3,adamw,lambda,0.001,0.579780,0,Train
4,adamw,lambda,0.001,0.641630,1,Test
...,...,...,...,...,...,...
9175,sgd,onecycle,0.100,2.271900,29,Train
9176,sgd,onecycle,0.100,0.027415,29,Train
9177,sgd,onecycle,0.100,0.089330,29,Train
9178,sgd,onecycle,0.100,0.242670,29,Train


In [5]:
data.to_csv('tb_logs/results.csv',index=False)