### 500543568_530454874_hyper_test_log_to_excel

Team members: Andrew Zhang (SID:500543568) Vincent Yunansan (SID:530454874)  
This notebook transforms all final log files into an Excel file that can be used for further analysis.

In [12]:
#imports
import os
import re
import pandas as pd
import matplotlib.pyplot as plt 
from datetime import datetime

In [13]:
def time_to_seconds(time_str):
    '''
    convert a duration written as <minutes>m<seconds>s into a total number of seconds
    input : time string, e.g. '1m2s'
    output : integer number of seconds, e.g. 62
    raises : IndexError when the string contains no <minutes>m<seconds>s part
    '''

    found = re.findall(r'(\d+)m(\d+)s', time_str)
    # only the first <minutes>m<seconds>s occurrence in the string is used
    minute_part, second_part = found[0]
    return int(minute_part) * 60 + int(second_part)

def extract_info(file_path):

    '''
    function to extract training, validation and model size information from log files

    Every entry of the directory is expected to be named
    <model>_<optimizer>_<learning rate><extension>; entries whose name (extension
    removed) does not split into exactly three '_'-separated parts are reported and
    skipped. Files are processed in sorted name order so the result is reproducible.

    Lines are dispatched on the substrings 'Training', 'Validation' and
    'Model size on disk'. A dispatched line that does not match its pattern, or whose
    numeric fields cannot be converted, is reported and skipped.

    input: file_path string which points to the directory containing log files
    output: dataframe with columns category, model_name, optimizer_name, learning_rate,
            epoch, loss, f1_score, time, size. All 'training' rows come first, followed
            by the 'validation' and 'size' rows; fields a row does not provide are NaN
    '''

    columns = ['category', 'model_name', 'optimizer_name', 'learning_rate', 'epoch', 'loss', 'f1_score', 'time', 'size']

    # compile once instead of re-parsing the patterns for every line
    training_pattern = re.compile(r'Epoch:\s+(\d+), Current LR:\s+([\d\.e-]+), Loss:\s+([\d\.]+), Time:\s+(\d+m\d+s)')
    validation_pattern = re.compile(r'Epoch:\s+(\d+), Loss:\s+([\d\.]+), current lr:\s+([\d\.e-]+), F1 Score:\s+([\d\.]+), Time:\s+(\d+m\d+s)')
    size_pattern = re.compile(r'Model size on disk:\s([\d\.]+) MB')

    # rows are collected in plain lists and turned into a dataframe once at the end;
    # growing a dataframe one row at a time copies it on every insertion
    training_records = []
    validation_records = []

    # os.listdir returns entries in arbitrary order, so sort for a stable row order
    for file_name in sorted(os.listdir(file_path)):
        # strip the real extension, whatever its length
        stem = os.path.splitext(file_name)[0]
        try:
            # the learning rate in the file name only validates the naming scheme;
            # the per-epoch learning rate is read from the log lines themselves
            model_name, optimizer_name, _ = stem.split('_')
        except ValueError:
            print(f"Skipping file {file_name} due to incorrect format.")
            continue

        with open(os.path.join(file_path, file_name), 'r') as file:
            print(f'file being read: {file}')
            for line in file:
                try:
                    if 'Training' in line:
                        # findall(...)[0] raises IndexError when the line does not match
                        epoch, lr, loss, elapsed = training_pattern.findall(line)[0]
                        training_records.append({
                            'category': 'training',
                            'model_name': model_name,
                            'optimizer_name': optimizer_name,
                            'learning_rate': float(lr),
                            'epoch': int(epoch),
                            'loss': float(loss),
                            'time': time_to_seconds(elapsed)
                        })
                    elif 'Validation' in line:
                        epoch, loss, lr, f1_score, elapsed = validation_pattern.findall(line)[0]
                        validation_records.append({
                            'category': 'validation',
                            'model_name': model_name,
                            'optimizer_name': optimizer_name,
                            'learning_rate': float(lr),
                            'epoch': int(epoch),
                            'loss': float(loss),
                            'f1_score': float(f1_score),
                            'time': time_to_seconds(elapsed)
                        })
                    elif 'Model size on disk' in line:
                        size = size_pattern.findall(line)[0]
                        # size rows are kept together with the validation rows
                        validation_records.append({
                            'category': 'size',
                            'model_name': model_name,
                            'optimizer_name': optimizer_name,
                            'size': float(size)
                        })
                except (IndexError, ValueError):
                    # IndexError: pattern not found (e.g. the 'Training concluded' summary line)
                    # ValueError: a captured field such as '1.2.3' is not a valid number
                    print(f"Skipping line due to unmatched pattern: {line.strip()}")

    # keys missing from a record become NaN in the corresponding column
    results_df = pd.DataFrame(training_records + validation_records, columns=columns)

    return results_df

In [14]:
#parse every final log file and write the combined table to models/logs/output.xlsx

#where the logs are read from and where the workbook is written to
source_path = 'models/logs/final'
target_path = 'models/logs/'

results_df = extract_info(source_path)

#single sheet named 'results', without the dataframe index column
with pd.ExcelWriter(target_path + 'output.xlsx') as writer:
    results_df.to_excel(writer, sheet_name='results', index=False)

Skipping file .DS_Store due to incorrect format.
file being read: <_io.TextIOWrapper name='models/logs/final/regnetMixPrec_Adadelta_0.01.log' mode='r' encoding='UTF-8'>
Skipping line due to unmatched pattern: 2024-05-14 01:03:57,396 - __main__ - INFO - Training concluded >>> Total Run Time: 276m58s
file being read: <_io.TextIOWrapper name='models/logs/final/regnet_Adadelta_0.01.log' mode='r' encoding='UTF-8'>
Skipping line due to unmatched pattern: 2024-05-02 13:10:05,024 - __main__ - INFO - Training concluded >>> Total Run Time: 542m5s
file being read: <_io.TextIOWrapper name='models/logs/final/googlenet_SGD_0.01.log' mode='r' encoding='UTF-8'>
Skipping line due to unmatched pattern: 2024-05-01 08:48:21,789 - __main__ - INFO - Training concluded >>> Total Run Time: 215m37s
file being read: <_io.TextIOWrapper name='models/logs/final/resnet50_SGD_0.01.log' mode='r' encoding='UTF-8'>
Skipping line due to unmatched pattern: 2024-05-02 04:07:07,005 - __main__ - INFO - Training concluded >>