In [2]:
import pandas as pd
import os, glob

def get_header_from_schema(schema: pd.DataFrame, subdir: str) -> list[str]:
    names = schema[schema['file pattern'].str.contains(f'^{subdir}')]['content'].tolist()
    return [s.replace(' ', '_') for s in names]


def get_df_from_multiple_csv_in_folder(folder: os.PathLike, names, limit_no_of_file=None, usecols=[], dtype=None, converters=None) -> pd.DataFrame:
    all_csv = glob.glob(os.path.join(folder, '*.csv'))
    if limit_no_of_file:
        all_csv = all_csv[:limit_no_of_file]

    try:
        return pd.concat((pd.read_csv(f, names=names, usecols=usecols, dtype=dtype, converters=converters, on_bad_lines='skip', verbose=True)\
                          for f in all_csv), ignore_index=True)
    except MemoryError:
        retry_files = len(all_csv) // 2
        print(f"Out of memory, retry with litmiting of {retry_files} files")
        get_df_from_multiple_csv_in_folder(folder, names, limit_no_of_file=retry_files, usecols=usecols, dtype=dtype)

In [7]:
import configparser, numpy as np

config = configparser.ConfigParser()
config.read('data.ini')
default = config['DEFAULT']

TRACES_PATH = default.get('traces_path')
MACHINE_EVENTS_SUBDIR = default.get('machine_events_subdir')

schema = pd.read_csv(os.path.join(TRACES_PATH, 'schema.csv'))

machines_info = get_df_from_multiple_csv_in_folder(os.path.join(TRACES_PATH, MACHINE_EVENTS_SUBDIR),\
                                                names=get_header_from_schema(schema, MACHINE_EVENTS_SUBDIR),\
                                                usecols=['machine_ID', 'CPUs', 'Memory'],
                                                dtype={'machine_ID': np.int64})
machines_info.dropna(inplace=True)
machines_info.drop_duplicates(subset='machine_ID', inplace=True)
machines_info.to_csv(default.get('machines_info_path'), index=False)

Tokenization took: 10.67 ms
Type conversion took: 2.01 ms
Parser memory cleanup took: 0.00 ms


In [8]:
TASK_EVENTS_SUBDIR = default.get('task_events_subdir')

task_events = get_df_from_multiple_csv_in_folder(os.path.join(TRACES_PATH, TASK_EVENTS_SUBDIR),\
                                                    names=get_header_from_schema(schema, TASK_EVENTS_SUBDIR),\
                                                    usecols=['time', 'job_ID', 'task_index', 'event_type', 'CPU_request', 'memory_request'],\
                                                    dtype={'event type': 'category'})
task_events.dropna(inplace=True)

submitted_tasks = task_events[task_events['event_type'] == 1]
submitted_tasks.drop(columns=['event_type'], inplace=True)
# tasks might be scheduled multiple time but might be fail/evicted/lost and scheduled again, keep only the last scheduled timestamp
submitted_tasks.drop_duplicates(subset=['job_ID', 'task_index'], keep='last', inplace=True)
submitted_tasks.rename(columns={'time': 'start_time'}, inplace=True)

task_events = task_events[task_events['event_type'] == 4]
task_events.drop(columns=['event_type', 'CPU_request', 'memory_request'], inplace=True)
task_events.rename(columns={'time': 'finished_time'}, inplace=True)

KEY = ['job_ID', 'task_index']
submitted_tasks.set_index(KEY, inplace=True)
task_events.set_index(KEY, inplace=True)
tasks = task_events.join(submitted_tasks, on=KEY, how='inner')

tasks = tasks[tasks.finished_time > tasks.start_time]
tasks['runtime'] = (tasks.finished_time - tasks.start_time) / 1e+6 # time is in microsecond, convert to second

tasks.drop(columns=['start_time', 'finished_time'], inplace=True)
tasks.reset_index(drop=True, inplace=True)
tasks.to_csv(default['tasks_path'], index=False)

Tokenization took: 23.20 ms
Type conversion took: 0.00 ms
Parser memory cleanup took: 0.00 ms
Tokenization took: 29.57 ms
Type conversion took: 25.51 ms
Parser memory cleanup took: 0.00 ms
Tokenization took: 20.36 ms
Type conversion took: 5.81 ms
Parser memory cleanup took: 0.00 ms
Tokenization took: 20.85 ms
Type conversion took: 13.01 ms
Parser memory cleanup took: 0.00 ms
Tokenization took: 16.66 ms
Type conversion took: 0.00 ms
Parser memory cleanup took: 0.00 ms
Tokenization took: 33.52 ms
Type conversion took: 5.51 ms
Parser memory cleanup took: 0.00 ms
Tokenization took: 13.37 ms
Type conversion took: 25.24 ms
Parser memory cleanup took: 0.00 ms
Tokenization took: 28.03 ms
Type conversion took: 7.00 ms
Parser memory cleanup took: 0.00 ms
Tokenization took: 4.00 ms
Type conversion took: 1.00 ms
Parser memory cleanup took: 0.00 ms
Tokenization took: 26.79 ms
Type conversion took: 7.03 ms
Parser memory cleanup took: 0.00 ms
Tokenization took: 21.07 ms
Type conversion took: 7.16 ms


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submitted_tasks.drop(columns=['event_type'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submitted_tasks.drop_duplicates(subset=['job_ID', 'task_index'], keep='last', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submitted_tasks.rename(columns={'time': 'start_time'}, inplace=True)
