# Philly data preparation

Weifan Jiang, weifanjiang@g.harvard.edu

In [1]:
import datetime
import csv
import os
import json
import random
import numpy as np
import tqdm
import pandas as pd

## utility functions

In [8]:
# convert a trace timestamp string into a datetime object
def parse_date(date_str):
    """Parse a 'YYYY-MM-DD HH:MM:SS' timestamp, optionally suffixed by PST/PDT.

    Returns None when date_str is None, the empty string, or the literal
    string 'None'. Otherwise returns a naive datetime.datetime; a trailing
    'PST' or 'PDT' (together with the character in front of it) is dropped
    before parsing, so no time zone information is kept.
    """
    if date_str in (None, '', 'None'):
        return None

    # drop the 3-letter zone abbreviation and the separator preceding it
    has_zone_suffix = date_str.endswith(("PST", "PDT"))
    timestamp = date_str[:-4] if has_zone_suffix else date_str
    return datetime.datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')


# convert time delta object to number of minutes
def timedelta_to_minutes(timedelta):
    """Return the duration of a datetime.timedelta as a float number of minutes.

    The three normalized components of the timedelta (days, seconds,
    microseconds) are each converted to minutes and summed.
    """
    minutes = 0.0
    minutes += timedelta.days * 24 * 60
    minutes += timedelta.seconds / 60.0
    # one minute is 60 * 1,000,000 microseconds (not 60 * 1,000)
    minutes += timedelta.microseconds / (60 * 1000 * 1000)
    return minutes


# tally the machines and GPUs listed in a job attempt's placement detail
# returns (cpu count, gpu count)
def count_machines(detail):
    """Return (number of entries in detail, total length of their "gpus" lists).

    detail is a sized iterable of mappings, each providing a "gpus" entry
    that supports len().
    """
    gpu_total = sum(len(entry["gpus"]) for entry in detail)
    return len(detail), gpu_total


# read csv (the format is not compatible with pandas.read_csv)
def philly_read_csv(fpath, max_lines):
    """Read a trace CSV file into a pandas DataFrame of raw string values.

    The first line of the file is the header; column names are stripped of
    surrounding whitespace. Rows whose length differs from the header are
    normalized (see inline comments) and padded with None. Blank lines are
    skipped.

    Args:
        fpath: path of the CSV file to read.
        max_lines: expected number of data rows; only used as the total of
            the progress bar, it does not limit how many rows are read.

    Returns:
        A DataFrame with one column per header field. Values are kept as
        read by csv.reader (strings), or None where a short row was padded.

    Raises:
        StopIteration: if the file has no header line.
        IndexError: if a row still has more fields than the header after
            normalization.
    """
    # newline="" lets the csv module do its own newline handling
    with open(fpath, "r", newline="") as fin:
        reader = csv.reader(fin)
        columns = [x.strip() for x in next(reader)]
        n_cols = len(columns)
        data_lists = [list() for _ in columns]
        has_time_col = n_cols > 0 and columns[0] == 'time'

        pbar = tqdm.tqdm_notebook(total=max_lines)
        try:
            for row in reader:
                pbar.update(1)

                # csv.reader yields an empty list for a blank line
                if not row:
                    continue

                # special case for file misformat in gpu utilization trace
                if len(row) != n_cols:
                    # drop the empty field produced by a trailing comma
                    if row[-1] == "":
                        row = row[:-1]
                    # 18 fields for a 10-column header: keep the first two
                    # fields, then every other one of the remaining 16
                    if len(row) == 18 and n_cols == 10:
                        row = row[0:2] + row[2:][::2]
                    # pad short rows so every column gets a value
                    row = row + [None] * (n_cols - len(row))

                # remove the time zone, same rule as parse_date: only strip
                # when the value really ends with a PST/PDT suffix
                if (has_time_col and isinstance(row[0], str)
                        and row[0].endswith(("PST", "PDT"))):
                    row[0] = row[0][:-4]

                for idx, element in enumerate(row):
                    data_lists[idx].append(element)
        finally:
            # close the progress bar even when a malformed row raises
            pbar.close()

    return pd.DataFrame(data=dict(zip(columns, data_lists)))

## preparation

In [5]:
# fix the seeds of both random number generators used in this notebook
random.seed(10)
np.random.seed(10)


# input: location of the raw traces
trace_dir = "philly-traces/trace-data/"
job_log_path = os.path.join(trace_dir, "cluster_job_log")

# output: sampled job list and per-job data directory
output_dir = "data/philly"
job_data_dir = os.path.join(output_dir, "job_data")
sampled_jobs_path = os.path.join(output_dir, "sampled_jobs.json")

# make sure both output directories exist
os.makedirs(output_dir, exist_ok=True)
os.makedirs(job_data_dir, exist_ok=True)

## Sample jobs

In [7]:
# the sample is built only once; later runs reuse the file written here
if not os.path.isfile(sampled_jobs_path):
    # load the complete job log
    with open(job_log_path, "r") as fin:
        job_log = json.load(fin)

    # keep jobs with exactly one attempt and annotate each of them with its
    # runtime in minutes (None when the start or end time is missing)
    jobs_single_attempt = []
    for job in job_log:
        if len(job["attempts"]) != 1:
            continue
        attempt = job["attempts"][0]
        start_time = parse_date(attempt["start_time"])
        end_time = parse_date(attempt["end_time"])
        if start_time is None or end_time is None:
            job["runtime_min"] = None
        else:
            job["runtime_min"] = timedelta_to_minutes(end_time - start_time)
        jobs_single_attempt.append(job)

    # keep jobs that have a known runtime, lasted for at least 5 minutes,
    # and were scheduled on more than one GPU
    jobs_single_attempt = [
        job for job in jobs_single_attempt
        if job["runtime_min"] is not None
        and job["runtime_min"] >= 5
        and count_machines(job["attempts"][0]["detail"])[1] > 1
    ]

    # group the remaining jobs by final status (other statuses are ignored)
    by_status = {"Pass": [], "Killed": [], "Failed": []}
    for job in jobs_single_attempt:
        if job["status"] in by_status:
            by_status[job["status"]].append(job)
    jobs_pass = by_status["Pass"]
    jobs_killed = by_status["Killed"]
    jobs_failed = by_status["Failed"]
    print('job status: Pass ({}), Killed ({}), Failed ({})'.format(
        len(jobs_pass), len(jobs_killed), len(jobs_failed)
    ))

    # draw the same number of jobs from every status: the size of the
    # smallest group
    job_size = np.amin([len(jobs_pass), len(jobs_killed), len(jobs_failed)])
    job_level_keys = ("status", "vc", "jobid", "submitted_time", "user", "runtime_min")
    attempt_level_keys = ("start_time", "end_time", "detail")

    output_json = []
    pbar = tqdm.tqdm_notebook(total=job_size * 3, desc="extract sampled jobs")
    for status_jobs in (jobs_pass, jobs_killed, jobs_failed):
        for job in random.sample(status_jobs, job_size):
            pbar.update(1)
            attempt = job["attempts"][0]
            # flatten the job: job-level fields first, then the fields of
            # its single attempt
            output_job = {key: job[key] for key in job_level_keys}
            output_job.update((key, attempt[key]) for key in attempt_level_keys)
            output_json.append(output_job)
    pbar.close()

    with open(sampled_jobs_path, "w") as fout:
        json.dump(output_json, fout, indent=2)


# always read the sample back from disk
with open(sampled_jobs_path, "r") as fin:
    sampled_jobs = json.load(fin)

## load full cluster traces

In [None]:
# the second argument only sizes the progress bar of philly_read_csv
# (presumably the number of rows in the GPU utilization trace)
gpu_df = philly_read_csv(os.path.join(trace_dir, "cluster_gpu_util"), 44750641)