In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter, SecondLocator

# Plot 1: Succeeded Jobs
with open('unique_jobs_24.txt', 'r') as file:
    succeeded_data = file.readlines()

# Field index 5 of each line is read as the finish timestamp(s); when several
# comma-separated values are present only the first one is kept.
# Lines with fewer than six fields (e.g. a blank trailing line) are skipped
# instead of raising IndexError, and '<none>' placeholders are ignored.
succeeded_fields = [line.split() for line in succeeded_data]
succeeded_timestamp_strings = [
    fields[5].split(',')[0]
    for fields in succeeded_fields
    if len(fields) > 5 and fields[5] != "<none>"
]

# errors='coerce' turns unparseable values into NaT rather than raising.
succeeded_timestamps = pd.to_datetime(succeeded_timestamp_strings, errors='coerce')

# Earliest successfully parsed timestamp (NaT when nothing could be parsed).
first_succeeded_timestamp = succeeded_timestamps.dropna().min()

# Number of jobs per distinct timestamp, in chronological order
# (value_counts drops NaT by default).
succeeded_counts = succeeded_timestamps.value_counts().sort_index()

# Plot 2: Running and Pending Jobs
def _job_interval_events(lines, start_col, finish_col):
    """Convert job listing lines into a sorted list of (timestamp, delta) events.

    Each line is split on whitespace; field 0 is used as the job id. For the
    first usable line of every job id, two events are emitted: ``(start, +1)``
    and ``(finish, -1)``.

    A line is skipped when it has too few fields, when its job id has already
    produced events, or when its start or finish value is ``'<none>'``. Only
    the first comma-separated value of the finish field is used.

    Parameters
    ----------
    lines : iterable of str
        Raw text lines of the listing.
    start_col, finish_col : int
        Zero-based indices of the whitespace-separated fields holding the
        interval start and end timestamps.

    Returns
    -------
    list of (pandas.Timestamp, int)
        Events sorted by timestamp; at equal timestamps -1 sorts before +1.

    Raises
    ------
    ValueError
        Propagated from ``pd.to_datetime`` for values that are neither
        ``'<none>'`` nor parseable as a timestamp.
    """
    min_fields = max(start_col, finish_col) + 1
    events = []
    seen_job_ids = set()

    for line in lines:
        parts = line.split()
        if len(parts) < min_fields:
            continue  # blank or truncated line

        job_id = parts[0]
        if job_id in seen_job_ids:
            continue

        start_str = parts[start_col]
        finish_str = parts[finish_col].split(',')[0]
        if start_str == '<none>' or finish_str == '<none>':
            # Incomplete interval: leave the job unseen so that a later line
            # carrying both timestamps can still be used.
            continue

        events.append((pd.to_datetime(start_str), 1))
        events.append((pd.to_datetime(finish_str), -1))
        seen_job_ids.add(job_id)

    events.sort()
    return events


def _running_totals(events):
    """Return (timestamps, counts) where counts is the running sum of the deltas."""
    xs = []
    ys = []
    total = 0
    for timestamp, delta in events:
        total += delta
        xs.append(timestamp)
        ys.append(total)
    return xs, ys


with open('running_24_nodes_job.txt', 'r') as file:
    running_data = file.readlines()

# Running listing: interval start is field index 3, interval end is field index 5.
timestamps_running = _job_interval_events(running_data, start_col=3, finish_col=5)
x_running, y_running = _running_totals(timestamps_running)

with open('pending_24_nodes_job.txt', 'r') as file:
    pending_data = file.readlines()

# Pending listing: interval start is field index 2, interval end is field index 3.
# NOTE(review): presumably these are the creation and start timestamps of the
# job, i.e. the time spent pending - confirm against the listing format.
timestamps_pending = _job_interval_events(pending_data, start_col=2, finish_col=3)
x_pending, y_pending = _running_totals(timestamps_pending)

# Plot 3: Total Number of Unique Jobs Over Time
# Columns are separated by runs of two or more whitespace characters; the raw
# string keeps the regex escape '\s' from being treated as an (invalid) Python
# string escape.
data = pd.read_csv('agc-24-3700-240326-1510-exp3.txt', sep=r'\s{2,}', engine='python')

# Keep one row per 'reana-run-job-*' name. na=False makes rows with a missing
# NAME count as non-matching instead of producing an unusable NaN mask.
is_run_job = data['NAME'].str.startswith('reana-run-job-', na=False)
data = (
    data[is_run_job]
    .drop_duplicates(subset=['NAME'])
    # assign() builds a new frame, so no value is written into a filtered slice.
    .assign(CREATED=lambda frame: pd.to_datetime(frame['CREATED']))
)

# Cumulative number of unique jobs, indexed by creation timestamp.
job_counts = data.groupby('CREATED').size().cumsum()

# Plotting the data: every job-count curve is drawn on one shared time axis.
fig, ax = plt.subplots(figsize=(12, 8))

# Cumulative number of finished jobs.
finished_total = succeeded_counts.cumsum()
ax.plot(succeeded_counts.index, finished_total,
        label='Finished', linestyle='-', color='green', alpha=0.5)

# first_succeeded_timestamp is currently not drawn on the figure.

# Jobs running at each event time.
ax.plot(x_running, y_running,
        linestyle='-', color='blue', alpha=0.5, linewidth=3, label='Running')

# Jobs pending at each event time.
ax.plot(x_pending, y_pending,
        linestyle='-', color='orange', alpha=0.5, linewidth=3, label='Pending')

# Cumulative number of unique jobs created.
ax.plot(job_counts.index, job_counts.values,
        linestyle='-', color='purple', label='Created')

ax.set_xlabel('Processing time')
ax.set_ylabel('Number of Jobs')
ax.set_title("AGC CMS ttbar analysis running on REANA on 24 nodes (8 vCPU, 16 GiB RAM) requesting 3.7GiB per job")

# One major tick every 40 seconds, labelled as HH:MM:SS.
time_axis = ax.xaxis
time_axis.set_major_formatter(DateFormatter("%H:%M:%S"))
time_axis.set_major_locator(SecondLocator(interval=40))

ax.grid(True)
ax.legend()
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()


# Load every line of the raw listing.
with open('agc-48-3700-240404-0840-exp3-a-1.txt', 'r') as listing:
    lines = listing.readlines()

# Map each job id (first whitespace-separated token) to its stripped line.
# A job id that appears several times keeps its first position in the mapping
# but ends up with the text of its last occurrence.
stripped_entries = (raw.strip() for raw in lines)
unique_jobs = {
    entry.split()[0]: entry
    for entry in stripped_entries
    if entry.startswith('reana-run-job-')
}

# Write one entry per job id, newline-terminated.
with open('unique_jobs_48.txt', 'w') as out:
    out.writelines(entry + '\n' for entry in unique_jobs.values())


In [None]:
# Collect the time-of-day part of the timestamp found in field index 4 of
# every job line and write one value per line to 'started_time_3.txt'.
# NOTE(review): field index 4 is presumably the job start time in
# 'YYYY-MM-DDTHH:MM:SSZ' form - confirm against the listing format.
with open('unique_jobs_3_1.txt', 'r') as f:
    lines = f.readlines()

formatted_times = []

for line in lines:
    fields = line.split()
    if len(fields) < 5:
        continue  # blank or truncated line: no timestamp field

    timestamp = fields[4]
    if 'T' not in timestamp:
        continue  # placeholder such as '<none>': nothing to extract

    # Keep the text between the first 'T' and the following 'Z'.
    time_part = timestamp.split('T')[1].split('Z')[0]
    formatted_times.append(time_part)

with open('started_time_3.txt', 'w') as f:
    f.writelines(time_part + '\n' for time_part in formatted_times)


In [None]:
# Read the data from the text file
with open('agc-3-1850-240410-1000-exp3-1.txt', 'r') as file:
    data = file.readlines()

# Sort the lines by their second whitespace-separated field (the job status).
# Lines with fewer than two fields (e.g. blank lines) carry no status and are
# dropped here; previously they raised IndexError in the sort key.
sorted_data = sorted(
    (line for line in data if len(line.split()) > 1),
    key=lambda line: line.split()[1],
)

# Keep only the pending jobs and save them to a separate text file.
# sorted() is stable, so the pending lines stay in their original file order.
pending_jobs = [line for line in sorted_data if line.split()[1] == 'Pending']
with open('pending_3nodes_job_exp5.txt', 'w') as file:
    file.writelines(pending_jobs)


In [None]:
# Read the data from the text file
with open('agc-3-1850-240410-1000-exp3-1.txt', 'r') as file:
    data = file.readlines()

# Sort the lines by their second whitespace-separated field (the job status).
# Lines with fewer than two fields (e.g. blank lines) carry no status and are
# dropped here; previously they raised IndexError in the sort key.
sorted_data = sorted(
    (line for line in data if len(line.split()) > 1),
    key=lambda line: line.split()[1],
)

# Keep only the running jobs and save them to a separate text file.
# sorted() is stable, so the running lines stay in their original file order.
running_jobs = [line for line in sorted_data if line.split()[1] == 'Running']
with open('running_3nodes_job_exp5.txt', 'w') as file:
    file.writelines(running_jobs)