In [2]:
import polars as pl

In [3]:
# nextflow log thirsty_pasteur -f task_id,workdir,attempt,realtime,status >run_stats/thirsty_pasteur.csv

# df = pl.read_csv('run_stats/compassionate_solvay.csv', separator='\t', has_header=False)
df = pl.read_csv('run_stats/thirsty_pasteur.csv', separator='\t', has_header=False) # 100 runs, t2d

df.columns = 'task_id,workdir,attempt,realtime,status'.split(',')
df = df.sort('task_id')
df[:4]

task_id,workdir,attempt,realtime,status
i64,str,i64,str,str
1,"""gs://singlem-s…",1,"""27m""","""COMPLETED"""
2,"""gs://singlem-s…",1,"""3h 34m 37s""","""COMPLETED"""
3,"""gs://singlem-s…",1,"""55m 30s""","""COMPLETED"""
4,"""gs://singlem-s…",1,"""20m 9s""","""COMPLETED"""


In [4]:
# regex to convert (which is e.g. 1h 33m 53s) to seconds
import re
def to_seconds(s):
    if matches := re.match(r'(\d+)h (\d+)m ([\d\.]+)s', s):
        (h, m, s) = matches.groups()
        return int(h) * 3600 + int(m) * 60 + float(s)
    elif matches := re.match(r'(\d+)m ([\d\.]+)s', s):
        (m, s) = matches.groups()
        return int(m) * 60 + float(s)
    elif matches := re.match(r'([\d\.]+)s', s):
        (s,) = matches.groups()
        return float(s)
    elif matches := re.match(r'(\d+)h', s):
        (h,) = matches.groups()
        return int(h) * 3600
    elif matches := re.match(r'(\d+)m', s):
        (m,) = matches.groups()
        return int(m) * 60
    elif matches := re.match(r'(\d+)h (\d+)m', s):
        (h, m) = matches.groups()
        return int(h) * 3600 + int(m) * 60
    elif matches := re.match(r'-', s):
        return 0
    else:
        raise ValueError(f'Could not parse {s}')
df = df.with_columns(pl.col('realtime').apply(to_seconds).cast(pl.Int32).alias('realtime_s'))
df[:4]

task_id,workdir,attempt,realtime,status,realtime_s
i64,str,i64,str,str,i32
1,"""gs://singlem-s…",1,"""27m""","""COMPLETED""",1620.0
2,"""gs://singlem-s…",1,"""3h 34m 37s""","""COMPLETED""",
3,"""gs://singlem-s…",1,"""55m 30s""","""COMPLETED""",
4,"""gs://singlem-s…",1,"""20m 9s""","""COMPLETED""",


In [5]:
# A rough total cost is $0.010051 per hour for the VM. So take the total time and multiply by that. Plus the disk cost, which is 30GB * $0.04 per month
# With t2d-standard-1, spot price is $0.007068
spot_cost = 0.007068
cost_estimate1 = df['realtime_s'].sum() * (spot_cost / 3600 + 30 * 0.04 / 30 / 24 / 3600)
cost_estimate1
# Then there is ~550,000 runs, but we only processed 100, so multiply by that.
cost_estimate2 = cost_estimate1 * 550000 / 100
cost_estimate2
# But those prices are USD, so converting to AUD
cost_estimate3 = cost_estimate2 * 1.495784
print('total cost', cost_estimate3)
print('cost of the test run', cost_estimate1 *  1.495784)

total cost 1881.4940253833777
cost of the test run 0.3420898227969778


In [6]:
run_ids = pl.read_csv('../100accessions2.txt', has_header=False)
run_ids.columns = ['acc']

df = pl.read_csv('~/git/sandpiper/sra_metadata/shotgun_sra_20240112.some_columns.csv.gz', has_header=False)
df.columns = ['acc','releasedate','mbases','organism','mbytes','avgspotlen']
df[:4]

m = run_ids.join(df, on='acc', how='inner')
total_processed = sum(m['mbases'])

total = sum(df['mbases'])
total_processed / total * 100

observed_cost = 0.89
total / total_processed * observed_cost

5568.617740426561