#Data Generation
##Details:
One row ~ one project

Features:

- `n_tasks` -> number of activities (project size)
- `n_edges` -> number of precedence links
- `network_density` -> how interconnected the tasks are {`n_edges`/ `n_tasks`^2}
- `T_baseline` -> total duration from CPM using most-likely (m)
- `cp_len` -> number of tasks on the critical path
- `pct_critical_tasks` -> proportion of critical tasks {`cp_len`/`n_tasks`}
- `avg_task_duration` -> mean(m) across all tasks
- `variability` -> average uncertainty width {mean(p-o)}
- `spi_early` -> {EV/PV} at 20% progress
- `cpi_early` -> {EV/AC} at 20% progress
- `instability` -> {std(T)} or variance of project durations across MC runs
- `risk_exposure` -> fraction of MC runs exceeding {baseline*1.025}
- `buffer_factor` -> buffer applied to baseline duration for 'on-time' threshold
- `p_late_diag` -> sanity check

Label:
`label_delay` = 1 if p_late > 1.025, 0 otherwise

##Steps:
1. Generate DAGs randomly (at least for now, later can use templates/ rangen2)
2. Generate PERT triplets
3. Run CPM logic (forward/backward passes)
4. Baseline definition
5. EVM snapshot
6. MC labels
7. Feature engineering
8. Write the csv


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# front matter
import math, random, statistics
from dataclasses import dataclass
from typing import List, Tuple, Dict
import csv
import glob                      # for getting all rcp files from set1
import os
import pandas as pd
RNG = random.Random(42)          # answer to ultimate question of life, the universe, and everything :)


In [17]:
# parsing one .rcp file and outputs-> durations: dict[int, int]; edges: list[tuple(int, int)]
def read_pat(path, drop_dummies=True):
  durations = {}
  edges = []
  with open(path, "r") as f:
    lines=[l.strip() for l in f if l.strip()]
    # header:
    n_acts, n_res = map(int, lines[0].split())
    # skip line 2:
    index = 2
    # looping through each activity
    for id in range(1, n_acts+1):
      line = lines[index]
      parts = [int(x) for x in line.split()]
      durations[id] = parts[0]                                                        # 1-indexed!!! don't screw up
      index_after_res = n_res+1                                                       # skipping the resources
      k = parts[index_after_res]
      succs = parts[index_after_res+1: index_after_res+1+k]
      for s in succs:
        edges.append((id, s))                                                         # also 1-indexed!!! (as good as R now lol)
      index += 1
    # dropping dummy nodes (start and end) and then remapping
    if drop_dummies:
      actual_ids = list(range(2, n_acts))
      id_map = {old: new for new, old in enumerate(actual_ids, start=1)}               # change ids to one less(2->1, 3->2)
      durations = {id_map[i]: d for i, d in durations.items() if i in id_map}
      edges = [(id_map[u],id_map[v]) for u, v in edges if u in id_map and v in id_map] # remapping edges as per new activity ids

    return durations, edges

In [18]:
# assign pert triplets with rules: m = duration from projects; o = max(1, round(0.7*m)); p = max(m+1, round(1.4*m)); o < m < p
@dataclass
class Pert:
  o: float
  m: float
  p: float
def make_triplets(duration: int):
  m = duration
  o = max(1, round(0.7*m))
  p = max(m+1, round(1.4*m))
  return Pert(o, m, p)

In [19]:
# getting predecessors from edgelist
def get_preds(edges, n_tasks):
  preds = [[] for i in range(n_tasks+1)]
  for u, v in edges:
    preds[v].append(u)
  return preds


In [6]:
# cpm: forward pass + backtrack; (ES, EF, T, cp) -> (earliest start, earliest finish, total duration, critical path)
def cpm_early(preds: List[List[int]], durations: List[float], eps = 1e-9):
  n = len(durations) - 1                                    # 1 indexed!!
  es = [0.0]*(n+1)
  ef = [0.0]*(n+1)
  for i in range(1, n+1):
    if preds[i]:
      es[i] =  max(ef[j] for j in preds[i])
    ef[i] = es[i] + durations[i]
  T = max(ef[1:])                                            # makespan
  # backtrack to get critical path
  cp = []
  i = max(range(1, n + 1), key=lambda k: ef[k])              # position of largest EF (sink)
  while True:
    cp.append(i)                                             # since it is the last task, it definitely on the critical path
    if not preds[i]:
      break
    options = [j for j in preds[i] if abs(ef[i]-ef[j]) < eps]
    if options:
      best_j = max(options, key= lambda j: ef[j])            # for multiple options, pick the one with the longest ef
    else:
      best_j = max(preds[i], key=lambda j: ef[j])            # fallback: pick pred with latest EF
    i = best_j
  cp.reverse()
  return es, ef, T, cp

In [7]:
# getting baseline ES, EF, T, and CP by running CPM on the most likely values (from the triplets)
def baseline_schedule(preds: List[List[int]], pert_triplets: List[Pert], n_tasks, buffer_factor= 1.025, eps = 1e-9):       # buffer factor of 2.5%-> strict, gives more variations in data
  # extract most likely durations from Pert objects
  m = [0.0]*(n_tasks+1)
  for i in range(1, n_tasks+1):
    m[i] = pert_triplets[i].m
  # run CPM on m_i
  es_b, ef_b, T_b, cp = cpm_early(preds, m, eps)
  # get deadline = buffer * baseline total duration
  deadline = buffer_factor*T_b
  # building output dictionary
  out ={
      "baseline_T": T_b,
      "baseline_es": es_b,
      "baseline_ef": ef_b,
      "baseline_cp": cp,
      "deadline": deadline,
      "buffer_factor": buffer_factor
  }
  return out


In [8]:
# helper â†’ sample duration d ~ Triangular(o, m, p)
def sample_triangular(pert_triplets):
    n_tasks = len(pert_triplets) - 1          # because index 0 is dummy
    d = [0.0] * (n_tasks + 1)
    for i in range(1, n_tasks + 1):
        t = pert_triplets[i]
        d[i] = RNG.triangular(t.o, t.m, t.p)
    return d

In [9]:
# Early EVM snapshot: getting performance indices CPI, SPI
def evm_snapshot(preds: List[List[int]], pert_triplets: List[Pert], ef_baseline: List[float], T_baseline: float, frac= 0.20, eps=1e-9):
  # number of real tasks
  n_tasks = len(pert_triplets) - 1
  # early snapshot time
  t_early = T_baseline * frac
  t_early = max(t_early, min(ef_baseline[1:]) + 1e-6)  # ensure >= first planned finish
  # sample duration (single draw)
  d = sample_triangular(pert_triplets)
  # running cpm to get ef_real
  es_real, ef_real, T_real, cp_real = cpm_early(preds, d, eps)
  # defining aggregates pv, ev, ac-> planned value, earned value, actual cost
  pv = 0.0
  ev = 0.0
  ac = 0.0
  for i in range(1, n_tasks + 1):
      t = pert_triplets[i]
      if ef_baseline[i] <= t_early:
          pv += t.m
      if ef_real[i] <= t_early:
          ev += t.m
          ac += d[i]
  # calculating performance indices spi, cpi-> schedule performance index, cost performance index
  spi = ev/max(pv, eps)
  cpi= ev/max(ac, eps)
  # building output dictionary
  out = {
        "t_early": t_early,
        "PV": pv,
        "EV": ev,
        "AC": ac,
        "SPI_early": spi,
        "CPI_early": cpi
        }
  return out


In [10]:
# monte carlo sims
def monte_carlo_label(preds: List[List[int]], pert_triplets: List[Pert], deadline: float, K: int, eps=1e-9):
  n_tasks = len(pert_triplets) - 1
  # init counter
  count = 0
  # run simulation K times-> sample duration, run cpm to get total duration for each run
  for k in range(K):
    d = sample_triangular(pert_triplets)
    es_k, ef_k, T_k, cp_k = cpm_early(preds, d, eps)
    # increase counter to record number of 'late' projects
    if T_k > deadline:
      count += 1
  # p_late-> fraction of runs that finish late
  p_late = count/K
  label = 1 if p_late >= 0.5 else 0      # label as delayed if the probability of delay is more than 0.5

  return p_late, label

In [11]:
# putting it all together-> features for one row of the file
def project_features(preds, edges, pert_triplets, base, snap, p_late, label):
  # structure:
  n_tasks = len(preds) - 1
  n_edges = len(edges)
  max_edges = n_tasks * (n_tasks - 1) / 2
  density   = n_edges / max(max_edges, 1)
  cp = base["baseline_cp"]
  cp_len = len(cp)
  pct_critical_tasks = cp_len/max(n_tasks, 1)
  T_baseline = base["baseline_T"]

  # uncertainty:
  m_list = [pert_triplets[t].m for t in range(1, n_tasks+1)]
  ranges = [pert_triplets[i].p - pert_triplets[i].o for i in range(1, n_tasks + 1)]
  avg_task_duration = statistics.fmean(m_list)
  variability = statistics.fmean(ranges)
  instability = statistics.stdev(m_list) if n_tasks>1 else 0.0

  # performance/early health:
  spi_early = snap["SPI_early"]
  cpi_early = snap["CPI_early"]

  risk_exposure = p_late

  # policy knobs:
  buffer_factor = base["buffer_factor"]

  # building dict:
  row = {
        # structure
        "n_tasks": n_tasks,
        "n_edges": n_edges,
        "density": density,
        "critical_path_len": cp_len,
        "pct_critical_tasks": pct_critical_tasks,
        "T_baseline": T_baseline,

        # uncertainty
        "mean_m": avg_task_duration,
        "mean_range_po": variability,
        "instability_m": instability,   # optional

        # early health
        "spi_early": spi_early,
        "cpi_early": cpi_early,

        # policy
        "buffer_factor": buffer_factor,

        # labels/diagnostics
        "label_delay": label,
        "p_late_diag": p_late,
    }
  return row


In [12]:
# function to build full dataset
def build_data_rangen(path, buffer_factor, K):
  all_files = glob.glob(path + "/*.rcp")
  all_files.sort()
  print(f"Found {len(all_files)} .rcp files")

  rows = []

  for file_path in all_files:
      # --- parse this .rcp file ---
      durations, edges = read_pat(file_path, drop_dummies=True)
      n_tasks = len(durations)

      # preds
      preds = get_preds(edges, n_tasks)

      # PERT triplets (1-indexed list)
      triplets = [None] * (n_tasks + 1)
      for i in range(1, n_tasks + 1):
          triplets[i] = make_triplets(durations[i])

      # baseline schedule
      base = baseline_schedule(
          preds,
          triplets,
          n_tasks,
          buffer_factor=buffer_factor
      )

      # EVM snapshot
      snap = evm_snapshot(
          preds,
          triplets,
          base["baseline_ef"],
          base["baseline_T"]
      )

      # Monte Carlo
      p_late, label = monte_carlo_label(
          preds,
          triplets,
          base["deadline"],
          K,
      )

      # one feature row
      row = project_features(preds, edges, triplets, base, snap, p_late, label)

      rows.append(row)

  print(f"Built {len(rows)} rows")
  return rows


In [13]:
# write to csv:
def write_csv(rows: List[Dict], path: str, round_cols: List[str]):

    # rounding
    if round_cols:
        for r in rows:
            for c in round_cols:
                if c in r and isinstance(r[c], float):
                    r[c] = round(r[c], 3)

    # fields= first row
    fields = list(rows[0].keys())

    with open(path, "w", newline="") as f:
        w = csv.DictWriter(f, fieldnames=fields)
        w.writeheader()
        w.writerows(rows)

In [14]:
if __name__ == "__main__":

  RNG.seed(42)

  # folder with your .rcp files (Set 1)
  folder = "/content/drive/MyDrive/Set 1"

  rows = build_data_rangen(
      path=folder,
      buffer_factor=1.01, # 1% buffer
      K=200,
  )

  pretty_cols = [
      "T_baseline",
      "mean_m",
      "mean_range_po",
      "spi_early",
      "cpi_early",
      "instability_m",
      "density",
      "pct_critical_tasks",
      "buffer_factor",
      "p_late_diag",
  ]

  out_csv = "rg30_s.csv"
  write_csv(rows, out_csv, round_cols=pretty_cols)

  print(f"Wrote {len(rows)} rows -> {out_csv}")
  pos = sum(r["label_delay"] for r in rows)
  print(f"label 1s: {pos}/{len(rows)} = {pos/len(rows):.2f}")


Found 900 .rcp files
Built 900 rows
Wrote 900 rows -> rg30_s.csv
label 1s: 465/900 = 0.52


In [15]:
df = pd.DataFrame(rows)
print(df["label_delay"].value_counts(normalize=True))
df.head()

label_delay
1    0.516667
0    0.483333
Name: proportion, dtype: float64


Unnamed: 0,n_tasks,n_edges,density,critical_path_len,pct_critical_tasks,T_baseline,mean_m,mean_range_po,instability_m,spi_early,cpi_early,buffer_factor,label_delay,p_late_diag
0,30,28,0.064,2,0.067,20.0,5.467,3.867,3.014,0.765,1.093,1.01,1,0.535
1,30,54,0.124,3,0.1,22.0,5.667,4.067,2.324,0.692,1.069,1.01,1,0.615
2,30,194,0.446,3,0.1,27.0,5.533,3.933,2.315,4.0,1.186,1.01,1,0.61
3,30,43,0.099,6,0.2,26.0,5.167,3.667,2.705,0.654,0.961,1.01,1,0.625
4,30,50,0.115,6,0.2,31.0,5.667,4.067,2.881,0.68,0.973,1.01,0,0.485


#Data Preprocessing