# Parsing the Juliet dataset with Joern

In [None]:
joern-parse <path-to-juliet-CWE> --language c

In [None]:
joern <path-to-cpg.bin>

* `joern-parse` converts the C source files into a **Code Property Graph (CPG)**.
* The CPG unifies:

  * **AST** (Abstract Syntax Tree)
  * **CFG** (Control Flow Graph)
  * **PDG** (Program Dependence Graph)
* This representation enables **structural, control-flow, and dataflow-aware analysis**.
* The Juliet test suite is ideal because:

  * Functions ending in `_bad` contain known vulnerabilities
  * Functions starting with `good` represent safe variants


In [None]:
run.ossdataflow


In [None]:
// Juliet methods used as samples: methods named like a Juliet test case
// (`*_bad` or `good*`) that contain at least one call, excluding the
// `<global>` pseudo-method and `main`.
val julietMethods = {
  val skippedNames = Set("<global>", "main")

  cpg.method
    .filterNot(method => skippedNames.contains(method.name))
    .filter(_.ast.isCall.nonEmpty)
    .filter { method =>
      val methodName = method.name
      methodName.endsWith("_bad") || methodName.startsWith("good")
    }
    .toList
}

* Excludes:

  * `<global>` (artifacts of parsing)
  * `main` (test harness, not vulnerability logic)
* Keeps only functions that:

  * Contain actual behavior (`ast.isCall.nonEmpty`)
  * Follow Juliet naming conventions (`_bad` / `good*`)
* This ensures **clean supervision** for ML.

In [None]:
/** Ground-truth label derived from the method name.
  *
  * Uses the same `_bad` suffix test as the method filter, so a `good*` method whose
  * name merely contains `_bad` somewhere in the middle is not labelled as vulnerable.
  *
  * @param m method node to label
  * @return 1 when the method name ends with `_bad`, 0 otherwise
  */
def labelOf(m: io.shiftleft.codepropertygraph.generated.nodes.Method): Int =
  if (m.name.endsWith("_bad")) 1 else 0


* `1` → vulnerable function
* `0` → safe function
* This labeling is **standard and defensible** in academic work because Juliet explicitly encodes ground truth.


In [None]:
/** Per-method feature rows for every entry of `julietMethods`.
  *
  * Each element is a 29-field tuple: the method's full name, its 0/1 label from
  * `labelOf`, and 27 numeric/boolean features computed from the method's AST.
  * The field order is the column order used by the CSV export, so it must not change.
  */
val cpgFeatures = julietMethods.map { m =>

  // Property filters such as `.name(...)` / `.code(...)` match the regex against the
  // WHOLE property value. Bare alternatives therefore only hit exact values, and
  // "contains" checks need the surrounding `.*`.
  val sinkCallPattern = "memcpy|memmove|strcpy|system|exec.*"
  // Control structures are classified by their type property, not by their code:
  // the code of an `if` is the full statement text and never equals "if".
  val branchTypes = "IF|SWITCH"
  val loopTypes   = "FOR|WHILE|DO"

  /* =====================================================
   * AST – Size & Structural Complexity
   * ===================================================== */
  val astNodes        = m.ast.size
  val callCount       = m.ast.isCall.size
  val controlCount    = m.ast.isControlStructure.size
  val identifierCount = m.ast.isIdentifier.size
  val literalCount    = m.ast.isLiteral.size

  /* Approximate AST depth; 0 when no depth value is produced */
  val maxAstDepth =
    m.ast.depth.maxOption.getOrElse(0)

  /* =====================================================
   * API Usage (general across CWEs)
   * ===================================================== */
  val inputCallCount =
    m.ast.isCall.name("recv|read|scanf|gets|fgets").size

  val conversionCallCount =
    m.ast.isCall.name("atoi|atol|strtol").size

  val memoryCallCount =
    m.ast.isCall.name("memcpy|memmove|strcpy|strncpy").size

  val execCallCount =
    m.ast.isCall.name("system|exec.*").size

  val formatCallCount =
    m.ast.isCall.name("printf|sprintf|snprintf").size

  /* =====================================================
   * Pointer / Type Complexity (Joern-safe)
   * ===================================================== */
  // An identifier's code is just its name and never contains `*`, so the pointer
  // check looks at the identifier's declared type instead.
  val pointerIdentifierCount =
    m.ast.isIdentifier.typeFullName(".*\\*.*").size   // heuristic

  // NOTE(review): counts every call whose code contains `&`, which also covers
  // `&&` and bitwise-and expressions — confirm this over-count is acceptable.
  val addressOfCount =
    m.ast.isCall.code(".*&.*").size           // heuristic

  /* =====================================================
   * CFG – Control Flow Complexity
   * ===================================================== */
  val branchCount =
    m.ast.isControlStructure.controlStructureType(branchTypes).size

  val loopCount =
    m.ast.isControlStructure.controlStructureType(loopTypes).size

  val returnCount =
    m.ast.isReturn.size

  // Approximation: one decision point per branch/loop statement, plus one.
  val cyclomaticComplexity =
    branchCount + loopCount + 1

  /* =====================================================
   * Guards / Validation
   * ===================================================== */
  // Conditions whose text contains a relational operator character.
  val boundsCheckCount =
    m.ast.isControlStructure
      .condition
      .code(".*<.*|.*<=.*|.*>.*|.*>=.*")
      .size

  // Conditions whose text mentions NULL.
  val nullCheckCount =
    m.ast.isControlStructure
      .condition
      .code(".*NULL.*")
      .size

  /* =====================================================
   * Memory Writes & Sinks
   * ===================================================== */
  // NOTE(review): the pattern is applied to every AST node and `=` also matches
  // `==`, so this presumably over-counts — confirm before relying on absolute values.
  val arrayWriteCount =
    m.ast.code(".*\\[.*\\].*=.*").size

  val sinkCallCount =
    m.ast.isCall.name(sinkCallPattern).size

  /* =====================================================
   * Dataflow / Taint (coarse, thesis-safe)
   * ===================================================== */
  // Co-occurrence only: no data-flow path between input and sink is checked.
  val taintReachable =
    inputCallCount > 0 && sinkCallCount > 0

  /* =====================================================
   * Def–Use Approximation
   * ===================================================== */
  // Identifiers for which `inAssignment` yields a result.
  val defCount =
    m.ast.isIdentifier.where(_.inAssignment).size

  // Same value as identifierCount; kept as its own column for the CSV schema.
  val useCount =
    m.ast.isIdentifier.size

  /* =====================================================
   * Loop Context
   * ===================================================== */
  // True when at least one sink call has a loop among its AST ancestors.
  val sinkInsideLoop =
    m.ast.isCall
      .name(sinkCallPattern)
      .inAst
      .isControlStructure
      .controlStructureType(loopTypes)
      .nonEmpty

  /* =====================================================
   * Normalized / Derived
   * ===================================================== */
  // Both ratios fall back to 0.0 instead of dividing by zero.
  val callsPerAstNode =
    if (astNodes > 0) callCount.toDouble / astNodes else 0.0

  val checksPerSink =
    if (sinkCallCount > 0) boundsCheckCount.toDouble / sinkCallCount else 0.0

  /* =====================================================
   * Final Feature Vector
   * ===================================================== */
  (
    m.fullName,
    labelOf(m),

    astNodes,
    callCount,
    controlCount,
    identifierCount,
    literalCount,
    maxAstDepth,

    inputCallCount,
    conversionCallCount,
    memoryCallCount,
    execCallCount,
    formatCallCount,

    pointerIdentifierCount,
    addressOfCount,

    branchCount,
    loopCount,
    returnCount,
    cyclomaticComplexity,

    boundsCheckCount,
    nullCheckCount,

    arrayWriteCount,
    sinkCallCount,

    taintReachable,

    defCount,
    useCount,

    sinkInsideLoop,

    callsPerAstNode,
    checksPerSink
  )
}


### AST size & structural complexity

```scala
astNodes, callCount, controlCount,
identifierCount, literalCount, maxAstDepth
```

**Why these matter:**

* Vulnerable code often shows:

  * Higher complexity
  * Deeper nesting
  * More operations per function
* `maxAstDepth` approximates **nesting depth**, which correlates with logic errors.

---

### API usage features

```scala
inputCallCount
conversionCallCount
memoryCallCount
execCallCount
formatCallCount
```

* Integer overflow vulnerabilities often involve:

  * **Input functions** (`read`, `scanf`)
  * **Integer conversions** (`atoi`, `strtol`)
  * **Memory operations** (`memcpy`, `strcpy`)
* These features encode **semantic risk**, not just syntax.

---

### Pointer & type complexity (heuristic)

```scala
pointerIdentifierCount
addressOfCount
```

**Explanation:**

* Increased pointer usage often correlates with:

  * Buffer miscalculations
  * Memory corruption

---

### Control-flow & cyclomatic complexity

```scala
branchCount
loopCount
cyclomaticComplexity
```

**Explanation:**

* Complex control flow increases:

  * Missed bounds checks
  * Incorrect size calculations
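* Cyclomatic complexity is approximated by counting each `if`/`switch` statement and each loop as one decision point (individual `case` labels and `&&`/`||` operators are not counted):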

```
branches + loops + 1
```

---

### Guards & validation checks

```scala
boundsCheckCount
nullCheckCount
```

**Why this matters:**

* Safe code typically contains:

  * Explicit bounds checks
  * NULL validation

This directly models **defensive programming behavior**.

---

### Memory writes & sinks

```scala
arrayWriteCount
sinkCallCount
```

---

### Coarse taint approximation

```scala
val taintReachable =
  inputCallCount > 0 && sinkCallCount > 0
```

**Explanation:**

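* This is a **lightweight taint proxy**:

  * Does NOT require full dataflow tracking
* It only records that an input call and a sink call **co-occur in the same function**; it does not verify that data actually flows from one to the other, so treat it as a hint of a potential **input → sink** path.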

---

### Sink inside loop

```scala
sinkInsideLoop
```

**Explanation:**

* Repeated writes inside loops amplify overflow risk
* Especially relevant for size-miscalculated buffers

---

### Normalized features

```scala
callsPerAstNode
checksPerSink
```

**Why normalization is important:**

* Prevents model bias toward large functions
* Encodes **density**, not absolute counts
* `checksPerSink` is especially insightful:

  * Low value → risky behavior



In [None]:
import java.nio.file.{Files, Paths}

/** CSV header: a single comma-separated line, terminated by one newline.
  *
  * Column order mirrors the field order of the feature tuples. Built from a list
  * rather than a multi-line string literal, because embedded line breaks would
  * split the header across several physical lines of the file.
  */
val cpgHeader =
  Seq(
    "id", "label",
    "ast_nodes", "call_count", "control_count", "identifier_count", "literal_count", "max_ast_depth",
    "input_call_count", "conversion_call_count", "memory_call_count", "exec_call_count", "format_call_count",
    "pointer_identifier_count", "address_of_count",
    "branch_count", "loop_count", "return_count", "cyclomatic_complexity",
    "bounds_check_count", "null_check_count",
    "array_write_count", "sink_call_count",
    "taint_reachable",
    "def_count", "use_count",
    "sink_inside_loop",
    "calls_per_ast_node", "checks_per_sink"
  ).mkString(",") + "\n"

/** One CSV line per feature tuple, with fields emitted in tuple order.
  *
  * Fields are quoted RFC 4180 style when they contain a delimiter, a double quote or
  * a line break. Numeric and boolean fields never do, so in practice this only
  * protects the free-form `id` (the method's full name) from shifting the columns.
  */
val cpgRows =
  cpgFeatures.map { featureRow =>
    // Wrap in double quotes and double any embedded quote, but only when needed,
    // so ordinary values are written exactly as before.
    def csvField(value: Any): String = {
      val text = value.toString
      val needsQuoting = text.exists(c => c == ',' || c == '"' || c == '\n' || c == '\r')
      if (needsQuoting) "\"" + text.replace("\"", "\"\"") + "\"" else text
    }

    featureRow.productIterator.map(csvField).mkString(",")
  }

// Write the header followed by all rows. The encoding is pinned to UTF-8 so the
// output does not depend on the platform default charset, and every row
// (including the last) is newline-terminated.
Files.write(
  Paths.get("cpg_features_cwe680.csv"),
  (cpgHeader + cpgRows.map(_ + "\n").mkString)
    .getBytes(java.nio.charset.StandardCharsets.UTF_8)
)
