Skip to content

MaD generator: use --threads=0 and 2GB per thread for --ram by default #19744

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jun 23, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -20,7 +20,7 @@ repos:
rev: 25.1.0
hooks:
- id: black
files: ^(misc/codegen/.*|misc/scripts/models-as-data/bulk_generate_mad)\.py$
files: ^(misc/codegen/.*|misc/scripts/models-as-data/.*)\.py$

- repo: local
hooks:
18 changes: 12 additions & 6 deletions misc/scripts/models-as-data/bulk_generate_mad.py
Original file line number Diff line number Diff line change
@@ -236,13 +236,12 @@ def generate_models(config, args, project: Project, database_dir: str) -> None:
language = config["language"]

generator = mad.Generator(language)
# Note: The argument parser converts with-sinks to with_sinks, etc.
generator.generateSinks = should_generate_sinks(project)
generator.generateSources = should_generate_sources(project)
generator.generateSummaries = should_generate_summaries(project)
generator.setenvironment(database=database_dir, folder=name)
generator.threads = args.codeql_threads
generator.ram = args.codeql_ram
generator.setenvironment(database=database_dir, folder=name)
generator.run()


@@ -348,7 +347,7 @@ def download_dca_databases(
"""
print("\n=== Finding projects ===")
project_map = {project["name"]: project for project in projects}
analyzed_databases = {}
analyzed_databases = {n: None for n in project_map}
for experiment_name in experiment_names:
response = get_json_from_github(
f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json",
@@ -361,17 +360,24 @@ def download_dca_databases(
artifact_name = analyzed_database["artifact_name"]
pretty_name = pretty_name_from_artifact_name(artifact_name)

if not pretty_name in project_map:
if not pretty_name in analyzed_databases:
print(f"Skipping {pretty_name} as it is not in the list of projects")
continue

if pretty_name in analyzed_databases:
if analyzed_databases[pretty_name] is not None:
print(
f"Skipping previous database {analyzed_databases[pretty_name]['artifact_name']} for {pretty_name}"
)

analyzed_databases[pretty_name] = analyzed_database

not_found = [name for name, db in analyzed_databases.items() if db is None]
if not_found:
print(
f"ERROR: The following projects were not found in the DCA experiments: {', '.join(not_found)}"
)
sys.exit(1)

def download_and_decompress(analyzed_database: dict) -> str:
artifact_name = analyzed_database["artifact_name"]
repository = analyzed_database["repository"]
@@ -525,7 +531,7 @@ def main(config, args) -> None:
parser.add_argument(
"--codeql-ram",
type=int,
help="What `--ram` value to pass to `codeql` while generating models (by default the flag is not passed)",
help="What `--ram` value to pass to `codeql` while generating models (by default 2048 MB per thread)",
default=None,
)
parser.add_argument(
59 changes: 41 additions & 18 deletions misc/scripts/models-as-data/convert_extensions.py
Original file line number Diff line number Diff line change
@@ -7,65 +7,86 @@
import sys
import tempfile


def quote_if_needed(v):
# string columns
if type(v) is str:
return "\"" + v + "\""
return '"' + v + '"'
# bool column
return str(v)


def parseData(data):
rows = [{ }, { }]
rows = [{}, {}]
for row in data:
d = map(quote_if_needed, row)
provenance = row[-1]
targetRows = rows[1] if provenance.endswith("generated") else rows[0]
helpers.insert_update(targetRows, row[0], " - [" + ', '.join(d) + ']\n')
helpers.insert_update(targetRows, row[0], " - [" + ", ".join(d) + "]\n")

return rows


class Converter:
def __init__(self, language, dbDir):
self.language = language
self.dbDir = dbDir
self.codeQlRoot = subprocess.check_output(["git", "rev-parse", "--show-toplevel"]).decode("utf-8").strip()
self.codeQlRoot = (
subprocess.check_output(["git", "rev-parse", "--show-toplevel"])
.decode("utf-8")
.strip()
)
self.extDir = os.path.join(self.codeQlRoot, f"{self.language}/ql/lib/ext/")
self.dirname = "modelconverter"
self.modelFileExtension = ".model.yml"
self.workDir = tempfile.mkdtemp()


def runQuery(self, query):
print('########## Querying: ', query)
queryFile = os.path.join(self.codeQlRoot, f"{self.language}/ql/src/utils/{self.dirname}", query)
print("########## Querying: ", query)
queryFile = os.path.join(
self.codeQlRoot, f"{self.language}/ql/src/utils/{self.dirname}", query
)
resultBqrs = os.path.join(self.workDir, "out.bqrs")

helpers.run_cmd(['codeql', 'query', 'run', queryFile, '--database', self.dbDir, '--output', resultBqrs], "Failed to generate " + query)
helpers.run_cmd(
[
"codeql",
"query",
"run",
queryFile,
"--database",
self.dbDir,
"--output",
resultBqrs,
],
"Failed to generate " + query,
)
return helpers.readData(self.workDir, resultBqrs)


def asAddsTo(self, rows, predicate):
extensions = [{ }, { }]
extensions = [{}, {}]
for i in range(2):
for key in rows[i]:
extensions[i][key] = helpers.addsToTemplate.format(f"codeql/{self.language}-all", predicate, rows[i][key])

return extensions
extensions[i][key] = helpers.addsToTemplate.format(
f"codeql/{self.language}-all", predicate, rows[i][key]
)

return extensions

def getAddsTo(self, query, predicate):
data = self.runQuery(query)
rows = parseData(data)
return self.asAddsTo(rows, predicate)


def makeContent(self):
summaries = self.getAddsTo("ExtractSummaries.ql", helpers.summaryModelPredicate)
sources = self.getAddsTo("ExtractSources.ql", helpers.sourceModelPredicate)
sinks = self.getAddsTo("ExtractSinks.ql", helpers.sinkModelPredicate)
neutrals = self.getAddsTo("ExtractNeutrals.ql", helpers.neutralModelPredicate)
return [helpers.merge(sources[0], sinks[0], summaries[0], neutrals[0]), helpers.merge(sources[1], sinks[1], summaries[1], neutrals[1])]

return [
helpers.merge(sources[0], sinks[0], summaries[0], neutrals[0]),
helpers.merge(sources[1], sinks[1], summaries[1], neutrals[1]),
]

def save(self, extensions):
# Create directory if it doesn't exist
@@ -77,9 +98,11 @@ def save(self, extensions):
for entry in extensions[0]:
with open(self.extDir + "/" + entry + self.modelFileExtension, "w") as f:
f.write(extensionTemplate.format(extensions[0][entry]))

for entry in extensions[1]:
with open(self.extDir + "/generated/" + entry + self.modelFileExtension, "w") as f:
with open(
self.extDir + "/generated/" + entry + self.modelFileExtension, "w"
) as f:
f.write(extensionTemplate.format(extensions[1][entry]))

def run(self):
Loading
Oops, something went wrong.
Loading
Oops, something went wrong.