Skip to content

Commit c68579b

Browse files
authored
Merge pull request #19499 from paldepind/rust-bulk-model-generator
Rust: Bulk model generator
2 parents 1baf6d6 + 41e76e2 commit c68579b

File tree

56 files changed

+5912
-3
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+5912
-3
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ node_modules/
6262

6363
# Temporary folders for working with generated models
6464
.model-temp
65+
/mad-generation-build
6566

6667
# bazel-built in-tree extractor packs
6768
/*/extractor-pack

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ members = [
1010
"rust/ast-generator",
1111
"rust/autobuild",
1212
]
13+
exclude = ["mad-generation-build"]
1314

1415
[patch.crates-io]
1516
# patch for build script bug preventing bazel build
Lines changed: 335 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,335 @@
1+
"""
2+
Experimental script for bulk generation of MaD models based on a list of projects.
3+
4+
Currently the script only targets Rust.
5+
"""
6+
7+
import os.path
8+
import subprocess
9+
import sys
10+
from typing import NotRequired, TypedDict, List
11+
from concurrent.futures import ThreadPoolExecutor, as_completed
12+
import time
13+
14+
import generate_mad as mad
15+
16+
# Absolute path of the enclosing git checkout, as reported by git itself.
# Resolved once at import time, so the script must be started from inside
# a git working tree.
_git_toplevel_output = subprocess.check_output(
    ["git", "rev-parse", "--show-toplevel"]
)
gitroot = _git_toplevel_output.decode("utf-8").strip()

# Scratch directory that holds the cloned projects and their CodeQL databases.
build_dir = os.path.join(gitroot, "mad-generation-build")
22+
23+
24+
def path_to_mad_directory(language: str, name: str) -> str:
    """
    Return the path of the generated-models directory for a project.

    Args:
        language: Top-level language directory inside the repository (e.g. "rust").
        name: Name of the project the models belong to.

    Returns:
        The path `<gitroot>/<language>/ql/lib/ext/generated/<name>`.
    """
    relative_path = f"{language}/ql/lib/ext/generated/{name}"
    return os.path.join(gitroot, relative_path)
26+
27+
28+
# A project to generate models for
29+
class Project(TypedDict):
    """
    Type definition for Rust projects to model.

    Attributes:
        name: The name of the project. It is also used as the directory name
            of the clone, as the prefix of the database directory, and as the
            folder name of the generated models.
        git_repo: URL to the git repository
        git_tag: Optional Git tag to check out. When present it is passed to
            `git clone --branch`; when absent the default branch is cloned.
    """

    # Required keys.
    name: str
    git_repo: str
    # May be omitted entirely; readers use `.get("git_tag")`.
    git_tag: NotRequired[str]
42+
43+
44+
# List of Rust projects to generate models for.
45+
# Every entry is shallow-cloned at its `git_tag`, extracted into a CodeQL
# database, and then passed to the model generator. `name` doubles as the
# directory name for each of those artifacts.
projects: List[Project] = [
    {
        "name": "libc",
        "git_repo": "https://github.com/rust-lang/libc",
        "git_tag": "0.2.172",
    },
    {
        "name": "log",
        "git_repo": "https://github.com/rust-lang/log",
        "git_tag": "0.4.27",
    },
    {
        "name": "memchr",
        "git_repo": "https://github.com/BurntSushi/memchr",
        "git_tag": "2.7.4",
    },
    {
        "name": "once_cell",
        "git_repo": "https://github.com/matklad/once_cell",
        "git_tag": "v1.21.3",
    },
    {
        "name": "rand",
        "git_repo": "https://github.com/rust-random/rand",
        "git_tag": "0.9.1",
    },
    {
        "name": "smallvec",
        "git_repo": "https://github.com/servo/rust-smallvec",
        "git_tag": "v1.15.0",
    },
    {
        "name": "serde",
        "git_repo": "https://github.com/serde-rs/serde",
        "git_tag": "v1.0.219",
    },
    {
        "name": "tokio",
        "git_repo": "https://github.com/tokio-rs/tokio",
        "git_tag": "tokio-1.45.0",
    },
    {
        "name": "reqwest",
        "git_repo": "https://github.com/seanmonstar/reqwest",
        "git_tag": "v0.12.15",
    },
    {
        "name": "rocket",
        "git_repo": "https://github.com/SergioBenitez/Rocket",
        "git_tag": "v0.5.1",
    },
    {
        "name": "actix-web",
        "git_repo": "https://github.com/actix/actix-web",
        "git_tag": "web-v4.11.0",
    },
    {
        "name": "hyper",
        "git_repo": "https://github.com/hyperium/hyper",
        "git_tag": "v1.6.0",
    },
    {
        "name": "clap",
        "git_repo": "https://github.com/clap-rs/clap",
        "git_tag": "v4.5.38",
    },
]
112+
113+
114+
def clone_project(project: Project) -> str:
    """
    Shallow clone a project into the build directory.

    An existing target directory is reused as-is; nothing is fetched or
    checked out in that case.

    Args:
        project: Project description with 'name', 'git_repo', and an optional 'git_tag'.

    Returns:
        The path to the project's directory inside the build directory.

    Raises:
        subprocess.CalledProcessError: If `git clone` exits with a non-zero status.
    """
    name = project["name"]
    repo_url = project["git_repo"]
    git_tag = project.get("git_tag")
    target_dir = os.path.join(build_dir, name)

    # Nothing to do when an earlier run already produced the checkout.
    if os.path.exists(target_dir):
        print(f"Skipping cloning {name} as it already exists at {target_dir}")
        return target_dir

    print(
        f"Cloning {name} from {repo_url} at tag {git_tag}"
        if git_tag
        else f"Cloning {name} from {repo_url}"
    )

    # --depth 1 keeps the clone shallow; --branch also accepts tag names.
    clone_command = ["git", "clone", "--quiet", "--depth", "1"]
    if git_tag:
        clone_command.extend(["--branch", git_tag])
    clone_command.extend([repo_url, target_dir])

    subprocess.check_call(clone_command)
    print(f"Completed cloning {name}")
    return target_dir
157+
158+
159+
def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]:
    """
    Clone all projects in parallel.

    If any clone fails, an error listing the failed projects is printed and
    the process exits with status 1.

    Args:
        projects: List of projects to clone. May be empty.

    Returns:
        List of (project, project_dir) pairs in the same order as the input projects
    """
    start_time = time.time()
    # Results keyed by project name, so the input order can be restored later.
    project_dirs_map: dict[str, tuple[Project, str]] = {}

    # ThreadPoolExecutor raises ValueError for max_workers=0, so only create a
    # pool when there is something to clone.
    if projects:
        max_workers = min(8, len(projects))  # Use at most 8 threads

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Start cloning tasks and keep track of them
            future_to_project = {
                executor.submit(clone_project, project): project
                for project in projects
            }

            # Process results as they complete
            for future in as_completed(future_to_project):
                project = future_to_project[future]
                try:
                    project_dir = future.result()
                except Exception as e:
                    # Keep going so that every failure is reported, not just the first.
                    print(f"ERROR: Failed to clone {project['name']}: {e}")
                else:
                    project_dirs_map[project["name"]] = (project, project_dir)

    # Decide on failure from the projects that are actually missing rather than
    # from a length comparison, which misfires when two entries share a name.
    failed_projects = [
        project["name"]
        for project in projects
        if project["name"] not in project_dirs_map
    ]
    if failed_projects:
        cloned_count = len(projects) - len(failed_projects)
        print(
            f"ERROR: Only {cloned_count} out of {len(projects)} projects were cloned successfully. Failed projects: {', '.join(failed_projects)}"
        )
        sys.exit(1)

    project_dirs = [project_dirs_map[project["name"]] for project in projects]

    clone_time = time.time() - start_time
    print(f"Cloning completed in {clone_time:.2f} seconds")
    return project_dirs
204+
205+
206+
def build_database(project: Project, project_dir: str) -> str | None:
    """
    Build a CodeQL database for a project.

    An existing database directory is reused without being rebuilt.

    Args:
        project: Project description; only 'name' is read here.
        project_dir: The directory containing the project source code.

    Returns:
        The path to the database directory, or None if `codeql database create`
        exited with a non-zero status.
    """
    name = project["name"]
    database_dir = os.path.join(build_dir, f"{name}-db")

    # NOTE(review): any existing directory is treated as a finished database,
    # so --overwrite below is only reached when nothing exists to overwrite.
    # Confirm that a failed build cannot leave a partial directory behind.
    if os.path.exists(database_dir):
        print(
            f"Skipping database creation for {name} as it already exists at {database_dir}"
        )
        return database_dir

    print(f"Building CodeQL database for {name}...")
    create_command = [
        "codeql",
        "database",
        "create",
        "--language=rust",
        "--source-root=" + project_dir,
        "--overwrite",
        "-O",
        # NOTE(review): no shell is involved, so the single quotes around the
        # asterisk reach codeql literally. Confirm the extractor expects that.
        "cargo_features='*'",
        "--",
        database_dir,
    ]
    try:
        subprocess.check_call(create_command)
    except subprocess.CalledProcessError as e:
        print(f"Failed to create database for {name}: {e}")
        return None

    print(f"Successfully created database at {database_dir}")
    return database_dir
250+
251+
252+
def generate_models(project: Project, database_dir: str) -> None:
    """
    Generate models for a project.

    Sink, source, and summary generation are all enabled on the generator.

    Args:
        project: Project description; its 'name' is passed to the generator as the folder.
        database_dir: Path to the project's CodeQL database.
    """
    folder_name = project["name"]

    model_generator = mad.Generator("rust")
    model_generator.generateSinks = True
    model_generator.generateSources = True
    model_generator.generateSummaries = True
    model_generator.setenvironment(database=database_dir, folder=folder_name)
    model_generator.run()
268+
269+
270+
def main() -> None:
    """
    Process all projects in three distinct phases:
    1. Clone projects (in parallel)
    2. Build databases for projects
    3. Generate models for successful database builds

    Before any work starts, the script exits with status 1 if git reports
    working directory changes in any project's existing MaD directory. It
    also exits with status 1 if any database build fails; in that case no
    existing models are deleted and no models are generated.
    """
    # Imported locally so this function does not rely on the module-level imports.
    import shutil

    # Create build directory if it doesn't exist (no check-then-create race)
    os.makedirs(build_dir, exist_ok=True)

    # Check if any of the MaD directories contain working directory changes in git
    for project in projects:
        mad_dir = path_to_mad_directory("rust", project["name"])
        if os.path.exists(mad_dir):
            git_status_output = subprocess.check_output(
                ["git", "status", "-s", mad_dir], text=True
            ).strip()
            if git_status_output:
                print(
                    f"""ERROR: Working directory changes detected in {mad_dir}.

Before generating new models, the existing models are deleted.

To avoid loss of data, please commit your changes."""
                )
                sys.exit(1)

    # Phase 1: Clone projects in parallel
    print("=== Phase 1: Cloning projects ===")
    project_dirs = clone_projects(projects)

    # Phase 2: Build databases for all projects
    print("\n=== Phase 2: Building databases ===")
    database_results = [
        (project, build_database(project, project_dir))
        for project, project_dir in project_dirs
    ]

    # Phase 3: Generate models for all projects
    print("\n=== Phase 3: Generating models ===")

    # Abort before touching any existing models if a database is missing.
    failed_builds = [
        project["name"] for project, db_dir in database_results if db_dir is None
    ]
    if failed_builds:
        print(
            f"ERROR: {len(failed_builds)} database builds failed: {', '.join(failed_builds)}"
        )
        sys.exit(1)

    # Delete the MaD directory for each project
    for project, _ in database_results:
        mad_dir = path_to_mad_directory("rust", project["name"])
        if os.path.exists(mad_dir):
            print(f"Deleting existing MaD directory at {mad_dir}")
            # Removed in-process instead of shelling out to `rm -rf`, which is
            # not available on every platform. rmtree refuses symlinks and
            # plain files, so those are unlinked directly.
            if os.path.isdir(mad_dir) and not os.path.islink(mad_dir):
                shutil.rmtree(mad_dir)
            else:
                os.remove(mad_dir)

    for project, database_dir in database_results:
        # Always true after the failed_builds check above; kept so type
        # checkers narrow `str | None` to `str`.
        if database_dir is not None:
            generate_models(project, database_dir)
332+
333+
334+
# Run the generation pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)