|
| 1 | +""" |
| 2 | +Experimental script for bulk generation of MaD models based on a list of projects. |
| 3 | +
|
| 4 | +Currently the script only targets Rust. |
| 5 | +""" |
| 6 | + |
| 7 | +import os.path |
| 8 | +import subprocess |
| 9 | +import sys |
| 10 | +from typing import NotRequired, TypedDict, List |
| 11 | +from concurrent.futures import ThreadPoolExecutor, as_completed |
| 12 | +import time |
| 13 | + |
| 14 | +import generate_mad as mad |
| 15 | + |
# Top-level directory of the git working tree this script is run from, as
# reported by `git rev-parse --show-toplevel`.
gitroot = (
    subprocess.run(
        ["git", "rev-parse", "--show-toplevel"],
        check=True,
        stdout=subprocess.PIPE,
    )
    .stdout.decode("utf-8")
    .strip()
)
# Working directory that receives the cloned sources and the CodeQL databases.
build_dir = os.path.join(gitroot, "mad-generation-build")
| 22 | + |
| 23 | + |
def path_to_mad_directory(language: str, name: str) -> str:
    """
    Return the directory holding the generated MaD models for a project.

    Args:
        language: Top-level language directory inside the repository (e.g. "rust").
        name: Name of the project whose generated models are stored there.

    Returns:
        The path `<gitroot>/<language>/ql/lib/ext/generated/<name>`.
    """
    relative_path = f"{language}/ql/lib/ext/generated/{name}"
    return os.path.join(gitroot, relative_path)
| 26 | + |
| 27 | + |
| 28 | +# A project to generate models for |
| 29 | +class Project(TypedDict): |
| 30 | + """ |
| 31 | + Type definition for Rust projects to model. |
| 32 | +
|
| 33 | + Attributes: |
| 34 | + name: The name of the project |
| 35 | + git_repo: URL to the git repository |
| 36 | + git_tag: Optional Git tag to check out |
| 37 | + """ |
| 38 | + |
| 39 | + name: str |
| 40 | + git_repo: str |
| 41 | + git_tag: NotRequired[str] |
| 42 | + |
| 43 | + |
# Rust projects to generate models for, each pinned to a release tag.
# The list order is the order in which databases are built and models generated.
projects: List[Project] = [
    {"name": "libc", "git_repo": "https://github.com/rust-lang/libc", "git_tag": "0.2.172"},
    {"name": "log", "git_repo": "https://github.com/rust-lang/log", "git_tag": "0.4.27"},
    {"name": "memchr", "git_repo": "https://github.com/BurntSushi/memchr", "git_tag": "2.7.4"},
    {"name": "once_cell", "git_repo": "https://github.com/matklad/once_cell", "git_tag": "v1.21.3"},
    {"name": "rand", "git_repo": "https://github.com/rust-random/rand", "git_tag": "0.9.1"},
    {"name": "smallvec", "git_repo": "https://github.com/servo/rust-smallvec", "git_tag": "v1.15.0"},
    {"name": "serde", "git_repo": "https://github.com/serde-rs/serde", "git_tag": "v1.0.219"},
    {"name": "tokio", "git_repo": "https://github.com/tokio-rs/tokio", "git_tag": "tokio-1.45.0"},
    {"name": "reqwest", "git_repo": "https://github.com/seanmonstar/reqwest", "git_tag": "v0.12.15"},
    {"name": "rocket", "git_repo": "https://github.com/SergioBenitez/Rocket", "git_tag": "v0.5.1"},
    {"name": "actix-web", "git_repo": "https://github.com/actix/actix-web", "git_tag": "web-v4.11.0"},
    {"name": "hyper", "git_repo": "https://github.com/hyperium/hyper", "git_tag": "v1.6.0"},
    {"name": "clap", "git_repo": "https://github.com/clap-rs/clap", "git_tag": "v4.5.38"},
]
| 112 | + |
| 113 | + |
def clone_project(project: Project) -> str:
    """
    Shallow clone a project into the build directory, unless already present.

    Args:
        project: Project description; 'name' and 'git_repo' are required,
            'git_tag' is optional.

    Returns:
        The path of the project's clone inside the build directory.

    Raises:
        subprocess.CalledProcessError: If `git clone` exits with a non-zero status.
    """
    name = project["name"]
    repo_url = project["git_repo"]
    git_tag = project.get("git_tag")

    target_dir = os.path.join(build_dir, name)

    # An existing directory is taken as a finished clone and reused as-is.
    if os.path.exists(target_dir):
        print(f"Skipping cloning {name} as it already exists at {target_dir}")
        return target_dir

    print(
        f"Cloning {name} from {repo_url} at tag {git_tag}"
        if git_tag
        else f"Cloning {name} from {repo_url}"
    )

    # `--depth 1` keeps the clone shallow; `--branch` is only added when a tag
    # was configured for the project.
    command = ["git", "clone", "--quiet", "--depth", "1"]
    if git_tag:
        command += ["--branch", git_tag]
    command += [repo_url, target_dir]

    subprocess.check_call(command)
    print(f"Completed cloning {name}")
    return target_dir
| 157 | + |
| 158 | + |
def clone_projects(projects: List[Project]) -> List[tuple[Project, str]]:
    """
    Clone all projects in parallel.

    Every project is cloned on a thread pool of at most 8 workers. If any
    clone fails, the failures are reported and the process exits with status 1.

    Args:
        projects: List of projects to clone (may be empty)

    Returns:
        List of (project, project_dir) pairs in the same order as the input projects
    """
    start_time = time.time()
    # ThreadPoolExecutor rejects max_workers=0, so keep at least one worker
    # even when there is nothing to clone.
    max_workers = max(1, min(8, len(projects)))
    # Results are keyed by input position rather than by project name, so that
    # two entries sharing a name cannot shadow each other in the bookkeeping.
    cloned_dirs: dict[int, str] = {}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Start cloning tasks and remember which input position each belongs to
        future_to_index = {
            executor.submit(clone_project, project): index
            for index, project in enumerate(projects)
        }

        # Process results as they complete
        for future in as_completed(future_to_index):
            index = future_to_index[future]
            try:
                cloned_dirs[index] = future.result()
            except Exception as e:
                # Report and keep going; all failures are summarised below.
                print(f"ERROR: Failed to clone {projects[index]['name']}: {e}")

    if len(cloned_dirs) != len(projects):
        failed_projects = [
            project["name"]
            for index, project in enumerate(projects)
            if index not in cloned_dirs
        ]
        print(
            f"ERROR: Only {len(cloned_dirs)} out of {len(projects)} projects were cloned successfully. Failed projects: {', '.join(failed_projects)}"
        )
        sys.exit(1)

    project_dirs = [
        (project, cloned_dirs[index]) for index, project in enumerate(projects)
    ]

    clone_time = time.time() - start_time
    print(f"Cloning completed in {clone_time:.2f} seconds")
    return project_dirs
| 204 | + |
| 205 | + |
def build_database(project: Project, project_dir: str) -> str | None:
    """
    Build a CodeQL database for a project, unless one already exists.

    Args:
        project: A dictionary containing project information; only 'name' is read here.
        project_dir: The directory containing the project source code.

    Returns:
        The path to the database directory, or None if the database could not
        be created (the failure is printed rather than raised).
    """
    name = project["name"]

    # Create database directory path
    database_dir = os.path.join(build_dir, f"{name}-db")

    # Only build the database if it doesn't already exist
    # NOTE(review): a directory left behind by a failed `codeql database create`
    # would presumably also be treated as an existing database on the next
    # run - confirm whether a partial directory can remain after a failure.
    if os.path.exists(database_dir):
        print(
            f"Skipping database creation for {name} as it already exists at {database_dir}"
        )
        return database_dir

    print(f"Building CodeQL database for {name}...")
    try:
        subprocess.check_call(
            [
                "codeql",
                "database",
                "create",
                "--language=rust",
                "--source-root=" + project_dir,
                "--overwrite",
                "-O",
                # NOTE(review): no shell is involved, so the single quotes are
                # passed to codeql literally - confirm the extractor expects them.
                "cargo_features='*'",
                "--",
                database_dir,
            ]
        )
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers the `codeql` executable being missing or not runnable,
        # which is reported like any other failed build instead of crashing.
        print(f"Failed to create database for {name}: {e}")
        return None

    print(f"Successfully created database at {database_dir}")
    return database_dir
| 250 | + |
| 251 | + |
def generate_models(project: Project, database_dir: str) -> None:
    """
    Run the MaD generator for one project with sinks, sources and summaries enabled.

    Args:
        project: A dictionary containing project information; only 'name' is
            read here and is passed to the generator as its `folder`.
        database_dir: The CodeQL database directory passed to the generator as
            its `database`.
    """
    folder_name = project["name"]

    rust_generator = mad.Generator("rust")
    # Enable every kind of model before running.
    rust_generator.generateSinks = True
    rust_generator.generateSources = True
    rust_generator.generateSummaries = True
    rust_generator.setenvironment(database=database_dir, folder=folder_name)
    rust_generator.run()
| 268 | + |
| 269 | + |
def main() -> None:
    """
    Process all projects in three distinct phases:
    1. Clone projects (in parallel)
    2. Build databases for projects
    3. Generate models for successful database builds

    Before any work starts, the run is aborted (exit status 1) if an existing
    MaD directory has uncommitted changes, because those directories are
    deleted before new models are generated. The run is also aborted if any
    database build fails; in that case no existing models are deleted.
    """
    # Imported here so this function carries its own dependency.
    import shutil

    # Create build directory if it doesn't exist
    os.makedirs(build_dir, exist_ok=True)

    # Check if any of the MaD directories contain working directory changes in git
    for project in projects:
        mad_dir = path_to_mad_directory("rust", project["name"])
        if not os.path.exists(mad_dir):
            continue
        git_status_output = subprocess.check_output(
            ["git", "status", "-s", mad_dir], text=True
        ).strip()
        if git_status_output:
            print(
                f"""ERROR: Working directory changes detected in {mad_dir}.

Before generating new models, the existing models are deleted.

To avoid loss of data, please commit your changes."""
            )
            sys.exit(1)

    # Phase 1: Clone projects in parallel
    print("=== Phase 1: Cloning projects ===")
    project_dirs = clone_projects(projects)

    # Phase 2: Build databases for all projects
    print("\n=== Phase 2: Building databases ===")
    database_results = [
        (project, build_database(project, project_dir))
        for project, project_dir in project_dirs
    ]

    # Stop before touching any existing models if a database is missing.
    failed_builds = [
        project["name"] for project, db_dir in database_results if db_dir is None
    ]
    if failed_builds:
        print(
            f"ERROR: {len(failed_builds)} database builds failed: {', '.join(failed_builds)}"
        )
        sys.exit(1)

    # Phase 3: Generate models for all projects
    print("\n=== Phase 3: Generating models ===")

    # Delete the MaD directory for each project
    for project, _ in database_results:
        mad_dir = path_to_mad_directory("rust", project["name"])
        if os.path.exists(mad_dir):
            print(f"Deleting existing MaD directory at {mad_dir}")
            shutil.rmtree(mad_dir)

    for project, database_dir in database_results:
        # Always true after the failed-build check above; kept so the
        # `str | None` result is narrowed to `str` for the call.
        if database_dir is not None:
            generate_models(project, database_dir)


if __name__ == "__main__":
    main()
0 commit comments