In [15]:
import polars as pl
from src.paths import (
    PYTHON_VULNERABILITY_FIXES_DATA_PATH,
    PYTHON_CODE_FIXES_DATA_PATH,
    PYTHON_CODE_FIXES_WITH_CONTEXT_DATA_PATH,
    PYTHON_CODE_UNITS_DATA_PATH,
    PYTHON_CODE_CONTEXT_DATA_PATH,
)
from tqdm import tqdm
import jedi
import whatthepatch
from git import Repo
from typing import Any
import jedi.api
import jedi.common
import json
import shutil
import logging

from src.process_code_changes import get_changes

## Filter out data

In [16]:
# Load the raw vulnerability-fix records and report how many distinct
# vulnerabilities they cover before any de-duplication.
python_vulnerability_fixes = pl.read_parquet(PYTHON_VULNERABILITY_FIXES_DATA_PATH)
print(python_vulnerability_fixes.unique("vulnerability_id").shape[0])

# Rows that agree on every column below differ only in `cwe_id`; collapse them
# into a single row whose `cwe_id` is the list of all CWE ids seen.
fix_key_columns = [
    "vulnerability_id",
    "repo",
    "commit",
    "pull_request_number",
    "file",
    "patch",
    "patch_time",
    "commit_source",
    "file_extension",
    "language",
]
python_vulnerability_fixes = (
    python_vulnerability_fixes
    .group_by(*fix_key_columns)
    .agg(pl.col("cwe_id"))
    # Keep a single row per distinct patch text.
    .unique("patch")
)
print(python_vulnerability_fixes.unique("vulnerability_id").shape[0])


1805
1803


## Filter out data connected with test functionality

In [17]:
# A file is test-related when one of its directories is named `test` or `tests`
# (any letter case). The pattern is anchored to the start of the path or to a
# preceding "/" so that directories that merely end in "test" (for example
# `latest/` or `contest/`) are no longer dropped by accident.
TEST_DIRECTORY_PATTERN = r"(?i)(^|/)tests?/"

python_vulnerability_fixes = python_vulnerability_fixes.filter(
    (
        pl.col("file").str.contains(TEST_DIRECTORY_PATTERN)
        # The framework names are plain substrings, so match them literally
        # instead of compiling them as regular expressions.
        | pl.col("patch").str.contains("pytest", literal=True)
        | pl.col("patch").str.contains("unittest", literal=True)
    ).not_()
)
# Number of distinct vulnerabilities left after removing test-related changes.
python_vulnerability_fixes.unique("vulnerability_id").shape[0]

1796

In [18]:
# File types that are dropped from the dataset. Each entry is compared with
# the text after the last "." of the file path.
exclude_langs = [
    "txt",
    "md",
    "JSON",
    "YAML",
    "bugfix",
    "cfg",
    "rst",
    "toml",
    "lock",
    "ini",
    "in",
    "gitignore",
    "sample",
    "pem",
    "feature",
    "tif",
    "security",
    "proto",
    "conf",
    "spec",
    "bin",
    "misc",
    "pyi",
    "pxi",
    "fli",
    "gif",
    "tpl",
    "graphql",
    "http",
    "sgi",
    "pyx",
    "inc"
]
# Compare case-insensitively: entries such as "JSON" and "YAML" are upper-case
# while the extension taken from the path keeps the file's own case, so an
# exact comparison would never exclude `*.json` / `*.yaml` files.
excluded_extensions = [extension.lower() for extension in exclude_langs]

python_vulnerability_fixes = python_vulnerability_fixes.filter(
    (
        pl.col("file")
        .str.split(".")
        .list.last()
        .str.to_lowercase()
        .is_in(excluded_extensions)
    ).not_()
)
# Number of distinct vulnerabilities left after removing the excluded file types.
python_vulnerability_fixes.unique("vulnerability_id").shape[0]

1796

## Process missing commit

In [19]:
# Show the rows that have no commit hash before the manual backfill.
print(python_vulnerability_fixes.filter(pl.col("commit").is_null()))

# Backfill the commit hash for python/cpython pull request 24391.
# Pull-request numbers are only unique within one repository, so the backfill
# is limited to that repository and to rows whose commit is actually missing;
# rows that already carry a commit are never overwritten.
python_vulnerability_fixes = python_vulnerability_fixes.with_columns(
    pl.when(
        pl.col("commit").is_null()
        & (pl.col("repo") == "python/cpython")
        & (pl.col("pull_request_number") == 24391)
    )
    .then(pl.lit("86664c9405136a4904775c52e6caf100a474ec58"))
    .otherwise(pl.col("commit"))
    .alias("commit")
)
print(python_vulnerability_fixes.filter(pl.col("commit").is_null()))

# Commits that are removed from the dataset by hand.
python_vulnerability_fixes = python_vulnerability_fixes.filter(
    ~pl.col("commit").is_in(
        [
            # No changes related to python: https://github.com/pyca/pyopenssl/commit/6bbf44a00b35fb28df1f66aa194b2fe95eab1ab2
            "6bbf44a00b35fb28df1f66aa194b2fe95eab1ab2",
            # Very big change: https://github.com/transifex/transifex-client/commit/e0d1f8b38ec1a24e2999d63420554d8393206f58
            "e0d1f8b38ec1a24e2999d63420554d8393206f58",
            # NOTE(review): the reason for excluding this commit is not recorded.
            "5f7496481bd3db1d06a2d2e62c0dce960a1fe12b",
            # Not exists in repo
            "13336272e32872247fa7d17e964ccd88ec8d1376",
            "2bfe358043096fdba9e2a4cf0f5740102b37fd8f",
        ]
    )
)
# Drop changes to a top-level `setup.py` (the comparison is against the exact
# path "setup.py", so nested `setup.py` files are kept).
python_vulnerability_fixes = python_vulnerability_fixes.filter(
    pl.col("file") != "setup.py"
)
print(python_vulnerability_fixes.filter(pl.col("commit").is_null()))

shape: (1, 11)
┌────────────┬────────────┬────────┬────────────┬───┬───────────┬───────────┬──────────┬───────────┐
│ vulnerabil ┆ repo       ┆ commit ┆ pull_reque ┆ … ┆ commit_so ┆ file_exte ┆ language ┆ cwe_id    │
│ ity_id     ┆ ---        ┆ ---    ┆ st_number  ┆   ┆ urce      ┆ nsion     ┆ ---      ┆ ---       │
│ ---        ┆ str        ┆ str    ┆ ---        ┆   ┆ ---       ┆ ---       ┆ str      ┆ list[str] │
│ str        ┆            ┆        ┆ i64        ┆   ┆ str       ┆ str       ┆          ┆           │
╞════════════╪════════════╪════════╪════════════╪═══╪═══════════╪═══════════╪══════════╪═══════════╡
│ 2021-3733  ┆ python/cpy ┆ null   ┆ 24391      ┆ … ┆ github    ┆ py        ┆ Python   ┆ ["CWE-400 │
│            ┆ thon       ┆        ┆            ┆   ┆           ┆           ┆          ┆ "]        │
└────────────┴────────────┴────────┴────────────┴───┴───────────┴───────────┴──────────┴───────────┘
shape: (0, 11)
┌─────────────┬──────┬────────┬─────────────┬───┬────────────

In [20]:
# Spot check of a single vulnerability: show one row per distinct `language`
# value among its remaining fixes.
single_vulnerability_fixes = python_vulnerability_fixes.filter(
    pl.col("vulnerability_id") == "2022-29198"
)
single_vulnerability_fixes.unique("language")

vulnerability_id,repo,commit,pull_request_number,file,patch,patch_time,commit_source,file_extension,language,cwe_id
str,str,str,i64,str,str,"datetime[μs, UTC]",str,str,str,list[str]
"""2022-29198""","""tensorflow/tensorflow""","""ea50a40e84f6bff15a0912728e35b6…",,"""tensorflow/core/kernels/sparse…","""@@ -67,6 +67,13 @@ class Spars…",2022-05-02 23:16:54 UTC,"""github""","""cc""","""C/C++""","[""CWE-20""]"


In [None]:
# Resume support: if an earlier run already produced the code-fixes parquet,
# load it so the commits it contains can be skipped below.
code_unit_changes: list[dict[str, Any]] = []
if PYTHON_CODE_FIXES_DATA_PATH.exists():
    print("Reading code fixes")
    code_unit_changes = pl.read_parquet(PYTHON_CODE_FIXES_DATA_PATH).to_dicts()

repos: dict[str, Repo] = {}
errors: list[dict[str, Any]] = []

# Commits that already have extracted code units.
checked_commits = {change["commit"] for change in code_unit_changes}

# One row per (repo, vulnerability, commit, source, CWE list); the patches,
# files and languages of that commit are gathered into lists. The rows are
# shuffled, so the processing order differs between runs.
grouped_vulnerabilities = (
    python_vulnerability_fixes
    .group_by("repo", "vulnerability_id", "commit", "commit_source", "cwe_id")
    .agg(pl.col("patch"), pl.col("file"), pl.col("language"))
    .sample(fraction=1, shuffle=True)
)
vulnerabilities_to_check = grouped_vulnerabilities.to_dicts()

for commit_data_row in tqdm(vulnerabilities_to_check):
    if commit_data_row["commit"] not in checked_commits:
        try:
            # The return value is discarded here.
            # NOTE(review): presumably `get_changes` persists its own results —
            # confirm in src.process_code_changes.
            get_changes(commit_data_row)
        except Exception:
            # Best effort: log the failure with its traceback and continue
            # with the next commit.
            logging.exception(f"Error in {commit_data_row}")

Reading code fixes


  5%|▍         | 106/2323 [00:45<06:28,  5.71it/s]ERROR:src.process_code_changes:Error processing commit 48dfc06e49c7f773749e063f8cc69c95509d1c32
ERROR:src.process_code_changes:{'repo': 'mayan-edms/mayan-edms', 'vulnerability_id': '2018-16406', 'commit': '48dfc06e49c7f773749e063f8cc69c95509d1c32', 'commit_source': 'gitlab', 'cwe_id': ['CWE-79'], 'patch': ['@@ -1,15 +1,16 @@\n from __future__ import unicode_literals\n \n from django.apps import apps\n-from django.utils.html import format_html_join\n+from django.utils.html import format_html, format_html_join\n \n from .permissions import permission_cabinet_view\n \n \n def jstree_data(node, selected_node):\n     result = []\n+\n     result.append(\'{\')\n-    result.append(\'"text": "{}",\'.format(node.label))\n+    result.append(format_html(\'"text": "{}",\', node.label))\n     result.append(\n         \'"state": {{ "opened": true, "selected": {} }},\'.format(\n             \'true\' if node == selected_node else \'false\'\n'], 'file': 

In [21]:
def _load_json_records(root):
    """Load every ``*.json`` file found recursively under ``root``.

    A file whose content cannot be decoded as JSON is reported on stdout and
    deleted, so that it no longer appears in later runs.

    Args:
        root: Directory (a ``pathlib.Path``-like object) that is searched
            recursively for ``*.json`` files.

    Returns:
        A list with the decoded content of every readable JSON file.
    """
    records = []
    for json_file in root.rglob("*.json"):
        try:
            with json_file.open() as f:
                records.append(json.load(f))
        except ValueError:
            # `json.JSONDecodeError` and `UnicodeDecodeError` are both
            # subclasses of `ValueError`. A bare `except:` would also catch
            # `KeyboardInterrupt`, so interrupting the kernel while a file is
            # being read would delete a perfectly valid result file.
            print(json_file)
            json_file.unlink()
    return records


new_code_unit_changes = _load_json_records(PYTHON_CODE_UNITS_DATA_PATH)
new_code_context_changes = _load_json_records(PYTHON_CODE_CONTEXT_DATA_PATH)

# Persist the collected records as parquet files for the following cells.
pl.DataFrame(new_code_unit_changes).write_parquet(PYTHON_CODE_FIXES_DATA_PATH)
pl.DataFrame(new_code_context_changes).write_parquet(PYTHON_CODE_FIXES_WITH_CONTEXT_DATA_PATH)

In [22]:
# Load the extracted code units, drop changes whose old or new path is exactly
# "setup.py", and collapse blank (whitespace-only) lines inside the code units.
code_unit_changes_df = (
    pl.read_parquet(PYTHON_CODE_FIXES_DATA_PATH)
    .filter(
        # pl.col("vulnerability_id").is_in(excluded_vulns).not_(),
        (pl.col("new_file") != "setup.py") & (pl.col("old_file") != "setup.py")
    )
    .with_columns(
        # Both columns keep their names, so no alias is required.
        pl.col("code_unit_after_fix", "code_unit_before_fix").str.replace_all(
            r"\n\s*\n", "\n"
        )
    )
)
code_unit_changes_df.describe()

statistic,commit,repo,new_file,patch,code_unit_after_fix,vulnerability_id,cwe_id,old_file,code_unit_before_fix
str,str,str,str,str,str,str,f64,str,str
"""count""","""3282""","""3282""","""3282""","""3282""","""3282""","""3282""",3282.0,"""3282""","""3282"""
"""null_count""","""0""","""0""","""0""","""0""","""0""","""0""",0.0,"""0""","""0"""
"""mean""",,,,,,,,,
"""std""",,,,,,,,,
"""min""","""001b0634cd309e372edb6d7d95d083…","""389ds/389-ds-base""",""".devcontainer/library-scripts/…","""@@ -0,0 +1,13 @@ +def raise_on…","""""","""2013-0208""",,""".devcontainer/library-scripts/…",""""""
"""25%""",,,,,,,,,
"""50%""",,,,,,,,,
"""75%""",,,,,,,,,
"""max""","""ffd3757fc35468a97791e452e7f2d1…","""zwczou/weixin-python""","""zproject/urls.py""","""@@ -99,7 +99,9 @@ extern ""C"" {…","""{%- autoescape true -%} {%- en…","""GHSA-x563-6hqv-26mr""",,"""zproject/urls.py""","""{% if widget.value %}<p class=…"


In [23]:
# Extension of every changed file: the text after the last "." in `new_file`.
extensions_by_row = code_unit_changes_df.with_columns(
    pl.col("new_file").str.split(".").list.last().alias("lang")
)

# Distinct extensions touched by each vulnerability.
extensions_by_vulnerability = (
    extensions_by_row
    .group_by("vulnerability_id")
    .agg(pl.col("lang"))
    .with_columns(pl.col("lang").list.unique())
)

# NOTE(review): `unique("lang")` keeps a single row per distinct extension
# list, so the result is one example id per extension combination rather than
# every vulnerability without a `.py` file — confirm this is intended.
vulnerabilities_without_python = (
    extensions_by_vulnerability
    .unique("lang")
    .filter(pl.col("lang").list.contains("py").not_())
    .get_column("vulnerability_id")
    .to_list()
)
sorted(vulnerabilities_without_python)

['2018-10861',
 '2020-5224',
 '2021-25291',
 '2021-29618',
 '2021-41206',
 '2022-25882',
 '2022-31116',
 '2022-41900',
 '2023-28366',
 '2023-46249',
 '2023-52266',
 '2024-21485',
 '2024-32979']

In [None]:
# Syntax-check every Python-like code unit after the fix. When jedi reports
# syntax errors for a unit, the stored results of its commit are deleted from
# both data directories and the commit is remembered in `bad_commits`.
total = 0
errors_count = 0
bad_commits = set()
python_like_extensions = {"py", "pyi", "pyx", "pxi"}

for row in code_unit_changes_df.sample(fraction=1, shuffle=True).iter_rows(named=True):
    file_extension = row["new_file"].rsplit(".", 1)[-1]
    if file_extension not in python_like_extensions:
        continue

    total += 1
    script = jedi.Script(code=row["code_unit_after_fix"])
    errors = script.get_syntax_errors()
    # Nothing to clean up when the unit parses or the row has no commit hash.
    if not (errors and row["commit"]):
        continue

    # Remove the per-commit directories; missing directories are ignored.
    for data_root in (PYTHON_CODE_UNITS_DATA_PATH, PYTHON_CODE_CONTEXT_DATA_PATH):
        shutil.rmtree(data_root / row["commit"], ignore_errors=True)
    bad_commits.add(row["commit"])
    errors_count += 1

print(errors_count, total)

0 2544


In [25]:
# (rows, columns) after keeping one row per vulnerability id; the row count is
# the number of distinct vulnerabilities that still have code units.
code_unit_changes_df.unique(subset="vulnerability_id").shape

(1377, 9)