In [1]:
import polars as pl
from src.paths import (
    PYTHON_VULNERABILITY_FIXES_DATA_PATH,
    PYTHON_CODE_FIXES_DATA_PATH,
    PYTHON_CODE_FIXES_WITH_CONTEXT_DATA_PATH,
    PYTHON_CODE_UNITS_DATA_PATH,
    PYTHON_CODE_CONTEXT_DATA_PATH,
)
from tqdm import tqdm
import jedi
import whatthepatch
from git import Repo
from typing import Any
import jedi.api
import jedi.common
import json

from src.process_code_changes import get_changes

## Filter out data

In [2]:
# Load the Python vulnerability fixes and report how many distinct
# vulnerabilities are present before any de-duplication.
raw_python_fixes = pl.read_parquet(PYTHON_VULNERABILITY_FIXES_DATA_PATH)
print(raw_python_fixes.unique("vulnerability_id").height)

# Columns that identify one patched file of one fix; rows sharing all of them
# are merged and their CWE ids gathered into a list column.
fix_identity_columns = [
    "vulnerability_id",
    "repo",
    "commit",
    "pull_request_number",
    "file",
    "patch",
    "patch_time",
    "commit_source",
    "file_extension",
    "language",
]
python_vulnerability_fixes = (
    raw_python_fixes
    .group_by(fix_identity_columns)
    .agg(pl.col("cwe_id"))
    # Keep a single row per distinct patch text.
    .unique("patch")
)
print(python_vulnerability_fixes.unique("vulnerability_id").height)


1805
1803


## Filter out data connected with test functionality

In [3]:
# Drop rows that belong to test code: files that live in a `test/` or `tests/`
# directory (any letter case), or patches that mention pytest / unittest.
#
# The directory name must start at the beginning of the path or right after a
# `/`. The previous pattern made the leading slash optional, so any directory
# whose name merely ended in "test"/"tests" (e.g. `latest/`, `contest/`) was
# also treated as test code and silently removed.
TEST_DIRECTORY_PATTERN = r"(?i)(?:^|/)tests?/"

is_test_related = (
    pl.col("file").str.contains(TEST_DIRECTORY_PATTERN)
    | pl.col("patch").str.contains("pytest")
    | pl.col("patch").str.contains("unittest")
)
python_vulnerability_fixes = python_vulnerability_fixes.filter(
    is_test_related.not_()
)
# Number of distinct vulnerabilities left after the filter.
python_vulnerability_fixes.unique("vulnerability_id").shape[0]

1795

In [4]:
# File-name suffixes (the text after the last ".") of files that should not be
# analysed as source code.
exclude_langs = [
    "txt",
    "md",
    "JSON",
    "YAML",
    "bugfix",
    "cfg",
    "rst",
    "toml",
    "lock",
    "ini",
    "in",
    "gitignore",
    "sample",
    "pem",
    "feature",
    "tif",
    "security",
    "proto",
    "conf",
    "spec",
    "bin",
    "misc",
    "pyi",
    "pxi",
    "fli",
    "gif",
    "tpl",
    "graphql",
    "http",
    "sgi",
    "pyx",
    "inc"
]

# Compare case-insensitively: the list mixes upper-case entries ("JSON",
# "YAML") with lower-case ones, while real file suffixes are usually lower
# case, so an exact comparison never matched e.g. `package.json`.
excluded_suffixes = [suffix.lower() for suffix in exclude_langs]
file_suffix = pl.col("file").str.split(".").list.last().str.to_lowercase()

python_vulnerability_fixes = python_vulnerability_fixes.filter(
    file_suffix.is_in(excluded_suffixes).not_()
)
# Number of distinct vulnerabilities left after the filter.
python_vulnerability_fixes.unique("vulnerability_id").shape[0]

1795

## Process missing commit

In [5]:
# Rows without a commit hash, before the manual fix-up.
print(python_vulnerability_fixes.filter(pl.col("commit").is_null()))

# Back-fill the commit hash for the one fix that only has a pull request
# number (python/cpython PR 24391). Pull request numbers are only unique within
# a repository, so the condition also requires the repo and a missing commit;
# matching on the number alone would overwrite the commit of any other
# repository's PR #24391.
python_vulnerability_fixes = python_vulnerability_fixes.with_columns(
    pl.when(
        pl.col("commit").is_null()
        & (pl.col("repo") == "python/cpython")
        & (pl.col("pull_request_number") == 24391)
    )
    .then(pl.lit("86664c9405136a4904775c52e6caf100a474ec58"))
    .otherwise(pl.col("commit"))
    .alias("commit")
)
# Should now be empty.
print(python_vulnerability_fixes.filter(pl.col("commit").is_null()))

# Commits excluded by hand:
# No changes related to python: https://github.com/pyca/pyopenssl/commit/6bbf44a00b35fb28df1f66aa194b2fe95eab1ab2
# Very big change: https://github.com/transifex/transifex-client/commit/e0d1f8b38ec1a24e2999d63420554d8393206f58
# NOTE(review): a row whose commit is still null makes this predicate null and
# is therefore dropped by `filter` as well — confirm that is acceptable.
python_vulnerability_fixes = python_vulnerability_fixes.filter(~pl.col("commit").is_in(["6bbf44a00b35fb28df1f66aa194b2fe95eab1ab2", "e0d1f8b38ec1a24e2999d63420554d8393206f58"]))
print(python_vulnerability_fixes.filter(pl.col("commit").is_null()))

shape: (1, 11)
┌────────────┬────────────┬────────┬────────────┬───┬───────────┬───────────┬──────────┬───────────┐
│ vulnerabil ┆ repo       ┆ commit ┆ pull_reque ┆ … ┆ commit_so ┆ file_exte ┆ language ┆ cwe_id    │
│ ity_id     ┆ ---        ┆ ---    ┆ st_number  ┆   ┆ urce      ┆ nsion     ┆ ---      ┆ ---       │
│ ---        ┆ str        ┆ str    ┆ ---        ┆   ┆ ---       ┆ ---       ┆ str      ┆ list[str] │
│ str        ┆            ┆        ┆ i64        ┆   ┆ str       ┆ str       ┆          ┆           │
╞════════════╪════════════╪════════╪════════════╪═══╪═══════════╪═══════════╪══════════╪═══════════╡
│ 2021-3733  ┆ python/cpy ┆ null   ┆ 24391      ┆ … ┆ github    ┆ py        ┆ Python   ┆ ["CWE-400 │
│            ┆ thon       ┆        ┆            ┆   ┆           ┆           ┆          ┆ "]        │
└────────────┴────────────┴────────┴────────────┴───┴───────────┴───────────┴──────────┴───────────┘
shape: (0, 11)
┌─────────────┬──────┬────────┬─────────────┬───┬────────────

In [7]:
import logging

# Resume support: commits already present in the cached parquet file are
# skipped by the loop below.
if PYTHON_CODE_FIXES_DATA_PATH.exists():
    print("Reading code fixes")
    code_unit_changes: list[dict[str, Any]] = pl.read_parquet(
        PYTHON_CODE_FIXES_DATA_PATH
    ).to_dicts()
else:
    code_unit_changes = []

repos: dict[str, Repo] = {}

# One row per (repo, vulnerability, commit) with the patches, files and
# languages of that commit collected into lists. `sample(fraction=1,
# shuffle=True)` keeps every row but randomises the processing order.
grouped_vulnerabilities = (
    python_vulnerability_fixes.group_by(
        "repo", "vulnerability_id", "commit", "commit_source", "cwe_id"
    )
    .agg(pl.col("patch"), pl.col("file"), pl.col("language"))
    .sample(fraction=1, shuffle=True)
)

# Rows whose processing failed, together with the exception that was raised,
# so failures can be inspected after the run instead of only in the log.
errors: list[dict[str, Any]] = []
checked_commits = {change["commit"] for change in code_unit_changes}

vulnerabilities_to_check = grouped_vulnerabilities.to_dicts()

for commit_data_row in tqdm(vulnerabilities_to_check):
    if commit_data_row["commit"] in checked_commits:
        continue
    try:
        get_changes(commit_data_row)
    except Exception as error:
        # Best effort: log the traceback, remember the failure, keep going.
        logging.exception("Error in %s", commit_data_row)
        errors.append({"commit_data": commit_data_row, "error": repr(error)})

Reading code fixes


  3%|▎         | 63/2336 [23:23<14:04:09, 22.28s/it]


KeyboardInterrupt: 

In [7]:
# Debug cell: call get_changes on whatever row the processing loop left in
# `commit_data_row`, without that loop's try/except, so an exception propagates
# with its full traceback.
get_changes(commit_data_row)

Error processing commit 0bbb560183fabf0533289700845dafa94951f227
{'repo': 'django/django', 'vulnerability_id': '2019-6975', 'commit': '0bbb560183fabf0533289700845dafa94951f227', 'commit_source': 'github', 'cwe_id': ['CWE-770'], 'patch': ["@@ -30,7 +30,20 @@ def format(number, decimal_sep, decimal_pos=None, grouping=0, thousand_sep='',\n     # sign\n     sign = ''\n     if isinstance(number, Decimal):\n-        str_number = '{:f}'.format(number)\n+        # Format values with more than 200 digits (an arbitrary cutoff) using\n+        # scientific notation to avoid high memory usage in {:f}'.format().\n+        _, digits, exponent = number.as_tuple()\n+        if abs(exponent) + len(digits) > 200:\n+            number = '{:e}'.format(number)\n+            coefficient, exponent = number.split('e')\n+            # Format the coefficient.\n+            coefficient = format(\n+                coefficient, decimal_sep, decimal_pos, grouping,\n+                thousand_sep, force_grouping,\n+ 

KeyError: PosixPath('/Users/somen/Zavodi/unik/open_source_code_vulnerabilities_dataset/.venv/lib/python3.12/site-packages/jedi/third_party/typeshed/stdlib/3/functools.pyi')

In [13]:
# Debug cell: call get_changes directly on the first entry of
# `vulnerabilities_to_check` (no try/except, so any exception surfaces here).
get_changes(vulnerabilities_to_check[0])

Error processing commit 43a9c9bfa6aa626ec2a22540bea28d2ca77964be
{'repo': 'pypa/setuptools', 'vulnerability_id': '2022-40897', 'commit': '43a9c9bfa6aa626ec2a22540bea28d2ca77964be', 'commit_source': 'github', 'patch': ['@@ -217,7 +217,7 @@ def wrapper(*args, **kwargs):\n     return wrapper\n \n \n-REL = re.compile(r"""<([^>]*\\srel\\s*=\\s*[\'"]?([^\'">]+)[^>]*)>""", re.I)\n+REL = re.compile(r"""<([^>]*\\srel\\s{0,10}=\\s{0,10}[\'"]?([^\'" >]+)[^>]*)>""", re.I)\n """\n Regex for an HTML tag with \'rel="val"\' attributes.\n """'], 'file': ['setuptools/package_index.py'], 'language': ['Python']}


ValueError: tuple.index(x): x not in tuple

In [6]:
def _load_json_records(directory):
    """Load every ``*.json`` file found recursively under ``directory``.

    Files that cannot be decoded as JSON are reported on stdout and deleted.
    Only ``ValueError`` is handled (``json.JSONDecodeError`` and
    ``UnicodeDecodeError`` are both subclasses of it); anything else, such as
    ``KeyboardInterrupt`` or an ``OSError``, propagates and leaves the file
    in place instead of deleting data that may be perfectly valid.

    Args:
        directory: path object supporting ``rglob``.

    Returns:
        List with the parsed content of every readable JSON file.
    """
    records = []
    for json_file in directory.rglob("*.json"):
        try:
            with json_file.open() as handle:
                records.append(json.load(handle))
        except ValueError:
            print(json_file)
            json_file.unlink()
    return records


new_code_unit_changes = _load_json_records(PYTHON_CODE_UNITS_DATA_PATH)
new_code_context_changes = _load_json_records(PYTHON_CODE_CONTEXT_DATA_PATH)

# Persist the collected records as parquet files.
pl.DataFrame(new_code_unit_changes).write_parquet(PYTHON_CODE_FIXES_DATA_PATH)
pl.DataFrame(new_code_context_changes).write_parquet(PYTHON_CODE_FIXES_WITH_CONTEXT_DATA_PATH)

In [23]:
# Row count of a saved context-changes parquet file.
# NOTE(review): hard-coded relative path; no visible cell writes a file with
# this exact name (note the "1" before the extension) — confirm it is still wanted.
pl.read_parquet("data/python_vulnerability_fixes_code_context_changes1.parquet").shape[0]

3029

In [10]:
# Convert the collected change records to DataFrames and write them to
# hard-coded parquet paths under data/.
# NOTE(review): `code_context_changes` is not assigned in any visible cell, and
# the recorded run of this cell ended in a NameError. The JSON-collection cell
# writes `new_code_unit_changes` / `new_code_context_changes` to the configured
# paths instead — presumably this cell is obsolete; confirm before relying on it.
code_unit_changes_data = pl.DataFrame(code_unit_changes)
code_context_changes_data = pl.DataFrame(code_context_changes)

code_unit_changes_data.write_parquet(
    "data/python_vulnerability_fixes_code_unit_changes.parquet"
)
code_context_changes_data.write_parquet(
    "data/python_vulnerability_fixes_code_context_changes.parquet"
)

NameError: name 'code_unit_changes' is not defined

In [30]:
# Scratch cell: show the current values of `repo_name` and `fix_commit`.
# NOTE(review): neither name is assigned in any visible cell — presumably left
# over from an earlier session; this fails on a fresh kernel.
print(repo_name)
print(fix_commit)

kovidgoyal/calibre
3a89718664cb8c


In [26]:
# Scratch cell: parse `patch` with whatthepatch and display the `changes` of
# the first parsed diff.
# NOTE(review): `patch` is not assigned in any visible cell — confirm where it
# is expected to come from.
diffs = whatthepatch.parse_patch(patch)
list(diffs)[0].changes

[Change(old=177, new=177, line='        pass', hunk=1),
 Change(old=178, new=178, line='', hunk=1),
 Change(old=179, new=179, line='    def skip(self, type):', hunk=1),
 Change(old=180, new=None, line='        if type == TType.STOP:', hunk=1),
 Change(old=181, new=None, line='            return', hunk=1),
 Change(old=182, new=None, line='        elif type == TType.BOOL:', hunk=1),
 Change(old=None, new=180, line='        if type == TType.BOOL:', hunk=1),
 Change(old=183, new=181, line='            self.readBool()', hunk=1),
 Change(old=184, new=182, line='        elif type == TType.BYTE:', hunk=1),
 Change(old=185, new=183, line='            self.readByte()', hunk=1),
 Change(old=220, new=218, line='            for _ in range(size):', hunk=2),
 Change(old=221, new=219, line='                self.skip(etype)', hunk=2),
 Change(old=222, new=220, line='            self.readListEnd()', hunk=2),
 Change(old=None, new=221, line='        else:', hunk=2),
 Change(old=None, new=222, line='     

In [14]:
# Scratch cell: open one file of a local checkout with jedi, resolve the name
# at line 328, column 23 (following imports) and display the type of the first
# result. The checkout paths are hard-coded and relative to the working directory.
project = jedi.Project("repos/parisneo/lollms")
script = jedi.Script(
    path="repos/parisneo/lollms/lollms/server/endpoints/lollms_file_system.py", project=project)
name = script.goto(328, 23, follow_imports=True)[0]
name.type

'class'

In [304]:
# Scratch cell: take the jedi context at line 27 of `script` and walk two
# levels up through its parents.
script.get_context(27).parent().parent()

In [260]:
# Scratch cell: module names of everything jedi resolves at line 27, column 32
# of `script`.
[name.module_name for name in script.goto(27, 32)]

['tornado.web']

In [214]:
# Scratch cell: references for the name at line 53 of `script` (the column is
# the length of the given line prefix), keeping only those that are definitions.
[name for name in script.get_references(53, len("            callback = sanitize_")) if name.is_definition()]

[<Name full_name='code.autocomplete.server.sanitize_callback', description='def sanitize_callback'>]

In [37]:
# Scratch cell: rows for one commit hash, de-duplicated on (cve_id, commit),
# first 50 at most.
# NOTE(review): `commit_data_only_top_langs` is not assigned in any visible
# cell — this fails on a fresh kernel.
commit_data_only_top_langs.unique(["cve_id", "commit"]).filter(pl.col("commit") == "63cde2daadc705edf086f2213b48c8c547f98358").head(50)

cve_id,repo,commit,language,reference,year,cwe_id,detection_date,score
str,str,str,str,str,i64,str,str,f64
"""2021-21695""","""jenkinsci/jenkins""","""63cde2daadc705edf086f2213b48c8…","""Java""","""https://github.com/jenkinsci/j…",2021,"""CWE-59""","""2022-05-24T19:19:43.000000""",9.0
"""2021-21690""","""jenkinsci/jenkins""","""63cde2daadc705edf086f2213b48c8…","""Java""","""https://github.com/jenkinsci/j…",2021,"""CWE-22""","""2022-05-24T19:19:44.000000""",9.0
"""2021-21694""","""jenkinsci/jenkins""","""63cde2daadc705edf086f2213b48c8…","""Java""","""https://github.com/jenkinsci/j…",2021,"""CWE-862""","""2022-05-24T19:19:44.000000""",9.0
"""2021-21688""","""jenkinsci/jenkins""","""63cde2daadc705edf086f2213b48c8…","""Java""","""https://github.com/jenkinsci/j…",2021,"""CWE-862""","""2022-05-24T19:19:44.000000""",9.0
"""2021-21692""","""jenkinsci/jenkins""","""63cde2daadc705edf086f2213b48c8…","""Java""","""https://github.com/jenkinsci/j…",2021,"""CWE-22""","""2022-05-24T19:19:44.000000""",9.0
"""2021-21686""","""jenkinsci/jenkins""","""63cde2daadc705edf086f2213b48c8…","""Java""","""https://github.com/jenkinsci/j…",2021,"""CWE-22""","""2022-05-24T19:19:45.000000""",9.0
"""2021-21693""","""jenkinsci/jenkins""","""63cde2daadc705edf086f2213b48c8…","""Java""","""https://github.com/jenkinsci/j…",2021,"""CWE-285""","""2022-05-24T19:19:44.000000""",9.0
"""2021-21691""","""jenkinsci/jenkins""","""63cde2daadc705edf086f2213b48c8…","""Java""","""https://github.com/jenkinsci/j…",2021,"""CWE-59""","""2022-05-24T19:19:44.000000""",9.0
"""2021-21689""","""jenkinsci/jenkins""","""63cde2daadc705edf086f2213b48c8…","""Java""","""https://github.com/jenkinsci/j…",2021,"""CWE-862""","""2022-05-24T19:19:44.000000""",9.0
"""2021-21685""","""jenkinsci/jenkins""","""63cde2daadc705edf086f2213b48c8…","""Java""","""https://github.com/jenkinsci/j…",2021,"""CWE-862""","""2022-05-24T19:19:44.000000""",9.0


In [258]:
# Scratch cell: open a file from another local checkout and resolve the name at
# line 102, column 36.
# NOTE(review): this reuses `project`; the only visible assignment builds it for
# repos/parisneo/lollms, not for the rtxteam/rtx checkout referenced here —
# confirm which project was intended.
another_script = jedi.Script(path="repos/rtxteam/rtx/code/kg2c/create_indexes_constraints_canonicalized.py", project=project)
another_script.goto(102,36)

[]