In [46]:
import os
import re
import javalang
from git import Repo
from javalang import tokenizer as jtok

In [47]:
# github repo
repo_url = "https://github.com/spring-projects/spring-boot.git"
repo_name = repo_url.split('/')[-1].removesuffix('.git')


In [48]:
repo_name

'spring-boot'

In [None]:
# clone repo
Repo.clone_from(repo_url, repo_name)
print("Cloned")


In [50]:
HEX_HASH_RE = re.compile(r"#([0-9a-fA-F]{3,8})\b") # #RGB, #RRGGBB,
LINE_START_HASH_RE = re.compile(r"(?m)^\s*#.*$") # lines that start with '#'

def remove_hashes(code: str) -> str:
    # remove alone # at start of line
    code = LINE_START_HASH_RE.sub("/* removed-nonjava-hash-line */", code)
    # convert #hex color tokens into comments so lexer ignores them
    code = HEX_HASH_RE.sub(r"/*#\1*/", code)
    return code

In [51]:
def parse(code: str):
  try:
    return javalang.parse.parse(code)
  except jtok.LexerError:
    cleaned = remove_hashes(code)
    return javalang.parse.parse(cleaned)

In [52]:
def _dims_count(dims_attr):
    # javalang may give an int or a list for dimensions want to convert this to an int
    if dims_attr is None:
        return 0
    if isinstance(dims_attr, int):
        return dims_attr
    if isinstance(dims_attr, (list, tuple)):
        return len(dims_attr)
    try:
        return int(dims_attr)
    except Exception:
        return 0

In [53]:
# def _dims_count(dims_attr):
#     if dims_attr is None: return 0
#     if isinstance(dims_attr, int): return dims_attr
#     if isinstance(dims_attr, (list, tuple)): return len(dims_attr)
#     try: return int(dims_attr)
#     except Exception: return 0

def type_to_str(t) -> str:
    """Pretty-print type name"""
    if t is None:
        return ""
    if isinstance(t, javalang.tree.BasicType):
        name = t.name # then will get 'int', 'void'
    elif isinstance(t, javalang.tree.ReferenceType):
        name = ".".join(t.name) if isinstance(t.name, list) else t.name
        args = getattr(t, "arguments", None)
        if args:
            rendered = []
            for a in args:
                if hasattr(a, "type") and a.type is not None:
                    rendered.append(type_to_str(a.type))
                else:
                    rendered.append("?")
            name += "<" + ", ".join(rendered) + ">"
    else:
        name = getattr(t, "name", str(t))
    dims = "[]" * _dims_count(getattr(t, "dimensions", 0))
    return name + dims

def param_to_str(p) -> str:
    """Pretty-print param names"""
    t_str = type_to_str(getattr(p, "type", None))
    varargs = "..." if getattr(p, "varargs", False) else ""
    extra_dims = "[]" * _dims_count(getattr(p, "dimensions", 0))
    return f"{t_str}{varargs}{extra_dims} {p.name}".strip()

def simple_signature(node) -> str:
    """public static int gcd(int a, int b)"""
    mods = " ".join(sorted(getattr(node, "modifiers", []) or [])) # this gets the 'public static'
    ret  = type_to_str(getattr(node, "return_type", None)) or "void" # this get return type, default 'void'
    name = node.name
    params = ", ".join(param_to_str(p) for p in (node.parameters or []))
    # join all together using single spaces
    left = " ".join(x for x in [mods, ret, name] if x)
    return f"{left}({params})"


In [54]:
# multi-char operators first
TOKEN_RE = re.compile(r"""(?:==|!=|<=|>=|&&|\|\||::|->) | [A-Za-z_][A-Za-z0-9_]* | \d+\.\d+|\d+| [{}()\[\],.;<>+\-*/%&|^!?=~]""", re.VERBOSE)

def tokenize_code(s: str):
    return TOKEN_RE.findall(s)

In [55]:
java_files = [
    os.path.join(dp, f) for dp, dn, fn in os.walk(repo_name)
    for f in fn if f.endswith(".java")
]
methods = []
for file_path in java_files:
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            code = f.read()

        tree = parse(code)
        lines = code.splitlines()

        method_declarations = list(tree.filter(javalang.tree.MethodDeclaration))
        # sort methods by starting line number
        method_declarations.sort(key=lambda item: item[1].position.line)

        for i, (path, node) in enumerate(method_declarations):
            print('im here')
            start_line = node.position.line
            print('start line')
            
            # end line = start of next method or end of file
            if i + 1 < len(method_declarations):
                next_method_node = method_declarations[i+1][1]
                end_line = next_method_node.position.line - 1
            else:
                # if last method, find end of class body
                end_line = len(lines)
            print('end line')
            print(node.name)
            sig = simple_signature(node)
            methods.append({
                'file': file_path,
                'name': node.name,
                'start_line': start_line,
                'end_line': end_line,
                'signature': sig,
                "code_tokens": tokenize_code(sig + " {")
            })

    except (javalang.parser.JavaSyntaxError, jtok.LexerError) as e:
        print(f"Could not parse {file_path}: {e}")

im here
start line
end line
getStatusReturnsStatus
im here
start line
end line
getProgressDetailReturnsProgressDetails
im here
start line
end line
getProgressDetailReturnsProgressDetailsForLongNumbers
im here
start line
end line
getProgressReturnsProgress
im here
start line
end line
createEvent
im here
start line
end line
createEvent
im here
start line
end line
test
im here
start line
end line
getIdReturnsId
im here
start line
end line
getErrorReturnsErrorDetail
im here
start line
end line
createEvent
im here
start line
end line
toSystemOutPrintsToSystemOut
im here
start line
end line
toPrintsToOutput
im here
start line
end line
getIdReturnsId
im here
start line
end line
createEvent
im here
start line
end line
totalProgress
im here
start line
end line
totalProgressUpdatesSmoothly
im here
start line
end line
run
im here
start line
end line
onUpdate
im here
start line
end line
setup
im here
start line
end line
http
im here
start line
end line
emptyResponse
im here
start line
end line
res

In [56]:
methods

[{'file': 'spring-boot/buildpack/spring-boot-buildpack-platform/src/test/java/org/springframework/boot/buildpack/platform/docker/ProgressUpdateEventTests.java',
  'name': 'getStatusReturnsStatus',
  'start_line': 36,
  'end_line': 41,
  'signature': 'void getStatusReturnsStatus()',
  'code_tokens': ['void', 'getStatusReturnsStatus', '(', ')', '{']},
 {'file': 'spring-boot/buildpack/spring-boot-buildpack-platform/src/test/java/org/springframework/boot/buildpack/platform/docker/ProgressUpdateEventTests.java',
  'name': 'getProgressDetailReturnsProgressDetails',
  'start_line': 42,
  'end_line': 47,
  'signature': 'void getProgressDetailReturnsProgressDetails()',
  'code_tokens': ['void',
   'getProgressDetailReturnsProgressDetails',
   '(',
   ')',
   '{']},
 {'file': 'spring-boot/buildpack/spring-boot-buildpack-platform/src/test/java/org/springframework/boot/buildpack/platform/docker/ProgressUpdateEventTests.java',
  'name': 'getProgressDetailReturnsProgressDetailsForLongNumbers',
  'st

In [60]:
import csv, random, json

out_file = "methods.csv"
# shuffle methods so random
random.shuffle(methods)
# first 25k = train, reamining = test
train_cutoff = 25000
for i, m in enumerate(methods):
    m["dataset_split"] = "train" if i < train_cutoff else "test"

fieldnames = ["file", "name", "start_line", "end_line", "signature", "code_tokens", "dataset_split"]

with open(out_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    for m in methods:
        row = m.copy()
        row["code_tokens"] = json.dumps(m["code_tokens"])
        writer.writerow(row)


In [61]:
import pandas as pd
df = pd.read_csv('methods.csv')

In [62]:
df

Unnamed: 0,file,name,start_line,end_line,signature,code_tokens,dataset_split
0,spring-boot/smoke-test/spring-boot-smoke-test-...,configure,52,56,protected SpringApplicationBuilder configure(S...,"[""protected"", ""SpringApplicationBuilder"", ""con...",train
1,spring-boot/buildpack/spring-boot-buildpack-pl...,loadJsonFromDistributionManifestList,33,40,void loadJsonFromDistributionManifestList(),"[""void"", ""loadJsonFromDistributionManifestList...",train
2,spring-boot/smoke-test/spring-boot-smoke-test-...,testLegacyDot,165,172,void testLegacyDot(),"[""void"", ""testLegacyDot"", ""("", "")"", ""{""]",train
3,spring-boot/documentation/spring-boot-docs/src...,setHost,41,52,public void setHost(Host host),"[""public"", ""void"", ""setHost"", ""("", ""Host"", ""ho...",train
4,spring-boot/module/spring-boot-security/src/ma...,healthMatcher,78,81,private ServerWebExchangeMatcher healthMatcher(),"[""private"", ""ServerWebExchangeMatcher"", ""healt...",train
...,...,...,...,...,...,...,...
34298,spring-boot/documentation/spring-boot-docs/src...,setCheckLocation,41,44,public void setCheckLocation(boolean checkLoca...,"[""public"", ""void"", ""setCheckLocation"", ""("", ""b...",test
34299,spring-boot/core/spring-boot-autoconfigure/src...,expressionIsTrue,45,50,void expressionIsTrue(),"[""void"", ""expressionIsTrue"", ""("", "")"", ""{""]",test
34300,spring-boot/module/spring-boot-servlet/src/tes...,dispatcherServlet,251,260,DispatcherServlet dispatcherServlet(),"[""DispatcherServlet"", ""dispatcherServlet"", ""(""...",test
34301,spring-boot/module/spring-boot-actuator-autoco...,runWhenEnabledPropertyIsFalseShouldNotHaveEndp...,60,65,void runWhenEnabledPropertyIsFalseShouldNotHav...,"[""void"", ""runWhenEnabledPropertyIsFalseShouldN...",test
