In [None]:
import os
import json
import tree_sitter_java as tsjava
from tree_sitter import Language, Parser, Tree, Node
from unixcoder import UniXcoder
import torch

# Helper function to extract code fragments and metadata for each level
def extract_code_fragments(node, code, level: str):
    """Collect the fragments of one granularity from the subtree rooted at ``node``.

    The subtree is visited in pre-order (a node before its children, children
    left to right) using an explicit stack instead of recursion, so a deeply
    nested syntax tree cannot exhaust Python's recursion limit.

    Note: a later cell in this notebook redefines this function without the
    ``level`` parameter.

    Args:
        node: Root of the subtree to scan. Must expose ``type``, ``children``,
            ``start_byte`` and ``end_byte``.
        code: The source as ``bytes``; node byte offsets index into it.
        level: ``"class"`` selects ``class_declaration`` nodes, ``"method"``
            selects ``method_declaration`` nodes and ``"token"`` selects
            childless nodes that are not ``block_comment``/``line_comment``.
            Any other value selects nothing.

    Returns:
        A list of dicts with keys ``name``, ``code``, ``start_byte`` and
        ``end_byte``, in pre-order. ``name`` comes from ``get_node_name`` and
        may be ``None`` for classes/methods; tokens fall back to
        ``token_<start_byte>``.
    """
    declaration_types = {"class": "class_declaration", "method": "method_declaration"}
    comment_types = ("block_comment", "line_comment")

    fragments = []
    pending = [node]
    while pending:
        current = pending.pop()
        children = current.children

        if level == "token":
            selected = len(children) == 0 and current.type not in comment_types
        else:
            # .get() yields None for an unknown level, which never equals a node type.
            selected = current.type == declaration_types.get(level)

        if selected:
            name = get_node_name(current, code)
            if level == "token":
                name = name or f"token_{current.start_byte}"
            fragments.append({
                "name": name,
                "code": code[current.start_byte:current.end_byte].decode('utf8'),
                "start_byte": current.start_byte,
                "end_byte": current.end_byte
            })

        # Push children right-to-left so the leftmost child is popped first,
        # which keeps the output in the same pre-order as a recursive walk.
        pending.extend(reversed(children))

    return fragments

# Helper function to get the name of a node, if applicable
def get_node_name(node, code):
    """Return the text of the first direct ``identifier`` child of ``node``.

    Args:
        node: Syntax-tree node whose ``children`` are inspected (direct
            children only; grandchildren are not searched).
        code: The source as ``bytes``; child byte offsets index into it.

    Returns:
        The UTF-8 decoded identifier text, or ``None`` when ``node`` has no
        direct child of type ``identifier``.
    """
    identifier = next(
        (candidate for candidate in node.children if candidate.type == "identifier"),
        None,
    )
    if identifier is None:
        return None
    return code[identifier.start_byte:identifier.end_byte].decode('utf8')

# Function to generate embeddings with UniXcoder
def generate_embedding(model, device, code):
    """Embed one code snippet with the given model.

    Args:
        model: Object providing ``tokenize(list_of_str)`` and a call that
            returns a 2-tuple whose second element is the embedding tensor.
        device: Torch device the token ids are moved to before the call.
        code: Source text of the snippet.

    Returns:
        The embedding tensor converted to nested Python lists via
        ``tolist()`` (presumably one vector per input snippet; confirm
        against the UniXcoder implementation).
    """
    batch_of_ids = model.tokenize([code])
    id_tensor = torch.tensor(batch_of_ids)
    id_tensor = id_tensor.to(device)

    # Inference only: gradient tracking is disabled for the forward pass.
    with torch.no_grad():
        _unused_token_output, snippet_embedding = model(id_tensor)

    return snippet_embedding.tolist()

# Parse a single Java file and extract named fragments by level with embeddings
def parse_java_file(file_path, level, model, device):
    """Parse one Java file and embed every fragment of the requested level.

    Note: a later cell in this notebook redefines this function without the
    ``level`` parameter.

    Args:
        file_path: Path of the Java source file; it is read as raw bytes.
        level: Granularity forwarded to ``extract_code_fragments``.
        model: Embedding model forwarded to ``generate_embedding``.
        device: Torch device forwarded to ``generate_embedding``.

    Returns:
        A list of dicts with keys ``name``, ``embedding``, ``level`` and
        ``position`` (a dict holding ``start_byte`` and ``end_byte``), in the
        order the fragments were extracted.
    """
    with open(file_path, 'rb') as source_file:
        source_bytes = source_file.read()

    # A fresh Java parser is built for every file.
    java_parser = Parser(Language(tsjava.language()))
    syntax_tree = java_parser.parse(source_bytes)

    return [
        {
            "name": piece["name"],
            "embedding": generate_embedding(model, device, piece["code"]),
            "level": level,
            "position": {"start_byte": piece["start_byte"], "end_byte": piece["end_byte"]}
        }
        for piece in extract_code_fragments(syntax_tree.root_node, source_bytes, level)
    ]

# Extract and save fragments with embeddings to organized directory structure
def extract_and_save(directory, levels=("class", "method", "token")):
    """Embed every ``.java`` file under ``directory`` and write one JSON file per fragment.

    Output layout, per Java file ``Foo.java`` and per level (e.g. ``class``):

    * file directly in ``directory``: ``<directory>/Foo/FooClass/<name>.json``
    * file in a sub-directory ``sub``: ``<parent of sub>/subClass/Foo/<name>.json``

    Each JSON file holds the fragment's ``embedding``, ``level`` and
    ``position``. Within one output directory the first fragment with a given
    name keeps ``<name>.json``; later fragments with the same name (for
    example overloaded methods) are written as ``<name>_<start_byte>.json``
    instead of overwriting it. Fragments without a name are written as
    ``<level>_<start_byte>.json``.

    Note: a later cell in this notebook redefines this function without the
    ``levels`` parameter.

    Args:
        directory: Root of the source tree to walk.
        levels: Iterable of granularities to extract; each one triggers a
            separate parse-and-embed pass over every file.
    """
    # Initialize UniXcoder model and device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = UniXcoder("microsoft/unixcoder-base")
    model.to(device)

    # Loop-invariant: resolved once instead of once per visited directory.
    top_level_dir = os.path.abspath(directory)

    for root, dirs, files in os.walk(directory):
        java_files = [file for file in files if file.endswith(".java")]
        if not java_files:
            continue

        base_name = os.path.basename(root)
        is_in_ivy_root = os.path.abspath(root) == top_level_dir

        for java_file in java_files:
            file_name = os.path.splitext(java_file)[0]
            java_file_path = os.path.join(root, java_file)

            for level in levels:
                if is_in_ivy_root:
                    # Files directly under the root get `<file>/<file>Class`, etc.
                    level_dir = os.path.join(root, file_name, f"{file_name}{level.capitalize()}")
                else:
                    # Files in a sub-directory get `<sub>Class/<file>`, etc., next to it.
                    parent_dir = os.path.dirname(root)
                    level_dir = os.path.join(parent_dir, f"{base_name}{level.capitalize()}", file_name)

                os.makedirs(level_dir, exist_ok=True)
                print(f"Parsing {java_file_path} for level {level}")

                embedded_fragments = parse_java_file(java_file_path, level, model, device)

                # File stems already used in this directory during this pass,
                # so same-named fragments do not clobber each other.
                used_stems = set()
                for fragment in embedded_fragments:
                    start_byte = fragment["position"]["start_byte"]
                    stem = fragment["name"] or f"{level}_{start_byte}"
                    if stem in used_stems:
                        stem = f"{stem}_{start_byte}"
                    used_stems.add(stem)

                    fragment_path = os.path.join(level_dir, f"{stem}.json")
                    with open(fragment_path, 'w') as json_file:
                        json.dump({
                            "embedding": fragment["embedding"],
                            "level": fragment["level"],
                            "position": fragment["position"]
                        }, json_file, indent=2)

In [None]:
# Usage example: run the extraction over the Ivy source tree.
# The path below is machine-specific; adjust it to your checkout.
extract_and_save(directory="/scratch/zzhang30/cs412/ant-ivy/src/java/org/apache/ivy")

In [19]:
import os
import json
import tree_sitter_java as tsjava
from tree_sitter import Language, Parser, Tree, Node
from unixcoder import UniXcoder
import torch

# Helper function to extract all code fragments and metadata in a single pass
def extract_code_fragments(node, code):
    """Collect class, method and token fragments from the subtree rooted at ``node``.

    The subtree is visited once, in pre-order (a node before its children,
    children left to right), using an explicit stack instead of recursion so
    a deeply nested syntax tree cannot exhaust Python's recursion limit.

    Classification of each node, first match wins:

    * ``class_declaration``  -> level ``"class"``
    * ``method_declaration`` -> level ``"method"``
    * childless node that is not ``block_comment``/``line_comment``
      -> level ``"token"``

    Args:
        node: Root of the subtree to scan. Must expose ``type``, ``children``,
            ``start_byte`` and ``end_byte``.
        code: The source as ``bytes``; node byte offsets index into it.

    Returns:
        A list of dicts with keys ``name``, ``code``, ``level``,
        ``start_byte`` and ``end_byte``, in pre-order. ``name`` comes from
        ``get_node_name`` and may be ``None`` for classes/methods; tokens
        fall back to ``token_<start_byte>``.
    """
    level_by_type = {"class_declaration": "class", "method_declaration": "method"}
    comment_types = ("block_comment", "line_comment")

    fragments = []
    pending = [node]
    while pending:
        current = pending.pop()
        children = current.children

        level = level_by_type.get(current.type)
        if level is None and len(children) == 0 and current.type not in comment_types:
            level = "token"

        if level is not None:
            name = get_node_name(current, code)
            if level == "token":
                name = name or f"token_{current.start_byte}"
            fragments.append({
                "name": name,
                "code": code[current.start_byte:current.end_byte].decode('utf8'),
                "level": level,
                "start_byte": current.start_byte,
                "end_byte": current.end_byte
            })

        # Push children right-to-left so the leftmost child is popped first,
        # which keeps the output in the same pre-order as a recursive walk.
        pending.extend(reversed(children))

    return fragments

# Helper function to get the name of a node, if applicable
def get_node_name(node, code):
    """Look up the name of ``node`` from its first direct ``identifier`` child.

    Args:
        node: Syntax-tree node; only its direct ``children`` are examined.
        code: The source as ``bytes``; child byte offsets index into it.

    Returns:
        The identifier text decoded as UTF-8, or ``None`` if no direct child
        has type ``identifier``.
    """
    identifier_children = [kid for kid in node.children if kid.type == "identifier"]
    if not identifier_children:
        return None

    first = identifier_children[0]
    raw_name = code[first.start_byte:first.end_byte]
    return raw_name.decode('utf8')

# Function to generate embeddings with UniXcoder
def generate_embedding(model, device, code):
    """Compute the embedding of a single code snippet.

    Args:
        model: Object providing ``tokenize(list_of_str)`` and a call that
            returns a 2-tuple whose second element is the embedding tensor.
        device: Torch device on which the forward pass runs.
        code: Source text of the snippet.

    Returns:
        The embedding tensor as nested Python lists (``tolist()``), which
        makes it JSON-serialisable.
    """
    encoded = model.tokenize([code])
    model_input = torch.tensor(encoded).to(device)

    # No gradients are needed for inference.
    with torch.no_grad():
        outputs = model(model_input)
    _ignored, embedding = outputs

    return embedding.tolist()

# Parse a single Java file and extract all fragments with embeddings
def parse_java_file(file_path, model, device):
    """Parse one Java file and embed all of its class, method and token fragments.

    Args:
        file_path: Path of the Java source file; it is read as raw bytes.
        model: Embedding model forwarded to ``generate_embedding``.
        device: Torch device forwarded to ``generate_embedding``.

    Returns:
        A list of dicts with keys ``name``, ``embedding``, ``level`` and
        ``position`` (a dict holding ``start_byte`` and ``end_byte``), in the
        order the fragments were extracted.
    """
    with open(file_path, 'rb') as source_file:
        source_bytes = source_file.read()

    # A fresh Java parser is built for every file.
    java_parser = Parser(Language(tsjava.language()))
    syntax_tree = java_parser.parse(source_bytes)

    results = []
    for piece in extract_code_fragments(syntax_tree.root_node, source_bytes):
        vector = generate_embedding(model, device, piece["code"])
        span = {"start_byte": piece["start_byte"], "end_byte": piece["end_byte"]}
        results.append({
            "name": piece["name"],
            "embedding": vector,
            "level": piece["level"],
            "position": span
        })
    return results

# Extract and save fragments with embeddings to organized directory structure
def extract_and_save(directory):
    """Embed every ``.java`` file under ``directory`` and write one JSON file per fragment.

    Each file is parsed once; its fragments are then routed by level
    (``class``, ``method``, ``token``). Output layout for ``Foo.java`` and,
    for example, level ``class``:

    * file directly in ``directory``: ``<directory>/FooClass/<name>.json``
    * file in a sub-directory ``sub``: ``<parent of sub>/subClass/Foo/<name>.json``

    Each JSON file holds the fragment's ``embedding``, ``level`` and
    ``position``. Within one output directory the first fragment with a given
    name keeps ``<name>.json``; later fragments with the same name (for
    example overloaded methods) are written as ``<name>_<start_byte>.json``
    instead of overwriting it. Fragments without a name are written as
    ``<level>_<start_byte>.json``.

    Args:
        directory: Root of the source tree to walk.
    """
    # Initialize UniXcoder model and device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = UniXcoder("microsoft/unixcoder-base")
    model.to(device)

    # Loop-invariant: resolved once instead of once per visited directory.
    top_level_dir = os.path.abspath(directory)

    for root, dirs, files in os.walk(directory):
        java_files = [file for file in files if file.endswith(".java")]
        if not java_files:
            continue

        base_name = os.path.basename(root)
        is_in_ivy_root = os.path.abspath(root) == top_level_dir

        for java_file in java_files:
            file_name = os.path.splitext(java_file)[0]
            java_file_path = os.path.join(root, java_file)

            # Parse the file once and extract all fragments with embeddings
            print(f"Parsing {java_file_path}")
            embedded_fragments = parse_java_file(java_file_path, model, device)

            # Per-file caches keyed by level: the output directory (created on
            # first use, so only levels that occur get a directory) and the
            # file stems already written there.
            level_dirs = {}
            used_stems = {}

            for fragment in embedded_fragments:
                level = fragment["level"]

                if level not in level_dirs:
                    if is_in_ivy_root:
                        # Files directly under the root get `<file>Class`, `<file>Method`, etc.
                        level_dir = os.path.join(root, f"{file_name}{level.capitalize()}")
                    else:
                        # Files in a sub-directory get `<sub>Class/<file>`, etc., next to it.
                        parent_dir = os.path.dirname(root)
                        level_dir = os.path.join(parent_dir, f"{base_name}{level.capitalize()}", file_name)
                    os.makedirs(level_dir, exist_ok=True)
                    level_dirs[level] = level_dir
                    used_stems[level] = set()

                start_byte = fragment["position"]["start_byte"]
                stem = fragment["name"] or f"{level}_{start_byte}"
                if stem in used_stems[level]:
                    # Same-named fragment already written: disambiguate by offset.
                    stem = f"{stem}_{start_byte}"
                used_stems[level].add(stem)

                fragment_path = os.path.join(level_dirs[level], f"{stem}.json")

                # Write the fragment's embedding and metadata to a JSON file
                with open(fragment_path, 'w') as json_file:
                    json.dump({
                        "embedding": fragment["embedding"],
                        "level": fragment["level"],
                        "position": fragment["position"]
                    }, json_file, indent=2)



In [20]:
# Run the single-pass extraction over the Ivy source tree.
# The path below is machine-specific; adjust it to your checkout.
extract_and_save(directory="/scratch/zzhang30/cs412/ant-ivy/src/java/org/apache/ivy")

Parsing /scratch/zzhang30/cs412/ant-ivy/src/java/org/apache/ivy/Ivy14.java
Parsing /scratch/zzhang30/cs412/ant-ivy/src/java/org/apache/ivy/Main.java


KeyboardInterrupt: 