<!-- ## Data Engineering -->

In [1]:
from typing import Generator
import tree_sitter_java as tsjava
from tree_sitter import Language, Parser, Tree, Node
from unixcoder import UniXcoder
import torch

In [2]:
# Build a tree-sitter parser bound to the Java grammar.
java_language = Language(tsjava.language())
parser = Parser(java_language)

In [3]:
# Download from https://github.com/apache/ant-ivy/blob/master/src/java/org/apache/ivy/Ivy.java
# Decode explicitly as UTF-8: the text is re-encoded as UTF-8 before parsing,
# so relying on the platform's default encoding could corrupt non-ASCII characters.
with open('Ivy.java', encoding='utf-8') as f:
    source_code = f.read()

In [4]:
# Parse the Java source; the parser is fed the UTF-8 encoded bytes of the text.
tree = parser.parse(source_code.encode('utf8'))
tree

<tree_sitter.Tree at 0x75e152b5bfc0>

In [5]:
def traverse_tree(tree: Tree) -> Generator[Node, None, None]:
    """Yield every node reachable from ``tree.walk()`` in pre-order.

    The walk is iterative and driven by a single cursor: a node is yielded
    when the cursor first lands on it, then its children are visited from
    first to last before moving on to its next sibling.

    Parameters:
        tree: object whose ``walk()`` returns a cursor exposing ``node``,
            ``goto_first_child()``, ``goto_next_sibling()`` and ``goto_parent()``.

    Yields:
        The cursor's current node, once per node.
    """
    cursor = tree.walk()

    while True:
        yield cursor.node

        # Descend as long as there is a child to visit.
        if cursor.goto_first_child():
            continue

        # Leaf reached: climb until some ancestor (or the leaf itself) has an
        # unvisited sibling; running out of parents means the walk is finished.
        while not cursor.goto_next_sibling():
            if not cursor.goto_parent():
                return

In [6]:
# See unique node types in tree
x = [node.type for node in traverse_tree(tree)]
set(x)

{'!',
 '!=',
 '"',
 '&&',
 '(',
 ')',
 '+',
 ',',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '==',
 '>',
 '@',
 '[',
 ']',
 'annotation',
 'annotation_argument_list',
 'argument_list',
 'array_type',
 'assignment_expression',
 'binary_expression',
 'block',
 'block_comment',
 'boolean_type',
 'break',
 'break_statement',
 'case',
 'cast_expression',
 'catch',
 'catch_clause',
 'catch_formal_parameter',
 'catch_type',
 'class',
 'class_body',
 'class_declaration',
 'class_literal',
 'constructor_body',
 'constructor_declaration',
 'decimal_integer_literal',
 'default',
 'dimensions',
 'else',
 'enhanced_for_statement',
 'expression_statement',
 'false',
 'field_access',
 'field_declaration',
 'final',
 'finally',
 'finally_clause',
 'for',
 'formal_parameter',
 'formal_parameters',
 'generic_type',
 'identifier',
 'if',
 'if_statement',
 'import',
 'import_declaration',
 'instanceof',
 'instanceof_expression',
 'int',
 'integral_type',
 'interface',
 'interface_body',
 'interface_declaration'

In [7]:
# Count the package declarations in the file and print the source text of each.
# start_byte/end_byte are offsets into the UTF-8 buffer that was parsed, not
# character positions in the `str`, so slice the encoded bytes and decode the
# fragment (slicing `source_code` directly drifts after any non-ASCII character).
source_bytes = source_code.encode('utf8')
i = 0
for node in traverse_tree(tree):
    if node.type == 'package_declaration':
        print('i =', i)
        print(source_bytes[node.start_byte:node.end_byte].decode('utf8'))
        i += 1
i

i = 0
package org.apache.ivy;


1

In [8]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained UniXcoder checkpoint and move it to the chosen device.
model = UniXcoder("microsoft/unixcoder-base")
model.to(device)



UniXcoder(
  (model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(51416, 768, padding_idx=1)
      (position_embeddings): Embedding(1026, 768, padding_idx=1)
      (token_type_embeddings): Embedding(10, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [13]:
# Collect class-, method- and token-level code embeddings for a single Java file.

# embedding ID -> (embedding as nested list, level, (start_byte, end_byte))
results = {}
embedding_id = 0  # Each embedding gets its own ID (named so the builtin `id` is not shadowed)

# tree-sitter reports node positions as byte offsets into the parsed UTF-8 buffer,
# so code fragments are cut from the encoded bytes rather than from the `str`.
source_bytes = source_code.encode('utf8')

# Node types that map directly to a level; any other node counts only if it is a leaf.
level_by_node_type = {'class_declaration': 'class', 'method_declaration': 'method'}
comment_node_types = ('block_comment', 'line_comment')

# Run through every node in the parse tree
parser = Parser(Language(tsjava.language()))
tree = parser.parse(source_bytes)
for node in traverse_tree(tree):
    level = level_by_node_type.get(node.type)
    if level is None:
        # Only leaf nodes that are not comments are kept as tokens; skip everything else.
        if len(node.children) != 0 or node.type in comment_node_types:
            continue
        level = 'token'

    # Get code embedding
    indices = (node.start_byte, node.end_byte)  # Byte offsets of the fragment in the UTF-8 source
    code = source_bytes[indices[0]:indices[1]].decode('utf8')
    with torch.no_grad():
        # Kept from the original; presumably meant to limit GPU memory growth - TODO confirm it is needed.
        torch.cuda.empty_cache()
        # UniXcoder.tokenize expects a LIST of strings; a bare string is iterated
        # character by character and would yield one embedding per character.
        # NOTE(review): assumes the local `unixcoder` module matches the upstream UniXcoder API - confirm.
        tokens_ids = model.tokenize([code])
        source_ids = torch.tensor(tokens_ids).to(device)
        _, code_embedding = model(source_ids)
        torch.cuda.empty_cache()

    # Presumably shape (1, hidden) for the single fragment; stored as a nested list so it is JSON-serializable.
    results[embedding_id] = (code_embedding.tolist(), level, indices)  # Add code embedding and related information
    embedding_id += 1  # ID for next embedding

In [15]:
import json

# Write the collected embeddings to disk; the context manager guarantees the
# file is flushed and closed even if serialization fails part-way.
with open("results.json", 'w') as f:
    json.dump(results, f)

---

## Get all files from GitHub

### Git trees

https://api.github.com/repos/[USER]/[REPO]/git/trees/[BRANCH]?recursive=1

https://api.github.com/repos/apache/ant-ivy/git/trees/5a36a0dbcb15a68aad7e178ea2792bc4bc4ed943?recursive=1

SHA value = 5a36a0dbcb15a68aad7e178ea2792bc4bc4ed943

### PyGithub

In [21]:
from github import Github
from github import Auth
import requests

In [None]:
import os
from getpass import getpass

# Never hardcode the token in the notebook: read a fine-grained GitHub token
# from the GITHUB_TOKEN environment variable, or prompt for it (input hidden).
token = os.environ.get('GITHUB_TOKEN') or getpass('GitHub token: ')
auth = Auth.Token(token)
g = Github(auth=auth)

In [23]:
# Handle to the apache/ant-ivy repository, fetched through the authenticated client.
repo = g.get_repo('apache/ant-ivy')
repo

Repository(full_name="apache/ant-ivy")

In [24]:
# List the package directory and keep its first entry.
ivy_dir_listing = repo.get_contents('src/java/org/apache/ivy')
contents = ivy_dir_listing[0]
contents

ContentFile(path="src/java/org/apache/ivy/Ivy.java")

In [25]:
# Repository-relative path of the entry selected above.
contents.path

'src/java/org/apache/ivy/Ivy.java'

In [38]:
# Download the raw file. A timeout keeps the cell from hanging indefinitely, and
# raise_for_status() fails loudly instead of printing an HTTP error page as if it were source.
response = requests.get(contents.download_url, timeout=30)
response.raise_for_status()
print(response.text)

/*
 *  Licensed to the Apache Software Foundation (ASF) under one or more
 *  contributor license agreements.  See the NOTICE file distributed with
 *  this work for additional information regarding copyright ownership.
 *  The ASF licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      https://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *
 */
package org.apache.ivy;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDat

In [27]:
# Get the download urls of every java file in a subdirectory
user = 'apache'
repo = 'ant-ivy'
sub_dir = 'src/java/org/apache/ivy'
# `repo` is rebound from the repository name to the Repository handle.
repo = g.get_repo(full_name_or_id=f'{user}/{repo}')

def traverse_github(urls:list, dir:str):
    """Recursively collect the download URLs of all ``.java`` files under ``dir``.

    Uses the module-level ``repo`` handle to list contents.

    Parameters:
        urls: list that is extended in place with one download URL per Java file.
        dir: repository-relative path of the directory to scan.

    Returns:
        None; results are appended to ``urls``.
    """
    for content in repo.get_contents(dir):
        # Decide by the entry's reported type rather than by a dot in the name:
        # extension-less files (e.g. LICENSE) are not folders, and folder names
        # may legitimately contain dots.
        if content.type == 'dir':  # Recursive case: Content is folder
            traverse_github(urls, content.path)
        elif content.type == 'file' and content.name.endswith('.java'):
            # Base case: only files with a .java extension are collected
            urls.append(content.download_url)

# Accumulator that traverse_github fills in place, starting from sub_dir.
urls = []
traverse_github(urls, sub_dir)

In [29]:
# Give every download URL a numeric index (0, 1, 2, ...) in collection order.
indexed_urls = dict(enumerate(urls))
indexed_urls

{0: 'https://raw.githubusercontent.com/apache/ant-ivy/master/src/java/org/apache/ivy/Ivy.java',
 1: 'https://raw.githubusercontent.com/apache/ant-ivy/master/src/java/org/apache/ivy/Ivy14.java',
 2: 'https://raw.githubusercontent.com/apache/ant-ivy/master/src/java/org/apache/ivy/Main.java',
 3: 'https://raw.githubusercontent.com/apache/ant-ivy/master/src/java/org/apache/ivy/ant/AddPathTask.java',
 4: 'https://raw.githubusercontent.com/apache/ant-ivy/master/src/java/org/apache/ivy/ant/AntBuildTrigger.java',
 5: 'https://raw.githubusercontent.com/apache/ant-ivy/master/src/java/org/apache/ivy/ant/AntCallTrigger.java',
 6: 'https://raw.githubusercontent.com/apache/ant-ivy/master/src/java/org/apache/ivy/ant/AntMessageLogger.java',
 7: 'https://raw.githubusercontent.com/apache/ant-ivy/master/src/java/org/apache/ivy/ant/AntWorkspaceResolver.java',
 8: 'https://raw.githubusercontent.com/apache/ant-ivy/master/src/java/org/apache/ivy/ant/BuildOBRTask.java',
 9: 'https://raw.githubusercontent.com/

In [30]:
# Number of Java file URLs that were collected.
len(indexed_urls)

488

In [None]:
import json
import os

# Persist the index -> URL mapping that the grouping section reloads from
# "download_urls.json", so the notebook can run top to bottom on a fresh kernel.
# An existing file is left untouched; delete it to refresh the listing.
if not os.path.exists("download_urls.json"):
    with open("download_urls.json", 'w') as f:
        json.dump(indexed_urls, f)

---

## Group Java files by package

In [40]:
import requests
import re

In [32]:
# Read data from file
import json

# Context manager closes the file handle as soon as the JSON has been parsed.
with open("download_urls.json") as f:
    files = json.load(f)

In [91]:
# Group file IDs by the package declaration found in each downloaded file.
# Keys are the matched declaration text (e.g. 'package org.apache.ivy;').
packages = {}
for file_id, url in files.items():  # `file_id` instead of `id` so the builtin is not shadowed
    response = requests.get(url, timeout=30)  # timeout: do not hang on a stalled connection
    response.raise_for_status()  # do not scan an HTTP error page for a package line
    match = re.search(r"package [^\s]*;", response.text)
    if match is None:
        # No package declaration matched; report and skip instead of
        # crashing on `None.group()`.
        print(f"No package declaration found in {url}; skipping")
        continue
    packages.setdefault(match.group(), []).append(file_id)

In [93]:
# Show every package together with the IDs of the files that declare it.
for package_name, file_ids in packages.items():
    print((package_name, file_ids))

('package org.apache.ivy;', ['0', '1', '2'])
('package org.apache.ivy.ant;', ['3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52'])
('package org.apache.ivy.core;', ['53', '54', '55', '56', '57', '58', '59'])
('package org.apache.ivy.core.cache;', ['60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73'])
('package org.apache.ivy.core.check;', ['74', '75'])
('package org.apache.ivy.core.deliver;', ['76', '77', '78', '79', '80'])
('package org.apache.ivy.core.event;', ['81', '82', '83', '84', '85'])
('package org.apache.ivy.core.event.download;', ['86', '87', '88', '89', '90'])
('package org.apache.ivy.core.event.publish;', ['91', '92', '93'])
('package org.apache.ivy.core.event.resolve;', ['94', '95', '96', '97', '98', '99'