In [None]:
from tree_sitter import Language, Parser
import re
from io import StringIO
import tokenize
import os
import jsonlines
import csv
import re
from typing import List

# Tokenizer pattern for documentation text. The first alternative matches a
# "word" (a run of characters that are neither whitespace nor one of the
# listed punctuation characters); the remaining alternatives match runs of a
# single punctuation character, plus the empty pairs `()`, `{}` and `[]`.
# Characters that no alternative matches (e.g. commas and quotes) are dropped.
DOCSTRING_REGEX_TOKENIZER = re.compile(r"[^\s,'\"`.():\[\]=*;>{\}+-/\\]+|\\+|\.+|\(\)|{\}|\[\]|\(+|\)+|:+|\[+|\]+|{+|\}+|=+|\*+|;+|>+|\++|-+|/+")


def tokenize_docstring(docstring: str) -> List[str]:
    """Split ``docstring`` into word and punctuation tokens.

    Args:
        docstring: The documentation text to tokenize.

    Returns:
        The non-empty matches of ``DOCSTRING_REGEX_TOKENIZER`` in order of
        appearance.
    """
    pieces = []
    for found in DOCSTRING_REGEX_TOKENIZER.finditer(docstring):
        piece = found.group(0)
        if piece:
            pieces.append(piece)
    return pieces


# Compile the grammar given by the path 'tree-sitter-r' into a shared library
# stored in the `build` directory.
# NOTE(review): presumably 'tree-sitter-r' is a local checkout of the R
# grammar relative to the working directory — confirm before running.
Language.build_library('build/my-languages.so', ['tree-sitter-r'])

# Handle to the compiled R grammar. The parser is created without a language;
# get_type / get_identifiers assign one through parser.set_language.
r = Language('build/my-languages.so', 'r')
parser = Parser()
def remove_comments_and_docstrings(source, lang):
    """Return ``source`` with comments (and, for Python, docstrings) removed.

    Behaviour depends on ``lang``:

    * ``lang in ['python']``: the text is rebuilt from ``tokenize`` tokens,
      leaving out comment tokens and string tokens that look like docstrings.
    * ``lang in ['ruby']``: ``source`` is returned unchanged.
    * anything else: ``//`` line comments and ``/* ... */`` block comments
      are each replaced by a single space; quoted strings are kept as-is.

    In the Python and the fallback branch, lines that are blank after the
    removal are dropped from the result.
    """
    if lang in ['python']:
        pieces = []
        previous_type = tokenize.INDENT
        previous_row = -1
        previous_col = 0
        for tok in tokenize.generate_tokens(StringIO(source).readline):
            kind, text = tok[0], tok[1]
            row, col = tok[2]
            end_row, end_col = tok[3]
            # A new line restarts column tracking; any gap before the token
            # is re-created with spaces so the layout is kept.
            if row > previous_row:
                previous_col = 0
            if col > previous_col:
                pieces.append(" " * (col - previous_col))
            if kind == tokenize.STRING:
                # A string directly after an INDENT or NEWLINE token, or one
                # starting in column 0, is treated as a docstring and dropped.
                keep = (previous_type != tokenize.INDENT
                        and previous_type != tokenize.NEWLINE
                        and col > 0)
            else:
                keep = kind != tokenize.COMMENT
            if keep:
                pieces.append(text)
            previous_type = kind
            previous_col = end_col
            previous_row = end_row
        kept_lines = [line for line in "".join(pieces).split('\n') if line.strip()]
        return '\n'.join(kept_lines)

    if lang in ['ruby']:
        return source

    # The pattern matches comments AND quoted strings, so comment markers
    # inside a string literal are consumed as part of the string and survive.
    comment_or_string = re.compile(
        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE
    )

    def blank_out_comment(match):
        matched = match.group(0)
        # A match starting with '/' is a comment: substitute a space (not an
        # empty string). Anything else is a string literal: keep it.
        return " " if matched.startswith('/') else matched

    stripped = comment_or_string.sub(blank_out_comment, source)
    return '\n'.join(line for line in stripped.split('\n') if line.strip())
    
def tree_to_token_index(root_node):
    """Collect the leaf tokens below ``root_node`` in source order.

    A node counts as a token when it has no children or when its type is one
    of 'string_literal', 'string' or 'character_literal' (such nodes are not
    descended into).

    Returns:
        A list of ``(start_point, end_point, type)`` tuples, one per token.
    """
    atomic_types = ('string_literal', 'string', 'character_literal')
    found = []
    pending = [root_node]
    while pending:
        node = pending.pop()
        if not node.children or node.type in atomic_types:
            found.append((node.start_point, node.end_point, node.type))
        else:
            # Push children right-to-left so the leftmost one is visited next.
            pending.extend(reversed(node.children))
    return found


def tree_to_variable_index(root_node, index_to_code):
    """Collect the spans of tokens whose text differs from their node type.

    Args:
        root_node: Node to walk; it must expose ``children``, ``type``,
            ``start_point`` and ``end_point``.
        index_to_code: Mapping from ``(start_point, end_point)`` to a pair
            whose second element is the token's source text.

    Returns:
        A list of ``(start_point, end_point)`` tuples. Tokens whose text
        equals their node type, and comment nodes, contribute nothing.
    """
    node_type = root_node.type
    is_token = (not root_node.children
                or node_type in ('string_literal', 'string', 'character_literal'))
    if is_token and node_type != 'comment':
        span = (root_node.start_point, root_node.end_point)
        _, text = index_to_code[span]
        return [span] if node_type != text else []

    spans = []
    for child in root_node.children:
        spans.extend(tree_to_variable_index(child, index_to_code))
    return spans

def index_to_code_token(index, code):
    """Return the source text covered by a token span.

    Args:
        index: Sequence whose first two items are the ``(row, column)`` start
            and end points of the token; further items are ignored.
        code: The source as a list of lines.

    Returns:
        The text between the two points. For a span covering several lines
        the pieces are concatenated directly.
        NOTE(review): no separator is put back between the lines of a
        multi-line token — confirm that losing the line breaks is intended.
    """
    first_row, first_col = index[0][0], index[0][1]
    last_row, last_col = index[1][0], index[1][1]
    if first_row == last_row:
        return code[first_row][first_col:last_col]

    head = code[first_row][first_col:]
    middle = "".join(code[row] for row in range(first_row + 1, last_row))
    tail = code[last_row][:last_col]
    return head + middle + tail

def get_type(parser, code, lang):
    """Parse ``code`` and map every token span to its source text.

    Args:
        parser: Parser object; its language is set to ``lang`` before parsing.
        code: Source text to parse.
        lang: Language object handed to ``parser.set_language``.

    Returns:
        A dict keyed by the ``(start_point, end_point, type)`` tuples produced
        by ``tree_to_token_index``, with the token text as value.
        NOTE(review): the points are used to slice ``str`` lines; if the
        parser reports byte-based columns, non-ASCII source could be sliced at
        the wrong offsets — confirm.
    """
    parser.set_language(lang)
    syntax_tree = parser.parse(bytes(code, 'utf8'))
    spans = tree_to_token_index(syntax_tree.root_node)
    source_lines = code.split('\n')
    return {span: index_to_code_token(span, source_lines) for span in spans}

def get_identifiers(parser, code, lang):
    """Parse ``code`` (after comment removal) and return its identifier tokens.

    Args:
        parser: Parser object; its language is set to ``lang`` before parsing.
        code: Source text to parse.
        lang: Passed to ``parser.set_language`` and also to
            ``remove_comments_and_docstrings``.
            NOTE(review): the latter compares ``lang`` against language-name
            strings such as 'python' — confirm which kind of value callers
            are expected to pass here.

    Returns:
        A dict keyed by ``(start_point, end_point, 'identifier')`` tuples with
        the identifier text as value. Positions refer to the cleaned source,
        not to the original ``code``.
    """
    parser.set_language(lang)
    cleaned = remove_comments_and_docstrings(code, lang)
    syntax_tree = parser.parse(bytes(cleaned, 'utf8'))
    spans = tree_to_token_index(syntax_tree.root_node)
    source_lines = cleaned.split('\n')
    texts = [index_to_code_token(span, source_lines) for span in spans]
    return {
        span: text
        for span, text in zip(spans, texts)
        if span[2] == 'identifier'
    }

In [24]:
# (source snippet, tree-sitter language) pairs used by the demo cell below.
# The single snippet is only the header of an R function definition; the body
# opened by `{` is not closed inside the string.
code_set = [
    ("""
    html_extras_for_document <- function(knit_meta, runtime, dependency_resolver,
                                        format_deps = NULL) {
    """, r),
]

In [25]:
import pprint

# For every (snippet, language) pair: echo the snippet, then pretty-print the
# token table returned by get_type, framed by separator lines.
for code, lang in code_set:
    print('*'*100, "Original Code", code, "_"*100, "Parsed Identifiers", sep='\n')
    pprint.pprint(get_type(parser, code, lang))
    print('*'*100)

****************************************************************************************************
Original Code

    html_extras_for_document <- function(knit_meta, runtime, dependency_resolver,
                                        format_deps = NULL) {
    
____________________________________________________________________________________________________
Parsed Identifiers
{((1, 4), (1, 28), 'identifier'): 'html_extras_for_document',
 ((1, 29), (1, 31), '<-'): '<-',
 ((1, 32), (1, 40), 'function'): 'function',
 ((1, 40), (1, 41), '('): '(',
 ((1, 41), (1, 50), 'identifier'): 'knit_meta',
 ((1, 50), (1, 51), ','): ',',
 ((1, 52), (1, 59), 'identifier'): 'runtime',
 ((1, 59), (1, 60), ','): ',',
 ((1, 61), (1, 80), 'identifier'): 'dependency_resolver',
 ((1, 80), (1, 81), ','): ',',
 ((2, 40), (2, 51), 'identifier'): 'format_deps',
 ((2, 52), (2, 53), '='): '=',
 ((2, 54), (2, 58), 'null'): 'NULL',
 ((2, 58), (2, 59), ')'): ')',
 ((2, 60), (2, 61), '{'): '{',
 ((2, 61), (2, 61),

In [None]:

# For every repository listed in MetaData.csv: read the .Rd files under
# <repo>/man/, find the R source files and function names they document,
# extract the matching function bodies as token lists into
# <repo>/functions.jsonl, and append the repository row plus the number of
# extracted functions to MetaData_final.csv.
with open("MetaData.csv", "r") as metaData, open("MetaData_final.csv", "a+", newline="") as f1:
    writer1 = csv.writer(f1)
    # NOTE(review): the output file is opened in append mode, so this header
    # row is written again on every run — confirm that is intended.
    writer1.writerow(["id", "repo", "html_url","owner_login", "contents_url", "created_at", "last_commit", "contributor_count","language", "star_count", "forks_count", "tidy", "# of R file", "# of function"])
    reader = csv.reader(metaData)
    next(reader)  # skip the header row of MetaData.csv
    for row in reader:
        file_path = "" + row[1]+ '/'
        rdpath = file_path + 'man/'
        count = 0
        # A repository that already has a functions.jsonl is skipped
        # entirely; no row is written to MetaData_final.csv for it.
        if os.path.exists(file_path+'functions.jsonl'):
            continue
        for file in os.listdir(rdpath):
            print(rdpath+file)
            with open(rdpath+file, 'r') as rd:
                names = []      # values of \name{...} and \alias{...}
                document = ''   # text collected from \description / \details
                flag = False    # True while inside a section being collected
                Rfilename = []  # R source files named in the "Please edit" header
                for lines in rd:
                    if r'% Please edit documentation in R/' in lines:
                        d = re.split(r'\s+', lines)
                        for item in d:
                            if r'.r' in item or r'.R' in item:
                                # Every 'R/' occurrence is removed, so the file
                                # is later opened relative to the repository
                                # root, not its R/ directory.
                                Rfilename.append(item.replace('R/','').replace(',','').replace('\n',''))
                    # The fixed slices drop the macro prefix and the trailing
                    # two characters.
                    # NOTE(review): assumes the macro starts the line and the
                    # line ends with '}' + newline — confirm.
                    if r'\name{' in lines:
                        names.append(lines[6:-2])
                    if r'\alias{' in lines:
                        names.append(lines[7:-2])
                    if r'\description{' in lines or r'\details{' in lines:
                        flag = True
                        continue
                    if lines=='}\n':
                        flag = False
                    if flag:
                        document = document + lines.replace('\n', ' ')

                print(Rfilename)
                print(names)
                for name in Rfilename:
                    # Read the source inside a context manager so the handle
                    # is closed promptly instead of leaking one per file.
                    with open(file_path+name) as r_source:
                        Rfile = r_source.read()
                    tokens = get_type(parser, Rfile, r)
                    fname = ''
                    code = []
                    # Sliding window over the last three token types (t1..t3)
                    # and the last three token texts (c1..c3).
                    t1 = None
                    t2 = None
                    t3 = None
                    c1 = None
                    c2 = None
                    c3 = None
                    codeF = False      # True while collecting a function's tokens
                    c = 0              # current brace nesting depth
                    isChanged = False  # True once at least one '{' was seen
                    for key, value in tokens.items():
                        t1 = t2
                        t2 = t3
                        t3 = key[2]
                        c1 = c2
                        c2 = c3
                        c3 = value
                        # Start collecting at `<documented name> <- function`
                        # (or `=`); collection begins with the `function` token.
                        if t1 == 'identifier' and c1 in names and t2 in ('<-', '=') and t3 == 'function':
                            codeF = True
                            fname = c1
                        if codeF:
                            code.append(value)
                            if value == '{':
                                c += 1
                                isChanged = True
                            if value == '}':
                                c -= 1

                            # Braces balanced again after at least one '{':
                            # the function is complete, so emit it and reset.
                            if c == 0 and isChanged:
                                with jsonlines.open(file_path+'functions.jsonl', mode='a') as writer:
                                    record = {'repo':row[1],'func_name':fname, 'code_tokens':code, 'docstring':document,'docstring_tokens':tokenize_docstring(document)}
                                    writer.write(record)
                                    count += 1
                                code = []
                                codeF = False
                                c = 0
                                isChanged = False
        writer1.writerow([row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7], row[8], row[9], row[10], row[11], row[12], count])



C:/Users/shawn/Desktop/R/R/ropensci/targets/man/callr_args_default.Rd
['utils_callr.R']
['callr_args_default', 'callr_args_default']
C:/Users/shawn/Desktop/R/R/ropensci/targets/man/reexports.Rd
['tar_tidyselect.R']
['reexports', 'reexports', 'all_of', 'any_of', 'contains', 'ends_with', 'everything', 'last_col', 'matches', 'num_range', 'one_of', 'starts_with']
C:/Users/shawn/Desktop/R/R/ropensci/targets/man/rstudio_addin_tar_glimpse.Rd
['rstudio_addin_tar_glimpse.R']
['rstudio_addin_tar_glimpse', 'rstudio_addin_tar_glimpse']
C:/Users/shawn/Desktop/R/R/ropensci/targets/man/rstudio_addin_tar_load.Rd
['rstudio_addin_tar_load.R']
['rstudio_addin_tar_load', 'rstudio_addin_tar_load']
C:/Users/shawn/Desktop/R/R/ropensci/targets/man/rstudio_addin_tar_make_bg.Rd
['rstudio_addin_tar_make_bg.R']
['rstudio_addin_tar_make_bg', 'rstudio_addin_tar_make_bg']
C:/Users/shawn/Desktop/R/R/ropensci/targets/man/rstudio_addin_tar_outdated.Rd
['rstudio_addin_tar_outdated.R']
['rstudio_addin_tar_outdated', 'rst

In [None]:
# Pretty-print the token table of one sample R file. The file is read inside
# a context manager so its handle is closed right after reading.
with open('tesselle/chronos/subset.R') as r_source:
    Rfile = r_source.read()
tokens = get_type(parser, Rfile, r)
pprint.pprint(tokens)

{((0, 0), (0, 8), 'comment'): '# SUBSET',
 ((1, 0), (1, 25), 'comment'): "#' @include AllGenerics.R",
 ((2, 0), (2, 4), 'null'): 'NULL',
 ((2, 4), (4, 0), '\n'): '',
 ((4, 0), (4, 80), 'comment'): '# Extract '
 ((5, 0), (5, 80), 'comment'): '## [ '
                               '---------------------------------------------------------------------------',
 ((6, 0), (6, 10), 'comment'): "#' @export",
 ((7, 0), (7, 17), 'comment'): "#' @rdname subset",
 ((8, 0), (8, 28), 'comment'): "#' @aliases [,RataDie-method",
 ((9, 0), (9, 9), 'identifier'): 'setMethod',
 ((9, 9), (9, 10), '('): '(',
 ((10, 2), (10, 3), 'identifier'): 'f',
 ((10, 4), (10, 5), '='): '=',
 ((10, 6), (10, 9), 'string'): '"["',
 ((10, 9), (10, 10), ','): ',',
 ((11, 2), (11, 11), 'identifier'): 'signature',
 ((11, 12), (11, 13), '='): '=',
 ((11, 14), (11, 15), 'identifier'): 'c',
 ((11, 15), (11, 16), '('): '(',
 ((11, 16), (11, 17), 'identifier'): 'x',
 ((11, 18), (11, 19), '='): '=',
 ((11, 20), (11, 29), 'string'):

In [None]:
# Delete the functions.jsonl of every repository listed in ghMetaData.csv,
# where one exists.
with open("ghMetaData.csv", "r") as metaData:
    reader = csv.reader(metaData)
    next(reader)  # skip the header row
    for row in reader:
        file_path = row[1] + '/functions.jsonl'
        if not os.path.isfile(file_path):
            continue
        os.remove(file_path)

In [None]:
# Single-repository debugging variant of the extraction: reads the .Rd files
# under jakeyeung/scChIX/man/, prints every token of the referenced R files
# and appends each extracted function to jakeyeung/scChIX/functions.jsonl.
file_path = "jakeyeung/scChIX/"
rdpath = file_path + 'man/'
count = 0
for file in os.listdir(rdpath):
    print(rdpath+file)
    with open(rdpath+file, 'r') as rd:
        names = []      # values of \name{...} and \alias{...}
        document = ''   # text collected from \description / \details
        flag = False    # True while inside a section being collected
        Rfilename = []  # R source files named in the "Please edit" header
        for lines in rd:
            if r'% Please edit documentation in R/' in lines:
                d = re.split(r'\s+', lines)
                for item in d:
                    if r'.r' in item or r'.R' in item:
                        Rfilename.append(item.replace('R/','').replace(',','').replace('\n',''))
            # The fixed slices drop the macro prefix and the trailing two
            # characters.
            # NOTE(review): assumes the macro starts the line and the line
            # ends with '}' + newline — confirm.
            if r'\name{' in lines:
                names.append(lines[6:-2])
            if r'\alias{' in lines:
                names.append(lines[7:-2])
            if r'\description{' in lines or r'\details{' in lines:
                flag = True
                continue
            if lines=='}\n':
                flag = False
            if flag:
                document = document + lines.replace('\n', ' ')

        print(Rfilename)
        print(names)
        for name in Rfilename:
            print(file_path+name)
            # Read the source inside a context manager so the handle is
            # closed promptly instead of leaking one per file.
            with open(file_path+name) as r_source:
                Rfile = r_source.read()
            tokens = get_type(parser, Rfile, r)
            fname = ''
            code = []
            # Sliding window over the last three token types (t1..t3) and
            # the last three token texts (c1..c3).
            t1 = None
            t2 = None
            t3 = None
            c1 = None
            c2 = None
            c3 = None
            codeF = False      # True while collecting a function's tokens
            c = 0              # current brace nesting depth
            isChanged = False  # True once at least one '{' was seen
            for key, value in tokens.items():
                print(key, value)
                t1 = t2
                t2 = t3
                t3 = key[2]
                c1 = c2
                c2 = c3
                c3 = value
                # Only `<-` assignments start a function here.
                if t1 == 'identifier' and c1 in names and t2 == '<-' and t3 == 'function':
                    codeF = True
                    fname = c1
                if codeF:
                    code.append(value)
                    if value == '{':
                        c += 1
                        isChanged = True
                    if value == '}':
                        c -= 1

                    # Braces balanced again after at least one '{': the
                    # function is complete, so emit it and reset the state.
                    if c == 0 and isChanged:
                        with jsonlines.open(file_path+'functions.jsonl', mode='a') as writer:
                            record = {"func_name":fname, "code_token":code, "docstring":document}
                            writer.write(record)
                            count += 1
                        code = []
                        codeF = False
                        c = 0
                        isChanged = False

C:/Users/shawn/Desktop/R/R/jakeyeung/scChIX/man/ClipLast.Rd
['_aux.R']
['ClipLast', 'ClipLast']
C:/Users/shawn/Desktop/R/R/jakeyeung/scChIX/_aux.R
((0, 0), (0, 12), 'comment') # Jake Yeung
((1, 0), (1, 30), 'comment') # Date of Creation: 2021-04-16
((2, 0), (2, 33), 'comment') # File: ~/projects/scChIX/R/aux.R
((3, 0), (3, 1), 'comment') #
((6, 0), (6, 16), 'identifier') AnnotateBins2.R4
((6, 17), (6, 19), '<-') <-
((6, 20), (6, 28), 'function') function
((6, 29), (6, 30), '(') (
((6, 30), (6, 39), 'identifier') terms.mat
((6, 39), (6, 40), ',') ,
((6, 41), (6, 50), 'identifier') top.thres
((6, 51), (6, 52), '=') =
((6, 53), (6, 58), 'float') 0.995
((6, 58), (6, 59), ',') ,
((6, 60), (6, 67), 'identifier') inf.tss
((6, 68), (6, 69), '=') =
((6, 70), (6, 130), 'string') "/Users/yeung/data/scchic/tables/gene_tss_winsize.50000.bed"
((6, 130), (6, 131), ',') ,
((7, 30), (7, 34), 'identifier') txdb
((7, 35), (7, 36), '=') =
((7, 37), (7, 71), 'identifier') TxDb.Mmusculus.UCSC.mm10.knownGene