# Struct Statistics Analysis
This extracts and answers the following questions:
- 1-2. field access inside the loop
- 3. max number of fields in a struct
- 4. has nested structs
- 5. has any storage var whose type is an array of structs
- 6. has any storage var whose type is a mapping with values of type struct
- 7. number of structs
- 8. has structs that have at least one member of type array
- 9. Maximum number of maps in structs

In [1]:
import json
import os
import statistics

In [2]:
def remove_comments(raw_lines):
    """Return the stripped lines of ``raw_lines`` that are not blank or comments.

    This is a line-based heuristic, not a tokenizer:
    - blank lines and lines starting with ``//`` are dropped;
    - a line starting with ``/*`` opens a block comment, and every line up to
      and including the next one that ends with ``*/`` is dropped;
    - comments that trail code on the same line are kept as part of that line.
    """
    kept = []
    in_block_comment = False
    for stripped in (line.strip() for line in raw_lines):
        if not stripped or stripped.startswith("//"):
            continue
        in_block_comment = in_block_comment or stripped.startswith("/*")
        if not in_block_comment:
            kept.append(stripped)
        # the closing marker is checked last so the closing line itself is dropped
        if stripped.endswith("*/"):
            in_block_comment = False
    return kept

def count_loops(code_lines):
    """Count the lines that begin with a ``for`` or ``while`` loop header.

    A line counts when it starts with ``for``/``while`` followed by ``(``,
    with or without a single space in between. Lines are expected to be
    already stripped (as returned by ``remove_comments``).
    """
    loop_prefixes = ("for (", "while (", "for(", "while(")
    return sum(1 for line in code_lines if line.startswith(loop_prefixes))

In [3]:
json_files = os.listdir("./contract-benchmarks-master/loops_asts/")

# The four pools below are parallel lists: index k always refers to the same file.
name_pool = []  # file name of every AST that was loaded
ast_pool = []   # parsed AST of that file
loc_pool = []   # number of source lines left after remove_comments
nol_pool = []   # number of loop headers found by count_loops

for i, json_name in enumerate(json_files):
    print("\r# loading jsons {}/{}".format(i, len(json_files)), end="")
    with open("./contract-benchmarks-master/loops_asts/{}".format(json_name), "r", encoding="iso-8859-1") as f:
        ast_lines = f.readlines()
    if len(ast_lines) <= 1:
        # (nearly) empty AST dump: skip the file entirely so the pools stay aligned
        continue
    # the first four lines are a header; the JSON document starts after them
    ast_pool.append(json.loads("".join(ast_lines[4:])))
    name_pool.append(json_name)

    # line and loop counts come from the matching Solidity source, not from the AST
    with open("./contract-benchmarks-master/verified_contracts/{}".format(json_name.replace(".json",".sol")), "r", encoding="iso-8859-1") as f:
        code_only = remove_comments(f.readlines())

    loc_pool.append(len(code_only))
    nol_pool.append(count_loops(code_only))

# loading jsons 4493/4494

In [4]:
# NOTE: results are accumulated in the module-level list `tmp_loop_pool`.
# Rebind it to a fresh list before every top-level call.
tmp_loop_pool = []
def get_loop_node(cnode):
    """Append every loop node found under ``cnode`` to ``tmp_loop_pool``.

    Dicts are searched through all of their values and lists through all of
    their items; any other value (including None) is ignored. A dict whose
    "nodeType" is "ForStatement" or "WhileStatement" is appended before its
    children are visited, so nested loops are collected as separate entries.
    """
    global tmp_loop_pool
    if isinstance(cnode, list):
        for item in cnode:
            get_loop_node(item)
        return
    if not isinstance(cnode, dict):
        return
    if cnode.get("nodeType") in ("ForStatement", "WhileStatement"):
        tmp_loop_pool.append(cnode)
    for child in cnode.values():
        get_loop_node(child)
    
# collect the loop nodes of every file: loop_pool[k] is the list for ast_pool[k]
loop_pool = []
for i, tmp_ast in enumerate(ast_pool):
    print("\r# processing {}/{}".format(i, len(ast_pool)),end="")
    # a fresh list per file; get_loop_node appends to whatever this name is bound to
    tmp_loop_pool = []
    get_loop_node(tmp_ast)
    loop_pool.append(tmp_loop_pool)

# processing 4466/4467

In [5]:
# Step one: find all user-defined structs of a file.
# NOTE: results are accumulated in the module-level list `tmp_struct_pool`.
# Rebind it to a fresh list before every top-level call.
tmp_struct_pool = []
def get_struct_by_name(cnode):
    """Append the canonical name of every struct defined under ``cnode``.

    Dicts are searched through all of their values and lists through all of
    their items; any other value (including None) is ignored. For each dict
    whose "nodeType" is "StructDefinition", its "canonicalName" is appended
    to ``tmp_struct_pool``.
    """
    global tmp_struct_pool
    if isinstance(cnode, list):
        for item in cnode:
            get_struct_by_name(item)
        return
    if not isinstance(cnode, dict):
        return
    if cnode.get("nodeType") == "StructDefinition":
        tmp_struct_pool.append(cnode["canonicalName"])
    for child in cnode.values():
        get_struct_by_name(child)
    
# collect the canonical struct names of every file: struct_pool[k] belongs to ast_pool[k]
struct_pool = []
for i, tmp_ast in enumerate(ast_pool):
    print("\r# processing {}/{}".format(i, len(ast_pool)),end="")
    # a fresh list per file; get_struct_by_name appends to whatever this name is bound to
    tmp_struct_pool = []
    get_struct_by_name(tmp_ast)
    struct_pool.append(tmp_struct_pool)

# processing 4466/4467

In [6]:
# for y in struct_pool:
#     if len(y)>0:
#         print(y)

In [7]:
# 1-2. field access inside the loop
# NOTE: the result is reported through the module-level flag
# `flag_field_access`; reset it to False before every top-level call.
flag_field_access = False
def detect_field_access(cnode, slist):
    """Set ``flag_field_access`` if ``cnode`` contains a member access on a struct.

    Dicts are searched through all of their values and lists through all of
    their items; any other value (including None) is ignored. A "MemberAccess"
    node counts when the typeString of its "expression" contains one of the
    names in ``slist`` (plain substring match).

    Args:
        cnode: AST fragment to search (dict, list, or leaf value).
        slist: struct names to look for in the accessed expression's type.

    Raises:
        NotImplementedError: a "MemberAccess" node has no "expression" key.
    """
    global flag_field_access
    if cnode is None:
        return
    if isinstance(cnode, dict):
        if cnode.get("nodeType") == "MemberAccess":
            if "expression" not in cnode:
                raise NotImplementedError("Let me know!")
            expression = cnode["expression"]
            descriptions = expression.get("typeDescriptions") if isinstance(expression, dict) else None
            type_string = descriptions.get("typeString") if isinstance(descriptions, dict) else None
            # A missing or None typeString cannot name a struct: skip it instead
            # of crashing (same guard as detect_array_struct).
            if type_string is not None and any(tp in type_string for tp in slist):
                flag_field_access = True
                return  # nothing below this node can change the answer
        for child in cnode.values():
            detect_field_access(child, slist)
    elif isinstance(cnode, list):
        for item in cnode:
            detect_field_access(item, slist)
    
# per file: is a field of one of the file's own structs accessed inside any loop?
field_access_pool = []
for i, tmp_ast in enumerate(ast_pool):
    print("\r# processing {}/{}".format(i, len(ast_pool)),end="")
    tmp_llist, tmp_slist = loop_pool[i], struct_pool[i]
    # reset the flag that detect_field_access reports through
    flag_field_access = False
    # only the loop subtrees are searched, not the whole AST
    for q in tmp_llist:
        detect_field_access(q, tmp_slist)
    field_access_pool.append(flag_field_access)

# processing 4466/4467

In [8]:
# for i in range(len(ast_pool)):
#     if field_access_pool[i]:
#         print("{}, {}".format(i, name_pool[i]))

In [9]:
# 3. max number of fields in a struct
# NOTE: the running maximum lives in the module-level variable
# `tmp_max_nfields`; reset it to 0 before every top-level call.
tmp_max_nfields = 0
def get_max_nfields(cnode):
    """Raise ``tmp_max_nfields`` to the largest struct member count under ``cnode``.

    Dicts are searched through all of their values and lists through all of
    their items; any other value (including None) is ignored. The member
    count of a "StructDefinition" node is the length of its "members" list.
    """
    global tmp_max_nfields
    if isinstance(cnode, list):
        for item in cnode:
            get_max_nfields(item)
        return
    if not isinstance(cnode, dict):
        return
    if cnode.get("nodeType") == "StructDefinition":
        tmp_max_nfields = max(tmp_max_nfields, len(cnode["members"]))
    for child in cnode.values():
        get_max_nfields(child)
    
# per file: the member count of its largest struct (0 when it defines no struct)
max_nfields_pool = []
for i, tmp_ast in enumerate(ast_pool):
    print("\r# processing {}/{}".format(i, len(ast_pool)),end="")
    # reset the running maximum that get_max_nfields updates
    tmp_max_nfields = 0
    get_max_nfields(tmp_ast)
    max_nfields_pool.append(tmp_max_nfields)

# processing 4466/4467

In [10]:
# 4. has nested structs
# NOTE: the result is reported through the module-level flag
# `flag_nested_struct`; reset it to False before every top-level call.
flag_nested_struct = False
def detect_nested_struct(cnode, slist):
    """Set ``flag_nested_struct`` if a struct member's type mentions a known struct.

    Dicts are searched through all of their values and lists through all of
    their items; any other value (including None) is ignored. For every
    "StructDefinition" node, each member's typeString is checked for any of
    the names in ``slist`` (plain substring match), i.e. against the file's
    own struct names rather than the bare word "struct".
    """
    global flag_nested_struct
    if isinstance(cnode, list):
        for item in cnode:
            detect_nested_struct(item, slist)
        return
    if not isinstance(cnode, dict):
        return
    if cnode.get("nodeType") == "StructDefinition":
        mentions_known_struct = any(
            struct_name in member["typeDescriptions"]["typeString"]
            for member in cnode["members"]
            for struct_name in slist
        )
        if mentions_known_struct:
            flag_nested_struct = True
            return  # no need to look inside this definition any further
    for child in cnode.values():
        detect_nested_struct(child, slist)
    
# per file: does any struct have a member whose type mentions one of the file's structs?
nested_struct_pool = []
for i, tmp_ast in enumerate(ast_pool):
    print("\r# processing {}/{}".format(i, len(ast_pool)),end="")
    tmp_slist = struct_pool[i]
    # reset the flag that detect_nested_struct reports through
    flag_nested_struct = False
    # the whole AST is searched; the per-file loop list is not needed here
    detect_nested_struct(tmp_ast, tmp_slist)
    nested_struct_pool.append(flag_nested_struct)

# processing 4466/4467

In [11]:
# name_pool.index("EtherLab.json")

In [12]:
# struct_pool[773]

In [13]:
# for i in range(len(ast_pool)):
#     if nested_struct_pool[i]:
#         print("{}, {}".format(i, name_pool[i]))

In [14]:
# 5. has any storage var whose type is an array of structs
# NOTE: the result is reported through the module-level flag
# `flag_array_struct`; reset it to False before every top-level call.
flag_array_struct = False
def detect_array_struct(cnode, slist):
    """Set ``flag_array_struct`` if a typed node under ``cnode`` is an array of structs.

    Dicts are searched through all of their values and lists through all of
    their items; any other value (including None) is ignored. Every node that
    has a "nodeType" and a non-None typeString is checked, not only variable
    declarations: it counts when the typeString contains "[]" and one of the
    names in ``slist`` (plain substring match). "ParameterList" subtrees are
    skipped so that parameter declarations are not counted.

    Args:
        cnode: AST fragment to search (dict, list, or leaf value).
        slist: struct names to look for in array types.
    """
    global flag_array_struct
    if cnode is None:
        return
    if isinstance(cnode, dict):
        if "nodeType" in cnode:
            # Compare the node's own type: the previous test checked the
            # literal string "nodeType" and therefore never skipped anything.
            if cnode["nodeType"] == "ParameterList":
                return
            type_string = (cnode.get("typeDescriptions") or {}).get("typeString")
            # a None typeString is treated as "no type information", e.g. literals
            if type_string is not None and "[]" in type_string:
                if any(tp in type_string for tp in slist):
                    flag_array_struct = True
                    return  # nothing below this node can change the answer
        for child in cnode.values():
            detect_array_struct(child, slist)
    elif isinstance(cnode, list):
        for item in cnode:
            detect_array_struct(item, slist)
    
# per file: does any typed node outside parameter lists have an array-of-struct type?
array_struct_pool = []
for i, tmp_ast in enumerate(ast_pool):
    print("\r# processing {}/{}".format(i, len(ast_pool)),end="")
    tmp_slist = struct_pool[i]
    # reset the flag that detect_array_struct reports through
    flag_array_struct = False
    # the whole AST is searched; the per-file loop list is not needed here
    detect_array_struct(tmp_ast, tmp_slist)
    array_struct_pool.append(flag_array_struct)

# processing 4466/4467

In [15]:
# for i in range(len(ast_pool)):
#     if array_struct_pool[i]:
#         print("{}, {}".format(i, name_pool[i]))

In [16]:
# 6. has any storage var whose type is a mapping with values of type struct
# NOTE: the result is reported through the module-level flag
# `flag_map_struct`; reset it to False before every top-level call.
flag_map_struct = False
def detect_map_struct(cnode, slist):
    """Set ``flag_map_struct`` if a typed node under ``cnode`` is a mapping involving a struct.

    Dicts are searched through all of their values and lists through all of
    their items; any other value (including None) is ignored. Every node that
    has a "nodeType" and a non-None typeString is checked, not only variable
    declarations: it counts when the typeString contains "mapping(" and one
    of the names in ``slist`` (plain substring match). "ParameterList"
    subtrees are skipped so that parameter declarations are not counted.

    Args:
        cnode: AST fragment to search (dict, list, or leaf value).
        slist: struct names to look for in mapping types.
    """
    global flag_map_struct
    if cnode is None:
        return
    if isinstance(cnode, dict):
        if "nodeType" in cnode:
            # Compare the node's own type: the previous test checked the
            # literal string "nodeType" and therefore never skipped anything.
            if cnode["nodeType"] == "ParameterList":
                return
            type_string = (cnode.get("typeDescriptions") or {}).get("typeString")
            # Look for "mapping(" rather than "map": the shorter substring also
            # matched struct names that merely contain "map" (e.g. "Bitmap").
            if type_string is not None and "mapping(" in type_string:
                if any(tp in type_string for tp in slist):
                    flag_map_struct = True
                    return  # nothing below this node can change the answer
        for child in cnode.values():
            detect_map_struct(child, slist)
    elif isinstance(cnode, list):
        for item in cnode:
            detect_map_struct(item, slist)
    
# per file: does any typed node outside parameter lists have a mapping type involving a struct?
map_struct_pool = []
for i, tmp_ast in enumerate(ast_pool):
    print("\r# processing {}/{}".format(i, len(ast_pool)),end="")
    tmp_slist = struct_pool[i]
    # reset the flag that detect_map_struct reports through
    flag_map_struct = False
    # the whole AST is searched; the per-file loop list is not needed here
    detect_map_struct(tmp_ast, tmp_slist)
    map_struct_pool.append(flag_map_struct)

# processing 4466/4467

In [17]:
# for i in range(len(ast_pool)):
#     if map_struct_pool[i]:
#         print("{}, {}".format(i, name_pool[i]))

In [18]:
# 7. number of structs
# per file: how many struct names were collected into struct_pool
nstruct_pool = []
for i, _ in enumerate(ast_pool):
    print("\r# processing {}/{}".format(i, len(ast_pool)),end="")
    tmp_slist = struct_pool[i]
    nstruct_pool.append(len(tmp_slist))

# processing 0/4467# processing 1/4467# processing 2/4467# processing 3/4467# processing 4/4467# processing 5/4467# processing 6/4467# processing 7/4467# processing 8/4467# processing 9/4467# processing 10/4467# processing 11/4467# processing 12/4467# processing 13/4467# processing 14/4467# processing 15/4467# processing 16/4467# processing 17/4467# processing 18/4467# processing 19/4467# processing 20/4467# processing 21/4467# processing 22/4467# processing 23/4467# processing 24/4467# processing 25/4467# processing 26/4467# processing 27/4467# processing 28/4467# processing 29/4467# processing 30/4467# processing 31/4467# processing 32/4467# processing 33/4467# processing 34/4467# processing 35/4467# processing 36/4467# processing 37/4467# processing 38/4467# processing 39/4467# processing 40/4467# processing 41/4467# processing 42/4467# processing 43/4467# processing 44/4467# processing 45/4467# processing 46/4467# processing 47/4467#

# processing 2178/4467# processing 2179/4467# processing 2180/4467# processing 2181/4467# processing 2182/4467# processing 2183/4467# processing 2184/4467# processing 2185/4467# processing 2186/4467# processing 2187/4467# processing 2188/4467# processing 2189/4467# processing 2190/4467# processing 2191/4467# processing 2192/4467# processing 2193/4467# processing 2194/4467# processing 2195/4467# processing 2196/4467# processing 2197/4467# processing 2198/4467# processing 2199/4467# processing 2200/4467# processing 2201/4467# processing 2202/4467# processing 2203/4467# processing 2204/4467# processing 2205/4467# processing 2206/4467# processing 2207/4467# processing 2208/4467# processing 2209/4467# processing 2210/4467# processing 2211/4467# processing 2212/4467# processing 2213/4467# processing 2214/4467# processing 2215/4467# processing 2216/4467# processing 2217/4467# processing 2218/4467# processing 2219/4467# processing 2220/4467# processi

In [19]:
# 8. has structs that have at least one member of type array
# NOTE: the result is reported through the module-level flag
# `flag_array_member`; reset it to False before every top-level call.
flag_array_member = False
def detect_array_member(cnode):
    """Set ``flag_array_member`` if a struct under ``cnode`` has an array-typed member.

    Dicts are searched through all of their values and lists through all of
    their items; any other value (including None) is ignored. A member of a
    "StructDefinition" node counts when its typeString contains "[]".
    """
    global flag_array_member
    if isinstance(cnode, list):
        for item in cnode:
            detect_array_member(item)
        return
    if not isinstance(cnode, dict):
        return
    if cnode.get("nodeType") == "StructDefinition":
        if any("[]" in member["typeDescriptions"]["typeString"] for member in cnode["members"]):
            flag_array_member = True
            return  # no need to look inside this definition any further
    for child in cnode.values():
        detect_array_member(child)
    
# per file: does any struct have at least one array-typed member?
array_member_pool = []
for i, tmp_ast in enumerate(ast_pool):
    print("\r# processing {}/{}".format(i, len(ast_pool)),end="")
    # reset the flag that detect_array_member reports through
    flag_array_member = False
    detect_array_member(tmp_ast)
    array_member_pool.append(flag_array_member)

# processing 4466/4467

In [20]:
# 9. Maximum number of maps in structs
# NOTE: the running maximum lives in the module-level variable
# `tmp_max_maps`; reset it to 0 before every top-level call.
tmp_max_maps = 0

def count_maps(cnode):
    """Return how many members of struct node ``cnode`` have "mapping" in their typeString."""
    return sum(
        1
        for member in cnode["members"]
        if "mapping" in member["typeDescriptions"]["typeString"]
    )

def get_max_maps(cnode):
    """Raise ``tmp_max_maps`` to the largest per-struct mapping count under ``cnode``.

    Dicts are searched through all of their values and lists through all of
    their items; any other value (including None) is ignored. Each
    "StructDefinition" node is measured with ``count_maps``.
    """
    global tmp_max_maps
    if isinstance(cnode, list):
        for item in cnode:
            get_max_maps(item)
        return
    if not isinstance(cnode, dict):
        return
    if cnode.get("nodeType") == "StructDefinition":
        tmp_max_maps = max(tmp_max_maps, count_maps(cnode))
    for child in cnode.values():
        get_max_maps(child)
    
# per file: the largest number of mapping-typed members found in any one struct
max_maps_pool = []
for i, tmp_ast in enumerate(ast_pool):
    print("\r# processing {}/{}".format(i, len(ast_pool)),end="")
    # reset the running maximum that get_max_maps updates
    tmp_max_maps = 0
    get_max_maps(tmp_ast)
    max_maps_pool.append(tmp_max_maps)

# processing 4466/4467

In [21]:
import csv
# write one CSV row per analysed file, one column per statistic
with open('./my_stats.csv', 'w', newline='') as csvfile:
    fieldnames = ['file', 'nlines', 'nloops', 'Q1-2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9']
    stats_writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=',')
    stats_writer.writeheader()
    # result lists in the same order as fieldnames; all are indexed by file position
    result_pools = [
        name_pool,           # file
        loc_pool,            # nlines
        nol_pool,            # nloops
        field_access_pool,   # 1-2
        max_nfields_pool,    # 3
        nested_struct_pool,  # 4
        array_struct_pool,   # 5
        map_struct_pool,     # 6
        nstruct_pool,        # 7
        array_member_pool,   # 8
        max_maps_pool,       # 9
    ]
    for i in range(len(ast_pool)):
        stats_writer.writerow({column: pool[i] for column, pool in zip(fieldnames, result_pools)})