Skip to content

Commit 8c80ddb

Browse files
committed
add: preprocessor for comments and tokens corpuses
1 parent beb0b81 commit 8c80ddb

File tree

4 files changed

+485
-7
lines changed

4 files changed

+485
-7
lines changed

python150k/ast_conversion.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
def convert(ast):
    """Split every value-carrying node into a type node plus a value leaf.

    ``ast`` is a flat list of node dicts in which ``"children"`` holds
    indices into that same list. Each node that has a ``"value"`` key is
    replaced by two consecutive entries: ``{"type": ..., "children": [...]}``
    followed by ``{"value": ...}``, where the new leaf's index is the first
    child. Every child index is shifted by the number of leaves inserted
    before the node it refers to.

    The input list and its node dicts are never modified: nodes without a
    value are shallow-copied before their ``"children"`` list is rewritten,
    so calling ``convert`` repeatedly on the same input gives the same
    result.

    Args:
        ast: list of node dicts (keys used: ``"type"``, ``"value"``,
            ``"children"``).

    Returns:
        A new list of node dicts with value leaves split out.

    Raises:
        KeyError: if a value node has no ``"type"`` or a child index does
            not refer to a position in ``ast``.
        AssertionError: if some index is listed as a child more than once
            in the converted tree.
    """
    # increase_by[i] == number of value nodes located strictly before
    # position i, i.e. how far original index i moves in the output.
    increase_by = {}
    cur = 0
    for i, node in enumerate(ast):
        increase_by[i] = cur
        if "value" in node:
            cur += 1

    new_dp = []
    for i, node in enumerate(ast):
        shifted_children = None
        if "children" in node:
            shifted_children = [n + increase_by[n] for n in node["children"]]

        if "value" in node:
            # The value leaf is emitted right after its type node, so its
            # index is the type node's new index plus one.
            child = [i + increase_by[i] + 1]
            if shifted_children is not None:
                child += shifted_children
            new_dp.append({"type": node["type"], "children": child})
            new_dp.append({"value": node["value"]})
        else:
            # Copy instead of editing in place so the caller's AST survives.
            new_node = dict(node)
            if shifted_children is not None:
                new_node["children"] = shifted_children
            new_dp.append(new_node)

    # Sanity check: no index may be referenced as a child more than once.
    children = []
    for node in new_dp:
        if "children" in node:
            children += node["children"]
    assert len(children) == len(set(children))
    return new_dp
31+
32+
33+
def get_dfs(ast, only_leaf=False):
    """Flatten a list of node dicts into a list of string tokens.

    Nodes are visited in list order. A node with a ``"value"`` key yields
    ``str(value)``; any other node yields its ``"type"`` wrapped in angle
    brackets, unless ``only_leaf`` is true, in which case it is skipped.

    Args:
        ast: iterable of node dicts.
        only_leaf: when true, emit only the tokens of value nodes.

    Returns:
        The list of tokens.
    """
    tokens = []
    for entry in ast:
        if "value" in entry:
            tokens.append(str(entry["value"]))
            continue
        if only_leaf:
            # Type-only nodes are dropped; their "type" is not even read.
            continue
        tokens.append("<" + entry["type"] + ">")
    return tokens

python150k/parse_python3.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,12 @@ def read_file_to_string(filename):
2020
f.close()
2121
return s
2222

23-
def parse_file(filename):
23+
def parse_file(filename, input="file"):
2424
global c, d
25-
tree = ast.parse(read_file_to_string(filename), filename)
25+
if input == "file":
26+
tree = ast.parse(read_file_to_string(filename), filename)
27+
else:
28+
tree = ast.parse(filename)
2629

2730
json_tree = []
2831
def gen_identifier(identifier, node_type = 'identifier'):
@@ -56,7 +59,7 @@ def traverse(node):
5659
elif isinstance(node, ast.Num):
5760
json_node['value'] = unicode(node.n)
5861
elif isinstance(node, ast.Str):
59-
json_node['value'] = node.s#.decode('utf-8')
62+
json_node['value'] = "string" #node.s #.decode('utf-8')
6063
elif isinstance(node, ast.alias):
6164
json_node['value'] = unicode(node.name)
6265
if node.asname:
@@ -73,6 +76,8 @@ def traverse(node):
7376
children.append(gen_identifier(n))
7477
elif isinstance(node, ast.keyword):
7578
json_node['value'] = unicode(node.arg)
79+
elif isinstance(node, ast.arg):
80+
children.append(gen_identifier(node.arg))
7681

7782

7883
# Process children.
@@ -88,10 +93,12 @@ def traverse(node):
8893
if node.orelse:
8994
children.append(traverse_list(node.orelse, 'orelse'))
9095
elif isinstance(node, ast.With):
96+
children.append(traverse_list(node.items, 'items'))
97+
children.append(traverse_list(node.body, 'body'))
98+
elif isinstance(node, ast.withitem):
9199
children.append(traverse(node.context_expr))
92100
if node.optional_vars:
93101
children.append(traverse(node.optional_vars))
94-
children.append(traverse_list(node.body, 'body'))
95102
elif isinstance(node, ast.Try): # Except
96103
children.append(traverse_list(node.body, 'body'))
97104
children.append(traverse_list(node.handlers, 'handlers'))
@@ -106,14 +113,14 @@ def traverse(node):
106113
children.append(traverse_list(node.args, 'args'))
107114
children.append(traverse_list(node.defaults, 'defaults'))
108115
if node.vararg:
109-
children.append(gen_identifier(node.vararg, 'vararg'))
116+
children.append(traverse(node.vararg))
110117
if node.kwarg:
111-
children.append(gen_identifier(node.kwarg, 'kwarg'))
118+
children.append(traverse(node.kwarg))
112119
elif isinstance(node, ast.ExceptHandler):
113120
if node.type:
114121
children.append(traverse_list([node.type], 'type'))
115122
if node.name:
116-
children.append(traverse_list([node.name], 'name'))
123+
children.append(gen_identifier(node.name, 'name'))
117124
children.append(traverse_list(node.body, 'body'))
118125
elif isinstance(node, ast.ClassDef):
119126
children.append(traverse_list(node.bases, 'bases'))

0 commit comments

Comments
 (0)