Skip to content

Commit 7310491

Browse files
committed
added files accepting/declining + docstrings postprocessing
1 parent 7fd71d9 commit 7310491

File tree

2 files changed

+52
-9
lines changed

2 files changed

+52
-9
lines changed

python150k/preprocess.py

Lines changed: 46 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,6 @@ def get_tokens(
107107
docstring = None
108108
if ds_begin != -1 and ds_end != -1:
109109
docstring = code[ds_begin + 3: ds_end].strip()
110-
docstring += " ."
111110

112111
# Erase docstring from the code
113112
if ds_begin != -1 and ds_end != -1:
@@ -159,6 +158,9 @@ def get_previous_comments(
159158
return precomment
160159

161160

161+
error_counter = 0
162+
163+
162164
def collect_data(
163165
filename: str,
164166
args: argparse.ArgumentParser) -> List[List[str]]:
@@ -170,14 +172,29 @@ def collect_data(
170172
Summarized data from functions.
171173
is_appropriate: bool
172174
A flag indicating that the file is appropriate
173-
(enough scope size).
175+
(enough scope size or no errors in parsing).
174176
"""
177+
global error_counter
178+
179+
# Convert Python 2 to Python 3
180+
os.system(f"~/anaconda3/envs/scs/bin/2to3 {filename} -w -n")
181+
print("Building AST tree from a filename:", filename)
182+
175183
code = read_file_to_string(filename)
184+
185+
# Replace each tab with four spaces before further processing
186+
code = re.sub('\t', ' ' * 4, code)
187+
176188
code_lines = code.splitlines()
177189

178-
print("Building AST tree from a filename:", filename)
179-
atok = asttokens.ASTTokens(code, parse=True)
180-
astree = atok.tree
190+
try:
191+
atok = asttokens.ASTTokens(code, parse=True)
192+
astree = atok.tree
193+
except IndentationError:
194+
print("Files with an error:", error_counter)
195+
error_counter += 1
196+
is_appropriate = False
197+
return None, is_appropriate
181198

182199
data = []
183200

@@ -208,6 +225,13 @@ def collect_data(
208225

209226
function_code = code[fun_begin:fun_end]
210227

228+
# If the function text starts with a decorator (e.g. @classmethod),
229+
# drop everything that precedes the "def" keyword
230+
start_def = function_code.find("def")
231+
function_code = function_code[start_def:]
232+
# if start_def > 0:
233+
234+
211235
function_code, tokens, comments, docstring, stopwords_count = \
212236
get_tokens(function_code)
213237

@@ -247,6 +271,7 @@ def retrieve_functions_docstrings(
247271

248272
preprocess_code = Preprocess("code")
249273
preprocess_comment = Preprocess("anno")
274+
preprocess_docstring = Preprocess("docs")
250275

251276
comments = []
252277
docstrings = []
@@ -278,7 +303,11 @@ def retrieve_functions_docstrings(
278303

279304
functions.append(code)
280305
tokens.append(fun_tokens_string)
281-
docstrings.append(docstring)
306+
307+
if docstring is not None:
308+
docstring = preprocess_docstring.clean(docstring).strip()
309+
if len(docstring) > 0:
310+
docstrings.append(docstring)
282311

283312
return comments, docstrings, functions, ord_nodes, tokens
284313

@@ -311,6 +340,11 @@ def set_script_arguments(parser):
311340

312341

313342
def main(args):
343+
global error_counter
344+
# Clear the conversion directory
345+
if os.path.exists("converted"):
346+
shutil.rmtree("converted")
347+
os.mkdir("converted")
314348

315349
# Clear the output directory
316350
directory = args.output_dir
@@ -325,7 +359,7 @@ def main(args):
325359
dcs_file = open(os.path.join(directory, args.docstrings_file), "a")
326360
print("Opened output files...")
327361

328-
dcs_cnt, comments_cnt, seq_cnt, ast_cnt = 0, 0, 0, 0
362+
dcs_cnt, comments_cnt, seq_cnt, ast_cnt, file_cnt = 0, 0, 0, 0, 0
329363

330364
for root, _, fnames in sorted(os.walk(args.dirname)):
331365
# print("ROOT:", root)
@@ -336,6 +370,8 @@ def main(args):
336370
filename = os.path.join(root, fname)
337371

338372
data, is_appropriate = collect_data(filename, args)
373+
if not is_appropriate:
374+
continue
339375
comments, docstrings, functions, ord_nodes, tokens = \
340376
retrieve_functions_docstrings(data, args)
341377

@@ -364,10 +400,13 @@ def main(args):
364400
ast_file.write(f"{ast_string}\n")
365401
ast_cnt += 1
366402

403+
file_cnt += 1
367404
print("Updated docstrings count:", dcs_cnt)
368405
print("Updated comment count:", comments_cnt)
369406
print("Updated sequential count:", seq_cnt)
370407
print("Updated AST count:", ast_cnt)
408+
print("Processed/Canceled/Total files:",
409+
f"{file_cnt}/{error_counter}/{file_cnt + error_counter}")
371410
print("~" * 50)
372411

373412
sequence_file.close()

python150k/processor_ast.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
class Preprocess:
88
def __init__(self, mode):
9-
assert mode in ['anno', 'code']
9+
assert mode in ['anno', 'code', 'docs']
1010
self.mode = mode
1111

1212
def tokenize_python(self, snippet: str):
@@ -22,11 +22,15 @@ def clean(self, x):
2222
x = re.sub(r'[‘…—−–]', ' ', x)
2323
x = re.sub(r'[?,`“”’™•°]', '', x)
2424

25-
if self.mode == 'anno':
25+
if self.mode == 'anno' or self.mode == 'docs':
2626
x = re.sub(r'[,:;]', '', x)
2727
x = re.sub(r'([\+\-\*/=(){}%^&\.])', r' \1 ', x)
2828
x = re.sub(r'\.+$', r'', x)
2929

30+
if self.mode == 'docs':
31+
x = re.sub(r'[\t\r\n\v\f]', '', x)
32+
x = re.sub(r'[\(\[\]\)]', x)
33+
3034
if self.mode == 'code':
3135
x = re.sub(r'[\(\[\+\-\*/,:;=(){}%^&\]\)\'\"]', r'', x).strip()
3236
# x = re.sub(r"([])':;{}%^&|")

0 commit comments

Comments
 (0)