Skip to content

Commit c1cc56a

Browse files
committed
TokenError handling
1 parent 2688ad5 commit c1cc56a

File tree

1 file changed

+14
-9
lines changed

1 file changed

+14
-9
lines changed

python150k/preprocess.py

Lines changed: 14 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -118,16 +118,21 @@ def get_tokens(code: str) -> Tuple[list, int, list]:
118118
comments = list(processor.comments.values())
119119

120120
stopwords_count = 0
121+
is_tokenizable = True
121122

122-
for idx, token in enumerate(
123-
tokenize.tokenize(BytesIO(code.encode('utf-8')).readline)):
124-
# Form indices and tokens
125-
if token.string not in TOKENS_STOPWORDS:
126-
# print(f"idx: {idx}, token: {token.string}")
127-
tokens.append(token.string)
128-
else:
129-
stopwords_count += 1
130-
return code, tokens, comments, docstring, stopwords_count
123+
try:
124+
for idx, token in enumerate(
125+
tokenize.tokenize(BytesIO(code.encode('utf-8')).readline)):
126+
# Form indices and tokens
127+
if token.string not in TOKENS_STOPWORDS:
128+
# print(f"idx: {idx}, token: {token.string}")
129+
tokens.append(token.string)
130+
else:
131+
stopwords_count += 1
132+
except tokenize.TokenError:
133+
is_tokenizable = False
134+
return None, None, comments, docstring, stopwords_count, is_tokenizable
135+
return code, tokens, comments, docstring, stopwords_count, is_tokenizable
131136

132137

133138
def get_previous_comments(fun: ast.FunctionDef, code_lines: List[str]) -> str:

0 commit comments

Comments
 (0)