# Parallel Corpora for Seq2seq Generation Training

## Comments

In [2]:
# Sample snippet (class docstring, inline comment, list comprehension, method)
# used as a quick fixture for the transformations defined in this notebook.
test = "class MyClass: \n\t\"\"\"A simple example class\"\"\" \n\ti = 12345 # le epic comment\n\twhoa = [i + 1 for i in range(0,10)]\n\n\tdef f(self):\n\t\treturn 'hello world'"
print(test)

class MyClass: 
	"""A simple example class""" 
	i = 12345 # le epic comment
	whoa = [i + 1 for i in range(0,10)]

	def f(self):
		return 'hello world'


In [10]:
import ast
def uncomment(source):
    """
    Strip comments from Python source by round-tripping it through the AST.

    The code is parsed with ``ast.parse`` and regenerated with ``ast.unparse``
    (available from Python 3.9). Comments are not represented in the AST, so
    they are dropped; docstrings survive, and the formatting is normalised as
    a side effect of regeneration.

    Input: code (str)
    Output: code (str) without comments, or the sentinel string 'nan' when the
        input cannot be parsed or regenerated (syntax error, non-string
        value, ...).
    """
    try:
        return ast.unparse(ast.parse(source))
    except Exception:
        # Deliberately ``Exception`` rather than a bare ``except``: a bare
        # clause would also swallow KeyboardInterrupt/SystemExit, making a
        # long ``DataFrame.apply`` over many scripts impossible to interrupt.
        return 'nan'

In [12]:
# Sanity check on the sample snippet: the inline comment should be gone from the result.
uncomment(test)

'class MyClass:\n    """A simple example class"""\n    i = 12345\n    whoa = [i + 1 for i in range(0, 10)]\n\n    def f(self):\n        return \'hello world\''

In [20]:
import pandas as pd
# Load the combined labeled-code dataset; its `content` column is what gets processed below.
bq_df = pd.read_csv("data/labeled_code/combined_data.csv")

  bq_df = pd.read_csv("data/labeled_code/combined_data.csv")


In [30]:
# Add a comment-free version of every script. `uncomment` yields the string 'nan'
# (not a real NaN) for rows it cannot parse.
bq_df['uncommented_content'] = bq_df['content'].apply(uncomment)

In [33]:
bq_df.to_csv("data/combined_data_uncommented.csv") # TODO: remind Karl to dropna when loading this file (presumably to discard rows that failed to parse; confirm)

## Classes

In [43]:
import pandas as pd

In [44]:
# !pip install astunparse
import astunparse
import ast

In [55]:
def extract_func_from_class_node(class_node):
    """
    Strip the ``self``/``cls`` parameters from the methods of a class node.

    Every function defined directly in the class body (``def`` and
    ``async def``) has any regular positional parameter named ``self`` or
    ``cls`` removed, in place. If a removed parameter carried a default value,
    that default is removed with it so the remaining parameters keep their own
    defaults. Positional-only and keyword-only parameters are not touched.

    Input: class_node (ast.ClassDef)
    Output: the class body (list of ast statements). This is the *whole* body,
        not just the functions: assignments, docstrings, etc. are returned
        unchanged alongside the rewritten functions.
    """
    receiver_names = ("self", "cls")
    for stmt in class_node.body:
        if not isinstance(stmt, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue
        signature = stmt.args
        # ``defaults`` is right-aligned over posonlyargs + args, so the default
        # of ``args[i]`` (if any) sits at ``defaults[offset + i]``.
        offset = len(signature.defaults) - len(signature.args)
        kept_args = []
        # Defaults belonging to positional-only parameters are kept as-is.
        kept_defaults = list(signature.defaults[:max(offset, 0)])
        for position, arg in enumerate(signature.args):
            if arg.arg in receiver_names:
                continue
            kept_args.append(arg)
            default_index = offset + position
            if default_index >= 0:
                kept_defaults.append(signature.defaults[default_index])
        signature.args = kept_args
        signature.defaults = kept_defaults
    return class_node.body
def remove_class_from_ast(ast_tree):
    """
    Flatten every class definition found in ``ast_tree``, in place.

    Each ``ast.ClassDef`` is replaced, at the position it occupied in its
    parent's statement list, by the statements of its own body (after
    ``extract_func_from_class_node`` has stripped ``self``/``cls`` parameters
    from its methods). The class may live in the ``body``, ``orelse`` or
    ``finalbody`` of any parent node (module, function, if/for/while/try/with,
    except handler, ...).

    Input: ast_tree (ast.AST), mutated in place
    Output: the same tree, or None when it contains no class definition
    Raises: ValueError if a class cannot be located in any statement list of
        its parent
    """
    statement_fields = ("body", "orelse", "finalbody")

    # (parent, field name, index in that field, class node), in discovery order.
    placements = []
    for parent in ast.walk(ast_tree):
        for child in ast.iter_child_nodes(parent):
            if not isinstance(child, ast.ClassDef):
                continue
            for field in statement_fields:
                siblings = getattr(parent, field, None)
                if isinstance(siblings, list) and child in siblings:
                    placements.append((parent, field, siblings.index(child), child))
                    break
            else:
                raise ValueError(f"Not in the body, another speciall case may happen, please look into this node: {ast.dump(parent)}")

    if not placements:
        # nothing to change
        return None

    # Process in reverse discovery order: later siblings are spliced before
    # earlier ones (so the recorded indices stay valid), and nested classes
    # are flattened before the classes that contain them.
    for parent, field, idx, class_node in reversed(placements):
        replacement = extract_func_from_class_node(class_node)
        siblings = getattr(parent, field)
        setattr(parent, field, siblings[:idx] + replacement + siblings[idx + 1:])

    return ast_tree

def remove_self_cls_str(script):
    """
    Remove ``self.`` and ``cls.`` attribute prefixes from source text.

    Only whole-word occurrences are removed, so identifiers that merely end in
    ``self``/``cls`` (e.g. ``myself.x``, ``subcls.y``) are left intact. This is
    a purely textual substitution: occurrences inside string literals are
    rewritten as well.

    Input: script (str)
    Output: script (str) with the prefixes removed
    """
    import re  # local import keeps this helper self-contained within its cell

    return re.sub(r"\b(?:self|cls)\.", "", script)

def remove_class(script):
    """
    Return ``script`` with its classes flattened into plain functions.

    The source is parsed, its class definitions are replaced by their bodies,
    the tree is turned back into source with ``astunparse`` and the
    ``self.``/``cls.`` prefixes are stripped from the resulting text.

    Input: script (str)
    Output: rewritten script (str), or None when the script has no class
    Raises: whatever ``ast.parse`` raises for invalid source (e.g. SyntaxError)
    """
    flattened_tree = remove_class_from_ast(ast.parse(script))
    if not flattened_tree:
        return None
    regenerated = astunparse.unparse(flattened_tree)
    return remove_self_cls_str(regenerated)

In [51]:
from tqdm.auto import tqdm

# Class removal only works on Python 3 code, so the py150k data is left out here.
bq_no_outlier_df = pd.read_csv("data/labeled_code/bq_data_no_outlier.csv")
processed_scripts = []
for script in tqdm(bq_no_outlier_df['content']):
    try:
        processed_script = remove_class(script)
    except SyntaxError:
        # Unparseable source: keep the row, but without a flattened version.
        processed_scripts.append(None)
    except Exception as e:
        # Anything else is unexpected: show the offending script, then stop.
        print(script)
        print(e)
        raise e
    else:
        processed_scripts.append(processed_script)
bq_no_outlier_df['no_class_content'] = processed_scripts

  0%|          | 0/208311 [00:00<?, ?it/s]

In [52]:
bq_no_outlier_df.to_csv("data/labeled_code/bq_data_no_outlier_no_class.csv") #remember to tell karl to dropna

In [57]:
# Class removal only works on Python 3 code, so the py150k data is left out here.
bq_outlier_df = pd.read_csv("data/labeled_code/bq_data_outlier.csv")

In [58]:

# Flatten the classes of every outlier script. An entry is None when the script
# does not parse (SyntaxError) or when remove_class reports no class to remove.
processed_scripts = []
for script in tqdm(bq_outlier_df['content']):
    try:
        processed_scripts.append(remove_class(script))
    except SyntaxError:
        processed_scripts.append(None)
    except Exception as e:
        # Unexpected failure: show the offending script, then re-raise with the
        # original traceback intact.
        print(script)
        print(e)
        raise
bq_outlier_df['no_class_content'] = processed_scripts

  0%|          | 0/875096 [00:00<?, ?it/s]

In [59]:
bq_outlier_df.to_csv("data/labeled_code/bq_data_outlier_no_class.csv") # TODO: remind Karl to dropna; no_class_content is None for scripts that had no class or failed to parse

## List Comps

In [11]:
# Show the raw sample snippet again (repr form) as the starting point for this section.
test

'class MyClass: \n\t"""A simple example class""" \n\ti = 12345 # le epic comment\n\twhoa = [i + 1 for i in range(0,10)]\n\n\tdef f(self):\n\t\treturn \'hello world\''