## Funkcijų išrinkimas (naudojant MPI)

In [None]:
from mpi4py import MPI
import pandas as pd
from pathlib import Path
from tree_sitter import Language, Parser
from socket import gethostname

# Input/output locations, all anchored at the current working directory.
SCRIPT_DIR = Path().resolve()
first_folder = "extracted data"
second_folder = "V5"
GRAMMAR_DIR = SCRIPT_DIR.joinpath("tree-sitter")

# Both parquet files live in <cwd>/<first_folder>/<second_folder>/ and carry
# the dataset version (second_folder) in their file names.
data_dir = SCRIPT_DIR.joinpath(first_folder, second_folder)
INPUT_PARQUET = data_dir / f"accepted_submissions_{second_folder}.parquet"
OUTPUT_PARQUET_FINAL = data_dir / f"extracted_functions_nocom_{second_folder}.parquet"

# Make sure the output directory exists before anything tries to write to it.
data_dir.mkdir(parents=True, exist_ok=True)

# One row per supported language: (dataset label, grammar checkout folder,
# symbol name exported by the compiled tree-sitter library).
_LANGUAGES = (
    ("Python", "tree-sitter-python", "python"),
    ("C++", "tree-sitter-cpp", "cpp"),
    ("C", "tree-sitter-c", "c"),
    ("Java", "tree-sitter-java", "java"),
    ("C#", "tree-sitter-c-sharp", "c_sharp"),
)

# Dataset label -> folder under GRAMMAR_DIR that holds the grammar sources.
LANGUAGE_GRAMMARS = {label: folder for label, folder, _ in _LANGUAGES}

# Dataset label -> language symbol to load from the compiled library.
LANGUAGE_SYMBOLS = {label: symbol for label, _, symbol in _LANGUAGES}

# Dataset label -> syntax-tree node types that count as a function/method.
# Every language gets its own set object so they can be edited independently.
_FUNCTION_KINDS = ("function_definition",)
_METHOD_KINDS = ("method_declaration", "constructor_declaration")
LANGUAGE_FUNCTION_NODES = {
    **{label: set(_FUNCTION_KINDS) for label in ("Python", "C", "C++")},
    **{label: set(_METHOD_KINDS) for label in ("Java", "C#")},
}

# MPI bookkeeping: the world communicator, this process's rank within it and
# the total number of ranks. The host name is only used for the startup log.
comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()
hostname = gethostname()
print(f"[Rank {rank}] running on {hostname} (total ranks = {size})")

# Compile all grammars into one shared library. Only rank 0 builds it, and
# only when the file is not there yet; the barrier keeps every other rank
# from continuing before the build step has finished.
so_path = SCRIPT_DIR / "build_my_languages.so"
library_missing = rank == 0 and not so_path.exists()
if library_missing:
    grammar_dirs = [str(GRAMMAR_DIR / folder) for folder in LANGUAGE_GRAMMARS.values()]
    Language.build_library(str(so_path), grammar_dirs)
comm.Barrier()

# Dataset label -> ready-to-use tree-sitter parser, one per supported language,
# all loaded from the shared compiled library.
PARSERS = {}
library_path = str(so_path)
for lang in LANGUAGE_GRAMMARS:
    grammar = Language(library_path, LANGUAGE_SYMBOLS[lang])
    lang_parser = Parser()
    lang_parser.set_language(grammar)
    PARSERS[lang] = lang_parser

def find_identifier(node):
    """Return the first ``identifier`` node in *node*'s subtree, or ``None``.

    The subtree is searched depth-first in pre-order (the node itself, then
    its children from left to right), so the result is the identifier that
    appears first in the tree.

    The search keeps its own stack instead of recursing: a very deeply nested
    syntax tree (for example a long chain of binary operators) would otherwise
    exceed Python's recursion limit.

    Args:
        node: Any object exposing ``type`` (str) and ``children`` (sequence
            of nodes of the same kind), such as a tree-sitter node.

    Returns:
        The first node whose ``type`` is ``"identifier"``, or ``None`` when
        the subtree contains no such node.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        if current.type == "identifier":
            return current
        # Push children right-to-left so the leftmost child is popped first.
        pending.extend(reversed(current.children))
    return None

def extract_functions(code, lang):
    """Extract every function/method definition from *code*, except ``main``.

    Args:
        code: Source text of one submission.
        lang: Language label; must be a key of ``PARSERS`` and
            ``LANGUAGE_FUNCTION_NODES`` (e.g. ``"Python"``).

    Returns:
        A list of ``(name, text)`` tuples in pre-order of the syntax tree.
        ``name`` is ``None`` when no name could be determined. Definitions
        nested inside another definition are reported as entries of their
        own (their text is also part of the enclosing entry).

    Raises:
        KeyError: If *lang* is not a configured language.
    """
    parser = PARSERS[lang]
    # tree-sitter reports positions as offsets into the UTF-8 *bytes*, so all
    # slicing must happen on the encoded buffer. Slicing the str with these
    # offsets is wrong as soon as the file contains a non-ASCII character.
    source = code.encode("utf8")
    tree = parser.parse(source)
    target_kinds = LANGUAGE_FUNCTION_NODES[lang]
    functions = []

    def node_text(node):
        return source[node.start_byte:node.end_byte].decode("utf8", errors="ignore")

    # Explicit stack instead of recursion so deeply nested code cannot hit
    # Python's recursion limit.
    pending = [tree.root_node]
    while pending:
        node = pending.pop()
        if node.type in target_kinds:
            # Prefer the grammar's dedicated "name" field when the node has
            # one; the first identifier in the subtree can be something else
            # (e.g. an annotation/attribute name in front of a method).
            # Nodes without that field fall back to the first identifier.
            name_node = node.child_by_field_name("name")
            if name_node is None:
                name_node = find_identifier(node)
            name = node_text(name_node) if name_node is not None else None
            if name and name.lower().strip("_") == "main":
                # Entry points are excluded together with everything nested
                # inside them.
                continue
            functions.append((name, node_text(node)))
        # Push children right-to-left so they are visited left-to-right.
        pending.extend(reversed(node.children))

    return functions

def remove_comments(code, lang):
    """Return *code* with all comments recognised by the parser removed.

    Args:
        code: Source text of one submission.
        lang: Language label; must be a key of ``PARSERS``.

    Returns:
        The source text without comment nodes. When a comment sits directly
        between two non-whitespace characters it is replaced by one space
        instead of being deleted, so the neighbouring tokens are not glued
        together (``int/*x*/y`` becomes ``int y``, not ``inty``).

    Raises:
        KeyError: If *lang* is not a configured language.
    """
    parser = PARSERS[lang]
    code_bytes = bytearray(code, "utf8")
    tree = parser.parse(bytes(code_bytes))

    # Grammars name their comment nodes differently: most use "comment",
    # some versions (e.g. of the Java grammar) use "line_comment" and
    # "block_comment" instead.
    comment_kinds = {"comment", "line_comment", "block_comment"}
    comment_spans = []

    # Explicit stack instead of recursion so deeply nested code cannot hit
    # Python's recursion limit.
    pending = [tree.root_node]
    while pending:
        node = pending.pop()
        if node.type in comment_kinds:
            comment_spans.append((node.start_byte, node.end_byte))
            # The whole span is removed anyway; descending further could only
            # yield overlapping spans.
            continue
        pending.extend(node.children)

    # Edit from the end of the buffer backwards so earlier offsets stay valid.
    whitespace = b" \t\r\n\f\v"
    for start, end in sorted(comment_spans, reverse=True):
        glues_tokens = (
            start > 0
            and end < len(code_bytes)
            and code_bytes[start - 1] not in whitespace
            and code_bytes[end] not in whitespace
        )
        code_bytes[start:end] = b" " if glues_tokens else b""
    return code_bytes.decode("utf8", errors="ignore")

# Rank 0 loads the submission table and deals its rows out round-robin (row k
# goes to rank k % size); every rank then receives exactly one slice.
chunks = None
if rank == 0:
    df_raw = pd.read_parquet(INPUT_PARQUET)
    chunks = [df_raw.iloc[offset::size] for offset in range(size)]

df_local = comm.scatter(chunks, root=0)

# Per-rank files, all placed next to the final output:
#   checkpoint_file     - partial results written while the rank is working
#   output_file         - the rank's complete result
#   processed_ids_file  - text log of submission ids handled so far
rank_dir = OUTPUT_PARQUET_FINAL.parent
checkpoint_file = rank_dir / f"checkpoint_rank_{rank}.parquet"
output_file = rank_dir / f"extracted_functions_rank_{rank}.parquet"
processed_ids_file = rank_dir / f"processed_ids_rank_{rank}.txt"

# In-memory state: rows not yet written to disk, ids not yet logged, and two
# running counters.
records, processed_batch_ids = [], set()
total_functions = lines_processed = 0

# Process this rank's share of the submissions: read each source file, strip
# comments, extract functions, and periodically spill the collected rows to
# the rank's checkpoint file.
CHECKPOINT_EVERY = 100  # rows between two checkpoint attempts

for _, row in df_local.iterrows():
    lines_processed += 1
    lang = row["language"]
    file_path = Path(row["file_path"])
    try:
        # Rows whose source file no longer exists are skipped silently.
        if file_path.exists():
            code = file_path.read_text(encoding="utf-8", errors="ignore")
            clean_code = remove_comments(code, lang)
            funcs = extract_functions(clean_code, lang)

            total_functions += len(funcs)
            records.extend(
                {
                    "submission_id": row["submission_id"],
                    "language": lang,
                    "file_path": str(file_path),
                    "function_name": func_name,
                    "function_code": func_code,
                }
                for func_name, func_code in funcs
            )
            processed_batch_ids.add(str(row["submission_id"]))
    except Exception as e:
        # Best effort: one unreadable or unparsable submission must not abort
        # the whole rank.
        print(f"[Rank {rank}] Error in {file_path}: {e}")

    # Checkpoint on the number of rows this rank has handled. The DataFrame
    # index label must not be used here: after the strided split most ranks
    # would rarely or never see a label divisible by the interval. The check
    # also runs for rows that were skipped or failed.
    if lines_processed % CHECKPOINT_EVERY == 0 and records:
        try:
            df_new = pd.DataFrame(records)
            if checkpoint_file.exists():
                df_existing = pd.read_parquet(checkpoint_file)
                df_combined = pd.concat([df_existing, df_new], ignore_index=True)
            else:
                df_combined = df_new
            df_combined.to_parquet(checkpoint_file, index=False)
            # Only forget the rows once they are safely on disk.
            records = []

            with open(processed_ids_file, "a") as f:
                f.writelines(f"{pid}\n" for pid in processed_batch_ids)
            processed_batch_ids.clear()
        except Exception as e:
            # Keep going; whatever is still in memory is retried at the next
            # checkpoint.
            print(f"[Rank {rank}] Checkpoint failed after {lines_processed} rows: {e}")

# Turn everything this rank produced into its final per-rank output file.
# The checkpoint has to be promoted even when no rows are left in memory
# (e.g. the last processed row triggered a checkpoint); otherwise the rank's
# results would never reach the merge step.
parts = []
if checkpoint_file.exists():
    parts.append(pd.read_parquet(checkpoint_file))
if records:
    parts.append(pd.DataFrame(records))

if parts:
    df_combined = pd.concat(parts, ignore_index=True)
    df_combined.to_parquet(output_file, index=False)
    # The checkpoint is now fully contained in output_file.
    if checkpoint_file.exists():
        checkpoint_file.unlink()
    records = []

# Log the ids that have not been written yet, whether or not they yielded rows.
if processed_batch_ids:
    with open(processed_ids_file, "a") as f:
        f.writelines(f"{pid}\n" for pid in processed_batch_ids)
    processed_batch_ids.clear()

# Wait until every rank has written its output, then let rank 0 merge them.
comm.Barrier()
if rank == 0:
    # Enumerate the files by rank instead of globbing: this ignores stale
    # files left by earlier runs that used more ranks, and it fixes the row
    # order of the merged table (glob order depends on the file system), which
    # keeps later "keep first duplicate" and seeded sampling reproducible.
    out_dir = OUTPUT_PARQUET_FINAL.parent
    all_files = [out_dir / f"extracted_functions_rank_{r}.parquet" for r in range(size)]
    dfs = [pd.read_parquet(f) for f in all_files if f.exists()]
    if dfs:
        final_df = pd.concat(dfs, ignore_index=True)
        final_df.to_parquet(OUTPUT_PARQUET_FINAL, index=False)
    else:
        # pd.concat raises on an empty list; report the situation instead.
        print(f"[Rank {rank}] No per-rank output files found; nothing to merge.")


## Funkcijų atrinkimas galutiniam duomenų rinkiniui

In [None]:
import pandas as pd
from pathlib import Path

# Same locations as in the extraction step, so this cell can run on its own.
SCRIPT_DIR = Path().resolve()
first_folder, second_folder = "extracted data", "V5"
GRAMMAR_DIR = SCRIPT_DIR / "tree-sitter"

# <cwd>/<first_folder>/<second_folder>/ holds both the input and the output.
version_dir = SCRIPT_DIR / first_folder / second_folder
INPUT_PARQUET = version_dir.joinpath(f"accepted_submissions_{second_folder}.parquet")
OUTPUT_PARQUET_FINAL = version_dir.joinpath(f"extracted_functions_nocom_{second_folder}.parquet")
version_dir.mkdir(parents=True, exist_ok=True)

# Load the extracted functions, keep the first occurrence of every distinct
# function body, drop bodies that are empty or whitespace-only, and report how
# many functions remain per language.
file_path = OUTPUT_PARQUET_FINAL
duomenys = pd.read_parquet(file_path, engine='pyarrow')

dupe_mask = duomenys['function_code'].duplicated()
first_occurrences = duomenys[~dupe_mask].reset_index(drop=True)

has_content = first_occurrences['function_code'].str.strip() != ''
duomenys_unique = first_occurrences[has_content]

language_counts = duomenys_unique['language'].value_counts()
print(language_counts)

# Draw a fixed-size random sample per language and store it as the final
# subset. The seed makes the draw reproducible for identical input.
SAMPLES_PER_LANGUAGE = 20000

# DataFrame.sample raises when asked for more rows than a group has, so a
# language with fewer rows contributes all of them instead of aborting the run.
short_languages = language_counts[language_counts < SAMPLES_PER_LANGUAGE]
if not short_languages.empty:
    print(f"Languages with fewer than {SAMPLES_PER_LANGUAGE} functions (all rows kept):")
    print(short_languages)

df_subset = (
    duomenys_unique
    .groupby('language', group_keys=False)
    .apply(lambda grp: grp.sample(n=min(len(grp), SAMPLES_PER_LANGUAGE), random_state=42))
    .reset_index(drop=True)
)

df_subset.to_parquet('subset_100k.parquet', index=False)