## Programinių kodų filtravimas pagal pasirinktus reikalavimus

In [None]:
import pandas as pd
import os
from pathlib import Path

# Directory layout. Everything is anchored at the current working directory
# (Path() with no arguments is ".", which resolve() turns into an absolute path).
SCRIPT_DIR = Path().resolve()
CODENET_BASE_DIR = SCRIPT_DIR.parent.joinpath("duomenys", "CodeNet")
METADATA_DIR = CODENET_BASE_DIR.joinpath("Project_CodeNet", "metadata")
DATA_DIR = CODENET_BASE_DIR.joinpath("Project_CodeNet", "data")
OUTPUT_PARQUET = SCRIPT_DIR.joinpath(
    "extracted data", "accepted_submissions_temp.parquet"
)

# Row filters: values of the "language" and "status" metadata columns to keep.
TARGET_LANGUAGES = {
    "C",
    "C#",
    "C++",
    "Java",
    "Python",
}
ACCEPTED_STATUS = {"Accepted"}

def resolve_path(row):
    """Build the expected on-disk location of one submission's source file.

    Args:
        row: A metadata record (for example a DataFrame row) that provides
            the keys ``language``, ``problem_id``, ``submission_id`` and
            ``filename_ext``.

    Returns:
        ``DATA_DIR / <problem_id> / <language folder> / <submission_id>.<ext>``
        as a path object. The file is not checked for existence here.

    Raises:
        KeyError: If the row's language has no entry in LANGUAGE_FOLDER_MAP.
    """
    language = row["language"]
    try:
        lang_folder = LANGUAGE_FOLDER_MAP[language]
    except KeyError:
        # Fail with a readable message instead of letting a None folder
        # surface later as a TypeError from the path join.
        raise KeyError(
            f"No data folder mapped for language {language!r}"
        ) from None
    # Surrounding whitespace in the extension is dropped before it is used.
    file_name = f"{row['submission_id']}.{row['filename_ext'].strip()}"
    return DATA_DIR / row["problem_id"] / lang_folder / file_name

# Metadata language name -> name of the folder used for that language inside
# a problem's data directory. Every entry currently maps a name to itself.
LANGUAGE_FOLDER_MAP = {
    language: language for language in ("C++", "C", "C#", "Python", "Java")
}

# Gather, from every metadata CSV, the accepted submissions written in one of
# the target languages whose source file is actually present under DATA_DIR.
# "filename_ext" is required too: resolve_path() reads it for every row, so a
# CSV without that column must be skipped rather than crash the whole run.
required = {"submission_id", "problem_id", "language", "status", "filename_ext"}
all_records = []
print("+")
# Sorted so the files (and therefore the concatenated rows) are processed in
# a stable order regardless of how the filesystem lists the directory.
for file in sorted(METADATA_DIR.glob("*.csv")):
    print(f"Loading {file.name}")
    df = pd.read_csv(file)
    # Normalise the headers so the column checks ignore case and padding.
    df.columns = df.columns.str.strip().str.lower()
    if not required.issubset(df.columns):
        print(f"Skipping {file.name} — missing required columns")
        continue
    wanted = df["language"].isin(TARGET_LANGUAGES) & df["status"].isin(ACCEPTED_STATUS)
    # Work on an independent copy: adding a column to a filtered slice would
    # trigger pandas' SettingWithCopyWarning.
    df = df.loc[wanted].copy()
    if df.empty:
        print(f"Skipping {file.name} — no accepted rows")
        continue
    df["file_path"] = df.apply(resolve_path, axis=1)
    # Drop metadata rows whose source file does not exist on disk.
    df = df[df["file_path"].map(Path.exists)]
    all_records.append(df)
if not all_records:
    # pd.concat([]) raises a bare "No objects to concatenate"; raise the same
    # exception type with a message that says what was actually missing.
    raise ValueError(
        f"No accepted submissions found in the metadata CSVs under {METADATA_DIR}"
    )
final_df = pd.concat(all_records, ignore_index=True)
# Store the paths as plain strings rather than path objects.
final_df["file_path"] = final_df["file_path"].astype(str)

# Cap every (problem, language) group at 2000 submissions. The fixed
# random_state makes the draw within each group repeatable.
sample_keys = ['problem_id', 'language']
df_limited = (
    # The columns are selected explicitly so the grouping columns stay part of
    # each sampled group: relying on apply() to include them implicitly is
    # deprecated since pandas 2.2, and the code below groups by them again.
    final_df.groupby(sample_keys, group_keys=False)[list(final_df.columns)]
    .apply(lambda x: x.sample(n=min(len(x), 2000), random_state=42))
)

# Submissions per problem (rows) and language (columns) after capping.
# Computed once; freq_table is kept as a separate copy of the same table.
counts = df_limited.groupby(sample_keys).size().unstack(fill_value=0)
freq_table = counts.copy()

# Keep only problems with at least min_required submissions in every
# language column of the table.
# NOTE(review): a language with no rows at all in df_limited has no column
# here, so the minimum is not enforced for it — confirm this is intended.
min_required = 100
valid_problems = counts[(counts >= min_required).all(axis=1)].index
filtered_df = df_limited[df_limited['problem_id'].isin(valid_problems)]

# Remove metadata columns that are not written to the output; names missing
# from the frame are ignored.
columns_to_drop = [
    "user_id", "date", "original_language",
    "cpu_time", "memory", "code_size",
    "accuracy", "status",
]
filtered_df = filtered_df.drop(columns=columns_to_drop, errors="ignore")

# NOTE(review): OUTPUT_PARQUET is defined at the top of the file but this
# write uses a different file name — confirm which one is intended.
output_path = SCRIPT_DIR / "extracted data" / "accepted_submissions_V5.parquet"
# Make sure the target folder exists so the export cannot fail on a fresh
# checkout after all the filtering work has been done.
output_path.parent.mkdir(parents=True, exist_ok=True)
filtered_df.to_parquet(output_path, index=False)