In [11]:
import hashlib
import random
import secrets
import string
from itertools import product
from pathlib import Path

import pandas as pd

In [13]:
# Root folder that holds every input and output CSV (change if needed).
BASE = Path("data")
# Row count used for the reduced candidates sample and as the cap for the
# generated applications and users.
N_ROWS_SMALL = 5
# Columns added (as empty strings) to each table when they are missing.
RECRUITER_NEW_COLS = ["bio", "company_site"]
CANDIDATE_NEW_COLS = ["skills"]

# Input CSVs keyed by logical name; each one lives at BASE/<name>.csv.
FILES = {
    stem: BASE / f"{stem}.csv"
    for stem in (
        "candidates",
        "recruiters",
        "jobs",
        "applications_template",
        "users_template",
    )
}

# Output CSVs. The two "*_updated" entries are the input paths themselves, so
# the normalized tables overwrite recruiters.csv / candidates.csv in place.
OUT = {
    **{
        stem: BASE / f"{stem}.csv"
        for stem in (
            "applications",
            "users",
            "candidates_reduced",
            "recruiters_reduced",
            "jobs_reduced",
        )
    },
    "recruiters_updated": FILES["recruiters"],
    "candidates_updated": FILES["candidates"],
}

# Values cycled through when filling the "status" column of applications.
STATUS_VALUES = ["applied"]

In [14]:
def read_csv_required(path: Path) -> pd.DataFrame:
    """Read a CSV file that must be present on disk.

    Args:
        path: Location of the CSV file.

    Returns:
        The file contents parsed by ``pd.read_csv`` with default options.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    if path.exists():
        return pd.read_csv(path)
    raise FileNotFoundError(f"No existe el archivo requerido: {path}")

def pick_first_matching(colnames, candidates_list):
    """Find the column that matches the highest-priority alias.

    Aliases in ``candidates_list`` are tried in order and compared with the
    entries of ``colnames`` case-insensitively. The column is returned with
    its original spelling. When several column names differ only by case,
    the last one in ``colnames`` is the one that can be returned.

    Returns:
        The matching column name, or ``None`` when no alias matches.
    """
    by_lowercase = {}
    for name in colnames:
        by_lowercase[name.lower()] = name

    hits = (
        by_lowercase[alias.lower()]
        for alias in candidates_list
        if alias.lower() in by_lowercase
    )
    return next(hits, None)

def ensure_columns_with_empty(df: pd.DataFrame, cols):
    """Make sure every name in ``cols`` exists as a column of ``df``.

    Columns that are missing are appended, in the order given, and filled
    with empty strings. Existing columns are left untouched. ``df`` is
    modified in place and the same object is returned.
    """
    absent = [name for name in cols if name not in df.columns]
    for name in absent:
        df[name] = ""
    return df

def cycle_pairs(values_a, values_b, n):
    """Return ``n`` pairs taken cyclically from ``values_a`` x ``values_b``.

    The pairs follow the order of ``itertools.product(values_a, values_b)``
    and wrap around to the first pair once the product is exhausted.

    Args:
        values_a: Iterable providing the first element of each pair.
        values_b: Iterable providing the second element of each pair.
        n: Number of pairs to return; a value <= 0 yields an empty list.

    Returns:
        A list of ``n`` tuples, or an empty list when either input is empty.
    """
    pairs = list(product(values_a, values_b))
    if not pairs:
        # Nothing to cycle through; without this guard the modulo below
        # would raise ZeroDivisionError for any n > 0.
        return []
    total = len(pairs)
    return [pairs[i % total] for i in range(n)]

def generar_password(longitud=10):
    """Return a random alphanumeric password (letters and digits, no symbols).

    Characters are drawn with ``secrets.choice`` so the result comes from the
    operating system's cryptographically secure source rather than the
    predictable generator behind the ``random`` module.

    Args:
        longitud: Number of characters to generate; values <= 0 give ``""``.

    Returns:
        A string of ``longitud`` ASCII letters and digits.
    """
    caracteres = string.ascii_letters + string.digits
    return ''.join(secrets.choice(caracteres) for _ in range(longitud))

def sha256_hex(s: str) -> str:
    """Return the SHA-256 digest of ``s`` (UTF-8 encoded) as a hex string."""
    hasher = hashlib.sha256()
    hasher.update(s.encode("utf-8"))
    return hasher.hexdigest()

In [15]:
# Load every required input table, in the same order as the FILES keys below.
candidates, recruiters, jobs, applications_tmpl, users_tmpl = (
    read_csv_required(FILES[key])
    for key in (
        "candidates",
        "recruiters",
        "jobs",
        "applications_template",
        "users_template",
    )
)

In [16]:
# Row counts of the full candidates and jobs tables, one per line.
print(len(candidates), len(jobs), sep="\n")

962
223


In [5]:
# =========================
# 1) Normalize
# =========================
# Add any missing columns (filled with empty strings) to both tables ...
recruiters = ensure_columns_with_empty(recruiters, RECRUITER_NEW_COLS)
candidates = ensure_columns_with_empty(candidates, CANDIDATE_NEW_COLS)

# ... then persist them. The "*_updated" output paths are the input CSVs,
# so this overwrites recruiters.csv and candidates.csv.
recruiters.to_csv(OUT["recruiters_updated"], index=False)
candidates.to_csv(OUT["candidates_updated"], index=False)

In [6]:
# =========================
# 2) Build the base reduced datasets
# =========================
# First N_ROWS_SMALL candidates and first (up to) three recruiters.
candidates_reduced = candidates.iloc[:N_ROWS_SMALL].copy()
recruiters_reduced = recruiters.head(3).copy()

# Four base jobs plus up to two extra ones when the table is long enough;
# the result is re-indexed from zero.
base_jobs_needed = 4
extra_jobs = max(0, min(2, len(jobs) - base_jobs_needed))
jobs_reduced = jobs.iloc[: base_jobs_needed + extra_jobs].reset_index(drop=True)

In [7]:
# Column in jobs that identifies who posted each job.
posted_by_col = pick_first_matching(
    jobs_reduced.columns,
    ["posted_by", "created_by", "owner_email", "recruiter_email"],
)
if not posted_by_col:
    raise ValueError(f"No se encontró columna 'posted_by' (o equivalente) en jobs. columnas={jobs_reduced.columns.tolist()}")

# Column in recruiters that holds the recruiter's email.
recruiter_email_col = pick_first_matching(
    recruiters.columns,
    ["email", "recruiter_email", "mail"],
)
if not recruiter_email_col:
    raise ValueError(f"No se encontró columna de email en recruiters. columnas={recruiters.columns.tolist()}")

# Distinct, whitespace-trimmed poster values referenced by the reduced jobs.
emails_needed = jobs_reduced[posted_by_col].dropna().astype(str).str.strip().unique().tolist()

# Keep only the recruiters whose (trimmed) email posted one of those jobs.
recruiters_reduced = recruiters.loc[
    recruiters[recruiter_email_col].astype(str).str.strip().isin(emails_needed)
].copy()

# Minimal fallback when nothing matched, so the test dataset is not left empty.
if len(recruiters) > 0 and recruiters_reduced.empty:
    recruiters_reduced = recruiters.head(3).copy()

# ---- candidates_reduced
candidates_reduced = candidates.iloc[:N_ROWS_SMALL].copy()

# ---- Infer the key columns used to relate candidates and jobs
cand_email_col = pick_first_matching(
    candidates_reduced.columns,
    ["candidate_email", "email", "mail"],
)
job_id_col = pick_first_matching(jobs_reduced.columns, ["job_id", "id", "jobid"])

if not cand_email_col:
    raise ValueError(f"No se encontró columna email en candidates. columnas={candidates_reduced.columns.tolist()}")
if not job_id_col:
    raise ValueError(f"No se encontró columna id en jobs. columnas={jobs_reduced.columns.tolist()}")

# Non-null keys as trimmed strings, in row order.
cand_emails = candidates_reduced[cand_email_col].dropna().astype(str).str.strip().tolist()
job_ids = jobs_reduced[job_id_col].dropna().astype(str).str.strip().tolist()

if not cand_emails or not job_ids:
    raise ValueError("No hay suficientes candidates o jobs para crear applications de ejemplo.")

In [8]:
# Build N_ROWS_SMALL sample applications that follow the template's columns.
app_cols = list(applications_tmpl.columns)
apps_rows = []
for i, (email, jid) in enumerate(cycle_pairs(cand_emails, job_ids, N_ROWS_SMALL)):
    # Start with every template column empty, then fill the recognised ones
    # (column names are matched case-insensitively).
    row = {col: "" for col in app_cols}
    for col in app_cols:
        cl = col.lower()
        if cl in ("candidate_email", "email"):
            row[col] = email
        elif cl in ("job_id", "id", "jobid"):
            row[col] = jid
        elif cl == "status":
            row[col] = STATUS_VALUES[i % len(STATUS_VALUES)]
        elif cl in ("created_at", "applied_at", "timestamp"):
            # Timestamp.utcnow() is deprecated in recent pandas releases;
            # now(tz="UTC") gives the same timezone-aware UTC timestamp.
            row[col] = pd.Timestamp.now(tz="UTC").isoformat()
    apps_rows.append(row)

applications_small = pd.DataFrame(apps_rows, columns=app_cols)
applications_small.to_csv(OUT["applications"], index=False)

In [9]:
user_cols = list(users_tmpl.columns)

# If the template has no password-hash column, append one to the output
# columns. The check is case-insensitive, matching how make_user_row
# recognises columns, so a template column such as "Password_Hash" is not
# duplicated.
if not any(col.lower() == "password_hash" for col in user_cols):
    user_cols.append("password_hash")

def make_user_row(email, role, i):
    """Build one user record with a value for every column in ``user_cols``.

    Columns are recognised by their lower-cased name; columns that are not
    recognised are left as empty strings.

    Args:
        email: Value for the email column.
        role: Value for the role column.
        i: Sequence number used to build the placeholder name ``"User <i>"``.

    Returns:
        A dict mapping each column in ``user_cols`` to its value.
    """
    row = {col: "" for col in user_cols}
    for col in user_cols:
        cl = col.lower()
        if cl in ("email", "user_email"):
            row[col] = email
        elif cl in ("role", "user_role"):
            row[col] = role
        elif cl in ("name", "full_name", "username"):
            row[col] = f"User {i}"
        elif cl in ("is_active", "active"):
            row[col] = True
        elif cl in ("created_at", "signup_at", "timestamp"):
            # Timestamp.utcnow() is deprecated in recent pandas releases;
            # now(tz="UTC") gives the same timezone-aware UTC timestamp.
            row[col] = pd.Timestamp.now(tz="UTC").isoformat()
        elif cl == "provider":
            row[col] = "password"
        elif cl == "password_hash":
            # NOTE(review): this stores the generated password itself, not a
            # digest of it -- confirm that is intended for the sample data.
            row[col] = generar_password()
    return row

recr_emails = recruiters_reduced[recruiter_email_col].dropna().astype(str).str.strip().tolist()

# Up to three candidates; recruiters fill the remaining slots up to N_ROWS_SMALL.
cand_for_users = cand_emails[:3]
recr_for_users = recr_emails[:max(0, N_ROWS_SMALL - len(cand_for_users))]

# Candidates first, then recruiters, numbered consecutively from 1.
user_specs = [(email, "candidate") for email in cand_for_users]
user_specs += [(email, "recruiter") for email in recr_for_users]
users_rows = [
    make_user_row(email, role, position)
    for position, (email, role) in enumerate(user_specs, start=1)
]

users_small = pd.DataFrame(users_rows, columns=user_cols).head(N_ROWS_SMALL)
users_small.to_csv(OUT["users"], index=False)

In [10]:
# =========================
# 6) Save the reduced datasets
# =========================
for out_key, reduced_frame in (
    ("candidates_reduced", candidates_reduced),
    ("recruiters_reduced", recruiters_reduced),
    ("jobs_reduced", jobs_reduced),
):
    reduced_frame.to_csv(OUT[out_key], index=False)

# =========================
# 7) Console summary
# =========================
print(
    "[OK] recruiters.csv normalizado con columnas: bio, company_site",
    f"[OK] applications.csv -> {OUT['applications']}",
    f"[OK] users.csv        -> {OUT['users']}",
    f"[OK] candidates_reduced.csv -> {OUT['candidates_reduced']}",
    f"[OK] recruiters_reduced.csv -> {OUT['recruiters_reduced']}",
    f"[OK] jobs_reduced.csv       -> {OUT['jobs_reduced']}",
    sep="\n",
)

[OK] recruiters.csv normalizado con columnas: bio, company_site
[OK] applications.csv -> data\applications.csv
[OK] users.csv        -> data\users.csv
[OK] candidates_reduced.csv -> data\candidates_reduced.csv
[OK] recruiters_reduced.csv -> data\recruiters_reduced.csv
[OK] jobs_reduced.csv       -> data\jobs_reduced.csv
