In [1]:
import pandas as pd
from umap import UMAP

# Load CSV with latin-1 encoding and semicolon separator
df = pd.read_csv("downloads/lic_2020-1.csv", encoding="latin-1", sep=";")

# Identify numeric columns (excluding CodigoExterno)
# Convert columns to numeric where possible, handling comma decimal separators
numeric_columns = []
for col in df.columns:
    if col in (
        "CodigoExterno",
        "Codigo",
        "CodigoEstado",
        "EstadoEtapas",
        "CodigoUnidad",
        "Informada",
        "EsBaseTipo",
        "ValorTiempoRenovacion",
        "EsRenovable",
        "Codigoitem",
        "CodigoProductoONU",
        "CodigoSucursalProveedor",
        "Correlativo",
    ):
        continue
    # Try to convert to numeric, handling comma decimal separators
    # Replace comma with dot for decimal separator
    test_series = df[col].astype(str).str.replace(",", ".", regex=False)
    numeric_series = pd.to_numeric(test_series, errors="coerce")
    # Check if column is numeric (has valid numeric values and not all NaN)
    if numeric_series.notna().any():
        # Check if the column is actually numeric (most values are numeric)
        non_null_count = numeric_series.notna().sum()
        total_count = len(numeric_series)
        # Consider it numeric if at least 50% of values are numeric
        if non_null_count / total_count >= 0.5:
            numeric_columns.append(col)

# Prepare numeric data for UMAP
numeric_data = df[numeric_columns].copy()
# Convert to numeric, handling comma decimal separators
for col in numeric_columns:
    # Replace comma with dot for decimal separator, then convert to numeric
    numeric_data[col] = numeric_data[col].astype(str).str.replace(",", ".", regex=False)
    numeric_data[col] = pd.to_numeric(numeric_data[col], errors="coerce")
# Fill missing values with 0 (or could use median/mean)
numeric_data = numeric_data.fillna(0)
# Ensure all values are float (not object/string)
numeric_data = numeric_data.astype(float).drop_duplicates()

  from .autonotebook import tqdm as notebook_tqdm
  df = pd.read_csv("downloads/lic_2020-1.csv", encoding="latin-1", sep=";")


In [35]:
col_uniques = []
for column in numeric_data.columns:
    col_uniques.append((column, numeric_data[column].nunique()))

sorted(col_uniques, key=lambda x: x[1], reverse=True)

[('Valor Total Ofertado', 62251),
 ('MontoUnitarioOferta', 43954),
 ('MontoLineaAdjudica', 19674),
 ('CodigoProveedor', 11895),
 ('Monto Estimado Adjudicado', 6857),
 ('MontoEstimado', 3463),
 ('NumeroAprobacion', 2047),
 ('CantidadAdjudicada', 1306),
 ('Cantidad Ofertada', 1183),
 ('Cantidad', 1181),
 ('CodigoOrganismo', 724),
 ('CantidadReclamos', 407),
 ('TiempoDuracionContrato', 89),
 ('NumeroOferentes', 62),
 ('FechaTiempoEvaluacion', 37),
 ('CodigoEstadoLicitacion', 10),
 ('FechasUsuario', 8),
 ('TipoAprobacion', 7),
 ('UnidadTiempoDuracionContrato', 5),
 ('Estimacion', 4),
 ('TipoPago', 4),
 ('EstadoCS', 3),
 ('Contrato', 3),
 ('UnidadTiempoContratoLicitacion', 3),
 ('CodigoTipo', 2),
 ('TipoConvocatoria', 2),
 ('Etapas', 2),
 ('TomaRazon', 2),
 ('EstadoPublicidadOfertas', 2),
 ('Obras', 2),
 ('VisibilidadMonto', 2),
 ('SubContratacion', 2),
 ('ExtensionPlazo', 2)]

In [4]:
numeric_data_only_awards = numeric_data[numeric_data["CantidadAdjudicada"] > 0].drop_duplicates()
df_awards = df.iloc[numeric_data_only_awards.index].copy()

In [8]:
def format_text_for_embedding(row):
    return "\n\n".join(
        [
            (row["Nombre"] if not pd.isna(row["Nombre"]) else ""),
            (row["Descripcion"] if not pd.isna(row["Descripcion"]) else ""),
            (
                row["Nombre producto genrico"]
                if not pd.isna(row["Nombre producto genrico"])
                else ""
            ),
            (
                row["Descripcion linea Adquisicion"]
                if not pd.isna(row["Descripcion linea Adquisicion"])
                else ""
            ),
            (
                row["DescripcionProveedor"]
                if not pd.isna(row["DescripcionProveedor"])
                else ""
            ),
        ]
    )


df_awards.loc[:, "compiled_text"] = df_awards.apply(format_text_for_embedding, axis=1)
df_awards.loc[:, "supplier_rut"] = df_awards["RutProveedor"].map(
    lambda x: x.split("-")[0].replace(".", "")
)

In [None]:
# Compute text embeddings using SentenceTransformer
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Get compiled_text for awards data (matching the indices)
texts = df_awards["compiled_text"].fillna("").tolist()

# Compute embeddings efficiently in batches
print(f"Computing embeddings for {len(texts)} texts...")
text_embeddings = model.encode(
    texts, batch_size=128, show_progress_bar=True, convert_to_numpy=True
)

print(f"Text embeddings shape: {text_embeddings.shape}")
print(f"Embedding dimension: {text_embeddings.shape[1]}")


Computing embeddings for 42874 texts...


Batches: 100%|██████████| 335/335 [00:11<00:00, 28.37it/s]


Text embeddings shape: (42874, 384)
Embedding dimension: 384


In [7]:
# Concatenate text embeddings with numeric columns
# Ensure numeric_data_only_awards is aligned with text_embeddings
numeric_array = numeric_data_only_awards.values.astype(np.float32)

# Check for and handle infinite values
if np.any(np.isinf(numeric_array)):
    print("Warning: Found infinite values, replacing with NaN")
    numeric_array = np.where(np.isinf(numeric_array), np.nan, numeric_array)

# Replace any remaining NaN with 0
numeric_array = np.nan_to_num(numeric_array, nan=0.0, posinf=0.0, neginf=0.0)

# Concatenate text embeddings (text_embeddings) with numeric columns (numeric_array)
# Result: (n_samples, embedding_dim + n_numeric_features)
combined_features = np.concatenate([text_embeddings, numeric_array], axis=1)

print(f"Combined features shape: {combined_features.shape}")
print(f"  - Text embedding dimension: {text_embeddings.shape[1]}")
print(f"  - Numeric columns dimension: {numeric_array.shape[1]}")
print(f"  - Total dimension: {combined_features.shape[1]}")

# Check for duplicate rows (can cause issues with nearest neighbor search)
# Add tiny random noise to duplicate rows to make them unique
unique_rows, unique_indices, inverse_indices = np.unique(
    combined_features, axis=0, return_index=True, return_inverse=True
)
if len(unique_rows) < len(combined_features):
    print(
        f"Warning: Found {len(combined_features) - len(unique_rows)} duplicate rows, adding small noise"
    )
    # Add very small random noise to make duplicates unique
    np.random.seed(42)
    noise = np.random.normal(0, 1e-8, combined_features.shape).astype(np.float32)
    combined_features = combined_features + noise

# Apply Dimensionality Reduction to the combined features
reducer = UMAP(n_components=2, random_state=42)
umap_embedding = reducer.fit_transform(combined_features)

Combined features shape: (42874, 417)
  - Text embedding dimension: 384
  - Numeric columns dimension: 33
  - Total dimension: 417


  warn(


In [9]:
df_act = pd.read_csv("downloads/PUB_EMPRESAS_PJ_2020_A_2024.txt", sep="\t")
# Año comercial	RUT	DV	Razón social	Tramo según ventas
# Número de trabajadores dependie	Fecha inicio de actividades vige	Fecha término de giro
# Fecha primera inscripción de ac	Tipo término de giro
# Tipo de contribuyente	Subtipo de contribuyente	Tramo capital propio positivo	Tramo capital propio negativo
# Rubro económico	Subrubro económico	Actividad económica
# Región	Provincia	Comuna
# R_PRESUNTA	OTROS_REGIMENES
df_act.columns = [
    "fiscal_year",
    "rut",
    "dv",
    "company_name",
    "sales_bracket",
    "num_employees",
    "current_activity_start_date",
    "activity_end_date",
    "first_registration_date",
    "activity_end_type",
    "contributor_type",
    "contributor_subtype",
    "positive_equity_bracket",
    "negative_equity_bracket",
    "economic_sector",
    "economic_subsector",
    "economic_activity",
    "region",
    "province",
    "commune",
    "presumed_income",
    "other_regimes",
]
df_act.first_registration_date = pd.to_datetime(
    df_act.first_registration_date, errors="coerce"
)
df_act.current_activity_start_date = pd.to_datetime(
    df_act.current_activity_start_date, errors="coerce"
)
df_act.activity_end_date = pd.to_datetime(df_act.activity_end_date, errors="coerce")

rut_to_registration_date = {str(d["rut"]): d["first_registration_date"] for d in df_act[["rut", "first_registration_date"]].to_dict(orient="records")}
df_awards.loc[:, "first_activity_date"] = df_awards["supplier_rut"].map(rut_to_registration_date.get)

  df_act = pd.read_csv("downloads/PUB_EMPRESAS_PJ_2020_A_2024.txt", sep="\t")


In [10]:
# Create final DataFrame with CodigoExterno, numeric columns, and UMAP x, y
result_df = pd.DataFrame()
result_df["CodigoExterno"] = df_awards["CodigoExterno"]
result_df["tender_name"] = df_awards["Nombre"]
result_df["supplier_name"] = df_awards["RazonSocialProveedor"]
result_df["supplier_rut"] = df_awards["supplier_rut"]
result_df["first_activity_date"] = pd.to_datetime(df_awards["first_activity_date"])

for col in df_awards.columns:
    if col.startswith("Fecha"):
        result_df[col] = pd.to_datetime(df_awards[col])
# Add all numeric columns
for col in numeric_columns:
    result_df[col] = numeric_data_only_awards[col]
# Add UMAP x and y columns
result_df["x"] = umap_embedding[:, 0]
result_df["y"] = umap_embedding[:, 1]
result_df.reset_index(drop=True, inplace=True)
# Display result
print(f"Shape: {result_df.shape}")
print(f"Numeric columns found: {len(numeric_columns)}")
print(f"Columns: {list(result_df.columns[:5])}... (showing first 5)")
result_df.head()

Shape: (42874, 56)
Numeric columns found: 33
Columns: ['CodigoExterno', 'tender_name', 'supplier_name', 'supplier_rut', 'first_activity_date']... (showing first 5)


Unnamed: 0,CodigoExterno,tender_name,supplier_name,supplier_rut,first_activity_date,FechaCreacion,FechaCierre,FechaInicio,FechaFinal,FechaPubRespuestas,...,Cantidad,CodigoProveedor,Monto Estimado Adjudicado,Cantidad Ofertada,MontoUnitarioOferta,Valor Total Ofertado,CantidadAdjudicada,MontoLineaAdjudica,x,y
0,812030-5-LQ19,SUMINISTRO DE INMUNOGLOBULINA,GRIFOLS CHILE S A,96582310,1993-01-01,2019-01-18,2020-01-31,2020-01-21,2020-01-22,2020-01-24,...,10.0,54808.0,223973946.0,10.0,159200.0,1592000.0,10.0,1592000.0,14.278939,17.310768
1,812030-5-LQ19,SUMINISTRO DE INMUNOGLOBULINA,GRIFOLS CHILE S A,96582310,1993-01-01,2019-01-18,2020-01-31,2020-01-21,2020-01-22,2020-01-24,...,60.0,54808.0,223973946.0,60.0,322010.0,19320600.0,60.0,19320600.0,-4.153696,-7.208119
2,2258-58-LE19,Licitacion de BACK UP para laboratorio,BIOMERIEUX CHILE SPA,96659920,1993-01-01,2019-03-11,2020-01-20,2020-01-10,2020-01-15,2020-01-17,...,1.0,33139.0,20954265.0,1.0,1750.0,1750.0,5000.0,8750000.0,-1.487097,-0.097432
3,2258-58-LE19,Licitacion de BACK UP para laboratorio,QUORUX CHILE SPA,76131142,2011-01-26,2019-03-11,2020-01-20,2020-01-10,2020-01-15,2020-01-17,...,1.0,1333599.0,20954265.0,1.0,660.0,660.0,2400.0,1584000.0,-1.616803,0.211644
4,2258-58-LE19,Licitacion de BACK UP para laboratorio,QUORUX CHILE SPA,76131142,2011-01-26,2019-03-11,2020-01-20,2020-01-10,2020-01-15,2020-01-17,...,1.0,1333599.0,20954265.0,1.0,1100.0,1100.0,2400.0,2640000.0,-1.575541,0.177008


In [11]:
result_df.to_parquet('downloads/lic_2020-1_umap.parquet')

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm import tqdm
from glob import glob
from typing import Tuple
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing as mp
from sklearn.preprocessing import StandardScaler

df_act = pd.read_csv("downloads/PUB_EMPRESAS_PJ_2020_A_2024.txt", sep="\t")
# Año comercial	RUT	DV	Razón social	Tramo según ventas
# Número de trabajadores dependie	Fecha inicio de actividades vige	Fecha término de giro
# Fecha primera inscripción de ac	Tipo término de giro
# Tipo de contribuyente	Subtipo de contribuyente	Tramo capital propio positivo	Tramo capital propio negativo
# Rubro económico	Subrubro económico	Actividad económica
# Región	Provincia	Comuna
# R_PRESUNTA	OTROS_REGIMENES
df_act.columns = [
    "fiscal_year",
    "rut",
    "dv",
    "company_name",
    "sales_bracket",
    "num_employees",
    "current_activity_start_date",
    "activity_end_date",
    "first_registration_date",
    "activity_end_type",
    "contributor_type",
    "contributor_subtype",
    "positive_equity_bracket",
    "negative_equity_bracket",
    "economic_sector",
    "economic_subsector",
    "economic_activity",
    "region",
    "province",
    "commune",
    "presumed_income",
    "other_regimes",
]
# Note: We'll use safe_to_datetime for dates in the processing function
# For df_act, we use errors="coerce" and let safe_to_datetime handle invalid dates later
df_act.first_registration_date = pd.to_datetime(
    df_act.first_registration_date, errors="coerce"
)
df_act.current_activity_start_date = pd.to_datetime(
    df_act.current_activity_start_date, errors="coerce"
)
df_act.activity_end_date = pd.to_datetime(df_act.activity_end_date, errors="coerce")

rut_to_registration_date = {
    str(d["rut"]): d["first_registration_date"]
    for d in df_act[["rut", "first_registration_date"]].to_dict(orient="records")
}

# Model will be initialized in each worker process
_model_cache = None


def get_model():
    """Lazy load model (one per process)."""
    global _model_cache
    if _model_cache is None:
        _model_cache = SentenceTransformer("all-MiniLM-L6-v2")
    return _model_cache


def safe_to_datetime(series, fallback_date="1900-01-01"):
    """
    Safely convert a series to datetime, using fallback_date for invalid dates.
    Handles out-of-bounds dates and other parsing errors.
    Vectorized version for better performance.
    """
    try:
        from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime
    except ImportError:
        # OutOfBoundsDatetime might not be available in all pandas versions
        OutOfBoundsDatetime = ValueError

    fallback = pd.Timestamp(fallback_date)
    min_date = pd.Timestamp("1677-09-21")
    max_date = pd.Timestamp("2262-04-11")

    # Vectorized conversion: try to convert entire series at once
    try:
        result = pd.to_datetime(series, errors="coerce")
    except (ValueError, OverflowError, OutOfBoundsDatetime):
        # If vectorized conversion fails due to OutOfBoundsDatetime,
        # fall back to element-wise for problematic values
        # First, try to convert what we can
        result = pd.Series(index=series.index, dtype="datetime64[ns]")
        # Identify problematic indices by trying to convert each value
        problematic_indices = []
        for idx, val in series.items():
            try:
                if pd.isna(val):
                    result.loc[idx] = pd.NaT
                else:
                    parsed = pd.to_datetime(val, errors="coerce")
                    result.loc[idx] = parsed
            except (ValueError, OverflowError, OutOfBoundsDatetime):
                problematic_indices.append(idx)
                result.loc[idx] = pd.NaT
        
        # For problematic indices, use fallback
        if problematic_indices:
            result.loc[problematic_indices] = fallback

    # Replace NaT (invalid/unparseable dates) with fallback
    result = result.fillna(fallback)
    
    # Replace out-of-bounds dates with fallback using vectorized boolean indexing
    out_of_bounds = (result < min_date) | (result > max_date)
    result.loc[out_of_bounds] = fallback
    
    return result


def format_text_for_embedding(row):
    return "\n\n".join(
        [
            (row["Nombre"] if not pd.isna(row["Nombre"]) else ""),
            (row["Descripcion"] if not pd.isna(row["Descripcion"]) else ""),
            (
                row["Nombre producto genrico"]
                if not pd.isna(row["Nombre producto genrico"])
                else ""
            ),
            (
                row["Descripcion linea Adquisicion"]
                if not pd.isna(row["Descripcion linea Adquisicion"])
                else ""
            ),
            (
                row["DescripcionProveedor"]
                if not pd.isna(row["DescripcionProveedor"])
                else ""
            ),
        ]
    )


def identify_numeric_columns(df):
    """Identify numeric columns in a dataframe."""
    numeric_columns = []
    for col in df.columns:
        if col in (
            "CodigoExterno",
            "Codigo",
            "CodigoEstado",
            "EstadoEtapas",
            "CodigoUnidad",
            "Informada",
            "EsBaseTipo",
            "ValorTiempoRenovacion",
            "EsRenovable",
            "Codigoitem",
            "CodigoProductoONU",
            "CodigoSucursalProveedor",
            "Correlativo",
        ):
            continue
        # Try to convert to numeric, handling comma decimal separators
        # Replace comma with dot for decimal separator
        test_series = df[col].astype(str).str.replace(",", ".", regex=False)
        numeric_series = pd.to_numeric(test_series, errors="coerce")
        # Check if column is numeric (has valid numeric values and not all NaN)
        if numeric_series.notna().any():
            # Check if the column is actually numeric (most values are numeric)
            non_null_count = numeric_series.notna().sum()
            total_count = len(numeric_series)
            # Consider it numeric if at least 50% of values are numeric
            if non_null_count / total_count >= 0.5:
                numeric_columns.append(col)
    return numeric_columns


def get_all_numeric_columns(csv_files):
    """Scan all CSV files to identify the union of all numeric columns."""
    all_numeric_columns = set()
    print("Scanning all CSV files to identify numeric columns...")
    for file_path in tqdm(csv_files, desc="Scanning files"):
        try:
            df = pd.read_csv(
                file_path, encoding="latin-1", sep=";", nrows=1000
            )  # Sample first 1000 rows for speed
            numeric_cols = identify_numeric_columns(df)
            all_numeric_columns.update(numeric_cols)
        except Exception as e:
            print(f"Warning: Error scanning {file_path}: {e}")
            continue
    # Convert to sorted list for consistency
    numeric_columns_list = sorted(list(all_numeric_columns))
    print(f"Found {len(numeric_columns_list)} numeric columns across all files")
    return numeric_columns_list


def process_and_embed_one_file(
    file_path, numeric_columns: list, rut_to_registration_date: dict
) -> Tuple[pd.DataFrame, np.ndarray]:
    """Process a single file and return result_df and combined_features."""
    print(f"Processing file: {file_path}")
    # Load model in this process
    model = get_model()
    # Load CSV with latin-1 encoding and semicolon separator
    df = pd.read_csv(file_path, encoding="latin-1", sep=";")

    # Prepare numeric data for UMAP
    # Ensure all required numeric columns exist, fill missing ones with 0
    numeric_data = pd.DataFrame(index=df.index)
    for col in numeric_columns:
        if col in df.columns:
            numeric_data[col] = df[col]
        else:
            numeric_data[col] = 0

    # Convert to numeric, handling comma decimal separators
    for col in numeric_columns:
        # Replace comma with dot for decimal separator, then convert to numeric
        numeric_data[col] = (
            numeric_data[col].astype(str).str.replace(",", ".", regex=False)
        )
        numeric_data[col] = pd.to_numeric(numeric_data[col], errors="coerce")
    # Fill missing values with 0 (or could use median/mean)
    numeric_data = numeric_data.fillna(0)
    # Ensure all values are float (not object/string)
    numeric_data = numeric_data.astype(float).drop_duplicates()

    # Filter for awards (CantidadAdjudicada > 0)
    if "CantidadAdjudicada" not in numeric_data.columns:
        print(
            f"Warning: CantidadAdjudicada not found in numeric columns for {file_path}, skipping awards filter"
        )
        numeric_data_only_awards = numeric_data.drop_duplicates()
    else:
        numeric_data_only_awards = numeric_data[
            numeric_data["CantidadAdjudicada"] > 0
        ].drop_duplicates()
    df_awards = df.iloc[numeric_data_only_awards.index].copy()

    df_awards.loc[:, "compiled_text"] = df_awards.apply(
        format_text_for_embedding, axis=1
    )
    df_awards.loc[:, "supplier_rut"] = df_awards["RutProveedor"].map(
        lambda x: x.split("-")[0].replace(".", "")
    )

    # Get compiled_text for awards data (matching the indices)
    texts = df_awards["compiled_text"].fillna("").tolist()

    # Compute embeddings efficiently in batches
    print(f"Computing embeddings for {len(texts)} texts in {file_path}...")
    text_embeddings = model.encode(
        texts, batch_size=16, show_progress_bar=False, convert_to_numpy=True
    )

    print(f"Text embeddings shape: {text_embeddings.shape}")
    print(f"Embedding dimension: {text_embeddings.shape[1]}")

    # Concatenate text embeddings with numeric columns
    # Ensure numeric_data_only_awards is aligned with text_embeddings
    numeric_array = numeric_data_only_awards.values.astype(np.float32)

    # Check for and handle infinite values
    if np.any(np.isinf(numeric_array)):
        print("Warning: Found infinite values, replacing with NaN")
        numeric_array = np.where(np.isinf(numeric_array), np.nan, numeric_array)

    # Replace any remaining NaN with 0
    numeric_array = np.nan_to_num(numeric_array, nan=0.0, posinf=0.0, neginf=0.0)

    # Concatenate text embeddings (text_embeddings) with numeric columns (numeric_array)
    # Result: (n_samples, embedding_dim + n_numeric_features)
    combined_features = np.concatenate([text_embeddings, numeric_array], axis=1)

    print(f"Combined features shape: {combined_features.shape}")
    print(f"  - Text embedding dimension: {text_embeddings.shape[1]}")
    print(f"  - Numeric columns dimension: {numeric_array.shape[1]}")
    print(f"  - Total dimension: {combined_features.shape[1]}")

    df_awards.loc[:, "first_activity_date"] = df_awards["supplier_rut"].map(
        rut_to_registration_date.get
    )

    # Create final DataFrame with CodigoExterno, numeric columns (without UMAP x, y)
    result_df = pd.DataFrame()
    result_df["CodigoExterno"] = df_awards["CodigoExterno"]
    result_df["tender_name"] = df_awards["Nombre"]
    result_df["supplier_name"] = df_awards["RazonSocialProveedor"]
    result_df["supplier_rut"] = df_awards["supplier_rut"]
    result_df["first_activity_date"] = safe_to_datetime(
        df_awards["first_activity_date"]
    )

    for col in df_awards.columns:
        if col.startswith("Fecha"):
            result_df[col] = safe_to_datetime(df_awards[col])
    # Add all numeric columns
    for col in numeric_columns:
        result_df[col] = numeric_data_only_awards[col]
    result_df.reset_index(drop=True, inplace=True)
    # Display result
    print(f"Shape: {result_df.shape}")
    print(f"Numeric columns found: {len(numeric_columns)}")
    print(f"Columns: {list(result_df.columns[:5])}... (showing first 5)")

    return result_df, combined_features


  from .autonotebook import tqdm as notebook_tqdm
  df_act = pd.read_csv("downloads/PUB_EMPRESAS_PJ_2020_A_2024.txt", sep="\t")


In [5]:
# Find all CSV files matching the pattern
csv_files = sorted(glob("downloads/lic_*.csv"))[:8]
if not csv_files:
    print("No CSV files found matching downloads/lic_*.csv")
    exit(1)

print(f"Found {len(csv_files)} CSV files to process")

# First pass: identify all numeric columns across all files
all_numeric_columns = get_all_numeric_columns(csv_files)

# Process each file in parallel and collect results
all_result_dfs = []
all_combined_features = []

print("\nProcessing files and computing embeddings in parallel...")
max_workers = min(24, len(csv_files), mp.cpu_count())
print(f"Using {max_workers} worker processes")

with ProcessPoolExecutor(max_workers=max_workers) as executor:
    # Submit all tasks
    future_to_file = {
        executor.submit(
            process_and_embed_one_file,
            file_path,
            all_numeric_columns,
            rut_to_registration_date,
        ): file_path
        for file_path in csv_files
    }

    # Collect results as they complete
    for future in tqdm(
        as_completed(future_to_file), total=len(csv_files), desc="Processing files"
    ):
        file_path = future_to_file[future]
        try:
            result_df, combined_features = future.result()
            all_result_dfs.append(result_df)
            all_combined_features.append(combined_features)
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            import traceback

            traceback.print_exc()
            continue

if not all_result_dfs:
    print("No files were successfully processed")
    exit(1)

# Concatenate all result DataFrames
print("\nConcatenating all result DataFrames...")
final_result_df = pd.concat(all_result_dfs, ignore_index=True)
print(f"Final result DataFrame shape: {final_result_df.shape}")

# Concatenate all combined features
print("\nConcatenating all combined features...")
all_combined_features_array = np.concatenate(all_combined_features, axis=0)
print(f"Combined features array shape: {all_combined_features_array.shape}")

# Apply global normalization to numeric features only (text embeddings are already normalized)
# Split features: text embeddings (first 384 cols) and numeric features (last 33 cols)
text_embedding_dim = 384
numeric_feature_dim = all_combined_features_array.shape[1] - text_embedding_dim

print(f"\nApplying global normalization to numeric features...")
print(f"  - Text embedding columns: 0-{text_embedding_dim-1} (keeping as-is)")
print(f"  - Numeric feature columns: {text_embedding_dim}-{all_combined_features_array.shape[1]-1} (normalizing)")

# Extract numeric features (last numeric_feature_dim columns)
numeric_features = all_combined_features_array[:, text_embedding_dim:]

# Normalize numeric features globally
scaler = StandardScaler()
numeric_features_normalized = scaler.fit_transform(numeric_features).astype(np.float32)

# Reconstruct combined features with normalized numeric part
all_combined_features_array = np.concatenate(
    [all_combined_features_array[:, :text_embedding_dim], numeric_features_normalized], 
    axis=1
)
print(f"Global normalization complete. Feature statistics:")
print(f"  - Text embeddings: mean={all_combined_features_array[:, :text_embedding_dim].mean():.4f}, std={all_combined_features_array[:, :text_embedding_dim].std():.4f}")
print(f"  - Numeric features: mean={numeric_features_normalized.mean():.4f}, std={numeric_features_normalized.std():.4f}")

# Check for duplicate rows (can cause issues with nearest neighbor search)
# Add tiny random noise to duplicate rows to make them unique
print("\nChecking for duplicate rows...")
unique_rows, unique_indices, inverse_indices = np.unique(
    all_combined_features_array, axis=0, return_index=True, return_inverse=True
)
if len(unique_rows) < len(all_combined_features_array):
    print(
        f"Warning: Found {len(all_combined_features_array) - len(unique_rows)} duplicate rows, adding small noise"
    )
    # Add very small random noise to make duplicates unique
    np.random.seed(42)
    noise = np.random.normal(0, 1e-8, all_combined_features_array.shape).astype(
        np.float32
    )
    all_combined_features_array = all_combined_features_array + noise


Found 8 CSV files to process
Scanning all CSV files to identify numeric columns...


Scanning files: 100%|██████████| 8/8 [00:00<00:00, 16.09it/s]

Found 35 numeric columns across all files

Processing files and computing embeddings in parallel...
Using 8 worker processes



Processing files:   0%|          | 0/8 [00:00<?, ?it/s]

Processing file: downloads/lic_2020-1.csv
Processing file: downloads/lic_2020-10.csv
Processing file: downloads/lic_2020-11.csv
Processing file: downloads/lic_2020-12.csv


  df = pd.read_csv(file_path, encoding="latin-1", sep=";")


Processing file: downloads/lic_2020-2.csv


  df = pd.read_csv(file_path, encoding="latin-1", sep=";")


Processing file: downloads/lic_2020-3.csv


  df = pd.read_csv(file_path, encoding="latin-1", sep=";")


Computing embeddings for 28562 texts in downloads/lic_2020-10.csv...


  df = pd.read_csv(file_path, encoding="latin-1", sep=";")


Processing file: downloads/lic_2020-4.csv
Computing embeddings for 42874 texts in downloads/lic_2020-1.csv...
Computing embeddings for 30787 texts in downloads/lic_2020-11.csv...
Processing file: downloads/lic_2020-5.csv
Computing embeddings for 23751 texts in downloads/lic_2020-12.csv...


  df = pd.read_csv(file_path, encoding="latin-1", sep=";")
  df = pd.read_csv(file_path, encoding="latin-1", sep=";")
  df = pd.read_csv(file_path, encoding="latin-1", sep=";")


Computing embeddings for 35939 texts in downloads/lic_2020-2.csv...


  df = pd.read_csv(file_path, encoding="latin-1", sep=";")


Computing embeddings for 32625 texts in downloads/lic_2020-3.csv...
Computing embeddings for 23944 texts in downloads/lic_2020-4.csv...
Computing embeddings for 24433 texts in downloads/lic_2020-5.csv...
Text embeddings shape: (23751, 384)
Embedding dimension: 384
Combined features shape: (23751, 419)
  - Text embedding dimension: 384
  - Numeric columns dimension: 35
  - Total dimension: 419
Shape: (23751, 56)
Numeric columns found: 35
Columns: ['CodigoExterno', 'tender_name', 'supplier_name', 'supplier_rut', 'first_activity_date']... (showing first 5)
Text embeddings shape: (28562, 384)
Embedding dimension: 384
Combined features shape: (28562, 419)
  - Text embedding dimension: 384
  - Numeric columns dimension: 35
  - Total dimension: 419


Processing files:  12%|█▎        | 1/8 [00:52<06:04, 52.12s/it]

Shape: (28562, 56)
Numeric columns found: 35
Columns: ['CodigoExterno', 'tender_name', 'supplier_name', 'supplier_rut', 'first_activity_date']... (showing first 5)


Processing files:  25%|██▌       | 2/8 [00:52<02:09, 21.62s/it]

Text embeddings shape: (23944, 384)
Embedding dimension: 384
Combined features shape: (23944, 419)
  - Text embedding dimension: 384
  - Numeric columns dimension: 35
  - Total dimension: 419
Shape: (23944, 56)
Numeric columns found: 35
Columns: ['CodigoExterno', 'tender_name', 'supplier_name', 'supplier_rut', 'first_activity_date']... (showing first 5)


Processing files:  38%|███▊      | 3/8 [00:56<01:09, 13.84s/it]

Text embeddings shape: (30787, 384)
Embedding dimension: 384
Combined features shape: (30787, 419)
  - Text embedding dimension: 384
  - Numeric columns dimension: 35
  - Total dimension: 419
Text embeddings shape: (24433, 384)
Embedding dimension: 384
Combined features shape: (24433, 419)
  - Text embedding dimension: 384
  - Numeric columns dimension: 35
  - Total dimension: 419
Shape: (30787, 56)
Numeric columns found: 35
Columns: ['CodigoExterno', 'tender_name', 'supplier_name', 'supplier_rut', 'first_activity_date']... (showing first 5)
Shape: (24433, 56)
Numeric columns found: 35
Columns: ['CodigoExterno', 'tender_name', 'supplier_name', 'supplier_rut', 'first_activity_date']... (showing first 5)


Processing files:  50%|█████     | 4/8 [00:57<00:34,  8.71s/it]

Text embeddings shape: (32625, 384)
Embedding dimension: 384
Combined features shape: (32625, 419)
  - Text embedding dimension: 384
  - Numeric columns dimension: 35
  - Total dimension: 419
Shape: (32625, 56)
Numeric columns found: 35
Columns: ['CodigoExterno', 'tender_name', 'supplier_name', 'supplier_rut', 'first_activity_date']... (showing first 5)


Processing files:  75%|███████▌  | 6/8 [01:01<00:10,  5.08s/it]

Text embeddings shape: (35939, 384)
Embedding dimension: 384
Combined features shape: (35939, 419)
  - Text embedding dimension: 384
  - Numeric columns dimension: 35
  - Total dimension: 419
Shape: (35939, 56)
Numeric columns found: 35
Columns: ['CodigoExterno', 'tender_name', 'supplier_name', 'supplier_rut', 'first_activity_date']... (showing first 5)
Text embeddings shape: (42874, 384)
Embedding dimension: 384
Combined features shape: (42874, 419)
  - Text embedding dimension: 384
  - Numeric columns dimension: 35
  - Total dimension: 419
Shape: (42874, 56)


Processing files:  88%|████████▊ | 7/8 [01:02<00:03,  3.92s/it]

Numeric columns found: 35
Columns: ['CodigoExterno', 'tender_name', 'supplier_name', 'supplier_rut', 'first_activity_date']... (showing first 5)


Processing files: 100%|██████████| 8/8 [01:02<00:00,  7.83s/it]



Concatenating all result DataFrames...
Final result DataFrame shape: (242915, 56)

Concatenating all combined features...
Combined features array shape: (242915, 419)

Applying global normalization to numeric features...
  - Text embedding columns: 0-383 (keeping as-is)
  - Numeric feature columns: 384-418 (normalizing)
Global normalization complete. Feature statistics:
  - Text embeddings: mean=-0.0024, std=0.0510
  - Numeric features: mean=-0.0000, std=1.0000

Checking for duplicate rows...


In [6]:
from openTSNE import TSNE
X_embedded = TSNE(n_components=2, perplexity=15, n_jobs=-1).fit(all_combined_features_array)

In [None]:
import torch

# Apply Dimensionality Reduction to the combined features (once for all data)
print("\nComputing UMAP on all combined features...")
reducer = UMAP(n_components=2, random_state=42, device="cuda")
reduced_embedding = reducer.fit_transform(
    torch.from_numpy(all_combined_features_array).to(device="cuda")
)

# Add UMAP x and y columns to the final result DataFrame
final_result_df["x"] = reduced_embedding[:, 0]
final_result_df["y"] = reduced_embedding[:, 1]

# Save final output
output_path = "downloads/all_months_umap_gpu.parquet"
print(f"\nSaving final result to {output_path}...")
final_result_df.to_parquet(output_path)

print(f"\nCompleted! Final dataset shape: {final_result_df.shape}")
print(f"Columns: {list(final_result_df.columns)}")



Computing UMAP on all combined features...




RuntimeError: bincount only supports 1-d non-negative integral inputs.