https://www.kaggle.com/competitions/aeroclub-recsys-2025/overview

<h2>Summary for Preprocessing & Imputation:</h2>

- I checked which columns had missing values and calculated the percentage of missing data for each.

- Where the percentage was very low, I applied simple rules (e.g., replacing with the median, replacing with random values to preserve the distribution, etc.).

- Where the percentage was high, I decided to impute the missing values with a model. I compared several techniques by masking known values, imputing them, and scoring the predictions against the true values; XGBClassifier/XGBRegressor produced very good metrics (96%–99% accuracy, F1 score).

- For each column, I identified the most correlated/informative columns and used them to train an XGBClassifier/XGBRegressor model to impute the missing values.

- For the timestamp columns (arrivalAt, departureAt), I parsed the values as datetimes and extracted more informative features such as hour, day, month, and minute.

- For the column that determines the flight duration, I directly converted the format into minutes to make it easier to use later.

- I removed the column legs1_segments0_flightNumber because it contains over 5,000 unique classes and is very memory-intensive for models like XGBClassifier.

- I also removed all columns with more than 78% missing values.

Considerations:

- XGBClassifier/XGBRegressor are not very sensitive to how numerically coded categorical columns are encoded. However, if I were to repeat the process, I would also encode the columns used as predictors for imputation, in order to take advantage of the additional 1–2% performance improvement.

# **Preprocessing & Imputation 1**

In [None]:
%%writefile nan_report.py
import pandas as pd
import os

# Function to load a file based on its extension
def load_file(filename):
    """Read a tabular file into a DataFrame, choosing the reader by extension.

    ``.csv`` files are read with ``low_memory=False``; ``.parquet`` files are
    read with ``pd.read_parquet``. The extension check is case-insensitive.

    Raises:
        ValueError: if the extension is neither ``.csv`` nor ``.parquet``.
    """
    _, suffix = os.path.splitext(filename)
    suffix = suffix.lower()

    if suffix == '.parquet':
        return pd.read_parquet(filename)
    if suffix == '.csv':
        return pd.read_csv(filename, low_memory=False)
    raise ValueError(f"Unsupported file extension: {suffix}")

# Function that returns a string with NaN percentages
def get_nan_report(df, name):
    """Build a text report of the percentage of NaN values per column.

    Only columns with at least one missing value are listed, ordered from
    the highest missing percentage to the lowest. ``name`` labels the
    dataset in the header line.
    """
    missing_share = df.isna().mean() * 100
    affected = missing_share[missing_share > 0]
    ranked = affected.sort_values(ascending=False)
    return f"NaN percentage in {name}:\n" + ranked.to_string()

# Inputs to inspect (point these at the .csv files instead if that is what is on disk)
train = load_file('train.parquet')
test = load_file('test.parquet')

# One report section per dataset
train_report = get_nan_report(train, 'train')
test_report = get_nan_report(test, 'test')

# Persist both sections, separated by a blank line
combined = "\n\n".join([train_report, test_report])
with open('nan_report.txt', 'w') as f:
    f.write(combined)

print("Report saved to 'nan_report.txt'")


Overwriting nan_report.py


In [None]:
%%writefile summary.py
import pandas as pd

# Column to analyze; get_feature_report reads this module-level name
feature_name = "legs0_segments0_arrivalTo_airport_iata"

# Load both datasets from CSV (low_memory=False makes pandas infer each
# column's dtype from the whole file rather than chunk by chunk)

train = pd.read_csv("train.csv", low_memory=False)
test = pd.read_csv("test.csv", low_memory=False)

# Accumulates report sections; they are joined with newlines when saved
report = []

def get_feature_report(df, dataset_name):
    """Summarise the module-level ``feature_name`` column of ``df`` as text.

    The summary lists the dtype, row count, missing count and percentage,
    and the number of distinct non-null values, followed by ``describe()``
    for a numeric column or the five most frequent values otherwise.
    If the column is absent, a one-line "not found" message is returned.
    """
    if feature_name not in df.columns:
        return f"{dataset_name}: Column '{feature_name}' not found.\n"

    series = df[feature_name]
    n_rows = len(series)
    n_missing = series.isna().sum()
    missing_pct = n_missing / n_rows * 100
    n_distinct = series.nunique(dropna=True)

    lines = [
        f"📊 Dataset: {dataset_name}",
        f"Column: {feature_name}",
        f"Data type: {series.dtype}",
        f"Total rows: {n_rows:,}",
        f"Missing values: {n_missing:,} ({missing_pct:.6f}%)",
        f"Unique values (non-null): {n_distinct:,}",
    ]

    if pd.api.types.is_numeric_dtype(series):
        lines += ["\n🔹 Descriptive statistics:", str(series.describe())]
    else:
        lines += [
            "\n🔹 Top 5 most frequent values:",
            str(series.value_counts(dropna=True).head()),
        ]

    return "\n".join(lines)

# Assemble the two dataset sections with a ruler line between them
separator = "\n" + "=" * 60 + "\n"
report.extend([
    get_feature_report(train, "Train"),
    separator,
    get_feature_report(test, "Test"),
])

# Write the combined text to a file named after the analysed column
filename = f"{feature_name}_summary.txt"
output = "\n".join(report)
with open(filename, "w", encoding="utf-8") as f:
    f.write(output)

print(f"✅ Summary saved to '{filename}'")

Writing summary.py


In [None]:
%%writefile summary.py
import pandas as pd

# Column to analyze; get_feature_report reads this module-level name
feature_name = "frequentFlyer"

# Load both datasets from their parquet files
train = pd.read_parquet("train.parquet")
test = pd.read_parquet("test.parquet")

# Accumulates report sections; they are joined with newlines when saved
report = []

def get_feature_report(df, dataset_name):
    """Describe the module-level ``feature_name`` column of ``df`` as text.

    Reports dtype, row count, missing count and percentage, and the number
    of distinct non-null values. A numeric column additionally gets its
    ``describe()`` table; any other column gets every distinct value with
    its count. A missing column yields a one-line "not found" message.
    """
    if feature_name not in df.columns:
        return f"{dataset_name}: Column '{feature_name}' not found.\n"

    values = df[feature_name]
    row_count = len(values)
    na_count = values.isna().sum()
    na_pct = na_count / row_count * 100
    distinct = values.nunique(dropna=True)

    parts = []
    parts.append(f"📊 Dataset: {dataset_name}")
    parts.append(f"Column: {feature_name}")
    parts.append(f"Data type: {values.dtype}")
    parts.append(f"Total rows: {row_count:,}")
    parts.append(f"Missing values: {na_count:,} ({na_pct:.6f}%)")
    parts.append(f"Unique values (non-null): {distinct:,}")

    if not pd.api.types.is_numeric_dtype(values):
        counts = values.value_counts(dropna=True)
        parts.append("\n🔹 All unique values (with counts):")
        parts.append(counts.to_string())
    else:
        parts.append("\n🔹 Descriptive statistics:")
        parts.append(str(values.describe()))

    return "\n".join(parts)

# Collect the train section, a ruler line, then the test section
for section in (
    get_feature_report(train, "Train"),
    "\n" + "=" * 60 + "\n",
    get_feature_report(test, "Test"),
):
    report.append(section)

# Write everything to "<column>_summary.txt"
filename = f"{feature_name}_summary.txt"
output = "\n".join(report)

with open(filename, "w", encoding="utf-8") as f:
    f.write(output)

print(f"✅ Summary saved to '{filename}'")

Writing summary.py


In [None]:
%%writefile impute_legs0_segments0_arrivalTo_airport_iata.py
import pandas as pd

# Column whose gaps get an explicit placeholder category
col = "legs0_segments0_arrivalTo_airport_iata"

# Read the training data
train = pd.read_csv("train.csv", low_memory=False)

# Swap NaN for the "MISSING" label in that one column
filled = train[col].fillna("MISSING")
train[col] = filled

# Persist in place: train.csv is overwritten with the filled data
train.to_csv("train.csv", index=False)

print(f"Missing values in '{col}' have been filled with 'MISSING' and saved back to 'train.csv'.")

Writing impute_legs0_segments0_arrivalTo_airport_iata.py


In [None]:
%%writefile impute_legs0_segments0_aircraft_code.py
import pandas as pd

# Column to patch
col = "legs0_segments0_aircraft_code"

# Training data as currently stored on disk
train = pd.read_csv("train.csv", low_memory=False)

# Give missing entries their own explicit "MISSING" category
with_placeholder = train[col].fillna("MISSING")
train[col] = with_placeholder

# Write the result back over train.csv
train.to_csv("train.csv", index=False)

print(f"Missing values in '{col}' have been filled with 'MISSING' and saved back to 'train.csv'.")

Writing impute_legs0_segments0_aircraft_code.py


In [None]:
%%writefile impute_legs0_segments0_departureFrom_airport_iata.py
import pandas as pd

# Column that receives the placeholder
col = "legs0_segments0_departureFrom_airport_iata"

# Pull the training set from disk
train = pd.read_csv("train.csv", low_memory=False)

# NaN -> "MISSING", leaving every other value as it is
patched = train[col].fillna("MISSING")
train[col] = patched

# Replace the file on disk with the patched frame
train.to_csv("train.csv", index=False)

print(f"Missing values in '{col}' have been filled with 'MISSING' and saved back to 'train.csv'.")

In [None]:
%%writefile impute_legs0_segments0_arrivalTo_airport_city_iata.py
import pandas as pd

# Column to patch in both datasets
col = "legs0_segments0_arrivalTo_airport_city_iata"

# Read both files first, so nothing is overwritten if either read fails
frames = {
    path: pd.read_csv(path, low_memory=False)
    for path in ("train.csv", "test.csv")
}

# Replace NaN with the explicit "MISSING" category
for frame in frames.values():
    frame[col] = frame[col].fillna("MISSING")

# Write each frame back over the file it came from
for path, frame in frames.items():
    frame.to_csv(path, index=False)

print(f"Missing values in '{col}' have been filled with 'MISSING' in both train and test datasets.")

In [None]:
%%writefile analyze_baggage_quantity_by_type.py
import pandas as pd

# Load train.csv (read-only: this script does not modify the file)
df = pd.read_csv("train.csv", low_memory=False)

# Columns under study
qty_col = "legs0_segments0_baggageAllowance_quantity"
type_col = "legs0_segments0_baggageAllowance_weightMeasurementType"

# Keep rows where both fields are known, then describe quantity per type
complete_pairs = df[[qty_col, type_col]].dropna()
per_type_stats = complete_pairs.groupby(type_col)[qty_col].describe()

# One titled section per measurement type
sections = ["📊 Baggage Allowance Quantity Statistics by Measurement Type\n"]
for type_code, stats_row in per_type_stats.iterrows():
    # Type code 0.0 is labelled PIECE; every other code is labelled WEIGHT
    if type_code == 0.0:
        label = "PIECE (0.0)"
    else:
        label = "WEIGHT (1.0)"
    sections.extend([f"\n▶️ Type: {label}", stats_row.to_string()])

# Save to TXT
output_file = "baggage_quantity_by_type_stats.txt"
with open(output_file, "w", encoding="utf-8") as f:
    f.write("\n".join(sections))

print(f"✅ Analysis saved to '{output_file}'")

In [None]:
%%writefile nan_in_the_same_time.py
import pandas as pd

df = pd.read_csv("train.csv", low_memory=False)

qty_col = "legs0_segments0_baggageAllowance_quantity"
type_col = "legs0_segments0_baggageAllowance_weightMeasurementType"

# Boolean masks marking where each field is NaN
qty_na = df[qty_col].isna()
type_na = df[type_col].isna()

# Count how the two masks overlap
only_qty = (qty_na & ~type_na).sum()
only_type = (type_na & ~qty_na).sum()
in_both = (qty_na & type_na).sum()

print(f"❓ Missing only in quantity: {only_qty}")
print(f"❓ Missing only in type:     {only_type}")
print(f"✅ Missing in both:         {in_both}")
print(f"ℹ️ Total missing in quantity: {qty_na.sum()}")
print(f"ℹ️ Total missing in type:     {type_na.sum()}")

❓ Missing only in quantity: 0

❓ Missing only in type:     0

✅ Missing in both:         1064

ℹ️ Total missing in quantity: 1064

ℹ️ Total missing in type:     1064

In [None]:
%%writefile add_baggage_missing_flag.py
import pandas as pd

# Column names read by process_file
qty_col = "legs0_segments0_baggageAllowance_quantity"
type_col = "legs0_segments0_baggageAllowance_weightMeasurementType"
# Indicator column that process_file adds. NOTE: despite the
# "missing_initially" suffix, process_file stores 1 when both values are
# present and 0 otherwise.
flag_col = "legs0_segments0_baggageAllowance_missing_initially"

def process_file(path, name):
    """Add the baggage presence flag to the CSV at ``path`` and summarise it.

    The file is read, ``flag_col`` is set to 1 where both ``qty_col`` and
    ``type_col`` are non-null (0 otherwise), and the file is overwritten with
    the extra column. Returns a short text summary headed by ``name``.
    """
    df = pd.read_csv(path, low_memory=False)

    # 1 = both values present, 0 = at least one of them is NaN
    both_present = df[qty_col].notna() & df[type_col].notna()
    df[flag_col] = both_present.astype(int)

    total = len(df)
    present = df[flag_col].sum()
    missing = total - present

    # Overwrite the source file with the flagged frame
    df.to_csv(path, index=False)

    summary_lines = [
        f"📁 {name}",
        f"Total rows: {total:,}",
        f"Rows with both values present (flag = 1): {present:,} ({present / total * 100:.6f}%)",
        f"Rows with both values missing (flag = 0): {missing:,} ({missing / total * 100:.6f}%)",
        "",
    ]
    return "\n".join(summary_lines)

# Flag both datasets (each file is rewritten) and keep their summaries
train_report = process_file("train.csv", "Train")
test_report = process_file("test.csv", "Test")

# Lay out the report: title, train summary, ruler line, test summary
report_text = "".join([
    "📊 Baggage Allowance Initial Presence Report\n\n",
    train_report,
    "\n" + "="*60 + "\n\n",
    test_report,
])
with open("baggageAllowance_missing_flag_report.txt", "w", encoding="utf-8") as f:
    f.write(report_text)

print("✅ Column added to train.csv and test.csv.")
print("📝 Report saved to 'baggageAllowance_missing_flag_report.txt'")

In [None]:
%%writefile impute_legs0_segments0_baggageAllowance.py
import pandas as pd
import numpy as np

def impute_baggage(df, piece_share=0.94, seed=None):
    """Fill missing baggage allowance values with a random PIECE/WEIGHT split.

    Rows whose quantity is NaN are divided at random: ``piece_share`` of them
    (rounded down) become PIECE rows (quantity 1.0, type 0.0) and the rest
    become WEIGHT rows (quantity 30.0, type 1.0). Only rows with a missing
    quantity are touched, and the type column is overwritten on exactly
    those rows.

    Args:
        df: frame holding both baggage allowance columns. Modified in place.
        piece_share: fraction of missing rows imputed as PIECE; defaults to
            the original 94% / 6% split.
        seed: optional seed for a private random generator, so the split is
            reproducible. ``None`` keeps the previous behaviour of drawing
            from NumPy's global random state.

    Returns:
        The same ``df`` object, for call chaining.

    Raises:
        ValueError: if ``piece_share`` is outside [0, 1].
    """
    quantity_col = 'legs0_segments0_baggageAllowance_quantity'
    type_col = 'legs0_segments0_baggageAllowance_weightMeasurementType'

    if not 0.0 <= piece_share <= 1.0:
        raise ValueError(f"piece_share must be within [0, 1], got {piece_share}")

    # Index the mask directly; filtering the whole frame first would copy
    # every column of the matching rows just to read their labels
    missing_idx = df.index[df[quantity_col].isna()]
    n_missing = len(missing_idx)

    if n_missing == 0:
        return df

    n_piece = int(piece_share * n_missing)

    # A dedicated generator when seeded, the global state otherwise
    rng = np.random if seed is None else np.random.RandomState(seed)
    piece_idx = rng.choice(missing_idx, size=n_piece, replace=False)
    weight_idx = missing_idx.difference(piece_idx)

    # PIECE rows: one item
    df.loc[piece_idx, quantity_col] = 1.0
    df.loc[piece_idx, type_col] = 0.0

    # WEIGHT rows: 30 kg
    df.loc[weight_idx, quantity_col] = 30.0
    df.loc[weight_idx, type_col] = 1.0

    return df

def report_missing(df, filename):
    """Print how many NaN values remain in the two baggage allowance columns.

    ``filename`` is only used to label the printed block.
    """
    remaining = {
        "quantity": df['legs0_segments0_baggageAllowance_quantity'].isna().sum(),
        "type": df['legs0_segments0_baggageAllowance_weightMeasurementType'].isna().sum(),
    }
    print(f"\n✅ After imputation in {filename}:")
    for label, count in remaining.items():
        print(f" - Missing {label} values: {count}")

# Impute each dataset in turn and write it back over its source file
for csv_path in ("train.csv", "test.csv"):
    frame = pd.read_csv(csv_path)
    frame = impute_baggage(frame)
    report_missing(frame, csv_path)
    frame.to_csv(csv_path, index=False)

In [None]:
%%writefile check_mixed_features.py
import pandas as pd

def find_mixed_type_columns(df):
    """Return ``(column, type_counts)`` pairs for columns mixing Python types.

    ``type_counts`` is the ``value_counts()`` of ``type(value)`` over the
    column. A column is reported when more than one distinct type occurs,
    e.g. ``str`` values alongside ``float`` NaN.
    """
    mixed = []
    for col in df.columns:
        types = df[col].map(type).value_counts()
        if len(types) > 1:
            mixed.append((col, types))
    return mixed


if __name__ == "__main__":
    df = pd.read_csv("train.csv", low_memory=False)

    mixed_type_columns = find_mixed_type_columns(df)

    # Save to a .txt file. The encoding is explicit because the report
    # contains an emoji, which the platform default codec cannot always encode.
    with open("mixed_type_columns_report.txt", "w", encoding="utf-8") as f:
        for col, types in mixed_type_columns:
            f.write(f"\n🔍 Column: {col}\n")
            f.write(types.to_string())
            f.write("\n")

In [None]:
%%writefile seatsAvailable_missing.py
import pandas as pd

# Training data as stored on disk
df = pd.read_csv("train.csv", low_memory=False)

source_col = "legs0_segments0_seatsAvailable"
flag_col = "legs0_segments0_seatsAvailable_missing_initially"

# Indicator: 1 where a seat count is present, 0 where it is NaN
# (so, despite the column name, 0 marks the initially-missing rows)
df[flag_col] = df[source_col].notna().astype(int)

# Show how many rows fall on each side
print("📊 Distribution in 'legs0_segments0_seatsAvailable_missing_initially':")
print(df[flag_col].value_counts())

# Overwrite train.csv with the extra column
df.to_csv("train.csv", index=False)

print("\n✅ Column added and saved to train.csv.")

legs0_segments0_seatsAvailable_missing_initially

1    18065585

0       79787

Name: count, dtype: int64

In [None]:
%%writefile impute_legs0_segments0_seatsAvailable.py
import pandas as pd

def fill_na_with_median(df, column):
    """Replace NaN in ``df[column]`` with the column median; return the median.

    ``df`` is modified in place. The filled column is assigned back to the
    frame rather than calling ``df[column].fillna(..., inplace=True)``: that
    chained in-place form is deprecated in pandas 2.x and leaves ``df``
    unchanged once Copy-on-Write is enabled.
    """
    median_value = df[column].median()
    df[column] = df[column].fillna(median_value)
    return median_value


if __name__ == "__main__":
    # Load train.csv
    df = pd.read_csv("train.csv", low_memory=False)

    # Replace NaN with the median of the observed seat counts
    median_value = fill_na_with_median(df, "legs0_segments0_seatsAvailable")

    # Save changes back to train.csv
    df.to_csv("train.csv", index=False)

    print(f"✅ NaN values replaced with median ({median_value}) in 'legs0_segments0_seatsAvailable'.")

In [None]:
%%writefile corr_pricingInfo_isAccessTP.py
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from multiprocessing import Pool, cpu_count

# Read both datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Column every feature is scored against
target = 'pricingInfo_isAccessTP'
y_train = train[target]
y_test = test[target]

# Identifier and raw date/route columns that must not be scored as features
drop_cols = [
    'Id', 'profileId', '__index_level_0__', 'requestDate',
    'searchRoute', 'legs0_arrivalAt', 'legs0_departureAt',
    'legs1_arrivalAt', 'legs1_departureAt'
]

# Candidate features: present in both datasets, with a numeric or bool dtype ...
common_cols = list(set(train.columns) & set(test.columns))
numeric_cols = train[common_cols].select_dtypes(include=['float64', 'int64', 'bool']).columns
# ... not excluded above, not the target, and free of NaN in both datasets
numeric_cols = [
    col for col in numeric_cols
    if col not in drop_cols
    and col != target
    and not train[col].isnull().any()
    and not test[col].isnull().any()
]

X_train = train[numeric_cols]
X_test = test[numeric_cols]

# Fisher Score
def fisher_score(X, y):
    """Compute a Fisher score for every column of ``X`` against labels ``y``.

    For each feature the score is the class-size-weighted squared distance
    of the class means from the overall mean, divided by the
    class-size-weighted sum of the class variances, taken over the distinct
    non-null values of ``y``. A zero denominator gives a score of 0.

    Returns a Series indexed by column name.
    """
    labels = np.unique(y.dropna())
    result = {}
    for name in X.columns:
        feature = X[name]
        grand_mean = feature.mean()
        between = 0
        within = 0
        for label in labels:
            members = feature[y == label]
            size = len(members)
            between += size * (members.mean() - grand_mean) ** 2
            within += size * members.var()
        if within != 0:
            result[name] = between / within
        else:
            result[name] = 0
    return pd.Series(result)

# Score calculator by ID
def compute_score(task_id):
    """Run one of six scoring tasks and return ``(label, ranked_scores)``.

    Tasks 1-3 score the train split and 4-6 the test split, using Pearson
    correlation (ranked by absolute value), Fisher score and mutual
    information respectively. Any other ``task_id`` returns ``None``.
    The data is read from the module-level ``X_*`` / ``y_*`` variables.
    """
    def pearson(X, y):
        return X.corrwith(y).sort_values(key=abs, ascending=False)

    def fisher(X, y):
        return fisher_score(X, y).sort_values(ascending=False)

    def mutual_info(X, y):
        # Missing target values are treated as class 0 for this score
        mi = mutual_info_classif(X, y.fillna(0), random_state=0)
        return pd.Series(mi, index=X.columns).sort_values(ascending=False)

    tasks = {
        1: ("PEARSON_TRAIN", pearson, "train"),
        2: ("FISHER_TRAIN", fisher, "train"),
        3: ("MI_TRAIN", mutual_info, "train"),
        4: ("PEARSON_TEST", pearson, "test"),
        5: ("FISHER_TEST", fisher, "test"),
        6: ("MI_TEST", mutual_info, "test"),
    }
    if task_id not in tasks:
        return None

    label, scorer, split = tasks[task_id]
    if split == "train":
        X, y = X_train, y_train
    else:
        X, y = X_test, y_test
    return label, scorer(X, y)

# Launch the six scoring tasks in parallel, one worker process per task
# (capped at the number of available CPUs)
if __name__ == "__main__":
    with Pool(processes=min(6, cpu_count())) as pool:
        results = pool.map(compute_score, [1, 2, 3, 4, 5, 6])

    # Save results to file. The encoding is explicit because the section
    # headers contain an emoji, which the platform default codec cannot
    # always encode.
    with open("feature_scores_pricingInfo_isAccessTP.txt", "w", encoding="utf-8") as f:
        for name, series in results:
            f.write(f"\n🔹 TOP 15 FEATURES BY {name.replace('_', ' ')}\n")
            f.write(series.head(15).to_string())
            f.write("\n")

    print("✅ Parallel feature score results saved to 'feature_scores_pricingInfo_isAccessTP.txt'")

In [None]:
%%writefile constants.py
import pandas as pd

def find_constant_columns(df):
    """Return the columns of ``df`` holding exactly one distinct value.

    NaN counts as a value, so an all-NaN column is constant while a column
    with one real value plus NaN is not. Column order follows ``df.columns``.
    """
    return [col for col in df.columns if df[col].nunique(dropna=False) == 1]


if __name__ == "__main__":
    # Load datasets with low_memory=False to avoid dtype warnings
    train = pd.read_csv('train.csv', low_memory=False)
    test = pd.read_csv('test.csv', low_memory=False)

    # Find constant columns (only one unique value including NaNs)
    constant_train = find_constant_columns(train)
    constant_test = find_constant_columns(test)

    # Columns constant in both datasets, kept in train column order so the
    # report is identical from run to run (set iteration order is not)
    test_constants = set(constant_test)
    constant_both = [col for col in constant_train if col in test_constants]

    # Save results to a .txt file. The encoding is explicit because the
    # headings contain emoji.
    with open("constant_columns.txt", "w", encoding="utf-8") as f:
        f.write("📌 Constant columns in train.csv:\n")
        f.write("\n".join(constant_train) + "\n\n")

        f.write("📌 Constant columns in test.csv:\n")
        f.write("\n".join(constant_test) + "\n\n")

        f.write("✅ Columns constant in BOTH train and test:\n")
        f.write("\n".join(constant_both) + "\n")

    print("✅ Constant column results saved to 'constant_columns.txt'")

In [None]:
%%writefile test_imputation.py
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import multiprocessing
import time

# === Start the timer ===
start = time.time()

# === Columns used by the experiment: the target first, then its predictors ===
selected_features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType'
]
cols_for_imputation = ['pricingInfo_isAccessTP'] + selected_features

# === Load only those columns ===
# Reading the whole file and then deep-copying it kept two full frames in
# memory although only these eleven columns are ever used.
train_subset = pd.read_csv("train.csv", usecols=cols_for_imputation, low_memory=False)

# === Filter rows with complete data ===
df_known = train_subset[cols_for_imputation].dropna(subset=cols_for_imputation).copy()

# === Select 10,000 random rows for validation ===
sample_original = df_known.sample(n=10000, random_state=42)
true_values = sample_original['pricingInfo_isAccessTP'].astype(int).copy()

# === Create a masked version (with NaN) for the target column ===
sample_masked = sample_original.copy()
sample_masked['pricingInfo_isAccessTP'] = np.nan

# === Remove these rows from df_known to avoid duplicates ===
df_known = df_known.drop(sample_original.index, errors='ignore')

# === Combine known + masked data for imputation ===
# After this step the only NaN values are the masked targets of the sample.
df_for_imputation = pd.concat([df_known, sample_masked])
df_for_imputation = df_for_imputation.sort_index()

# === Create the imputer using Random Forest (with parallel processing) ===
num_cores = 24
imputer = IterativeImputer(
    estimator=RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        n_jobs=num_cores,
        random_state=42
    ),
    max_iter=10,
    initial_strategy='most_frequent',
    random_state=42,
    verbose=1
)

# === Run the imputation ===
imputed_array = imputer.fit_transform(df_for_imputation)
imputed_df = pd.DataFrame(imputed_array, columns=cols_for_imputation, index=df_for_imputation.index)

# === Apply threshold for binary classification ===
# The regressor outputs a continuous value; 0.5 and above counts as class 1.
predicted = (imputed_df.loc[sample_masked.index, 'pricingInfo_isAccessTP'] >= 0.5).astype(int)

# === Calculate metrics ===
acc = accuracy_score(true_values, predicted)
f1 = f1_score(true_values, predicted)
cm = confusion_matrix(true_values, predicted)
duration = round(time.time() - start, 2)

# === Save results to file ===
# The encoding is explicit because the report contains emoji.
with open("imputation_results.txt", "w", encoding="utf-8") as f:
    f.write("✅ Imputation evaluation results (on sample of 10,000):\n")
    f.write(f"Accuracy: {acc:.4f}\n")
    f.write(f"F1 Score: {f1:.4f}\n")
    f.write("Confusion Matrix:\n")
    f.write(np.array2string(cm))
    f.write(f"\n⏱️ Duration: {duration} seconds\n")

# === Print to console as well ===
print("✅ Results saved to imputation_results.txt")


✅ Imputation evaluation results (on sample of 10,000):

Accuracy: 0.8188

F1 Score: 0.8221

Confusion Matrix:

[[4002 1079]

 [ 733 4186]]

⏱️ Duration: 4708.98 seconds


In [None]:
%%writefile test_imputation.py
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import time

# === Start the timer ===
start = time.time()

# === Columns used by the experiment: the target first, then its predictors ===
selected_features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType'
]
cols_for_imputation = ['pricingInfo_isAccessTP'] + selected_features

# === Load only those columns ===
# Reading the whole file and then deep-copying it kept two full frames in
# memory although only these eleven columns are ever used.
train_subset = pd.read_csv("train.csv", usecols=cols_for_imputation, low_memory=False)

# === Filter rows with complete data ===
df_known = train_subset[cols_for_imputation].dropna(subset=cols_for_imputation).copy()

# === Randomly select 10,000 rows for validation ===
sample_original = df_known.sample(n=10000, random_state=42)
true_values = sample_original['pricingInfo_isAccessTP'].astype(int).copy()

# === Create a masked version (with NaN) for the target column ===
sample_masked = sample_original.copy()
sample_masked['pricingInfo_isAccessTP'] = np.nan

# === Remove these rows from df_known to avoid duplicates ===
df_known = df_known.drop(sample_original.index, errors='ignore')

# === Combine known + masked data for imputation ===
# After this step the only NaN values are the masked targets of the sample.
df_for_imputation = pd.concat([df_known, sample_masked])
df_for_imputation = df_for_imputation.sort_index()

# === Create the imputer with HistGradientBoostingRegressor ===
imputer = IterativeImputer(
    estimator=HistGradientBoostingRegressor(
        max_iter=100,
        max_depth=10,
        random_state=42
    ),
    max_iter=10,
    initial_strategy='most_frequent',
    random_state=42,
    verbose=1
)

# === Run the imputation ===
imputed_array = imputer.fit_transform(df_for_imputation)
imputed_df = pd.DataFrame(imputed_array, columns=cols_for_imputation, index=df_for_imputation.index)

# === Apply threshold for binary classification ===
# The regressor outputs a continuous value; 0.5 and above counts as class 1.
predicted = (imputed_df.loc[sample_masked.index, 'pricingInfo_isAccessTP'] >= 0.5).astype(int)

# === Calculate evaluation metrics ===
acc = accuracy_score(true_values, predicted)
f1 = f1_score(true_values, predicted)
cm = confusion_matrix(true_values, predicted)
duration = round(time.time() - start, 2)

# === Save the results to a file ===
# The encoding is explicit because the report contains emoji.
with open("imputation_results.txt", "w", encoding="utf-8") as f:
    f.write("✅ Imputation evaluation results (HistGradientBoostingRegressor):\n")
    f.write(f"Accuracy: {acc:.4f}\n")
    f.write(f"F1 Score: {f1:.4f}\n")
    f.write("Confusion Matrix:\n")
    f.write(np.array2string(cm))
    f.write(f"\n⏱️ Duration: {duration} seconds\n")

# === Also print the results to the console ===
print("✅ Results saved to imputation_results.txt")


✅ Imputation evaluation results (HistGradientBoostingRegressor):

Accuracy: 0.8682

F1 Score: 0.8671

Confusion Matrix:

[[4384  697]

 [ 621 4298]]

⏱️ Duration: 389.75 seconds

In [None]:
%%writefile test_imputation.py
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import time
from xgboost import XGBRegressor

# === Start the timer ===
start = time.time()

# === Columns used by the experiment: the target first, then its predictors ===
selected_features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType'
]
cols_for_imputation = ['pricingInfo_isAccessTP'] + selected_features

# === Load only those columns ===
# Reading the whole file and then deep-copying it kept two full frames in
# memory although only these eleven columns are ever used.
train_subset = pd.read_csv("train.csv", usecols=cols_for_imputation, low_memory=False)

# === Filter rows with complete data ===
df_known = train_subset[cols_for_imputation].dropna(subset=cols_for_imputation).copy()

# === Randomly select 10,000 rows for validation ===
sample_original = df_known.sample(n=10000, random_state=42)
true_values = sample_original['pricingInfo_isAccessTP'].astype(int).copy()

# === Create a masked version (with NaN) for the target column ===
sample_masked = sample_original.copy()
sample_masked['pricingInfo_isAccessTP'] = np.nan

# === Remove these rows from df_known to avoid duplicates ===
df_known = df_known.drop(sample_original.index, errors='ignore')

# === Combine known + masked data for imputation ===
# After this step the only NaN values are the masked targets of the sample.
df_for_imputation = pd.concat([df_known, sample_masked])
df_for_imputation = df_for_imputation.sort_index()

# === Create the imputer with XGBRegressor ===
imputer = IterativeImputer(
    estimator=XGBRegressor(
        n_estimators=100,
        max_depth=10,
        learning_rate=0.1,
        n_jobs=24,
        tree_method='hist',
        random_state=42,
        verbosity=1
    ),
    max_iter=10,
    initial_strategy='most_frequent',
    random_state=42,
    verbose=1
)

# === Run the imputation ===
imputed_array = imputer.fit_transform(df_for_imputation)
imputed_df = pd.DataFrame(imputed_array, columns=cols_for_imputation, index=df_for_imputation.index)

# === Apply threshold for binary classification ===
# The regressor outputs a continuous value; 0.5 and above counts as class 1.
predicted = (imputed_df.loc[sample_masked.index, 'pricingInfo_isAccessTP'] >= 0.5).astype(int)

# === Calculate metrics ===
acc = accuracy_score(true_values, predicted)
f1 = f1_score(true_values, predicted)
cm = confusion_matrix(true_values, predicted)
duration = round(time.time() - start, 2)

# === Save the results to a file ===
# The encoding is explicit because the report contains emoji.
with open("imputation_results.txt", "w", encoding="utf-8") as f:
    f.write("✅ Imputation evaluation results (XGBRegressor):\n")
    f.write(f"Accuracy: {acc:.4f}\n")
    f.write(f"F1 Score: {f1:.4f}\n")
    f.write("Confusion Matrix:\n")
    f.write(np.array2string(cm))
    f.write(f"\n⏱️ Duration: {duration} seconds\n")

# === Also print to console ===
print("✅ Results saved to imputation_results.txt")


✅ Imputation evaluation results (XGBRegressor):

Accuracy: 0.9260

F1 Score: 0.9246

Confusion Matrix:

[[4725  356]

 [ 384 4535]]

⏱️ Duration: 443.35 seconds

In [None]:
%%writefile test_imputation.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import time

# === Start the timer ===
start = time.time()

# === Predictors and target of the experiment ===
selected_features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType'
]

target_col = 'pricingInfo_isAccessTP'
cols_needed = selected_features + [target_col]

# === Load only the columns the experiment uses ===
# Reading the whole file and then deep-copying it kept two full frames in
# memory although only these eleven columns are ever used.
train_subset = pd.read_csv("train.csv", usecols=cols_needed, low_memory=False)

# === Filter complete rows for training ===
df_full = train_subset[cols_needed].dropna(subset=cols_needed)

# === Randomly select 10,000 rows for validation ===
sample = df_full.sample(n=10000, random_state=42)
X_valid = sample[selected_features]
y_valid = sample[target_col].astype(int)

# === Remove those rows from the training set ===
df_train = df_full.drop(sample.index, errors='ignore')
X_train = df_train[selected_features]
y_train = df_train[target_col].astype(int)

# === Train a classification model ===
model = XGBClassifier(
    n_estimators=100,
    max_depth=10,
    learning_rate=0.1,
    tree_method='hist',
    n_jobs=24,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)
model.fit(X_train, y_train)

# === Predict on the validation set ===
predicted = model.predict(X_valid)

# === Compute metrics ===
acc = accuracy_score(y_valid, predicted)
f1 = f1_score(y_valid, predicted)
cm = confusion_matrix(y_valid, predicted)
duration = round(time.time() - start, 2)

# === Save results to file ===
# The encoding is explicit because the report contains emoji.
with open("imputation_results.txt", "w", encoding="utf-8") as f:
    f.write("✅ Direct prediction results (XGBClassifier):\n")
    f.write(f"Accuracy: {acc:.4f}\n")
    f.write(f"F1 Score: {f1:.4f}\n")
    f.write("Confusion Matrix:\n")
    f.write(np.array2string(cm))
    f.write(f"\n⏱️ Duration: {duration} seconds\n")

# === Print to console ===
print("✅ Results saved to imputation_results.txt")


✅ Direct prediction results (XGBClassifier):

Accuracy: 0.9251

F1 Score: 0.9236

Confusion Matrix:

[[4722  359]

 [ 390 4529]]

⏱️ Duration: 129.9 seconds

In [None]:
%%writefile test_imputation.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import time

# === Start the timer ===
start = time.time()

# === Load the data ===
train_original = pd.read_csv("train.csv", low_memory=False)
train_copy = train_original.copy(deep=True)

# === Select relevant columns ===
# The target must NOT appear among the features: it would leak the answer
# during validation and be all-NaN on the rows that actually need imputing.
selected_features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType'
]

# This cell fits a binary classifier and reports binary F1 / a 2x2 confusion
# matrix, so the target has to be the binary access flag. A continuous
# monetary amount cannot be used with XGBClassifier and binary f1_score.
target_col = 'pricingInfo_isAccessTP'
cols_needed = selected_features + [target_col]

# === Filter complete rows for training ===
df_full = train_copy[cols_needed].dropna(subset=cols_needed)

# === Randomly select 1,000,000 rows for validation ===
sample = df_full.sample(n=1000000, random_state=42)
X_valid = sample[selected_features]
y_valid = sample[target_col].astype(int)

# === Remove these rows from the training data ===
df_train = df_full.drop(sample.index, errors='ignore')
X_train = df_train[selected_features]
y_train = df_train[target_col].astype(int)

# === Train a classification model ===
# `use_label_encoder` is deliberately not passed: the installed XGBoost
# ignores it and only emits a "Parameters ... are not used" warning.
model = XGBClassifier(
    n_estimators=500,
    max_depth=12,
    learning_rate=0.1,
    tree_method='hist',
    n_jobs=24,
    random_state=42,
    eval_metric='logloss'
)
model.fit(X_train, y_train)

# === Predict on the validation sample ===
predicted = model.predict(X_valid)

# === Compute evaluation metrics ===
acc = accuracy_score(y_valid, predicted)
f1 = f1_score(y_valid, predicted)
cm = confusion_matrix(y_valid, predicted)
duration = round(time.time() - start, 2)

# === Save results to a file ===
# Explicit UTF-8: the report contains emoji, which cannot be encoded when the
# platform default encoding is not UTF-8.
with open("imputation_results.txt", "w", encoding="utf-8") as f:
    f.write("✅ Direct prediction results (XGBClassifier):\n")
    f.write(f"Accuracy: {acc:.4f}\n")
    f.write(f"F1 Score: {f1:.4f}\n")
    f.write("Confusion Matrix:\n")
    f.write(np.array2string(cm))
    f.write(f"\n⏱️ Duration: {duration} seconds\n")

# === Also print to the console ===
print("✅ Results saved to imputation_results.txt")

✅ Direct prediction results (XGBClassifier):

Accuracy: 0.9635

F1 Score: 0.9633

Confusion Matrix:

[[484867  16174]

 [ 20340 478619]]

⏱️ Duration: 251.97 seconds

In [None]:
%%writefile test_imputation.py
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import time
from lightgbm import LGBMRegressor

# === Start the timer ===
start = time.time()

# === Load and save a safe copy of the dataset ===
train_original = pd.read_csv("train.csv", low_memory=False)
train_copy = train_original.copy(deep=True)

# === Select relevant columns ===
selected_features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType'
]
cols_for_imputation = ['pricingInfo_isAccessTP'] + selected_features

# === Filter rows with complete data ===
# Complete rows are required so the hidden target values can be compared
# with what the imputer reconstructs.
df_known = train_copy[cols_for_imputation].dropna(subset=cols_for_imputation).copy()

# === Randomly select 10,000 rows for validation ===
sample_original = df_known.sample(n=10000, random_state=42)
true_values = sample_original['pricingInfo_isAccessTP'].astype(int).copy()

# === Create a masked version (with NaN) for the target column ===
sample_masked = sample_original.copy()
sample_masked['pricingInfo_isAccessTP'] = np.nan

# === Remove these rows from df_known to avoid duplication ===
df_known = df_known.drop(sample_original.index, errors='ignore')

# === Combine known + masked data for imputation ===
df_for_imputation = pd.concat([df_known, sample_masked])
df_for_imputation = df_for_imputation.sort_index()

# === Create the imputer with LGBMRegressor ===
imputer = IterativeImputer(
    estimator=LGBMRegressor(
        n_estimators=100,
        max_depth=10,
        learning_rate=0.1,
        n_jobs=24,
        random_state=42,
        verbose=-1  # suppress logging
    ),
    max_iter=10,
    initial_strategy='most_frequent',
    random_state=42,
    verbose=1
)

# === Run the imputation ===
imputed_array = imputer.fit_transform(df_for_imputation)
imputed_df = pd.DataFrame(imputed_array, columns=cols_for_imputation, index=df_for_imputation.index)

# === Apply threshold for binary classification ===
# The regressor returns a continuous value; 0.5 is the cut-off for class 1.
predicted = (imputed_df.loc[sample_masked.index, 'pricingInfo_isAccessTP'] >= 0.5).astype(int)

# === Calculate evaluation metrics ===
acc = accuracy_score(true_values, predicted)
f1 = f1_score(true_values, predicted)
cm = confusion_matrix(true_values, predicted)
duration = round(time.time() - start, 2)

# === Save results to file ===
# Explicit UTF-8: the report contains emoji, which cannot be encoded when the
# platform default encoding is not UTF-8.
with open("imputation_results.txt", "w", encoding="utf-8") as f:
    f.write("✅ Imputation evaluation results (LGBMRegressor):\n")
    f.write(f"Accuracy: {acc:.4f}\n")
    f.write(f"F1 Score: {f1:.4f}\n")
    f.write("Confusion Matrix:\n")
    f.write(np.array2string(cm))
    f.write(f"\n⏱️ Duration: {duration} seconds\n")

# === Also print to console ===
print("✅ Results saved to imputation_results.txt")


✅ Imputation evaluation results (LGBMRegressor):

Accuracy: 0.8828

F1 Score: 0.8813

Confusion Matrix:

[[4479  602]

 [ 570 4349]]

⏱️ Duration: 229.4 seconds

In [None]:
%%writefile impute_pricingInfo_isAccessTP.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import time

# === Start timer ===
start = time.time()

# === Load files ===
train = pd.read_csv("train.csv", low_memory=False)
test = pd.read_csv("test.csv", low_memory=False)

# === Relevant columns ===
# The target is intentionally NOT part of the feature list. Including it lets
# the model simply copy the label during training, while on the rows being
# imputed that same column is always NaN, so the predictions would carry no
# information from the other features.
features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType'
]
target = 'pricingInfo_isAccessTP'

# === Train on complete rows from train ===
train_valid = train.dropna(subset=features + [target])
X_train = train_valid[features]
y_train = train_valid[target].astype(int)

# === Train the model ===
# `use_label_encoder` is deliberately not passed: the installed XGBoost
# ignores it and only emits a "Parameters ... are not used" warning.
model = XGBClassifier(
    n_estimators=500,
    max_depth=12,
    learning_rate=0.1,
    tree_method='hist',
    n_jobs=24,
    random_state=42,
    eval_metric='logloss'
)
model.fit(X_train, y_train)

# === Imputation function ===
def impute_column(df, name="train"):
    """Fill the missing entries of the module-level ``target`` column in place.

    Predictions come from the module-level ``model`` applied to the
    module-level ``features`` columns of the rows whose target is NaN.

    Args:
        df: DataFrame holding ``target`` and every column in ``features``.
            It is modified in place.
        name: Label used only in the progress message.

    Returns:
        The same ``df`` object that was passed in.
    """
    is_missing = df[target].isna()
    n_missing = is_missing.sum()

    if n_missing != 0:
        # Predict only for the rows that lack a target value.
        df.loc[is_missing, target] = model.predict(df.loc[is_missing, features])
        print(f"✅ {name}: Imputed {n_missing} missing values in '{target}'")
    else:
        print(f"✅ {name}: No missing values in '{target}'")

    return df

# === Fill the gaps in both datasets (train first, then test) ===
train, test = impute_column(train, "train"), impute_column(test, "test")

# === Write the completed frames back over the input CSVs ===
for frame, path in ((train, "train.csv"), (test, "test.csv")):
    frame.to_csv(path, index=False)

# === Post-imputation stats ===
def report(df, name):
    """Print row count, missing count/share and value counts of ``target``.

    Args:
        df: DataFrame containing the module-level ``target`` column.
        name: Label shown in the report header.
    """
    column = df[target]
    n_rows = len(df)
    n_missing = column.isna().sum()
    # NaN is kept as its own bucket so leftover gaps stay visible.
    counts = column.value_counts(dropna=False).sort_index()

    header = [
        f"\n📊 {name} — '{target}':",
        f"  Total rows: {n_rows}",
        f"  Missing: {n_missing} ({100 * n_missing / n_rows:.2f}%)",
        "  Value counts (including imputed):",
    ]
    print("\n".join(header))
    print(counts)

# === Post-imputation report for each dataset ===
for frame, label in ((train, "train"), (test, "test")):
    report(frame, label)

# === Done ===
elapsed = round(time.time() - start, 2)
print(f"\n⏱️ Total duration: {elapsed} seconds")


Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)

✅ train: Imputed 905045 missing values in 'pricingInfo_isAccessTP'

✅ test: Imputed 245155 missing values in 'pricingInfo_isAccessTP'


📊 train — 'pricingInfo_isAccessTP':

  Total rows: 18145372

  Missing: 0 (0.00%)

  Value counts (including imputed):

pricingInfo_isAccessTP

0.0    8901943

1.0    9243429

Name: count, dtype: int64


📊 test — 'pricingInfo_isAccessTP':

  Total rows: 6897776

  Missing: 0 (0.00%)

  Value counts (including imputed):

pricingInfo_isAccessTP

0.0    2545795

1.0    4351981

Name: count, dtype: int64

⏱️ Total duration: 534.8 seconds

In [None]:
%%writefile corr_miniRules1_monetaryAmount.py
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_regression
from scipy.stats import spearmanr, kendalltau
from multiprocessing import Pool, cpu_count

# === Load data ===
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Column whose relationship with every candidate feature is scored.
target = 'miniRules1_monetaryAmount'
y_train = train[target]
y_test = test[target]

# === Drop irrelevant columns ===
drop_cols = [
    'Id', 'profileId', '__index_level_0__', 'requestDate',
    'searchRoute', 'legs0_arrivalAt', 'legs0_departureAt',
    'legs1_arrivalAt', 'legs1_departureAt'
]

# === Select numeric columns present in both datasets ===
# Keep train's column order. Iterating a set intersection yields an order
# that depends on string hashing and can change between interpreter runs,
# which makes the feature matrix layout non-reproducible.
test_columns = set(test.columns)
common_cols = [col for col in train.columns if col in test_columns]
numeric_cols = train[common_cols].select_dtypes(include=['float64', 'int64', 'bool']).columns
numeric_cols = [col for col in numeric_cols if col not in drop_cols and col != target]
# Only fully populated features are scored.
numeric_cols = [col for col in numeric_cols
                if train[col].isnull().sum() == 0 and test[col].isnull().sum() == 0]

X_train = train[numeric_cols]
X_test = test[numeric_cols]

# === Correlation functions ===
def kendall_series(X, y):
    """Return Kendall's tau between every column of ``X`` and the target ``y``.

    Rows whose target is missing are excluded before the statistic is
    computed. Without this, a single NaN in ``y`` makes ``kendalltau`` return
    NaN for every feature, so the ranking carries no information.

    Args:
        X: DataFrame of candidate features, one score per column.
        y: Target values, positionally aligned with the rows of ``X``.

    Returns:
        pandas Series indexed by column name. A feature whose statistic
        cannot be computed (incompatible data) gets a score of 0.
    """
    known = ~np.asarray(pd.isna(y))
    y_known = np.asarray(y)[known]

    scores = {}
    for col in X.columns:
        try:
            tau, _ = kendalltau(X[col].to_numpy()[known], y_known)
        except (TypeError, ValueError, IndexError):
            # Best effort: an unusable column must not abort the whole ranking.
            tau = 0
        scores[col] = tau
    return pd.Series(scores)

def spearman_series(X, y):
    """Return Spearman's rho between every column of ``X`` and the target ``y``.

    Rows whose target is missing are excluded before the statistic is
    computed. Without this, a single NaN in ``y`` makes ``spearmanr`` return
    NaN for every feature, so the ranking carries no information.

    Args:
        X: DataFrame of candidate features, one score per column.
        y: Target values, positionally aligned with the rows of ``X``.

    Returns:
        pandas Series indexed by column name. A feature whose statistic
        cannot be computed (incompatible data) gets a score of 0.
    """
    known = ~np.asarray(pd.isna(y))
    y_known = np.asarray(y)[known]

    scores = {}
    for col in X.columns:
        try:
            rho, _ = spearmanr(X[col].to_numpy()[known], y_known)
        except (TypeError, ValueError, IndexError):
            # Best effort: an unusable column must not abort the whole ranking.
            rho = 0
        scores[col] = rho
    return pd.Series(scores)

# === Parallel computation ===
def compute_score(task_id):
    """Run one of the eight scoring jobs and return ``(label, ranked scores)``.

    Jobs 1-4 score the train split and jobs 5-8 the test split, each with
    Pearson, Spearman, Kendall and mutual information in that order. The
    module-level ``X_train``/``y_train``/``X_test``/``y_test`` are read.

    Args:
        task_id: Job number, 1 through 8.

    Returns:
        Tuple of the job label and a Series sorted from strongest to weakest,
        or ``None`` when ``task_id`` matches no job.
    """
    def ranked_by_magnitude(scores):
        # Correlations are ordered by absolute value; the sign is kept.
        return scores.sort_values(key=abs, ascending=False)

    def mutual_information(features, labels):
        # Missing targets are replaced by 0 before estimating MI.
        estimates = mutual_info_regression(features, labels.fillna(0), random_state=0)
        return pd.Series(estimates, index=features.columns).sort_values(ascending=False)

    # Each job is evaluated lazily so only the requested one does any work.
    jobs = (
        (1, "PEARSON_TRAIN", lambda: ranked_by_magnitude(X_train.corrwith(y_train))),
        (2, "SPEARMAN_TRAIN", lambda: ranked_by_magnitude(spearman_series(X_train, y_train))),
        (3, "KENDALL_TRAIN", lambda: ranked_by_magnitude(kendall_series(X_train, y_train))),
        (4, "MI_TRAIN", lambda: mutual_information(X_train, y_train)),
        (5, "PEARSON_TEST", lambda: ranked_by_magnitude(X_test.corrwith(y_test))),
        (6, "SPEARMAN_TEST", lambda: ranked_by_magnitude(spearman_series(X_test, y_test))),
        (7, "KENDALL_TEST", lambda: ranked_by_magnitude(kendall_series(X_test, y_test))),
        (8, "MI_TEST", lambda: mutual_information(X_test, y_test)),
    )
    for job_id, label, run in jobs:
        if task_id == job_id:
            return label, run()
    return None

# === Run computations in parallel ===
if __name__ == "__main__":
    # One worker per scoring job, capped by the number of available cores.
    worker_count = min(8, cpu_count())
    with Pool(processes=worker_count) as pool:
        results = pool.map(compute_score, range(1, 9))

    # === Save to file: one section per job, top 15 features each ===
    with open("feature_scores_miniRules1_monetaryAmount.txt", "w", encoding="utf-8") as f:
        f.writelines(
            f"\n🔹 TOP 15 FEATURES BY {name.replace('_', ' ')}\n"
            f"{series.head(15).to_string()}"
            "\n"
            for name, series in results
        )

    print("✅ Correlation scores saved to 'feature_scores_miniRules1_monetaryAmount.txt'")


In [None]:
%%writefile test_imputation.py
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import time

# === Start the timer ===
start = time.time()

# === Load the data ===
train_original = pd.read_csv("train.csv", low_memory=False)
train_copy = train_original.copy(deep=True)

# === Select relevant columns ===
selected_features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP'
]

target_col = 'miniRules1_monetaryAmount'
cols_needed = selected_features + [target_col]

# === Filter complete rows for training ===
# The true target is needed to score predictions, so incomplete rows are out.
df_full = train_copy[cols_needed].dropna(subset=cols_needed)

# === Randomly select 1,000,000 rows for validation ===
sample = df_full.sample(n=1000000, random_state=42)
X_valid = sample[selected_features]
y_valid = sample[target_col]

# === Remove those rows from the training set ===
df_train = df_full.drop(sample.index, errors='ignore')
X_train = df_train[selected_features]
y_train = df_train[target_col]

# === Train the model ===
model = XGBRegressor(
    n_estimators=500,
    max_depth=12,
    learning_rate=0.1,
    tree_method='hist',
    n_jobs=24,
    random_state=42
)
model.fit(X_train, y_train)

# === Predict on the validation sample ===
y_pred = model.predict(X_valid)

# === Compute evaluation metrics ===
mae = mean_absolute_error(y_valid, y_pred)
# RMSE is taken as the square root of the MSE: the `squared=False` keyword
# of mean_squared_error is no longer accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
r2 = r2_score(y_valid, y_pred)
duration = round(time.time() - start, 2)

# === Save results ===
# Explicit UTF-8: the report contains emoji and "²", which cannot be encoded
# when the platform default encoding is not UTF-8.
with open("imputation_results.txt", "w", encoding="utf-8") as f:
    f.write("✅ Imputation evaluation (XGBRegressor):\n")
    f.write(f"MAE: {mae:.2f}\n")
    f.write(f"RMSE: {rmse:.2f}\n")
    f.write(f"R² Score: {r2:.4f}\n")
    f.write(f"⏱️ Duration: {duration} seconds\n")

print("✅ Results saved to imputation_results.txt")


✅ Imputation evaluation (XGBRegressor):

MAE: 297.19

RMSE: 3959.18

R² Score: 0.6505

⏱️ Duration: 246.37 seconds

In [None]:
%%writefile test_imputation.py
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import time

# === Start the timer ===
start = time.time()

# === Load the data ===
train_original = pd.read_csv("train.csv", low_memory=False)
train_copy = train_original.copy(deep=True)

# === Select relevant columns ===
selected_features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP'
]

target_col = 'miniRules1_monetaryAmount'
cols_needed = selected_features + [target_col]

# === Filter complete rows for training ===
df_full = train_copy[cols_needed].dropna(subset=cols_needed)

# === Randomly select 1,000,000 rows for validation ===
# The validation sample is drawn BEFORE outlier removal, so it still contains
# extreme target values while the training data below does not.
sample = df_full.sample(n=1000000, random_state=42)
X_valid = sample[selected_features]
y_valid = sample[target_col]

# === Remove these rows from training data ===
df_train = df_full.drop(sample.index, errors='ignore')

# === Remove outliers from training target (1st to 99th percentiles) ===
q_low = df_train[target_col].quantile(0.01)
q_high = df_train[target_col].quantile(0.99)
df_train = df_train[(df_train[target_col] >= q_low) & (df_train[target_col] <= q_high)]

X_train = df_train[selected_features]
y_train = df_train[target_col]

# === Train the model ===
model = XGBRegressor(
    n_estimators=500,
    max_depth=12,
    learning_rate=0.1,
    tree_method='hist',
    n_jobs=24,
    random_state=42
)
model.fit(X_train, y_train)

# === Predict on the validation sample ===
y_pred = model.predict(X_valid)

# === Compute evaluation metrics ===
mae = mean_absolute_error(y_valid, y_pred)
# RMSE is taken as the square root of the MSE: the `squared=False` keyword
# of mean_squared_error is no longer accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
r2 = r2_score(y_valid, y_pred)
duration = round(time.time() - start, 2)

# === Save results ===
# Explicit UTF-8: the report contains emoji and "²", which cannot be encoded
# when the platform default encoding is not UTF-8.
with open("imputation_results.txt", "w", encoding="utf-8") as f:
    f.write("✅ Imputation evaluation (XGBRegressor, with outlier removal):\n")
    f.write(f"MAE: {mae:.2f}\n")
    f.write(f"RMSE: {rmse:.2f}\n")
    f.write(f"R² Score: {r2:.4f}\n")
    f.write(f"⏱️ Duration: {duration} seconds\n")

print("✅ Results saved to imputation_results.txt")

✅ Imputation evaluation (XGBRegressor, with outlier removal):

MAE: 430.12

RMSE: 6263.20

R² Score: 0.1254

⏱️ Duration: 246.96 seconds

In [None]:
%%writefile test_imputation.py
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import time

# === Start the timer ===
start = time.time()

# === Load the data ===
train_original = pd.read_csv("train.csv", low_memory=False)
train_copy = train_original.copy(deep=True)

# === Select relevant columns ===
selected_features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP'
]

target_col = 'miniRules1_monetaryAmount'
cols_needed = selected_features + [target_col]

# === Filter complete rows for training ===
df_full = train_copy[cols_needed].dropna(subset=cols_needed)

# === Randomly select 1,000,000 rows for validation ===
sample = df_full.sample(n=1000000, random_state=42)
X_valid = sample[selected_features]
y_valid_raw = sample[target_col]  # keep for evaluation
y_valid = np.log1p(y_valid_raw)   # log-transform the target

# === Remove these rows from the training data ===
df_train = df_full.drop(sample.index, errors='ignore')
X_train = df_train[selected_features]
y_train = np.log1p(df_train[target_col])  # log-transform the target

# === Train the model ===
model = XGBRegressor(
    n_estimators=500,
    max_depth=12,
    learning_rate=0.1,
    tree_method='hist',
    n_jobs=24,
    random_state=42
)
model.fit(X_train, y_train)

# === Predict on the validation set ===
y_pred_log = model.predict(X_valid)
y_pred = np.expm1(y_pred_log)  # inverse of log1p

# === Compute metrics (compare to raw values) ===
mae = mean_absolute_error(y_valid_raw, y_pred)
# RMSE is taken as the square root of the MSE: the `squared=False` keyword
# of mean_squared_error is no longer accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_valid_raw, y_pred))
r2 = r2_score(y_valid_raw, y_pred)
duration = round(time.time() - start, 2)

# === Save results to file ===
# Explicit UTF-8: the report contains emoji and "²", which cannot be encoded
# when the platform default encoding is not UTF-8.
with open("imputation_results.txt", "w", encoding="utf-8") as f:
    f.write("✅ Imputation evaluation (XGBRegressor, log-transformed target):\n")
    f.write(f"MAE: {mae:.2f}\n")
    f.write(f"RMSE: {rmse:.2f}\n")
    f.write(f"R² Score: {r2:.4f}\n")
    f.write(f"⏱️ Duration: {duration} seconds\n")

print("✅ Results saved to imputation_results.txt")


✅ Imputation evaluation (XGBRegressor, log-transformed target):

MAE: 476.00

RMSE: 6041.37

R² Score: 0.1863

⏱️ Duration: 251.54 seconds

In [None]:
%%writefile impute_miniRules1_monetaryAmount.py
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
import time

# === Timer for the whole run ===
start = time.time()

# === Read the current train/test CSVs (train first, then test) ===
train, test = (
    pd.read_csv(path, low_memory=False) for path in ("train.csv", "test.csv")
)

# === Predictor columns fed to the regressor ===
selected_features = [
    'totalPrice', 'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D', 'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP',
]

# === Column whose gaps are filled ===
target_col = 'miniRules1_monetaryAmount'
cols_needed = [*selected_features, target_col]

# === Fit only on rows where every predictor and the target are present ===
train_clean = train[cols_needed].dropna()
X_train, y_train = train_clean[selected_features], train_clean[target_col]

# === Gradient-boosted regressor used for the imputation ===
xgb_params = dict(
    n_estimators=500,
    max_depth=12,
    learning_rate=0.1,
    tree_method='hist',
    n_jobs=24,
    random_state=42,
)
model = XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# === Function to impute missing values in a DataFrame ===
def impute(df, name):
    """Return a copy of ``df`` with ``target_col`` gaps filled, plus a text log.

    Missing targets are predicted by the module-level ``model`` from the
    module-level ``selected_features`` columns. The input frame is left
    untouched.

    Args:
        df: DataFrame containing ``target_col`` and ``selected_features``.
        name: Dataset label used in the log.

    Returns:
        Tuple ``(filled_frame, log_text)``. The log reports the number of
        missing values before and after, and ``describe()`` of the column
        after imputation.
    """
    filled = df.copy()
    gaps = filled[target_col].isna()
    gap_count = gaps.sum()

    if gap_count > 0:
        # Predict only for the rows that lack a target value.
        filled.loc[gaps, target_col] = model.predict(filled.loc[gaps, selected_features])

    # With nothing to fill, the column is unchanged, so these are also the
    # "before" statistics.
    stats_after = filled[target_col].describe()
    remaining = filled[target_col].isna().sum()

    log_lines = [
        "",
        f"📊 Dataset: {name}",
        f"Missing before: {gap_count}",
        f"Missing after: {remaining}",
        "",
        "🔹 Descriptive statistics after:",
        f"{stats_after}",
        "",
    ]
    return filled, "\n".join(log_lines)

# === Imputation ===
# impute() returns a filled copy plus a text log for each dataset.
train_filled, log_train = impute(train, "Train")
test_filled, log_test = impute(test, "Test")

# === Overwrite CSV files ===
train_filled.to_csv("train.csv", index=False)
test_filled.to_csv("test.csv", index=False)

# === Write summary log to file ===
duration = round(time.time() - start, 2)
# Explicit UTF-8: the summary contains emoji, which cannot be encoded when the
# platform default encoding is not UTF-8.
with open("imputation_summary.txt", "w", encoding="utf-8") as f:
    f.write("✅ Imputation Summary (XGBRegressor)\n")
    f.write(log_train)
    f.write(log_test)
    f.write(f"\n⏱️ Duration: {duration} seconds\n")

print("✅ Imputation complete. Overwrote 'train.csv', 'test.csv'. Stats in 'imputation_summary.txt'")


In [None]:
✅ Imputation Summary (XGBRegressor)

📊 Dataset: Train
Missing before: 1395743
Missing after: 0

🔹 Descriptive statistics after:
count    1.814537e+07
mean     1.343276e+03
std      5.734428e+03
min     -1.068349e+04
25%      0.000000e+00
50%      0.000000e+00
75%      2.800000e+03
max      7.161273e+06
Name: miniRules1_monetaryAmount, dtype: float64

📊 Dataset: Test
Missing before: 504405
Missing after: 0

🔹 Descriptive statistics after:
count    6.897776e+06
mean     1.414765e+03
std      4.017597e+03
min     -8.029313e+03
25%      0.000000e+00
50%      0.000000e+00
75%      2.800000e+03
max      4.736350e+05
Name: miniRules1_monetaryAmount, dtype: float64

⏱️ Duration: 542.42 seconds

In [None]:
%%writefile clip_negative_values.py
import pandas as pd

# Column to sanitize. The regressor used for imputation is unbounded, so
# imputed amounts can come out negative.
target_col = "miniRules1_monetaryAmount"

# Load existing files
train = pd.read_csv("train.csv", low_memory=False)
test = pd.read_csv("test.csv", low_memory=False)

# Replace negative values with 0 (applies to every row of the column)
train[target_col] = train[target_col].clip(lower=0)
test[target_col] = test[target_col].clip(lower=0)

# Save updated files
train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)

# Save post-transformation statistics
# Explicit UTF-8: the summary contains emoji, which cannot be encoded when the
# platform default encoding is not UTF-8.
with open("clip_summary.txt", "w", encoding="utf-8") as f:
    f.write("✅ Applied clip to miniRules1_monetaryAmount (values < 0 set to 0)\n")
    f.write("\n🔹 Train statistics:\n")
    f.write(train[target_col].describe().to_string())
    f.write("\n\n🔹 Test statistics:\n")
    f.write(test[target_col].describe().to_string())

print("✅ Negative values clipped. Updated files: train.csv, test.csv. Summary saved in clip_summary.txt.")


In [None]:
✅ Applied clip to miniRules1_monetaryAmount (values < 0 set to 0)

🔹 Train statistics:
count    1.814537e+07
mean     1.344945e+03
std      5.733923e+03
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      2.800000e+03
max      7.161273e+06

🔹 Test statistics:
count    6.897776e+06
mean     1.416297e+03
std      4.016969e+03
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      2.800000e+03
max      4.736350e+05

In [None]:
%%writefile impute_miniRules0_monetaryAmount.py
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
import time

# === Timer for the whole run ===
start = time.time()

# === Read the current train/test CSVs (train first, then test) ===
train, test = (
    pd.read_csv(path, low_memory=False) for path in ("train.csv", "test.csv")
)

# === Predictor columns fed to the regressor ===
selected_features = [
    'totalPrice', 'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D', 'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP',
]

# === Column whose gaps are filled ===
target_col = 'miniRules0_monetaryAmount'
cols_needed = [*selected_features, target_col]

# === Fit only on rows where every predictor and the target are present ===
train_clean = train[cols_needed].dropna()
X_train, y_train = train_clean[selected_features], train_clean[target_col]

# === Gradient-boosted regressor used for the imputation ===
xgb_params = dict(
    n_estimators=500,
    max_depth=12,
    learning_rate=0.1,
    tree_method='hist',
    n_jobs=24,
    random_state=42,
)
model = XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# === Function to impute missing values in a DataFrame ===
def impute(df, name):
    """Return a copy of ``df`` with ``target_col`` gaps filled, plus a text log.

    Missing targets are predicted by the module-level ``model`` from the
    module-level ``selected_features`` columns. The input frame is left
    untouched.

    Args:
        df: DataFrame containing ``target_col`` and ``selected_features``.
        name: Dataset label used in the log.

    Returns:
        Tuple ``(filled_frame, log_text)``. The log reports the number of
        missing values before and after, and ``describe()`` of the column
        after imputation.
    """
    filled = df.copy()
    gaps = filled[target_col].isna()
    gap_count = gaps.sum()

    if gap_count > 0:
        # Predict only for the rows that lack a target value.
        filled.loc[gaps, target_col] = model.predict(filled.loc[gaps, selected_features])

    # With nothing to fill, the column is unchanged, so these are also the
    # "before" statistics.
    stats_after = filled[target_col].describe()
    remaining = filled[target_col].isna().sum()

    log_lines = [
        "",
        f"📊 Dataset: {name}",
        f"Missing before: {gap_count}",
        f"Missing after: {remaining}",
        "",
        "🔹 Descriptive statistics after:",
        f"{stats_after}",
        "",
    ]
    return filled, "\n".join(log_lines)

# === Perform imputation ===
# impute() returns a filled copy plus a text log for each dataset.
train_filled, log_train = impute(train, "Train")
test_filled, log_test = impute(test, "Test")

# === Overwrite CSV files ===
train_filled.to_csv("train.csv", index=False)
test_filled.to_csv("test.csv", index=False)

# === Write log to file ===
duration = round(time.time() - start, 2)
# Explicit UTF-8: the summary contains emoji, which cannot be encoded when the
# platform default encoding is not UTF-8.
with open("imputation_summary.txt", "w", encoding="utf-8") as f:
    f.write("✅ Imputation Summary (XGBRegressor)\n")
    f.write(log_train)
    f.write(log_test)
    f.write(f"\n⏱️ Duration: {duration} seconds\n")

print("✅ Imputation complete. Overwrote 'train.csv', 'test.csv'. Stats in 'imputation_summary.txt'")


In [None]:
✅ Imputation Summary (XGBRegressor)

📊 Dataset: Train
Missing before: 1395743
Missing after: 0

🔹 Descriptive statistics after:
count    1.814537e+07
mean     2.534357e+03
std      3.341679e+03
min     -6.132068e+03
25%      0.000000e+00
50%      2.800000e+03
75%      2.800000e+03
max      5.022370e+05
Name: miniRules0_monetaryAmount, dtype: float64

📊 Dataset: Test
Missing before: 504405
Missing after: 0

🔹 Descriptive statistics after:
count    6.897776e+06
mean     2.717628e+03
std      4.008379e+03
min     -7.482827e+03
25%      0.000000e+00
50%      2.800000e+03
75%      2.800000e+03
max      2.431870e+05
Name: miniRules0_monetaryAmount, dtype: float64

⏱️ Duration: 548.26 seconds

In [None]:
%%writefile clip_negative_values.py
import pandas as pd

# Column to sanitize. The regressor used for imputation is unbounded, so
# imputed amounts can come out negative.
target_col = "miniRules0_monetaryAmount"

# Load existing files
train = pd.read_csv("train.csv", low_memory=False)
test = pd.read_csv("test.csv", low_memory=False)

# Replace negative values with 0 (applies to every row of the column)
train[target_col] = train[target_col].clip(lower=0)
test[target_col] = test[target_col].clip(lower=0)

# Save updated files
train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)

# Save post-transformation statistics
# Explicit UTF-8: the summary contains emoji, which cannot be encoded when the
# platform default encoding is not UTF-8.
with open("clip_summary.txt", "w", encoding="utf-8") as f:
    f.write("✅ Applied clip to miniRules0_monetaryAmount (values < 0 set to 0)\n")
    f.write("\n🔹 Train statistics:\n")
    f.write(train[target_col].describe().to_string())
    f.write("\n\n🔹 Test statistics:\n")
    f.write(test[target_col].describe().to_string())

print("✅ Negative values clipped. Updated files: train.csv, test.csv. Summary saved in clip_summary.txt.")


In [None]:
✅ Applied clip to miniRules0_monetaryAmount (values < 0 set to 0)

🔹 Train statistics:
count    1.814537e+07
mean     2.535514e+03
std      3.340498e+03
min      0.000000e+00
25%      0.000000e+00
50%      2.800000e+03
75%      2.800000e+03
max      5.022370e+05

🔹 Test statistics:
count    6.897776e+06
mean     2.719365e+03
std      4.006758e+03
min      0.000000e+00
25%      0.000000e+00
50%      2.800000e+03
75%      2.800000e+03
max      2.431870e+05

In [None]:
%%writefile corr_miniRules0_statusInfos.py
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from multiprocessing import Pool, cpu_count

# Load datasets
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Target column
target = 'miniRules0_statusInfos'
y_train = train[target]
y_test = test[target]

# Drop irrelevant columns
drop_cols = [
    'Id', 'profileId', '__index_level_0__', 'requestDate',
    'searchRoute', 'legs0_arrivalAt', 'legs0_departureAt',
    'legs1_arrivalAt', 'legs1_departureAt'
]

# Select numeric, common, non-null columns
# Keep train's column order. Iterating a set intersection yields an order
# that depends on string hashing and can change between interpreter runs,
# which makes the feature matrix layout non-reproducible.
test_columns = set(test.columns)
common_cols = [col for col in train.columns if col in test_columns]
numeric_cols = train[common_cols].select_dtypes(include=['float64', 'int64', 'bool']).columns
numeric_cols = [col for col in numeric_cols if col not in drop_cols and col != target]
# Only fully populated features are scored.
numeric_cols = [
    col for col in numeric_cols
    if train[col].isnull().sum() == 0 and test[col].isnull().sum() == 0
]

X_train = train[numeric_cols]
X_test = test[numeric_cols]

# Fisher Score calculation
def fisher_score(X, y):
    """Compute a Fisher score for every column of ``X`` against labels ``y``.

    For each feature the score is the size-weighted squared distance of the
    class means from the overall mean, divided by the size-weighted sum of
    the within-class sample variances. Rows whose label is NaN belong to no
    class but still contribute to the overall mean.

    Args:
        X: DataFrame of numeric features.
        y: Series of class labels aligned with ``X``.

    Returns:
        pandas Series indexed by column name. A feature whose weighted
        within-class variance is exactly zero gets a score of 0.
    """
    labels = np.unique(y.dropna())
    result = {}
    for name in X.columns:
        column = X[name]
        grand_mean = column.mean()
        # One slice of the feature per class, in sorted label order.
        groups = [column[y == label] for label in labels]
        between = sum(len(g) * (g.mean() - grand_mean) ** 2 for g in groups)
        within = sum(len(g) * g.var() for g in groups)
        result[name] = 0 if within == 0 else between / within
    return pd.Series(result)

# Score computation based on task ID
def compute_score(task_id):
    """Run one of the six scoring jobs and return ``(label, ranked scores)``.

    Jobs 1-3 score the train split and jobs 4-6 the test split, each with
    Pearson correlation, Fisher score and mutual information in that order.
    The module-level ``X_train``/``y_train``/``X_test``/``y_test`` are read.

    Args:
        task_id: Job number, 1 through 6.

    Returns:
        Tuple of the job label and a Series sorted from strongest to weakest,
        or ``None`` when ``task_id`` matches no job.
    """
    def pearson(features, labels):
        # Ordered by absolute correlation; the sign is kept.
        return features.corrwith(labels).sort_values(key=abs, ascending=False)

    def fisher(features, labels):
        return fisher_score(features, labels).sort_values(ascending=False)

    def mutual_information(features, labels):
        # Missing labels are replaced by 0 before estimating MI.
        estimates = mutual_info_classif(features, labels.fillna(0), random_state=0)
        return pd.Series(estimates, index=features.columns).sort_values(ascending=False)

    # Each job is evaluated lazily so only the requested one does any work.
    jobs = (
        (1, "PEARSON_TRAIN", lambda: pearson(X_train, y_train)),
        (2, "FISHER_TRAIN", lambda: fisher(X_train, y_train)),
        (3, "MI_TRAIN", lambda: mutual_information(X_train, y_train)),
        (4, "PEARSON_TEST", lambda: pearson(X_test, y_test)),
        (5, "FISHER_TEST", lambda: fisher(X_test, y_test)),
        (6, "MI_TEST", lambda: mutual_information(X_test, y_test)),
    )
    for job_id, label, run in jobs:
        if task_id == job_id:
            return label, run()
    return None

# Run feature importance calculations in parallel
if __name__ == "__main__":
    # One worker per scoring job, capped by the number of available cores.
    with Pool(processes=min(6, cpu_count())) as pool:
        results = pool.map(compute_score, [1, 2, 3, 4, 5, 6])

    # Save results to text file
    # Explicit UTF-8: the section headers contain emoji, which cannot be
    # encoded when the platform default encoding is not UTF-8.
    with open("feature_scores_miniRules0_statusInfos.txt", "w", encoding="utf-8") as f:
        for name, series in results:
            f.write(f"\n🔹 TOP 15 FEATURES BY {name.replace('_', ' ')}\n")
            f.write(series.head(15).to_string())
            f.write("\n")

    print("✅ Parallel feature score results saved to 'feature_scores_miniRules0_statusInfos.txt'")


In [None]:
%%writefile test_imputation.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import time

# === Start the timer ===
start = time.time()

# === Select relevant columns ===
selected_features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP',
    'miniRules0_monetaryAmount',
    'miniRules1_monetaryAmount'
]

target_col = 'miniRules0_statusInfos'
cols_needed = selected_features + [target_col]

# === Load the data ===
# Only the columns used below are parsed; the previous version loaded the
# whole table and then deep-copied it, although nothing else was ever read.
train_original = pd.read_csv("train.csv", usecols=cols_needed, low_memory=False)

# === Filter complete rows for training ===
df_full = train_original[cols_needed].dropna(subset=cols_needed)

# === Randomly select 1,500,000 rows for validation ===
sample = df_full.sample(n=1500000, random_state=42)
X_valid = sample[selected_features]
y_valid = sample[target_col].astype(int)

# === Remove those rows from training data ===
df_train = df_full.drop(sample.index, errors='ignore')
X_train = df_train[selected_features]
y_train = df_train[target_col].astype(int)

# === Train a classification model ===
# `use_label_encoder` is omitted: the installed XGBoost reports it as an
# unused parameter and emits a warning when it is passed.
model = XGBClassifier(
    n_estimators=500,
    max_depth=12,
    learning_rate=0.1,
    tree_method='hist',
    n_jobs=24,
    random_state=42,
    eval_metric='logloss'
)
model.fit(X_train, y_train)

# === Predict on the validation set ===
predicted = model.predict(X_valid)

# === Compute evaluation metrics ===
acc = accuracy_score(y_valid, predicted)
f1 = f1_score(y_valid, predicted)
cm = confusion_matrix(y_valid, predicted)
duration = round(time.time() - start, 2)

# === Save results to file ===
# The report contains non-ASCII symbols, so the encoding is set explicitly.
with open("imputation_results.txt", "w", encoding="utf-8") as f:
    f.write("✅ Direct prediction results (XGBClassifier):\n")
    f.write(f"Accuracy: {acc:.4f}\n")
    f.write(f"F1 Score: {f1:.4f}\n")
    f.write("Confusion Matrix:\n")
    f.write(np.array2string(cm))
    f.write(f"\n⏱️ Duration: {duration} seconds\n")

# === Print to console ===
print("✅ Results saved to imputation_results.txt")


✅ Direct prediction results (XGBClassifier):

Accuracy: 0.9999

F1 Score: 1.0000

Confusion Matrix:

[[  38388      40]

 [     47 1461525]]

⏱️ Duration: 211.24 seconds

In [None]:
%%writefile test_imputation.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import time

# === Start the timer ===
start = time.time()

# === Select relevant columns ===
# Same experiment as before but without the two monetaryAmount features.
selected_features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP'
]

target_col = 'miniRules0_statusInfos'
cols_needed = selected_features + [target_col]

# === Load the data ===
# Only the columns used below are parsed; the previous version loaded the
# whole table and then deep-copied it, although nothing else was ever read.
train_original = pd.read_csv("train.csv", usecols=cols_needed, low_memory=False)

# === Filter complete rows for training ===
df_full = train_original[cols_needed].dropna(subset=cols_needed)

# === Randomly select 1,500,000 rows for validation ===
sample = df_full.sample(n=1500000, random_state=42)
X_valid = sample[selected_features]
y_valid = sample[target_col].astype(int)

# === Remove these rows from the training data ===
df_train = df_full.drop(sample.index, errors='ignore')
X_train = df_train[selected_features]
y_train = df_train[target_col].astype(int)

# === Train a classification model ===
# `use_label_encoder` is omitted: the installed XGBoost reports it as an
# unused parameter and emits a warning when it is passed.
model = XGBClassifier(
    n_estimators=500,
    max_depth=12,
    learning_rate=0.1,
    tree_method='hist',
    n_jobs=24,
    random_state=42,
    eval_metric='logloss'
)
model.fit(X_train, y_train)

# === Predict on the validation set ===
predicted = model.predict(X_valid)

# === Compute evaluation metrics ===
acc = accuracy_score(y_valid, predicted)
f1 = f1_score(y_valid, predicted)
cm = confusion_matrix(y_valid, predicted)
duration = round(time.time() - start, 2)

# === Save results to file ===
# The report contains non-ASCII symbols, so the encoding is set explicitly.
with open("imputation_results.txt", "w", encoding="utf-8") as f:
    f.write("✅ Direct prediction results (XGBClassifier):\n")
    f.write(f"Accuracy: {acc:.4f}\n")
    f.write(f"F1 Score: {f1:.4f}\n")
    f.write("Confusion Matrix:\n")
    f.write(np.array2string(cm))
    f.write(f"\n⏱️ Duration: {duration} seconds\n")

# === Also print to console ===
print("✅ Results saved to imputation_results.txt")


✅ Direct prediction results (XGBClassifier):

Accuracy: 0.9996

F1 Score: 0.9998

Confusion Matrix:

[[  38090     338]

 [    312 1461260]]

⏱️ Duration: 231.31 seconds

In [None]:
%%writefile missing_miniRules_statusInfo.py
import pandas as pd

# Load existing files
train = pd.read_csv("train.csv", low_memory=False)
test = pd.read_csv("test.csv", low_memory=False)

# Create binary indicator columns: 1 where the status info is missing,
# 0 where it is present. The previous version used notna(), which stored the
# opposite of what the "*_was_missing" column name says.
for frame in (train, test):
    for source_col in ("miniRules0_statusInfos", "miniRules1_statusInfos"):
        frame[f"{source_col}_was_missing"] = frame[source_col].isna().astype(int)

# Save updated files
train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)

print("✅ Columns *_was_missing have been added to 'train.csv' and 'test.csv'.")


In [None]:
%%writefile impute_miniRules0_statusInfos.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import time

# === Timer ===
start = time.time()

# === Load files ===
train = pd.read_csv("train.csv", low_memory=False)
test = pd.read_csv("test.csv", low_memory=False)

# === Relevant features ===
features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP',
    'miniRules0_monetaryAmount',
    'miniRules1_monetaryAmount'
]
target = 'miniRules0_statusInfos'

# === Train on complete rows from train set ===
# Rows with a missing feature or a missing target are excluded from fitting.
train_valid = train.dropna(subset=features + [target])
X_train = train_valid[features]
y_train = train_valid[target].astype(int)

# === Train the model ===
# `use_label_encoder` is omitted: the installed XGBoost reports it as an
# unused parameter and emits a warning when it is passed.
model = XGBClassifier(
    n_estimators=500,
    max_depth=12,
    learning_rate=0.1,
    tree_method='hist',
    n_jobs=24,
    random_state=42,
    eval_metric='logloss'
)
model.fit(X_train, y_train)

# === Imputation function ===
def impute_column(df, name="train"):
    """Fill missing ``target`` values in ``df`` with predictions from ``model``.

    Rows where ``df[target]`` is NaN are predicted from the module-level
    ``features`` columns and written back into ``df`` in place. ``name`` is
    only used to label the progress message. Returns ``df``.
    """
    to_fill = df[target].isna()
    n_missing = to_fill.sum()

    if n_missing != 0:
        df.loc[to_fill, target] = model.predict(df.loc[to_fill, features])
        print(f"✅ {name}: Imputed {n_missing} missing values in '{target}'")
    else:
        print(f"✅ {name}: No missing values in '{target}'")
    return df

# === Perform imputation ===
train, test = impute_column(train, "train"), impute_column(test, "test")

# === Overwrite updated CSV files ===
for frame, path in ((train, "train.csv"), (test, "test.csv")):
    frame.to_csv(path, index=False)

# === Post-imputation statistics ===
def report(df, name):
    """Print size, remaining missing count and value distribution of ``target``."""
    column = df[target]
    n_rows = len(df)
    n_missing = column.isna().sum()
    distribution = column.value_counts(dropna=False).sort_index()
    print(f"\n📊 {name} — '{target}':")
    print(f"  Total rows: {n_rows}")
    print(f"  Missing: {n_missing} ({100 * n_missing / n_rows:.2f}%)")
    print("  Value counts (including imputed):")
    print(distribution)

for frame, label in ((train, "train"), (test, "test")):
    report(frame, label)

# === Done ===
elapsed = round(time.time() - start, 2)
print(f"\n⏱️ Total duration: {elapsed} seconds")


In [None]:
%%writefile impute_miniRules0_statusInfos.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import time

# === Timer ===
start = time.time()

# === Load files ===
train = pd.read_csv("train.csv", low_memory=False)
test = pd.read_csv("test.csv", low_memory=False)

# === Relevant features ===
features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP',
    'miniRules0_monetaryAmount',
    'miniRules1_monetaryAmount'
]
target = 'miniRules0_statusInfos'

# === Train on complete rows from train set ===
# Rows with a missing feature or a missing target are excluded from fitting.
train_valid = train.dropna(subset=features + [target])
X_train = train_valid[features]
y_train = train_valid[target].astype(int)

# === Train the model ===
# `use_label_encoder` is omitted: the installed XGBoost reports it as an
# unused parameter and emits a warning when it is passed.
model = XGBClassifier(
    n_estimators=500,
    max_depth=12,
    learning_rate=0.1,
    tree_method='hist',
    n_jobs=24,
    random_state=42,
    eval_metric='logloss'
)
model.fit(X_train, y_train)

# === Imputation function ===
def impute_column(df, name="train"):
    """Fill missing ``target`` values in ``df`` with predictions from ``model``.

    Rows where ``df[target]`` is NaN are predicted from the module-level
    ``features`` columns and written back into ``df`` in place. ``name`` is
    only used to label the progress message. Returns ``df``.
    """
    to_fill = df[target].isna()
    n_missing = to_fill.sum()

    if n_missing != 0:
        df.loc[to_fill, target] = model.predict(df.loc[to_fill, features])
        print(f"✅ {name}: Imputed {n_missing} missing values in '{target}'")
    else:
        print(f"✅ {name}: No missing values in '{target}'")
    return df

# === Perform imputation ===
train, test = impute_column(train, "train"), impute_column(test, "test")

# === Overwrite updated CSV files ===
for frame, path in ((train, "train.csv"), (test, "test.csv")):
    frame.to_csv(path, index=False)

# === Post-imputation statistics ===
def report(df, name):
    """Print size, remaining missing count and value distribution of ``target``."""
    column = df[target]
    n_rows = len(df)
    n_missing = column.isna().sum()
    distribution = column.value_counts(dropna=False).sort_index()
    print(f"\n📊 {name} — '{target}':")
    print(f"  Total rows: {n_rows}")
    print(f"  Missing: {n_missing} ({100 * n_missing / n_rows:.2f}%)")
    print("  Value counts (including imputed):")
    print(distribution)

for frame, label in ((train, "train"), (test, "test")):
    report(frame, label)

# === Done ===
elapsed = round(time.time() - start, 2)
print(f"\n⏱️ Total duration: {elapsed} seconds")


In [None]:
%%writefile test_imputation.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import time

# === Start the timer ===
start = time.time()

# === Select relevant columns ===
selected_features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP',
    'miniRules0_monetaryAmount',
    'miniRules1_monetaryAmount'
]

target_col = 'miniRules1_statusInfos'
cols_needed = selected_features + [target_col]

# === Load the data ===
# Only the columns used below are parsed; the previous version loaded the
# whole table and then deep-copied it, although nothing else was ever read.
train_original = pd.read_csv("train.csv", usecols=cols_needed, low_memory=False)

# === Filter complete rows for training ===
df_full = train_original[cols_needed].dropna(subset=cols_needed)

# === Randomly select 1,500,000 rows for validation ===
sample = df_full.sample(n=1500000, random_state=42)
X_valid = sample[selected_features]
y_valid = sample[target_col].astype(int)

# === Remove these rows from training data ===
df_train = df_full.drop(sample.index, errors='ignore')
X_train = df_train[selected_features]
y_train = df_train[target_col].astype(int)

# === Train a classification model ===
# `use_label_encoder` is omitted: the installed XGBoost reports it as an
# unused parameter and emits a warning when it is passed.
model = XGBClassifier(
    n_estimators=500,
    max_depth=12,
    learning_rate=0.1,
    tree_method='hist',
    n_jobs=24,
    random_state=42,
    eval_metric='logloss'
)
model.fit(X_train, y_train)

# === Predict the held-out validation rows ===
predicted = model.predict(X_valid)

# === Compute metrics ===
acc = accuracy_score(y_valid, predicted)
f1 = f1_score(y_valid, predicted)
cm = confusion_matrix(y_valid, predicted)
duration = round(time.time() - start, 2)

# === Save results to file ===
# The report contains non-ASCII symbols, so the encoding is set explicitly.
with open("imputation_results.txt", "w", encoding="utf-8") as f:
    f.write("✅ Direct prediction results (XGBClassifier):\n")
    f.write(f"Accuracy: {acc:.4f}\n")
    f.write(f"F1 Score: {f1:.4f}\n")
    f.write("Confusion Matrix:\n")
    f.write(np.array2string(cm))
    f.write(f"\n⏱️ Duration: {duration} seconds\n")

# === Also print to console ===
print("✅ Results saved to imputation_results.txt")


✅ Direct prediction results (XGBClassifier):

Accuracy: 0.9999

F1 Score: 0.9999

Confusion Matrix:

[[629422     65]

 [    69 870444]]

⏱️ Duration: 217.45 seconds

In [None]:
%%writefile impute_miniRules1_statusInfos.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
import time

# === Timer ===
start = time.time()

# === Load the files ===
train = pd.read_csv("train.csv", low_memory=False)
test = pd.read_csv("test.csv", low_memory=False)

# === Relevant columns ===
features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP',
    'miniRules0_monetaryAmount',
    'miniRules1_monetaryAmount'
]
target = 'miniRules1_statusInfos'

# === Train on complete rows from train ===
# Rows with a missing feature or a missing target are excluded from fitting.
train_valid = train.dropna(subset=features + [target])
X_train = train_valid[features]
y_train = train_valid[target].astype(int)

# === Train the model ===
# `use_label_encoder` is omitted: the recorded run of this script shows
# XGBoost warning that the parameter is not used.
model = XGBClassifier(
    n_estimators=500,
    max_depth=12,
    learning_rate=0.1,
    tree_method='hist',
    n_jobs=24,
    random_state=42,
    eval_metric='logloss'
)
model.fit(X_train, y_train)

# === Imputation function ===
def impute_column(df, name="train"):
    """Fill missing ``target`` values in ``df`` with predictions from ``model``.

    Rows where ``df[target]`` is NaN are predicted from the module-level
    ``features`` columns and written back into ``df`` in place. ``name`` is
    only used to label the progress message. Returns ``df``.
    """
    to_fill = df[target].isna()
    n_missing = to_fill.sum()

    if n_missing != 0:
        df.loc[to_fill, target] = model.predict(df.loc[to_fill, features])
        print(f"✅ {name}: Imputed {n_missing} missing values in '{target}'")
    else:
        print(f"✅ {name}: No missing values in '{target}'")
    return df

# === Perform imputation ===
train, test = impute_column(train, "train"), impute_column(test, "test")

# === Overwrite the CSV files ===
for frame, path in ((train, "train.csv"), (test, "test.csv")):
    frame.to_csv(path, index=False)

# === Post-imputation statistics ===
def report(df, name):
    """Print size, remaining missing count and value distribution of ``target``."""
    column = df[target]
    n_rows = len(df)
    n_missing = column.isna().sum()
    distribution = column.value_counts(dropna=False).sort_index()
    print(f"\n📊 {name} — '{target}':")
    print(f"  Total rows: {n_rows}")
    print(f"  Missing: {n_missing} ({100 * n_missing / n_rows:.2f}%)")
    print("  Value counts (including imputed):")
    print(distribution)

for frame, label in ((train, "train"), (test, "test")):
    report(frame, label)

# === Done ===
elapsed = round(time.time() - start, 2)
print(f"\n⏱️ Total duration: {elapsed} seconds")


In [None]:
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
✅ train: Imputed 1518169 missing values in 'miniRules1_statusInfos'
✅ test: Imputed 574432 missing values in 'miniRules1_statusInfos'

📊 train — 'miniRules1_statusInfos':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Value counts (including imputed):
miniRules1_statusInfos
0.0     7832405
1.0    10312967
Name: count, dtype: int64

📊 test — 'miniRules1_statusInfos':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Value counts (including imputed):
miniRules1_statusInfos
0.0    3173816
1.0    3723960
Name: count, dtype: int64

⏱️ Total duration: 518.28 seconds

In [None]:
%%writefile corr_legs1_segments0_flightNumber.py
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from scipy.stats import spearmanr
from multiprocessing import Pool, cpu_count

# === Load the data ===
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# === Define the target ===
target = 'legs1_segments0_flightNumber'
y_train = train[target]
y_test = test[target]

# === Drop irrelevant columns ===
drop_cols = [
    'Id', 'profileId', '__index_level_0__', 'requestDate',
    'searchRoute', 'legs0_arrivalAt', 'legs0_departureAt',
    'legs1_arrivalAt', 'legs1_departureAt'
]

# === Select common, clean numeric columns ===
# Walk train's columns in file order instead of intersecting two sets: the
# iteration order of a set of strings can change between interpreter runs,
# which would reshuffle the feature matrix from one run to the next.
test_columns = set(test.columns)
excluded = set(drop_cols) | {target}
common_cols = [col for col in train.columns if col in test_columns]
numeric_cols = train[common_cols].select_dtypes(include=['float64', 'int64', 'bool']).columns
# Keep only features that are fully populated in both splits.
numeric_cols = [
    col for col in numeric_cols
    if col not in excluded
    and train[col].isnull().sum() == 0
    and test[col].isnull().sum() == 0
]

X_train = train[numeric_cols]
X_test = test[numeric_cols]

# === Fisher Score ===
def fisher_score(X, y):
    """Compute a Fisher score for every column of ``X`` against labels ``y``.

    For each feature the score is the class-size-weighted squared distance
    of the per-class means from the overall mean, divided by the
    class-size-weighted sum of the per-class sample variances.

    Rows whose label is missing belong to no class (they still count towards
    the overall mean). A class with a single row has no defined sample
    variance; it contributes nothing to the denominator instead of turning
    the whole score into NaN. A zero denominator yields a score of 0.

    Args:
        X: DataFrame of numeric (or boolean) feature columns.
        y: Labels, one per row of ``X``, matched by position.

    Returns:
        ``pd.Series`` of scores indexed by column name.
    """
    labels = np.asarray(y)
    scores = {}
    for col in X.columns:
        x = X[col].astype(float)
        mean_overall = x.mean()
        # One grouped pass per feature instead of one boolean mask per class;
        # groupby leaves out rows whose label is missing.
        by_class = x.groupby(labels)
        sizes = by_class.size()
        between = (by_class.mean() - mean_overall) ** 2
        within = by_class.var().fillna(0.0)  # single-row classes -> 0
        numerator = float((sizes * between).sum())
        denominator = float((sizes * within).sum())
        scores[col] = numerator / denominator if denominator != 0 else 0
    return pd.Series(scores)

# === Spearman correlation ===
def spearman_series(X, y):
    """Spearman rank correlation of every column of ``X`` with ``y``.

    Rows whose target is missing are left out of the computation; otherwise
    a single NaN in ``y`` makes every coefficient NaN. A column for which
    ``spearmanr`` raises an error is given a score of 0.

    Args:
        X: DataFrame of feature columns.
        y: Target values, one per row of ``X``, matched by position.

    Returns:
        ``pd.Series`` of correlation coefficients indexed by column name.
    """
    y_values = np.asarray(y)
    keep = ~pd.isna(y_values)
    y_values = y_values[keep]

    scores = {}
    for col in X.columns:
        try:
            rho, _ = spearmanr(X[col].to_numpy()[keep], y_values)
        except Exception:
            # Best effort: an unscorable column ranks last instead of
            # aborting the run. KeyboardInterrupt/SystemExit still propagate.
            scores[col] = 0
        else:
            scores[col] = rho
    return pd.Series(scores)

# === Compute scores ===
def compute_score(task_id):
    """Compute one feature ranking, selected by ``task_id``.

    IDs 1-3 (train) and 4-6 (test) use Pearson correlation, Fisher score and
    mutual information in that order; 7 and 8 use Spearman correlation on
    the train and test split respectively.

    Returns a ``(label, ranked_series)`` tuple, or ``None`` when ``task_id``
    matches none of the known IDs.
    """
    def _pearson(X, y):
        return X.corrwith(y).sort_values(key=abs, ascending=False)

    def _fisher(X, y):
        return fisher_score(X, y).sort_values(ascending=False)

    def _mutual_info(X, y):
        # Missing targets are replaced by 0 before the estimate is computed.
        estimates = mutual_info_classif(X, y.fillna(0), random_state=0)
        return pd.Series(estimates, index=X.columns).sort_values(ascending=False)

    def _spearman(X, y):
        return spearman_series(X, y).sort_values(key=abs, ascending=False)

    # (task id, label, scorer, score the training split?)
    plan = (
        (1, "PEARSON_TRAIN", _pearson, True),
        (2, "FISHER_TRAIN", _fisher, True),
        (3, "MI_TRAIN", _mutual_info, True),
        (4, "PEARSON_TEST", _pearson, False),
        (5, "FISHER_TEST", _fisher, False),
        (6, "MI_TEST", _mutual_info, False),
        (7, "SPEARMAN_TRAIN", _spearman, True),
        (8, "SPEARMAN_TEST", _spearman, False),
    )
    for key, label, scorer, on_train in plan:
        if task_id == key:
            # The split is resolved only for the matching task.
            X, y = (X_train, y_train) if on_train else (X_test, y_test)
            return label, scorer(X, y)
    return None

# === Run in parallel ===
if __name__ == "__main__":
    # Task IDs 1-8: the six base scores plus the two Spearman variants.
    tasks = list(range(1, 9))
    n_workers = min(len(tasks), cpu_count())
    with Pool(processes=n_workers) as pool:
        results = pool.map(compute_score, tasks)

    # One titled section per score, each holding the 15 best-ranked features.
    with open("feature_scores_legs1_segments0_flightNumber.txt", "w", encoding="utf-8") as f:
        f.writelines(
            f"\n🔹 TOP 15 FEATURES BY {name.replace('_', ' ')}\n"
            + series.head(15).to_string()
            + "\n"
            for name, series in results
        )

    print("✅ Feature score results saved to 'feature_scores_legs1_segments0_flightNumber.txt'")


In [None]:
%%writefile check_frequencies.py
import pandas as pd

# === Load the files ===
# Only the flight-number column is parsed from each file.
train_df = pd.read_csv("train.csv", usecols=['legs1_segments0_flightNumber'])
test_df = pd.read_csv("test.csv", usecols=['legs1_segments0_flightNumber'])

# === Frequencies ===
train_counts = train_df['legs1_segments0_flightNumber'].value_counts().rename("train_count")
test_counts = test_df['legs1_segments0_flightNumber'].value_counts().rename("test_count")

# === Combine frequencies into a single DataFrame ===
# A class seen in only one of the two files gets a count of 0 in the other.
freq_df = pd.concat([train_counts, test_counts], axis=1).fillna(0).astype(int)
freq_df.index.name = 'flight_number'
freq_df = freq_df.sort_values(by='train_count', ascending=False)

# === Write to text file ===
# The header contains a non-ASCII symbol, so the encoding is set explicitly
# instead of relying on the platform default.
with open("class_frequencies.txt", "w", encoding="utf-8") as f:
    f.write("📊 Class frequencies in 'train.csv' and 'test.csv'\n\n")
    f.write(f"{'Flight Number':<20}{'Train Count':<15}{'Test Count':<15}\n")
    f.write("=" * 50 + "\n")
    for idx, row in freq_df.iterrows():
        f.write(f"{str(idx):<20}{row['train_count']:<15}{row['test_count']:<15}\n")

print("✅ Class frequencies saved to 'class_frequencies.txt'")


In [None]:
%%writefile test_imputation.py
import pandas as pd
import numpy as np
import time
from collections import Counter
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

start = time.time()

# === Select columns ===
features = [
    'totalPrice', 'companyID', 'legs0_segments0_cabinClass',
    'isAccess3D', 'isVip', 'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber', 'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP', 'miniRules0_monetaryAmount',
    'miniRules1_monetaryAmount', 'miniRules0_statusInfos',
    'miniRules1_statusInfos'
]
target = 'legs1_segments0_flightNumber'

# === Load data ===
# Only the columns used below are parsed instead of the whole table.
df = pd.read_csv("train.csv", usecols=features + [target], low_memory=False)
df = df[features + [target]].dropna()

# === Sample for validation ===
valid_sample = df.sample(n=100000, random_state=42)
train_df = df.drop(index=valid_sample.index, errors='ignore')

# === Separate X and y ===
X_train = train_df[features]
X_valid = valid_sample[features]

# === Label Encoding on target ===
le = LabelEncoder()
y_train_raw = train_df[target]
y_valid_raw = valid_sample[target]

# Fit encoder only on y_train
le.fit(y_train_raw)
y_train = le.transform(y_train_raw)

# Filter validation — only classes present in y_train
mask = y_valid_raw.isin(le.classes_)
X_valid = X_valid[mask]
y_valid = y_valid_raw[mask]
y_valid = le.transform(y_valid)

# === Model ===
# The target is multi-class, so the multiclass log-loss metric is named
# ('logloss' is the binary metric). `use_label_encoder` is omitted because
# the installed XGBoost reports it as an unused parameter.
model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    tree_method='hist',
    n_jobs=12,
    random_state=42,
    eval_metric='mlogloss'
)
model.fit(X_train, y_train)

# === Predictions ===
y_pred = model.predict(X_valid)

# === Metrics ===
acc = accuracy_score(y_valid, y_pred)
f1 = f1_score(y_valid, y_pred, average='weighted')
cm_shape = confusion_matrix(y_valid, y_pred).shape
duration = round(time.time() - start, 2)

# === Output ===
# The report contains non-ASCII symbols, so the encoding is set explicitly.
with open("imputation_results.txt", "w", encoding="utf-8") as f:
    f.write("✅ Direct prediction results (XGBClassifier + LabelEncoder):\n")
    f.write(f"Train examples: {len(X_train)}\n")
    f.write(f"Validation examples: {len(X_valid)}\n")
    f.write(f"Accuracy: {acc:.4f}\n")
    f.write(f"Weighted F1 Score: {f1:.4f}\n")
    f.write(f"Confusion Matrix shape: {cm_shape}\n")
    top_preds = Counter(y_pred).most_common(10)
    f.write("Top 10 predicted labels (encoded):\n")
    for cls, count in top_preds:
        # Map the encoded class back to its raw label for readability.
        original_label = le.inverse_transform([cls])[0]
        f.write(f"{cls} (raw: {original_label}): {count} predictions\n")
    f.write(f"\n⏱️ Duration: {duration} seconds\n")

print("✅ Results saved to imputation_results.txt")


In [None]:
%%writefile summary_folder.py
import os
import pandas as pd

# === Config ===
# Columns for which a per-dataset summary file is produced.
columns_to_analyze = [
    "legs1_segments0_duration",
    "legs1_segments0_aircraft_code",
    "legs1_arrivalAt",
    "legs1_departureAt",
    "legs1_segments0_marketingCarrier_code",
    "legs1_duration",
    "legs1_segments0_operatingCarrier_code",
    "legs1_segments0_arrivalTo_airport_iata",
    "legs1_segments0_departureFrom_airport_iata",
    "legs1_segments0_arrivalTo_airport_city_iata",
    "legs1_segments0_cabinClass",
    "legs1_segments0_baggageAllowance_weightMeasurementType",
    "legs1_segments0_baggageAllowance_quantity",
    "legs1_segments0_seatsAvailable",
    "corporateTariffCode",
    "frequentFlyer",
]

# Display label -> loaded table, read in this order.
dataset_paths = {"Train": "train.csv", "Test": "test.csv"}
datasets = {
    label: pd.read_csv(path, low_memory=False)
    for label, path in dataset_paths.items()
}

# === Output folder ===
output_dir = "results"
os.makedirs(output_dir, exist_ok=True)

# === Analysis function ===
def get_feature_report(df, dataset_name, feature_name):
    """Build a plain-text summary of one column of ``df``.

    The summary lists dtype, row count, missing count/percentage and the
    number of distinct non-null values. Numeric columns are followed by
    ``describe()`` statistics; other columns by their distinct non-null
    values, truncated after the first 100.

    Returns a short "not found" message when the column does not exist.
    """
    if feature_name not in df.columns:
        return f"{dataset_name}: Column '{feature_name}' not found.\n"

    series = df[feature_name]
    n_rows = len(series)
    n_missing = series.isna().sum()

    lines = [
        f"📊 Dataset: {dataset_name}",
        f"Column: {feature_name}",
        f"Data type: {series.dtype}",
        f"Total rows: {n_rows:,}",
        f"Missing values: {n_missing:,} ({n_missing / n_rows * 100:.6f}%)",
        f"Unique values (non-null): {series.nunique(dropna=True):,}",
    ]

    if not pd.api.types.is_numeric_dtype(series):
        distinct = series.dropna().unique()
        # Show max 100 unique values to avoid overload
        preview = ", ".join(str(value) for value in distinct[:100])
        if len(distinct) > 100:
            preview += ", ..."
        lines += ["\n🔹 All unique non-null values:", preview]
    else:
        lines += ["\n🔹 Descriptive statistics:", str(series.describe())]

    return "\n".join(lines)

# === Generate reports ===
# One summary file per column, holding a section for every loaded dataset.
separator = "\n" + "=" * 60 + "\n"
for feature in columns_to_analyze:
    report = []
    for name, df in datasets.items():
        report.extend([get_feature_report(df, name, feature), separator])

    output_file = os.path.join(output_dir, f"{feature}_summary.txt")
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n".join(report))
    print(f"✅ Saved: {output_file}")

In [None]:
%%writefile corr_object_categorics.py
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from multiprocessing import Pool, cpu_count
import os

# === Ensure folder for results ===
os.makedirs("results", exist_ok=True)

# === Load data ===
# Nothing in this script modifies the frames, so `train` / `test` refer to
# the loaded tables directly instead of holding a second full copy of each.
train_orig = pd.read_csv('train.csv', low_memory=False)
test_orig = pd.read_csv('test.csv', low_memory=False)
train = train_orig
test = test_orig

# === List of categorical targets ===
categorical_targets = [
    "legs1_segments0_aircraft_code",
    "legs1_segments0_marketingCarrier_code",
    "legs1_segments0_operatingCarrier_code",
    "legs1_segments0_departureFrom_airport_iata",
    "legs1_segments0_arrivalTo_airport_city_iata"
]

# === Columns to ignore ===
drop_cols = [
    'Id', 'profileId', '__index_level_0__', 'requestDate',
    'searchRoute', 'legs0_arrivalAt', 'legs0_departureAt',
    'legs1_arrivalAt', 'legs1_departureAt'
]

# === Select common, clean numeric columns ===
# Walk train's columns in file order instead of intersecting two sets: the
# iteration order of a set of strings can change between interpreter runs,
# which would reshuffle the feature matrix from one run to the next.
test_columns = set(test.columns)
common_cols = [col for col in train.columns if col in test_columns]
numeric_cols = train[common_cols].select_dtypes(include=['float64', 'int64', 'bool']).columns.tolist()
# Keep only features that are fully populated in both splits.
numeric_cols = [
    col for col in numeric_cols
    if col not in drop_cols
    and train[col].isnull().sum() == 0
    and test[col].isnull().sum() == 0
]

X_train = train[numeric_cols]
X_test = test[numeric_cols]

# === Fisher Score ===
def fisher_score(X, y):
    """Compute a Fisher score for every column of ``X`` against labels ``y``.

    For each feature the score is the class-size-weighted squared distance
    of the per-class means from the overall mean, divided by the
    class-size-weighted sum of the per-class sample variances.

    A class with a single row has no defined sample variance; it contributes
    nothing to the denominator instead of turning the whole score into NaN.
    A zero denominator yields a score of 0.

    Args:
        X: DataFrame of numeric (or boolean) feature columns.
        y: Class labels, one per row of ``X``, matched by position.

    Returns:
        ``pd.Series`` of scores indexed by column name.
    """
    labels = np.asarray(y)
    scores = {}
    for col in X.columns:
        x = X[col].astype(float)
        mean_overall = x.mean()
        # One grouped pass per feature instead of one boolean mask per class.
        by_class = x.groupby(labels)
        sizes = by_class.size()
        between = (by_class.mean() - mean_overall) ** 2
        within = by_class.var().fillna(0.0)  # single-row classes -> 0
        numerator = float((sizes * between).sum())
        denominator = float((sizes * within).sum())
        scores[col] = numerator / denominator if denominator != 0 else 0
    return pd.Series(scores)

# === Function for parallel processing ===
def process_target(target):
    """Score all numeric features against one categorical target column.

    The target is label-encoded (missing values become the class
    "MISSING"), Fisher scores and mutual information are computed on the
    train and test splits, and the top 25 features of each ranking are
    written to ``results/corr_numerics_vs_<target>.txt``. Any exception is
    caught and printed rather than propagated.
    """
    try:
        print(f"🔍 Processing: {target}")

        # Encode target: the encoder sees the labels of both splits.
        train_labels = train[target].fillna("MISSING")
        test_labels = test[target].fillna("MISSING")
        encoder = LabelEncoder().fit(pd.concat([train_labels, test_labels], axis=0))
        y_train = encoder.transform(train_labels)
        y_test = encoder.transform(test_labels)

        def _mutual_info(X, y):
            estimates = mutual_info_classif(X, y, random_state=0)
            return pd.Series(estimates, index=X.columns).sort_values(ascending=False)

        # Scores
        rankings = [
            ("FISHER (TRAIN)", fisher_score(X_train, y_train).sort_values(ascending=False)),
            ("FISHER (TEST)", fisher_score(X_test, y_test).sort_values(ascending=False)),
            ("MUTUAL INFO (TRAIN)", _mutual_info(X_train, y_train)),
            ("MUTUAL INFO (TEST)", _mutual_info(X_test, y_test)),
        ]

        # Save results: a header line, then the sections separated by blank lines.
        sections = [
            f"📊 TOP 25 FEATURES BY {title}\n" + ranking.head(25).to_string()
            for title, ranking in rankings
        ]
        file_path = f"results/corr_numerics_vs_{target}.txt"
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(f"🔹 TARGET: {target}\n\n" + "\n\n".join(sections))

        print(f"✅ Saved: {file_path}")

    except Exception as e:
        print(f"❌ Error with {target}: {e}")

# === Parallel execution ===
if __name__ == "__main__":
    # One worker per target, capped by the number of CPU cores.
    n_workers = min(cpu_count(), len(categorical_targets))
    with Pool(processes=n_workers) as pool:
        pool.map(process_target, categorical_targets)


In [None]:
%%writefile test_imputation.py
import pandas as pd
import numpy as np
import time
from collections import Counter
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

start = time.time()

# === Select columns ===
features = [
    'totalPrice', 'companyID', 'legs0_segments0_cabinClass',
    'isAccess3D', 'isVip', 'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber', 'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP', 'miniRules0_monetaryAmount',
    'miniRules1_monetaryAmount', 'miniRules0_statusInfos',
    'miniRules1_statusInfos'
]
target = 'legs1_segments0_aircraft_code'

# === Load the data ===
# Only the columns used below are parsed instead of the whole table.
df = pd.read_csv("train.csv", usecols=features + [target], low_memory=False)
df = df[features + [target]].dropna()

# === Validation sample ===
valid_sample = df.sample(n=1_000_000, random_state=42)
train_df = df.drop(index=valid_sample.index, errors='ignore')

# === Split X and y ===
X_train = train_df[features]
X_valid = valid_sample[features]

# === Label Encoding on target ===
le = LabelEncoder()
y_train_raw = train_df[target]
y_valid_raw = valid_sample[target]

# Fit encoder only on y_train
le.fit(y_train_raw)
y_train = le.transform(y_train_raw)

# Filter validation — only classes present in y_train
mask = y_valid_raw.isin(le.classes_)
X_valid = X_valid[mask]
y_valid = y_valid_raw[mask]
y_valid = le.transform(y_valid)

# === Model ===
# The target is multi-class, so the multiclass log-loss metric is named
# ('logloss' is the binary metric). `use_label_encoder` is omitted because
# the installed XGBoost reports it as an unused parameter.
model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    tree_method='hist',
    n_jobs=12,
    random_state=42,
    eval_metric='mlogloss'
)
model.fit(X_train, y_train)

# === Predictions ===
y_pred = model.predict(X_valid)

# === Metrics ===
acc = accuracy_score(y_valid, y_pred)
f1 = f1_score(y_valid, y_pred, average='weighted')
cm_shape = confusion_matrix(y_valid, y_pred).shape
duration = round(time.time() - start, 2)

# === Output ===
# The report contains non-ASCII symbols, so the encoding is set explicitly.
with open("imputation_results.txt", "w", encoding="utf-8") as f:
    f.write("✅ Direct prediction results (XGBClassifier + LabelEncoder):\n")
    f.write(f"Train examples: {len(X_train)}\n")
    f.write(f"Validation examples: {len(X_valid)}\n")
    f.write(f"Accuracy: {acc:.4f}\n")
    f.write(f"Weighted F1 Score: {f1:.4f}\n")
    f.write(f"Confusion Matrix shape: {cm_shape}\n")
    top_preds = Counter(y_pred).most_common(10)
    f.write("Top 10 predicted labels (encoded):\n")
    for cls, count in top_preds:
        # Map the encoded class back to its raw label for readability.
        original_label = le.inverse_transform([cls])[0]
        f.write(f"{cls} (raw: {original_label}): {count} predictions\n")
    f.write(f"\n⏱️ Duration: {duration} seconds\n")

print("✅ Results saved to imputation_results.txt")


In [None]:
✅ Direct prediction results (XGBClassifier + LabelEncoder):
Train examples: 12758171
Validation examples: 1000000
Accuracy: 0.6711
Weighted F1 Score: 0.6538
Confusion Matrix shape: (89, 89)
Top 10 predicted labels (encoded):
87 (raw: SU9): 428925 predictions
10 (raw: 32A): 140160 predictions
11 (raw: 32B): 115466 predictions
8 (raw: 320): 81614 predictions
34 (raw: 73H): 72774 predictions
9 (raw: 321): 25888 predictions
75 (raw: E70): 22667 predictions
30 (raw: 738): 21992 predictions
51 (raw: 77W): 14813 predictions
18 (raw: 333): 14485 predictions

⏱️ Duration: 1312.76 seconds

In [None]:
%%writefile test_imputation.py
import pandas as pd
import numpy as np
import time
from collections import Counter
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

start = time.time()

# Number of fully observed rows held out to score the imputation model.
VALID_SIZE = 1_000_000

# === Load the data ===
df = pd.read_csv("train.csv", low_memory=False)

# === Select columns ===
features = [
    'totalPrice', 'companyID', 'legs0_segments0_cabinClass',
    'isAccess3D', 'isVip', 'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber', 'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP', 'miniRules0_monetaryAmount',
    'miniRules1_monetaryAmount', 'miniRules0_statusInfos',
    'miniRules1_statusInfos'
]
target = 'legs1_segments0_aircraft_code'
# Keep only rows where every feature and the target are observed.
df = df[features + [target]].dropna()

# Fail early with a clear message instead of an opaque sampling/fit error
# when there are not enough complete rows for both splits.
if len(df) <= VALID_SIZE:
    raise ValueError(
        f"Need more than {VALID_SIZE} complete rows for a train/validation "
        f"split, found {len(df)}."
    )

# === Sample for validation ===
valid_sample = df.sample(n=VALID_SIZE, random_state=42)
train_df = df.drop(index=valid_sample.index, errors='ignore')

# === Split X and y ===
X_train = train_df[features]
X_valid = valid_sample[features]

# === Label Encoding for the target ===
le = LabelEncoder()
y_train_raw = train_df[target]
y_valid_raw = valid_sample[target]

# Fit encoder only on y_train so the class ids match what the model learns
le.fit(y_train_raw)
y_train = le.transform(y_train_raw)

# Filter validation — only keep classes present in y_train
# (le.transform would raise on labels it has never seen)
mask = y_valid_raw.isin(le.classes_)
X_valid = X_valid[mask]
y_valid = y_valid_raw[mask]
y_valid = le.transform(y_valid)

# === Model ===
# `use_label_encoder` was dropped: the installed XGBoost ignores it and only
# emits a "Parameters ... are not used" warning.
# 'mlogloss' is the multi-class log loss; 'logloss' is the binary metric.
model = XGBClassifier(
    n_estimators=500,
    max_depth=12,
    learning_rate=0.1,
    tree_method='hist',
    n_jobs=24,
    random_state=42,
    eval_metric='mlogloss'
)
model.fit(X_train, y_train)

# === Predictions ===
y_pred = model.predict(X_valid)

# === Metrics ===
acc = accuracy_score(y_valid, y_pred)
f1 = f1_score(y_valid, y_pred, average='weighted')
cm_shape = confusion_matrix(y_valid, y_pred).shape
duration = round(time.time() - start, 2)

# === Output ===
# Explicit UTF-8: the report contains emoji, which the platform default
# encoding (e.g. cp1252 on Windows) cannot represent.
with open("imputation_results.txt", "w", encoding="utf-8") as f:
    f.write("✅ Direct prediction results (XGBClassifier + LabelEncoder):\n")
    f.write(f"Train examples: {len(X_train)}\n")
    f.write(f"Validation examples: {len(X_valid)}\n")
    f.write(f"Accuracy: {acc:.4f}\n")
    f.write(f"Weighted F1 Score: {f1:.4f}\n")
    f.write(f"Confusion Matrix shape: {cm_shape}\n")
    top_preds = Counter(y_pred).most_common(10)
    f.write("Top 10 predicted labels (encoded):\n")
    for cls, count in top_preds:
        # classes_[i] is the raw label that LabelEncoder mapped to id i
        original_label = le.classes_[cls]
        f.write(f"{cls} (raw: {original_label}): {count} predictions\n")
    f.write(f"\n⏱️ Duration: {duration} seconds\n")

print("✅ Results saved to imputation_results.txt")


In [None]:
✅ Direct prediction results (XGBClassifier + LabelEncoder):
Train examples: 12758171
Validation examples: 1000000
Accuracy: 0.7599
Weighted F1 Score: 0.7537
Confusion Matrix shape: (90, 90)
Top 10 predicted labels (encoded):
87 (raw: SU9): 415296 predictions
10 (raw: 32A): 137331 predictions
11 (raw: 32B): 114387 predictions
8 (raw: 320): 81964 predictions
34 (raw: 73H): 70497 predictions
9 (raw: 321): 31346 predictions
30 (raw: 738): 19720 predictions
75 (raw: E70): 16976 predictions
51 (raw: 77W): 14871 predictions
6 (raw: 319): 14477 predictions

⏱️ Duration: 8050.71 seconds

In [None]:
%%writefile add_column_legs1_segments0_aircraft_code_was_missing.py
import pandas as pd

# === Load the files ===
train_df = pd.read_csv("train.csv", low_memory=False)
test_df = pd.read_csv("test.csv", low_memory=False)

# === Target column ===
target_col = 'legs1_segments0_aircraft_code'
flag_col = 'legs1_segments0_aircraft_code_was_missing'

# === Add flag in TRAIN and TEST ===
# The flag is named "*_was_missing", so 1 must mean "the value was missing".
# The previous notna() produced the opposite polarity (1 = value present).
# NOTE(review): any consumer written against the old polarity must be flipped.
for frame in (train_df, test_df):
    frame[flag_col] = frame[target_col].isna().astype(int)

# === Overwrite original files ===
train_df.to_csv("train.csv", index=False)
test_df.to_csv("test.csv", index=False)

# === Create report file ===
# Human-readable names for the two flag values in the report.
flag_labels = {1: "Values missing", 0: "Values present"}

# Explicit UTF-8: the report contains emoji.
with open("missing_flag_info.txt", "w", encoding="utf-8") as f:
    f.write("🛫 Flag: legs1_segments0_aircraft_code_was_missing\n\n")

    f.write("📊 TRAIN:\n")
    f.write(train_df[flag_col].value_counts().rename(flag_labels).to_string())
    f.write("\n\n")

    f.write("📊 TEST:\n")
    f.write(test_df[flag_col].value_counts().rename(flag_labels).to_string())
    f.write("\n")

print("✅ Column has been added to train.csv and test.csv.")
print("📝 Details saved in missing_flag_info.txt.")


In [None]:
%%writefile impute_legs1_segments0_aircraft_code.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
import time

# === Timer ===
start = time.time()

# === Load files ===
train = pd.read_csv("train.csv", low_memory=False)
test = pd.read_csv("test.csv", low_memory=False)

# === Relevant columns ===
features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP',
    'miniRules0_monetaryAmount',
    'miniRules1_monetaryAmount',
    'miniRules0_statusInfos',
    'miniRules1_statusInfos'
]
target = 'legs1_segments0_aircraft_code'

# === Train on complete rows from train ===
# Select the needed columns before dropna() so only those columns are copied,
# not the whole frame; the surviving rows are the same as with subset=.
train_valid = train[features + [target]].dropna()
X_train = train_valid[features]
y_train_raw = train_valid[target]

# === Label encode target ===
le = LabelEncoder()
y_train = le.fit_transform(y_train_raw)

# === Model ===
# `use_label_encoder` was dropped: the installed XGBoost ignores it and only
# emits a "Parameters ... are not used" warning.
# 'mlogloss' is the multi-class log loss; 'logloss' is the binary metric.
model = XGBClassifier(
    n_estimators=500,
    max_depth=12,
    learning_rate=0.1,
    tree_method='hist',
    n_jobs=24,
    random_state=42,
    eval_metric='mlogloss'
)
model.fit(X_train, y_train)

# === Model-based fill of the target column ===
def impute_column(df, name="train"):
    """Fill missing `target` values in `df` with the fitted model's predictions.

    Only rows where `target` is missing and every column in `features` is
    present are filled; other rows are left as they are. `df` is modified
    in place and also returned. `name` is used only in the printed messages.
    """
    is_missing = df[target].isna()
    if not is_missing.any():
        print(f"✅ {name}: No missing values in '{target}'")
        return df

    # The model is only applied to rows with a complete feature vector.
    candidates = df.loc[is_missing, features].dropna()
    if candidates.empty:
        print(f"⚠️ {name}: No imputable rows with all required features.")
        return df

    row_ids = candidates.index
    # Predictions are encoded class ids; map them back to the raw labels.
    df.loc[row_ids, target] = le.inverse_transform(model.predict(candidates))
    print(f"✅ {name}: Imputed {len(row_ids)} missing values in '{target}'")
    return df

# === Fill the gaps in both datasets (train first, then test) ===
train, test = impute_column(train, "train"), impute_column(test, "test")

# === Write the results back over the input files ===
for frame, path in ((train, "train.csv"), (test, "test.csv")):
    frame.to_csv(path, index=False)

# === Post-imputation stats ===
def report(df, name):
    """Print row count, remaining missing values and the 10 most frequent
    values of the module-level `target` column in `df`.

    `name` is only used to label the printed section.
    """
    total = len(df)
    missing = df[target].isna().sum()
    # value_counts() already orders by frequency (descending). Sorting by
    # index first would print the alphabetically first 10 labels rather
    # than the "Top 10" announced below.
    value_counts = df[target].value_counts(dropna=False)
    print(f"\n📊 {name} — '{target}':")
    print(f"  Total rows: {total}")
    print(f"  Missing: {missing} ({100 * missing / total:.2f}%)")
    print("  Top 10 value counts (including imputations):")
    print(value_counts.head(10))

# Summarise the target column of each dataset after imputation.
for frame, label in ((train, "train"), (test, "test")):
    report(frame, label)

# === Elapsed wall-clock time ===
elapsed = round(time.time() - start, 2)
print(f"\n⏱️ Total duration: {elapsed} seconds")


In [None]:
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
✅ train: Imputed 4387201 missing values in 'legs1_segments0_aircraft_code'
✅ test: Imputed 1115840 missing values in 'legs1_segments0_aircraft_code'

📊 train — 'legs1_segments0_aircraft_code':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_segments0_aircraft_code
220         52
221        358
223       1235
290         39
295        356
318        112
319     430236
31N         20
320    1373672
321     661791
Name: count, dtype: int64

📊 test — 'legs1_segments0_aircraft_code':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_segments0_aircraft_code
0          65
220         2
221        78
223       897
290        27
295        56
318         1
319    114766
31N         8
320    542247
Name: count, dtype: int64

⏱️ Total duration: 9255.26 seconds

In [None]:
%%writefile impute_legs1_segments0_marketingCarrier_code.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
import time

# === Timer ===
start = time.time()

# === Load files ===
train = pd.read_csv("train.csv", low_memory=False)
test = pd.read_csv("test.csv", low_memory=False)

# === Relevant columns ===
features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP',
    'miniRules0_monetaryAmount',
    'miniRules1_monetaryAmount',
    'miniRules0_statusInfos',
    'miniRules1_statusInfos'
]
target = 'legs1_segments0_marketingCarrier_code'

# === Train on complete rows from train ===
# Select the needed columns before dropna() so only those columns are copied,
# not the whole frame; the surviving rows are the same as with subset=.
train_valid = train[features + [target]].dropna()
X_train = train_valid[features]
y_train_raw = train_valid[target]

# === Label encode target ===
le = LabelEncoder()
y_train = le.fit_transform(y_train_raw)

# === Model ===
# `use_label_encoder` was dropped: the installed XGBoost ignores it and only
# emits a "Parameters ... are not used" warning.
# 'mlogloss' is the multi-class log loss; 'logloss' is the binary metric.
model = XGBClassifier(
    n_estimators=500,
    max_depth=12,
    learning_rate=0.1,
    tree_method='hist',
    n_jobs=24,
    random_state=42,
    eval_metric='mlogloss'
)
model.fit(X_train, y_train)

# === Model-based fill of the target column ===
def impute_column(df, name="train"):
    """Fill missing `target` values in `df` with the fitted model's predictions.

    Only rows where `target` is missing and every column in `features` is
    present are filled; other rows are left as they are. `df` is modified
    in place and also returned. `name` is used only in the printed messages.
    """
    is_missing = df[target].isna()
    if not is_missing.any():
        print(f"✅ {name}: No missing values in '{target}'")
        return df

    # The model is only applied to rows with a complete feature vector.
    candidates = df.loc[is_missing, features].dropna()
    if candidates.empty:
        print(f"⚠️ {name}: No imputable rows with all required features.")
        return df

    row_ids = candidates.index
    # Predictions are encoded class ids; map them back to the raw labels.
    df.loc[row_ids, target] = le.inverse_transform(model.predict(candidates))
    print(f"✅ {name}: Imputed {len(row_ids)} missing values in '{target}'")
    return df

# === Fill the gaps in both datasets (train first, then test) ===
train, test = impute_column(train, "train"), impute_column(test, "test")

# === Write the results back over the input files ===
for frame, path in ((train, "train.csv"), (test, "test.csv")):
    frame.to_csv(path, index=False)

# === Post-imputation statistics ===
def report(df, name):
    """Print row count, remaining missing values and the 10 most frequent
    values of the module-level `target` column in `df`.

    `name` is only used to label the printed section.
    """
    total = len(df)
    missing = df[target].isna().sum()
    # value_counts() already orders by frequency (descending). Sorting by
    # index first would print the alphabetically first 10 labels rather
    # than the "Top 10" announced below.
    value_counts = df[target].value_counts(dropna=False)
    print(f"\n📊 {name} — '{target}':")
    print(f"  Total rows: {total}")
    print(f"  Missing: {missing} ({100 * missing / total:.2f}%)")
    print("  Top 10 value counts (including imputations):")
    print(value_counts.head(10))

# Summarise the target column of each dataset after imputation.
for frame, label in ((train, "train"), (test, "test")):
    report(frame, label)

# === Elapsed wall-clock time ===
elapsed = round(time.time() - start, 2)
print(f"\n⏱️ Total duration: {elapsed} seconds")


In [None]:
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
✅ train: Imputed 4387201 missing values in 'legs1_segments0_marketingCarrier_code'
✅ test: Imputed 1115840 missing values in 'legs1_segments0_marketingCarrier_code'

📊 train — 'legs1_segments0_marketingCarrier_code':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_segments0_marketingCarrier_code
3U      8705
4G        12
5F      2054
5G       374
5N    186241
6H         4
6R     69389
7R     17384
9B     16191
A3      1202
Name: count, dtype: int64

📊 test — 'legs1_segments0_marketingCarrier_code':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_segments0_marketingCarrier_code
3U     7057
4G        1
5F     1126
5G      395
5N    36423
6H        1
6R    27423
7R    12391
9B     4263
A3      351
Name: count, dtype: int64

⏱️ Total duration: 10785.08 seconds

In [None]:
%%writefile impute_legs1_segments0_targets.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
import time

# === Start the wall-clock timer ===
start = time.time()

# === Predictor columns fed to every per-target model ===
features = [
    'totalPrice', 'companyID', 'legs0_segments0_cabinClass',
    'isAccess3D', 'isVip', 'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber', 'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP',
    'miniRules0_monetaryAmount', 'miniRules1_monetaryAmount',
    'miniRules0_statusInfos', 'miniRules1_statusInfos',
]

# === Columns whose gaps are filled, processed in this order ===
target_columns = [
    'legs1_segments0_arrivalTo_airport_city_iata',
    'legs1_segments0_operatingCarrier_code',
    'legs1_segments0_departureFrom_airport_iata',
]

# === Read both datasets ===
train = pd.read_csv("train.csv", low_memory=False)
test = pd.read_csv("test.csv", low_memory=False)

# === Train and impute function for a target column ===
def _report_column(df, name, target):
    """Print row count, remaining missing values and the 10 most frequent values of `target`."""
    total = len(df)
    missing = df[target].isna().sum()
    # value_counts() already orders by frequency (descending). Sorting by
    # index first would print the alphabetically first 10 labels rather
    # than the "Top 10" announced below.
    value_counts = df[target].value_counts(dropna=False)
    print(f"\n📊 {name} — '{target}':")
    print(f"  Total rows: {total}")
    print(f"  Missing: {missing} ({100 * missing / total:.2f}%)")
    print("  Top 10 value counts (including imputations):")
    print(value_counts.head(10))


def impute_column(target):
    """Train a classifier for `target` and fill its missing values in train and test.

    The model is fitted on rows of the module-level `train` frame where
    `target` and every column in `features` are present. It is then used to
    fill `target` in both `train` and `test` (modified in place) for rows
    where `target` is missing and all features are present. A short summary
    is printed for each dataset. Returns None.
    """
    print(f"\n🔧 Processing target: '{target}'")

    # Select the needed columns before dropna() so only those columns are
    # copied, not the whole frame; the surviving rows are the same.
    train_valid = train[features + [target]].dropna()
    if train_valid.empty:
        print(f"❌ Skipped '{target}': No complete training data.")
        return

    frames = [(train, "train"), (test, "test")]

    # Work out which rows can be imputed BEFORE training, so the expensive
    # fit is skipped when there is nothing to fill.
    # None -> no missing values at all; empty index -> nothing imputable.
    pending = []
    for df, name in frames:
        missing_mask = df[target].isna()
        if missing_mask.any():
            pending.append((df, name, df.loc[missing_mask, features].dropna().index))
        else:
            pending.append((df, name, None))

    model = le = None
    if any(idx is not None and len(idx) > 0 for _, _, idx in pending):
        # Label Encoding
        le = LabelEncoder()
        y_train = le.fit_transform(train_valid[target])

        # Model
        model = XGBClassifier(
            n_estimators=500,
            max_depth=12,
            learning_rate=0.1,
            tree_method='hist',
            n_jobs=24,
            random_state=42,
            # 'logloss' is the binary metric; multi-class targets use 'mlogloss'.
            eval_metric='mlogloss' if len(le.classes_) > 2 else 'logloss'
        )
        model.fit(train_valid[features], y_train)

    # Impute train & test
    for df, name, idx in pending:
        if idx is None:
            print(f"✅ {name}: No missing values in '{target}'")
            continue
        if len(idx) == 0:
            print(f"⚠️ {name}: No imputable rows with all required features for '{target}'.")
            continue

        preds = model.predict(df.loc[idx, features])
        # Predictions are encoded class ids; map them back to the raw labels.
        df.loc[idx, target] = le.inverse_transform(preds)
        print(f"✅ {name}: Imputed {len(idx)} missing values in '{target}'")

    # Report
    for df, name in frames:
        _report_column(df, name, target)

# === Process the target columns one after another ===
for column in target_columns:
    impute_column(column)

# === Write the results back over the input files ===
for frame, path in ((train, "train.csv"), (test, "test.csv")):
    frame.to_csv(path, index=False)

# === Elapsed wall-clock time ===
elapsed = round(time.time() - start, 2)
print(f"\n⏱️ Total duration: {elapsed} seconds")


In [None]:
🔧 Processing target: 'legs1_segments0_arrivalTo_airport_city_iata'
✅ train: Imputed 4387237 missing values in 'legs1_segments0_arrivalTo_airport_city_iata'
✅ test: Imputed 1115846 missing values in 'legs1_segments0_arrivalTo_airport_city_iata'

📊 train — 'legs1_segments0_arrivalTo_airport_city_iata':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_segments0_arrivalTo_airport_city_iata
ABA      1103
ADA        93
ADD      1730
AER    570004
AGP         6
AKX        37
ALA      5817
ALG       377
AMD       273
AMM       263
Name: count, dtype: int64

📊 test — 'legs1_segments0_arrivalTo_airport_city_iata':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_segments0_arrivalTo_airport_city_iata
ABA      348
ADA        5
ADD      185
AER    97793
AKX       43
ALA     4641
ALG      290
AMD       79
AMM       91
AMS     1806
Name: count, dtype: int64

🔧 Processing target: 'legs1_segments0_operatingCarrier_code'
✅ train: Imputed 4387201 missing values in 'legs1_segments0_operatingCarrier_code'
✅ test: Imputed 1115840 missing values in 'legs1_segments0_operatingCarrier_code'

📊 train — 'legs1_segments0_operatingCarrier_code':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_segments0_operatingCarrier_code
2C     387
2L     339
3F    1182
3K      55
3L     217
3U    7824
4G      12
4Z    1132
5F     844
5G     373
Name: count, dtype: int64

📊 test — 'legs1_segments0_operatingCarrier_code':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_segments0_operatingCarrier_code
2C      369
2L      207
3F      731
3K        4
3L      126
3U     3921
4Z        1
5F      390
5G      389
5N    38292
Name: count, dtype: int64

🔧 Processing target: 'legs1_segments0_departureFrom_airport_iata'
✅ train: Imputed 4387206 missing values in 'legs1_segments0_departureFrom_airport_iata'
✅ test: Imputed 1115840 missing values in 'legs1_segments0_departureFrom_airport_iata'

📊 train — 'legs1_segments0_departureFrom_airport_iata':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_segments0_departureFrom_airport_iata
ABA       5126
ADB       3781
ADD        471
AEP        527
AER    1092525
AKX        366
ALA      73262
AMS       5853
ARH      27852
ARN        909
Name: count, dtype: int64

📊 test — 'legs1_segments0_departureFrom_airport_iata':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_segments0_departureFrom_airport_iata
AAR       256
ABA      1954
ADB      1302
ADD         6
AEP        17
AER    384921
AGP        44
AKX         4
ALA     32775
AMS      2467
Name: count, dtype: int64

⏱️ Total duration: 59858.45 seconds

In [None]:
%%writefile impute_multiple_targets.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
import time

# === Start the wall-clock timer ===
start = time.time()

# === Predictor columns fed to every per-target model ===
features = [
    'totalPrice', 'companyID', 'legs0_segments0_cabinClass',
    'isAccess3D', 'isVip', 'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber', 'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP',
    'miniRules0_monetaryAmount', 'miniRules1_monetaryAmount',
    'miniRules0_statusInfos', 'miniRules1_statusInfos',
]

# === Columns whose gaps are filled, processed in this order ===
target_columns = [
    'legs1_segments0_arrivalTo_airport_iata',
    'frequentFlyer',
    'legs1_segments0_seatsAvailable',
    'legs1_segments0_cabinClass',
]

# === Read both datasets ===
train = pd.read_csv("train.csv", low_memory=False)
test = pd.read_csv("test.csv", low_memory=False)

# === Function to train and impute one target column ===
def _report_column(df, name, target):
    """Print row count, remaining missing values and the 10 most frequent values of `target`."""
    total = len(df)
    missing = df[target].isna().sum()
    # value_counts() already orders by frequency (descending). Sorting by
    # index first would print the alphabetically first 10 labels rather
    # than the "Top 10" announced below.
    value_counts = df[target].value_counts(dropna=False)
    print(f"\n📊 {name} — '{target}':")
    print(f"  Total rows: {total}")
    print(f"  Missing: {missing} ({100 * missing / total:.2f}%)")
    print("  Top 10 value counts (including imputations):")
    print(value_counts.head(10))


def impute_column(target):
    """Train a classifier for `target` and fill its missing values in train and test.

    The model is fitted on rows of the module-level `train` frame where
    `target` and every column in `features` are present. It is then used to
    fill `target` in both `train` and `test` (modified in place) for rows
    where `target` is missing and all features are present. A short summary
    is printed for each dataset. Returns None.
    """
    print(f"\n🔧 Processing target: '{target}'")

    # Select the needed columns before dropna() so only those columns are
    # copied, not the whole frame; the surviving rows are the same.
    train_valid = train[features + [target]].dropna()
    if train_valid.empty:
        print(f"❌ Skipped '{target}': No complete training data.")
        return

    frames = [(train, "train"), (test, "test")]

    # Work out which rows can be imputed BEFORE training, so the expensive
    # fit is skipped when there is nothing to fill.
    # None -> no missing values at all; empty index -> nothing imputable.
    pending = []
    for df, name in frames:
        missing_mask = df[target].isna()
        if missing_mask.any():
            pending.append((df, name, df.loc[missing_mask, features].dropna().index))
        else:
            pending.append((df, name, None))

    model = le = None
    if any(idx is not None and len(idx) > 0 for _, _, idx in pending):
        # Label Encoding
        le = LabelEncoder()
        y_train = le.fit_transform(train_valid[target])

        # Model
        model = XGBClassifier(
            n_estimators=500,
            max_depth=12,
            learning_rate=0.1,
            tree_method='hist',
            n_jobs=24,
            random_state=42,
            # 'logloss' is the binary metric; multi-class targets use 'mlogloss'.
            eval_metric='mlogloss' if len(le.classes_) > 2 else 'logloss'
        )
        model.fit(train_valid[features], y_train)

    # Impute for train & test
    for df, name, idx in pending:
        if idx is None:
            print(f"✅ {name}: No missing values in '{target}'")
            continue
        if len(idx) == 0:
            print(f"⚠️ {name}: No imputable rows with all required features for '{target}'.")
            continue

        preds = model.predict(df.loc[idx, features])
        # Predictions are encoded class ids; map them back to the raw labels.
        df.loc[idx, target] = le.inverse_transform(preds)
        print(f"✅ {name}: Imputed {len(idx)} missing values in '{target}'")

    # Report
    for df, name in frames:
        _report_column(df, name, target)

# === Process the target columns one after another ===
for column in target_columns:
    impute_column(column)

# === Write the results back over the input files ===
for frame, path in ((train, "train.csv"), (test, "test.csv")):
    frame.to_csv(path, index=False)

# === Elapsed wall-clock time ===
elapsed = round(time.time() - start, 2)
print(f"\n⏱️ Total duration: {elapsed} seconds")


In [None]:
🔧 Processing target: 'legs1_segments0_arrivalTo_airport_iata'
✅ train: Imputed 4387203 missing values in 'legs1_segments0_arrivalTo_airport_iata'
✅ test: Imputed 1115840 missing values in 'legs1_segments0_arrivalTo_airport_iata'

📊 train — 'legs1_segments0_arrivalTo_airport_iata':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_segments0_arrivalTo_airport_iata
ABA      1457
ADA        47
ADB       311
ADD      1803
AER    583660
AGP         6
AKX        44
ALA      5901
ALG       389
AMD       740
Name: count, dtype: int64

📊 test — 'legs1_segments0_arrivalTo_airport_iata':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_segments0_arrivalTo_airport_iata
ABA       410
ADA         1
ADB        40
ADD       198
AER    100424
AKX        42
ALA      4576
ALG       286
AMD        46
AMM        94
Name: count, dtype: int64

🔧 Processing target: 'frequentFlyer'
✅ train: Imputed 12012727 missing values in 'frequentFlyer'
✅ test: Imputed 3974920 missing values in 'frequentFlyer'

📊 train — 'frequentFlyer':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
frequentFlyer
- ЮТэйр ЗАО                3854
- ЮТэйр ЗАО/KC/LH/S7/SU    1235
- ЮТэйр ЗАО/SU/S7          1948
2G                         1670
6W/S7/SU                    180
AB/LH                      4047
AF                          989
AF/SU                      1807
AF/SU/LH/BA/TK/EK          1314
AF/TK                        25
Name: count, dtype: int64

📊 test — 'frequentFlyer':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
frequentFlyer
- ЮТэйр ЗАО                2455
- ЮТэйр ЗАО/KC/LH/S7/SU     701
- ЮТэйр ЗАО/SU              571
- ЮТэйр ЗАО/SU/S7            34
2G                          138
6W/S7/SU                    126
AB/LH                      2092
AF                          285
AF/SU                       144
AF/SU/LH/BA/TK/EK            24
Name: count, dtype: int64

🔧 Processing target: 'legs1_segments0_seatsAvailable'
✅ train: Imputed 4580895 missing values in 'legs1_segments0_seatsAvailable'
✅ test: Imputed 1124817 missing values in 'legs1_segments0_seatsAvailable'

📊 train — 'legs1_segments0_seatsAvailable':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_segments0_seatsAvailable
1.0    2834013
2.0    2018940
3.0    1531366
4.0    1704667
5.0    1233873
6.0     883865
7.0     798732
8.0     604838
9.0    6535078
Name: count, dtype: int64

📊 test — 'legs1_segments0_seatsAvailable':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_segments0_seatsAvailable
1.0    1345117
2.0     878437
3.0     568834
4.0     518823
5.0     388642
6.0     288154
7.0     279193
8.0     226923
9.0    2403653
Name: count, dtype: int64

🔧 Processing target: 'legs1_segments0_cabinClass'
✅ train: Imputed 4525013 missing values in 'legs1_segments0_cabinClass'
✅ test: Imputed 1124817 missing values in 'legs1_segments0_cabinClass'

📊 train — 'legs1_segments0_cabinClass':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_segments0_cabinClass
1.0    14980700
2.0     2994301
3.0         891
4.0      169480
Name: count, dtype: int64

📊 test — 'legs1_segments0_cabinClass':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_segments0_cabinClass
1.0    6648042
2.0     247044
3.0        836
4.0       1854
Name: count, dtype: int64

⏱️ Total duration: 43699.51 seconds

In [None]:
%%writefile impute_multiple_targets.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
import time

# === Start the wall-clock timer ===
start = time.time()

# === Predictor columns fed to every per-target model ===
features = [
    'totalPrice', 'companyID', 'legs0_segments0_cabinClass',
    'isAccess3D', 'isVip', 'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber', 'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP',
    'miniRules0_monetaryAmount', 'miniRules1_monetaryAmount',
    'miniRules0_statusInfos', 'miniRules1_statusInfos',
]

# === Columns whose gaps are filled, processed in this order ===
target_columns = [
    'corporateTariffCode',
    'legs1_segments0_baggageAllowance_quantity',
]

# === Read both datasets ===
train = pd.read_csv("train.csv", low_memory=False)
test = pd.read_csv("test.csv", low_memory=False)

# === Function to train and impute a target column ===
def _report_column(df, name, target):
    """Print row count, remaining missing values and the 10 most frequent values of `target`."""
    total = len(df)
    missing = df[target].isna().sum()
    # value_counts() already orders by frequency (descending). Sorting by
    # index first would print the alphabetically first 10 labels rather
    # than the "Top 10" announced below.
    value_counts = df[target].value_counts(dropna=False)
    print(f"\n📊 {name} — '{target}':")
    print(f"  Total rows: {total}")
    print(f"  Missing: {missing} ({100 * missing / total:.2f}%)")
    print("  Top 10 value counts (including imputations):")
    print(value_counts.head(10))


def impute_column(target):
    """Train a classifier for `target` and fill its missing values in train and test.

    The model is fitted on rows of the module-level `train` frame where
    `target` and every column in `features` are present. It is then used to
    fill `target` in both `train` and `test` (modified in place) for rows
    where `target` is missing and all features are present. A short summary
    is printed for each dataset. Returns None.
    """
    print(f"\n🔧 Processing target: '{target}'")

    # Select the needed columns before dropna() so only those columns are
    # copied, not the whole frame; the surviving rows are the same.
    train_valid = train[features + [target]].dropna()
    if train_valid.empty:
        print(f"❌ Skipped '{target}': No complete training data.")
        return

    frames = [(train, "train"), (test, "test")]

    # Work out which rows can be imputed BEFORE training, so the expensive
    # fit is skipped when there is nothing to fill.
    # None -> no missing values at all; empty index -> nothing imputable.
    pending = []
    for df, name in frames:
        missing_mask = df[target].isna()
        if missing_mask.any():
            pending.append((df, name, df.loc[missing_mask, features].dropna().index))
        else:
            pending.append((df, name, None))

    model = le = None
    if any(idx is not None and len(idx) > 0 for _, _, idx in pending):
        # Label Encoding
        le = LabelEncoder()
        y_train = le.fit_transform(train_valid[target])

        # Model
        model = XGBClassifier(
            n_estimators=500,
            max_depth=12,
            learning_rate=0.1,
            tree_method='hist',
            n_jobs=24,
            random_state=42,
            # 'logloss' is the binary metric; multi-class targets use 'mlogloss'.
            eval_metric='mlogloss' if len(le.classes_) > 2 else 'logloss'
        )
        model.fit(train_valid[features], y_train)

    # Impute for train & test
    for df, name, idx in pending:
        if idx is None:
            print(f"✅ {name}: No missing values in '{target}'")
            continue
        if len(idx) == 0:
            print(f"⚠️ {name}: No imputable rows with all required features for '{target}'.")
            continue

        preds = model.predict(df.loc[idx, features])
        # Predictions are encoded class ids; map them back to the raw labels.
        df.loc[idx, target] = le.inverse_transform(preds)
        print(f"✅ {name}: Imputed {len(idx)} missing values in '{target}'")

    # Report
    for df, name in frames:
        _report_column(df, name, target)

# === Process the target columns one after another ===
for column in target_columns:
    impute_column(column)

# === Write the results back over the input files ===
for frame, path in ((train, "train.csv"), (test, "test.csv")):
    frame.to_csv(path, index=False)

# === Elapsed wall-clock time ===
elapsed = round(time.time() - start, 2)
print(f"\n⏱️ Total duration: {elapsed} seconds")


In [None]:

🔧 Processing target: 'corporateTariffCode'
✅ train: Imputed 9233925 missing values in 'corporateTariffCode'
✅ test: Imputed 3535335 missing values in 'corporateTariffCode'

📊 train — 'corporateTariffCode':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
corporateTariffCode
0.0    16821
1.0     2035
2.0       87
3.0    24452
4.0     1619
5.0    43595
6.0    16895
7.0     1496
8.0      724
9.0     3963
Name: count, dtype: int64

📊 test — 'corporateTariffCode':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
corporateTariffCode
0.0    1934
1.0      21
2.0       1
3.0    6369
4.0     212
5.0    6267
6.0    1455
7.0      19
8.0      22
9.0     457
Name: count, dtype: int64

🔧 Processing target: 'legs1_segments0_baggageAllowance_quantity'
✅ train: Imputed 4527297 missing values in 'legs1_segments0_baggageAllowance_quantity'
✅ test: Imputed 1125350 missing values in 'legs1_segments0_baggageAllowance_quantity'

📊 train — 'legs1_segments0_baggageAllowance_quantity':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_segments0_baggageAllowance_quantity
0.0     6292151
1.0     5893088
2.0     4958200
3.0        4310
10.0       3138
15.0       5773
20.0      86603
23.0      41322
25.0      57706
30.0     465936
Name: count, dtype: int64

📊 test — 'legs1_segments0_baggageAllowance_quantity':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_segments0_baggageAllowance_quantity
0.0     2871904
1.0     2130799
2.0     1544038
3.0        1246
10.0       1089
15.0       1448
20.0      40173
23.0       7014
25.0      41021
30.0     139836
Name: count, dtype: int64

⏱️ Total duration: 10105.78 seconds

In [None]:
%%writefile analyze_baggage_quantity_by_type.py
import pandas as pd

# Read the training data
frame = pd.read_csv("train.csv", low_memory=False)

# Columns under analysis
qty_col = "legs1_segments0_baggageAllowance_quantity"
type_col = "legs1_segments0_baggageAllowance_weightMeasurementType"

# Keep only rows where both the quantity and its measurement type are known
known = frame[[qty_col, type_col]].dropna()

# Per-type summary statistics and per-type frequency of each quantity
qty_by_type = known.groupby(type_col)[qty_col]
summary = qty_by_type.describe()
freq_table = qty_by_type.value_counts().sort_index()

# Report lines, joined with newlines when written out
report = ["📊 Baggage Allowance Quantity Analysis by Measurement Type\n"]

for kind, kind_stats in summary.iterrows():
    # Type 0.0 is labelled PIECE; every other value is labelled WEIGHT
    if kind == 0.0:
        heading = "PIECE (0.0)"
    else:
        heading = "WEIGHT (1.0)"

    freq = freq_table[kind]
    n_entries = freq.sum()

    report.append(f"\n▶️ Type: {heading}")
    report.append(kind_stats.to_string())

    # One line per distinct quantity with its share of this type's rows
    report.append("\n🔢 Frequency of Baggage Quantities:")
    report.extend(
        f"  - {qty}: {n} ({100 * n / n_entries:.2f}%)" for qty, n in freq.items()
    )

    # The quantity that occurs most often for this type
    top_qty = freq.idxmax()
    report.append(f"\n📌 Most frequent quantity: {top_qty} ({freq[top_qty]} times)")

    # How many entries have a quantity above 2 (example threshold)
    above_two = freq[freq.index > 2].sum()
    report.append(f"📦 Entries with quantity > 2: {above_two} ({100 * above_two / n_entries:.2f}%)")

# Write the report as UTF-8 text
output_file = "baggage_quantity_by_type_stats.txt"
with open(output_file, "w", encoding="utf-8") as f:
    f.write("\n".join(report))

print(f"✅ Detailed analysis saved to '{output_file}'")

In [None]:
%%writefile rule_based_impute_weight_type.py
import pandas as pd

# === Load files ===
train = pd.read_csv("train.csv", low_memory=False)
test = pd.read_csv("test.csv", low_memory=False)

# === Rules: which quantity values imply which measurement type ===
piece_values = [1.0, 2.0, 3.0]
weight_values = [10.0, 15.0, 20.0, 23.0, 25.0, 30.0, 32.0, 33.0, 35.0, 40.0, 45.0, 50.0, 60.0]
target_col = 'legs1_segments0_baggageAllowance_weightMeasurementType'
quantity_col = 'legs1_segments0_baggageAllowance_quantity'

# === Imputation function and report ===
def rule_based_impute(df, name=""):
    """Fill missing measurement types from the baggage quantity and log the result.

    A missing ``target_col`` becomes 0.0 when the quantity is one of
    ``piece_values`` and 1.0 when it is one of ``weight_values``; any other
    missing cell is left untouched.

    Args:
        df: DataFrame holding ``target_col`` and ``quantity_col``; it is
            modified in place.
        name: Label for the dataset, used only in the log text.

    Returns:
        Tuple ``(df, log)`` where ``df`` is the same (mutated) frame and
        ``log`` is a newline-joined text report.
    """
    # Both masks are derived from the state *before* any value is written
    was_missing = df[target_col].isna()
    quantity = df[quantity_col]
    mask_piece = was_missing & quantity.isin(piece_values)
    mask_weight = was_missing & quantity.isin(weight_values)
    missing_before = was_missing.sum()

    df.loc[mask_piece, target_col] = 0.0
    df.loc[mask_weight, target_col] = 1.0

    missing_after = df[target_col].isna().sum()

    report = [
        f"📄 Dataset: {name}",
        f"🔍 Missing before imputing: {missing_before}",
        f"✅ Imputed PIECE (0.0): {mask_piece.sum()}",
        f"✅ Imputed WEIGHT (1.0): {mask_weight.sum()}",
        f"🧮 Missing after rule-based imputing: {missing_after}",
        "\n📊 Value counts after imputing:",
        df[target_col].value_counts(dropna=False).to_string(),
    ]

    # When cells are still empty, show which quantities they carry
    if missing_after > 0:
        leftover_qty = df.loc[df[target_col].isna(), quantity_col]
        report.append("\n🔎 Distribution of quantity for remaining NaN values:")
        report.append(leftover_qty.value_counts(dropna=False).sort_index().to_string())

    return df, "\n".join(report)

# === Run the rules on train and test, keeping one log per dataset ===
train, train_log = rule_based_impute(train, "train")
test, test_log = rule_based_impute(test, "test")

# === Overwrite the input CSVs with the imputed frames ===
for frame, path in ((train, "train.csv"), (test, "test.csv")):
    frame.to_csv(path, index=False)

# === Write both logs to one file, separated by a ruler of 60 '=' ===
log_path = "rule_based_weight_type_imputation_log.txt"
ruler = "\n\n" + "=" * 60 + "\n\n"
with open(log_path, "w", encoding="utf-8") as f:
    f.write(ruler.join([train_log, test_log]))

print("✅ Imputation complete.")
print("📝 Log saved to 'rule_based_weight_type_imputation_log.txt'")


In [None]:
%%writefile impute_legs1_segments0_baggageAllowance_weightMeasurementType.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
import time

# === Timer ===
start = time.time()

# === Load files ===
train = pd.read_csv("train.csv", low_memory=False)
test = pd.read_csv("test.csv", low_memory=False)

# === Relevant features for prediction ===
features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP',
    'miniRules0_monetaryAmount',
    'miniRules1_monetaryAmount',
    'miniRules0_statusInfos',
    'miniRules1_statusInfos',
    'legs1_segments0_baggageAllowance_quantity'
]

# === Target columns to impute ===
target_columns = [
    'legs1_segments0_baggageAllowance_weightMeasurementType'
]

# === Fit a classifier on known rows and fill the gaps in train/test ===
def impute_column(target):
    """Impute *target* in the module-level ``train`` and ``test`` frames.

    An XGBClassifier is fitted on ``train`` and its predictions are written
    into missing cells of both frames, in place. For the leg-1
    weight-measurement-type column only rows whose leg-1 baggage quantity is
    0.0 take part (training and imputation); any other target trains on rows
    that are complete in ``features`` plus the target. In every case a row
    is only imputed when none of its ``features`` is missing.

    Prints progress and a per-dataset summary; returns None.
    """
    special_target = 'legs1_segments0_baggageAllowance_weightMeasurementType'
    leg1_qty = 'legs1_segments0_baggageAllowance_quantity'
    zero_quantity_only = target == special_target

    print(f"\n🔧 Processing target: '{target}'")

    # Select training rows
    if zero_quantity_only:
        usable = train[target].notna() & (train[leg1_qty] == 0.0)
        train_subset = train[usable]
    else:
        train_subset = train.dropna(subset=features + [target])

    if train_subset.empty:
        print(f"❌ Skipped '{target}': No complete training data.")
        return

    # Encode the target values as consecutive integer class ids
    le = LabelEncoder()
    y_train = le.fit_transform(train_subset[target])

    model = XGBClassifier(
        n_estimators=500,
        max_depth=12,
        learning_rate=0.1,
        tree_method='hist',
        n_jobs=24,
        random_state=42,
        eval_metric='logloss'
    )
    model.fit(train_subset[features], y_train)

    datasets = [(train, "train"), (test, "test")]

    # Pass 1: write predictions into the missing cells
    for df, name in datasets:
        missing_mask = df[target].isna()
        if zero_quantity_only:
            missing_mask = missing_mask & (df[leg1_qty] == 0.0)

        if not missing_mask.any():
            print(f"✅ {name}: No missing values in '{target}'")
            continue

        # Only rows with every feature present are predicted
        candidates = df.loc[missing_mask, features].dropna()
        if len(candidates.index) == 0:
            print(f"⚠️ {name}: No imputable rows with all required features for '{target}'.")
            continue

        predicted = le.inverse_transform(model.predict(candidates))
        df.loc[candidates.index, target] = predicted
        print(f"✅ {name}: Imputed {len(candidates.index)} missing values in '{target}'")

    # Pass 2: summary, printed only after both frames were processed
    for df, name in datasets:
        total = len(df)
        missing = df[target].isna().sum()
        # Sorted by value, so head(10) shows the ten smallest values
        value_counts = df[target].value_counts(dropna=False).sort_index()
        print(f"\n📊 {name} — '{target}':")
        print(f"  Total rows: {total}")
        print(f"  Missing: {missing} ({100 * missing / total:.2f}%)")
        print("  Top 10 value counts (including imputations):")
        print(value_counts.head(10))

# === Run for each target column ===
for target in target_columns:
    impute_column(target)

# === Save final files ===
train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)

# === End ===
print(f"\n⏱️ Total duration: {round(time.time() - start, 2)} seconds")

In [None]:
🔧 Processing target: 'legs1_segments0_baggageAllowance_weightMeasurementType'
✅ train: Imputed 1314851 missing values in 'legs1_segments0_baggageAllowance_weightMeasurementType'
✅ test: Imputed 370664 missing values in 'legs1_segments0_baggageAllowance_weightMeasurementType'

📊 train — 'legs1_segments0_baggageAllowance_weightMeasurementType':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_segments0_baggageAllowance_weightMeasurementType
0.0    17089873
1.0     1055499
Name: count, dtype: int64

📊 test — 'legs1_segments0_baggageAllowance_weightMeasurementType':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_segments0_baggageAllowance_weightMeasurementType
0.0    6531933
1.0     365843
Name: count, dtype: int64

⏱️ Total duration: 440.47 seconds

In [None]:
%%writefile convert_duration.py
import pandas as pd
import numpy as np

# === Load files ===
train = pd.read_csv("train.csv", low_memory=False)
test = pd.read_csv("test.csv", low_memory=False)

# === Columns to convert ===
duration_columns = ["legs1_segments0_duration", "legs1_duration"]

# === Conversion function [D.]HH:MM:SS → minutes ===
def duration_to_minutes(duration_str):
    """Convert a duration string to a number of minutes.

    Accepts ``"HH:MM:SS"`` and the day-prefixed form ``"D.HH:MM:SS"``
    (e.g. ``"1.02:30:00"`` is 1 day, 2 hours, 30 minutes). Seconds are
    kept as a fraction of a minute.

    Args:
        duration_str: The raw cell value; may be a string, NaN or None.

    Returns:
        Minutes as a number, or ``np.nan`` when the value is missing or
        cannot be parsed (wrong number of fields, non-integer field,
        non-string input).
    """
    if pd.isna(duration_str):
        return np.nan
    try:
        hours_part, minutes_part, seconds_part = duration_str.split(":")
        days = 0
        # Durations of a day or more carry a "D." prefix on the hour field
        if "." in hours_part:
            days_part, hours_part = hours_part.split(".", 1)
            days = int(days_part)
        h, m, s = int(hours_part), int(minutes_part), int(seconds_part)
    except (AttributeError, TypeError, ValueError):
        # AttributeError: not a string; ValueError: bad field count/content
        return np.nan
    return (days * 24 + h) * 60 + m + s / 60

# === Text output for report ===
lines = []
lines.append("📄 Duration Conversion Report (HH:MM:SS → minutes)\n")

# === Conversion + statistics ===
# Each duration column of each dataset is converted in place, then summarised.
for col in duration_columns:
    for df, name in [(train, "Train"), (test, "Test")]:
        # Keep the raw strings so missing counts can be compared before/after
        orig_col = df[col].copy()
        df[col] = df[col].apply(duration_to_minutes)

        # A rise from "before" to "after" means some strings failed to parse
        nan_before = orig_col.isna().sum()
        nan_after = df[col].isna().sum()

        lines.append(f"\n=== 📊 Dataset: {name} — Column: {col} ===")
        lines.append(f"🔢 Total rows: {len(df)}")
        lines.append(f"❌ Missing before: {nan_before}")
        lines.append(f"❌ Missing after:  {nan_after}")
        lines.append("\n📈 Descriptive stats:")
        lines.append(df[col].describe().to_string())

        # value_counts() is already ordered by descending frequency; sorting
        # by index here would list the ten *smallest* durations instead of
        # the ten most common ones promised by the heading.
        value_counts = df[col].value_counts()
        lines.append("\n🔟 Top 10 most common durations (minutes):")
        lines.append(value_counts.head(10).to_string())

# === Save updated files ===
train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)

# === Write report to .txt file ===
with open("duration_conversion_report.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

print("✅ Conversion completed and saved to 'train.csv' and 'test.csv'")
print("📝 Report generated: 'duration_conversion_report.txt'")


In [None]:
%%writefile impute_multiple_targets.py
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
import time

# === Timer ===
start = time.time()

# === Load files ===
train = pd.read_csv("train.csv", low_memory=False)
test = pd.read_csv("test.csv", low_memory=False)

# === Relevant columns for prediction ===
features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP',
    'miniRules0_monetaryAmount',
    'miniRules1_monetaryAmount',
    'miniRules0_statusInfos',
    'miniRules1_statusInfos'
]

# === Target columns to impute ===
target_columns = [
    'legs1_segments0_duration',
    'legs1_duration'
]

# === Training and imputing function for one target column ===
def impute_column(target):
    """Impute a numeric *target* in the module-level ``train``/``test`` frames.

    An XGBRegressor is fitted on the rows of ``train`` that are complete in
    ``features`` plus *target*. Its predictions are clipped to the range of
    target values seen during training, rounded to the nearest whole number
    and written, in place, into every missing cell whose feature values are
    all present.

    The clipping keeps the regressor from emitting values outside anything
    observed (for a duration column: negative minutes).

    Prints progress and a per-dataset summary; returns None.
    """
    print(f"\n🔧 Processing target: '{target}'")

    train_valid = train.dropna(subset=features + [target])
    if train_valid.empty:
        print(f"❌ Skipped '{target}': No complete training data.")
        return

    X_train = train_valid[features]
    y_train = train_valid[target].astype(float)

    # Bounds used to keep predictions inside the observed value range
    lower_bound = y_train.min()
    upper_bound = y_train.max()

    # Model
    model = XGBRegressor(
        n_estimators=500,
        max_depth=12,
        learning_rate=0.1,
        tree_method='hist',
        n_jobs=24,
        random_state=42,
        eval_metric='rmse'
    )
    model.fit(X_train, y_train)

    # Impute train & test
    for df, name in [(train, "train"), (test, "test")]:
        missing_mask = df[target].isna()
        if missing_mask.sum() == 0:
            print(f"✅ {name}: No missing values in '{target}'")
            continue

        # Only rows with every feature present can be predicted
        imputable = df.loc[missing_mask, features].dropna()
        imputable_indices = imputable.index

        if len(imputable_indices) == 0:
            print(f"⚠️ {name}: No imputable rows with all required features for '{target}'.")
            continue

        preds = model.predict(imputable)
        # Boosted trees can overshoot the training range; clamp before rounding
        preds = np.clip(preds, lower_bound, upper_bound)
        preds_rounded = np.round(preds)  # Round to nearest minute
        df.loc[imputable_indices, target] = preds_rounded
        print(f"✅ {name}: Imputed {len(imputable_indices)} missing values in '{target}'")

    # Report (after both frames were processed)
    for df, name in [(train, "train"), (test, "test")]:
        total = len(df)
        missing = df[target].isna().sum()
        # Sorted by value, so head(10) shows the ten smallest values
        value_counts = df[target].value_counts(dropna=False).sort_index()
        print(f"\n📊 {name} — '{target}':")
        print(f"  Total rows: {total}")
        print(f"  Missing: {missing} ({100 * missing / total:.2f}%)")
        print("  Top 10 value counts (including imputations):")
        print(value_counts.head(10))

# === Run for each target column ===
for target in target_columns:
    impute_column(target)

# === Save final files ===
train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)

# === End ===
print(f"\n⏱️ Total duration: {round(time.time() - start, 2)} seconds")


In [None]:
🔧 Processing target: 'legs1_segments0_duration'
✅ train: Imputed 4387201 missing values in 'legs1_segments0_duration'
✅ test: Imputed 1115840 missing values in 'legs1_segments0_duration'

📊 train — 'legs1_segments0_duration':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_segments0_duration
-39.0      1
-36.0      1
-12.0      1
-11.0      1
-10.0      1
-7.0       1
-6.0       1
-2.0       9
-1.0       1
 0.0     208
Name: count, dtype: int64

📊 test — 'legs1_segments0_duration':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_segments0_duration
14.0    1
19.0    1
21.0    1
22.0    4
23.0    3
25.0    2
26.0    4
27.0    3
28.0    2
29.0    4
Name: count, dtype: int64

🔧 Processing target: 'legs1_duration'
✅ train: Imputed 5017122 missing values in 'legs1_duration'
✅ test: Imputed 1404203 missing values in 'legs1_duration'

📊 train — 'legs1_duration':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_duration
-311.0    1
-264.0    1
-257.0    2
-253.0    2
-242.0    4
-241.0    1
-232.0    2
-231.0    1
-223.0    7
-221.0    1
Name: count, dtype: int64

📊 test — 'legs1_duration':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_duration
-192.0     1
-159.0     1
-141.0     1
-121.0     1
-117.0     4
-114.0    10
-111.0     1
-108.0     1
-100.0     5
-99.0      1
Name: count, dtype: int64

⏱️ Total duration: 696.13 seconds

In [None]:
%%writefile check_neg.py
import pandas as pd

# Load files
train = pd.read_csv("train.csv", low_memory=False)
test = pd.read_csv("test.csv", low_memory=False)

# Duration columns inspected by the check below
cols = ['legs1_segments0_duration', 'legs1_duration']

# Null / negative audit for one dataset
def check_nulls_and_negatives(df, df_name):
    """Print, for each column in ``cols``, how many values are null or < 0.

    Values are coerced to numeric first, so anything non-numeric is counted
    as null. A column absent from *df* produces a warning line instead.

    Args:
        df: DataFrame to inspect.
        df_name: Label used in the printed output.
    """
    print(f"\n📊 Checking {df_name}")
    for col in cols:
        if col in df.columns:
            try:
                # Non-numeric entries become NaN rather than raising
                numeric = pd.to_numeric(df[col], errors='coerce')
                n_null = numeric.isna().sum()
                n_negative = numeric.lt(0).sum()
                print(f"🔹 {col}:")
                print(f"   - Null values: {n_null}")
                print(f"   - Values < 0: {n_negative}")
            except Exception as e:
                print(f"❌ Error processing column '{col}': {e}")
        else:
            print(f"⚠️ Column '{col}' does not exist in {df_name}")

# Run for both datasets
check_nulls_and_negatives(train, "train.csv")
check_nulls_and_negatives(test, "test.csv")


📊 Check for train.csv
🔹 legs1_segments0_duration:

Missing values: 0

Values < 0: 17

🔹 legs1_duration:

Missing values: 0

Values < 0: 2719

📊 Check for test.csv
🔹 legs1_segments0_duration:

Missing values: 0

Values < 0: 0

🔹 legs1_duration:

Missing values: 0

Values < 0: 642

In [None]:
%%writefile impute_neg_with_median.py
import pandas as pd

# === Load the data ===
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# === Columns to process ===
columns = ["legs1_segments0_duration", "legs1_duration"]

# === Prepare report ===
report_lines = ["📄 Report — Replacing values ≤ 0 with median\n"]

# === Process each column and dataset ===
for col in columns:
    report_lines.append(f"\n=== 🔧 Column: {col} ===")

    for df, name in [(train, "Train"), (test, "Test")]:
        # Non-positive durations are treated as invalid
        mask = df[col] <= 0
        count_replaced = mask.sum()
        # Median of the strictly positive values of this same frame
        # NOTE(review): test is filled with its own median, not train's —
        # confirm that is intended.
        median = df.loc[df[col] > 0, col].median()

        df.loc[mask, col] = median

        report_lines.append(f"📊 {name}:")
        report_lines.append(f"  - Median used: {median}")
        report_lines.append(f"  - Values replaced (≤ 0): {count_replaced}")

# === Save modified files ===
train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)

# === Save the report ===
# The report contains emoji and "≤"; an explicit UTF-8 encoding keeps the
# write from failing where the platform default encoding cannot represent them.
with open("imputation_report.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(report_lines))

print("✅ Processing complete. Details saved in 'imputation_report.txt'.")


In [None]:
=== 🔧 Column: legs1_segments0_duration ===
📊 Train:
  - Median used: 117.0
  - Values replaced (≤ 0): 225
📊 Test:
  - Median used: 105.0
  - Values replaced (≤ 0): 0

=== 🔧 Column: legs1_duration ===
📊 Train:
  - Median used: 139.0
  - Values replaced (≤ 0): 2771
📊 Test:
  - Median used: 105.0
  - Values replaced (≤ 0): 659

In [None]:
%%writefile impute_legs1_segments0_flightNumber.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
import time

# === Timer ===
start = time.time()

# === Load files ===
train = pd.read_csv("train.csv", low_memory=False)
test = pd.read_csv("test.csv", low_memory=False)

# === Relevant columns for prediction ===
features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP',
    'miniRules0_monetaryAmount',
    'miniRules1_monetaryAmount',
    'miniRules0_statusInfos',
    'miniRules1_statusInfos'
]

# === Target columns to impute ===
target_columns = [
    'legs1_segments0_flightNumber'
]

# === Fit a classifier on a sample of known rows and fill the gaps ===
def impute_column(target):
    """Impute *target* in the module-level ``train`` and ``test`` frames.

    Rows of ``train`` that are complete in ``features`` plus *target* are
    sampled at 10% (seed 42). The target is treated as an identifier: it is
    cast to string and label-encoded, an XGBClassifier is fitted, and the
    predicted labels are cast back to float before being written, in place,
    into missing cells whose feature values are all present.

    Prints progress and a per-dataset summary; returns None.
    """
    print(f"\n🔧 Processing target: '{target}'")

    complete_rows = train.dropna(subset=features + [target])
    if complete_rows.empty:
        print(f"❌ Skipped '{target}': No complete training data.")
        return

    # Train on a 10% sample of the complete rows
    sampled = complete_rows.sample(frac=0.1, random_state=42)

    # Identifier-like target: string labels -> integer class ids
    le = LabelEncoder()
    y_train = le.fit_transform(sampled[target].astype(str))

    model = XGBClassifier(
        n_estimators=50,
        max_depth=4,
        learning_rate=0.1,
        tree_method='hist',
        n_jobs=8,
        random_state=42,
        eval_metric='mlogloss'
    )
    model.fit(sampled[features], y_train)

    datasets = [(train, "train"), (test, "test")]

    # Pass 1: write predictions into the missing cells
    for df, name in datasets:
        missing_mask = df[target].isna()
        if not missing_mask.any():
            print(f"✅ {name}: No missing values in '{target}'")
            continue

        # Only rows with every feature present are predicted
        candidates = df.loc[missing_mask, features].dropna()
        if len(candidates.index) == 0:
            print(f"⚠️ {name}: No imputable rows with all required features for '{target}'.")
            continue

        class_ids = model.predict(candidates)
        # Decode to the original string labels, then back to numeric
        predicted = le.inverse_transform(class_ids).astype(float)
        df.loc[candidates.index, target] = predicted
        print(f"✅ {name}: Imputed {len(candidates.index)} missing values in '{target}'")

    # Pass 2: summary, printed only after both frames were processed
    for df, name in datasets:
        total = len(df)
        missing = df[target].isna().sum()
        # Sorted by value, so head(10) shows the ten smallest values
        value_counts = df[target].value_counts(dropna=False).sort_index()
        print(f"\n📊 {name} — '{target}':")
        print(f"  Total rows: {total}")
        print(f"  Missing: {missing} ({100 * missing / total:.2f}%)")
        print("  Top 10 value counts (including imputations):")
        print(value_counts.head(10))

# === Run for each target column ===
for target in target_columns:
    impute_column(target)

# === Save final files ===
train.to_csv("train.csv", index=False)
test.to_csv("test.csv", index=False)

# === Final ===
print(f"\n⏱️ Total duration: {round(time.time() - start, 2)} seconds")


In [None]:
%%writefile convert_leg1_arrival_departure.py
import pandas as pd
import numpy as np

# Read CSV files
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

def engineer_datetime_features(df, col):
    """Parse ``df[col]`` as datetimes and add calendar/time features in place.

    The column itself is replaced by its parsed form (unparseable values
    become NaT). New columns, all named ``f"{col}_<feature>"``:
    ``dayofweek``, ``hour``, ``minute``, ``is_weekend``, ``day``, ``month``,
    ``year``, ``weekofyear`` (ISO week), ``part_of_day`` (night / morning /
    afternoon / evening by 6-hour bands) and ``hour_sin`` / ``hour_cos``
    (hour on a 24-hour circle).

    Every feature is missing where the timestamp is missing. That includes
    ``is_weekend``, which is 1.0 / 0.0 / NaN rather than defaulting to 0.

    Args:
        df: DataFrame to extend; modified in place.
        col: Name of the column holding the timestamps.

    Returns:
        The same DataFrame object.
    """
    df[col] = pd.to_datetime(df[col], errors='coerce')
    prefix = f"{col}_"

    df[f"{prefix}dayofweek"] = df[col].dt.dayofweek
    df[f"{prefix}hour"] = df[col].dt.hour
    df[f"{prefix}minute"] = df[col].dt.minute
    # isin() maps NaN to False, which would label a missing date as a
    # weekday; mask those rows back to NaN so the gap stays visible.
    is_weekend = df[f"{prefix}dayofweek"].isin([5, 6]).astype(float)
    df[f"{prefix}is_weekend"] = is_weekend.where(df[col].notna())
    df[f"{prefix}day"] = df[col].dt.day
    df[f"{prefix}month"] = df[col].dt.month
    df[f"{prefix}year"] = df[col].dt.year
    df[f"{prefix}weekofyear"] = df[col].dt.isocalendar().week

    def get_part_of_day(hour):
        # Four 6-hour bands; a missing hour stays missing
        if pd.isna(hour): return np.nan
        if hour < 6: return "night"
        elif hour < 12: return "morning"
        elif hour < 18: return "afternoon"
        else: return "evening"

    df[f"{prefix}part_of_day"] = df[f"{prefix}hour"].apply(get_part_of_day).astype("category")
    # Cyclical encoding so that hour 23 and hour 0 end up close together
    df[f"{prefix}hour_sin"] = np.sin(2 * np.pi * df[f"{prefix}hour"] / 24)
    df[f"{prefix}hour_cos"] = np.cos(2 * np.pi * df[f"{prefix}hour"] / 24)

    return df

# Timestamp columns to expand and the feature suffixes reported on below
datetime_cols = ["legs1_departureAt", "legs1_arrivalAt"]
reported_suffixes = [
    "dayofweek", "hour", "minute",
    "is_weekend", "day", "month",
    "year", "weekofyear", "part_of_day",
]

# Add the engineered features to both datasets, one timestamp column at a time
for col in datetime_cols:
    train_df = engineer_datetime_features(train_df, col)
    test_df = engineer_datetime_features(test_df, col)

# Summarise the new train features: unique count, missing count, sample values
with open("leg1_features_info.txt", "w") as f:
    for col in datetime_cols:
        f.write(f"\n==== {col} ====\n")
        for suffix in reported_suffixes:
            feature = f"{col}_{suffix}"
            series = train_df[feature]
            nulls = series.isnull().sum()
            unique_vals = series.dropna().unique()
            f.write(f"{feature}: {len(unique_vals)} unique values, {nulls} missing values\n")
            # At most the first 20 distinct values are shown
            sample = ', '.join(map(str, unique_vals[:20]))
            f.write(f"    Sample values: {sample}\n")

# 🔥 Save modified DataFrames to CSV
train_df.to_csv("train.csv", index=False)
test_df.to_csv("test.csv", index=False)


In [None]:
==== legs1_departureAt ====
legs1_departureAt_dayofweek: 7 unique values, 4387201 missing values
    Sample values: 1.0, 5.0, 2.0, 3.0, 4.0, 0.0, 6.0
legs1_departureAt_hour: 24 unique values, 4387201 missing values
    Sample values: 9.0, 22.0, 19.0, 17.0, 6.0, 14.0, 11.0, 12.0, 8.0, 16.0, 7.0, 4.0, 5.0, 13.0, 18.0, 1.0, 3.0, 15.0, 20.0, 21.0
legs1_departureAt_minute: 51 unique values, 4387201 missing values
    Sample values: 45.0, 5.0, 35.0, 55.0, 0.0, 25.0, 20.0, 30.0, 15.0, 40.0, 50.0, 10.0, 29.0, 38.0, 4.0, 58.0, 42.0, 57.0, 36.0, 32.0
legs1_departureAt_is_weekend: 2 unique values, 0 missing values
    Sample values: 0, 1
legs1_departureAt_day: 31 unique values, 4387201 missing values
    Sample values: 9.0, 25.0, 22.0, 29.0, 15.0, 30.0, 23.0, 24.0, 31.0, 5.0, 16.0, 19.0, 21.0, 7.0, 28.0, 26.0, 1.0, 20.0, 2.0, 4.0
legs1_departureAt_month: 11 unique values, 4387201 missing values
    Sample values: 7.0, 5.0, 8.0, 6.0, 9.0, 10.0, 11.0, 12.0, 1.0, 2.0, 3.0
legs1_departureAt_year: 2 unique values, 4387201 missing values
    Sample values: 2024.0, 2025.0
legs1_departureAt_weekofyear: 41 unique values, 4387201 missing values
    Sample values: 28, 21, 22, 33, 23, 38, 34, 25, 24, 26, 29, 27, 31, 35, 37, 30, 41, 39, 32, 36
legs1_departureAt_part_of_day: 4 unique values, 4387201 missing values
    Sample values: morning, evening, afternoon, night

==== legs1_arrivalAt ====
legs1_arrivalAt_dayofweek: 7 unique values, 4387201 missing values
    Sample values: 1.0, 2.0, 5.0, 3.0, 4.0, 0.0, 6.0
legs1_arrivalAt_hour: 24 unique values, 4387201 missing values
    Sample values: 14.0, 8.0, 11.0, 22.0, 18.0, 16.0, 12.0, 5.0, 19.0, 20.0, 0.0, 3.0, 17.0, 23.0, 15.0, 13.0, 21.0, 4.0, 1.0, 6.0
legs1_arrivalAt_minute: 24 unique values, 4387201 missing values
    Sample values: 20.0, 30.0, 5.0, 40.0, 35.0, 15.0, 55.0, 0.0, 25.0, 50.0, 45.0, 10.0, 59.0, 1.0, 56.0, 28.0, 4.0, 23.0, 12.0, 27.0
legs1_arrivalAt_is_weekend: 2 unique values, 0 missing values
    Sample values: 0, 1
legs1_arrivalAt_day: 31 unique values, 4387201 missing values
    Sample values: 9.0, 10.0, 25.0, 22.0, 23.0, 29.0, 30.0, 15.0, 16.0, 31.0, 24.0, 1.0, 5.0, 6.0, 17.0, 19.0, 21.0, 7.0, 8.0, 28.0
legs1_arrivalAt_month: 11 unique values, 4387201 missing values
    Sample values: 7.0, 5.0, 8.0, 6.0, 9.0, 10.0, 11.0, 12.0, 1.0, 2.0, 3.0
legs1_arrivalAt_year: 2 unique values, 4387201 missing values
    Sample values: 2024.0, 2025.0
legs1_arrivalAt_weekofyear: 43 unique values, 4387201 missing values
    Sample values: 28, 21, 22, 33, 23, 38, 34, 25, 24, 26, 29, 27, 31, 35, 37, 30, 41, 39, 32, 36
legs1_arrivalAt_part_of_day: 4 unique values, 4387201 missing values
    Sample values: afternoon, morning, evening, night

In [None]:
%%writefile check_leg1_columns.py
import pandas as pd

# === Load CSV files ===
train = pd.read_csv("train.csv", low_memory=False)
test = pd.read_csv("test.csv", low_memory=False)

# === Column names of a frame that start with a given prefix, sorted ===
def get_prefixed_columns(df, prefix):
    """Return the sorted list of ``df`` column names beginning with *prefix*."""
    matching = filter(lambda name: name.startswith(prefix), df.columns)
    return sorted(matching)

# === Compare the engineered column sets of train and test, per prefix ===
prefixes = ["legs1_departureAt", "legs1_arrivalAt"]

for prefix in prefixes:
    train_cols = get_prefixed_columns(train, prefix)
    test_cols = get_prefixed_columns(test, prefix)

    print(f"\n📦 Prefix: {prefix}")
    print(f"✅ train.csv: {len(train_cols)} columns")
    print(train_cols)

    print(f"\n✅ test.csv: {len(test_cols)} columns")
    print(test_cols)

    # Columns that exist on one side only
    train_set, test_set = set(train_cols), set(test_cols)
    only_in_test = sorted(test_set - train_set)
    only_in_train = sorted(train_set - test_set)

    if only_in_test:
        print(f"\n⚠️ Columns present in test but missing in train: {only_in_test}")
    if only_in_train:
        print(f"⚠️ Columns present in train but missing in test: {only_in_train}")
    if not (only_in_test or only_in_train):
        print("✅ Columns match between train and test.")

print("\n✅ Final check complete.")


In [None]:
📦 Prefix: legs1_departureAt
✅ train.csv: 12 columns
['legs1_departureAt', 'legs1_departureAt_day', 'legs1_departureAt_dayofweek', 'legs1_departureAt_hour', 'legs1_departureAt_hour_cos', 'legs1_departureAt_hour_sin', 'legs1_departureAt_is_weekend', 'legs1_departureAt_minute', 'legs1_departureAt_month', 'legs1_departureAt_part_of_day', 'legs1_departureAt_weekofyear', 'legs1_departureAt_year']

✅ test.csv: 12 columns
['legs1_departureAt', 'legs1_departureAt_day', 'legs1_departureAt_dayofweek', 'legs1_departureAt_hour', 'legs1_departureAt_hour_cos', 'legs1_departureAt_hour_sin', 'legs1_departureAt_is_weekend', 'legs1_departureAt_minute', 'legs1_departureAt_month', 'legs1_departureAt_part_of_day', 'legs1_departureAt_weekofyear', 'legs1_departureAt_year']
✅ Columns match between train and test.

📦 Prefix: legs1_arrivalAt
✅ train.csv: 12 columns
['legs1_arrivalAt', 'legs1_arrivalAt_day', 'legs1_arrivalAt_dayofweek', 'legs1_arrivalAt_hour', 'legs1_arrivalAt_hour_cos', 'legs1_arrivalAt_hour_sin', 'legs1_arrivalAt_is_weekend', 'legs1_arrivalAt_minute', 'legs1_arrivalAt_month', 'legs1_arrivalAt_part_of_day', 'legs1_arrivalAt_weekofyear', 'legs1_arrivalAt_year']

✅ test.csv: 12 columns
['legs1_arrivalAt', 'legs1_arrivalAt_day', 'legs1_arrivalAt_dayofweek', 'legs1_arrivalAt_hour', 'legs1_arrivalAt_hour_cos', 'legs1_arrivalAt_hour_sin', 'legs1_arrivalAt_is_weekend', 'legs1_arrivalAt_minute', 'legs1_arrivalAt_month', 'legs1_arrivalAt_part_of_day', 'legs1_arrivalAt_weekofyear', 'legs1_arrivalAt_year']
✅ Columns match between train and test.

✅ Final check complete.

In [None]:
%%writefile impute_arrival_departure.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
import time

# === Timer ===
start = time.time()

# === Load files ===
train = pd.read_csv("train.csv", low_memory=False)
test = pd.read_csv("test.csv", low_memory=False)

# === Relevant columns for prediction ===
features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP',
    'miniRules0_monetaryAmount',
    'miniRules1_monetaryAmount',
    'miniRules0_statusInfos',
    'miniRules1_statusInfos'
]

# === Target columns to impute ===
target_columns = [
    'legs1_departureAt_hour',
    'legs1_departureAt_day',
    'legs1_departureAt_year',
    'legs1_arrivalAt_hour',
    'legs1_arrivalAt_day',
    'legs1_arrivalAt_year',
]

# === Training and imputation function for a target column ===
def impute_column(target):
    """Impute a categorical *target* in the module-level ``train``/``test`` frames.

    An XGBClassifier is fitted on the rows of ``train`` that are complete in
    ``features`` plus *target* (target values are label-encoded first). The
    decoded predictions are written, in place, into every missing cell whose
    feature values are all present.

    The evaluation metric follows the number of classes found in the
    target: ``logloss`` for up to two classes, ``mlogloss`` for more, since
    ``logloss`` is the binary form of the metric.

    Prints progress and a per-dataset summary; returns None.
    """
    print(f"\n🔧 Processing target: '{target}'")

    train_valid = train.dropna(subset=features + [target])
    if train_valid.empty:
        print(f"❌ Skipped '{target}': No complete training data.")
        return

    X_train = train_valid[features]
    y_train_raw = train_valid[target]

    # Label Encoding
    le = LabelEncoder()
    y_train = le.fit_transform(y_train_raw)

    # Targets such as hour or day have many classes; year may have only two
    eval_metric = 'mlogloss' if len(le.classes_) > 2 else 'logloss'

    # Model
    model = XGBClassifier(
        n_estimators=500,
        max_depth=12,
        learning_rate=0.1,
        tree_method='hist',
        n_jobs=24,
        random_state=42,
        eval_metric=eval_metric
    )
    model.fit(X_train, y_train)

    # Impute train & test
    for df, name in [(train, "train"), (test, "test")]:
        missing_mask = df[target].isna()
        if missing_mask.sum() == 0:
            print(f"✅ {name}: No missing values in '{target}'")
            continue

        # Only rows with every feature present can be predicted
        imputable = df.loc[missing_mask, features].dropna()
        imputable_indices = imputable.index

        if len(imputable_indices) == 0:
            print(f"⚠️ {name}: No imputable rows with all required features for '{target}'.")
            continue

        preds = model.predict(imputable)
        # Map class ids back to the original target values
        preds_labels = le.inverse_transform(preds)
        df.loc[imputable_indices, target] = preds_labels
        print(f"✅ {name}: Imputed {len(imputable_indices)} missing values in '{target}'")

    # Report (after both frames were processed)
    for df, name in [(train, "train"), (test, "test")]:
        total = len(df)
        missing = df[target].isna().sum()
        # Sorted by value, so head(10) shows the ten smallest values
        value_counts = df[target].value_counts(dropna=False).sort_index()
        print(f"\n📊 {name} — '{target}':")
        print(f"  Total rows: {total}")
        print(f"  Missing: {missing} ({100 * missing / total:.2f}%)")
        print("  Top 10 value counts (including imputations):")
        print(value_counts.head(10))

# === Run for each target column ===
# Each target gets its own independently trained model.
for target in target_columns:
    impute_column(target)

# === Save final files ===
# The input CSVs are overwritten with the imputed frames.
for frame, path in ((train, "train.csv"), (test, "test.csv")):
    frame.to_csv(path, index=False)

# === Done ===
elapsed = round(time.time() - start, 2)
print(f"\n⏱️ Total duration: {elapsed} seconds")


In [None]:
🔧 Processing target: 'legs1_departureAt_hour'
✅ train: Imputed 4387201 missing values in 'legs1_departureAt_hour'
✅ test: Imputed 1115840 missing values in 'legs1_departureAt_hour'

📊 train — 'legs1_departureAt_hour':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_departureAt_hour
0.0    393387
1.0    161969
2.0    115577
3.0     86281
4.0    106254
5.0    261518
6.0    719628
7.0    895037
8.0    760941
9.0    945254
Name: count, dtype: int64

📊 test — 'legs1_departureAt_hour':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_departureAt_hour
0.0     53296
1.0     43638
2.0     24363
3.0     14219
4.0     16397
5.0     72157
6.0    277442
7.0    458236
8.0    344471
9.0    419305
Name: count, dtype: int64

🔧 Processing target: 'legs1_departureAt_day'
✅ train: Imputed 4387201 missing values in 'legs1_departureAt_day'
✅ test: Imputed 1115840 missing values in 'legs1_departureAt_day'

📊 train — 'legs1_departureAt_day':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_departureAt_day
1.0     511800
2.0     508327
3.0     536821
4.0     685396
5.0     678663
6.0     689507
7.0     418581
8.0     524057
9.0     594026
10.0    499952
Name: count, dtype: int64

📊 test — 'legs1_departureAt_day':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_departureAt_day
1.0     106524
2.0      69987
3.0      96969
4.0     156785
5.0     180100
6.0     511520
7.0     186139
8.0     241671
9.0      83000
10.0    107924
Name: count, dtype: int64

🔧 Processing target: 'legs1_departureAt_year'
✅ train: Imputed 4387201 missing values in 'legs1_departureAt_year'
✅ test: Imputed 1115840 missing values in 'legs1_departureAt_year'

📊 train — 'legs1_departureAt_year':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_departureAt_year
2024.0    18083540
2025.0       61832
Name: count, dtype: int64

📊 test — 'legs1_departureAt_year':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_departureAt_year
2024.0    6057506
2025.0     840270
Name: count, dtype: int64

🔧 Processing target: 'legs1_arrivalAt_hour'
✅ train: Imputed 4387201 missing values in 'legs1_arrivalAt_hour'
✅ test: Imputed 1115840 missing values in 'legs1_arrivalAt_hour'

📊 train — 'legs1_arrivalAt_hour':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_arrivalAt_hour
0.0    683270
1.0    836862
2.0    274884
3.0    103193
4.0    280755
5.0    337765
6.0    472778
7.0    568381
8.0    762010
9.0    708097
Name: count, dtype: int64

📊 test — 'legs1_arrivalAt_hour':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_arrivalAt_hour
0.0    299374
1.0    156804
2.0     68247
3.0     42509
4.0     41576
5.0    121890
6.0    146090
7.0    181799
8.0    339903
9.0    346955
Name: count, dtype: int64

🔧 Processing target: 'legs1_arrivalAt_day'
✅ train: Imputed 4387201 missing values in 'legs1_arrivalAt_day'
✅ test: Imputed 1115840 missing values in 'legs1_arrivalAt_day'

📊 train — 'legs1_arrivalAt_day':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_arrivalAt_day
1.0     552349
2.0     522264
3.0     502369
4.0     609874
5.0     805291
6.0     606334
7.0     496193
8.0     512826
9.0     575112
10.0    517415
Name: count, dtype: int64

📊 test — 'legs1_arrivalAt_day':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_arrivalAt_day
1.0     122863
2.0      72214
3.0      84069
4.0     143204
5.0     186988
6.0     452903
7.0     223914
8.0     231200
9.0     118884
10.0    110787
Name: count, dtype: int64

🔧 Processing target: 'legs1_arrivalAt_year'
✅ train: Imputed 4387201 missing values in 'legs1_arrivalAt_year'
✅ test: Imputed 1115840 missing values in 'legs1_arrivalAt_year'

📊 train — 'legs1_arrivalAt_year':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_arrivalAt_year
2024.0    18083978
2025.0       61394
Name: count, dtype: int64

📊 test — 'legs1_arrivalAt_year':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_arrivalAt_year
2024.0    6057960
2025.0     839816
Name: count, dtype: int64

⏱️ Total duration: 19245.6 seconds

In [None]:
%%writefile impute_multiple_targets.py
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
import time

# === Timer ===
# Wall-clock reference used by the final duration report.
start = time.time()

# === Load files ===
# Both frames are modified in place by impute_column() and written back at the end.
train, test = (
    pd.read_csv(path, low_memory=False) for path in ("train.csv", "test.csv")
)

# === Relevant columns for prediction ===
# Predictor columns, kept in this exact order for the model input.
features = [
    'totalPrice', 'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D', 'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP',
    'miniRules0_monetaryAmount', 'miniRules1_monetaryAmount',
    'miniRules0_statusInfos', 'miniRules1_statusInfos',
]

# === Target columns to impute ===
# Minute and month of the legs1 departure, followed by the same two for arrival.
target_columns = [
    f'legs1_{moment}_{part}'
    for moment in ('departureAt', 'arrivalAt')
    for part in ('minute', 'month')
]

# === Training and imputation function for a target column ===
def impute_column(target):
    """Fill the missing values of one column with a rounded XGBRegressor prediction.

    The regressor is fitted on the rows of the module-level ``train`` frame
    where every column in ``features`` and ``target`` itself are present.
    Missing ``target`` cells in both ``train`` and ``test`` are then overwritten
    in place, but only for rows whose features are all non-null. Predictions
    are rounded to the nearest integer and clipped to 0-59 when the column name
    contains ``minute``, or to 1-12 when it contains ``month``. A short
    per-frame report is printed afterwards.

    Args:
        target: Name of the column to impute; must exist in both frames.

    Returns:
        None. The global ``train`` and ``test`` frames are mutated.
    """
    print(f"\n🔧 Processing target: '{target}'")

    complete_rows = train.dropna(subset=features + [target])
    if complete_rows.empty:
        print(f"❌ Skipped '{target}': No complete training data.")
        return

    # Model
    regressor = XGBRegressor(
        n_estimators=500,
        max_depth=12,
        learning_rate=0.1,
        tree_method='hist',
        n_jobs=24,
        random_state=42,
        eval_metric='rmse'
    )
    regressor.fit(complete_rows[features], complete_rows[target].astype(float))

    datasets = (("train", train), ("test", test))

    # Impute train & test
    for name, df in datasets:
        gaps = df[target].isna()
        if not gaps.any():
            print(f"✅ {name}: No missing values in '{target}'")
            continue

        # Only rows with a complete feature vector are predicted.
        candidates = df.loc[gaps, features].dropna()
        if candidates.shape[0] == 0:
            print(f"⚠️ {name}: No imputable rows with all required features for '{target}'.")
            continue

        # Round first, then clip to the valid range for the column kind.
        filled = np.round(regressor.predict(candidates))
        if 'minute' in target:
            filled = np.clip(filled, 0.0, 59.0)
        elif 'month' in target:
            filled = np.clip(filled, 1.0, 12.0)

        # Assign as float
        df.loc[candidates.index, target] = filled
        print(f"✅ {name}: Imputed {len(candidates.index)} missing values in '{target}'")

    # Report
    for name, df in datasets:
        column = df[target]
        total = len(df)
        missing = column.isna().sum()
        # Sorted by value, so the first ten entries are the ten smallest
        # values rather than the ten most frequent ones.
        counts_by_value = column.value_counts(dropna=False).sort_index()
        print(f"\n📊 {name} — '{target}':")
        print(f"  Total rows: {total}")
        print(f"  Missing: {missing} ({100 * missing / total:.2f}%)")
        print("  Top 10 value counts (including imputations):")
        print(counts_by_value.head(10))

# === Run for each target column ===
# Each target gets its own independently trained model.
for target in target_columns:
    impute_column(target)

# === Save final files ===
# The input CSVs are overwritten with the imputed frames.
for frame, path in ((train, "train.csv"), (test, "test.csv")):
    frame.to_csv(path, index=False)

# === Done ===
elapsed = round(time.time() - start, 2)
print(f"\n⏱️ Total duration: {elapsed} seconds")


In [None]:

🔧 Processing target: 'legs1_departureAt_minute'
✅ train: Imputed 4387201 missing values in 'legs1_departureAt_minute'
✅ test: Imputed 1115840 missing values in 'legs1_departureAt_minute'

📊 train — 'legs1_departureAt_minute':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_departureAt_minute
0.0    2858573
1.0        900
2.0       1381
3.0       1482
4.0       3034
5.0     670899
6.0       4307
7.0       6378
8.0       8066
9.0      10667
Name: count, dtype: int64

📊 test — 'legs1_departureAt_minute':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_departureAt_minute
0.0    1903999
1.0        461
2.0        295
3.0        532
4.0        708
5.0     239564
6.0       1379
7.0       3264
8.0       2601
9.0       3696
Name: count, dtype: int64

🔧 Processing target: 'legs1_departureAt_month'
✅ train: Imputed 4387201 missing values in 'legs1_departureAt_month'
✅ test: Imputed 1115840 missing values in 'legs1_departureAt_month'

📊 train — 'legs1_departureAt_month':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_departureAt_month
1.0       56630
2.0        2155
3.0         881
4.0        1770
5.0      197529
6.0     1150305
7.0     2119744
8.0     3810257
9.0     5018265
10.0    4268105
Name: count, dtype: int64

📊 test — 'legs1_departureAt_month':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_departureAt_month
1.0     516885
2.0     281334
3.0      26488
4.0       9676
5.0       4808
6.0       8236
7.0      53889
8.0     242772
9.0     464477
10.0    301265
Name: count, dtype: int64

🔧 Processing target: 'legs1_arrivalAt_minute'
✅ train: Imputed 4387201 missing values in 'legs1_arrivalAt_minute'
✅ test: Imputed 1115840 missing values in 'legs1_arrivalAt_minute'

📊 train — 'legs1_arrivalAt_minute':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_arrivalAt_minute
0.0    1935102
1.0       1456
2.0        983
3.0       1482
4.0       2212
5.0    1025889
6.0       4425
7.0       5654
8.0       8146
9.0      10985
Name: count, dtype: int64

📊 test — 'legs1_arrivalAt_minute':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_arrivalAt_minute
0.0    381944
1.0       132
2.0       208
3.0       317
4.0       392
5.0    530740
6.0       955
7.0      1225
8.0      1760
9.0      2303
Name: count, dtype: int64

🔧 Processing target: 'legs1_arrivalAt_month'
✅ train: Imputed 4387201 missing values in 'legs1_arrivalAt_month'
✅ test: Imputed 1115840 missing values in 'legs1_arrivalAt_month'

📊 train — 'legs1_arrivalAt_month':
  Total rows: 18145372
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_arrivalAt_month
1.0       56672
2.0        2188
3.0         901
4.0        1974
5.0      188626
6.0     1147816
7.0     2096783
8.0     3889195
9.0     5038136
10.0    4162021
Name: count, dtype: int64

📊 test — 'legs1_arrivalAt_month':
  Total rows: 6897776
  Missing: 0 (0.00%)
  Top 10 value counts (including imputations):
legs1_arrivalAt_month
1.0     499721
2.0     296704
3.0      28547
4.0       9830
5.0       5620
6.0      12366
7.0      60138
8.0     254093
9.0     454942
10.0    287553
Name: count, dtype: int64

⏱️ Total duration: 1180.55 seconds

In [None]:
%%writefile check_unique_values.py
import pandas as pd

# Load CSV files
train, test = (
    pd.read_csv(path, low_memory=False) for path in ("train.csv", "test.csv")
)

# Columns of interest: minute and month of the legs1 departure and arrival
columns = [
    f'legs1_{moment}_{part}'
    for moment in ('departureAt', 'arrivalAt')
    for part in ('minute', 'month')
]


def _print_sorted_uniques(frame, col):
    """Print the sorted distinct non-null values of ``frame[col]``."""
    print(sorted(frame[col].dropna().unique()))


# Display unique values for each column in train and test
for col in columns:
    print(f"\n🔍 {col} — train.csv:")
    _print_sorted_uniques(train, col)

    print(f"🔍 {col} — test.csv:")
    _print_sorted_uniques(test, col)


In [None]:
🔍 legs1_departureAt_minute — train.csv:
[np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(9.0), np.float64(10.0), np.float64(11.0), np.float64(12.0), np.float64(13.0), np.float64(14.0), np.float64(15.0), np.float64(16.0), np.float64(17.0), np.float64(18.0), np.float64(19.0), np.float64(20.0), np.float64(21.0), np.float64(22.0), np.float64(23.0), np.float64(24.0), np.float64(25.0), np.float64(26.0), np.float64(27.0), np.float64(28.0), np.float64(29.0), np.float64(30.0), np.float64(31.0), np.float64(32.0), np.float64(33.0), np.float64(34.0), np.float64(35.0), np.float64(36.0), np.float64(37.0), np.float64(38.0), np.float64(39.0), np.float64(40.0), np.float64(41.0), np.float64(42.0), np.float64(43.0), np.float64(44.0), np.float64(45.0), np.float64(46.0), np.float64(47.0), np.float64(48.0), np.float64(49.0), np.float64(50.0), np.float64(51.0), np.float64(52.0), np.float64(53.0), np.float64(54.0), np.float64(55.0), np.float64(56.0), np.float64(57.0), np.float64(58.0), np.float64(59.0)]
🔍 legs1_departureAt_minute — test.csv:
[np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(9.0), np.float64(10.0), np.float64(11.0), np.float64(12.0), np.float64(13.0), np.float64(14.0), np.float64(15.0), np.float64(16.0), np.float64(17.0), np.float64(18.0), np.float64(19.0), np.float64(20.0), np.float64(21.0), np.float64(22.0), np.float64(23.0), np.float64(24.0), np.float64(25.0), np.float64(26.0), np.float64(27.0), np.float64(28.0), np.float64(29.0), np.float64(30.0), np.float64(31.0), np.float64(32.0), np.float64(33.0), np.float64(34.0), np.float64(35.0), np.float64(36.0), np.float64(37.0), np.float64(38.0), np.float64(39.0), np.float64(40.0), np.float64(41.0), np.float64(42.0), np.float64(43.0), np.float64(44.0), np.float64(45.0), np.float64(46.0), np.float64(47.0), np.float64(48.0), np.float64(49.0), np.float64(50.0), np.float64(51.0), np.float64(52.0), np.float64(53.0), np.float64(54.0), np.float64(55.0), np.float64(56.0), np.float64(57.0), np.float64(58.0), np.float64(59.0)]

🔍 legs1_departureAt_month — train.csv:
[np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(9.0), np.float64(10.0), np.float64(11.0), np.float64(12.0)]
🔍 legs1_departureAt_month — test.csv:
[np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(9.0), np.float64(10.0), np.float64(11.0), np.float64(12.0)]

🔍 legs1_arrivalAt_minute — train.csv:
[np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(9.0), np.float64(10.0), np.float64(11.0), np.float64(12.0), np.float64(13.0), np.float64(14.0), np.float64(15.0), np.float64(16.0), np.float64(17.0), np.float64(18.0), np.float64(19.0), np.float64(20.0), np.float64(21.0), np.float64(22.0), np.float64(23.0), np.float64(24.0), np.float64(25.0), np.float64(26.0), np.float64(27.0), np.float64(28.0), np.float64(29.0), np.float64(30.0), np.float64(31.0), np.float64(32.0), np.float64(33.0), np.float64(34.0), np.float64(35.0), np.float64(36.0), np.float64(37.0), np.float64(38.0), np.float64(39.0), np.float64(40.0), np.float64(41.0), np.float64(42.0), np.float64(43.0), np.float64(44.0), np.float64(45.0), np.float64(46.0), np.float64(47.0), np.float64(48.0), np.float64(49.0), np.float64(50.0), np.float64(51.0), np.float64(52.0), np.float64(53.0), np.float64(54.0), np.float64(55.0), np.float64(56.0), np.float64(57.0), np.float64(58.0), np.float64(59.0)]
🔍 legs1_arrivalAt_minute — test.csv:
[np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(9.0), np.float64(10.0), np.float64(11.0), np.float64(12.0), np.float64(13.0), np.float64(14.0), np.float64(15.0), np.float64(16.0), np.float64(17.0), np.float64(18.0), np.float64(19.0), np.float64(20.0), np.float64(21.0), np.float64(22.0), np.float64(23.0), np.float64(24.0), np.float64(25.0), np.float64(26.0), np.float64(27.0), np.float64(28.0), np.float64(29.0), np.float64(30.0), np.float64(31.0), np.float64(32.0), np.float64(33.0), np.float64(34.0), np.float64(35.0), np.float64(36.0), np.float64(37.0), np.float64(38.0), np.float64(39.0), np.float64(40.0), np.float64(41.0), np.float64(42.0), np.float64(43.0), np.float64(44.0), np.float64(45.0), np.float64(46.0), np.float64(47.0), np.float64(48.0), np.float64(49.0), np.float64(50.0), np.float64(51.0), np.float64(52.0), np.float64(53.0), np.float64(54.0), np.float64(55.0), np.float64(56.0), np.float64(57.0), np.float64(58.0), np.float64(59.0)]

🔍 legs1_arrivalAt_month — train.csv:
[np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(9.0), np.float64(10.0), np.float64(11.0), np.float64(12.0)]
🔍 legs1_arrivalAt_month — test.csv:
[np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(9.0), np.float64(10.0), np.float64(11.0), np.float64(12.0)]

In [None]:
%%writefile complete_columns.py
import pandas as pd
import numpy as np

# Load files (both frames are completed in place and written back below)
train, test = (
    pd.read_csv(path, low_memory=False) for path in ("train.csv", "test.csv")
)

def fill_missing_time_features(df, prefix):
    """Rebuild missing date-derived features for one timestamp prefix, in place.

    For rows where at least one derived column (``dayofweek``, ``weekofyear``,
    ``is_weekend``, ``part_of_day``, ``hour_sin``, ``hour_cos``) is missing
    while all five components (``year``, ``month``, ``day``, ``hour``,
    ``minute``) are present, the derived columns are recomputed from the
    components.

    If the components of a row do not form a real calendar date (for example
    day 30 in month 2), the date-derived columns ``dayofweek``, ``weekofyear``
    and ``is_weekend`` are left as NaN for that row and a warning is printed.
    The hour-derived columns are still filled, since they depend only on
    ``hour``.

    Args:
        df: Frame holding the ``{prefix}_*`` columns; modified in place.
        prefix: Column prefix, e.g. ``"legs1_departureAt"``.

    Returns:
        The same ``df`` object that was passed in.
    """
    components = ["year", "month", "day", "hour", "minute"]
    derived = ["dayofweek", "weekofyear", "is_weekend",
               "part_of_day", "hour_sin", "hour_cos"]

    # 1. Ensure all required components are available. This runs before any
    #    column is indexed so a missing column yields the skip message.
    required_cols = [f"{prefix}_{comp}" for comp in components]
    if not all(col in df.columns for col in required_cols):
        print(f"⚠️ Missing components for {prefix}, skipping.")
        return df

    # 2. Rows lacking at least one derived feature but having every component
    derived_cols = [f"{prefix}_{name}" for name in derived]
    missing_mask = df[derived_cols].isna().any(axis=1)
    complete_mask = df[required_cols].notna().all(axis=1)
    final_mask = missing_mask & complete_mask

    n_rows = int(final_mask.sum())
    if n_rows == 0:
        print(f"✅ {prefix}: nothing to complete.")
        return df

    print(f"🔧 Filling {n_rows} rows for {prefix}")

    # 3. Create datetime only for the selected rows; impossible dates become NaT
    temp_dt = pd.to_datetime(
        {comp: df.loc[final_mask, f"{prefix}_{comp}"].astype(int) for comp in components},
        errors="coerce",
    )
    valid_date = temp_dt.notna()
    n_invalid = int((~valid_date).sum())
    if n_invalid:
        print(f"⚠️ {prefix}: {n_invalid} rows do not form a valid calendar date; "
              f"their date-derived features stay NaN.")
        # NaN is about to be written: make sure integer columns can hold it.
        for name in ("dayofweek", "weekofyear", "is_weekend"):
            col = f"{prefix}_{name}"
            if pd.api.types.is_integer_dtype(df[col]):
                df[col] = df[col].astype("float64")

    # 4. Fill in the date-derived values
    dayofweek = temp_dt.dt.dayofweek
    df.loc[final_mask, f"{prefix}_dayofweek"] = dayofweek
    # isocalendar() yields a nullable UInt32; convert so NaT rows become plain NaN.
    df.loc[final_mask, f"{prefix}_weekofyear"] = (
        temp_dt.dt.isocalendar().week.astype("float64")
    )
    # Keep NaN (not 0) where the date is invalid, consistent with dayofweek.
    df.loc[final_mask, f"{prefix}_is_weekend"] = (
        dayofweek.isin([5, 6]).astype("float64").where(valid_date)
    )

    # 5. Fill in the hour-derived values. `hour` is never NaN here because
    #    final_mask requires every component to be present.
    hour = df.loc[final_mask, f"{prefix}_hour"]
    hour_values = hour.to_numpy()
    df.loc[final_mask, f"{prefix}_part_of_day"] = np.select(
        [hour_values < 6, hour_values < 12, hour_values < 18],
        ["night", "morning", "afternoon"],
        default="evening",
    )

    # Cyclical encoding of the hour on a 24-hour circle
    angle = 2 * np.pi * hour / 24
    df.loc[final_mask, f"{prefix}_hour_sin"] = np.sin(angle)
    df.loc[final_mask, f"{prefix}_hour_cos"] = np.cos(angle)

    return df

# Apply for both train and test, for departure and arrival.
# fill_missing_time_features edits the frame it is given, so its return
# value does not need to be kept.
for frame in (train, test):
    for prefix in ("legs1_departureAt", "legs1_arrivalAt"):
        fill_missing_time_features(frame, prefix)

# Save the files (overwrites the inputs)
for frame, path in ((train, "train.csv"), (test, "test.csv")):
    frame.to_csv(path, index=False)


In [None]:
%%writefile check_columns.py
import pandas as pd

# Load the files (read-only: this script only reports on them)
train, test = (
    pd.read_csv(path, low_memory=False) for path in ("train.csv", "test.csv")
)

def check_leg_columns(df, name):
    """Print a short report for every departure/arrival column of ``df``.

    A column is reported when its name contains "departureAt" or
    "arrivalAt". Columns are visited in sorted name order, and for each one
    the number of missing values, the number of distinct non-missing values
    and a sorted sample (taken from the first 20 distinct values) are printed.

    Args:
        df: DataFrame to inspect.
        name: Label printed in the report header.
    """
    print(f"\n🔍 Checks for: {name}")

    time_columns = sorted(
        label for label in df.columns
        if "departureAt" in label or "arrivalAt" in label
    )

    for col in time_columns:
        series = df[col]
        missing = series.isna().sum()
        distinct = series.dropna().unique()
        # The sample is the first 20 distinct values, sorted for display
        preview = sorted(distinct[:20])
        distinct_count = len(distinct)

        print(f"\n📌 Column: {col}")
        print(f"   - Missing values: {missing}")
        print(f"   - Number of unique values: {distinct_count}")
        print(f"   - Sample unique values: {preview}")

# Report on the train set first, then on the test set
for frame, label in ((train, "train.csv"), (test, "test.csv")):
    check_leg_columns(frame, label)


In [None]:
🔍 Verificări pentru: train.csv

📌 Coloana: legs0_arrivalAt
   - Valori lipsă: 0
   - Nr. valori unice: 60143
   - Valori unice (eșantion): ['2024-05-20T10:30:00', '2024-05-20T12:30:00', '2024-05-20T15:40:00', '2024-05-20T22:15:00', '2024-05-21T08:35:00', '2024-05-29T11:10:00', '2024-05-30T07:35:00', '2024-05-30T12:55:00', '2024-05-30T18:30:00', '2024-06-04T10:55:00', '2024-06-04T11:30:00', '2024-06-04T12:25:00', '2024-06-04T15:50:00', '2024-06-04T17:15:00', '2024-06-15T09:15:00', '2024-06-15T14:50:00', '2024-06-15T16:20:00', '2024-06-15T17:20:00', '2024-06-15T21:35:00', '2024-06-16T06:50:00']

📌 Coloana: legs0_departureAt
   - Valori lipsă: 0
   - Nr. valori unice: 56619
   - Valori unice (eșantion): ['2024-05-20T08:05:00', '2024-05-20T08:10:00', '2024-05-20T10:10:00', '2024-05-20T13:20:00', '2024-05-20T19:40:00', '2024-05-29T08:05:00', '2024-05-29T10:50:00', '2024-05-30T10:40:00', '2024-05-30T16:00:00', '2024-06-04T06:50:00', '2024-06-04T09:20:00', '2024-06-04T09:35:00', '2024-06-04T11:05:00', '2024-06-04T11:25:00', '2024-06-15T09:25:00', '2024-06-15T09:50:00', '2024-06-15T11:25:00', '2024-06-15T15:40:00', '2024-06-28T10:10:00', '2024-06-28T16:20:00']

📌 Coloana: legs1_arrivalAt
   - Valori lipsă: 4387201
   - Nr. valori unice: 50352
   - Valori unice (eșantion): ['2024-05-22 05:55:00', '2024-05-22 08:25:00', '2024-05-22 08:55:00', '2024-05-22 12:55:00', '2024-05-22 19:35:00', '2024-05-22 20:20:00', '2024-05-22 20:55:00', '2024-05-23 00:50:00', '2024-05-23 05:20:00', '2024-05-23 05:35:00', '2024-05-25 11:05:00', '2024-05-25 12:55:00', '2024-05-25 14:35:00', '2024-05-25 16:35:00', '2024-05-25 18:15:00', '2024-05-25 22:00:00', '2024-05-25 22:40:00', '2024-05-29 03:05:00', '2024-07-09 14:20:00', '2024-07-10 08:30:00']

📌 Coloana: legs1_arrivalAt_day
   - Valori lipsă: 0
   - Nr. valori unice: 31
   - Valori unice (eșantion): [np.float64(4.0), np.float64(5.0), np.float64(6.0), np.float64(9.0), np.float64(10.0), np.float64(11.0), np.float64(13.0), np.float64(16.0), np.float64(17.0), np.float64(18.0), np.float64(19.0), np.float64(21.0), np.float64(22.0), np.float64(23.0), np.float64(24.0), np.float64(26.0), np.float64(27.0), np.float64(28.0), np.float64(29.0), np.float64(30.0)]

📌 Coloana: legs1_arrivalAt_dayofweek
   - Valori lipsă: 23376
   - Nr. valori unice: 7
   - Valori unice (eșantion): [np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0)]

📌 Coloana: legs1_arrivalAt_hour
   - Valori lipsă: 0
   - Nr. valori unice: 24
   - Valori unice (eșantion): [np.float64(2.0), np.float64(3.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(9.0), np.float64(10.0), np.float64(11.0), np.float64(12.0), np.float64(13.0), np.float64(14.0), np.float64(15.0), np.float64(16.0), np.float64(17.0), np.float64(18.0), np.float64(19.0), np.float64(20.0), np.float64(21.0), np.float64(23.0)]

📌 Coloana: legs1_arrivalAt_hour_cos
   - Valori lipsă: 0
   - Nr. valori unice: 23
   - Valori unice (eșantion): [np.float64(-1.0), np.float64(-0.9659258262890684), np.float64(-0.9659258262890682), np.float64(-0.8660254037844388), np.float64(-0.8660254037844387), np.float64(-0.7071067811865479), np.float64(-0.7071067811865475), np.float64(-0.5000000000000004), np.float64(-0.4999999999999998), np.float64(-0.2588190451025206), np.float64(-1.8369701987210294e-16), np.float64(6.123233995736766e-17), np.float64(0.2588190451025203), np.float64(0.2588190451025207), np.float64(0.5000000000000001), np.float64(0.7071067811865474), np.float64(0.7071067811865476), np.float64(0.8660254037844384), np.float64(0.8660254037844387), np.float64(0.965925826289068)]

📌 Coloana: legs1_arrivalAt_hour_sin
   - Valori lipsă: 0
   - Nr. valori unice: 21
   - Valori unice (eșantion): [np.float64(-1.0), np.float64(-0.9659258262890684), np.float64(-0.8660254037844386), np.float64(-0.8660254037844384), np.float64(-0.7071067811865477), np.float64(-0.7071067811865471), np.float64(-0.5000000000000004), np.float64(-0.4999999999999997), np.float64(-0.2588190451025215), np.float64(-0.2588190451025208), np.float64(1.2246467991473532e-16), np.float64(0.2588190451025207), np.float64(0.258819045102521), np.float64(0.4999999999999999), np.float64(0.7071067811865475), np.float64(0.7071067811865476), np.float64(0.8660254037844386), np.float64(0.8660254037844387), np.float64(0.9659258262890684), np.float64(1.0)]

📌 Coloana: legs1_arrivalAt_is_weekend
   - Valori lipsă: 0
   - Nr. valori unice: 2
   - Valori unice (eșantion): [np.int64(0), np.int64(1)]

📌 Coloana: legs1_arrivalAt_minute
   - Valori lipsă: 0
   - Nr. valori unice: 60
   - Valori unice (eșantion): [np.float64(18.0), np.float64(20.0), np.float64(21.0), np.float64(22.0), np.float64(24.0), np.float64(25.0), np.float64(26.0), np.float64(27.0), np.float64(28.0), np.float64(29.0), np.float64(30.0), np.float64(31.0), np.float64(32.0), np.float64(33.0), np.float64(34.0), np.float64(35.0), np.float64(37.0), np.float64(38.0), np.float64(39.0), np.float64(41.0)]

📌 Coloana: legs1_arrivalAt_month
   - Valori lipsă: 0
   - Nr. valori unice: 12
   - Valori unice (eșantion): [np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(9.0), np.float64(10.0), np.float64(11.0), np.float64(12.0)]

📌 Coloana: legs1_arrivalAt_part_of_day
   - Valori lipsă: 0
   - Nr. valori unice: 4
   - Valori unice (eșantion): ['afternoon', 'evening', 'morning', 'night']

📌 Coloana: legs1_arrivalAt_weekofyear
   - Valori lipsă: 23376
   - Nr. valori unice: 52
   - Valori unice (eșantion): [np.float64(19.0), np.float64(20.0), np.float64(22.0), np.float64(23.0), np.float64(25.0), np.float64(26.0), np.float64(28.0), np.float64(29.0), np.float64(30.0), np.float64(31.0), np.float64(32.0), np.float64(33.0), np.float64(34.0), np.float64(35.0), np.float64(37.0), np.float64(38.0), np.float64(39.0), np.float64(40.0), np.float64(41.0), np.float64(42.0)]

📌 Coloana: legs1_arrivalAt_year
   - Valori lipsă: 0
   - Nr. valori unice: 2
   - Valori unice (eșantion): [np.float64(2024.0), np.float64(2025.0)]

📌 Coloana: legs1_departureAt
   - Valori lipsă: 4387201
   - Nr. valori unice: 48606
   - Valori unice (eșantion): ['2024-05-22 04:45:00', '2024-05-22 05:15:00', '2024-05-22 07:45:00', '2024-05-22 08:25:00', '2024-05-22 11:10:00', '2024-05-22 11:45:00', '2024-05-22 13:40:00', '2024-05-22 18:50:00', '2024-05-25 06:00:00', '2024-05-25 08:25:00', '2024-05-25 08:30:00', '2024-05-25 11:30:00', '2024-05-25 12:05:00', '2024-05-25 14:20:00', '2024-05-25 16:30:00', '2024-05-25 22:25:00', '2024-07-09 09:45:00', '2024-07-09 17:55:00', '2024-07-09 19:35:00', '2024-07-09 22:05:00']

📌 Coloana: legs1_departureAt_day
   - Valori lipsă: 0
   - Nr. valori unice: 31
   - Valori unice (eșantion): [np.float64(1.0), np.float64(3.0), np.float64(5.0), np.float64(6.0), np.float64(9.0), np.float64(11.0), np.float64(13.0), np.float64(16.0), np.float64(17.0), np.float64(18.0), np.float64(19.0), np.float64(20.0), np.float64(21.0), np.float64(22.0), np.float64(24.0), np.float64(25.0), np.float64(27.0), np.float64(28.0), np.float64(29.0), np.float64(30.0)]

📌 Coloana: legs1_departureAt_dayofweek
   - Valori lipsă: 33113
   - Nr. valori unice: 7
   - Valori unice (eșantion): [np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0)]

📌 Coloana: legs1_departureAt_hour
   - Valori lipsă: 0
   - Nr. valori unice: 24
   - Valori unice (eșantion): [np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(9.0), np.float64(10.0), np.float64(12.0), np.float64(13.0), np.float64(14.0), np.float64(15.0), np.float64(17.0), np.float64(18.0), np.float64(19.0), np.float64(20.0), np.float64(21.0), np.float64(22.0), np.float64(23.0)]

📌 Coloana: legs1_departureAt_hour_cos
   - Valori lipsă: 0
   - Nr. valori unice: 23
   - Valori unice (eșantion): [np.float64(-1.0), np.float64(-0.9659258262890684), np.float64(-0.9659258262890682), np.float64(-0.8660254037844388), np.float64(-0.8660254037844387), np.float64(-0.7071067811865479), np.float64(-0.7071067811865475), np.float64(-0.5000000000000004), np.float64(-0.4999999999999998), np.float64(-0.2588190451025206), np.float64(-1.8369701987210294e-16), np.float64(6.123233995736766e-17), np.float64(0.2588190451025203), np.float64(0.2588190451025207), np.float64(0.5000000000000001), np.float64(0.7071067811865474), np.float64(0.7071067811865476), np.float64(0.8660254037844384), np.float64(0.8660254037844387), np.float64(0.965925826289068)]

📌 Coloana: legs1_departureAt_hour_sin
   - Valori lipsă: 0
   - Nr. valori unice: 21
   - Valori unice (eșantion): [np.float64(-1.0), np.float64(-0.9659258262890684), np.float64(-0.8660254037844386), np.float64(-0.8660254037844384), np.float64(-0.7071067811865477), np.float64(-0.7071067811865471), np.float64(-0.5000000000000004), np.float64(-0.4999999999999997), np.float64(-0.2588190451025215), np.float64(-0.2588190451025208), np.float64(1.2246467991473532e-16), np.float64(0.2588190451025207), np.float64(0.258819045102521), np.float64(0.4999999999999999), np.float64(0.7071067811865475), np.float64(0.7071067811865476), np.float64(0.8660254037844386), np.float64(0.8660254037844387), np.float64(0.9659258262890684), np.float64(1.0)]

📌 Coloana: legs1_departureAt_is_weekend
   - Valori lipsă: 0
   - Nr. valori unice: 2
   - Valori unice (eșantion): [np.int64(0), np.int64(1)]

📌 Coloana: legs1_departureAt_minute
   - Valori lipsă: 0
   - Nr. valori unice: 60
   - Valori unice (eșantion): [np.float64(5.0), np.float64(19.0), np.float64(20.0), np.float64(21.0), np.float64(22.0), np.float64(23.0), np.float64(24.0), np.float64(25.0), np.float64(27.0), np.float64(28.0), np.float64(29.0), np.float64(30.0), np.float64(31.0), np.float64(32.0), np.float64(33.0), np.float64(35.0), np.float64(38.0), np.float64(39.0), np.float64(45.0), np.float64(55.0)]

📌 Coloana: legs1_departureAt_month
   - Valori lipsă: 0
   - Nr. valori unice: 12
   - Valori unice (eșantion): [np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(9.0), np.float64(10.0), np.float64(11.0), np.float64(12.0)]

📌 Coloana: legs1_departureAt_part_of_day
   - Valori lipsă: 0
   - Nr. valori unice: 4
   - Valori unice (eșantion): ['afternoon', 'evening', 'morning', 'night']

📌 Coloana: legs1_departureAt_weekofyear
   - Valori lipsă: 33113
   - Nr. valori unice: 52
   - Valori unice (eșantion): [np.float64(21.0), np.float64(22.0), np.float64(23.0), np.float64(24.0), np.float64(25.0), np.float64(26.0), np.float64(27.0), np.float64(28.0), np.float64(29.0), np.float64(30.0), np.float64(31.0), np.float64(32.0), np.float64(33.0), np.float64(34.0), np.float64(35.0), np.float64(36.0), np.float64(37.0), np.float64(38.0), np.float64(39.0), np.float64(42.0)]

📌 Coloana: legs1_departureAt_year
   - Valori lipsă: 0
   - Nr. valori unice: 2
   - Valori unice (eșantion): [np.float64(2024.0), np.float64(2025.0)]

🔍 Verificări pentru: test.csv

📌 Coloana: legs0_arrivalAt
   - Valori lipsă: 0
   - Nr. valori unice: 29117
   - Valori unice (eșantion): ['2024-11-13T07:45:00', '2024-11-13T21:00:00', '2024-11-14T04:30:00', '2024-12-19T11:20:00', '2024-12-19T12:30:00', '2024-12-19T12:45:00', '2024-12-19T14:20:00', '2024-12-19T17:25:00', '2024-12-19T18:00:00', '2024-12-19T20:10:00', '2024-12-19T22:05:00', '2024-12-19T22:10:00', '2024-12-20T00:55:00', '2024-12-20T02:00:00', '2024-12-20T02:20:00', '2024-12-20T03:45:00', '2024-12-20T03:50:00', '2024-12-20T04:05:00', '2024-12-20T04:20:00', '2024-12-20T06:45:00']

📌 Coloana: legs0_departureAt
   - Valori lipsă: 0
   - Nr. valori unice: 27294
   - Valori unice (eșantion): ['2024-11-13T01:10:00', '2024-12-19T06:50:00', '2024-12-19T07:50:00', '2024-12-19T08:25:00', '2024-12-19T09:40:00', '2024-12-19T10:00:00', '2024-12-19T12:50:00', '2024-12-19T13:35:00', '2024-12-19T15:40:00', '2024-12-19T16:45:00', '2024-12-19T17:40:00', '2024-12-19T17:45:00', '2024-12-19T20:35:00', '2024-12-19T21:15:00', '2024-12-19T21:30:00', '2024-12-19T22:00:00', '2024-12-19T23:10:00', '2024-12-19T23:25:00', '2024-12-19T23:45:00', '2024-12-19T23:55:00']

📌 Coloana: legs1_arrivalAt
   - Valori lipsă: 1115840
   - Nr. valori unice: 22232
   - Valori unice (eșantion): ['2024-11-14 20:45:00', '2024-11-15 01:20:00', '2024-12-21 06:15:00', '2024-12-21 07:30:00', '2024-12-21 07:35:00', '2024-12-21 07:45:00', '2024-12-21 08:55:00', '2024-12-21 09:20:00', '2024-12-21 10:10:00', '2024-12-21 12:35:00', '2024-12-21 13:00:00', '2024-12-21 13:20:00', '2024-12-21 14:35:00', '2024-12-21 16:00:00', '2024-12-21 16:40:00', '2024-12-21 19:05:00', '2024-12-21 19:40:00', '2024-12-21 21:10:00', '2024-12-21 21:55:00', '2024-12-21 23:50:00']

📌 Coloana: legs1_arrivalAt_day
   - Valori lipsă: 0
   - Nr. valori unice: 31
   - Valori unice (eșantion): [np.float64(2.0), np.float64(3.0), np.float64(5.0), np.float64(8.0), np.float64(9.0), np.float64(10.0), np.float64(11.0), np.float64(13.0), np.float64(14.0), np.float64(15.0), np.float64(16.0), np.float64(17.0), np.float64(18.0), np.float64(19.0), np.float64(21.0), np.float64(22.0), np.float64(23.0), np.float64(25.0), np.float64(26.0), np.float64(28.0)]

📌 Coloana: legs1_arrivalAt_dayofweek
   - Valori lipsă: 7022
   - Nr. valori unice: 7
   - Valori unice (eșantion): [np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0)]

📌 Coloana: legs1_arrivalAt_hour
   - Valori lipsă: 0
   - Nr. valori unice: 24
   - Valori unice (eșantion): [np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(9.0), np.float64(10.0), np.float64(11.0), np.float64(12.0), np.float64(13.0), np.float64(14.0), np.float64(16.0), np.float64(17.0), np.float64(19.0), np.float64(20.0), np.float64(21.0), np.float64(22.0), np.float64(23.0)]

📌 Coloana: legs1_arrivalAt_hour_cos
   - Valori lipsă: 0
   - Nr. valori unice: 23
   - Valori unice (eșantion): [np.float64(-1.0), np.float64(-0.9659258262890684), np.float64(-0.9659258262890682), np.float64(-0.8660254037844388), np.float64(-0.8660254037844387), np.float64(-0.7071067811865475), np.float64(-0.5000000000000004), np.float64(-0.4999999999999998), np.float64(-0.2588190451025206), np.float64(6.123233995736766e-17), np.float64(0.2588190451025203), np.float64(0.2588190451025207), np.float64(0.5000000000000001), np.float64(0.7071067811865474), np.float64(0.7071067811865476), np.float64(0.8660254037844384), np.float64(0.8660254037844387), np.float64(0.965925826289068), np.float64(0.9659258262890684), np.float64(1.0)]

📌 Coloana: legs1_arrivalAt_hour_sin
   - Valori lipsă: 0
   - Nr. valori unice: 21
   - Valori unice (eșantion): [np.float64(-1.0), np.float64(-0.9659258262890684), np.float64(-0.8660254037844386), np.float64(-0.8660254037844384), np.float64(-0.7071067811865477), np.float64(-0.7071067811865471), np.float64(-0.5000000000000004), np.float64(-0.4999999999999997), np.float64(-0.2588190451025215), np.float64(-0.2588190451025208), np.float64(0.0), np.float64(1.2246467991473532e-16), np.float64(0.2588190451025207), np.float64(0.258819045102521), np.float64(0.4999999999999999), np.float64(0.7071067811865475), np.float64(0.7071067811865476), np.float64(0.8660254037844387), np.float64(0.9659258262890684), np.float64(1.0)]

📌 Coloana: legs1_arrivalAt_is_weekend
   - Valori lipsă: 0
   - Nr. valori unice: 2
   - Valori unice (eșantion): [np.int64(0), np.int64(1)]

📌 Coloana: legs1_arrivalAt_minute
   - Valori lipsă: 0
   - Nr. valori unice: 60
   - Valori unice (eșantion): [np.float64(0.0), np.float64(5.0), np.float64(10.0), np.float64(15.0), np.float64(20.0), np.float64(23.0), np.float64(24.0), np.float64(25.0), np.float64(26.0), np.float64(29.0), np.float64(30.0), np.float64(31.0), np.float64(32.0), np.float64(33.0), np.float64(34.0), np.float64(35.0), np.float64(40.0), np.float64(45.0), np.float64(50.0), np.float64(55.0)]

📌 Coloana: legs1_arrivalAt_month
   - Valori lipsă: 0
   - Nr. valori unice: 12
   - Valori unice (eșantion): [np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(9.0), np.float64(10.0), np.float64(11.0), np.float64(12.0)]

📌 Coloana: legs1_arrivalAt_part_of_day
   - Valori lipsă: 0
   - Nr. valori unice: 4
   - Valori unice (eșantion): ['afternoon', 'evening', 'morning', 'night']

📌 Coloana: legs1_arrivalAt_weekofyear
   - Valori lipsă: 7022
   - Nr. valori unice: 52
   - Valori unice (eșantion): [np.float64(27.0), np.float64(28.0), np.float64(31.0), np.float64(32.0), np.float64(33.0), np.float64(34.0), np.float64(35.0), np.float64(36.0), np.float64(37.0), np.float64(38.0), np.float64(39.0), np.float64(41.0), np.float64(42.0), np.float64(43.0), np.float64(44.0), np.float64(45.0), np.float64(46.0), np.float64(47.0), np.float64(48.0), np.float64(51.0)]

📌 Coloana: legs1_arrivalAt_year
   - Valori lipsă: 0
   - Nr. valori unice: 2
   - Valori unice (eșantion): [np.float64(2024.0), np.float64(2025.0)]

📌 Coloana: legs1_departureAt
   - Valori lipsă: 1115840
   - Nr. valori unice: 21221
   - Valori unice (eșantion): ['2024-11-14 08:50:00', '2024-11-14 18:05:00', '2024-11-14 22:20:00', '2024-12-21 05:30:00', '2024-12-21 06:50:00', '2024-12-21 07:00:00', '2024-12-21 07:20:00', '2024-12-21 08:15:00', '2024-12-21 08:30:00', '2024-12-21 09:30:00', '2024-12-21 12:00:00', '2024-12-21 12:15:00', '2024-12-21 13:55:00', '2024-12-21 15:20:00', '2024-12-21 16:00:00', '2024-12-21 18:25:00', '2024-12-21 19:00:00', '2024-12-21 20:30:00', '2024-12-21 21:10:00', '2024-12-21 23:10:00']

📌 Coloana: legs1_departureAt_day
   - Valori lipsă: 0
   - Nr. valori unice: 31
   - Valori unice (eșantion): [np.float64(2.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(10.0), np.float64(11.0), np.float64(13.0), np.float64(14.0), np.float64(15.0), np.float64(16.0), np.float64(17.0), np.float64(18.0), np.float64(20.0), np.float64(21.0), np.float64(22.0), np.float64(24.0), np.float64(26.0), np.float64(29.0), np.float64(31.0)]

📌 Coloana: legs1_departureAt_dayofweek
   - Valori lipsă: 7492
   - Nr. valori unice: 7
   - Valori unice (eșantion): [np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0)]

📌 Coloana: legs1_departureAt_hour
   - Valori lipsă: 0
   - Nr. valori unice: 24
   - Valori unice (eșantion): [np.float64(2.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(9.0), np.float64(10.0), np.float64(11.0), np.float64(12.0), np.float64(13.0), np.float64(14.0), np.float64(15.0), np.float64(16.0), np.float64(17.0), np.float64(18.0), np.float64(19.0), np.float64(20.0), np.float64(21.0), np.float64(22.0), np.float64(23.0)]

📌 Coloana: legs1_departureAt_hour_cos
   - Valori lipsă: 0
   - Nr. valori unice: 23
   - Valori unice (eșantion): [np.float64(-1.0), np.float64(-0.9659258262890684), np.float64(-0.9659258262890682), np.float64(-0.8660254037844388), np.float64(-0.8660254037844387), np.float64(-0.7071067811865479), np.float64(-0.7071067811865475), np.float64(-0.5000000000000004), np.float64(-0.4999999999999998), np.float64(-0.2588190451025206), np.float64(-1.8369701987210294e-16), np.float64(-1.8369701987210287e-16), np.float64(6.123233995736766e-17), np.float64(0.2588190451025203), np.float64(0.2588190451025207), np.float64(0.5000000000000001), np.float64(0.7071067811865474), np.float64(0.8660254037844384), np.float64(0.8660254037844387), np.float64(0.965925826289068)]

📌 Coloana: legs1_departureAt_hour_sin
   - Valori lipsă: 0
   - Nr. valori unice: 21
   - Valori unice (eșantion): [np.float64(-1.0), np.float64(-0.9659258262890684), np.float64(-0.8660254037844386), np.float64(-0.8660254037844384), np.float64(-0.7071067811865477), np.float64(-0.7071067811865471), np.float64(-0.5000000000000004), np.float64(-0.4999999999999997), np.float64(-0.2588190451025215), np.float64(-0.2588190451025208), np.float64(0.0), np.float64(1.2246467991473532e-16), np.float64(0.2588190451025207), np.float64(0.258819045102521), np.float64(0.4999999999999999), np.float64(0.7071067811865476), np.float64(0.8660254037844386), np.float64(0.8660254037844387), np.float64(0.9659258262890684), np.float64(1.0)]

📌 Coloana: legs1_departureAt_is_weekend
   - Valori lipsă: 0
   - Nr. valori unice: 2
   - Valori unice (eșantion): [np.int64(0), np.int64(1)]

📌 Coloana: legs1_departureAt_minute
   - Valori lipsă: 0
   - Nr. valori unice: 60
   - Valori unice (eșantion): [np.float64(0.0), np.float64(5.0), np.float64(10.0), np.float64(15.0), np.float64(19.0), np.float64(20.0), np.float64(21.0), np.float64(22.0), np.float64(24.0), np.float64(25.0), np.float64(26.0), np.float64(27.0), np.float64(29.0), np.float64(30.0), np.float64(31.0), np.float64(35.0), np.float64(40.0), np.float64(45.0), np.float64(50.0), np.float64(55.0)]

📌 Coloana: legs1_departureAt_month
   - Valori lipsă: 0
   - Nr. valori unice: 12
   - Valori unice (eșantion): [np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(9.0), np.float64(10.0), np.float64(11.0), np.float64(12.0)]

📌 Coloana: legs1_departureAt_part_of_day
   - Valori lipsă: 0
   - Nr. valori unice: 4
   - Valori unice (eșantion): ['afternoon', 'evening', 'morning', 'night']

📌 Coloana: legs1_departureAt_weekofyear
   - Valori lipsă: 7492
   - Nr. valori unice: 52
   - Valori unice (eșantion): [np.float64(29.0), np.float64(30.0), np.float64(31.0), np.float64(32.0), np.float64(33.0), np.float64(34.0), np.float64(35.0), np.float64(36.0), np.float64(37.0), np.float64(38.0), np.float64(39.0), np.float64(40.0), np.float64(41.0), np.float64(42.0), np.float64(43.0), np.float64(44.0), np.float64(45.0), np.float64(46.0), np.float64(47.0), np.float64(51.0)]

📌 Coloana: legs1_departureAt_year
   - Valori lipsă: 0
   - Nr. valori unice: 2
   - Valori unice (eșantion): [np.float64(2024.0), np.float64(2025.0)]

In [None]:
%%writefile impute_missing_dayofweek_and_weekofyear.py
import pandas as pd
import numpy as np

# === Load CSV files ===
train = pd.read_csv("train.csv", low_memory=False)
test = pd.read_csv("test.csv", low_memory=False)

# === Function to selectively fill dayofweek and weekofyear ===
def fill_missing_dayofweek_and_weekofyear(df, prefix):
    """Fill missing ``<prefix>_dayofweek`` / ``<prefix>_weekofyear`` values.

    The values are recomputed from the ``<prefix>_year``, ``<prefix>_month``
    and ``<prefix>_day`` columns for every row where the date parts are all
    present and at least one of the two target columns is missing. Both
    target columns are overwritten on those rows.

    Args:
        df: DataFrame to update. It is modified in place.
        prefix: Column-name prefix, e.g. ``"legs1_departureAt"``.

    Returns:
        The same ``df`` object. It is returned unchanged when one of the
        year/month/day columns does not exist.
    """
    # Weekday and ISO week depend only on the calendar date, so the hour and
    # minute components are not needed to compute them.
    date_cols = [f"{prefix}_{part}" for part in ("year", "month", "day")]
    if not all(col in df.columns for col in date_cols):
        print(f"⚠️ Required components missing for {prefix}.")
        return df

    dow_col = f"{prefix}_dayofweek"
    woy_col = f"{prefix}_weekofyear"

    # Create absent target columns instead of failing with a KeyError below
    for target in (dow_col, woy_col):
        if target not in df.columns:
            df[target] = np.nan

    # Rows whose date parts are all available
    complete_mask = df[date_cols].notna().all(axis=1)

    # Rows where either target value is missing
    needs_impute = df[dow_col].isna() | df[woy_col].isna()

    mask = complete_mask & needs_impute

    if not mask.any():
        print(f"✅ {prefix}: nothing to impute.")
        return df

    print(f"🔧 {prefix}: imputing {mask.sum()} missing rows")

    # Build a temporary datetime for the selected rows only; impossible dates
    # (e.g. February 31st) become NaT because of errors='coerce'.
    year_col, month_col, day_col = date_cols
    temp_dt = pd.to_datetime(dict(
        year=df.loc[mask, year_col].astype(int),
        month=df.loc[mask, month_col].astype(int),
        day=df.loc[mask, day_col].astype(int),
    ), errors='coerce')

    # Cast to float so the assigned values match the float/NaN target columns
    df.loc[mask, dow_col] = temp_dt.dt.dayofweek.astype(float)
    df.loc[mask, woy_col] = temp_dt.dt.isocalendar().week.astype(float)

    # Make rows that could not be resolved visible instead of failing silently
    unresolved = int(temp_dt.isna().sum())
    if unresolved:
        print(f"⚠️ {prefix}: {unresolved} rows have an invalid date and could not be computed.")

    return df

# === Apply to both datasets and both columns ===
for label, frame in (("train", train), ("test", test)):
    print(f"\n📂 Processing: {label}.csv")
    # Departure columns first, then arrival columns
    for time_prefix in ("legs1_departureAt", "legs1_arrivalAt"):
        frame = fill_missing_dayofweek_and_weekofyear(frame, time_prefix)

    # Overwrite the dataset's CSV file with the result
    frame.to_csv(f"{label}.csv", index=False)


Summary of Imputation (rows filled by impute_missing_dayofweek_and_weekofyear.py):

For train.csv:

- legs1_departureAt_dayofweek & legs1_departureAt_weekofyear: 33,113 rows filled

- legs1_arrivalAt_dayofweek & legs1_arrivalAt_weekofyear: 23,376 rows filled

For test.csv:

- legs1_departureAt_dayofweek & legs1_departureAt_weekofyear: 7,492 rows filled

- legs1_arrivalAt_dayofweek & legs1_arrivalAt_weekofyear: 7,022 rows filled

Note: these four columns are dropped again by remove_columns.py in the next cell.

In [None]:
%%writefile remove_columns.py
import pandas as pd

# List of column names to be removed
columns_to_drop = [
    'legs1_segments0_flightNumber',
    'legs1_departureAt',
    'legs1_arrivalAt',
    'legs1_departureAt_dayofweek',
    'legs1_departureAt_weekofyear',
    'legs1_arrivalAt_dayofweek',
    'legs1_arrivalAt_weekofyear'
]


def _drop_columns_in_file(path, columns):
    """Load the CSV at ``path``, drop any of ``columns`` it contains, and overwrite it."""
    frame = pd.read_csv(path)
    # errors="ignore" skips names that are not present in this file
    frame.drop(columns=columns, errors="ignore").to_csv(path, index=False)


# Handle one file at a time so that only one dataset is held in memory
for csv_path in ('train.csv', 'test.csv'):
    _drop_columns_in_file(csv_path, columns_to_drop)


In [None]:
%%writefile remove_columns_2.py
import pandas as pd

# List of column names to be removed (copied from an earlier report output;
# presumably the columns with a very high share of missing values - verify)
columns_to_drop = [
    'legs1_segments3_cabinClass',
    'legs1_segments3_baggageAllowance_weightMeasurementType',
    'legs1_segments3_arrivalTo_airport_iata',
    'legs1_segments3_baggageAllowance_quantity',
    'legs1_segments3_aircraft_code',
    'legs1_segments3_arrivalTo_airport_city_iata',
    'legs1_segments3_marketingCarrier_code',
    'legs1_segments3_flightNumber',
    'legs1_segments3_seatsAvailable',
    'legs1_segments3_operatingCarrier_code',
    'legs1_segments3_duration',
    'legs1_segments3_departureFrom_airport_iata',
    'legs0_segments3_aircraft_code',
    'legs0_segments3_cabinClass',
    'legs0_segments3_seatsAvailable',
    'legs0_segments3_baggageAllowance_quantity',
    'legs0_segments3_baggageAllowance_weightMeasurementType',
    'legs0_segments3_flightNumber',
    'legs0_segments3_arrivalTo_airport_iata',
    'legs0_segments3_arrivalTo_airport_city_iata',
    'legs0_segments3_operatingCarrier_code',
    'legs0_segments3_marketingCarrier_code',
    'legs0_segments3_duration',
    'legs0_segments3_departureFrom_airport_iata',
    'legs1_segments2_baggageAllowance_quantity',
    'legs1_segments2_baggageAllowance_weightMeasurementType',
    'legs1_segments2_seatsAvailable',
    'legs1_segments2_cabinClass',
    'legs1_segments2_aircraft_code',
    'legs1_segments2_operatingCarrier_code',
    'legs1_segments2_arrivalTo_airport_iata',
    'legs1_segments2_arrivalTo_airport_city_iata',
    'legs1_segments2_duration',
    'legs1_segments2_departureFrom_airport_iata',
    'legs1_segments2_marketingCarrier_code',
    'legs1_segments2_flightNumber',
    'legs0_segments2_baggageAllowance_quantity',
    'legs0_segments2_baggageAllowance_weightMeasurementType',
    'legs0_segments2_seatsAvailable',
    'legs0_segments2_cabinClass',
    'legs0_segments2_aircraft_code',
    'legs0_segments2_marketingCarrier_code',
    'legs0_segments2_duration',
    'legs0_segments2_operatingCarrier_code',
    'legs0_segments2_arrivalTo_airport_iata',
    'legs0_segments2_flightNumber',
    'legs0_segments2_arrivalTo_airport_city_iata',
    'legs0_segments2_departureFrom_airport_iata',
    'miniRules1_percentage',
    'miniRules0_percentage',
    'legs1_segments1_seatsAvailable',
    'legs1_segments1_baggageAllowance_weightMeasurementType',
    'legs1_segments1_baggageAllowance_quantity',
    'legs1_segments1_cabinClass',
    'legs1_segments1_departureFrom_airport_iata',
    'legs1_segments1_flightNumber',
    'legs1_segments1_marketingCarrier_code',
    'legs1_segments1_aircraft_code',
    'legs1_segments1_duration',
    'legs1_segments1_operatingCarrier_code',
    'legs1_segments1_arrivalTo_airport_iata',
    'legs1_segments1_arrivalTo_airport_city_iata',
    'legs0_segments1_seatsAvailable',
    'legs0_segments1_baggageAllowance_weightMeasurementType',
    'legs0_segments1_baggageAllowance_quantity',
    'legs0_segments1_cabinClass',
    'legs0_segments1_aircraft_code',
    'legs0_segments1_arrivalTo_airport_city_iata',
    'legs0_segments1_departureFrom_airport_iata',
    'legs0_segments1_arrivalTo_airport_iata',
    'legs0_segments1_marketingCarrier_code',
    'legs0_segments1_flightNumber',
    'legs0_segments1_duration',
    'legs0_segments1_operatingCarrier_code'
]

def _drop_columns_in_file(path, columns):
    """Load the CSV at ``path``, drop any of ``columns`` it contains, and overwrite it."""
    frame = pd.read_csv(path)
    # errors="ignore" skips names that are not present in this file
    frame.drop(columns=columns, errors="ignore").to_csv(path, index=False)


# Handle one file at a time so that only one dataset is held in memory
for csv_path in ('train.csv', 'test.csv'):
    _drop_columns_in_file(csv_path, columns_to_drop)


In [None]:
%%writefile analyze_data.py
import pandas as pd
import numpy as np

# Load files
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Function to analyze a DataFrame
def analyze_dataframe(df, name="DataFrame"):
    """Print an exploratory report about ``df`` to standard output.

    The report has four parts: the shape plus ``df.info()``, ``describe()``
    statistics, a value summary for every object/category column, and a
    per-column overview (dtype, missing values, unique values, examples and,
    for numeric columns, min/max/mean/std).

    Args:
        df: The pandas DataFrame to describe.
        name: Label shown (upper-cased) in the report header.

    Returns:
        None. Everything is printed.
    """
    print(f"\n===== {name.upper()} INFO =====")
    print(f"Number of rows: {df.shape[0]}")
    print(f"Number of columns: {df.shape[1]}")
    print("\n--- Data types and missing values ---")
    # info() writes its report itself and returns None; wrapping it in
    # print() would add a stray "None" line to the output.
    df.info()

    print("\n--- Descriptive Statistics (Numerical) ---")
    print(df.describe().T)

    print("\n--- Descriptive Statistics (Categorical) ---")
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    if cat_cols.empty:
        print("No categorical columns.")
    else:
        for col in cat_cols:
            print(f"\nColumn: {col}")
            print(f"Unique values ({df[col].nunique()}): {df[col].unique()[:10]}")
            print(f"Frequency:\n{df[col].value_counts(dropna=False).head(5)}")

    print("\n--- Distributions, Min, Max, and Unique values for all columns ---")
    for col in df.columns:
        series = df[col]
        # Compute the full-column scans once; each is a pass over every row.
        missing = series.isnull()
        n_unique = series.nunique()

        print(f"\nColumn: {col}")
        print(f"  Type: {series.dtype}")
        print(f"  Missing values: {missing.sum()} ({(missing.mean()*100):.2f}%)")
        print(f"  Unique values: {n_unique}")
        if n_unique < 20:
            print(f"  Examples: {series.unique()}")
        else:
            print(f"  Examples: {series.dropna().unique()[:5]}")
        # Use the pandas dtype helpers: np.issubdtype raises TypeError on
        # extension dtypes such as 'category'. Booleans are excluded so they
        # keep being reported without statistics.
        is_number = (
            pd.api.types.is_numeric_dtype(series.dtype)
            and not pd.api.types.is_bool_dtype(series.dtype)
        )
        if is_number:
            print(f"  Min: {series.min()}, Max: {series.max()}, Mean: {series.mean():.2f}, Std: {series.std():.2f}")

# Produce the report for each dataset: train.csv first, then test.csv
for frame, label in ((train, "train.csv"), (test, "test.csv")):
    analyze_dataframe(frame, name=label)


In [None]:
===== TRAIN.CSV INFO =====
Număr de rânduri: 18145372
Număr de coloane: 73

--- Tipuri de date și valori lipsă ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18145372 entries, 0 to 18145371
Data columns (total 73 columns):
 #   Column                                                  Dtype
---  ------                                                  -----
 0   Id                                                      int64
 1   bySelf                                                  bool
 2   companyID                                               int64
 3   corporateTariffCode                                     float64
 4   frequentFlyer                                           object
 5   nationality                                             int64
 6   isAccess3D                                              bool
 7   isVip                                                   bool
 8   legs0_arrivalAt                                         object
 9   legs0_departureAt                                       object
 10  legs0_duration                                          object
 11  legs0_segments0_aircraft_code                           object
 12  legs0_segments0_arrivalTo_airport_city_iata             object
 13  legs0_segments0_arrivalTo_airport_iata                  object
 14  legs0_segments0_baggageAllowance_quantity               float64
 15  legs0_segments0_baggageAllowance_weightMeasurementType  float64
 16  legs0_segments0_cabinClass                              float64
 17  legs0_segments0_departureFrom_airport_iata              object
 18  legs0_segments0_duration                                object
 19  legs0_segments0_flightNumber                            int64
 20  legs0_segments0_marketingCarrier_code                   object
 21  legs0_segments0_operatingCarrier_code                   object
 22  legs0_segments0_seatsAvailable                          float64
 23  legs1_duration                                          float64
 24  legs1_segments0_aircraft_code                           object
 25  legs1_segments0_arrivalTo_airport_city_iata             object
 26  legs1_segments0_arrivalTo_airport_iata                  object
 27  legs1_segments0_baggageAllowance_quantity               float64
 28  legs1_segments0_baggageAllowance_weightMeasurementType  float64
 29  legs1_segments0_cabinClass                              float64
 30  legs1_segments0_departureFrom_airport_iata              object
 31  legs1_segments0_duration                                float64
 32  legs1_segments0_marketingCarrier_code                   object
 33  legs1_segments0_operatingCarrier_code                   object
 34  legs1_segments0_seatsAvailable                          float64
 35  miniRules0_monetaryAmount                               float64
 36  miniRules0_statusInfos                                  float64
 37  miniRules1_monetaryAmount                               float64
 38  miniRules1_statusInfos                                  float64
 39  pricingInfo_isAccessTP                                  float64
 40  pricingInfo_passengerCount                              int64
 41  profileId                                               int64
 42  ranker_id                                               object
 43  requestDate                                             object
 44  searchRoute                                             object
 45  sex                                                     bool
 46  taxes                                                   float64
 47  totalPrice                                              float64
 48  selected                                                int64
 49  __index_level_0__                                       int64
 50  legs0_segments0_baggageAllowance_missing_initially      int64
 51  legs0_segments0_seatsAvailable_missing_initially        int64
 52  miniRules0_statusInfos_was_missing                      int64
 53  miniRules1_statusInfos_was_missing                      int64
 54  legs1_segments0_aircraft_code_was_missing               int64
 55  legs1_departureAt_hour                                  float64
 56  legs1_departureAt_minute                                float64
 57  legs1_departureAt_is_weekend                            int64
 58  legs1_departureAt_day                                   float64
 59  legs1_departureAt_month                                 float64
 60  legs1_departureAt_year                                  float64
 61  legs1_departureAt_part_of_day                           object
 62  legs1_departureAt_hour_sin                              float64
 63  legs1_departureAt_hour_cos                              float64
 64  legs1_arrivalAt_hour                                    float64
 65  legs1_arrivalAt_minute                                  float64
 66  legs1_arrivalAt_is_weekend                              int64
 67  legs1_arrivalAt_day                                     float64
 68  legs1_arrivalAt_month                                   float64
 69  legs1_arrivalAt_year                                    float64
 70  legs1_arrivalAt_part_of_day                             object
 71  legs1_arrivalAt_hour_sin                                float64
 72  legs1_arrivalAt_hour_cos                                float64
dtypes: bool(4), float64(32), int64(15), object(22)
memory usage: 9.4+ GB
None

--- Descriptive Statistics (Numerice) ---
                                                         count  ...         max
Id                                                  18145372.0  ...  18146431.0
companyID                                           18145372.0  ...     63482.0
corporateTariffCode                                 18145372.0  ...       181.0
nationality                                         18145372.0  ...        48.0
legs0_segments0_baggageAllowance_quantity           18145372.0  ...        60.0
legs0_segments0_baggageAllowance_weightMeasurem...  18145372.0  ...         1.0
legs0_segments0_cabinClass                          18145372.0  ...         4.0
legs0_segments0_flightNumber                        18145372.0  ...      9996.0
legs0_segments0_seatsAvailable                      18145372.0  ...        10.0
legs1_duration                                      18145372.0  ...      1646.0
legs1_segments0_baggageAllowance_quantity           18145372.0  ...        60.0
legs1_segments0_baggageAllowance_weightMeasurem...  18145372.0  ...         1.0
legs1_segments0_cabinClass                          18145372.0  ...         4.0
legs1_segments0_duration                            18145372.0  ...      1175.0
legs1_segments0_seatsAvailable                      18145372.0  ...         9.0
miniRules0_monetaryAmount                           18145372.0  ...    502237.0
miniRules0_statusInfos                              18145372.0  ...         1.0
miniRules1_monetaryAmount                           18145372.0  ...   7161273.0
miniRules1_statusInfos                              18145372.0  ...         1.0
pricingInfo_isAccessTP                              18145372.0  ...         1.0
pricingInfo_passengerCount                          18145372.0  ...         1.0
profileId                                           18145372.0  ...   3604410.0
taxes                                               18145372.0  ...    897921.0
totalPrice                                          18145372.0  ...   9944355.0
selected                                            18145372.0  ...         1.0
__index_level_0__                                   18145372.0  ...  18146431.0
legs0_segments0_baggageAllowance_missing_initially  18145372.0  ...         1.0
legs0_segments0_seatsAvailable_missing_initially    18145372.0  ...         1.0
miniRules0_statusInfos_was_missing                  18145372.0  ...         1.0
miniRules1_statusInfos_was_missing                  18145372.0  ...         1.0
legs1_segments0_aircraft_code_was_missing           18145372.0  ...         1.0
legs1_departureAt_hour                              18145372.0  ...        23.0
legs1_departureAt_minute                            18145372.0  ...        59.0
legs1_departureAt_is_weekend                        18145372.0  ...         1.0
legs1_departureAt_day                               18145372.0  ...        31.0
legs1_departureAt_month                             18145372.0  ...        12.0
legs1_departureAt_year                              18145372.0  ...      2025.0
legs1_departureAt_hour_sin                          18145372.0  ...         1.0
legs1_departureAt_hour_cos                          18145372.0  ...         1.0
legs1_arrivalAt_hour                                18145372.0  ...        23.0
legs1_arrivalAt_minute                              18145372.0  ...        59.0
legs1_arrivalAt_is_weekend                          18145372.0  ...         1.0
legs1_arrivalAt_day                                 18145372.0  ...        31.0
legs1_arrivalAt_month                               18145372.0  ...        12.0
legs1_arrivalAt_year                                18145372.0  ...      2025.0
legs1_arrivalAt_hour_sin                            18145372.0  ...         1.0
legs1_arrivalAt_hour_cos                            18145372.0  ...         1.0

[47 rows x 8 columns]

--- Descriptive Statistics (Categorice) ---

Coloana: frequentFlyer
Valori unice (371): ['S7/SU/UT' 'S7/SU/I8' 'SU/S7/UT/N4' 'S7/UT/SU' 'SU/UT/S7' 'SU' 'S7'
 'UT/SU/S7' 'SU/S7' 'S7/SU']
Frecvență:
frequentFlyer
SU       11203125
SU/S7     2317383
S7/SU      951785
S7         721072
SU/TK      335349
Name: count, dtype: int64

Coloana: legs0_arrivalAt
Valori unice (60143): ['2024-06-15T16:20:00' '2024-06-15T14:50:00' '2024-06-15T17:20:00'
 '2024-06-15T21:35:00' '2024-06-15T09:15:00' '2024-06-16T06:50:00'
 '2024-05-30T18:30:00' '2024-05-30T12:55:00' '2024-06-04T15:50:00'
 '2024-06-04T11:30:00']
Frecvență:
legs0_arrivalAt
2024-11-06T17:10:00    9708
2024-11-06T15:45:00    9605
2024-08-21T02:00:00    7346
2024-09-18T09:30:00    7128
2024-08-14T02:00:00    6969
Name: count, dtype: int64

Coloana: legs0_departureAt
Valori unice (56619): ['2024-06-15T15:40:00' '2024-06-15T09:25:00' '2024-06-15T11:25:00'
 '2024-06-15T09:50:00' '2024-05-30T16:00:00' '2024-05-30T10:40:00'
 '2024-06-04T11:25:00' '2024-06-04T09:20:00' '2024-06-04T06:50:00'
 '2024-06-04T11:05:00']
Frecvență:
legs0_departureAt
2024-11-05T08:05:00    11663
2024-11-05T08:55:00     9022
2024-10-28T07:00:00     8081
2024-08-21T00:30:00     7461
2024-10-01T06:20:00     7196
Name: count, dtype: int64

Coloana: legs0_duration
Valori unice (1542): ['02:40:00' '07:25:00' '09:55:00' '12:10:00' '01:25:00' '21:25:00'
 '02:30:00' '02:15:00' '09:25:00' '07:10:00']
Frecvență:
legs0_duration
01:30:00    5286759
01:40:00    1002231
01:25:00     892348
01:35:00     816454
01:45:00     443707
Name: count, dtype: int64

Coloana: legs0_segments0_aircraft_code
Valori unice (108): ['YK2' 'E70' 'CRJ' 'AN4' '32N' '32A' '319' 'DH4' '321' '73H']
Frecvență:
legs0_segments0_aircraft_code
SU9    7514233
32A    2093755
32B    1666842
73H    1596430
320    1280482
Name: count, dtype: int64

Coloana: legs0_segments0_arrivalTo_airport_city_iata
Valori unice (499): ['KJA' 'OVB' 'NJC' 'IKT' 'AER' 'MOW' 'UUS' 'BQS' 'KHV' 'ODO']
Frecvență:
legs0_segments0_arrivalTo_airport_city_iata
MOW    6521261
LED    4767952
AER    1128843
SVX     893210
IST     686558
Name: count, dtype: int64

Coloana: legs0_segments0_arrivalTo_airport_iata
Valori unice (533): ['KJA' 'OVB' 'NJC' 'IKT' 'AER' 'DME' 'UUS' 'BQS' 'KHV' 'SVO']
Frecvență:
legs0_segments0_arrivalTo_airport_iata
LED    4767952
SVO    4321862
VKO    1398528
AER    1128843
SVX     893210
Name: count, dtype: int64

Coloana: legs0_segments0_departureFrom_airport_iata
Valori unice (393): ['TLK' 'TOF' 'ODO' 'IKT' 'KHV' 'GDX' 'KJA' 'OVB' 'CEK' 'HTA']
Frecvență:
legs0_segments0_departureFrom_airport_iata
SVO    5468076
LED    3803234
VKO    2750619
DME    1256576
AER     653059
Name: count, dtype: int64

Coloana: legs0_segments0_duration
Valori unice (296): ['02:40:00' '02:50:00' '00:50:00' '01:25:00' '02:30:00' '02:15:00'
 '07:10:00' '02:35:00' '06:10:00' '06:15:00']
Frecvență:
legs0_segments0_duration
01:30:00    5483181
01:40:00    1091580
01:25:00     981998
01:35:00     894720
01:45:00     499612
Name: count, dtype: int64

Coloana: legs0_segments0_marketingCarrier_code
Valori unice (164): ['KV' 'S7' '7R' 'IO' 'ИК' 'SU' 'HZ' 'UT' 'DP' '5N']
Frecvență:
legs0_segments0_marketingCarrier_code
SU    12465824
S7     1286981
U6     1160466
TK      734996
DP      637209
Name: count, dtype: int64

Coloana: legs0_segments0_operatingCarrier_code
Valori unice (220): ['KV' 'S7' '7R' 'IO' 'ИК' 'HZ' 'FV' 'UT' 'SU' 'DP']
Frecvență:
legs0_segments0_operatingCarrier_code
FV    7768836
SU    4660441
S7    1285536
U6    1160464
TK     731413
Name: count, dtype: int64

Coloana: legs1_segments0_aircraft_code
Valori unice (93): ['YK2' 'E70' '73H' 'SU9' '321' '32A' '32N' '319' '738' '773']
Frecvență:
legs1_segments0_aircraft_code
SU9    7274698
32A    2138416
73H    1965832
32B    1551218
320    1373672
Name: count, dtype: int64

Coloana: legs1_segments0_arrivalTo_airport_city_iata
Valori unice (312): ['TLK' 'OVB' 'ABA' 'NJC' 'KZN' 'MOW' 'DEL' 'IKT' 'FRU' 'VVO']
Frecvență:
legs1_segments0_arrivalTo_airport_city_iata
MOW    9133651
LED    3541867
SVX     820109
OVB     669045
AER     570004
Name: count, dtype: int64

Coloana: legs1_segments0_arrivalTo_airport_iata
Valori unice (338): ['TLK' 'OVB' 'ABA' 'NJC' 'KZN' 'LED' 'AMD' 'IKT' 'DME' 'KHV']
Frecvență:
legs1_segments0_arrivalTo_airport_iata
SVO    5856081
LED    3425824
VKO    1949143
DME    1104426
SVX     846886
Name: count, dtype: int64

Coloana: legs1_segments0_departureFrom_airport_iata
Valori unice (343): ['KJA' 'OVB' 'SVO' 'LED' 'HTA' 'SVX' 'IKT' 'UUD' 'DME' 'AER']
Frecvență:
legs1_segments0_departureFrom_airport_iata
LED    4828009
SVO    3652051
VKO    1332010
AER    1092525
DME     816334
Name: count, dtype: int64

Coloana: legs1_segments0_marketingCarrier_code
Valori unice (144): ['KV' 'S7' 'DP' 'SU' 'N4' '5N' 'R3' 'UT' '6R' 'U6']
Frecvență:
legs1_segments0_marketingCarrier_code
SU    11971226
S7     1288880
U6     1130770
DP      845103
TK      718166
Name: count, dtype: int64

Coloana: legs1_segments0_operatingCarrier_code
Valori unice (186): ['KV' 'S7' 'EO' 'N4' 'U6' 'FV' 'HZ' 'DP' 'SU' 'UT']
Frecvență:
legs1_segments0_operatingCarrier_code
FV    6642739
SU    5384702
S7    1281893
U6    1131912
DP     833079
Name: count, dtype: int64

Coloana: ranker_id
Valori unice (105539): ['98ce0dabf6964640b63079fbafd42cbe' '905909166d934c618ad55ab7f5cea598'
 '7ec8ce3fdebd4c9699b03582ebd60d5d' 'd2906d4d6a4a4b8ea7406a96080c7a44'
 'e04b757602824a4dbe227f1e67dbdbd3' '6dabbda422034b089089b2b719004191'
 'e0f9319a8b3048cdb1c974395e599e8d' 'e109b50aca4a43908dd146c55733e354'
 '7fe752f09aad420b8dfe753dfb713aba' 'a2343d6f691a42a68316eb7b7d08bec2']
Frecvență:
ranker_id
f9833fe7d58441c8a8feed74fec32a2c    8236
796854b386874b40b4a8843f70a2b0f7    7841
4d0bee7eede2454187405709ea187702    7793
db85d3d24c164a798b518be8fbce1f1c    7678
92e54e4a04f94eec9c30a59cb729ed5a    7676
Name: count, dtype: int64

Coloana: requestDate
Valori unice (104428): ['2024-05-17T03:03:08.000000000' '2024-05-17T03:09:59.000000000'
 '2024-05-17T03:10:04.000000000' '2024-05-17T03:31:00.000000000'
 '2024-05-17T03:40:53.000000000' '2024-05-17T03:51:28.000000000'
 '2024-05-17T04:02:26.000000000' '2024-05-17T04:06:51.000000000'
 '2024-05-17T04:08:12.000000000' '2024-05-17T04:10:23.000000000']
Frecvență:
requestDate
2024-07-17T15:29:28.000000000    11673
2024-09-08T09:26:02.000000000     8236
2024-09-03T17:40:10.000000000     7841
2024-09-03T18:10:14.000000000     7793
2024-09-12T12:53:40.000000000     7678
Name: count, dtype: int64

Coloana: searchRoute
Valori unice (5769): ['TLKKJA/KJATLK' 'TOFNJC' 'ODOIKT' 'IKTAER' 'KHVUUS' 'KHVBQS' 'GDXOVB'
 'KJAIKT' 'IKTODO' 'OVBGDX']
Frecvență:
searchRoute
MOWLED/LEDMOW    3250607
LEDMOW/MOWLED    2031821
MOWLED           1161009
LEDMOW           1126614
MOWAER/AERMOW     811476
Name: count, dtype: int64

Coloana: legs1_departureAt_part_of_day
Valori unice (4): ['morning' 'evening' 'afternoon' 'night']
Frecvență:
legs1_departureAt_part_of_day
evening      6657728
afternoon    5421663
morning      4940995
night        1124986
Name: count, dtype: int64

Coloana: legs1_arrivalAt_part_of_day
Valori unice (4): ['afternoon' 'morning' 'night' 'evening']
Frecvență:
legs1_arrivalAt_part_of_day
evening      6503826
afternoon    5188524
morning      3936293
night        2516729
Name: count, dtype: int64

--- Distribuții, Min, Max și Valori unice pentru toate coloanele ---

Coloana: Id
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 18145372
  Exemple: [0 1 2 3 4]
  Min: 0, Max: 18146431, Mean: 9072685.54, Std: 5238117.92

Coloana: bySelf
  Tip: bool
  Valori lipsă: 0 (0.00%)
  Valori unice: 1
  Exemple: [ True]

Coloana: companyID
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 641
  Exemple: [57323 53407 59096 62836 25312]
  Min: 16636, Max: 63482, Mean: 47293.87, Std: 12119.86

Coloana: corporateTariffCode
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 170
  Exemple: [ 42. 123. 161. 101. 130.]
  Min: 0.0, Max: 181.0, Mean: 107.23, Std: 47.04

Coloana: frequentFlyer
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 371
  Exemple: ['S7/SU/UT' 'S7/SU/I8' 'SU/S7/UT/N4' 'S7/UT/SU' 'SU/UT/S7']

Coloana: nationality
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 48
  Exemple: [36  6 21 46  8]
  Min: 0, Max: 48, Mean: 35.70, Std: 2.92

Coloana: isAccess3D
  Tip: bool
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [False  True]

Coloana: isVip
  Tip: bool
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [False  True]

Coloana: legs0_arrivalAt
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 60143
  Exemple: ['2024-06-15T16:20:00' '2024-06-15T14:50:00' '2024-06-15T17:20:00'
 '2024-06-15T21:35:00' '2024-06-15T09:15:00']

Coloana: legs0_departureAt
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 56619
  Exemple: ['2024-06-15T15:40:00' '2024-06-15T09:25:00' '2024-06-15T11:25:00'
 '2024-06-15T09:50:00' '2024-05-30T16:00:00']

Coloana: legs0_duration
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 1542
  Exemple: ['02:40:00' '07:25:00' '09:55:00' '12:10:00' '01:25:00']

Coloana: legs0_segments0_aircraft_code
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 108
  Exemple: ['YK2' 'E70' 'CRJ' 'AN4' '32N']

Coloana: legs0_segments0_arrivalTo_airport_city_iata
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 499
  Exemple: ['KJA' 'OVB' 'NJC' 'IKT' 'AER']

Coloana: legs0_segments0_arrivalTo_airport_iata
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 533
  Exemple: ['KJA' 'OVB' 'NJC' 'IKT' 'AER']

Coloana: legs0_segments0_baggageAllowance_quantity
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 18
  Exemple: [ 1.  0. 20.  2. 25. 23. 30. 40. 35. 50. 10. 60. 15.  3. 33. 45. 46. 32.]
  Min: 0.0, Max: 60.0, Mean: 2.63, Std: 7.25

Coloana: legs0_segments0_baggageAllowance_weightMeasurementType
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [0. 1.]
  Min: 0.0, Max: 1.0, Mean: 0.06, Std: 0.24

Coloana: legs0_segments0_cabinClass
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 4
  Exemple: [1. 2. 4. 3.]
  Min: 1.0, Max: 4.0, Mean: 1.21, Std: 0.49

Coloana: legs0_segments0_departureFrom_airport_iata
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 393
  Exemple: ['TLK' 'TOF' 'ODO' 'IKT' 'KHV']

Coloana: legs0_segments0_duration
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 296
  Exemple: ['02:40:00' '02:50:00' '00:50:00' '01:25:00' '02:30:00']

Coloana: legs0_segments0_flightNumber
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 7531
  Exemple: [ 216 5358 5322  816  142]
  Min: 1, Max: 9996, Mean: 3627.11, Std: 2724.58

Coloana: legs0_segments0_marketingCarrier_code
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 164
  Exemple: ['KV' 'S7' '7R' 'IO' 'ИК']

Coloana: legs0_segments0_operatingCarrier_code
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 220
  Exemple: ['KV' 'S7' '7R' 'IO' 'ИК']

Coloana: legs0_segments0_seatsAvailable
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 10
  Exemple: [ 9.  4.  7.  6.  2.  3.  1.  5.  8. 10.]
  Min: 1.0, Max: 10.0, Mean: 5.15, Std: 3.11

Coloana: legs1_duration
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 1478
  Exemple: [155. 505. 655. 755.  72.]
  Min: 1.0, Max: 1646.0, Mean: 255.47, Std: 283.73

Coloana: legs1_segments0_aircraft_code
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 93
  Exemple: ['YK2' 'E70' '73H' 'SU9' '321']

Coloana: legs1_segments0_arrivalTo_airport_city_iata
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 312
  Exemple: ['TLK' 'OVB' 'ABA' 'NJC' 'KZN']

Coloana: legs1_segments0_arrivalTo_airport_iata
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 338
  Exemple: ['TLK' 'OVB' 'ABA' 'NJC' 'KZN']

Coloana: legs1_segments0_baggageAllowance_quantity
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 17
  Exemple: [ 1.  0. 20.  2. 10. 23. 30. 40. 35. 50. 25. 45. 15.  3. 60. 33. 32.]
  Min: 0.0, Max: 60.0, Mean: 2.60, Std: 7.19

Coloana: legs1_segments0_baggageAllowance_weightMeasurementType
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [0. 1.]
  Min: 0.0, Max: 1.0, Mean: 0.06, Std: 0.23

Coloana: legs1_segments0_cabinClass
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 4
  Exemple: [1. 2. 4. 3.]
  Min: 1.0, Max: 4.0, Mean: 1.19, Std: 0.46

Coloana: legs1_segments0_departureFrom_airport_iata
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 343
  Exemple: ['KJA' 'OVB' 'SVO' 'LED' 'HTA']

Coloana: legs1_segments0_duration
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 754
  Exemple: [155.  85.  69.  96. 117.]
  Min: 3.0, Max: 1175.0, Mean: 148.23, Std: 82.29

Coloana: legs1_segments0_marketingCarrier_code
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 144
  Exemple: ['KV' 'S7' 'DP' 'SU' 'N4']

Coloana: legs1_segments0_operatingCarrier_code
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 186
  Exemple: ['KV' 'S7' 'EO' 'N4' 'U6']

Coloana: legs1_segments0_seatsAvailable
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 9
  Exemple: [9. 2. 1. 4. 5. 3. 8. 6. 7.]
  Min: 1.0, Max: 9.0, Mean: 5.46, Std: 3.16

Coloana: miniRules0_monetaryAmount
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 640652
  Exemple: [3944.20117188 2300.            0.         4000.         2122.98583984]
  Min: 0.0, Max: 502237.0, Mean: 2535.51, Std: 3340.50

Coloana: miniRules0_statusInfos
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [1. 0.]
  Min: 0.0, Max: 1.0, Mean: 0.98, Std: 0.15

Coloana: miniRules1_monetaryAmount
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 527962
  Exemple: [ 686.99304199 3500.            0.         1377.74414062 1215.37109375]
  Min: 0.0, Max: 7161273.0, Mean: 1344.94, Std: 5733.92

Coloana: miniRules1_statusInfos
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [0. 1.]
  Min: 0.0, Max: 1.0, Mean: 0.57, Std: 0.50

Coloana: pricingInfo_isAccessTP
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [1. 0.]
  Min: 0.0, Max: 1.0, Mean: 0.51, Std: 0.50

Coloana: pricingInfo_passengerCount
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 1
  Exemple: [1]
  Min: 1, Max: 1, Mean: 1.00, Std: 0.00

Coloana: profileId
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 32922
  Exemple: [2087645 2087904 2447853 2384252 3382768]
  Min: 813, Max: 3604410, Mean: 2494203.03, Std: 950391.40

Coloana: ranker_id
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 105539
  Exemple: ['98ce0dabf6964640b63079fbafd42cbe' '905909166d934c618ad55ab7f5cea598'
 '7ec8ce3fdebd4c9699b03582ebd60d5d' 'd2906d4d6a4a4b8ea7406a96080c7a44'
 'e04b757602824a4dbe227f1e67dbdbd3']

Coloana: requestDate
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 104428
  Exemple: ['2024-05-17T03:03:08.000000000' '2024-05-17T03:09:59.000000000'
 '2024-05-17T03:10:04.000000000' '2024-05-17T03:31:00.000000000'
 '2024-05-17T03:40:53.000000000']

Coloana: searchRoute
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 5769
  Exemple: ['TLKKJA/KJATLK' 'TOFNJC' 'ODOIKT' 'IKTAER' 'KHVUUS']

Coloana: sex
  Tip: bool
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [ True False]

Coloana: taxes
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 44448
  Exemple: [ 370. 2240.  444.  870. 1185.]
  Min: 0.0, Max: 897921.0, Mean: 4284.70, Std: 11839.75

Coloana: totalPrice
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 230704
  Exemple: [16884. 51125. 53695. 81880. 86070.]
  Min: 770.0, Max: 9944355.0, Mean: 46314.44, Std: 75068.08

Coloana: selected
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [1 0]
  Min: 0, Max: 1, Mean: 0.01, Std: 0.08

Coloana: __index_level_0__
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 18145372
  Exemple: [0 1 2 3 4]
  Min: 0, Max: 18146431, Mean: 9072685.54, Std: 5238117.92

Coloana: legs0_segments0_baggageAllowance_missing_initially
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [1 0]
  Min: 0, Max: 1, Mean: 1.00, Std: 0.01

Coloana: legs0_segments0_seatsAvailable_missing_initially
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [1 0]
  Min: 0, Max: 1, Mean: 1.00, Std: 0.07

Coloana: miniRules0_statusInfos_was_missing
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [0 1]
  Min: 0, Max: 1, Mean: 0.92, Std: 0.27

Coloana: miniRules1_statusInfos_was_missing
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [0 1]
  Min: 0, Max: 1, Mean: 0.92, Std: 0.28

Coloana: legs1_segments0_aircraft_code_was_missing
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [1 0]
  Min: 0, Max: 1, Mean: 0.76, Std: 0.43

Coloana: legs1_departureAt_hour
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 24
  Exemple: [ 9. 22. 19. 17. 14.]
  Min: 0.0, Max: 23.0, Mean: 14.43, Std: 6.02

Coloana: legs1_departureAt_minute
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 60
  Exemple: [45.  5. 35. 55. 29.]
  Min: 0.0, Max: 59.0, Mean: 25.00, Std: 15.74

Coloana: legs1_departureAt_is_weekend
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [0 1]
  Min: 0, Max: 1, Mean: 0.22, Std: 0.41

Coloana: legs1_departureAt_day
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 31
  Exemple: [ 9.  3. 11. 13. 24.]
  Min: 1.0, Max: 31.0, Mean: 16.02, Std: 8.83

Coloana: legs1_departureAt_month
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 12
  Exemple: [ 7.  9. 10.  8.  5.  6.  4. 11.  3. 12.  2.  1.]
  Min: 1.0, Max: 12.0, Mean: 8.71, Std: 1.47

Coloana: legs1_departureAt_year
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [2024. 2025.]
  Min: 2024.0, Max: 2025.0, Mean: 2024.00, Std: 0.06

Coloana: legs1_departureAt_part_of_day
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 4
  Exemple: ['morning' 'evening' 'afternoon' 'night']

Coloana: legs1_departureAt_hour_sin
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 21
  Exemple: [ 0.70710678 -0.5        -0.96592583 -0.5         0.96592583]
  Min: -1.0, Max: 1.0, Mean: -0.20, Std: 0.68

Coloana: legs1_departureAt_hour_cos
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 22
  Exemple: [-0.70710678  0.8660254   0.25881905 -0.25881905 -0.8660254 ]
  Min: -1.0, Max: 1.0, Mean: -0.10, Std: 0.69

Coloana: legs1_arrivalAt_hour
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 24
  Exemple: [14.  8. 15. 12.  7.]
  Min: 0.0, Max: 23.0, Mean: 13.75, Std: 6.79

Coloana: legs1_arrivalAt_minute
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 60
  Exemple: [20. 30. 37. 31. 35.]
  Min: 0.0, Max: 59.0, Mean: 26.61, Std: 16.12

Coloana: legs1_arrivalAt_is_weekend
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [0 1]
  Min: 0, Max: 1, Mean: 0.25, Std: 0.44

Coloana: legs1_arrivalAt_day
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 31
  Exemple: [ 9. 10. 18. 16. 11.]
  Min: 1.0, Max: 31.0, Mean: 15.95, Std: 8.81

Coloana: legs1_arrivalAt_month
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 12
  Exemple: [ 7. 10.  8.  9.  5.  6.  4. 11.  3.  2. 12.  1.]
  Min: 1.0, Max: 12.0, Mean: 8.71, Std: 1.47

Coloana: legs1_arrivalAt_year
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [2024. 2025.]
  Min: 2024.0, Max: 2025.0, Mean: 2024.00, Std: 0.06

Coloana: legs1_arrivalAt_part_of_day
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 4
  Exemple: ['afternoon' 'morning' 'night' 'evening']

Coloana: legs1_arrivalAt_hour_sin
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 21
  Exemple: [-5.00000000e-01  8.66025404e-01 -7.07106781e-01  1.22464680e-16
  9.65925826e-01]
  Min: -1.0, Max: 1.0, Mean: -0.20, Std: 0.66

Coloana: legs1_arrivalAt_hour_cos
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 22
  Exemple: [-0.8660254  -0.5        -0.70710678 -1.         -0.25881905]
  Min: -1.0, Max: 1.0, Mean: -0.01, Std: 0.73

===== TEST.CSV INFO =====
Număr de rânduri: 6897776
Număr de coloane: 71

--- Tipuri de date și valori lipsă ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6897776 entries, 0 to 6897775
Data columns (total 71 columns):
 #   Column                                                  Dtype
---  ------                                                  -----
 0   Id                                                      int64
 1   bySelf                                                  bool
 2   companyID                                               int64
 3   corporateTariffCode                                     float64
 4   frequentFlyer                                           object
 5   nationality                                             int64
 6   isAccess3D                                              bool
 7   isVip                                                   bool
 8   legs0_arrivalAt                                         object
 9   legs0_departureAt                                       object
 10  legs0_duration                                          object
 11  legs0_segments0_aircraft_code                           object
 12  legs0_segments0_arrivalTo_airport_city_iata             object
 13  legs0_segments0_arrivalTo_airport_iata                  object
 14  legs0_segments0_baggageAllowance_quantity               float64
 15  legs0_segments0_baggageAllowance_weightMeasurementType  float64
 16  legs0_segments0_cabinClass                              float64
 17  legs0_segments0_departureFrom_airport_iata              object
 18  legs0_segments0_duration                                object
 19  legs0_segments0_flightNumber                            int64
 20  legs0_segments0_marketingCarrier_code                   object
 21  legs0_segments0_operatingCarrier_code                   object
 22  legs0_segments0_seatsAvailable                          float64
 23  legs1_duration                                          float64
 24  legs1_segments0_aircraft_code                           object
 25  legs1_segments0_arrivalTo_airport_city_iata             object
 26  legs1_segments0_arrivalTo_airport_iata                  object
 27  legs1_segments0_baggageAllowance_quantity               float64
 28  legs1_segments0_baggageAllowance_weightMeasurementType  float64
 29  legs1_segments0_cabinClass                              float64
 30  legs1_segments0_departureFrom_airport_iata              object
 31  legs1_segments0_duration                                float64
 32  legs1_segments0_marketingCarrier_code                   object
 33  legs1_segments0_operatingCarrier_code                   object
 34  legs1_segments0_seatsAvailable                          float64
 35  miniRules0_monetaryAmount                               float64
 36  miniRules0_statusInfos                                  float64
 37  miniRules1_monetaryAmount                               float64
 38  miniRules1_statusInfos                                  float64
 39  pricingInfo_isAccessTP                                  float64
 40  pricingInfo_passengerCount                              int64
 41  profileId                                               int64
 42  ranker_id                                               object
 43  requestDate                                             object
 44  searchRoute                                             object
 45  sex                                                     bool
 46  taxes                                                   float64
 47  totalPrice                                              float64
 48  __index_level_0__                                       int64
 49  legs0_segments0_baggageAllowance_missing_initially      int64
 50  miniRules0_statusInfos_was_missing                      int64
 51  miniRules1_statusInfos_was_missing                      int64
 52  legs1_segments0_aircraft_code_was_missing               int64
 53  legs1_departureAt_hour                                  float64
 54  legs1_departureAt_minute                                float64
 55  legs1_departureAt_is_weekend                            int64
 56  legs1_departureAt_day                                   float64
 57  legs1_departureAt_month                                 float64
 58  legs1_departureAt_year                                  float64
 59  legs1_departureAt_part_of_day                           object
 60  legs1_departureAt_hour_sin                              float64
 61  legs1_departureAt_hour_cos                              float64
 62  legs1_arrivalAt_hour                                    float64
 63  legs1_arrivalAt_minute                                  float64
 64  legs1_arrivalAt_is_weekend                              int64
 65  legs1_arrivalAt_day                                     float64
 66  legs1_arrivalAt_month                                   float64
 67  legs1_arrivalAt_year                                    float64
 68  legs1_arrivalAt_part_of_day                             object
 69  legs1_arrivalAt_hour_sin                                float64
 70  legs1_arrivalAt_hour_cos                                float64
dtypes: bool(4), float64(32), int64(13), object(22)
memory usage: 3.5+ GB
None

--- Descriptive Statistics (Numerice) ---
                                                        count  ...         max
Id                                                  6897776.0  ...  25043147.0
companyID                                           6897776.0  ...     63482.0
corporateTariffCode                                 6897776.0  ...       181.0
nationality                                         6897776.0  ...        47.0
legs0_segments0_baggageAllowance_quantity           6897776.0  ...        60.0
legs0_segments0_baggageAllowance_weightMeasurem...  6897776.0  ...         1.0
legs0_segments0_cabinClass                          6897776.0  ...         4.0
legs0_segments0_flightNumber                        6897776.0  ...      9980.0
legs0_segments0_seatsAvailable                      6897776.0  ...         9.0
legs1_duration                                      6897776.0  ...      1523.0
legs1_segments0_baggageAllowance_quantity           6897776.0  ...        60.0
legs1_segments0_baggageAllowance_weightMeasurem...  6897776.0  ...         1.0
legs1_segments0_cabinClass                          6897776.0  ...         4.0
legs1_segments0_duration                            6897776.0  ...      1130.0
legs1_segments0_seatsAvailable                      6897776.0  ...         9.0
miniRules0_monetaryAmount                           6897776.0  ...    243187.0
miniRules0_statusInfos                              6897776.0  ...         1.0
miniRules1_monetaryAmount                           6897776.0  ...    473635.0
miniRules1_statusInfos                              6897776.0  ...         1.0
pricingInfo_isAccessTP                              6897776.0  ...         1.0
pricingInfo_passengerCount                          6897776.0  ...         1.0
profileId                                           6897776.0  ...   3667551.0
taxes                                               6897776.0  ...    840097.0
totalPrice                                          6897776.0  ...   9934573.0
__index_level_0__                                   6897776.0  ...  25043147.0
legs0_segments0_baggageAllowance_missing_initially  6897776.0  ...         1.0
miniRules0_statusInfos_was_missing                  6897776.0  ...         1.0
miniRules1_statusInfos_was_missing                  6897776.0  ...         1.0
legs1_segments0_aircraft_code_was_missing           6897776.0  ...         1.0
legs1_departureAt_hour                              6897776.0  ...        23.0
legs1_departureAt_minute                            6897776.0  ...        59.0
legs1_departureAt_is_weekend                        6897776.0  ...         1.0
legs1_departureAt_day                               6897776.0  ...        31.0
legs1_departureAt_month                             6897776.0  ...        12.0
legs1_departureAt_year                              6897776.0  ...      2025.0
legs1_departureAt_hour_sin                          6897776.0  ...         1.0
legs1_departureAt_hour_cos                          6897776.0  ...         1.0
legs1_arrivalAt_hour                                6897776.0  ...        23.0
legs1_arrivalAt_minute                              6897776.0  ...        59.0
legs1_arrivalAt_is_weekend                          6897776.0  ...         1.0
legs1_arrivalAt_day                                 6897776.0  ...        31.0
legs1_arrivalAt_month                               6897776.0  ...        12.0
legs1_arrivalAt_year                                6897776.0  ...      2025.0
legs1_arrivalAt_hour_sin                            6897776.0  ...         1.0
legs1_arrivalAt_hour_cos                            6897776.0  ...         1.0

[45 rows x 8 columns]

--- Descriptive Statistics (Categorice) ---

Coloana: frequentFlyer
Valori unice (417): ['SU' 'S7/SU' 'SU/S7' 'SU/B2' 'SU/B2/KC' 'SU/S7/KC/UT' 'SU/EK' 'TK'
 'S7/SU/UT' 'S7']
Frecvență:
frequentFlyer
SU          3981260
SU/S7        932857
S7/SU        514990
S7           235467
SU/S7/UT      99182
Name: count, dtype: int64

Coloana: legs0_arrivalAt
Valori unice (29117): ['2024-12-19T11:20:00' '2024-12-19T12:45:00' '2024-12-20T03:50:00'
 '2024-12-20T04:20:00' '2024-12-19T14:20:00' '2024-12-19T22:05:00'
 '2024-12-20T00:55:00' '2024-12-20T02:20:00' '2024-12-20T04:05:00'
 '2024-12-19T18:00:00']
Frecvență:
legs0_arrivalAt
2025-02-03T20:15:00    10347
2025-02-03T23:05:00     6606
2025-02-03T15:05:00     6095
2025-02-04T00:05:00     5920
2025-02-03T12:20:00     5577
Name: count, dtype: int64

Coloana: legs0_departureAt
Valori unice (27294): ['2024-12-19T06:50:00' '2024-12-19T08:25:00' '2024-12-19T23:25:00'
 '2024-12-19T23:55:00' '2024-12-19T10:00:00' '2024-12-19T17:45:00'
 '2024-12-19T20:35:00' '2024-12-19T22:00:00' '2024-12-19T23:45:00'
 '2024-12-19T13:35:00']
Frecvență:
legs0_departureAt
2025-02-03T16:30:00    8756
2025-02-03T09:15:00    8101
2024-12-04T11:00:00    7630
2024-11-20T11:30:00    7372
2024-11-20T11:00:00    7362
Name: count, dtype: int64

Coloana: legs0_duration
Valori unice (1024): ['02:30:00' '02:20:00' '02:25:00' '02:35:00' '02:40:00' '07:30:00'
 '12:00:00' '04:35:00' '04:40:00' '04:45:00']
Frecvență:
legs0_duration
01:40:00    2213466
01:35:00     807684
01:45:00     553931
01:30:00     160680
02:40:00     117778
Name: count, dtype: int64

Coloana: legs0_segments0_aircraft_code
Valori unice (93): ['32A' '320' '73H' '32N' '333' '32B' '7M8' '738' 'SU9' '789']
Frecvență:
legs0_segments0_aircraft_code
SU9    3304389
32A     577763
73H     549876
32B     542038
320     541774
Name: count, dtype: int64

Coloana: legs0_segments0_arrivalTo_airport_city_iata
Valori unice (386): ['SVX' 'OVB' 'ALA' 'NQZ' 'CIT' 'LED' 'TAS' 'BAK' 'IST' 'BJS']
Frecvență:
legs0_segments0_arrivalTo_airport_city_iata
LED    2342858
MOW    2331945
AER     357402
SVX     296480
OVB     196742
Name: count, dtype: int64

Coloana: legs0_segments0_arrivalTo_airport_iata
Valori unice (413): ['SVX' 'OVB' 'ALA' 'NQZ' 'CIT' 'LED' 'TAS' 'GYD' 'IST' 'PKX']
Frecvență:
legs0_segments0_arrivalTo_airport_iata
LED    2342858
SVO    1484397
VKO     604228
AER     357402
SVX     296480
Name: count, dtype: int64

Coloana: legs0_segments0_departureFrom_airport_iata
Valori unice (290): ['SVO' 'DME' 'VKO' 'KZN' 'OVB' 'LED' 'DXB' 'KUF' 'IST' 'ZIA']
Frecvență:
legs0_segments0_departureFrom_airport_iata
SVO    2394433
LED    1529889
VKO    1069548
DME     417614
SVX     170454
Name: count, dtype: int64

Coloana: legs0_segments0_duration
Valori unice (214): ['02:30:00' '02:20:00' '02:25:00' '02:35:00' '02:40:00' '04:05:00'
 '04:35:00' '04:40:00' '04:45:00' '03:00:00']
Frecvență:
legs0_segments0_duration
01:40:00    2266050
01:35:00     843241
01:45:00     587908
02:20:00     187274
01:30:00     173301
Name: count, dtype: int64

Coloana: legs0_segments0_marketingCarrier_code
Valori unice (136): ['SU' 'U6' 'DP' 'S7' 'DV' 'HY' 'J2' 'TK' 'CZ' 'N4']
Frecvență:
legs0_segments0_marketingCarrier_code
SU    4808781
U6     404319
S7     382823
DP     251517
TK     158722
Name: count, dtype: int64

Coloana: legs0_segments0_operatingCarrier_code
Valori unice (170): ['SU' 'U6' 'DP' 'S7' 'DV' 'FV' 'HY' 'J2' 'TK' 'CZ']
Frecvență:
legs0_segments0_operatingCarrier_code
FV    3377167
SU    1422753
U6     404319
S7     382222
DP     251517
Name: count, dtype: int64

Coloana: legs1_segments0_aircraft_code
Valori unice (94): ['32B' '32N' '320' '73H' '32A' 'E70' '7M8' '319' '737' '32Q']
Frecvență:
legs1_segments0_aircraft_code
SU9    3284661
73H     605434
32B     565255
32A     558852
320     542247
Name: count, dtype: int64

Coloana: legs1_segments0_arrivalTo_airport_city_iata
Valori unice (282): ['MOW' 'OVB' 'LED' 'NQZ' 'TAS' 'BAK' 'IST' 'BJS' 'KZN' 'KUF']
Frecvență:
legs1_segments0_arrivalTo_airport_city_iata
MOW    3644054
LED    1478167
SVX     259467
OVB     205681
KZN     158139
Name: count, dtype: int64

Coloana: legs1_segments0_arrivalTo_airport_iata
Valori unice (304): ['SVO' 'DME' 'VKO' 'OVB' 'LED' 'NQZ' 'TAS' 'GYD' 'IST' 'PKX']
Frecvență:
legs1_segments0_arrivalTo_airport_iata
SVO    2291377
LED    1442148
VKO     936221
DME     337668
SVX     269303
Name: count, dtype: int64

Coloana: legs1_segments0_departureFrom_airport_iata
Valori unice (338): ['SVX' 'ALA' 'SVO' 'DME' 'ZIA' 'VKO' 'OVB' 'LED' 'KJA' 'AER']
Frecvență:
legs1_segments0_departureFrom_airport_iata
LED    2339509
SVO    1220733
VKO     670632
AER     384921
DME     249666
Name: count, dtype: int64

Coloana: legs1_segments0_marketingCarrier_code
Valori unice (137): ['SU' 'U6' 'DP' 'S7' 'DV' 'HY' 'J2' 'TK' 'CZ' '5N']
Frecvență:
legs1_segments0_marketingCarrier_code
SU    4755435
U6     380017
S7     378223
DP     226756
TK     163455
Name: count, dtype: int64

Coloana: legs1_segments0_operatingCarrier_code
Valori unice (169): ['SU' 'U6' 'DP' 'S7' 'DV' 'FV' 'HY' 'J2' 'TK' 'KC']
Frecvență:
legs1_segments0_operatingCarrier_code
FV    3199314
SU    1573985
U6     379187
S7     378974
DP     231062
Name: count, dtype: int64

Coloana: ranker_id
Valori unice (45231): ['c9373e5f772e43d593dd6ad2fa90f67a' '2e6b0b51f761433cb28fae7c439db04c'
 '726ad47c29bb4196b54bbe283a7267bb' '458c1e8431814da99092580f45eed7e8'
 '8c387f33dc824a89ba7a6f2cb5908e36' 'c20984555e9c45e2a351943dd8616caa'
 '47131784f2dc47a2b34369443052a197' 'bb4cc7c6993b4883a626c39d1f817e03'
 'a08c023735464fa7b8f2d5c13850ee6a' '72f9c70ad8e04feabf9d38dd1dd7f140']
Frecvență:
ranker_id
ccedcdeb3bf646d7abaa9ac6ba1ca9f7    7022
53f9b3c0949f459bb8e4cf15044a43c3    6840
e9f3de07c353417bb937cfe35342bd43    6360
674d9cad04ac4880bf28b61c88b58120    6352
61eac0f0364d472cbe71551924439653    6335
Name: count, dtype: int64

Coloana: requestDate
Valori unice (44738): ['2024-10-29T12:50:42.000000000' '2024-10-29T12:52:12.000000000'
 '2024-10-29T12:49:43.000000000' '2024-10-29T12:50:13.000000000'
 '2024-10-29T12:53:16.000000000' '2024-10-29T12:53:04.000000000'
 '2024-10-29T12:56:49.000000000' '2024-10-29T12:55:32.000000000'
 '2024-10-29T12:56:25.000000000' '2024-10-29T12:58:03.000000000']
Frecvență:
requestDate
2024-12-18T10:47:41.000000000    7022
2024-12-04T17:50:47.000000000    6840
2024-12-03T11:43:59.000000000    6360
2024-12-04T08:47:07.000000000    6352
2024-12-16T19:21:01.000000000    6335
Name: count, dtype: int64

Coloana: searchRoute
Valori unice (3646): ['MOWSVX/SVXMOW' 'MOWALA/ALAMOW' 'KZNMOW/MOWKZN' 'OVBMOW' 'LEDMOW/MOWLED'
 'DXBLON' 'KUFAER/AERKUF' 'MOWSZX/SZXMOW' 'ISTMOW' 'KUFGOJ/GOJKUF']
Frecvență:
searchRoute
MOWLED/LEDMOW    1997387
LEDMOW/MOWLED    1145816
MOWAER/AERMOW     293821
LEDMOW            229396
MOWLED            223257
Name: count, dtype: int64

Coloana: legs1_departureAt_part_of_day
Valori unice (4): ['evening' 'afternoon' 'morning' 'night']
Frecvență:
legs1_departureAt_part_of_day
evening      2384321
morning      2209612
afternoon    2079773
night         224070
Name: count, dtype: int64

Coloana: legs1_arrivalAt_part_of_day
Valori unice (4): ['evening' 'afternoon' 'morning' 'night']
Frecvență:
legs1_arrivalAt_part_of_day
evening      2414900
afternoon    2053985
morning      1698491
night         730400
Name: count, dtype: int64

--- Distribuții, Min, Max și Valori unice pentru toate coloanele ---

Coloana: Id
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 6897776
  Exemple: [18144679 18144680 18144681 18144682 18144683]
  Min: 18144679, Max: 25043147, Mean: 21594259.40, Std: 1991216.74

Coloana: bySelf
  Tip: bool
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [ True False]

Coloana: companyID
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 495
  Exemple: [62840 59766 42620 28626 36948]
  Min: 16636, Max: 63482, Mean: 44505.69, Std: 13024.60

Coloana: corporateTariffCode
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 179
  Exemple: [ 81. 179.  50. 139.  62.]
  Min: 0.0, Max: 181.0, Mean: 105.83, Std: 46.53

Coloana: frequentFlyer
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 417
  Exemple: ['SU' 'S7/SU' 'SU/S7' 'SU/B2' 'SU/B2/KC']

Coloana: nationality
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 34
  Exemple: [36 47 21 23  0]
  Min: 0, Max: 47, Mean: 35.77, Std: 2.43

Coloana: isAccess3D
  Tip: bool
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [False  True]

Coloana: isVip
  Tip: bool
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [False  True]

Coloana: legs0_arrivalAt
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 29117
  Exemple: ['2024-12-19T11:20:00' '2024-12-19T12:45:00' '2024-12-20T03:50:00'
 '2024-12-20T04:20:00' '2024-12-19T14:20:00']

Coloana: legs0_departureAt
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 27294
  Exemple: ['2024-12-19T06:50:00' '2024-12-19T08:25:00' '2024-12-19T23:25:00'
 '2024-12-19T23:55:00' '2024-12-19T10:00:00']

Coloana: legs0_duration
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 1024
  Exemple: ['02:30:00' '02:20:00' '02:25:00' '02:35:00' '02:40:00']

Coloana: legs0_segments0_aircraft_code
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 93
  Exemple: ['32A' '320' '73H' '32N' '333']

Coloana: legs0_segments0_arrivalTo_airport_city_iata
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 386
  Exemple: ['SVX' 'OVB' 'ALA' 'NQZ' 'CIT']

Coloana: legs0_segments0_arrivalTo_airport_iata
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 413
  Exemple: ['SVX' 'OVB' 'ALA' 'NQZ' 'CIT']

Coloana: legs0_segments0_baggageAllowance_quantity
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 17
  Exemple: [ 0.  1.  2. 40. 30. 35. 25. 20. 10. 50. 23. 33.  3. 60. 15. 45. 46.]
  Min: 0.0, Max: 60.0, Mean: 2.34, Std: 6.89

Coloana: legs0_segments0_baggageAllowance_weightMeasurementType
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [0. 1.]
  Min: 0.0, Max: 1.0, Mean: 0.05, Std: 0.23

Coloana: legs0_segments0_cabinClass
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 4
  Exemple: [1. 2. 4. 3.]
  Min: 1.0, Max: 4.0, Mean: 1.04, Std: 0.20

Coloana: legs0_segments0_departureFrom_airport_iata
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 290
  Exemple: ['SVO' 'DME' 'VKO' 'KZN' 'OVB']

Coloana: legs0_segments0_duration
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 214
  Exemple: ['02:30:00' '02:20:00' '02:25:00' '02:35:00' '02:40:00']

Coloana: legs0_segments0_flightNumber
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 5714
  Exemple: [1410  273 6541  403  261]
  Min: 1, Max: 9980, Mean: 3856.57, Std: 2749.47

Coloana: legs0_segments0_marketingCarrier_code
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 136
  Exemple: ['SU' 'U6' 'DP' 'S7' 'DV']

Coloana: legs0_segments0_operatingCarrier_code
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 170
  Exemple: ['SU' 'U6' 'DP' 'S7' 'DV']

Coloana: legs0_segments0_seatsAvailable
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 9
  Exemple: [4. 7. 2. 9. 6. 8. 3. 5. 1.]
  Min: 1.0, Max: 9.0, Mean: 4.93, Std: 3.23

Coloana: legs1_duration
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 1403
  Exemple: [165. 155. 160. 170. 470.]
  Min: 1.0, Max: 1523.0, Mean: 252.47, Std: 295.19

Coloana: legs1_segments0_aircraft_code
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 94
  Exemple: ['32B' '32N' '320' '73H' '32A']

Coloana: legs1_segments0_arrivalTo_airport_city_iata
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 282
  Exemple: ['MOW' 'OVB' 'LED' 'NQZ' 'TAS']

Coloana: legs1_segments0_arrivalTo_airport_iata
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 304
  Exemple: ['SVO' 'DME' 'VKO' 'OVB' 'LED']

Coloana: legs1_segments0_baggageAllowance_quantity
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 16
  Exemple: [ 0.  1.  2. 40. 30. 35. 25. 20. 10. 50. 23. 15.  3. 33. 60. 45.]
  Min: 0.0, Max: 60.0, Mean: 2.33, Std: 6.87

Coloana: legs1_segments0_baggageAllowance_weightMeasurementType
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [0. 1.]
  Min: 0.0, Max: 1.0, Mean: 0.05, Std: 0.22

Coloana: legs1_segments0_cabinClass
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 4
  Exemple: [1. 2. 4. 3.]
  Min: 1.0, Max: 4.0, Mean: 1.04, Std: 0.19

Coloana: legs1_segments0_departureFrom_airport_iata
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 338
  Exemple: ['SVX' 'ALA' 'SVO' 'DME' 'ZIA']

Coloana: legs1_segments0_duration
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 698
  Exemple: [165. 155. 160. 170. 135.]
  Min: 14.0, Max: 1130.0, Mean: 147.37, Std: 92.84

Coloana: legs1_segments0_marketingCarrier_code
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 137
  Exemple: ['SU' 'U6' 'DP' 'S7' 'DV']

Coloana: legs1_segments0_operatingCarrier_code
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 169
  Exemple: ['SU' 'U6' 'DP' 'S7' 'DV']

Coloana: legs1_segments0_seatsAvailable
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 9
  Exemple: [4. 3. 2. 7. 9. 6. 8. 1. 5.]
  Min: 1.0, Max: 9.0, Mean: 5.21, Std: 3.27

Coloana: miniRules0_monetaryAmount
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 237129
  Exemple: [2800.            0.         2838.95288086 2858.6003418  2817.37475586]
  Min: 0.0, Max: 243187.0, Mean: 2719.36, Std: 4006.76

Coloana: miniRules0_statusInfos
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [1. 0.]
  Min: 0.0, Max: 1.0, Mean: 0.98, Std: 0.15

Coloana: miniRules1_monetaryAmount
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 187477
  Exemple: [   0.         2800.         1500.         2838.27294922 2837.67724609]
  Min: 0.0, Max: 473635.0, Mean: 1416.30, Std: 4016.97

Coloana: miniRules1_statusInfos
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [0. 1.]
  Min: 0.0, Max: 1.0, Mean: 0.54, Std: 0.50

Coloana: pricingInfo_isAccessTP
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [1. 0.]
  Min: 0.0, Max: 1.0, Mean: 0.63, Std: 0.48

Coloana: pricingInfo_passengerCount
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 1
  Exemple: [1]
  Min: 1, Max: 1, Mean: 1.00, Std: 0.00

Coloana: profileId
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 18981
  Exemple: [3604015 3344069  639964  638530 3292775]
  Min: 5065, Max: 3667551, Mean: 2479011.18, Std: 1029325.77

Coloana: ranker_id
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 45231
  Exemple: ['c9373e5f772e43d593dd6ad2fa90f67a' '2e6b0b51f761433cb28fae7c439db04c'
 '726ad47c29bb4196b54bbe283a7267bb' '458c1e8431814da99092580f45eed7e8'
 '8c387f33dc824a89ba7a6f2cb5908e36']

Coloana: requestDate
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 44738
  Exemple: ['2024-10-29T12:50:42.000000000' '2024-10-29T12:52:12.000000000'
 '2024-10-29T12:49:43.000000000' '2024-10-29T12:50:13.000000000'
 '2024-10-29T12:53:16.000000000']

Coloana: searchRoute
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 3646
  Exemple: ['MOWSVX/SVXMOW' 'MOWALA/ALAMOW' 'KZNMOW/MOWKZN' 'OVBMOW' 'LEDMOW/MOWLED']

Coloana: sex
  Tip: bool
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [False  True]

Coloana: taxes
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 23672
  Exemple: [1018. 3284.  570. 2192. 3357.]
  Min: 0.0, Max: 840097.0, Mean: 4675.95, Std: 13980.20

Coloana: totalPrice
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 150082
  Exemple: [ 9818. 14018. 22418. 12974. 16974.]
  Min: 800.0, Max: 9934573.0, Mean: 36164.77, Std: 67193.55

Coloana: __index_level_0__
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 6897776
  Exemple: [18144679 18144680 18144681 18144682 18144683]
  Min: 18144679, Max: 25043147, Mean: 21594259.40, Std: 1991216.74

Coloana: legs0_segments0_baggageAllowance_missing_initially
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [1 0]
  Min: 0, Max: 1, Mean: 1.00, Std: 0.00

Coloana: miniRules0_statusInfos_was_missing
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [1 0]
  Min: 0, Max: 1, Mean: 0.92, Std: 0.27

Coloana: miniRules1_statusInfos_was_missing
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [1 0]
  Min: 0, Max: 1, Mean: 0.92, Std: 0.28

Coloana: legs1_segments0_aircraft_code_was_missing
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [1 0]
  Min: 0, Max: 1, Mean: 0.84, Std: 0.37

Coloana: legs1_departureAt_hour
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 24
  Exemple: [21. 12. 23.  6. 20.]
  Min: 0.0, Max: 23.0, Mean: 14.28, Std: 5.59

Coloana: legs1_departureAt_minute
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 60
  Exemple: [10.  0. 50. 30. 25.]
  Min: 0.0, Max: 59.0, Mean: 21.11, Std: 16.57

Coloana: legs1_departureAt_is_weekend
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [1 0]
  Min: 0, Max: 1, Mean: 0.21, Std: 0.41

Coloana: legs1_departureAt_day
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 31
  Exemple: [21. 14. 15.  5. 16.]
  Min: 1.0, Max: 31.0, Mean: 16.42, Std: 7.97

Coloana: legs1_departureAt_month
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 12
  Exemple: [12. 11.  9. 10.  8.  7.  6.  4.  5.  1.  2.  3.]
  Min: 1.0, Max: 12.0, Mean: 9.95, Std: 3.35

Coloana: legs1_departureAt_year
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [2024. 2025.]
  Min: 2024.0, Max: 2025.0, Mean: 2024.12, Std: 0.33

Coloana: legs1_departureAt_part_of_day
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 4
  Exemple: ['evening' 'afternoon' 'morning' 'night']

Coloana: legs1_departureAt_hour_sin
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 21
  Exemple: [-7.07106781e-01  1.22464680e-16 -2.58819045e-01  1.00000000e+00
 -8.66025404e-01]
  Min: -1.0, Max: 1.0, Mean: -0.17, Std: 0.71

Coloana: legs1_departureAt_hour_cos
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 22
  Exemple: [ 7.07106781e-01 -1.00000000e+00  9.65925826e-01  6.12323400e-17
  5.00000000e-01]
  Min: -1.0, Max: 1.0, Mean: -0.18, Std: 0.66

Coloana: legs1_arrivalAt_hour
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 24
  Exemple: [21. 12. 23.  7. 19.]
  Min: 0.0, Max: 23.0, Mean: 13.90, Std: 6.38

Coloana: legs1_arrivalAt_minute
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 60
  Exemple: [55. 35. 50. 30. 10.]
  Min: 0.0, Max: 59.0, Mean: 26.53, Std: 15.79

Coloana: legs1_arrivalAt_is_weekend
  Tip: int64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [1 0]
  Min: 0, Max: 1, Mean: 0.26, Std: 0.44

Coloana: legs1_arrivalAt_day
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 31
  Exemple: [21. 15. 14. 17. 18.]
  Min: 1.0, Max: 31.0, Mean: 16.47, Std: 7.95

Coloana: legs1_arrivalAt_month
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 12
  Exemple: [12. 11.  9. 10.  8.  7.  4.  6.  5.  3.  1.  2.]
  Min: 1.0, Max: 12.0, Mean: 9.95, Std: 3.35

Coloana: legs1_arrivalAt_year
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 2
  Exemple: [2024. 2025.]
  Min: 2024.0, Max: 2025.0, Mean: 2024.12, Std: 0.33

Coloana: legs1_arrivalAt_part_of_day
  Tip: object
  Valori lipsă: 0 (0.00%)
  Valori unice: 4
  Exemple: ['evening' 'afternoon' 'morning' 'night']

Coloana: legs1_arrivalAt_hour_sin
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 21
  Exemple: [-7.07106781e-01  1.22464680e-16 -2.58819045e-01  9.65925826e-01
 -9.65925826e-01]
  Min: -1.0, Max: 1.0, Mean: -0.21, Std: 0.65

Coloana: legs1_arrivalAt_hour_cos
  Tip: float64
  Valori lipsă: 0 (0.00%)
  Valori unice: 22
  Exemple: [ 0.70710678 -1.          0.96592583 -0.25881905  0.25881905]
  Min: -1.0, Max: 1.0, Mean: -0.09, Std: 0.72

# **Preprocessing & Imputation 2**

In [None]:
%%writefile convert_parquet.py
import polars as pl

# Read the train and test splits from their Parquet sources (train first, then test)
train_df, test_df = (
    pl.read_parquet(source) for source in ("train.parquet", "test.parquet")
)

# Optional sanity check: show the (rows, columns) shape of each frame
for label, frame in (("Train shape:", train_df), ("Test shape:", test_df)):
    print(label, frame.shape)

# Write each frame out as CSV (switch the writer if another format is needed)
for frame, target in ((train_df, "train.csv"), (test_df, "test.csv")):
    frame.write_csv(target)

print("Parquet files successfully converted to CSV.")

Writing convert_parquet.py


In [None]:
%%writefile summary.py
import pandas as pd

# Columns to profile in the report (all taken from leg 0 / segment 0)
features = [
    "legs0_segments0_arrivalTo_airport_iata",
    "legs0_segments0_aircraft_code",
    "legs0_segments0_departureFrom_airport_iata",
    "legs0_segments0_arrivalTo_airport_city_iata",
    "legs0_segments0_baggageAllowance_weightMeasurementType",
    "legs0_segments0_baggageAllowance_quantity",
    "legs0_segments0_seatsAvailable",
]

# Read both CSV splits (train first, then test). low_memory=False asks pandas
# not to parse the file in internal chunks, which avoids mixed-dtype columns.
train, test = (
    pd.read_csv(csv_path, low_memory=False) for csv_path in ("train.csv", "test.csv")
)

# Collects the text sections that make up the final report
report = []

def get_feature_report(df, dataset_name, feature_name):
    """Build a plain-text profile of one column of ``df``.

    Args:
        df: DataFrame that may contain the column.
        dataset_name: Label printed in the section header (e.g. "Train").
        feature_name: Name of the column to profile.

    Returns:
        A multi-line string with row/missing/unique counts, the most frequent
        value, descriptive statistics (numeric columns) or the five most
        frequent values (other columns), and up to ten example values.
        If the column is absent, a one-line "not found" message is returned.
    """
    if feature_name not in df.columns:
        return f"{dataset_name}: Column '{feature_name}' not found.\n"

    col = df[feature_name]
    total = len(col)
    missing = int(col.isna().sum())
    non_null = total - missing
    # Guard the empty-frame case instead of dividing by zero.
    pct_missing = missing / total * 100 if total else 0.0
    nunique = col.nunique(dropna=True)
    dtype = col.dtype
    # Frequency table is computed once and reused for every statistic below.
    counts = col.value_counts(dropna=True)
    n_uniq_once = int((counts == 1).sum())
    n_duplicates = total - nunique - missing

    if non_null:
        most_freq_val = counts.idxmax()
        most_freq_count = int(counts.max())
        pct_most_freq = most_freq_count / non_null * 100
        most_freq_line = (
            f"Most frequent value: {most_freq_val} "
            f"({most_freq_count:,} times, {pct_most_freq:.2f}%)"
        )
    else:
        # A column with no non-null values has no mode; idxmax() on the empty
        # frequency table would raise ValueError.
        most_freq_line = "Most frequent value: n/a (no non-null values)"

    section = [
        f"📊 Dataset: {dataset_name}",
        f"Column: {feature_name}",
        f"Data type: {dtype}",
        f"Total rows: {total:,}",
        f"Missing values: {missing:,} ({pct_missing:.6f}%)",
        f"Unique values (non-null): {nunique:,}",
        f"Values appearing only once: {n_uniq_once:,}",
        f"Duplicate values (excluding nulls): {n_duplicates:,}",
        most_freq_line,
    ]

    if pd.api.types.is_numeric_dtype(col):
        section.append("\n🔹 Descriptive statistics:")
        section.append(str(col.describe()))
    else:
        section.append("\n🔹 Top 5 most frequent values:")
        # The loop body only runs when at least one non-null value exists,
        # so non_null is never zero here.
        for val, count in counts.head(5).items():
            section.append(f"  {val}: {count:,} ({(count / non_null) * 100:.2f}%)")

    sample_values = col.dropna().unique()
    if len(sample_values) > 0:
        section.append("\n🔹 Example distinct values:")
        section.append(", ".join(map(str, sample_values[:10])) + (" ..." if len(sample_values) > 10 else ""))

    return "\n".join(section)

# Rules printed after the train section and after the test section
dataset_rule = "\n" + "-" * 40 + "\n"
feature_rule = "\n" + "=" * 60 + "\n"

# Per column: train profile, thin rule, test profile, thick rule
for feature in features:
    report.extend(
        (
            get_feature_report(train, "Train", feature),
            dataset_rule,
            get_feature_report(test, "Test", feature),
            feature_rule,
        )
    )

# Write the assembled text as UTF-8 (the report contains emoji)
filename = "feature_summary.txt"
with open(filename, "w", encoding="utf-8") as handle:
    handle.write("\n".join(report))

print(f"✅ Summary for all features saved to '{filename}'")


Writing summary.py


In [None]:
list1 = ['bySelf', 'companyID', 'corporateTariffCode', 'nationality', 'isAccess3D', 'isVip', 'legs0_duration', 'legs0_segments0_aircraft_code', 'legs0_segments0_arrivalTo_airport_city_iata', 'legs0_segments0_arrivalTo_airport_iata', 'legs0_segments0_baggageAllowance_quantity', 'legs0_segments0_baggageAllowance_weightMeasurementType', 'legs0_segments0_departureFrom_airport_iata', 'legs0_segments0_duration', 'legs0_segments0_marketingCarrier_code', 'legs0_segments0_operatingCarrier_code', 'legs0_segments0_seatsAvailable', 'legs0_segments1_duration', 'legs1_duration', 'legs1_segments0_duration', 'legs1_segments1_duration', 'miniRules0_monetaryAmount', 'miniRules0_statusInfos', 'miniRules1_monetaryAmount', 'miniRules1_statusInfos', 'sex', 'taxes', 'totalPrice', 'price_per_tax', 'tax_rate', 'log_price', 'total_duration', 'duration_ratio', 'is_one_way', 'l0_seg', 'n_ff_programs', 'has_corporate_tariff', 'has_access_tp', 'corporate_policy_compliant', 'corporate_vip_flag', 'baggage_min', 'rules_missing', 'total_fees', 'is_popular_route', 'is_round_trip', 'n_segments_leg0', 'n_segments_leg1', 'total_segments', 'is_direct_leg0', 'is_direct_leg1', 'is_direct_shortest', 'both_direct', 'is_vip_freq', 'has_baggage', 'has_fees', 'fee_rate', 'group_size', 'group_size_log', 'stay_duration_hours', 'legs0_departureAt_hour', 'legs0_departureAt_weekday', 'legs0_departureAt_business_time', 'legs0_departureAt_time_bin', 'legs0_arrivalAt_hour', 'legs0_arrivalAt_weekday', 'legs0_arrivalAt_business_time', 'legs0_arrivalAt_time_bin', 'legs1_departureAt_hour', 'legs1_departureAt_weekday', 'legs1_departureAt_business_time', 'legs1_departureAt_time_bin', 'legs1_arrivalAt_hour', 'legs1_arrivalAt_weekday', 'legs1_arrivalAt_business_time', 'legs1_arrivalAt_time_bin', 'stay_duration_hours_log', 'is_short_trip', 'stay_duration_bin', 'hours_to_departure', 'days_to_departure', 'is_last_minute_booking', 'price_quantile_rank', 'duration_quantile_rank', 'rank_interaction_mul', 'rank_interaction_ratio', 
'rank_interaction_sum', 'rank_interaction_sub', 'price_pct_rank', 'is_cheapest', 'price_from_median', 'is_min_segments', 'price_gap_ratio_from_min', 'is_direct_cheapest', 'legs0_segments0_cabinClass_is_1', 'legs0_segments0_cabinClass_is_2', 'legs0_segments0_cabinClass_is_3', 'legs0_segments0_cabinClass_is_4', 'legs1_segments0_cabinClass_is_1', 'legs1_segments0_cabinClass_is_2', 'legs1_segments0_cabinClass_is_3', 'legs1_segments0_cabinClass_is_4', 'cabin_class_level_sum_3', 'has_business_class', 'cabin_class_highest', 'cabin_class_lowest', 'cabin_class_diversity', 'all_cabin_level_1', 'company_select_count', 'avg_selected_price', 'std_selected_price', 'avg_selected_cabin', 'selected_direct_ratio', 'selected_night_ratio', 'log_company_select_count', 'is_very_popular_company', 'is_popular_company', 'is_top_selected_company', 'z_price_vs_company_selected', 'legs0_segments0_marketingCarrier_code_selection_rate', 'legs0_segments0_marketingCarrier_code_log_total_count', 'legs0_segments0_marketingCarrier_code_selected_rank_bin', 'legs1_segments0_marketingCarrier_code_selection_rate', 'legs1_segments0_marketingCarrier_code_log_total_count', 'legs1_segments0_marketingCarrier_code_selected_rank_bin', 'legs0_segments0_marketingCarrier_code_in_frequentFlyer', 'is_major_carrier_0_0', 'legs0_segments0_marketingCarrier_code_ff_and_business']
list2 = [
    "legs1_segments3_marketingCarrier_code",
    "legs1_segments3_flightNumber",
    "legs1_segments3_baggageAllowance_quantity",
    "legs1_segments3_departureFrom_airport_iata",
    "legs1_segments3_cabinClass",
    "legs1_segments3_baggageAllowance_weightMeasurementType",
    "legs1_segments3_seatsAvailable",
    "legs1_segments3_operatingCarrier_code",
    "legs1_segments3_aircraft_code",
    "legs1_segments3_arrivalTo_airport_city_iata",
    "legs1_segments3_duration",
    "legs1_segments3_arrivalTo_airport_iata",
    "legs0_segments3_aircraft_code",
    "legs0_segments3_baggageAllowance_weightMeasurementType",
    "legs0_segments3_seatsAvailable",
    "legs0_segments3_cabinClass",
    "legs0_segments3_baggageAllowance_quantity",
    "legs0_segments3_departureFrom_airport_iata",
    "legs0_segments3_flightNumber",
    "legs0_segments3_duration",
    "legs0_segments3_arrivalTo_airport_city_iata",
    "legs0_segments3_operatingCarrier_code",
    "legs0_segments3_marketingCarrier_code",
    "legs0_segments3_arrivalTo_airport_iata",
    "legs1_segments2_baggageAllowance_weightMeasurementType",
    "legs1_segments2_baggageAllowance_quantity",
    "legs1_segments2_seatsAvailable",
    "legs1_segments2_cabinClass",
    "legs1_segments2_duration",
    "legs1_segments2_aircraft_code",
    "legs1_segments2_operatingCarrier_code",
    "legs1_segments2_arrivalTo_airport_city_iata",
    "legs1_segments2_arrivalTo_airport_iata",
    "legs1_segments2_marketingCarrier_code",
    "legs1_segments2_flightNumber",
    "legs1_segments2_departureFrom_airport_iata",
    "legs0_segments2_baggageAllowance_weightMeasurementType",
    "legs0_segments2_baggageAllowance_quantity",
    "legs0_segments2_cabinClass",
    "legs0_segments2_seatsAvailable",
    "legs0_segments2_aircraft_code",
    "legs0_segments2_marketingCarrier_code",
    "legs0_segments2_departureFrom_airport_iata",
    "legs0_segments2_arrivalTo_airport_city_iata",
    "legs0_segments2_flightNumber",
    "legs0_segments2_operatingCarrier_code",
    "legs0_segments2_duration",
    "legs0_segments2_arrivalTo_airport_iata",
    "miniRules1_percentage",
    "miniRules0_percentage",
    "legs1_segments1_seatsAvailable",
    "legs1_segments1_baggageAllowance_weightMeasurementType",
    "legs1_segments1_baggageAllowance_quantity",
    "legs1_segments1_cabinClass",
    "legs1_segments1_departureFrom_airport_iata",
    "legs1_segments1_marketingCarrier_code",
    "legs1_segments1_flightNumber",
    "legs1_segments1_aircraft_code",
    "legs1_segments1_operatingCarrier_code",
    "legs1_segments1_arrivalTo_airport_city_iata",
    "legs1_segments1_arrivalTo_airport_iata",
    "legs1_segments1_duration",
    "legs0_segments1_seatsAvailable",
    "legs0_segments1_baggageAllowance_weightMeasurementType",
    "legs0_segments1_baggageAllowance_quantity",
    "legs0_segments1_cabinClass",
    "legs0_segments1_aircraft_code",
    "legs0_segments1_arrivalTo_airport_city_iata",
    "legs0_segments1_departureFrom_airport_iata",
    "legs0_segments1_flightNumber",
    "legs0_segments1_operatingCarrier_code",
    "legs0_segments1_duration",
    "legs0_segments1_arrivalTo_airport_iata",
    "legs0_segments1_marketingCarrier_code",
    "frequentFlyer",
    "corporateTariffCode",
    "legs1_segments0_seatsAvailable",
    "legs1_segments0_baggageAllowance_quantity",
    "legs1_segments0_baggageAllowance_weightMeasurementType",
    "legs1_segments0_cabinClass",
    "legs1_segments0_arrivalTo_airport_city_iata",
    "legs1_segments0_departureFrom_airport_iata",
    "legs1_segments0_arrivalTo_airport_iata",
    "legs1_departureAt",
    "legs1_arrivalAt",
    "legs1_duration",
    "legs1_segments0_duration",
    "legs1_segments0_operatingCarrier_code",
    "legs1_segments0_marketingCarrier_code",
    "legs1_segments0_flightNumber",
    "legs1_segments0_aircraft_code",
    "miniRules1_statusInfos",
    "miniRules0_statusInfos",
    "miniRules1_monetaryAmount",
    "miniRules0_monetaryAmount",
    "pricingInfo_isAccessTP",
    "legs0_segments0_seatsAvailable",
    "legs0_segments0_baggageAllowance_quantity",
    "legs0_segments0_baggageAllowance_weightMeasurementType",
    "legs0_segments0_arrivalTo_airport_city_iata",
    "legs0_segments0_departureFrom_airport_iata",
    "legs0_segments0_aircraft_code",
    "legs0_segments0_arrivalTo_airport_iata"
]

# Column names present in both lists, in alphabetical order
common_elements = sorted(set(list1).intersection(list2))

print(f"✅ Found {len(common_elements)} common elements:\n")
# Number the entries from 1, right-aligned in a two-character field
for position, column_name in enumerate(common_elements, start=1):
    print(f"{position:>2}. {column_name}")

✅ Found 16 common elements:

 1. corporateTariffCode
 2. legs0_segments0_aircraft_code
 3. legs0_segments0_arrivalTo_airport_city_iata
 4. legs0_segments0_arrivalTo_airport_iata
 5. legs0_segments0_baggageAllowance_quantity
 6. legs0_segments0_baggageAllowance_weightMeasurementType
 7. legs0_segments0_departureFrom_airport_iata
 8. legs0_segments0_seatsAvailable
 9. legs0_segments1_duration
10. legs1_duration
11. legs1_segments0_duration
12. legs1_segments1_duration
13. miniRules0_monetaryAmount
14. miniRules0_statusInfos
15. miniRules1_monetaryAmount
16. miniRules1_statusInfos


In [None]:
%%writefile test_imputation.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import time

# === Start the timer ===
start = time.time()

# === Select relevant columns ===
selected_features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP',
    'miniRules0_monetaryAmount',
    'miniRules1_monetaryAmount'
]

target_col = 'corporateTariffCode'
cols_needed = selected_features + [target_col]

# === Load the data ===
# Only the modelling columns are read. The rest of the file is never used and
# the loaded frame is never mutated in place, so no deep copy of the full
# dataset is required.
train_original = pd.read_csv("train.csv", usecols=cols_needed, low_memory=False)

# === Filter complete rows for training ===
df_full = train_original.dropna(subset=cols_needed)

# === Encode the target column ===
le = LabelEncoder()
df_full[target_col] = le.fit_transform(df_full[target_col])

# === Randomly select up to 1,000,000 rows for validation ===
# DataFrame.sample raises when asked for more rows than exist, and taking
# every row would leave nothing to train on. Small inputs are therefore split
# in half; large inputs keep the fixed 1,000,000-row hold-out.
if len(df_full) < 2:
    raise ValueError("Need at least two complete rows to build train and validation sets.")
n_valid = 1_000_000 if len(df_full) > 1_000_000 else len(df_full) // 2
sample = df_full.sample(n=n_valid, random_state=42)
X_valid = sample[selected_features]
y_valid = sample[target_col]

# === Remove these rows from training data ===
df_train = df_full.drop(sample.index, errors='ignore')
X_train = df_train[selected_features]
y_train = df_train[target_col]

# === Train a classification model ===
# `use_label_encoder` is not passed: it is a deprecated no-op in current
# XGBoost releases and the other imputation scripts already omit it.
model = XGBClassifier(
    n_estimators=500,
    max_depth=12,
    learning_rate=0.1,
    tree_method='hist',
    n_jobs=24,
    random_state=42,
    eval_metric='logloss'
)
model.fit(X_train, y_train)

# === Predict the held-out target values ===
predicted = model.predict(X_valid)

# === Compute metrics ===
acc = accuracy_score(y_valid, predicted)
f1 = f1_score(y_valid, predicted, average='weighted')
cm = confusion_matrix(y_valid, predicted)
duration = round(time.time() - start, 2)

# === Save results to file ===
# Explicit UTF-8: the report contains emoji, which the platform default
# encoding cannot always represent.
with open("imputation_results.txt", "w", encoding="utf-8") as f:
    f.write("✅ Direct prediction results (XGBClassifier with LabelEncoder):\n")
    f.write(f"Accuracy: {acc:.4f}\n")
    f.write(f"F1 Score (weighted): {f1:.4f}\n")
    f.write("Confusion Matrix:\n")
    f.write(np.array2string(cm))
    f.write(f"\n⏱️ Duration: {duration} seconds\n")

# === Also print to console ===
print("✅ Results saved to imputation_results.txt")

Writing test_imputation.py


In [None]:
✅ Direct prediction results (XGBClassifier with LabelEncoder):
Accuracy: 0.9939
F1 Score (weighted): 0.9937
Confusion Matrix:
[[  788     0     0 ...     0     0     0]
 [    0   131     0 ...     0     0     0]
 [    0     0     4 ...     0     0     0]
 ...
 [    0     0     0 ... 13902     5     4]
 [    0     0     0 ...    23   976     0]
 [    0     0     0 ...     4     0 32641]]
⏱️ Duration: 6587.36 seconds

In [None]:
%%writefile impute_corporateTariffCode.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
import time

# === Timer ===
start = time.time()

# === Input frames (mutated in place by impute_column) ===
train = pd.read_csv("train.csv", low_memory=False)
test = pd.read_csv("test.csv", low_memory=False)

# === Predictor columns fed to the classifier ===
features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_seatsAvailable',
    'legs0_segments0_flightNumber',
    'taxes',
    'legs0_segments0_baggageAllowance_quantity',
    'legs0_segments0_baggageAllowance_weightMeasurementType',
    'pricingInfo_isAccessTP',
    'miniRules0_monetaryAmount',
    'miniRules1_monetaryAmount',
    'miniRules0_statusInfos',
    'miniRules1_statusInfos',
]

# === Columns whose gaps are filled ===
target_columns = ['corporateTariffCode']


def impute_column(target):
    """Fill missing values of ``target`` in the global train/test frames.

    A classifier is fitted on the train rows where ``target`` and every
    column in ``features`` are present. It then predicts ``target`` for rows
    of both frames where ``target`` is missing and every feature is present.
    Rows with a missing feature are left untouched. Afterwards a short
    report is printed and the ``target`` column of each frame is written to
    ``train_<target>.csv`` / ``test_<target>.csv``.
    """
    print(f"\n🔧 Processing target: '{target}'")

    complete_rows = train.dropna(subset=features + [target])
    if complete_rows.empty:
        print(f"❌ Skipped '{target}': No complete training data.")
        return

    # Map raw labels onto consecutive integer class ids for the classifier
    encoder = LabelEncoder()
    encoded_labels = encoder.fit_transform(complete_rows[target])

    classifier = XGBClassifier(
        n_estimators=500,
        max_depth=12,
        learning_rate=0.1,
        tree_method='hist',
        n_jobs=24,
        random_state=42,
        eval_metric='logloss',
    )
    classifier.fit(complete_rows[features], encoded_labels)

    splits = [(train, "train"), (test, "test")]

    # Pass 1: write predictions into the gaps of each frame
    for df, name in splits:
        gaps = df[target].isna()
        if not gaps.any():
            print(f"✅ {name}: No missing values in '{target}'")
            continue

        # Only rows with a full feature vector are predicted
        candidates = df.loc[gaps, features].dropna()
        if len(candidates.index) == 0:
            print(f"⚠️ {name}: No imputable rows with all required features for '{target}'.")
            continue

        class_ids = classifier.predict(candidates)
        df.loc[candidates.index, target] = encoder.inverse_transform(class_ids)
        print(f"✅ {name}: Imputed {len(candidates.index)} missing values in '{target}'")

    # Pass 2: post-imputation summary of each frame
    for df, name in splits:
        n_rows = len(df)
        n_missing = df[target].isna().sum()
        frequencies = df[target].value_counts(dropna=False).sort_index()
        print(f"\n📊 {name} — '{target}':")
        print(f"  Total rows: {n_rows}")
        print(f"  Missing: {n_missing} ({100 * n_missing / n_rows:.2f}%)")
        print("  Top 10 value counts (including imputations):")
        print(frequencies.head(10))

    # Persist just the target column; row order matches the input files
    train[[target]].to_csv(f"train_{target}.csv", index=False)
    test[[target]].to_csv(f"test_{target}.csv", index=False)
    print(f"💾 Saved: train_{target}.csv and test_{target}.csv")


# === Impute every configured target ===
for target in target_columns:
    impute_column(target)

# === Done ===
elapsed = round(time.time() - start, 2)
print(f"\n⏱️ Total duration: {elapsed} seconds")

Writing impute_corporateTariffCode.py


In [None]:
  bst.update(dtrain, iteration=i, fobj=obj)
✅ Results saved to imputation_results.txt

🔧 Processing target: 'corporateTariffCode'
✅ train: Imputed 6903139 missing values in 'corporateTariffCode'
✅ test: Imputed 2725874 missing values in 'corporateTariffCode'

📊 train — 'corporateTariffCode':
  Total rows: 18145372
  Missing: 2330786 (12.85%)
  Top 10 value counts (including imputations):
corporateTariffCode
0.0    11695
1.0     2045
2.0       77
3.0    25928
4.0     1555
5.0    35289
6.0    20952
7.0     1354
8.0       98
9.0     4283
Name: count, dtype: int64

📊 test — 'corporateTariffCode':
  Total rows: 6897776
  Missing: 809461 (11.74%)
  Top 10 value counts (including imputations):
corporateTariffCode
0.0      1480
3.0      5145
4.0        26
5.0      3406
6.0      1905
7.0        11
9.0       763
10.0    21867
11.0      156
13.0     2232
Name: count, dtype: int64
💾 Saved: train_corporateTariffCode.csv and test_corporateTariffCode.csv

⏱️ Total duration: 7697.46 seconds

In [None]:
%%writefile summary.py
import pandas as pd

# Columns whose train/test profiles are written to the report
features = ["corporateTariffCode"]

# Read the imputed target column of both splits (train first, then test)
train, test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ("train_corporateTariffCode.csv", "test_corporateTariffCode.csv")
)

# Text fragments of the report; joined with newlines when the file is written
report = list()

def get_feature_report(df, dataset_name, feature_name):
    """Build a plain-text profile of one column of ``df``.

    Args:
        df: DataFrame that may contain the column.
        dataset_name: Label printed in the section header (e.g. "Train").
        feature_name: Name of the column to profile.

    Returns:
        A multi-line string with row/missing/unique counts, the most frequent
        value, descriptive statistics (numeric columns) or the five most
        frequent values (other columns), and up to ten example values.
        If the column is absent, a one-line "not found" message is returned.
    """
    if feature_name not in df.columns:
        return f"{dataset_name}: Column '{feature_name}' not found.\n"

    col = df[feature_name]
    total = len(col)
    missing = int(col.isna().sum())
    non_null = total - missing
    # Guard the empty-frame case instead of dividing by zero.
    pct_missing = missing / total * 100 if total else 0.0
    nunique = col.nunique(dropna=True)
    dtype = col.dtype
    # Frequency table is computed once and reused for every statistic below.
    counts = col.value_counts(dropna=True)
    n_uniq_once = int((counts == 1).sum())
    n_duplicates = total - nunique - missing

    if non_null:
        most_freq_val = counts.idxmax()
        most_freq_count = int(counts.max())
        pct_most_freq = most_freq_count / non_null * 100
        most_freq_line = (
            f"Most frequent value: {most_freq_val} "
            f"({most_freq_count:,} times, {pct_most_freq:.2f}%)"
        )
    else:
        # A column with no non-null values has no mode; idxmax() on the empty
        # frequency table would raise ValueError.
        most_freq_line = "Most frequent value: n/a (no non-null values)"

    section = [
        f"📊 Dataset: {dataset_name}",
        f"Column: {feature_name}",
        f"Data type: {dtype}",
        f"Total rows: {total:,}",
        f"Missing values: {missing:,} ({pct_missing:.6f}%)",
        f"Unique values (non-null): {nunique:,}",
        f"Values appearing only once: {n_uniq_once:,}",
        f"Duplicate values (excluding nulls): {n_duplicates:,}",
        most_freq_line,
    ]

    if pd.api.types.is_numeric_dtype(col):
        section.append("\n🔹 Descriptive statistics:")
        section.append(str(col.describe()))
    else:
        section.append("\n🔹 Top 5 most frequent values:")
        # The loop body only runs when at least one non-null value exists,
        # so non_null is never zero here.
        for val, count in counts.head(5).items():
            section.append(f"  {val}: {count:,} ({(count / non_null) * 100:.2f}%)")

    sample_values = col.dropna().unique()
    if len(sample_values) > 0:
        section.append("\n🔹 Example distinct values:")
        section.append(", ".join(map(str, sample_values[:10])) + (" ..." if len(sample_values) > 10 else ""))

    return "\n".join(section)

# Rules printed after the train section and after the test section
dataset_rule = "\n" + "-" * 40 + "\n"
feature_rule = "\n" + "=" * 60 + "\n"

# Per column: train profile, thin rule, test profile, thick rule
for feature in features:
    report.extend(
        (
            get_feature_report(train, "Train", feature),
            dataset_rule,
            get_feature_report(test, "Test", feature),
            feature_rule,
        )
    )

# Write the assembled text as UTF-8 (the report contains emoji)
filename = "feature_summary.txt"
with open(filename, "w", encoding="utf-8") as handle:
    handle.write("\n".join(report))

print(f"✅ Summary for all features saved to '{filename}'")


Writing summary.py


In [None]:
%%writefile impute_corporateTariffCode.py
import pandas as pd

# === Load the datasets ===
train = pd.read_csv("train_corporateTariffCode.csv", low_memory=False)
test = pd.read_csv("test_corporateTariffCode.csv", low_memory=False)

# === Find the most frequent value in 'corporateTariffCode' from TRAIN ===
# The mode is taken from train only and reused for test, so both splits are
# filled with the same value.
modes = train['corporateTariffCode'].mode(dropna=True)
if modes.empty:
    # An all-null column has no mode; fail with a clear message instead of
    # the opaque KeyError raised by indexing an empty Series.
    raise ValueError("'corporateTariffCode' has no non-null values in train; nothing to impute with.")
most_frequent_value = modes[0]
print(f"🔍 Most frequent value to impute: {most_frequent_value}")

# === Fill missing values in both datasets ===
# Assign the result back. `df[col].fillna(..., inplace=True)` operates on a
# temporary Series: pandas warns about it and, with Copy-on-Write enabled,
# leaves the DataFrame unchanged.
train['corporateTariffCode'] = train['corporateTariffCode'].fillna(most_frequent_value)
test['corporateTariffCode'] = test['corporateTariffCode'].fillna(most_frequent_value)

# === Save the updated files ===
train.to_csv("train_corporateTariffCode.csv", index=False)
test.to_csv("test_corporateTariffCode.csv", index=False)
print("✅ Missing values imputed and files saved.")

Writing impute_corporateTariffCode.py


In [None]:
%%writefile test_imputation.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import time

# === Start the timer ===
start = time.time()

# === Define features and target ===
features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_flightNumber',
    'taxes'
]
target = 'corporateTariffCode'
cols_needed = features + [target]

# === Load the data ===
# Only the modelling columns are read. The Parquet file on disk is never
# modified, so no deep copy of the full dataset is needed.
df = pd.read_parquet("train.parquet", columns=cols_needed)

# === Drop rows with missing values in features or target ===
df = df.dropna(subset=cols_needed)

# === Encode categorical features ===
cat_features = ['companyID', 'legs0_segments0_cabinClass', 'legs0_segments0_flightNumber']
feature_encoders = {}

print("\n🔤 Encoding categorical features:")
for col in tqdm(cat_features, desc="Encoding features"):
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    feature_encoders[col] = le

# === Encode target ===
target_encoder = LabelEncoder()
df[target] = target_encoder.fit_transform(df[target].astype(str))

# === Sample 1,000,000 rows for validation (or fewer if needed) ===
# Taking every available row would leave the training set empty, so inputs
# with at most 1,000,000 complete rows are split in half instead.
if len(df) < 2:
    raise ValueError("Need at least two complete rows to build train and validation sets.")
sample_size = 1_000_000 if len(df) > 1_000_000 else len(df) // 2
sample = df.sample(n=sample_size, random_state=42)
X_valid = sample[features]
y_valid = sample[target]

# === Use remaining data for training ===
train_df = df.drop(sample.index)
X_train = train_df[features]
y_train = train_df[target]

# === Train the classifier ===
model = XGBClassifier(
    n_estimators=500,
    max_depth=12,
    learning_rate=0.1,
    tree_method='hist',
    device='cuda',
    n_jobs=24,
    random_state=42,
    eval_metric='logloss'
)
model.fit(X_train, y_train)

# === Predict and evaluate ===
preds = model.predict(X_valid)
acc = accuracy_score(y_valid, preds)
f1 = f1_score(y_valid, preds, average='weighted')
cm = confusion_matrix(y_valid, preds)
duration = round(time.time() - start, 2)

# === Save results ===
# Explicit UTF-8: the report contains emoji, which the platform default
# encoding cannot always represent.
with open("imputation_results.txt", "w", encoding="utf-8") as f:
    f.write("✅ Imputation Evaluation Results:\n")
    f.write(f"Accuracy: {acc:.4f}\n")
    f.write(f"F1 Score (weighted): {f1:.4f}\n")
    f.write("Confusion Matrix:\n")
    f.write(np.array2string(cm))
    f.write(f"\n⏱️ Duration: {duration} seconds\n")

# === Also print to console ===
print("✅ Results saved to imputation_results.txt")

Writing test_imputation.py


In [None]:
✅ Imputation Evaluation Results:
Accuracy: 0.9897
F1 Score (weighted): 0.9895
Confusion Matrix:
[[ 871    0    0 ...    0    0    0]
 [   0  117    0 ...    0    0    0]
 [   0    0  440 ...    0    0    0]
 ...
 [   0    0    0 ... 2617    0    0]
 [   0    0    0 ...    0  118    0]
 [   0    0    0 ...    0    0  647]]
⏱️ Duration: 2114.58 seconds

In [None]:
%%writefile impute_corporateTariffCode.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import time

# === Start the timer ===
start = time.time()

# === Load data ===
# The frames are modified in memory only (the Parquet inputs stay untouched),
# so no extra deep copy of train is kept around.
train = pd.read_parquet("train.parquet")
test = pd.read_parquet("test.parquet")

# === Keep ID columns for final merge ===
id_columns = ['Id', 'ranker_id']
if not all(col in train.columns for col in id_columns) or not all(col in test.columns for col in id_columns):
    raise ValueError("❌ Both train and test files must contain 'Id' and 'ranker_id' columns.")

# === Features used for prediction ===
features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_flightNumber',
    'taxes'
]

# === Targets to impute ===
target_columns = ['corporateTariffCode']

# === Encode categorical features ===
# A feature is treated as categorical when it is object-typed in either split.
cat_features = [col for col in features if train[col].dtype == 'object' or test[col].dtype == 'object']
encoders = {}

print("\n🔤 Encoding categorical features:")
for col in tqdm(cat_features, desc="Encoding features"):
    le = LabelEncoder()
    # Fit on train and test together so both splits share one code table.
    combined = pd.concat([train[col], test[col]], axis=0).astype(str)
    le.fit(combined)
    train[col] = le.transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))
    encoders[col] = le

# === Function to impute one target column ===
def impute_column(target):
    """Impute ``target`` in the global train/test frames and save the result.

    A classifier is fitted on the train rows where ``target`` and every
    column in ``features`` are present. For rows of either frame where
    ``target`` is missing and every feature is present, the most probable
    class is written to ``target`` and its probability to
    ``<target>_confidence``. Rows that are not imputed keep NaN in the
    confidence column. The ID columns, target and confidence of each frame
    are written to ``<split>_<target>.parquet``.
    """
    print(f"\n🔧 Processing target: '{target}'")
    confidence_col = f"{target}_confidence"

    # Only use complete rows for training
    train_valid = train.dropna(subset=features + [target])
    if train_valid.empty:
        print(f"❌ Skipped '{target}': No complete training data.")
        return

    X_train = train_valid[features]
    y_raw = train_valid[target].astype(str)

    # Encode target
    le_target = LabelEncoder()
    y_train = le_target.fit_transform(y_raw)

    # === Train model ===
    model = XGBClassifier(
        n_estimators=500,
        max_depth=12,
        learning_rate=0.1,
        tree_method='hist',
        device='cuda',
        n_jobs=24,
        random_state=42,
        eval_metric='logloss'
    )
    model.fit(X_train, y_train)

    # Create the confidence column up front. Otherwise it only exists for a
    # split that actually had imputable rows, and the save step below raises
    # KeyError for a split with nothing to impute.
    for df in (train, test):
        df[confidence_col] = np.nan

    # === Predict and impute missing values ===
    for df, name in [(train, "train"), (test, "test")]:
        missing_mask = df[target].isna()
        if missing_mask.sum() == 0:
            print(f"✅ {name}: No missing values in '{target}'")
            continue

        imputable = df.loc[missing_mask, features].dropna()
        if imputable.empty:
            print(f"⚠️ {name}: No imputable rows with all required features for '{target}'.")
            continue

        # Predict: highest-probability class and that probability
        probs = model.predict_proba(imputable)
        preds = np.argmax(probs, axis=1)
        preds_labels = le_target.inverse_transform(preds)
        confidences = np.max(probs, axis=1)

        # Assign predictions and confidence scores
        df.loc[imputable.index, target] = preds_labels
        df.loc[imputable.index, confidence_col] = confidences
        print(f"✅ {name}: Imputed {len(imputable)} rows for '{target}'")

    # === Summary report ===
    for df, name in [(train, "train"), (test, "test")]:
        total = len(df)
        missing = df[target].isna().sum()
        print(f"\n📊 {name} — '{target}':")
        print(f"  Total: {total} rows | Missing: {missing} ({100 * missing / total:.2f}%)")
        print("  Top values:")
        print(df[target].value_counts(dropna=False).head(10))

    # === Save imputed results with ID columns ===
    for df, name in [(train, "train"), (test, "test")]:
        output = df[id_columns + [target, confidence_col]]
        output.to_parquet(f"{name}_{target}.parquet", index=False)
        print(f"💾 Saved: {name}_{target}.parquet")

# === Run for each target column ===
for target in target_columns:
    impute_column(target)

# === Done ===
print(f"\n⏱️ Total duration: {round(time.time() - start, 2)} seconds")

Writing impute_corporateTariffCode.py


In [None]:
🔧 Processing target: 'corporateTariffCode'
✅ train: Imputed 9233925 rows for 'corporateTariffCode'
✅ test: Imputed 3535335 rows for 'corporateTariffCode'

📊 train — 'corporateTariffCode':
  Total: 18145372 rows | Missing: 0 (0.00%)
  Top values:
corporateTariffCode
108    2681238
161    1702303
153    1152302
112     692479
181     670689
29      666098
39      525542
91      518967
101     518550
57      499939
Name: count, dtype: Int64

📊 test — 'corporateTariffCode':
  Total: 6897776 rows | Missing: 0 (0.00%)
  Top values:
corporateTariffCode
161    954166
108    824755
153    454217
57     346277
112    322052
29     236172
181    207049
54     196706
66     188822
91     179587
Name: count, dtype: Int64
💾 Saved: train_corporateTariffCode.parquet
💾 Saved: test_corporateTariffCode.parquet

⏱️ Total duration: 2378.83 seconds

In [None]:
%%writefile threshold_to_use.py
import polars as pl

# Thresholds to analyze
thresholds = [0.998, 0.996, 0.994, 0.992, 0.99, 0.98, 0.97, 0.96, 0.95, 0.90, 0.85, 0.80, 0.75, 0.70, 0.65, 0.60, 0.55, 0.50]

# Files and labels
files = {
    "train": "train_corporateTariffCode.parquet",
    "test": "test_corporateTariffCode.parquet"
}

# Column to analyze
confidence_col = "corporateTariffCode_confidence"

# Store output lines
output_lines = []

for name, path in files.items():
    df = pl.read_parquet(path)
    total_rows = df.height

    header = f"\n📂 {name.upper()} ({path}) — Total rows: {total_rows}"
    print(header)
    output_lines.append(header)

    for t in thresholds:
        # Rows whose confidence is strictly below the threshold.
        count = df.filter(pl.col(confidence_col) < t).height
        # Avoid dividing by zero on an empty file.
        share = count / total_rows if total_rows else 0.0
        # Three decimals: with two, 0.998/0.996 both print as "1.00" and
        # 0.994/0.992 both print as "0.99", so rows cannot be told apart.
        line = f"  < {t:.3f}: {count} rows ({share:.2%})"
        print(line)
        output_lines.append(line)

# Save results to text file
# Explicit UTF-8: the headers contain emoji, which the platform default
# encoding cannot always represent.
with open("confidence_summary.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(output_lines))

print("\n✅ Results saved to 'confidence_summary.txt'")

Writing threshold_to_use.py


In [None]:
📂 TRAIN (train_corporateTariffCode.parquet) — Total rows: 18145372
  < 0.95: 2330633 rows (12.84%)
  < 0.90: 1855031 rows (10.22%)
  < 0.85: 1543493 rows (8.51%)
  < 0.80: 1302455 rows (7.18%)
  < 0.75: 1089950 rows (6.01%)
  < 0.70: 901057 rows (4.97%)
  < 0.65: 730767 rows (4.03%)
  < 0.60: 561450 rows (3.09%)
  < 0.55: 394565 rows (2.17%)
  < 0.50: 228448 rows (1.26%)

📂 TEST (test_corporateTariffCode.parquet) — Total rows: 6897776
  < 0.95: 913306 rows (13.24%)
  < 0.90: 721933 rows (10.47%)
  < 0.85: 595711 rows (8.64%)
  < 0.80: 497986 rows (7.22%)
  < 0.75: 415604 rows (6.03%)
  < 0.70: 341673 rows (4.95%)
  < 0.65: 273161 rows (3.96%)
  < 0.60: 206979 rows (3.00%)
  < 0.55: 143036 rows (2.07%)
  < 0.50: 80523 rows (1.17%)

✅ Results saved to 'confidence_summary.txt'

In [None]:
%%writefile remove_based_thr.py
import polars as pl

# === Input file -> output file ===
files = {
    "train_corporateTariffCode.parquet": "train_corporateTariffCode_filtered.parquet",
    "test_corporateTariffCode.parquet": "test_corporateTariffCode_filtered.parquet"
}

# === Column names and cut-off ===
confidence_col = "corporateTariffCode_confidence"
target_col = "corporateTariffCode"
threshold = 0.995

# Target value is nulled where confidence is below the cut-off and kept
# otherwise (including rows whose confidence is null).
keep_confident_only = (
    pl.when(pl.col(confidence_col) < threshold)
    .then(None)
    .otherwise(pl.col(target_col))
    .alias(target_col)
)

for input_path, output_path in files.items():
    df = pl.read_parquet(input_path)
    df_cleaned = df.with_columns([keep_confident_only])

    df_cleaned.write_parquet(output_path)
    print(f"✅ Saved: '{output_path}' with low-confidence values removed.")

Writing remove_based_thr.py


In [None]:
%%writefile change_corporateTariffCode_values.py
import polars as pl

# === One (original, filtered, output) path triple per split ===
# NOTE(review): the input is 'train.parquet' / 'test.parquet' while the output
# is '*_filled.parquet'; other steps write the same output names, so confirm
# the files are chained/renamed as intended between steps.
file_pairs = [
    ("train.parquet", "train_corporateTariffCode_filtered.parquet", "train_filled.parquet"),
    ("test.parquet", "test_corporateTariffCode_filtered.parquet", "test_filled.parquet"),
]

# === Column whose gaps are filled from the filtered file ===
column_name = "corporateTariffCode"

for original_path, filtered_path, output_path in file_pairs:
    df_orig = pl.read_parquet(original_path)
    df_filtered = pl.read_parquet(filtered_path)

    # The fill below is positional, so both files must line up row for row.
    # The Id comparison is only evaluated when the row counts already agree.
    same_height = df_orig.height == df_filtered.height
    if not (same_height and (df_orig["Id"] == df_filtered["Id"]).all()):
        raise ValueError(f"❌ '{original_path}' and '{filtered_path}' must have same number of rows and identical 'Id' order.")

    original_values = df_orig[column_name]
    candidate_values = df_filtered[column_name]

    # Take the filtered value only where the original is null and a
    # replacement exists; existing original values are never overwritten.
    needs_fill = original_values.is_null() & candidate_values.is_not_null()
    merged = (
        pl.when(needs_fill)
        .then(candidate_values)
        .otherwise(original_values)
        .alias(column_name)
    )

    df_orig.with_columns(merged).write_parquet(output_path)
    print(f"✅ File saved as '{output_path}'")

Writing change_corporateTariffCode_values.py


In [None]:
%%writefile test_imputation.py
import pandas as pd
import numpy as np
import shutil
import time
from tqdm import tqdm
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# === Timer start ===
start = time.time()

# === Duplicate the training file and read the duplicate ===
original_path = "train.parquet"
copy_path = "train_copy.parquet"
shutil.copy(original_path, copy_path)
df = pd.read_parquet(copy_path)

# === Predictors and the column whose imputation is being validated ===
target_col = 'frequentFlyer'
selected_features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_flightNumber',
    'taxes',
]
cols_needed = [*selected_features, target_col]

# === Keep only rows where every predictor and the target are present ===
df_clean = df[cols_needed].dropna()
X = df_clean[selected_features].copy()
y_raw = df_clean[target_col]

# === Drop classes seen fewer than 2 times (done on raw labels, before encoding) ===
class_counts = y_raw.value_counts()
frequent_enough = y_raw.isin(class_counts[class_counts >= 2].index)
X = X.loc[frequent_enough]
y_raw_filtered = y_raw.loc[frequent_enough]

# === Stratified hold-out of 1,000,000 rows, labels still un-encoded ===
X_train, X_valid, y_train_raw, y_valid_raw = train_test_split(
    X,
    y_raw_filtered,
    test_size=1_000_000,
    stratify=y_raw_filtered,
    random_state=42,
)

# === Restrict both sides to the classes that occur in both ===
common_classes = set(y_train_raw.unique()).intersection(y_valid_raw.unique())
train_mask = y_train_raw.isin(common_classes)
valid_mask = y_valid_raw.isin(common_classes)
X_train, y_train_raw = X_train.loc[train_mask], y_train_raw.loc[train_mask]
X_valid, y_valid_raw = X_valid.loc[valid_mask], y_valid_raw.loc[valid_mask]

# === Encode labels only now, so the encoder is fitted on the filtered training labels ===
le = LabelEncoder().fit(y_train_raw)
y_train = le.transform(y_train_raw)
y_valid = le.transform(y_valid_raw)

# === Integer-encode the categorical predictors ===
cat_columns = ['companyID', 'legs0_segments0_cabinClass', 'legs0_segments0_flightNumber']
label_encoders = {}

print("\n🔤 Encoding categorical features:")
for col in tqdm(cat_columns, desc="Encoding"):
    train_text = X_train[col].astype(str)
    valid_text = X_valid[col].astype(str)
    # Fit on train + validation together so transform never meets an unseen value
    le_col = LabelEncoder().fit(pd.concat([train_text, valid_text], axis=0))
    X_train[col] = le_col.transform(train_text)
    X_valid[col] = le_col.transform(valid_text)
    label_encoders[col] = le_col

# === Classifier configuration (histogram trees, CUDA device) ===
model = XGBClassifier(
    n_estimators=500,
    max_depth=12,
    learning_rate=0.1,
    tree_method='hist',
    device='cuda',
    n_jobs=24,
    random_state=42,
    eval_metric='logloss',
)

print("\n🚀 Training model on GPU...")
# Single-iteration tqdm loop: only there to show a progress line around fit()
for _ in tqdm(range(1), desc="Fitting model"):
    model.fit(X_train, y_train)

# === Predict the validation set in slices of 500,000 rows ===
print("🔍 Predicting on validation set...")
batch_size = 500_000
predicted = []
for offset in tqdm(range(0, len(X_valid), batch_size), desc="Predicting"):
    chunk = X_valid.iloc[offset:offset + batch_size]
    predicted.extend(model.predict(chunk))
predicted = np.array(predicted)

# === Evaluation ===
acc = accuracy_score(y_valid, predicted)
f1 = f1_score(y_valid, predicted, average='weighted')
cm = confusion_matrix(y_valid, predicted)
duration = round(time.time() - start, 2)

# === Write the report in one go ===
report = "".join([
    "✅ Direct prediction results (XGBClassifier + GPU):\n",
    f"Accuracy: {acc:.4f}\n",
    f"F1 Score (weighted): {f1:.4f}\n",
    "Confusion Matrix:\n",
    np.array2string(cm),
    f"\n⏱️ Duration: {duration} seconds\n",
])
with open("imputation_results.txt", "w") as f:
    f.write(report)

print("✅ Results saved to imputation_results.txt")


Overwriting test_imputation.py


In [None]:
✅ Direct prediction results (XGBClassifier + GPU):
Accuracy: 0.7664
F1 Score (weighted): 0.7618
Confusion Matrix:
[[203   0   0 ...   0   0   0]
 [  0  25   0 ...   0   0   0]
 [  0   0 281 ...   0   0   0]
 ...
 [  0   0   0 ...  37   0   0]
 [  0   0   0 ...   0  27   2]
 [  0   0   0 ...   0   2  25]]
⏱️ Duration: 3699.16 seconds

In [None]:
%%writefile impute_frequentFlyer.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import time

# === Start the timer ===
start = time.time()

# === Load both splits; all work on train happens on a deep copy ===
train_original = pd.read_parquet("train.parquet")
test = pd.read_parquet("test.parquet")
train = train_original.copy(deep=True)

# === ID columns are written alongside the imputed values for the later merge ===
id_columns = ['Id', 'ranker_id']
for frame in (train, test):
    if not set(id_columns).issubset(frame.columns):
        raise ValueError("❌ Both train and test files must contain 'Id' and 'ranker_id' columns.")

# === Predictors fed to the model ===
features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_flightNumber',
    'taxes',
]

# === Columns to impute ===
target_columns = ['frequentFlyer']

# === Integer-encode every predictor stored as object dtype in either split ===
cat_features = [
    col for col in features
    if any(frame[col].dtype == 'object' for frame in (train, test))
]
encoders = {}

print("\n🔤 Encoding categorical features:")
for col in tqdm(cat_features, desc="Encoding features"):
    train_text = train[col].astype(str)
    test_text = test[col].astype(str)
    # One encoder fitted on train + test so both splits share the same codes
    le = LabelEncoder().fit(pd.concat([train_text, test_text], axis=0))
    train[col] = le.transform(train_text)
    test[col] = le.transform(test_text)
    encoders[col] = le

# === Function to impute one target column ===
def impute_column(target):
    """Impute missing values of one categorical column with an XGBClassifier.

    Trains on the rows of the module-level ``train`` frame where every column
    in ``features`` and ``target`` is non-null, then fills the missing
    ``target`` values of both ``train`` and ``test`` in place. The highest
    predicted class probability is stored in ``f"{target}_confidence"``.
    Finally ``Id``/``ranker_id``, the target and its confidence are written to
    ``train_<target>.parquet`` and ``test_<target>.parquet``.

    Args:
        target: Name of the column to impute.

    Returns:
        None. Returns early (writing nothing) when no complete training row
        exists.
    """
    print(f"\n🔧 Processing target: '{target}'")
    confidence_col = f"{target}_confidence"
    frames = [(train, "train"), (test, "test")]

    # Only use complete rows for training
    train_valid = train.dropna(subset=features + [target])
    if train_valid.empty:
        print(f"❌ Skipped '{target}': No complete training data.")
        return

    X_train = train_valid[features]
    y_raw = train_valid[target].astype(str)

    # Encode target labels as consecutive integers for the classifier
    le_target = LabelEncoder()
    y_train = le_target.fit_transform(y_raw)

    # === Train model ===
    # NOTE(review): no eval_set is passed to fit(), so eval_metric is
    # presumably never evaluated here — confirm before relying on it.
    model = XGBClassifier(
        n_estimators=500,
        max_depth=12,
        learning_rate=0.1,
        tree_method='hist',
        device='cpu',
        n_jobs=24,
        random_state=42,
        eval_metric='logloss'
    )
    model.fit(X_train, y_train)

    # === Predict and impute missing values ===
    for df, name in frames:
        missing_mask = df[target].isna()
        if not missing_mask.any():
            print(f"✅ {name}: No missing values in '{target}'")
            continue

        # Rows can only be predicted when none of their features is null
        imputable = df.loc[missing_mask, features].dropna()
        if imputable.empty:
            print(f"⚠️ {name}: No imputable rows with all required features for '{target}'.")
            continue

        # Most probable class and its probability
        probs = model.predict_proba(imputable)
        preds = np.argmax(probs, axis=1)
        preds_labels = le_target.inverse_transform(preds)
        confidences = np.max(probs, axis=1)

        # Assign predictions and confidence scores
        df.loc[imputable.index, target] = preds_labels
        df.loc[imputable.index, confidence_col] = confidences
        print(f"✅ {name}: Imputed {len(imputable)} rows for '{target}'")

    # === Summary report ===
    for df, name in frames:
        total = len(df)
        missing = df[target].isna().sum()
        print(f"\n📊 {name} — '{target}':")
        print(f"  Total: {total} rows | Missing: {missing} ({100 * missing / total:.2f}%)")
        print("  Top values:")
        print(df[target].value_counts(dropna=False).head(10))

    # === Save imputed results with ID columns ===
    for df, name in frames:
        # The confidence column only exists if rows were imputed above; when a
        # frame took one of the `continue` branches, create it as all-NaN so
        # the column selection below does not raise KeyError.
        if confidence_col not in df.columns:
            df[confidence_col] = np.nan
        output = df[id_columns + [target, confidence_col]]
        output.to_parquet(f"{name}_{target}.parquet", index=False)
        print(f"💾 Saved: {name}_{target}.parquet")

# === Impute every configured target, one after another ===
for column in target_columns:
    impute_column(column)

# === Report the elapsed wall-clock time ===
elapsed = round(time.time() - start, 2)
print(f"\n⏱️ Total duration: {elapsed} seconds")

Overwriting impute_frequentFlyer.py


In [None]:
🔧 Processing target: 'frequentFlyer'
✅ train: Imputed 12012727 rows for 'frequentFlyer'
✅ test: Imputed 3974920 rows for 'frequentFlyer'

📊 train — 'frequentFlyer':
  Total: 18145372 rows | Missing: 0 (0.00%)
  Top values:
frequentFlyer
SU          11255819
SU/S7        2316269
S7/SU         913382
S7            727887
SU/TK         374475
SU/S7/UT      201526
S7/SU/U6      187879
SU/UT         154466
SU/S7/U6      151065
SU/U6         139020
Name: count, dtype: int64

📊 test — 'frequentFlyer':
  Total: 6897776 rows | Missing: 0 (0.00%)
  Top values:
frequentFlyer
SU          3871364
SU/S7        978840
S7/SU        535440
S7           274501
SU/S7/UT      99802
SU/S7/U6      94280
SU/UT         90012
SU/TK         78246
SU/U6         64699
S7/SU/UT      49531
Name: count, dtype: int64
💾 Saved: train_frequentFlyer.parquet
💾 Saved: test_frequentFlyer.parquet

⏱️ Total duration: 15440.66 seconds

In [None]:
%%writefile threshold_to_use.py
import polars as pl

# Confidence cut-offs to report, strictest first
thresholds = [0.998, 0.996, 0.994, 0.992, 0.99, 0.98, 0.97, 0.96, 0.95, 0.90, 0.85, 0.80, 0.75, 0.70, 0.65, 0.60, 0.55, 0.50]

# Label -> parquet file holding the imputed column and its confidence
files = {
    "train": "train_frequentFlyer.parquet",
    "test": "test_frequentFlyer.parquet"
}

# Column to analyze
confidence_col = "frequentFlyer_confidence"

# Every printed line is also collected here and written to the summary file
output_lines = []

for name, path in files.items():
    df = pl.read_parquet(path)
    total_rows = df.height

    header = f"\n📂 {name.upper()} ({path}) — Total rows: {total_rows}"
    print(header)
    output_lines.append(header)

    for t in thresholds:
        # Number of rows whose confidence is strictly below this cut-off
        count = df.filter(pl.col(confidence_col) < t).height
        # Avoid ZeroDivisionError on an empty file
        share = count / total_rows if total_rows else 0.0
        # Three decimals are required: with two, 0.998 and 0.996 both print as
        # "1.00" and 0.994 / 0.992 / 0.99 all print as "0.99".
        line = f"  < {t:.3f}: {count} rows ({share:.2%})"
        print(line)
        output_lines.append(line)

# Save results to text file (UTF-8 because the header lines contain emoji)
with open("confidence_summary.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(output_lines))

print("\n✅ Results saved to 'confidence_summary.txt'")

Writing threshold_to_use.py


In [None]:
%%writefile remove_based_thr.py
import polars as pl

# === Input parquet -> output parquet, one entry per split ===
files = {
    "train_frequentFlyer.parquet": "train_frequentFlyer_filtered.parquet",
    "test_frequentFlyer.parquet": "test_frequentFlyer_filtered.parquet",
}

# === Column names and the confidence cut-off ===
target_col = "frequentFlyer"
confidence_col = "frequentFlyer_confidence"
threshold = 0.99

# Blank out the target wherever its confidence is below the threshold;
# every other row keeps whatever value the target column already holds.
# NOTE(review): rows with a null confidence are expected to fall through to
# the `otherwise` branch (value kept) — confirm against polars semantics.
masked_target = (
    pl.when(pl.col(confidence_col) < threshold)
    .then(None)
    .otherwise(pl.col(target_col))
    .alias(target_col)
)

for src_path, dst_path in files.items():
    # Read, overwrite the target column with its masked version, write back out
    pl.read_parquet(src_path).with_columns(masked_target).write_parquet(dst_path)
    print(f"✅ Saved: '{dst_path}' with low-confidence values removed.")

Writing remove_based_thr.py


In [None]:
%%writefile change_frequentFlyer_values.py
import polars as pl

# === One (original, filtered, output) path triple per split ===
# NOTE(review): the input is 'train.parquet' / 'test.parquet' while the output
# is '*_filled.parquet'; other steps write the same output names, so confirm
# the files are chained/renamed as intended between steps.
file_pairs = [
    ("train.parquet", "train_frequentFlyer_filtered.parquet", "train_filled.parquet"),
    ("test.parquet", "test_frequentFlyer_filtered.parquet", "test_filled.parquet"),
]

# === Column whose gaps are filled from the filtered file ===
column_name = "frequentFlyer"

for original_path, filtered_path, output_path in file_pairs:
    df_orig = pl.read_parquet(original_path)
    df_filtered = pl.read_parquet(filtered_path)

    # The fill below is positional, so both files must line up row for row.
    # The Id comparison is only evaluated when the row counts already agree.
    same_height = df_orig.height == df_filtered.height
    if not (same_height and (df_orig["Id"] == df_filtered["Id"]).all()):
        raise ValueError(f"❌ '{original_path}' and '{filtered_path}' must have same number of rows and identical 'Id' order.")

    original_values = df_orig[column_name]
    candidate_values = df_filtered[column_name]

    # Take the filtered value only where the original is null and a
    # replacement exists; existing original values are never overwritten.
    needs_fill = original_values.is_null() & candidate_values.is_not_null()
    merged = (
        pl.when(needs_fill)
        .then(candidate_values)
        .otherwise(original_values)
        .alias(column_name)
    )

    df_orig.with_columns(merged).write_parquet(output_path)
    print(f"✅ File saved as '{output_path}'")

Writing change_frequentFlyer_values.py


In [None]:
%%writefile test_imputation.py
import pandas as pd
import numpy as np
import shutil
import time
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# === Start the timer ===
start = time.time()

# === Duplicate the training file and read the duplicate (.parquet) ===
original_path = "train.parquet"
copy_path = "train_copy.parquet"
shutil.copy(original_path, copy_path)
df = pd.read_parquet(copy_path)

# === Predictors and the numeric column whose imputation is being validated ===
target_col = 'miniRules0_monetaryAmount'
selected_features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_flightNumber',
    'taxes',
]
cols_needed = [*selected_features, target_col]

# === Keep only rows with no missing predictor or target ===
df_full = df[cols_needed].dropna(subset=cols_needed)

# === Hold out a random 1,000,000 rows for validation ===
sample = df_full.sample(n=1_000_000, random_state=42)
X_valid = sample[selected_features].copy()
y_valid = sample[target_col].copy()

# === Everything that was not held out is training data ===
df_train = df_full.drop(index=sample.index, errors='ignore')
X_train = df_train[selected_features].copy()
y_train = df_train[target_col].copy()

# === Integer-encode categorical predictors using the training categories ===
cat_cols = ['companyID', 'legs0_segments0_cabinClass', 'legs0_segments0_flightNumber']
print("\n🔤 Encoding categorical features...")
category_maps = {}

for col in cat_cols:
    codes, uniques = pd.factorize(X_train[col])
    X_train[col] = codes
    # Validation values never seen in training are mapped to -1 by get_indexer
    X_valid[col] = uniques.get_indexer(X_valid[col])
    category_maps[col] = list(uniques)

# === Regressor configuration (histogram trees, CUDA device) ===
model_params = dict(
    n_estimators=2000,
    max_depth=16,
    learning_rate=0.1,
    tree_method='hist',
    device='cuda',
    n_jobs=24,
    random_state=42,
    eval_metric='rmse',
)
model = XGBRegressor(**model_params)
print("\n🚀 Training model...")
model.fit(X_train, y_train)

# === Predict the held-out rows ===
y_pred = model.predict(X_valid)

# === Evaluation metrics ===
mae = mean_absolute_error(y_valid, y_pred)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
r2 = r2_score(y_valid, y_pred)
duration = round(time.time() - start, 2)

# === Write the report in one go ===
report = "".join([
    "✅ Imputation evaluation (XGBRegressor on train_copy.parquet):\n",
    f"MAE: {mae:.2f}\n",
    f"RMSE: {rmse:.2f}\n",
    f"R² Score: {r2:.4f}\n",
    f"⏱️ Duration: {duration} seconds\n",
])
with open("imputation_results.txt", "w") as f:
    f.write(report)

print("✅ Results saved to imputation_results.txt")

Overwriting test_imputation.py


In [None]:
✅ Imputation evaluation (XGBRegressor on train_copy.parquet):
MAE: 253.16
RMSE: 1268.10
R² Score: 0.8554
⏱️ Duration: 279.34 seconds

In [None]:
%%writefile test_imputation.py
import pandas as pd
import numpy as np
import shutil
import time
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# === Start the timer ===
start = time.time()

# === Duplicate the training file and read the duplicate (.parquet) ===
original_path = "train.parquet"
copy_path = "train_copy.parquet"
shutil.copy(original_path, copy_path)
df = pd.read_parquet(copy_path)

# === Predictors and the numeric column whose imputation is being validated ===
target_col = 'miniRules0_monetaryAmount'
selected_features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_flightNumber',
    'taxes',
]
cols_needed = [*selected_features, target_col]

# === Keep only rows with no missing predictor or target ===
df_full = df[cols_needed].dropna(subset=cols_needed)

# === Hold out a random 1,000,000 rows for validation ===
sample = df_full.sample(n=1_000_000, random_state=42)
X_valid = sample[selected_features].copy()
y_valid = sample[target_col].copy()

# === Everything that was not held out is training data ===
df_train = df_full.drop(index=sample.index, errors='ignore')
X_train = df_train[selected_features].copy()
y_train = df_train[target_col].copy()

# === Integer-encode categorical predictors using the training categories ===
cat_cols = ['companyID', 'legs0_segments0_cabinClass', 'legs0_segments0_flightNumber']
print("\n🔤 Encoding categorical features...")
category_maps = {}

for col in cat_cols:
    codes, uniques = pd.factorize(X_train[col])
    X_train[col] = codes
    # Validation values never seen in training are mapped to -1 by get_indexer
    X_valid[col] = uniques.get_indexer(X_valid[col])
    category_maps[col] = list(uniques)

# === Regressor configuration (histogram trees, CUDA device) ===
model_params = dict(
    n_estimators=2000,
    max_depth=16,
    learning_rate=0.1,
    tree_method='hist',
    device='cuda',
    n_jobs=24,
    random_state=42,
    eval_metric='rmse',
)
model = XGBRegressor(**model_params)
print("\n🚀 Training model...")
model.fit(X_train, y_train)

# === Predict the held-out rows ===
y_pred = model.predict(X_valid)

# === Post-process: no negative amounts, then round to whole numbers ===
y_pred = np.clip(y_pred, 0, None)
y_pred = np.round(y_pred)

# === Evaluation metrics ===
mae = mean_absolute_error(y_valid, y_pred)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
r2 = r2_score(y_valid, y_pred)
duration = round(time.time() - start, 2)

# === Write the report in one go ===
report = "".join([
    "✅ Imputation evaluation (XGBRegressor on train_copy.parquet):\n",
    f"MAE: {mae:.2f}\n",
    f"RMSE: {rmse:.2f}\n",
    f"R² Score: {r2:.4f}\n",
    f"⏱️ Duration: {duration} seconds\n",
])
with open("imputation_results.txt", "w") as f:
    f.write(report)

print("✅ Results saved to imputation_results.txt")

Overwriting test_imputation.py


In [None]:
MAE: 249.32
RMSE: 1267.43
R² Score: 0.8555
⏱️ Duration: 263.22 seconds

In [None]:
%%writefile test_imputation.py
import pandas as pd
import numpy as np
import shutil
import time
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# === Start the timer ===
start = time.time()

# === Duplicate the training file and read the duplicate (.parquet) ===
original_path = "train.parquet"
copy_path = "train_copy.parquet"
shutil.copy(original_path, copy_path)
df = pd.read_parquet(copy_path)

# === Predictors and the numeric column whose imputation is being validated ===
target_col = 'miniRules1_monetaryAmount'
selected_features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_flightNumber',
    'taxes',
]
cols_needed = [*selected_features, target_col]

# === Keep only rows with no missing predictor or target ===
df_full = df[cols_needed].dropna(subset=cols_needed)

# === Hold out a random 1,000,000 rows for validation ===
sample = df_full.sample(n=1_000_000, random_state=42)
X_valid = sample[selected_features].copy()
y_valid = sample[target_col].copy()

# === Everything that was not held out is training data ===
df_train = df_full.drop(index=sample.index, errors='ignore')
X_train = df_train[selected_features].copy()
y_train = df_train[target_col].copy()

# === Integer-encode categorical predictors using the training categories ===
cat_cols = ['companyID', 'legs0_segments0_cabinClass', 'legs0_segments0_flightNumber']
print("\n🔤 Encoding categorical features...")
category_maps = {}

for col in cat_cols:
    codes, uniques = pd.factorize(X_train[col])
    X_train[col] = codes
    # Validation values never seen in training are mapped to -1 by get_indexer
    X_valid[col] = uniques.get_indexer(X_valid[col])
    category_maps[col] = list(uniques)

# === Regressor configuration (histogram trees, CUDA device) ===
model_params = dict(
    n_estimators=2000,
    max_depth=16,
    learning_rate=0.1,
    tree_method='hist',
    device='cuda',
    n_jobs=24,
    random_state=42,
    eval_metric='rmse',
)
model = XGBRegressor(**model_params)
print("\n🚀 Training model...")
model.fit(X_train, y_train)

# === Predict the held-out rows ===
y_pred = model.predict(X_valid)

# === Post-process: no negative amounts, then round to whole numbers ===
y_pred = np.clip(y_pred, 0, None)
y_pred = np.round(y_pred)

# === Evaluation metrics ===
mae = mean_absolute_error(y_valid, y_pred)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
r2 = r2_score(y_valid, y_pred)
duration = round(time.time() - start, 2)

# === Write the report in one go ===
report = "".join([
    "✅ Imputation evaluation (XGBRegressor on train_copy.parquet):\n",
    f"MAE: {mae:.2f}\n",
    f"RMSE: {rmse:.2f}\n",
    f"R² Score: {r2:.4f}\n",
    f"⏱️ Duration: {duration} seconds\n",
])
with open("imputation_results.txt", "w") as f:
    f.write(report)

print("✅ Results saved to imputation_results.txt")

Overwriting test_imputation.py


In [None]:
✅ Imputation evaluation (XGBRegressor on train_copy.parquet):
MAE: 385.78
RMSE: 4847.35
R² Score: 0.4761
⏱️ Duration: 339.3 seconds

In [None]:
%%writefile impute_monetary.py
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import time

# === Start the timer ===
start = time.time()

# === Load both splits ===
train = pd.read_parquet("train.parquet")
test = pd.read_parquet("test.parquet")

# === ID columns are written alongside the imputed values for the later merge ===
id_columns = ['Id', 'ranker_id']
for frame in (train, test):
    if not set(id_columns).issubset(frame.columns):
        raise ValueError("❌ Both train and test files must contain 'Id' and 'ranker_id' columns.")

# === Predictors and the numeric columns to impute ===
features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_flightNumber',
    'taxes',
]
target_columns = ['miniRules0_monetaryAmount', 'miniRules1_monetaryAmount']

# === Integer-encode every predictor stored as object dtype in either split ===
cat_features = [
    col for col in features
    if any(frame[col].dtype == 'object' for frame in (train, test))
]
encoders = {}

print("\n🔤 Encoding categorical features:")
for col in tqdm(cat_features, desc="Encoding features"):
    train_text = train[col].astype(str)
    test_text = test[col].astype(str)
    # One encoder fitted on train + test so both splits share the same codes
    le = LabelEncoder().fit(pd.concat([train_text, test_text], axis=0))
    train[col] = le.transform(train_text)
    test[col] = le.transform(test_text)
    encoders[col] = le

# === Impute numeric target columns ===
def impute_column(target):
    """Impute missing values of one numeric column with an XGBRegressor.

    Fits on the rows of the module-level ``train`` frame where ``features``
    and ``target`` are all non-null, then fills the missing ``target`` values
    of ``train`` and ``test`` in place with clipped, rounded predictions.
    For each split that has imputable rows, the ID columns, the target and a
    ``<target>_confidence`` column are written to ``<split>_<target>.parquet``;
    a split with no imputable rows writes nothing.

    Args:
        target: Name of the column to impute.
    """
    print(f"\n🔧 Imputing: '{target}'")
    confidence_col = target + "_confidence"

    complete_rows = train.dropna(subset=features + [target])
    if complete_rows.empty:
        print(f"❌ Skipped '{target}': No valid training data.")
        return

    X_train = complete_rows[features]
    y_train = complete_rows[target].astype(float)
    # Mean of the training target; reference point for the score below
    y_mean = y_train.mean()

    # === Train model ===
    regressor = XGBRegressor(
        n_estimators=2000,
        max_depth=16,
        learning_rate=0.1,
        tree_method='hist',
        device='cuda',
        n_jobs=24,
        random_state=42,
        eval_metric='rmse',
    )
    regressor.fit(X_train, y_train)

    # === Predict & impute, one split at a time ===
    for df, name in [(train, "train"), (test, "test")]:
        # Rows missing the target that still have every feature present
        missing_idx = df.index[df[target].isna().to_numpy()]
        imputable = df.loc[missing_idx, features].dropna()

        if imputable.empty:
            print(f"✅ {name}: No imputable rows for '{target}'")
            continue

        # Non-negative predictions, rounded half-up to whole numbers
        preds = np.floor(np.clip(regressor.predict(imputable), 0, None) + 0.5).astype(float)

        # Score = 1 - relative distance of the prediction from the training
        # mean, clipped to [0, 1].
        # NOTE(review): this is a closeness-to-mean heuristic, not a
        # model-derived confidence — confirm it is what downstream expects.
        confidence = np.clip(1 - np.abs(preds - y_mean) / (y_mean + 1e-8), 0, 1)

        # Fill in imputations (the self-assignment is kept from the original)
        df[target] = df[target].copy()
        df.loc[imputable.index, target] = preds

        # Score column is NaN everywhere except on the imputed rows
        df[confidence_col] = np.nan
        df.loc[imputable.index, confidence_col] = confidence

        # Save
        output = df[id_columns + [target, confidence_col]].copy()
        output.to_parquet(f"{name}_{target}.parquet", index=False)
        print(f"💾 Saved: {name}_{target}.parquet — {len(imputable)} values imputed")

# === Impute every configured target, one after another ===
for column in target_columns:
    impute_column(column)

# === Report the elapsed wall-clock time ===
elapsed = round(time.time() - start, 2)
print(f"\n⏱️ Total duration: {elapsed} seconds")


Writing impute_monetary.py


In [None]:
🔤 Encoding categorical features:
Encoding features: 100%|██████████| 1/1 [00:01<00:00,  1.40s/it]
/home/ionut/anaconda3/envs/test_env/lib/python3.10/site-packages/xgboost/core.py:729: UserWarning: [17:09:27] WARNING: /workspace/src/common/error_msg.cc:58: Falling back to prediction using DMatrix due to mismatched devices. This might lead to higher memory usage and slower performance. XGBoost is running on: cuda:0, while the input data is on: cpu.
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.

This warning will only be shown once.

  return func(**kwargs)

🔧 Imputing: 'miniRules0_monetaryAmount'
💾 Saved: train_miniRules0_monetaryAmount.parquet — 1395743 values imputed
💾 Saved: test_miniRules0_monetaryAmount.parquet — 504405 values imputed

🔧 Imputing: 'miniRules1_monetaryAmount'
💾 Saved: train_miniRules1_monetaryAmount.parquet — 1395743 values imputed
💾 Saved: test_miniRules1_monetaryAmount.parquet — 504405 values imputed

⏱️ Total duration: 584.12 seconds

In [None]:
%%writefile impute_monetary.py
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import time

# === Start timer ===
start = time.time()

# === Load the already partially filled splits ===
train = pd.read_parquet("train_filled.parquet")
test = pd.read_parquet("test_filled.parquet")

# === Predictors and the numeric columns to impute ===
features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_flightNumber',
    'taxes',
]
target_columns = ['miniRules0_monetaryAmount', 'miniRules1_monetaryAmount']

# === Integer-encode every predictor stored as object dtype in either split ===
cat_features = [
    col for col in features
    if any(frame[col].dtype == 'object' for frame in (train, test))
]
print("\n🔤 Encoding categorical features:")
for col in tqdm(cat_features, desc="Encoding"):
    train_text = train[col].astype(str)
    test_text = test[col].astype(str)
    # One encoder fitted on train + test so both splits share the same codes
    le = LabelEncoder().fit(pd.concat([train_text, test_text], axis=0))
    train[col] = le.transform(train_text)
    test[col] = le.transform(test_text)

# === Lines collected here are written to the log file at the end ===
log_lines = []

def impute_column(target):
    """Impute missing values of one numeric column with an XGBRegressor.

    Fits on the rows of the module-level ``train`` frame where ``features``
    and ``target`` are all non-null, then fills the missing ``target`` values
    of ``train`` and ``test`` in place with clipped, rounded predictions.
    Summary statistics of each filled column are appended to the module-level
    ``log_lines`` list; nothing is written to disk here.

    Args:
        target: Name of the column to impute.
    """
    print(f"\n🔧 Imputing: {target}")
    log_lines.append(f"\n📌 {target}")

    complete_rows = train.dropna(subset=features + [target])
    if complete_rows.empty:
        print(f"❌ Skipped: No training data for '{target}'")
        log_lines.append("❌ No valid training data")
        return

    X_train = complete_rows[features]
    y_train = complete_rows[target].astype(float)

    regressor = XGBRegressor(
        n_estimators=2000,
        max_depth=16,
        learning_rate=0.1,
        tree_method='hist',
        device='cuda',
        n_jobs=24,
        random_state=42,
        eval_metric='rmse',
    )
    regressor.fit(X_train, y_train)

    for df, name in [(train, "train"), (test, "test")]:
        # Rows missing the target that still have every feature present
        missing_idx = df.index[df[target].isna().to_numpy()]
        imputable = df.loc[missing_idx, features].dropna()

        if imputable.empty:
            print(f"✅ {name}: No imputable values for '{target}'")
            continue

        # Non-negative predictions, rounded half-up to whole numbers
        preds = np.floor(np.clip(regressor.predict(imputable), 0, None) + 0.5).astype(float)
        df.loc[imputable.index, target] = preds

        # Console message plus a log entry describing the column after filling
        imputed_count = len(imputable)
        print(f"✅ {name}: Imputed {imputed_count} rows")

        stats = df[target].describe().to_string()
        top_vals = df[target].value_counts().head(5).to_string()
        log_lines.extend([
            f"\n📂 {name}: {imputed_count} values imputed",
            "📊 Stats:\n" + stats,
            "🏆 Top values:\n" + top_vals,
        ])

# === Impute every configured target, one after another ===
for column in target_columns:
    impute_column(column)

# === Write both splits back over the files they were read from ===
for frame, destination in ((train, "train_filled.parquet"), (test, "test_filled.parquet")):
    frame.to_parquet(destination, index=False)

# === Dump the collected log lines ===
log_text = "\n".join(log_lines)
with open("monetary_imputation_log.txt", "w") as f:
    f.write(log_text)

print("\n📝 Saved log to 'monetary_imputation_log.txt'")
elapsed = round(time.time() - start, 2)
print(f"⏱️ Done in {elapsed} seconds")

Overwriting impute_monetary.py


In [None]:
📌 miniRules0_monetaryAmount

📂 train: 1395743 values imputed
📊 Stats:
count    1.814537e+07
mean     2.489853e+03
std      3.278354e+03
min      0.000000e+00
25%      0.000000e+00
50%      2.800000e+03
75%      2.800000e+03
max      5.022370e+05
🏆 Top values:
miniRules0_monetaryAmount
2800.0    8796221
0.0       5619739
4000.0     494350
2300.0     433760
4600.0     268734

📂 test: 504405 values imputed
📊 Stats:
count    6.897776e+06
mean     2.693489e+03
std      3.984846e+03
min      0.000000e+00
25%      0.000000e+00
50%      2.800000e+03
75%      2.800000e+03
max      2.431870e+05
🏆 Top values:
miniRules0_monetaryAmount
2800.0    3520728
0.0       2038554
4000.0     166146
2300.0     131509
4600.0      41998

📌 miniRules1_monetaryAmount

📂 train: 1395743 values imputed
📊 Stats:
count    1.814537e+07
mean     1.362107e+03
std      5.742862e+03
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      2.800000e+03
max      7.161273e+06
🏆 Top values:
miniRules1_monetaryAmount
0.0       11578738
2800.0     3347198
1500.0      677747
3500.0      433574
4600.0      127484

📂 test: 504405 values imputed
📊 Stats:
count    6.897776e+06
mean     1.450065e+03
std      4.028909e+03
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      2.800000e+03
max      4.736350e+05
🏆 Top values:
miniRules1_monetaryAmount
0.0       4501675
2800.0    1236787
1500.0     227284
3500.0     131388
4000.0      21480

In [None]:
%%writefile test_imputation.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import time

# Validation experiment for the statusInfos imputer: train an XGBClassifier on
# rows where the target is known, hold out a random sample of those rows, and
# write accuracy / weighted F1 / confusion matrix to imputation_results.txt.

# === Start the timer ===
start = time.time()

# === Define features and target ===
features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_flightNumber',
    'taxes'
]
target = 'miniRules0_statusInfos'
cols_needed = features + [target]

# === Load the data ===
# Only the feature/target columns are used below, so read just those instead
# of loading the whole table and then deep-copying it.
df = pd.read_parquet("train.parquet", columns=cols_needed)

# === Drop rows with missing values in features or target ===
df = df.dropna(subset=cols_needed)

# === Encode categorical features ===
# Values are cast to str first so mixed-type columns get one consistent encoding.
cat_features = ['companyID', 'legs0_segments0_cabinClass', 'legs0_segments0_flightNumber']
feature_encoders = {}

print("\n🔤 Encoding categorical features:")
for col in tqdm(cat_features, desc="Encoding features"):
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    feature_encoders[col] = le

# === Encode target ===
target_encoder = LabelEncoder()
df[target] = target_encoder.fit_transform(df[target].astype(str))

# === Sample 1,000,000 rows for validation (or fewer if needed) ===
sample_size = min(1_000_000, len(df))
sample = df.sample(n=sample_size, random_state=42)
X_valid = sample[features]
y_valid = sample[target]

# === Use remaining data for training ===
train_df = df.drop(sample.index)
if train_df.empty:
    # Happens when the complete-case data has <= 1,000,000 rows: the validation
    # sample swallows everything. Fail here with a clear message instead of
    # inside the model fit.
    raise ValueError(
        "No rows left for training after drawing the validation sample "
        f"({sample_size} of {len(df)} rows)."
    )
X_train = train_df[features]
y_train = train_df[target]

# === Train the classifier ===
model = XGBClassifier(
    n_estimators=2000,
    max_depth=14,
    learning_rate=0.1,
    tree_method='hist',
    device='cuda',
    n_jobs=24,
    random_state=42,
    eval_metric='logloss'
)
model.fit(X_train, y_train)

# === Predict and evaluate ===
preds = model.predict(X_valid)
acc = accuracy_score(y_valid, preds)
f1 = f1_score(y_valid, preds, average='weighted')
cm = confusion_matrix(y_valid, preds)
duration = round(time.time() - start, 2)

# === Save results ===
# Explicit UTF-8: the report contains emoji, which the platform default
# encoding cannot always represent.
with open("imputation_results.txt", "w", encoding="utf-8") as f:
    f.write("✅ Imputation Evaluation Results:\n")
    f.write(f"Accuracy: {acc:.4f}\n")
    f.write(f"F1 Score (weighted): {f1:.4f}\n")
    f.write("Confusion Matrix:\n")
    f.write(np.array2string(cm))
    f.write(f"\n⏱️ Duration: {duration} seconds\n")

# === Also print to console ===
print("✅ Results saved to imputation_results.txt")

Overwriting test_imputation.py


In [None]:
✅ Imputation Evaluation Results:
Accuracy: 0.9936
F1 Score (weighted): 0.9934
Confusion Matrix:
[[ 21279   4158]
 [  2288 972275]]
⏱️ Duration: 107.54 seconds

In [None]:
%%writefile impute_statusInfos.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from tqdm import tqdm
import time

# === Start the timer ===
start = time.time()

# === Load data ===
# Both frames are mutated in place below (feature encoding, imputed targets).
# They come straight from disk, so no extra deep copy of the training table is
# kept around; that copy was never read again and only doubled peak memory.
train = pd.read_parquet("train.parquet")
test = pd.read_parquet("test.parquet")

# === Keep ID columns for final merge ===
id_columns = ['Id', 'ranker_id']
if not all(col in train.columns for col in id_columns) or not all(col in test.columns for col in id_columns):
    raise ValueError("❌ Both train and test files must contain 'Id' and 'ranker_id' columns.")

# === Features used for prediction ===
features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_flightNumber',
    'taxes'
]

# === Targets to impute ===
target_columns = ['miniRules0_statusInfos', 'miniRules1_statusInfos']

# === Encode categorical features ===
# A feature is treated as categorical when it is object-typed in either frame.
cat_features = [col for col in features if train[col].dtype == 'object' or test[col].dtype == 'object']
encoders = {}

print("\n🔤 Encoding categorical features:")
for col in tqdm(cat_features, desc="Encoding features"):
    # Codes are learned on train only. pd.factorize encodes NaN as -1 and
    # Index.get_indexer returns -1 for values not present in `uniques`, so
    # missing and unseen categories both become -1 (and are no longer NaN
    # for the dropna() calls that follow).
    train[col], uniques = pd.factorize(train[col])
    test[col] = uniques.get_indexer(test[col])
    encoders[col] = uniques

# === Function to impute one target column ===
def impute_column(target):
    """Impute missing values of ``target`` in the global train/test frames.

    Trains an XGBClassifier on the train rows where all ``features`` and
    ``target`` are present, then fills missing ``target`` values in both
    ``train`` and ``test`` (in place) for rows whose features are complete.
    The winning class probability is stored in ``<target>_confidence``.
    Finally prints a summary and writes ``<name>_<target>.parquet`` holding
    the ID columns, the target and the confidence for each dataset.

    Args:
        target: Name of the column to impute.

    Returns:
        None. Returns early (without writing files) when no complete training
        rows exist for ``target``.
    """
    print(f"\n🔧 Processing target: '{target}'")
    confidence_col = f"{target}_confidence"

    train_valid = train.dropna(subset=features + [target])
    if train_valid.empty:
        print(f"❌ Skipped '{target}': No complete training data.")
        return

    X_train = train_valid[features]
    y_train = train_valid[target].astype(float)

    # === Train model ===
    model = XGBClassifier(
        n_estimators=2000,
        max_depth=14,
        learning_rate=0.1,
        tree_method='hist',
        device='cpu',
        n_jobs=24,
        random_state=42,
        eval_metric='logloss'
    )
    model.fit(X_train, y_train)

    datasets = [(train, "train"), (test, "test")]

    # === Predict and impute for both datasets ===
    for df, name in datasets:
        # Create the confidence column up front. Previously it only came into
        # existence when rows were imputed, so the export below raised
        # KeyError for a dataset with nothing to impute.
        if confidence_col not in df.columns:
            df[confidence_col] = np.nan

        missing_mask = df[target].isna()
        if not missing_mask.any():
            print(f"✅ {name}: No missing values in '{target}'")
            continue

        imputable = df.loc[missing_mask, features].dropna()
        if imputable.empty:
            print(f"⚠️ {name}: No imputable rows with all required features for '{target}'.")
            continue

        # Predict
        probs = model.predict_proba(imputable)
        # NOTE(review): uses the argmax column index as the label, i.e. assumes
        # the target classes are encoded 0..k-1 — confirm for new targets.
        preds = np.argmax(probs, axis=1).astype(float)
        confidences = np.max(probs, axis=1)

        # Assign predictions and confidence scores
        df.loc[imputable.index, target] = preds
        df.loc[imputable.index, confidence_col] = confidences
        print(f"✅ {name}: Imputed {len(imputable)} rows for '{target}'")

    # === Summary report ===
    for df, name in datasets:
        total = len(df)
        missing = df[target].isna().sum()
        print(f"\n📊 {name} — '{target}':")
        print(f"  Total: {total} rows | Missing: {missing} ({100 * missing / total:.2f}%)")
        print("  Top values:")
        print(df[target].value_counts(dropna=False).head(10))

    # === Save imputed results ===
    for df, name in datasets:
        output = df[id_columns + [target, confidence_col]]
        output.to_parquet(f"{name}_{target}.parquet", index=False)
        print(f"💾 Saved: {name}_{target}.parquet")

# === Run for each target column ===
for target in target_columns:
    impute_column(target)

# === Done ===
print(f"\n⏱️ Total duration: {round(time.time() - start, 2)} seconds")

Writing impute_statusInfos.py


In [None]:
%%writefile impute_statusInfos.py
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from tqdm import tqdm
import time

# === Start the timer ===
start = time.time()

# === Load data ===
# Inputs are the files produced by the earlier imputation steps.
train, test = (
    pd.read_parquet(path)
    for path in ("train_filled.parquet", "test_filled.parquet")
)

# === Features used for prediction ===
features = [
    'totalPrice',
    'companyID',
    'legs0_segments0_cabinClass',
    'isAccess3D',
    'isVip',
    'legs0_segments0_flightNumber',
    'taxes',
]

# === Targets to impute ===
target_columns = ['miniRules0_statusInfos', 'miniRules1_statusInfos']

# === Encode categorical features ===
# A feature counts as categorical when it is object-typed in either frame.
cat_features = []
for feature_name in features:
    if any(frame[feature_name].dtype == 'object' for frame in (train, test)):
        cat_features.append(feature_name)

print("\n🔤 Encoding categorical features:")
for feature_name in tqdm(cat_features, desc="Encoding features"):
    # Integer codes are learned on train and re-used for test; values that
    # have no code in train (including NaN) come out as -1.
    codes, categories = pd.factorize(train[feature_name])
    train[feature_name] = codes
    test[feature_name] = categories.get_indexer(test[feature_name])

# === Prepare report file ===
# Lines are collected here and written to disk at the end of the script.
report_lines = [
    f"📝 Imputation Report — {time.strftime('%Y-%m-%d %H:%M:%S')}\n",
    "=" * 60 + "\n",
]

# === Function to impute one target column ===
def impute_column(target):
    """Impute missing ``target`` values in the global train/test frames.

    Fits an XGBClassifier on train rows where every feature and the target
    are present, then overwrites missing ``target`` values in ``train`` and
    ``test`` (in place) for rows with complete features. Progress and summary
    statistics are appended to the global ``report_lines`` list.

    Args:
        target: Name of the column to impute.

    Returns:
        None. Returns early when no complete training rows exist.
    """
    report_lines.append(f"\n🔧 Target: {target}\n")

    train_valid = train.dropna(subset=features + [target])
    if train_valid.empty:
        report_lines.append(f"❌ Skipped: No valid training data for {target}\n")
        return

    X_train = train_valid[features]
    y_train = train_valid[target].astype(float)

    model = XGBClassifier(
        n_estimators=2000,
        max_depth=14,
        learning_rate=0.1,
        tree_method='hist',
        device='cpu',
        n_jobs=24,
        random_state=42,
        eval_metric='logloss'
    )
    model.fit(X_train, y_train)

    for df, name in [(train, "train"), (test, "test")]:
        missing_mask = df[target].isna()
        total_missing_before = int(missing_mask.sum())

        # Nothing to do is not a problem: report it as such instead of falling
        # through to the "no imputable rows" warning below.
        if total_missing_before == 0:
            report_lines.append(f"✅ {name}: No missing values in {target}\n")
            continue

        # Only rows with every feature present can be scored by the model.
        imputable = df.loc[missing_mask, features].dropna()
        imputable_indices = imputable.index
        if imputable.empty:
            report_lines.append(f"⚠️ {name}: No imputable rows with all features for {target}\n")
            continue

        preds = model.predict(imputable).astype(float)
        df.loc[imputable_indices, target] = preds
        total_imputed = len(imputable_indices)

        # Summary stats
        total_rows = len(df)
        total_missing_after = df[target].isna().sum()
        top_values = df[target].value_counts(dropna=False).head(10)

        report_lines.append(f"✅ {name}:\n")
        report_lines.append(f"  Missing before: {total_missing_before}\n")
        report_lines.append(f"  Imputed rows: {total_imputed}\n")
        report_lines.append(f"  Total rows: {total_rows}\n")
        report_lines.append(f"  Remaining missing: {total_missing_after} ({100 * total_missing_after / total_rows:.2f}%)\n")
        report_lines.append("  Top 10 values after imputation:\n")
        for val, count in top_values.items():
            report_lines.append(f"    {val}: {count}\n")

# === Run imputation ===
for column in target_columns:
    impute_column(column)

# === Overwrite the modified DataFrames ===
# NOTE(review): the object-typed feature columns were factorized in place
# earlier in this script, so the files written here carry integer codes for
# those columns rather than the original values — confirm this is intended.
for frame, path in ((train, "train_filled.parquet"), (test, "test_filled.parquet")):
    frame.to_parquet(path, index=False)

# === Write report to file ===
# Every entry is written newline-terminated, adding one where it is missing.
with open("imputation_report.txt", "w", encoding="utf-8") as report_file:
    for entry in report_lines:
        report_file.write(entry if entry.endswith('\n') else entry + '\n')

# === Done ===
elapsed = round(time.time() - start, 2)
print(f"\n💾 Overwritten 'train_filled.parquet' and 'test_filled.parquet'")
print(f"📝 Saved imputation report to 'imputation_report.txt'")
print(f"⏱️ Total duration: {elapsed} seconds")

Writing impute_statusInfos.py


In [None]:
📝 Imputation Report — 2025-07-19 19:16:42
============================================================

🔧 Target: miniRules0_statusInfos
✅ train:
  Imputed rows: 1469953
  Total rows: 18145372
  Remaining missing: 0 (0.00%)
  Top 10 values after imputation:
    1.0: 17694481
    0.0: 450891
✅ test:
  Imputed rows: 550192
  Total rows: 6897776
  Remaining missing: 0 (0.00%)
  Top 10 values after imputation:
    1.0: 6729060
    0.0: 168716

🔧 Target: miniRules1_statusInfos
✅ train:
  Imputed rows: 1518169
  Total rows: 18145372
  Remaining missing: 0 (0.00%)
  Top 10 values after imputation:
    1.0: 10528023
    0.0: 7617349
✅ test:
  Imputed rows: 574432
  Total rows: 6897776
  Remaining missing: 0 (0.00%)
  Top 10 values after imputation:
    1.0: 3859158
    0.0: 3038618

In [None]:
%%writefile check.py
import pandas as pd

# === Columns to check ===
cols_to_check = [
    "corporateTariffCode",
    "miniRules0_monetaryAmount",
    "miniRules1_monetaryAmount",
    "miniRules0_statusInfos",
    "miniRules1_statusInfos",
]

# === Load parquet files ===
train = pd.read_parquet("train_filled.parquet")
test = pd.read_parquet("test_filled.parquet")


# === Missing value reporting function ===
def report_missing(df, name):
    """Print the count and percentage of missing values per checked column.

    Args:
        df: DataFrame containing every column listed in ``cols_to_check``.
        name: Label printed in the report header.
    """
    n_rows = len(df)
    print(f"\n📊 Missing value report for: {name}")
    for column in cols_to_check:
        n_missing = df[column].isna().sum()
        share = 100 * n_missing / n_rows
        print(f"  {column:30s} - Missing: {n_missing:6d} / {n_rows} ({share:5.2f}%)")


# === Run the reports ===
for frame, label in ((train, "train_filled.parquet"), (test, "test_filled.parquet")):
    report_missing(frame, label)

Writing check.py


In [None]:
📊 Missing value report for: train_filled.parquet
  corporateTariffCode            - Missing: 3789156 / 18145372 (20.88%)
  miniRules0_monetaryAmount      - Missing:      0 / 18145372 ( 0.00%)
  miniRules1_monetaryAmount      - Missing:      0 / 18145372 ( 0.00%)
  miniRules0_statusInfos         - Missing:      0 / 18145372 ( 0.00%)
  miniRules1_statusInfos         - Missing:      0 / 18145372 ( 0.00%)

📊 Missing value report for: test_filled.parquet
  corporateTariffCode            - Missing: 1480358 / 6897776 (21.46%)
  miniRules0_monetaryAmount      - Missing:      0 / 6897776 ( 0.00%)
  miniRules1_monetaryAmount      - Missing:      0 / 6897776 ( 0.00%)
  miniRules0_statusInfos         - Missing:      0 / 6897776 ( 0.00%)
  miniRules1_statusInfos         - Missing:      0 / 6897776 ( 0.00%)

# **Feature Engineering**

In [None]:
%%writefile column_differencies.py
import pandas as pd

# Compare the column sets of train.csv and test.csv.

# Read only the header row of each CSV: nrows=0 yields an empty frame that
# still carries the column names, so the data rows are never parsed.
train_df = pd.read_csv('train.csv', nrows=0)
test_df = pd.read_csv('test.csv', nrows=0)

# Get the sets of column names
train_columns = set(train_df.columns)
test_columns = set(test_df.columns)

# Columns that are in train but not in test
only_in_train = train_columns - test_columns

# Columns that are in test but not in train
only_in_test = test_columns - train_columns

# Print the results
print("Columns only in train.csv:", only_in_train)
print("Columns only in test.csv:", only_in_test)

Writing column_differencies.py


Columns only in train.csv: {'selected', 'legs0_segments0_seatsAvailable_missing_initially'}

Columns only in test.csv: set()

In [None]:
%%writefile remove_legs0_segments0_seatsAvailable_missing_initially.py
import pandas as pd

# Column that should be removed from train.csv
column_to_remove = 'legs0_segments0_seatsAvailable_missing_initially'

# Load the CSV file
train_df = pd.read_csv('train.csv')

# Rewrite the file only when the column is actually present
if column_to_remove not in train_df.columns:
    print(f"Column '{column_to_remove}' not found in train.csv.")
else:
    train_df = train_df.drop(column_to_remove, axis=1)
    train_df.to_csv('train.csv', index=False)
    print(f"Column '{column_to_remove}' has been removed and train.csv has been updated.")

Writing remove_legs0_segments0_seatsAvailable_missing_initially.py


In [None]:
%%writefile check_columns.py
import pandas as pd

# Print the column names of train.csv and test.csv.

# Load only the header row of each CSV: nrows=0 returns an empty frame with
# the column names, so the data rows are never parsed.
train_df = pd.read_csv('train.csv', nrows=0)
test_df = pd.read_csv('test.csv', nrows=0)

# Print column names
print("Columns in train.csv:")
print(train_df.columns.tolist())

print("\nColumns in test.csv:")
print(test_df.columns.tolist())

Writing check_columns.py


Columns in train.csv:
['Id', 'bySelf', 'companyID', 'corporateTariffCode', 'frequentFlyer', 'nationality', 'isAccess3D', 'isVip', 'legs0_arrivalAt', 'legs0_departureAt', 'legs0_duration', 'legs0_segments0_aircraft_code', 'legs0_segments0_arrivalTo_airport_city_iata', 'legs0_segments0_arrivalTo_airport_iata', 'legs0_segments0_baggageAllowance_quantity', 'legs0_segments0_baggageAllowance_weightMeasurementType', 'legs0_segments0_cabinClass', 'legs0_segments0_departureFrom_airport_iata', 'legs0_segments0_duration', 'legs0_segments0_flightNumber', 'legs0_segments0_marketingCarrier_code', 'legs0_segments0_operatingCarrier_code', 'legs0_segments0_seatsAvailable', 'legs1_duration', 'legs1_segments0_aircraft_code', 'legs1_segments0_arrivalTo_airport_city_iata', 'legs1_segments0_arrivalTo_airport_iata', 'legs1_segments0_baggageAllowance_quantity', 'legs1_segments0_baggageAllowance_weightMeasurementType', 'legs1_segments0_cabinClass', 'legs1_segments0_departureFrom_airport_iata', 'legs1_segments0_duration', 'legs1_segments0_marketingCarrier_code', 'legs1_segments0_operatingCarrier_code', 'legs1_segments0_seatsAvailable', 'miniRules0_monetaryAmount', 'miniRules0_statusInfos', 'miniRules1_monetaryAmount', 'miniRules1_statusInfos', 'pricingInfo_isAccessTP', 'pricingInfo_passengerCount', 'profileId', 'ranker_id', 'requestDate', 'searchRoute', 'sex', 'taxes', 'totalPrice', 'selected', '__index_level_0__', 'legs0_segments0_baggageAllowance_missing_initially', 'miniRules0_statusInfos_was_missing', 'miniRules1_statusInfos_was_missing', 'legs1_segments0_aircraft_code_was_missing', 'legs1_departureAt_hour', 'legs1_departureAt_minute', 'legs1_departureAt_is_weekend', 'legs1_departureAt_day', 'legs1_departureAt_month', 'legs1_departureAt_year', 'legs1_departureAt_part_of_day', 'legs1_departureAt_hour_sin', 'legs1_departureAt_hour_cos', 'legs1_arrivalAt_hour', 'legs1_arrivalAt_minute', 'legs1_arrivalAt_is_weekend', 'legs1_arrivalAt_day', 'legs1_arrivalAt_month', 'legs1_arrivalAt_year', 
'legs1_arrivalAt_part_of_day', 'legs1_arrivalAt_hour_sin', 'legs1_arrivalAt_hour_cos']


Columns in test.csv:
['Id', 'bySelf', 'companyID', 'corporateTariffCode', 'frequentFlyer', 'nationality', 'isAccess3D', 'isVip', 'legs0_arrivalAt', 'legs0_departureAt', 'legs0_duration', 'legs0_segments0_aircraft_code', 'legs0_segments0_arrivalTo_airport_city_iata', 'legs0_segments0_arrivalTo_airport_iata', 'legs0_segments0_baggageAllowance_quantity', 'legs0_segments0_baggageAllowance_weightMeasurementType', 'legs0_segments0_cabinClass', 'legs0_segments0_departureFrom_airport_iata', 'legs0_segments0_duration', 'legs0_segments0_flightNumber', 'legs0_segments0_marketingCarrier_code', 'legs0_segments0_operatingCarrier_code', 'legs0_segments0_seatsAvailable', 'legs1_duration', 'legs1_segments0_aircraft_code', 'legs1_segments0_arrivalTo_airport_city_iata', 'legs1_segments0_arrivalTo_airport_iata', 'legs1_segments0_baggageAllowance_quantity', 'legs1_segments0_baggageAllowance_weightMeasurementType', 'legs1_segments0_cabinClass', 'legs1_segments0_departureFrom_airport_iata', 'legs1_segments0_duration', 'legs1_segments0_marketingCarrier_code', 'legs1_segments0_operatingCarrier_code', 'legs1_segments0_seatsAvailable', 'miniRules0_monetaryAmount', 'miniRules0_statusInfos', 'miniRules1_monetaryAmount', 'miniRules1_statusInfos', 'pricingInfo_isAccessTP', 'pricingInfo_passengerCount', 'profileId', 'ranker_id', 'requestDate', 'searchRoute', 'sex', 'taxes', 'totalPrice', '__index_level_0__', 'legs0_segments0_baggageAllowance_missing_initially', 'miniRules0_statusInfos_was_missing', 'miniRules1_statusInfos_was_missing', 'legs1_segments0_aircraft_code_was_missing', 'legs1_departureAt_hour', 'legs1_departureAt_minute', 'legs1_departureAt_is_weekend', 'legs1_departureAt_day', 'legs1_departureAt_month', 'legs1_departureAt_year', 'legs1_departureAt_part_of_day', 'legs1_departureAt_hour_sin', 'legs1_departureAt_hour_cos', 'legs1_arrivalAt_hour', 'legs1_arrivalAt_minute', 'legs1_arrivalAt_is_weekend', 'legs1_arrivalAt_day', 'legs1_arrivalAt_month', 'legs1_arrivalAt_year', 'legs1_arrivalAt_part_of_day', 
'legs1_arrivalAt_hour_sin', 'legs1_arrivalAt_hour_cos']

# **Feature Importance**

In [None]:
0                                           is_min_segments_leg0  2512.625244
1                                                 is_direct_leg0   252.625687
2                                                n_segments_leg0   222.089539
3                                                  free_exchange   170.093369
4                                                n_segments_leg1   115.357239
5                                               is_cross_country    55.688885
6                                                    both_direct    51.249695
7                                legs0_segments0_cabinClass_is_2    51.106506
8                                           is_min_segments_leg1    49.711380
9                                legs0_segments0_cabinClass_is_1    49.309460
10                                                   free_cancel    40.257221
11                                        pricingInfo_isAccessTP    34.591415
12                                          is_major_carrier_0_0    26.813438
13                                                         isVip    25.694996
14                                            contains_capitials    22.130260
15                     legs0_segments0_baggageAllowance_quantity    21.239143
16                                            is_direct_cheapest    20.692394
17                                           is_exact_round_trip    19.788628
18                                            corporate_vip_flag    18.943798
19                                           is_cheaper_than_avg    18.558277
20      legs0_segments0_marketingCarrier_code_log_selected_count    18.139412
21                               legs1_segments0_cabinClass_is_2    17.824265
22                                                legs0_duration    16.884428
23                                          rank_interaction_sum    16.179651
24                                    corporate_policy_compliant    15.583480
25        legs0_segments0_baggageAllowance_weightMeasurementType    15.426142
26         legs0_segments0_marketingCarrier_code_log_total_count    15.376082
27                                                is_direct_leg1    14.998496
28                                            has_business_class    14.756565
29                                arrival_airport_Country_CodeA2    14.615512
30     legs0_segments0_marketingCarrier_code_cabin2_select_ratio    14.346004
31                               legs0_segments0_cabinClass_is_4    14.239492
32                                        miniRules1_statusInfos    14.119366
33                                                    isAccess3D    13.768279
34                                          rank_interaction_mul    13.539866
35       legs0_segments0_marketingCarrier_code_company_diversity    12.877114
36                                              is_top3_cheapest    12.729091
37                                                    is_one_way    12.495940
38                                                 is_short_trip    12.136695
39                               legs1_segments0_cabinClass_is_1    11.995763
40      legs1_segments0_marketingCarrier_code_log_selected_count    11.955055
41                                        duration_quantile_rank    11.719626
42                                                total_duration    11.601267
43                                                   nationality    11.372430
44                                              is_popular_route    11.370732
45          legs0_segments0_marketingCarrier_code_user_diversity    11.263096
46                                        outbound_route_hotness    11.175965
47          legs0_segments0_marketingCarrier_code_avg_price_rank    10.893168
48                                                legs1_duration    10.702594
49       legs0_segments0_marketingCarrier_code_avg_duration_rank    10.447308
50                                              legs0_is_red_eye    10.440387
51                                               company_avg_pct    10.395974
52                                         price_relative_to_min    10.156937
53         legs1_segments0_marketingCarrier_code_log_total_count    10.141954
54             legs0_segments0_marketingCarrier_code_night_ratio    10.033763
55                                               fee_ratio_rule0     9.780451
56        legs0_segments0_marketingCarrier_code_in_frequentFlyer     9.732109
57                                        miniRules0_statusInfos     9.699606
58                                        legs1_departureAt_hour     9.657551
59                                          return_route_hotness     9.541995
60                                              carrier_pop_prod     9.521894
61     legs0_segments0_marketingCarrier_code_cabin1_select_ratio     9.358589
62                                    fee_ratio_rule1_is_missing     9.277889
63                                               fee_ratio_rule1     9.266473
64                               legs1_segments0_cabinClass_is_4     8.972970
65                         legs0_segments0_marketingCarrier_code     8.833404
66                                                   is_vip_freq     8.596709
67                                      legs0_segments0_duration     8.381750
68                                          legs1_arrivalAt_hour     8.328686
69     legs0_segments0_marketingCarrier_code_cabin4_select_ratio     8.159398
70                                             all_cabin_level_1     8.093809
71                                            return_destination     7.964754
72                                       stay_duration_hours_log     7.906754
73                                      price_zscore_from_median     7.744307
74                              departure_airport_Country_CodeA2     7.705009
75                         legs0_segments0_operatingCarrier_code     7.601945
76                                    fee_ratio_rule0_is_missing     7.598970
77                                        legs0_departureAt_hour     7.588921
78                              arrival_airport_UTC_Offset_Hours     7.545106
79   legs0_segments0_marketingCarrier_code_is_only_frequentFlyer     7.497924
80                                                     log_price     7.424724
81                                           price_quantile_rank     7.422409
82                                                group_size_log     7.409439
83                                                 n_ff_programs     7.393448
84                                   z_price_vs_company_selected     7.282055
85                                                     log_taxes     7.224535
86                                   leg0_duration_quantile_rank     7.219212
87                                                duration_ratio     7.215049
88                                          legs0_arrivalAt_hour     7.201886
89                                 legs1_arrivalAt_business_time     7.185959
90                                            std_selected_price     7.181143
91                                      legs1_segments0_duration     7.174419
92                                            avg_selected_price     7.147635
93                                         selected_direct_ratio     7.035986
94                                          selected_night_ratio     7.026170
95                               legs0_departureAt_business_time     7.021380
96                                               company_avg_dct     6.930465
97                                   leg1_duration_quantile_rank     6.926739
98                                                 timezone_diff     6.918768
99                                   corporateTariffCode_hotness     6.907644
100                                     log_company_select_count     6.899118
101                                         is_expensive_outlier     6.885046
102                       legs0_segments0_arrivalTo_airport_iata     6.838060
103                                                 return_route     6.811344
104                                    legs1_departureAt_weekday     6.796517
105                                       is_codeshare_leg0_seg0     6.763454
106                                      leg_dur_interaction_mul     6.718515
107                           departure_airport_UTC_Offset_Hours     6.712487
108                  legs0_segments0_arrivalTo_airport_city_iata     6.699299
109                                          corporateTariffCode     6.671411
110                              legs1_departureAt_business_time     6.657274
111                                           is_direct_shortest     6.653126
112                                                return_origin     6.596240
113                                      user_selected_count_log     6.584061
114                                legs0_arrivalAt_business_time     6.580869
115                   legs0_segments0_departureFrom_airport_iata     6.561128
116                                         rank_interaction_sub     6.503201
117                                      leg_dur_interaction_sub     6.495512
118                                      legs1_arrivalAt_weekday     6.473288
119                                                    companyID     6.439044
120                                   legs0_is_business_friendly     6.423227
121                                               outbound_route     6.401344
122                                              outbound_origin     6.395598
123                                legs0_segments0_aircraft_code     6.348318
124                                                     tax_rate     6.331712
125                                    legs0_departureAt_weekday     6.307673
126                                    leg_dur_interaction_ratio     6.293390
127                                           hours_to_departure     6.235342
128                                      legs0_arrivalAt_weekday     5.934218
129                                         outbound_destination     5.882819
130                               legs0_segments0_seatsAvailable     5.388764
131                                         has_corporate_tariff     5.297897
132                                                          sex     5.200131

# **Data standardization/normalization**

# **Feature.py**

In [None]:
import polars as pl
import numpy as np
from haversine import haversine
from scipy.stats import gaussian_kde
from polars.datatypes import Struct, List
from .utils import timer


###########################################################
#                     Feature Engineering                 #
###########################################################


def dur_to_min(col):
    """Build an expression converting a duration string column to minutes.

    The result is ``days * 1440 + hours * 60 + minutes`` where:
    days are the digits before a leading ``"<digits>."`` prefix, hours are
    the leading digits before the first ``:`` (after that prefix is removed),
    and minutes are the digits enclosed between two ``:``. Any part that
    cannot be extracted counts as 0, so null input yields 0.

    NOTE(review): presumably the input looks like ``[D.]HH:MM:SS`` — confirm
    against the data.
    """

    def _int_part(expr, pattern):
        # First capture group as Int64, with 0 when the pattern does not match.
        return expr.str.extract(pattern, 1).cast(pl.Int64).fill_null(0)

    # Time-of-day portion: the string with an optional day prefix removed.
    has_day_prefix = col.str.contains(r"^\d+\.")
    without_days = col.str.replace(r"^\d+\.", "")
    clock = pl.when(has_day_prefix).then(without_days).otherwise(col)

    day_minutes = _int_part(col, r"^(\d+)\.") * 1440
    hour_minutes = _int_part(clock, r"^(\d+):") * 60
    minute_part = _int_part(clock, r":(\d+):")

    total = day_minutes + hour_minutes + minute_part
    return total.fill_null(0)


def kde_mode(arr):
    """Estimate the mode of a numeric sample with a Gaussian KDE.

    NaN entries are ignored. The density is evaluated on 1000 evenly spaced
    points between the sample minimum and maximum, and the point with the
    highest density is returned.

    Args:
        arr: Array-like of numbers (anything ``np.asarray`` can turn into a
            float array); may contain NaN.

    Returns:
        The estimated mode as a float. ``nan`` when no non-NaN values remain;
        the value itself when the sample has a single element or is constant.
    """
    values = np.asarray(arr, dtype=float)
    values = values[~np.isnan(values)]

    if values.size == 0:
        return np.nan

    lo, hi = values.min(), values.max()
    # gaussian_kde needs at least two elements and a non-degenerate spread;
    # for a single or constant sample the mode is that value.
    if values.size < 2 or lo == hi:
        return lo

    kde = gaussian_kde(values)
    xs = np.linspace(lo, hi, 1000)
    ys = kde(xs)
    return xs[np.argmax(ys)]


@timer
def initial_transformations(df, FULL):
    """Add row-level price, duration, corporate, fee and route features.

    Args:
        df: polars DataFrame holding the raw columns referenced below
            (``totalPrice``, ``taxes``, ``legs0_duration``, ``legs1_duration``,
            ``searchRoute``, ``selected``, ``ranker_id``, ...).
        FULL: Chooses which hard-coded row count delimits the rows used for
            the route "hotness" statistics (18145372 when truthy, otherwise
            16487352).

    Returns:
        The DataFrame with the new feature columns added, ``searchRoute``
        replaced by its parsed components, and several nullable columns
        filled with sentinel values.
    """
    # Marketing-carrier columns that are actually present in the frame
    mc_cols = [
        f"legs{l}_segments{s}_marketingCarrier_code" for l in (0, 1) for s in range(4)
    ]
    mc_exists = [col for col in mc_cols if col in df.columns]

    df = df.with_columns(
        [
            # Price features (the +1 guards against division by zero)
            (pl.col("totalPrice") / (pl.col("taxes") + 1)).alias("price_per_tax"),
            (pl.col("taxes") / (pl.col("totalPrice") + 1)).alias("tax_rate"),
            pl.col("totalPrice").log1p().alias("log_price"),
            # Duration features
            (
                pl.col("legs0_duration").fill_null(0)
                + pl.col("legs1_duration").fill_null(0)
            ).alias("total_duration"),
            # Outbound/return duration ratio; -1 when there is no return duration
            pl.when(pl.col("legs1_duration").fill_null(0) > 0)
            .then(pl.col("legs0_duration") / pl.col("legs1_duration"))
            .otherwise(-1)
            .alias("duration_ratio"),
            # NOTE(review): despite the "top3" name the threshold is 1, i.e. this
            # flags only the shortest outbound duration within the ranker_id
            # group. Kept as-is; confirm whether 3 was intended.
            (pl.col("legs0_duration").rank("dense").over("ranker_id") <= 1)
            .cast(pl.Int32)
            .alias("is_top3_shortest_duration"),
            # Trip type: one-way when the return leg has no duration or no origin
            (
                pl.col("legs1_duration").is_null()
                | (pl.col("legs1_duration") == 0)
                | pl.col("legs1_segments0_departureFrom_airport_iata").is_null()
            )
            .cast(pl.Int32)
            .alias("is_one_way"),
            # Number of non-null marketing-carrier codes over both legs
            # (the column is named l0_seg but counts legs 0 and 1)
            (
                pl.sum_horizontal(
                    pl.col(col).is_not_null().cast(pl.UInt8) for col in mc_exists
                )
                if mc_exists
                else pl.lit(0)
            ).alias("l0_seg"),
            # Frequent-flyer programs: number of "/" separators, plus one when
            # the string is non-empty
            (
                pl.col("frequentFlyer").fill_null("").str.count_matches("/")
                + (pl.col("frequentFlyer").fill_null("") != "").cast(pl.Int32)
            ).alias("n_ff_programs"),
            pl.col("corporateTariffCode")
            .is_not_null()
            .cast(pl.Int32)
            .alias("has_corporate_tariff"),
            pl.when(
                pl.col("corporateTariffCode").is_not_null()
                & (pl.col("pricingInfo_isAccessTP") == 1.0)
            )
            .then(1)
            .otherwise(0)
            .alias("corporate_policy_compliant"),
            pl.when(
                pl.col("corporateTariffCode").is_not_null() & (pl.col("isVip") == True)
            )
            .then(1)
            .otherwise(0)
            .alias("corporate_vip_flag"),
            # Fees relative to the ticket price (null when the amount is null)
            (
                (
                    pl.col("miniRules0_monetaryAmount")
                    / pl.col("totalPrice").cast(pl.Float64)
                )
            ).alias("fee_ratio_rule0"),
            (
                (
                    pl.col("miniRules1_monetaryAmount")
                    / pl.col("totalPrice").cast(pl.Float64)
                )
            ).alias("fee_ratio_rule1"),
            # Zero fee combined with status 1 for rule 0 / rule 1
            (
                (pl.col("miniRules0_monetaryAmount") == 0)
                & (pl.col("miniRules0_statusInfos") == 1)
            )
            .cast(pl.Int8)
            .alias("free_cancel"),
            (
                (pl.col("miniRules1_monetaryAmount") == 0)
                & (pl.col("miniRules1_statusInfos") == 1)
            )
            .cast(pl.Int8)
            .alias("free_exchange"),
            # Routes & carriers
            pl.col("searchRoute")
            .is_in(["MOWLED/LEDMOW", "LEDMOW/MOWLED", "MOWLED", "LEDMOW"])
            .cast(pl.Int32)
            .alias("is_popular_route"),
            pl.col("searchRoute")
            .str.contains("MOW|LED")
            .cast(pl.Int32)
            .alias("contains_capitials"),
            pl.col("legs0_segments0_flightNumber")
            .is_in(["208"])
            .cast(pl.Int32)
            .alias("is_popular_flight"),
            (
                pl.col("legs0_segments0_marketingCarrier_code")
                != pl.col("legs0_segments0_operatingCarrier_code")
            )
            .cast(pl.Int32)
            .alias("is_codeshare_leg0_seg0"),
        ]
    )

    # Search route: split "<outbound>/<return>" into a two-field struct
    df = df.with_columns(
        [
            pl.col("searchRoute").cast(pl.Utf8),
            pl.col("searchRoute").str.split_exact("/", 1).alias("route_struct"),
        ]
    )

    df = df.with_columns(
        [
            pl.col("route_struct").struct.field("field_0").alias("outbound_route"),
            pl.col("route_struct").struct.field("field_1").alias("return_route"),
        ]
    )

    # Each route is sliced into two 3-character codes (origin, destination)
    df = df.with_columns(
        [
            pl.col("outbound_route").str.slice(0, 3).alias("outbound_origin"),
            pl.col("outbound_route").str.slice(3, 3).alias("outbound_destination"),
            pl.col("return_route").str.slice(0, 3).alias("return_origin"),
            pl.col("return_route").str.slice(3, 3).alias("return_destination"),
        ]
    )

    # 1 when the return route mirrors the outbound route, -1 when the
    # comparison is null (e.g. no return route)
    df = df.with_columns(
        [
            (
                (pl.col("outbound_origin") == pl.col("return_destination"))
                & (pl.col("outbound_destination") == pl.col("return_origin"))
            )
            .cast(pl.Int8)
            .alias("is_exact_round_trip")
            .fill_null(-1),
        ]
    )

    df = df.drop(["route_struct", "searchRoute"])

    # Route hotness: how often each route appears among selected rows within
    # the first train_size rows.
    # NOTE(review): assumes the training rows come first in df - confirm
    # against the caller that concatenates train and test.
    train_size = 16487352 if not FULL else 18145372
    train_df = df.slice(0, train_size)
    selected_df = train_df.filter(pl.col("selected") == 1)

    outbound_hot = selected_df.group_by("outbound_route").agg(
        pl.count().alias("outbound_route_hotness")
    )

    return_hot = selected_df.group_by("return_route").agg(
        pl.count().alias("return_route_hotness")
    )

    # Routes never seen among the selected rows get a hotness of 0
    df = (
        df.join(outbound_hot, on="outbound_route", how="left")
        .join(return_hot, on="return_route", how="left")
        .with_columns(
            [
                pl.col("outbound_route_hotness").fill_null(0),
                pl.col("return_route_hotness").fill_null(0),
            ]
        )
    )

    # Fill missing values using hand-crafted rules: missing-indicator flags
    # for the fee ratios, -1 sentinels for status/flag columns, and the
    # column mean for seatsAvailable.
    df = df.with_columns(
        [
            pl.col("fee_ratio_rule0")
            .is_null()
            .cast(pl.Int8)
            .alias("fee_ratio_rule0_is_missing"),
            pl.col("fee_ratio_rule1")
            .is_null()
            .cast(pl.Int8)
            .alias("fee_ratio_rule1_is_missing"),
            pl.col("pricingInfo_isAccessTP").fill_null(-1).cast(pl.Int32),
            pl.col("legs0_segments0_seatsAvailable")
            .fill_null(strategy="mean")
            .alias("legs0_segments0_seatsAvailable"),
            pl.col("miniRules0_statusInfos").fill_null(-1).cast(pl.Int32),
            pl.col("miniRules1_statusInfos").fill_null(-1).cast(pl.Int32),
        ]
    )
    return df


@timer
def build_segment_features(df):
    """Add per-leg segment counts and directness flags.

    Requires the ``is_one_way``, ``l0_seg``, ``total_duration`` and
    ``ranker_id`` columns to be present already.

    Args:
        df: polars DataFrame with ``legs{0,1}_segments{0..3}_flightNumber``
            columns; segment columns that are absent are simply not counted.

    Returns:
        The DataFrame with ``n_segments_leg{0,1}``, ``is_direct_leg{0,1}``,
        ``is_min_segments``, ``is_min_segments_leg{0,1}`` and
        ``is_direct_shortest`` added.
    """
    # Segment counts: number of non-null flight numbers per leg. Only columns
    # that exist in the frame are referenced, so a frame from which sparse
    # segment columns were dropped does not raise a missing-column error.
    seg_exprs = []
    for leg in (0, 1):
        seg_cols = [
            name
            for name in (f"legs{leg}_segments{s}_flightNumber" for s in range(4))
            if name in df.columns
        ]
        if seg_cols:
            seg_exprs.append(
                pl.sum_horizontal([pl.col(c).is_not_null() for c in seg_cols])
                .cast(pl.Int32)
                .alias(f"n_segments_leg{leg}")
            )
        else:
            seg_exprs.append(pl.lit(0).cast(pl.Int32).alias(f"n_segments_leg{leg}"))

    # The counts must exist as columns before the derived flags can use them
    df = df.with_columns(seg_exprs)

    df = df.with_columns(
        [
            (pl.col("n_segments_leg0") == 1).cast(pl.Int32).alias("is_direct_leg0"),
            # One-way rows have no return leg, so its flags are forced to 0
            pl.when(pl.col("is_one_way") == 1)
            .then(0)
            .otherwise((pl.col("n_segments_leg1") == 1).cast(pl.Int32))
            .alias("is_direct_leg1"),
            # Fewest segments within the ranker_id group
            (pl.col("l0_seg") == pl.col("l0_seg").min().over("ranker_id"))
            .cast(pl.Int32)
            .alias("is_min_segments"),
            (
                pl.col("n_segments_leg0")
                == pl.col("n_segments_leg0").min().over("ranker_id")
            )
            .cast(pl.Int32)
            .alias("is_min_segments_leg0"),
            pl.when(pl.col("is_one_way") == 1)
            .then(0)
            .otherwise(
                (
                    pl.col("n_segments_leg1")
                    == pl.col("n_segments_leg1").min().over("ranker_id")
                ).cast(pl.Int32)
            )
            .alias("is_min_segments_leg1"),
        ]
    )

    # Shortest total duration among rows whose outbound leg is direct,
    # per ranker_id group
    direct_shortest = (
        df.filter(pl.col("is_direct_leg0") == 1)
        .group_by("ranker_id")
        .agg(pl.col("total_duration").min().alias("min_direct"))
    )

    # Groups without any direct row have a null min_direct; fill_null(0)
    # turns a resulting null flag into 0.
    df = (
        df.join(direct_shortest, on="ranker_id", how="left")
        .with_columns(
            (
                (pl.col("is_direct_leg0") == 1)
                & (pl.col("total_duration") == pl.col("min_direct"))
            )
            .cast(pl.Int32)
            .fill_null(0)
            .alias("is_direct_shortest")
        )
        .drop("min_direct")
    )

    return df


@timer
def build_time_features(df: pl.DataFrame) -> pl.DataFrame:
    """Derive hour/weekday/bin, stay-duration and booking-lead-time features.

    Requires the ``is_one_way`` column and a ``requestDate`` column that can
    be subtracted from a parsed datetime.

    Args:
        df: polars DataFrame with the four ``legs{0,1}_{departure,arrival}At``
            string columns.

    Returns:
        The DataFrame with the time features added and the raw timestamp
        columns, their parsed ``*_dt`` counterparts and ``requestDate``
        dropped.
    """
    time_cols = [
        "legs0_departureAt",
        "legs0_arrivalAt",
        "legs1_departureAt",
        "legs1_arrivalAt",
    ]

    # Convert the timestamp strings to datetimes (non-strict parsing)
    dt_cols = [
        pl.col(col).str.to_datetime(strict=False).alias(f"{col}_dt")
        for col in time_cols
    ]
    df = df.with_columns(dt_cols)

    # Per-timestamp features; a missing timestamp gets hour -1 and weekday 0
    time_exprs = []
    for col in time_cols:
        dt_col = f"{col}_dt"
        h = pl.col(dt_col).dt.hour().fill_null(-1)

        time_exprs.extend(
            [
                h.alias(f"{col}_hour"),
                pl.col(dt_col).dt.weekday().fill_null(0).alias(f"{col}_weekday"),
                ((h >= 6) & (h < 20)).cast(pl.Int32).alias(f"{col}_business_time"),
                # Time-of-day bin: -1 missing, then the hour ranges
                # [0,6) [6,9) [9,11) [11,13) [13,17) [17,20) [20,24)
                pl.when(h == -1)
                .then(-1)
                .when(h < 6)
                .then(0)
                .when(h < 9)
                .then(1)
                .when(h < 11)
                .then(2)
                .when(h < 13)
                .then(3)
                .when(h < 17)
                .then(4)
                .when(h < 20)
                .then(5)
                .otherwise(6)
                .alias(f"{col}_time_bin"),
            ]
        )
    df = df.with_columns(time_exprs)

    # Combo features for the outbound leg (bin pair, red-eye, business-friendly)
    combo_exprs = []
    for leg in ["legs0"]:
        dep_bin = f"{leg}_departureAt_time_bin"
        arr_bin = f"{leg}_arrivalAt_time_bin"
        dep_hour = f"{leg}_departureAt_hour"
        arr_hour = f"{leg}_arrivalAt_hour"

        # Departure bin and arrival bin joined as "<dep>_<arr>"
        combo_exprs.append(
            (pl.col(dep_bin).cast(pl.Utf8) + "_" + pl.col(arr_bin).cast(pl.Utf8)).alias(
                f"{leg}_dep_arr_bin_combo"
            )
        )

        # Business-friendly: both departure and arrival hours fall in [6, 20)
        combo_exprs.append(
            (
                ((pl.col(dep_hour) >= 6) & (pl.col(dep_hour) < 20))
                & ((pl.col(arr_hour) >= 6) & (pl.col(arr_hour) < 20))
            )
            .cast(pl.Int8)
            .alias(f"{leg}_is_business_friendly")
        )

        # Red-eye: departure or arrival hour in [0, 6). The lower bound keeps
        # the -1 "missing" sentinel from being counted as an early hour.
        combo_exprs.append(
            (
                ((pl.col(dep_hour) >= 0) & (pl.col(dep_hour) < 6))
                | ((pl.col(arr_hour) >= 0) & (pl.col(arr_hour) < 6))
            )
            .cast(pl.Int8)
            .alias(f"{leg}_is_red_eye")
        )

    df = df.with_columns(combo_exprs)

    # Stay duration: hours between outbound arrival and return departure
    df = df.with_columns(
        (
            (pl.col("legs1_departureAt_dt") - pl.col("legs0_arrivalAt_dt"))
            .dt.total_microseconds()
            .cast(pl.Float64)
            / 1e6
            / 3600.0
        ).alias("stay_duration_hours")
    )

    # Negative stays are clamped to 0
    df = df.with_columns(
        pl.when(pl.col("stay_duration_hours") < 0)
        .then(0)
        .otherwise(pl.col("stay_duration_hours"))
        .alias("stay_duration_hours")
    )

    # One-way rows carry the sentinel -1
    df = df.with_columns(
        pl.when(pl.col("is_one_way") == 1)
        .then(-1)
        .otherwise(pl.col("stay_duration_hours"))
        .alias("stay_duration_hours")
    )

    # Log, short-trip flag and bins of the stay duration
    stay_exprs = [
        pl.when(pl.col("stay_duration_hours") == -1)
        .then(-1)
        .otherwise(pl.col("stay_duration_hours").log1p())
        .alias("stay_duration_hours_log"),
        pl.when(
            (pl.col("stay_duration_hours").is_not_null())
            & (pl.col("stay_duration_hours") >= 0)
            & (pl.col("stay_duration_hours") < 48)
        )
        .then(1)
        .otherwise(0)
        .cast(pl.Int8)
        .alias("is_short_trip"),
        pl.when(pl.col("stay_duration_hours") == -1)
        .then(-1)
        .when(pl.col("stay_duration_hours") < 4)
        .then(0)
        .when(pl.col("stay_duration_hours") < 12)
        .then(1)
        .when(pl.col("stay_duration_hours") < 36)
        .then(2)
        .when(pl.col("stay_duration_hours") < 72)
        .then(3)
        .when(pl.col("stay_duration_hours") < 168)  # one week
        .then(4)
        .when(pl.col("stay_duration_hours") < 336)  # two weeks
        .then(5)
        .otherwise(6)
        .alias("stay_duration_bin"),
    ]

    # Interval between requestDate and the outbound departure
    lead_seconds = (
        (pl.col("legs0_departureAt_dt") - pl.col("requestDate"))
        .dt.total_microseconds()
        .cast(pl.Float64)
        / 1e6
    )
    booking_exprs = [
        (lead_seconds / 3600).alias("hours_to_departure"),
        (lead_seconds / 86400).floor().cast(pl.Int32).alias("days_to_departure"),
    ]

    # Only the new expressions are applied here; the per-timestamp features
    # were already materialised above and need not be recomputed.
    df = df.with_columns(stay_exprs + booking_exprs)
    df = df.drop(time_cols + [f"{col}_dt" for col in time_cols] + ["requestDate"])

    return df


@timer
def build_rank_features(df):
    """Add within-group scaled price/duration positions and their interactions.

    Each ``*_quantile_rank`` column is the min-max position of a value inside
    its ``ranker_id`` group: ``(x - min) / (max - min + 1e-9)``.
    """
    group_key = "ranker_id"
    # output prefix -> source column, in output order
    sources = {
        "price": "totalPrice",
        "duration": "total_duration",
        "leg0_duration": "legs0_duration",
        "leg1_duration": "legs1_duration",
    }

    def scaled_position(source, prefix):
        value = pl.col(source)
        lowest = value.min().over(group_key)
        highest = value.max().over(group_key)
        return ((value - lowest) / (highest - lowest + 1e-9)).alias(
            f"{prefix}_quantile_rank"
        )

    df = df.with_columns(
        [scaled_position(source, prefix) for prefix, source in sources.items()]
    )

    # Pairwise interactions; eps keeps the ratios finite, and the ratios are
    # clipped to [0.01, 100].
    eps = 1e-6
    price = pl.col("price_quantile_rank")
    duration = pl.col("duration_quantile_rank")
    leg0 = pl.col("leg0_duration_quantile_rank")
    leg1 = pl.col("leg1_duration_quantile_rank")

    interactions = [
        (price * duration).alias("rank_interaction_mul"),
        ((price + eps) / (duration + eps))
        .clip(0.01, 100)
        .alias("rank_interaction_ratio"),
        (price + duration).alias("rank_interaction_sum"),
        (price - duration).alias("rank_interaction_sub"),
        (leg0 * leg1).alias("leg_dur_interaction_mul"),
        ((leg0 + eps) / (leg1 + eps)).clip(0.01, 100).alias("leg_dur_interaction_ratio"),
        (leg0 - leg1).alias("leg_dur_interaction_sub"),
    ]

    return df.with_columns(interactions)


@timer
def build_price_features(df: pl.DataFrame) -> pl.DataFrame:
    """Add price features relative to the other rows of the same ranker_id.

    Requires the ``is_direct_leg0`` column to be present already.

    Args:
        df: polars DataFrame with ``ranker_id`` and ``totalPrice``.

    Returns:
        The DataFrame with ``is_top3_cheapest``, ``price_zscore_from_median``,
        ``price_relative_to_min``, ``is_cheaper_than_avg``,
        ``is_expensive_outlier`` and ``is_direct_cheapest`` added. The
        per-group helper statistics are dropped again before returning.
    """
    price = pl.col("totalPrice")

    # ==== 1. Per-group statistics (only the ones the features below use) ====
    price_stats = (
        df.group_by("ranker_id")
        .agg(
            [
                price.median().alias("grp_price_median"),
                price.std().alias("grp_price_std"),
                price.min().alias("grp_price_min"),
                price.quantile(0.25, "nearest").alias("grp_price_q25"),
                price.quantile(0.75, "nearest").alias("grp_price_q75"),
            ]
        )
        .with_columns(
            (pl.col("grp_price_q75") - pl.col("grp_price_q25")).alias("grp_price_iqr")
        )
    )

    df = df.join(price_stats, on="ranker_id", how="left")

    # ==== 2. Price features ====
    price_exprs = [
        # Among the three lowest distinct prices of the group (dense rank)
        (
            (price.rank("dense").over("ranker_id") <= 3)
            .cast(pl.Int8)
            .alias("is_top3_cheapest")
        ),
        # Distance from the group median in units of (std + 1); null (e.g. a
        # null std) becomes 0
        ((price - pl.col("grp_price_median")) / (pl.col("grp_price_std") + 1))
        .alias("price_zscore_from_median")
        .fill_null(0),
        # Relative premium over the cheapest option of the group
        ((price - pl.col("grp_price_min")) / (pl.col("grp_price_min") + 1)).alias(
            "price_relative_to_min"
        ),
        # NOTE: despite the name, the comparison is against the group's 25th
        # percentile, not the average.
        (price < pl.col("grp_price_q25")).cast(pl.Int8).alias("is_cheaper_than_avg"),
        # Upper outlier by the usual q75 + 1.5 * IQR fence
        (price > (pl.col("grp_price_q75") + 1.5 * pl.col("grp_price_iqr")))
        .cast(pl.Int8)
        .alias("is_expensive_outlier"),
    ]

    df = df.with_columns(price_exprs)

    # ==== 3. Cheapest among rows whose outbound leg is direct ====
    direct_cheapest = (
        df.filter(pl.col("is_direct_leg0") == 1)
        .group_by("ranker_id")
        .agg(price.min().alias("min_direct_price"))
    )

    # Groups without a direct row have a null min_direct_price; fill_null(0)
    # turns a resulting null flag into 0.
    df = (
        df.join(direct_cheapest, on="ranker_id", how="left")
        .with_columns(
            ((pl.col("is_direct_leg0") == 1) & (price == pl.col("min_direct_price")))
            .cast(pl.Int8)
            .fill_null(0)
            .alias("is_direct_cheapest")
        )
        .drop("min_direct_price")
    )

    # ==== 4. Drop the helper columns ====
    df = df.drop(
        [
            "grp_price_median",
            "grp_price_std",
            "grp_price_min",
            "grp_price_q25",
            "grp_price_q75",
            "grp_price_iqr",
        ]
    )

    return df


@timer
def build_cabin_class_features(df):
    """Add one-hot cabin-class flags and two summary flags.

    Args:
        df: polars DataFrame with ``legs0_segments0_cabinClass`` and
            ``legs1_segments0_cabinClass``.

    Returns:
        The DataFrame with ``<col>_is_<level>`` for levels 1-4 of both cabin
        columns, ``has_business_class`` and ``all_cabin_level_1`` added.
    """
    cabin_cols = [
        "legs0_segments0_cabinClass",
        "legs1_segments0_cabinClass",
    ]
    cabin_levels = [1, 2, 3, 4]

    # One-hot encode each cabin column; a null cabin class yields 0 everywhere
    onehot_exprs = [
        (pl.col(c) == l).fill_null(False).cast(pl.Int8).alias(f"{c}_is_{l}")
        for c in cabin_cols
        for l in cabin_levels
    ]
    df = df.with_columns(onehot_exprs)

    # 1 when the first segment of either leg is in cabin level 3 or 4
    business_levels = [3, 4]
    business_cols = [
        f"{col}_is_{business_level}"
        for col in cabin_cols
        for business_level in business_levels
    ]
    df = df.with_columns(
        (pl.sum_horizontal([pl.col(c) for c in business_cols]) >= 1)
        .cast(pl.Int8)
        .alias("has_business_class")
    )

    # 1 only when BOTH cabin columns are level 1 or 2.
    # NOTE(review): a null cabin class one-hot encodes to all zeros, so any
    # row whose return-leg cabin is null gets 0 here - presumably that
    # includes every one-way trip; confirm this is intended.
    all_level_1_expr = (
        pl.all_horizontal(
            [
                ((pl.col(f"{col}_is_1") == 1) | (pl.col(f"{col}_is_2") == 1)).cast(
                    pl.Boolean
                )
                for col in cabin_cols
            ]
        )
        .cast(pl.Int8)
        .alias("all_cabin_level_1")
    )
    df = df.with_columns([all_level_1_expr])

    return df


@timer
def build_company_features(df, FULL, top_k=10):
    """Attach per-company statistics computed from selected rows.

    The statistics come from rows with ``selected == 1`` inside the first
    ``train_size`` rows of ``df`` and are left-joined back on ``companyID``.
    Companies without any such row receive the defaults listed near the end
    of the function (``company_avg_pct`` and ``company_avg_dct`` stay null).
    """
    train_size = 18145372 if FULL else 16487352
    chosen = df.slice(0, train_size).filter(pl.col("selected") == 1)

    # Global fallbacks for companies with too little history
    avg_price = chosen.select(pl.col("totalPrice").mean()).item()
    std_price = chosen.select(pl.col("totalPrice").std()).item()

    picks = pl.col("company_select_count")

    per_company = chosen.group_by("companyID").agg(
        [
            pl.count().alias("company_select_count"),
            pl.col("totalPrice").mean().alias("avg_selected_price"),
            pl.col("totalPrice").std().alias("std_selected_price"),
            pl.col("both_direct").mean().alias("selected_direct_ratio"),
            (pl.col("legs0_departureAt_hour") < 6)
            .cast(pl.Int32)
            .mean()
            .alias("selected_night_ratio"),
            pl.col("price_quantile_rank").mean().alias("company_avg_pct"),
            pl.col("duration_quantile_rank").mean().alias("company_avg_dct"),
        ]
    )

    # A single selection is not a meaningful average: use the global mean
    per_company = per_company.with_columns(
        pl.when(picks <= 1)
        .then(avg_price)
        .otherwise(pl.col("avg_selected_price"))
        .alias("avg_selected_price")
    )

    # Popularity descriptors derived from the selection count
    per_company = per_company.with_columns(
        [
            picks.log1p().alias("log_company_select_count"),
            (picks > 3000).cast(pl.Int32).alias("is_very_popular_company"),
            ((picks > 1000) & (picks <= 3000))
            .cast(pl.Int32)
            .alias("is_popular_company"),
        ]
    )

    # Top-k companies by dense rank of the selection count
    per_company = per_company.with_columns(
        (picks.rank("dense", descending=True) <= top_k)
        .cast(pl.Int32)
        .alias("is_top_selected_company")
    )

    df = df.join(per_company, on="companyID", how="left")

    # Defaults for rows whose company has no statistics
    defaults = {
        "company_select_count": 0,
        "log_company_select_count": 0,
        "avg_selected_price": avg_price,
        "std_selected_price": std_price,
        "selected_direct_ratio": 0,
        "selected_night_ratio": 0,
        "is_very_popular_company": 0,
        "is_popular_company": 0,
        "is_top_selected_company": 0,
    }
    df = df.with_columns(
        [pl.col(name).fill_null(value) for name, value in defaults.items()]
    )

    # Row price relative to what the company usually selects
    price_gap = pl.col("totalPrice") - pl.col("avg_selected_price")
    df = df.with_columns(
        (price_gap / (pl.col("std_selected_price") + 1e-6)).alias(
            "z_price_vs_company_selected"
        )
    )

    return df


@timer
def build_flight_features(df, FULL):
    """Attach per-carrier statistics computed from selected rows.

    The carrier is ``legs0_segments0_marketingCarrier_code``. Statistics come
    from rows with ``selected == 1`` inside the first ``train_size`` rows of
    ``df`` and are left-joined back on the carrier code. Requires the
    ``price_quantile_rank``, ``duration_quantile_rank`` and
    ``legs0_departureAt_hour`` columns to be present already.

    Args:
        df: polars DataFrame containing the columns named above plus
            ``selected``, ``companyID``, ``profileId`` and
            ``legs0_segments0_cabinClass``.
        FULL: Chooses the hard-coded row count (18145372 when truthy,
            otherwise 16487352) that delimits the rows used for statistics.

    Returns:
        The DataFrame with the carrier-level columns added; carriers without
        statistics receive the defaults applied at the end.
    """
    selected_col = "legs0_segments0_marketingCarrier_code"
    train_size = 16487352 if not FULL else 18145372
    train_df = df.slice(0, train_size)
    selected_df = train_df.filter(pl.col("selected") == 1)

    # Share of each cabin class among a carrier's selected rows
    cabin_col = "legs0_segments0_cabinClass"
    cabin_selected_df = selected_df.group_by([selected_col, cabin_col]).agg(
        pl.count().alias("cabin_selected_count")
    )
    total_selected_per_carrier = selected_df.group_by(selected_col).agg(
        pl.count().alias("total_selected_count")
    )
    cabin_stats_df = cabin_selected_df.join(
        total_selected_per_carrier, on=selected_col, how="inner"
    ).with_columns(
        (
            pl.col("cabin_selected_count") / (pl.col("total_selected_count") + 1e-5)
        ).alias("cabin_select_ratio")
    )
    # The cabin values become the pivoted column names, so they must be strings
    cabin_stats_df = cabin_stats_df.with_columns(pl.col(cabin_col).cast(pl.Utf8))

    cabin_stats_wide_df = cabin_stats_df.pivot(
        values="cabin_select_ratio",
        index=selected_col,
        columns=cabin_col,
        aggregate_function="first",
    )

    # NOTE(review): the "1.0".."4.0" keys assume the cabin class is stored as
    # a float before the string cast - confirm; with an integer column the
    # pivoted names would be "1".."4" and every ratio would fall back to 0.
    existing_cols = cabin_stats_wide_df.columns
    required_cols = {
        "1.0": f"{selected_col}_cabin1_select_ratio",
        "2.0": f"{selected_col}_cabin2_select_ratio",
        "3.0": f"{selected_col}_cabin3_select_ratio",
        "4.0": f"{selected_col}_cabin4_select_ratio",
    }
    for old, new in required_cols.items():
        if old in existing_cols:
            # Add-then-drop (rather than rename) keeps the output columns in
            # cabin1..cabin4 order regardless of the pivot's column order.
            cabin_stats_wide_df = cabin_stats_wide_df.with_columns(
                [pl.col(old).alias(new)]
            )
            cabin_stats_wide_df = cabin_stats_wide_df.drop(old)
        else:
            # Cabin level never selected: typed null placeholder so the later
            # fill_null(0.0) operates on a Float64 column.
            cabin_stats_wide_df = cabin_stats_wide_df.with_columns(
                [pl.lit(None, dtype=pl.Float64).alias(new)]
            )

    # Price rank, duration rank, departure-hour distribution and
    # company/user diversity of each carrier's selected rows
    dep_hour = pl.col("legs0_departureAt_hour")
    more_stats_df = selected_df.group_by(selected_col).agg(
        [
            pl.col("price_quantile_rank")
            .mean()
            .alias(f"{selected_col}_avg_price_rank"),
            pl.col("duration_quantile_rank")
            .mean()
            .alias(f"{selected_col}_avg_duration_rank"),
            dep_hour.mean().alias(f"{selected_col}_avg_dep_hour"),
            (dep_hour < 6)
            .cast(pl.Int32)
            .mean()
            .alias(f"{selected_col}_night_ratio"),
            pl.col("companyID").n_unique().alias(f"{selected_col}_company_diversity"),
            pl.col("profileId").n_unique().alias(f"{selected_col}_user_diversity"),
        ]
    )

    # Global fallbacks for carriers without selected rows
    global_avg_price = selected_df.select(pl.col("price_quantile_rank").mean()).item()
    global_avg_duration = selected_df.select(
        pl.col("duration_quantile_rank").mean()
    ).item()

    # One row per carrier seen among the selected rows. A null carrier key, if
    # present, is harmless: joins do not match null keys by default.
    stats_df = more_stats_df.join(cabin_stats_wide_df, on=selected_col, how="left")

    df = df.join(stats_df, on=selected_col, how="left").with_columns(
        [
            pl.col(f"{selected_col}_avg_price_rank").fill_null(global_avg_price),
            pl.col(f"{selected_col}_avg_duration_rank").fill_null(global_avg_duration),
            pl.col(f"{selected_col}_avg_dep_hour").fill_null(12.0),
            pl.col(f"{selected_col}_night_ratio").fill_null(0.0),
            pl.col(f"{selected_col}_company_diversity").fill_null(0),
            pl.col(f"{selected_col}_user_diversity").fill_null(0),
            pl.col(f"{selected_col}_cabin1_select_ratio").fill_null(0.0),
            pl.col(f"{selected_col}_cabin2_select_ratio").fill_null(0.0),
            pl.col(f"{selected_col}_cabin3_select_ratio").fill_null(0.0),
            pl.col(f"{selected_col}_cabin4_select_ratio").fill_null(0.0),
        ]
    )

    return df


def get_rank_bin_expr(rank_col: str, bin_edges: list[int], bin_col: str):
    """Return an expression that buckets ``rank_col`` into 1-based bins.

    Edges are tested in list order and the first edge that the value is
    ``<=`` wins: the k-th edge (1-based) yields bin ``k``.  Values larger
    than every edge fall into bin ``len(bin_edges) + 1``.  The resulting
    expression is cast to ``pl.Int8`` and aliased to ``bin_col``.
    """
    rank = pl.col(rank_col)

    # The first edge opens the when/then chain; the rest extend it.
    binned = pl.when(rank <= bin_edges[0]).then(1)
    for label, edge in enumerate(bin_edges[1:], start=2):
        binned = binned.when(rank <= edge).then(label)

    overflow_label = len(bin_edges) + 1
    return binned.otherwise(overflow_label).cast(pl.Int8).alias(bin_col)


@timer
def build_carrier_features(
    df, carrier_col: str, FULL: bool, bin_edges: "list[int] | None" = None
):
    """Add selection statistics for a leg's first-segment marketing carrier.

    Args:
        df: Frame holding the training rows first; the first ``train_size``
            rows (16487352, or 18145372 when ``FULL``) are used to compute
            the statistics.
        carrier_col: Leg prefix such as ``"legs0"``; the suffix
            ``"_segments0_marketingCarrier_code"`` is appended to obtain the
            actual column name.
        FULL: Selects which hard-coded train size is used.
        bin_edges: Upper edges used to bucket the dense rank of the
            per-carrier selected count (see ``get_rank_bin_expr``).
            Defaults to ``[1, 2, 5, 12, 17, 28]``.

    Returns:
        ``df`` with four extra columns named after the carrier column:
        ``*_selection_rate``, ``*_log_total_count``, ``*_log_selected_count``
        and ``*_selected_rank_bin``.  Rows whose carrier has no statistics
        get ``0`` in all four.
    """
    # A fresh list per call instead of a shared mutable default argument.
    if bin_edges is None:
        bin_edges = [1, 2, 5, 12, 17, 28]

    carrier_col += "_segments0_marketingCarrier_code"
    select_rate_col = f"{carrier_col}_selection_rate"
    bin_col = f"{carrier_col}_selected_rank_bin"
    log_total_col = f"{carrier_col}_log_total_count"
    log_selected_col = f"{carrier_col}_log_selected_count"
    rank_col = "selected_rank"

    train_size = 16487352 if not FULL else 18145372
    train_df = df.slice(0, train_size)

    # Number of selected rows per carrier (train slice only).
    selected_cnt_df = (
        train_df.filter(pl.col("selected") == 1)
        .group_by(carrier_col)
        .agg(pl.count().alias("selected_count"))
    )

    # Number of rows per carrier (train slice only).
    total_cnt_df = train_df.group_by(carrier_col).agg(pl.count().alias("total_count"))

    # Inner join: carriers that were never selected in the train slice are
    # absent from rate_df and therefore receive the zero fills below.
    rate_df = (
        selected_cnt_df.join(total_cnt_df, on=carrier_col, how="inner")
        .with_columns(
            # The small epsilon guards the division.
            (pl.col("selected_count") / (pl.col("total_count") + 1e-5)).alias(
                select_rate_col
            ),
            pl.col("total_count").cast(pl.Float64).log1p().alias(log_total_col),
            pl.col("selected_count").cast(pl.Float64).log1p().alias(log_selected_col),
        )
        .with_columns(
            # Dense rank, 1 = carrier with the most selected rows.
            pl.col("selected_count").rank("dense", descending=True).alias(rank_col)
        )
        .with_columns(get_rank_bin_expr(rank_col, bin_edges, bin_col))
        .select(
            [carrier_col, select_rate_col, log_total_col, log_selected_col, bin_col]
        )
    )

    df = df.join(rate_df, on=carrier_col, how="left").with_columns(
        pl.col(select_rate_col).fill_null(0.0),
        pl.col(log_total_col).fill_null(0.0),
        pl.col(log_selected_col).fill_null(0.0),
        # Bin 0 is reserved for carriers without statistics (real bins start at 1).
        pl.col(bin_col).fill_null(0),
    )

    return df


@timer
def build_frequent_flyer_features(
    df: pl.DataFrame, major_carriers: list[str]
) -> pl.DataFrame:
    """Add frequent-flyer / major-carrier flags for leg 0, segment 0.

    ``frequentFlyer`` is split on ``"/"`` into a list of programme codes
    (nulls are treated as the empty string).  All new columns are ``Int8``:

    * ``<carrier_col>_in_frequentFlyer``: the marketing carrier code is one
      of the listed programme codes.
    * ``<carrier_col>_is_only_frequentFlyer``: exactly one programme is
      listed and the ``frequentFlyer`` string contains the carrier code.
    * ``is_major_carrier_<leg>_0``: the carrier is in ``major_carriers``.
    * ``<carrier_col>_ff_and_economic``: the first flag is set and the cabin
      class is 1 or 2.
    """
    ff_text = pl.col("frequentFlyer").fill_null("")
    ff_programs = ff_text.str.split("/")

    for leg in (0,):
        carrier_col = f"legs{leg}_segments0_marketingCarrier_code"
        cabin_col = f"legs{leg}_segments0_cabinClass"
        carrier = pl.col(carrier_col)
        cabin = pl.col(cabin_col)

        in_ff_col = f"{carrier_col}_in_frequentFlyer"

        # These three flags only read pre-existing columns, so they can be
        # computed in a single pass.
        df = df.with_columns(
            # Is the carrier one of the passenger's frequent-flyer programmes?
            ff_programs.list.contains(carrier).cast(pl.Int8).alias(in_ff_col),
            # Is it the passenger's only programme?
            # NOTE(review): str.contains is a pattern (substring) match, not
            # the exact list membership used for the flag above.
            ((ff_programs.list.len() == 1) & ff_text.str.contains(carrier))
            .cast(pl.Int8)
            .alias(f"{carrier_col}_is_only_frequentFlyer"),
            # Is it a major carrier?
            carrier.is_in(major_carriers)
            .cast(pl.Int8)
            .alias(f"is_major_carrier_{leg}_0"),
        )

        # Frequent flyer combined with a low cabin class (1 or 2).
        is_low_cabin = (cabin == 1) | (cabin == 2)
        df = df.with_columns(
            ((pl.col(in_ff_col) == 1) & is_low_cabin)
            .cast(pl.Int8)
            .alias(f"{carrier_col}_ff_and_economic")
        )

    return df


@timer
def build_profile_features(df, FULL=False):
    """Add per-profile statistics computed from selected training rows.

    The first ``train_size`` rows of ``df`` (16487352, or 18145372 when
    ``FULL``) are treated as the training part; only rows with
    ``selected == 1`` contribute.  Added columns:

    * ``user_selected_count`` / ``user_selected_count_log``
    * ``user_avg_cabinClass_leg0_seg0``
    * ``user_in_train``
    * ``price_rank_diff`` / ``duration_rank_diff``: the row's quantile rank
      minus the profile's average rank among its selected rows.

    Profiles without statistics get zero counts, and their averages are
    replaced by ``kde_mode`` of the corresponding column over all selected
    training rows.  The intermediate ``user_avg_price_rank`` and
    ``user_avg_duration_rank`` columns are dropped before returning.
    """
    train_size = 18145372 if FULL else 16487352
    print("Using train size: ", train_size)
    chosen = df.slice(0, train_size).filter(pl.col("selected") == 1)

    # Source column -> name of the per-profile average derived from it.
    averaged = {
        "price_quantile_rank": "user_avg_price_rank",
        "duration_quantile_rank": "user_avg_duration_rank",
        "legs0_segments0_cabinClass": "user_avg_cabinClass_leg0_seg0",
    }

    per_profile = chosen.group_by("profileId").agg(
        [pl.count().alias("user_selected_count")]
        + [pl.mean(src).alias(dst) for src, dst in averaged.items()]
    )

    # Counted over the same selected rows as above, so this equals
    # user_selected_count; it is only used to derive user_in_train.
    per_profile_rows = chosen.group_by("profileId").agg(
        pl.count().alias("user_total_count")
    )

    profile_stats = (
        per_profile.join(per_profile_rows, on="profileId", how="left")
        .with_columns(
            [
                (pl.col("user_total_count") > 0).cast(pl.Int32).alias("user_in_train"),
                pl.col("user_selected_count").log1p().alias("user_selected_count_log"),
            ]
        )
        .drop("user_total_count")
    )

    # Use kde to approximate the peak of each distribution; these values
    # stand in for the averages of profiles that have no statistics.
    fallback_modes = {
        dst: kde_mode(chosen.select(src).to_numpy().flatten())
        for src, dst in averaged.items()
    }

    fill_na_dict = {
        "user_selected_count": 0,
        "user_selected_count_log": 0,
        "user_in_train": 0,
        **fallback_modes,
    }

    print(fill_na_dict)

    df = df.join(profile_stats, on="profileId", how="left").with_columns(
        [pl.col(name).fill_null(value) for name, value in fill_na_dict.items()]
    )

    price_gap = pl.col("price_quantile_rank") - pl.col("user_avg_price_rank")
    duration_gap = pl.col("duration_quantile_rank") - pl.col("user_avg_duration_rank")
    df = df.with_columns(
        [
            price_gap.alias("price_rank_diff"),
            duration_gap.alias("duration_rank_diff"),
        ]
    )

    return df.drop(["user_avg_price_rank", "user_avg_duration_rank"])


@timer
def build_route_features(df, FULL):
    """Placeholder for route-level features; currently adds nothing.

    Returns ``df`` unchanged, like every other ``build_*`` step, so that
    enabling the call ``df = build_route_features(df, full)`` in the
    pipeline cannot replace the frame with ``None``.  ``FULL`` is accepted
    for signature parity with the other builders and is not used.
    """
    return df


@timer
def build_airport_feature(
    df, airports_path="/home/zhengxiang/FlightRank/data/airports.csv"
):
    """Attach airport metadata for the first segment of leg 0.

    Args:
        df: Frame with ``legs0_segments0_departureFrom_airport_iata`` and
            ``legs0_segments0_arrivalTo_airport_iata`` columns.
        airports_path: CSV with at least the columns ``IATA``,
            ``UTC_Offset_Hours``, ``Country_CodeA2``, ``GeoPointLat`` and
            ``GeoPointLong``.  Defaults to the path the pipeline was
            developed with.

    Returns:
        ``df`` with ``departure_airport_*`` and ``arrival_airport_*``
        columns for the four metadata fields, plus ``is_cross_country``
        (Int8) and ``timezone_diff`` (departure offset minus arrival
        offset, signed).
    """
    info_cols = [
        "UTC_Offset_Hours",
        "Country_CodeA2",
        "GeoPointLat",
        "GeoPointLong",
    ]

    # Keep exactly one row per IATA code: a left join against a lookup table
    # with repeated keys would duplicate rows of df, and other feature
    # builders slice df by absolute row position.
    airport = (
        pl.read_csv(airports_path)
        .select(["IATA"] + info_cols)
        .unique(subset=["IATA"], keep="first", maintain_order=True)
    )

    # Same columns for both ends of the segment, prefixed by their role.
    departure_airport_info = airport.rename(
        {col: f"departure_airport_{col}" for col in info_cols}
    )
    arrival_airport_info = airport.rename(
        {col: f"arrival_airport_{col}" for col in info_cols}
    )

    # Join the departure-airport information.
    df = df.join(
        departure_airport_info,
        left_on="legs0_segments0_departureFrom_airport_iata",
        right_on="IATA",
        how="left",
    )

    # Join the arrival-airport information.
    df = df.join(
        arrival_airport_info,
        left_on="legs0_segments0_arrivalTo_airport_iata",
        right_on="IATA",
        how="left",
    )

    def haversine_distance(row):
        # Only referenced by the disabled geo_distance_km expression below.
        vals = list(row.values())
        if any(v is None for v in vals):
            return None
        lat1, lon1, lat2, lon2 = map(float, vals)
        return haversine((lat1, lon1), (lat2, lon2))

    df = df.with_columns(
        [
            # Whether departure and arrival are in different countries.
            (
                pl.col("departure_airport_Country_CodeA2")
                != pl.col("arrival_airport_Country_CodeA2")
            )
            .cast(pl.Int8)
            .alias("is_cross_country"),
            # Time-zone difference, signed (departure minus arrival).  The
            # original comment said "absolute value", but no abs() is applied.
            (
                pl.col("departure_airport_UTC_Offset_Hours")
                - pl.col("arrival_airport_UTC_Offset_Hours")
            ).alias("timezone_diff"),
            # # Great-circle distance between the two airports (haversine)
            # pl.struct(
            #     [
            #         pl.col("departure_airport_GeoPointLat"),
            #         pl.col("departure_airport_GeoPointLong"),
            #         pl.col("arrival_airport_GeoPointLat"),
            #         pl.col("arrival_airport_GeoPointLong"),
            #     ]
            # )
            # .map_elements(haversine_distance)
            # .alias("geo_distance_km"),
        ]
    )

    return df


@timer
def build_corporateTariffCode_feature(df, FULL):
    """Add per-``corporateTariffCode`` statistics from selected train rows.

    The first ``train_size`` rows of ``df`` (16487352, or 18145372 when
    ``FULL``) are treated as the training part; only rows with
    ``selected == 1`` are aggregated.  Added columns:

    * ``corporateTariffCode_hotness``: number of selected rows with that code.
    * ``corporateTariffCode_price_rank_mean`` and
      ``corporateTariffCode_duration_rank_mean``: mean quantile ranks of
      those rows.

    Rows left without statistics after the join get hotness ``0`` and the
    mean ranks of selected training rows whose tariff code is null.
    """
    n_train_rows = 18145372 if FULL else 16487352
    chosen = df.slice(0, n_train_rows).filter(pl.col("selected") == 1)

    per_code = chosen.group_by("corporateTariffCode").agg(
        [
            pl.count().alias("corporateTariffCode_hotness"),
            pl.col("price_quantile_rank")
            .mean()
            .alias("corporateTariffCode_price_rank_mean"),
            pl.col("duration_quantile_rank")
            .mean()
            .alias("corporateTariffCode_duration_rank_mean"),
        ]
    )

    # Fallback values: averages over selected rows that carry no tariff code.
    untagged = chosen.filter(pl.col("corporateTariffCode").is_null())
    fallbacks = {
        "corporateTariffCode_hotness": 0,
        "corporateTariffCode_price_rank_mean": untagged["price_quantile_rank"].mean(),
        "corporateTariffCode_duration_rank_mean": untagged[
            "duration_quantile_rank"
        ].mean(),
    }

    enriched = df.join(per_code, on="corporateTariffCode", how="left")
    return enriched.with_columns(
        [pl.col(name).fill_null(value) for name, value in fallbacks.items()]
    )


@timer
def handle_features_with_extreme(df):
    """Tame heavy-tailed columns with log transforms and winsorization.

    * ``log_taxes`` is added as ``log(taxes + 1e-3)``.
    * ``price_per_tax`` is clamped to its 1st and 99th percentiles, computed
      over all rows of ``df``.
    * The duration columns and ``hours_to_departure`` are replaced in place
      by their ``log1p``.

    The log1p transforms of the carrier-diversity and route-hotness columns
    were commented out in the original and are likewise not applied here.
    """
    # Taxes
    df = df.with_columns((pl.col("taxes") + 1e-3).log().alias("log_taxes"))

    # Winsorization of price per tax
    ratio = pl.col("price_per_tax")
    low = df.select(ratio.quantile(0.01)).item()
    high = df.select(ratio.quantile(0.99)).item()
    winsorized = (
        pl.when(ratio < low)
        .then(low)
        .when(ratio > high)
        .then(high)
        .otherwise(ratio)
    )
    df = df.with_columns(winsorized.alias("price_per_tax"))

    # Columns overwritten with log1p of themselves.
    log1p_columns = (
        "legs0_duration",
        "legs1_duration",
        "legs0_segments0_duration",
        "legs0_segments1_duration",
        "legs1_segments0_duration",
        "legs1_segments1_duration",
        "total_duration",
        "hours_to_departure",
    )
    return df.with_columns(
        [pl.col(name).log1p().alias(name) for name in log1p_columns]
    )


@timer
def feature_engineering(data_raw, full):
    """Run the whole feature pipeline on a copy of ``data_raw``.

    ``full`` is forwarded to every builder that derives statistics from the
    training part of the frame.  The steps run in a fixed order because
    later builders read columns created by earlier ones (e.g. the rank
    features feed the company, carrier and profile statistics).
    """
    frame = data_raw.clone()

    # Duration columns: legs 0/1 plus up to four segments per leg.  Only the
    # ones present in the frame are converted with dur_to_min.
    duration_cols = ["legs0_duration", "legs1_duration"]
    for leg in (0, 1):
        for seg in range(4):
            duration_cols.append(f"legs{leg}_segments{seg}_duration")
    duration_exprs = [
        dur_to_min(pl.col(name)).alias(name)
        for name in duration_cols
        if name in frame.columns
    ]
    if duration_exprs:
        frame = frame.with_columns(duration_exprs)

    # Initial transformations, then segment-level features.
    frame = initial_transformations(frame, full)
    frame = build_segment_features(frame)

    # Derived flags and the size of each search group.
    both_legs_direct = pl.col("is_direct_leg0") & pl.col("is_direct_leg1")
    vip_or_frequent = (pl.col("isVip") == 1) | (pl.col("n_ff_programs") > 0)
    rows_in_group = pl.col("Id").count().over("ranker_id")
    frame = frame.with_columns(
        [
            both_legs_direct.cast(pl.Int32).alias("both_direct"),
            vip_or_frequent.cast(pl.Int32).alias("is_vip_freq"),
            rows_in_group.alias("group_size"),
        ]
    )

    # Missing baggage information is encoded as -1.
    baggage_cols = (
        "legs0_segments0_baggageAllowance_weightMeasurementType",
        "legs0_segments0_baggageAllowance_quantity",
    )
    frame = frame.with_columns([pl.col(name).fill_null(-1) for name in baggage_cols])

    frame = frame.with_columns(pl.col("group_size").log1p().alias("group_size_log"))

    # Time, rank, price and cabin-class features.
    frame = build_time_features(frame)
    frame = build_rank_features(frame)
    frame = build_price_features(frame)
    frame = build_cabin_class_features(frame)

    # Company features
    frame = build_company_features(frame, full)

    # Flight features
    # NOTE: The flight features are not good online
    frame = build_flight_features(frame, full)

    # Carrier features; the return leg uses its own rank-bin edges.
    frame = build_carrier_features(frame, "legs0", full)
    frame = build_carrier_features(frame, "legs1", full, [1, 2, 3, 5, 10, 24])

    # Frequent-flyer features
    major_carriers = ["SU", "S7"]
    frame = build_frequent_flyer_features(frame, major_carriers)

    # Profile features
    frame = build_profile_features(frame, full)

    # Route features are currently disabled (build_route_features is not called).

    # Airport features
    frame = build_airport_feature(frame)

    # corporateTariffCode features
    frame = build_corporateTariffCode_feature(frame, full)

    # Handle heavy-tailed features last, once every raw value has been used.
    frame = handle_features_with_extreme(frame)

    return frame


###########################################################
#                      Feature Selection                  #
###########################################################


def feature_selection(data, trial=None):
    """Pick the model features from ``data`` and split them by kind.

    Candidate features are the categorical and numerical names listed below
    that actually exist in ``data``.  When ``trial`` is given, each
    candidate is additionally kept only if
    ``trial.suggest_categorical(name, [True, False])`` returns a truthy
    value.  Finally every name on the exclusion list is removed.

    Returns:
        ``(X, y, groups, cat_features_final, num_features_final)`` where
        ``X`` is ``data`` restricted to the selected features, ``y`` the
        ``selected`` column, ``groups`` the ``ranker_id`` column and the
        last two are the selected feature names by kind.
    """
    # Categorical features
    cat_features = [
        # Original features
        "bySelf",
        "companyID",
        "corporateTariffCode",
        "nationality",
        "isAccess3D",
        "isVip",
        # legs0_segments0
        "legs0_segments0_aircraft_code",
        "legs0_segments0_arrivalTo_airport_city_iata",
        "legs0_segments0_arrivalTo_airport_iata",
        "legs0_segments0_baggageAllowance_weightMeasurementType",
        "legs0_segments0_departureFrom_airport_iata",
        "legs0_segments0_marketingCarrier_code",
        "legs0_segments0_operatingCarrier_code",
        # initial transformations
        "is_top3_shortest_duration",
        "miniRules0_statusInfos",
        "miniRules1_statusInfos",
        "pricingInfo_isAccessTP",
        "sex",
        "is_one_way",
        "has_corporate_tariff",
        "corporate_policy_compliant",
        "corporate_vip_flag",
        "free_cancel",
        "free_exchange",
        "is_popular_route",
        "fee_ratio_rule0_is_missing",
        "fee_ratio_rule1_is_missing",
        "contains_capitials",
        "is_popular_flight",
        "is_codeshare_leg0_seg0",
        "outbound_origin",
        "outbound_destination",
        "return_origin",
        "return_destination",
        "outbound_route",
        "return_route",
        "is_exact_round_trip",
        # segment features
        "is_direct_leg0",
        "is_direct_leg1",
        "is_direct_shortest",
        "both_direct",
        "is_min_segments_leg0",
        "is_min_segments_leg1",
        "is_vip_freq",
        # time features
        "legs0_departureAt_hour",
        "legs0_departureAt_weekday",
        "legs0_departureAt_business_time",
        "legs0_departureAt_time_bin",
        "legs0_arrivalAt_hour",
        "legs0_arrivalAt_weekday",
        "legs0_arrivalAt_business_time",
        "legs0_arrivalAt_time_bin",
        "legs1_departureAt_hour",
        "legs1_departureAt_weekday",
        "legs1_departureAt_business_time",
        "legs1_departureAt_time_bin",
        "legs1_arrivalAt_hour",
        "legs1_arrivalAt_weekday",
        "legs1_arrivalAt_business_time",
        "legs1_arrivalAt_time_bin",
        "is_short_trip",
        "stay_duration_bin",
        "is_booking_today",
        "is_booking_3days",
        "is_booking_week",
        "is_departure_in_past",
        "is_top3_cheapest",
        "is_min_segments",
        "is_direct_cheapest",
        "legs0_dep_arr_bin_combo",
        "legs0_is_business_friendly",
        "legs0_is_red_eye",
        # cabin class features
        "legs0_segments0_cabinClass_is_1",
        "legs0_segments0_cabinClass_is_2",
        "legs0_segments0_cabinClass_is_3",
        "legs0_segments0_cabinClass_is_4",
        "legs1_segments0_cabinClass_is_1",
        "legs1_segments0_cabinClass_is_2",
        "legs1_segments0_cabinClass_is_3",
        "legs1_segments0_cabinClass_is_4",
        "has_business_class",
        # "cabin_class_highest",
        # "cabin_class_lowest",
        # "cabin_class_diversity",
        "all_cabin_level_1",
        # company features
        "is_very_popular_company",
        "is_popular_company",
        "is_top_selected_company",
        # flight features
        "legs0_segments0_flightNumber_popularity_bin",
        # carrier features
        "legs0_segments0_marketingCarrier_code_selected_rank_bin",
        "legs1_segments0_marketingCarrier_code_selected_rank_bin",
        # frequent-flyer features
        "legs0_segments0_marketingCarrier_code_in_frequentFlyer",
        "legs0_segments0_marketingCarrier_code_is_only_frequentFlyer",
        "is_major_carrier_0_0",
        "is_operate_carrier_0_0",
        # "legs0_segments0_marketingCarrier_code_ff_and_economic",
        # profile features
        "user_in_train",
        # airport features
        "departure_airport_Country_CodeA2",
        "arrival_airport_Country_CodeA2",
        "is_cross_country",
        # price features
        "is_cheaper_than_avg",
        "is_expensive_outlier",
    ]

    # Numerical features
    num_features = [
        "legs0_duration",
        "legs0_segments0_baggageAllowance_quantity",
        "legs0_segments0_seatsAvailable",
        "legs0_segments0_duration",
        "legs0_segments1_duration",
        "legs1_duration",
        "legs1_segments0_duration",
        "legs1_segments1_duration",
        "taxes",
        "log_taxes",
        "totalPrice",
        "price_per_tax",
        "tax_rate",
        "log_price",
        "total_duration",
        "duration_ratio",
        "l0_seg",
        "n_ff_programs",
        "fee_ratio_rule0",
        "fee_ratio_rule1",
        "n_segments_leg0",
        "n_segments_leg1",
        "group_size",
        "group_size_log",
        "stay_duration_hours",
        "stay_duration_hours_log",
        "hours_to_departure",
        "days_to_departure",
        # rank features
        "price_quantile_rank",
        "duration_quantile_rank",
        "leg0_duration_quantile_rank",
        "leg1_duration_quantile_rank",
        "rank_interaction_mul",
        "rank_interaction_ratio",
        "rank_interaction_sum",
        "rank_interaction_sub",
        "leg_dur_interaction_mul",
        "leg_dur_interaction_ratio",
        "leg_dur_interaction_sub",
        # price features
        "price_zscore_from_median",
        "price_relative_to_min",
        # company features
        "cabin_class_level_sum_3",
        "company_select_count",
        "avg_selected_price",
        "std_selected_price",
        "selected_direct_ratio",
        "selected_night_ratio",
        "log_company_select_count",
        "z_price_vs_company_selected",
        "company_avg_pct",
        "company_avg_dct",
        # "company_carrier_select_ratio",
        "legs0_segments0_marketingCarrier_code_selection_rate",
        "legs0_segments0_marketingCarrier_code_log_total_count",
        "legs0_segments0_marketingCarrier_code_log_selected_count",
        "legs1_segments0_marketingCarrier_code_selection_rate",
        "legs1_segments0_marketingCarrier_code_log_total_count",
        "legs1_segments0_marketingCarrier_code_log_selected_count",
        "user_selected_count",
        "user_avg_cabinClass_leg0_seg0",
        "user_selected_count_log",
        "price_rank_diff",
        "duration_rank_diff",
        # flight features
        "legs0_segments0_marketingCarrier_code_avg_price_rank",
        "legs0_segments0_marketingCarrier_code_avg_duration_rank",
        "legs0_segments0_marketingCarrier_code_avg_dep_hour",
        "legs0_segments0_marketingCarrier_code_night_ratio",
        "legs0_segments0_marketingCarrier_code_company_diversity",
        "legs0_segments0_marketingCarrier_code_user_diversity",
        "legs0_segments0_marketingCarrier_code_selection_rate_right",
        "legs0_segments0_marketingCarrier_code_log_total_count_right",
        "legs0_segments0_marketingCarrier_code_cabin1_select_ratio",
        "legs0_segments0_marketingCarrier_code_cabin2_select_ratio",
        "legs0_segments0_marketingCarrier_code_cabin3_select_ratio",
        "legs0_segments0_marketingCarrier_code_cabin4_select_ratio",
        # route hotness
        "outbound_route_hotness",
        "return_route_hotness",
        # airport features
        "departure_airport_UTC_Offset_Hours",
        "departure_airport_GeoPointLat",
        "departure_airport_GeoPointLong",
        "arrival_airport_UTC_Offset_Hours",
        "arrival_airport_GeoPointLat",
        "arrival_airport_GeoPointLong",
        "timezone_diff",
        "geo_distance_km",
        # corporateTariffCode features
        "corporateTariffCode_hotness",
        "corporateTariffCode_price_rank_mean",
        "corporateTariffCode_duration_rank_mean",
    ]

    # Columns to exclude (uninformative or problematic)
    exclude_cols = [
        "Id",
        "ranker_id",
        "profileId",
        "selected",
        "requestDate",
        "legs0_departureAt",
        "legs0_arrivalAt",
        "legs1_departureAt",
        "legs1_arrivalAt",
        "miniRules0_percentage",
        "miniRules1_percentage",  # >90% missing
        "frequentFlyer",  # Already processed
        "pricingInfo_passengerCount",  # Exclude constant columns
        # "pricingInfo_isAccessTP",
        # "total_fees",
        "miniRules0_monetaryAmount",
        "miniRules1_monetaryAmount",
        # company features
        "is_very_popular_company",
        "is_popular_company",
        "is_top_selected_company",
        # "avg_selected_price",
        # "std_selected_price",
        # "selected_direct_ratio",
        # "selected_night_ratio",
        "company_select_count",
        # "log_company_select_count",
        # "z_price_vs_company_selected",
        # price features
        "totalPrice",
        "group_size",
        "taxes",
        # "log_taxes",
        "price_per_tax",
        # rank features
        "rank_interaction_ratio",
        "is_min_segments",
        "l0_seg",
        # analysis of validation set
        "is_top3_shortest_duration",
        # "is_direct_shortest",
        # "rank_interaction_mul",
        # "rank_interaction_sum",
        # "rank_interaction_sub",
        # profile features
        "price_rank_diff",
        "duration_rank_diff",
        "user_avg_cabinClass_leg0_seg0",
        "user_in_train",
        "user_selected_count",
        # time features
        # "legs0_departureAt_business_time",
        # "legs0_arrivalAt_business_time",
        # "legs1_departureAt_business_time",
        # "legs1_arrivalAt_business_time",
        "stay_duration_hours",
        # max bin (for lightgbm)
        "companyID",
        # "legs0_segments0_arrivalTo_airport_city_iata",
        # "legs0_segments0_arrivalTo_airport_iata",
        # "legs0_segments0_departureFrom_airport_iata",
        "outbound_route",
        "return_route",
        # carrier code
        "legs0_segments0_marketingCarrier_code_selected_rank_bin",
        "legs1_segments0_marketingCarrier_code_selected_rank_bin",
        # "legs0_segments0_marketingCarrier_code_log_selected_count",
        # "legs1_segments0_marketingCarrier_code_log_selected_count",
        # "legs0_segments0_marketingCarrier_code_log_total_count",
        # "legs1_segments0_marketingCarrier_code_log_total_count",
        # duplicate cols
        "legs0_segments0_marketingCarrier_code_selection_rate_right",
        "legs0_segments0_marketingCarrier_code_log_selected_count_right",
        "legs0_segments0_marketingCarrier_code_log_total_count_right",
        "legs0_segments0_marketingCarrier_code_avg_dep_hour",
        # 20250715
        # "duration_ratio",
        # "legs0_segments0_marketingCarrier_code_selection_rate",
        # "legs1_segments0_marketingCarrier_code_selection_rate",
        # "price_from_median",
        # "selected_night_ratio",
        # "total_duration",
        "cabin_class_level_sum_3",
        # "legs0_is_business_friendly",
        "days_to_departure",
        "legs0_segments0_marketingCarrier_code_ff_and_economic",
        # 20250716
        # "leg_dur_interaction_mul",
        # "leg_dur_interaction_ratio",
        "legs0_arrivalAt_time_bin",
        "legs0_departureAt_time_bin",
        "legs1_arrivalAt_time_bin",
        "legs1_departureAt_time_bin",
        "legs0_dep_arr_bin_combo",
        "legs1_dep_arr_bin_combo",
        "stay_duration_bin",
        "departure_airport_GeoPointLat",
        "departure_airport_GeoPointLong",
        "arrival_airport_GeoPointLat",
        "arrival_airport_GeoPointLong",
        "legs0_segments1_duration",
        "legs1_segments1_duration",
        # 0.53344
        "legs0_segments0_marketingCarrier_code_selection_rate",
        "legs1_segments0_marketingCarrier_code_selection_rate",
        # "rank_interaction_sum",
        # "corporateTariffCode_hotness",
        "corporateTariffCode_price_rank_mean",
        "corporateTariffCode_duration_rank_mean",
        "geo_distance_km",
        "is_popular_flight",
        # 0.53817
    ]

    # Raw segment columns with a large missing ratio.  For leg 0 / segment 0
    # only cabinClass and flightNumber are dropped; every other
    # leg/segment combination among segments 0-1 loses the full list.
    sparse_segment_suffixes = [
        "cabinClass",
        "seatsAvailable",
        "baggageAllowance_quantity",
        "baggageAllowance_weightMeasurementType",
        "aircraft_code",
        "arrivalTo_airport_city_iata",
        "arrivalTo_airport_iata",
        "departureFrom_airport_iata",
        "flightNumber",
        "marketingCarrier_code",
        "operatingCarrier_code",
    ]
    for leg in (0, 1):
        for seg in (0, 1):
            if (leg, seg) == (0, 0):
                suffixes = ["cabinClass", "flightNumber"]
            else:
                suffixes = sparse_segment_suffixes
            exclude_cols.extend(
                f"legs{leg}_segments{seg}_{suffix}" for suffix in suffixes
            )

    # Segment 2-3 columns (>98% missing) are excluded entirely.
    far_segment_suffixes = [
        "aircraft_code",
        "arrivalTo_airport_city_iata",
        "arrivalTo_airport_iata",
        "baggageAllowance_quantity",
        "baggageAllowance_weightMeasurementType",
        "cabinClass",
        "departureFrom_airport_iata",
        "duration",
        "flightNumber",
        "marketingCarrier_code",
        "operatingCarrier_code",
        "seatsAvailable",
    ]
    exclude_cols.extend(
        f"legs{leg}_segments{seg}_{suffix}"
        for leg in (0, 1)
        for seg in (2, 3)
        for suffix in far_segment_suffixes
    )

    # Candidates: listed features that exist in the data, categorical first.
    present = set(data.columns)
    candidates = [name for name in cat_features + num_features if name in present]

    # Optional per-feature on/off switch; the trial is asked about every
    # candidate, including the ones removed by the exclusion list afterwards.
    if trial:
        candidates = [
            name
            for name in candidates
            if trial.suggest_categorical(name, [True, False])
        ]

    excluded = set(exclude_cols)
    feature_cols = [name for name in candidates if name not in excluded]

    selected = set(feature_cols)
    cat_features_final = [name for name in cat_features if name in selected]
    num_features_final = [name for name in num_features if name in selected]

    # Sanity report: selected features that belong to neither kind.
    diff = selected - set(cat_features_final) - set(num_features_final)
    print(
        "Features in feature_cols but not in cat_features_final or num_features_final:",
        diff,
    )

    summary = f"Using {len(feature_cols)} features ({len(cat_features_final)} categorical, {len(num_features_final)} numerical)"

    # Fails if a feature is counted twice (e.g. listed under both kinds).
    assert len(cat_features_final) + len(num_features_final) == len(
        feature_cols
    ), summary

    print(summary)

    X = data.select(feature_cols)
    y = data.select("selected")
    groups = data.select("ranker_id")

    return X, y, groups, cat_features_final, num_features_final

ImportError: attempted relative import with no known parent package

# **Trial**

In [None]:
%%writefile train_pipeline.py
import os, time, pytz, datetime
import optuna
import numpy as np
import pandas as pd
import polars as pl
import xgboost as xgb
import lightgbm as lgb
from src.utils import fill_missing, evaluate_hitrate_at_3, make_submission
from src.feature import feature_engineering, feature_selection
from src.plot import plot_hitrate_at_k, plot_ndcg_curve
from src.data import split_dataset
from src.params import get_hyper_params

# ---------------------------------------------------------------------------
# Run configuration
#   FULL  - True: train on `dfull`, save the model and write a submission.
#           False: train on `dtrain`, evaluate on `dval` and print metrics.
#   MODEL - selects the training branch below ("xgboost" or "lightgbm").
#   DEBUG - True: load only the training parquet (no test set is read).
# ---------------------------------------------------------------------------
FULL = True
MODEL = "xgboost"
DEBUG = False
DATA_DIR = "./data"
# NOTE(review): RANDOM_STATE is not referenced anywhere below; the sampling
# calls hard-code seed=42 instead.
RANDOM_STATE = 42
tz = pytz.timezone("Europe/Bucharest")
# Timestamp used to name the saved model and the submission file.
TIME_TAG = datetime.datetime.now(tz).strftime("%Y%m%d%H%M%S")
print("Time Tag:", TIME_TAG)

MODEL_DIR = "model"
SUBMIT_DIR = "submission"
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(SUBMIT_DIR, exist_ok=True)

if not DEBUG:
    train_full = pl.read_parquet(f"{DATA_DIR}/train.parquet").drop("__index_level_0__")

    # Keep every row of a random 50% of the ranker_id groups.
    sampled_ids = train_full["ranker_id"].unique().sample(fraction=0.5, seed=42)

    train = train_full.filter(pl.col("ranker_id").is_in(sampled_ids))

    # The test set gets a dummy all-zero `selected` column so it can be
    # concatenated with the training rows.
    test = pl.read_parquet(f"{DATA_DIR}/test.parquet").drop("__index_level_0__").with_columns(
        pl.lit(0, dtype=pl.Int64).alias("selected")
    )

    df = pl.concat((train, test))
else:
    # DEBUG: 90% of the training groups, no test rows.
    # NOTE(review): `test` is never defined on this path, yet it is passed to
    # make_submission() at the bottom whenever FULL is True.
    train_full = pl.read_parquet(f"{DATA_DIR}/train.parquet").drop("__index_level_0__")
    sampled_ids = train_full["ranker_id"].unique().sample(fraction=0.9, seed=42)
    df = train_full.filter(pl.col("ranker_id").is_in(sampled_ids))
    train = df
print("✅ Successfully loaded parquet files!")

df = feature_engineering(df)
print("✅ Feature engineering finished!")

# Report, for every column that has nulls, the null count and the dtype,
# sorted from most to fewest nulls.
schema = df.schema
dtype_df = pl.DataFrame({"column": list(schema.keys()), "dtype": [str(v) for v in schema.values()]})
na_infos = (
    df.select([pl.col(col).is_null().sum().alias(col) for col in df.columns])
    .melt(variable_name="column", value_name="null_count")
    .filter(pl.col("null_count") > 0)
    .join(dtype_df, on="column", how="left")
    .sort("null_count", descending=True)
)
print(na_infos)

# Fill remaining nulls: 0 for numeric columns, "missing" for string columns.
df = df.with_columns(
    [pl.col(c).fill_null(0) for c in df.select(pl.selectors.numeric()).columns] +
    [pl.col(c).fill_null("missing") for c in df.select(pl.selectors.string()).columns]
)

# NOTE(review): four values are unpacked here, while the other scripts in this
# notebook unpack five values from feature_selection(); confirm which arity the
# installed src.feature.feature_selection actually returns.
X, y, groups, cat_features_final = feature_selection(df)
print(X.columns)

dtrain, dval, dtest, dfull, y_va, groups_va = split_dataset(train, X, y, groups, cat_features_final, MODEL)
best_params = get_hyper_params(MODEL)

print(f"🚀 Training final {MODEL} model...")
if MODEL == "xgboost":
    evals_result = {}
    model = xgb.train(
        best_params,
        dfull if FULL else dtrain,
        num_boost_round=2000,
        evals=[(dfull, "train")] if FULL else [(dtrain, "train"), (dval, "val")],
        verbose_eval=50,
        maximize=True,
        evals_result=evals_result,
    )
elif MODEL == "lightgbm":
    model = lgb.train(
        best_params,
        dfull if FULL else dtrain,
        num_boost_round=750,
        valid_sets=[dfull] if FULL else [dtrain, dval],
        # The dict handed to record_evaluation is not kept, so the recorded
        # history cannot be read back afterwards.
        callbacks=[lgb.log_evaluation(10), lgb.callback.record_evaluation({})],
    )

if FULL:
    model_path = os.path.join(MODEL_DIR, f"{MODEL}_{TIME_TAG}.json")
    model.save_model(model_path)
    print(f"✅ Model saved to: {model_path}")

if not FULL:
    va_preds = evaluate_hitrate_at_3(dval, y_va, groups_va, model)

    # NOTE(review): get_score(importance_type=...) is the xgboost Booster API;
    # confirm this branch when MODEL == "lightgbm".
    importance_df = (
        pl.DataFrame(
            [{"feature": k, "importance": v} for k, v in model.get_score(importance_type="gain").items()]
        ).sort("importance", descending=True)
    )
    print(importance_df.to_pandas().to_string())

    # `curves[...]` is indexed by k-1, i.e. position 0 is HitRate@1.
    curves = plot_hitrate_at_k(groups_va, va_preds, y_va)
    print(f"\n📌 HitRate@1: {curves['All groups (>10)'][0]:.3f}")
    print(f"📌 HitRate@3: {curves['All groups (>10)'][2]:.3f}")
    print(f"📌 HitRate@5: {curves['All groups (>10)'][4]:.3f}")
    print(f"📌 HitRate@10: {curves['All groups (>10)'][9]:.3f}")

if FULL:
    submission_path = os.path.join(SUBMIT_DIR, f"submission_{TIME_TAG}.csv")
    make_submission(test, dtest, model, submission_path)
    print(f"✅ Submission file saved to: {submission_path}")

Overwriting train_pipeline.py


**DLRanker v1**

In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter

class RankDataset(Dataset):
    """Group-wise ranking dataset: one item is every row sharing a ``ranker_id``.

    All columns are converted to NumPy arrays once in ``__init__``;
    ``__getitem__`` then gathers the rows of a single group and returns them
    as tensors.  Groups are numbered in order of first appearance in
    ``groups``.
    """

    def __init__(
        self,
        X: "pl.DataFrame",
        y: "pl.Series",
        groups: "pl.DataFrame",
        cat_features: list[str],
        num_features: list[str],
    ):
        """Index the rows of ``X`` by group.

        Args:
            X: Feature table containing every name in ``cat_features`` and
                ``num_features``.
            y: Per-row labels (any object exposing ``to_numpy()``).
            groups: Table with a ``ranker_id`` column giving each row's group.
            cat_features: Names of the integer-coded categorical columns.
            num_features: Names of the numeric columns.
        """
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # group id -> row positions, in row order; dict order = first appearance.
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of groups (not rows)."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        ``x_cat`` maps each categorical feature to an int64 tensor of shape
        ``(length,)`` in which negative codes are replaced by 0; ``x_num`` is
        a float32 tensor of shape ``(length, len(num_features))``; ``y`` is a
        flat float32 tensor of the group's labels.
        """
        inds = self.group_to_indices[self.group_keys[idx]]

        x_cat = {}
        for c in self.cat_features:
            codes = self.X_cat[c][inds]
            # Vectorised clamp: negative codes map to index 0.
            clamped = np.where(codes >= 0, codes, 0).astype(np.int64)
            x_cat[c] = torch.from_numpy(clamped)

        x_num = torch.as_tensor(self.X_num[inds], dtype=torch.float32)
        y = torch.as_tensor(self.y[inds], dtype=torch.float32).reshape(-1)

        return x_cat, x_num, y, len(inds)

def collate_fn(batch):
    """Zero-pad a list of variable-length groups and stack them into a batch.

    Each element of ``batch`` is ``(x_cat, x_num, y, length)``.  Every group
    is padded at the end to the longest group in the batch.

    Returns:
        ``(x_cat, x_num, y, lengths)`` where ``x_cat`` maps feature name to a
        ``(batch, max_len)`` long tensor, ``x_num`` is
        ``(batch, max_len, num_dim)``, ``y`` is ``(batch, max_len)`` and
        ``lengths`` is a long tensor holding the unpadded group sizes.
    """
    group_sizes = [item[3] for item in batch]
    target_len = max(group_sizes)
    feat_width = batch[0][1].shape[1]

    cat_columns = {name: [] for name in batch[0][0].keys()}
    num_rows = []
    label_rows = []

    for cats, nums, targets, size in batch:
        deficit = target_len - size

        if deficit > 0:
            # Append `deficit` zero rows to every tensor of this group.
            cats = {
                name: torch.cat([cats[name], torch.zeros(deficit, dtype=torch.long)])
                for name in cat_columns
            }
            nums = torch.cat([nums, torch.zeros(deficit, feat_width)])
            targets = torch.cat([targets, torch.zeros(deficit)])

        for name, bucket in cat_columns.items():
            bucket.append(cats[name])
        num_rows.append(nums)
        label_rows.append(targets)

    return (
        {name: torch.stack(bucket) for name, bucket in cat_columns.items()},
        torch.stack(num_rows),
        torch.stack(label_rows),
        torch.tensor(group_sizes, dtype=torch.long),
    )

class DLRanker(nn.Module):
    """Scores every item of a group as ``linear term + MLP term``.

    * Linear term: a 1-d embedding per categorical feature plus a linear layer
      over the numeric features.
    * MLP term: the per-feature embeddings concatenated with the numeric
      features, fed through ``Linear -> BatchNorm1d -> ReLU -> Dropout(0.1)``
      stacks and a final linear layer.

    The second-order FM interaction term is currently disabled, so it is not
    computed at all.

    Args:
        cat_dims: Mapping ``feature name -> number of embedding rows``.
        num_numeric_feats: Width of the numeric feature vector.
        emb_dim: Embedding size used for every categorical feature.
        hidden: Sizes of the MLP hidden layers, in order.
    """

    def __init__(self, cat_dims, num_numeric_feats, emb_dim=64, hidden=(512, 256, 128, 64)):
        super().__init__()

        self.embeddings = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], emb_dim, padding_idx=0) for f in cat_dims
        })
        self.linear_cat = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], 1, padding_idx=0) for f in cat_dims
        })

        for emb in [*self.embeddings.values(), *self.linear_cat.values()]:
            nn.init.xavier_uniform_(emb.weight)
            # xavier_uniform_ re-initialises the whole table, including the
            # padding row that nn.Embedding had set to zero. Restore it so the
            # padding index contributes nothing.
            with torch.no_grad():
                emb.weight[emb.padding_idx].zero_()

        self.linear_num = nn.Linear(num_numeric_feats, 1)
        nn.init.xavier_uniform_(self.linear_num.weight)

        input_dim = emb_dim * len(cat_dims) + num_numeric_feats
        layers = []
        for h in hidden:
            layers.append(nn.Linear(input_dim, h))
            layers.append(nn.BatchNorm1d(h))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.1))
            input_dim = h
        self.mlp = nn.Sequential(*layers)
        self.output = nn.Linear(input_dim, 1)

    def forward(self, x_cat, x_num):
        """Return a ``(batch, group)`` tensor of scores.

        Args:
            x_cat: Mapping ``feature name -> (batch, group)`` long tensor.
            x_num: ``(batch, group, num_numeric_feats)`` float tensor.
        """
        batch_size, group_size = x_num.shape[:2]

        # Linear term.
        linear_out = self.linear_num(x_num).squeeze(-1)
        for f, table in self.linear_cat.items():
            linear_out = linear_out + table(x_cat[f]).squeeze(-1)

        # Deep term: the BatchNorm1d layers need a 2-d input, so the batch and
        # group axes are flattened and restored afterwards.
        deep_parts = [self.embeddings[f](x_cat[f]) for f in self.embeddings]
        deep_parts.append(x_num)
        deep_input = torch.cat(deep_parts, dim=-1).reshape(batch_size * group_size, -1)
        deep_out = self.output(self.mlp(deep_input)).view(batch_size, group_size)

        return linear_out + deep_out


"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Fraction of eligible groups with a positive label among the top-k scores.

    Only the first ``lengths[i]`` entries of row ``i`` are considered, and
    groups shorter than ``min_group_size`` are ignored.  Returns a 0-d tensor;
    it is 0.0 when no group is eligible.
    """
    eligible_rows = [
        row for row in range(scores.size(0)) if not lengths[row] < min_group_size
    ]
    if not eligible_rows:
        return torch.tensor(0.0, device=scores.device)

    hit_total = 0
    for row in eligible_rows:
        size = lengths[row]
        depth = min(size, k)  # a group may hold fewer than k items
        _, best = torch.topk(scores[row][:size], depth)
        hit_total += (labels[row][:size][best] > 0).any().float()

    return hit_total / len(eligible_rows)


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@k over groups with at least ``min_group_size`` items.

    Gains are the raw label values and the discount at rank ``r`` (1-based)
    is ``log2(r + 1)``.  A group whose ideal DCG is not positive contributes
    0.  Returns a 0-d tensor of 0.0 when no group is eligible.
    """
    ndcg_sum = 0.0
    counted = 0

    for row in range(scores.size(0)):
        size = lengths[row]
        if size < min_group_size:
            continue

        row_scores = scores[row][:size]
        row_labels = labels[row][:size]
        depth = min(k, size)

        predicted = torch.topk(row_scores, depth).indices
        ideal = torch.topk(row_labels, depth).indices

        # Both rankings have the same length, so one discount vector serves both.
        discount = torch.log2(
            torch.arange(2, 2 + len(predicted), device=row_labels.device).float()
        )
        dcg = (row_labels[predicted] / discount).sum()
        idcg = (row_labels[ideal] / discount).sum()

        ndcg_sum += dcg / idcg if idcg > 0 else 0.0
        counted += 1

    if counted > 0:
        return ndcg_sum / counted
    return torch.tensor(0.0, device=scores.device)


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision at ``k`` over eligible groups.

    For each group with at least ``min_group_size`` items, the top-``k`` items
    by score are inspected; the average precision is the mean of
    ``precision@j`` over the ranks ``j`` that hold a positive label.  A group
    with no positive label in its top-``k`` has an average precision of 0 and
    still counts towards the mean (previously such groups were dropped, which
    inflated the metric).

    Args:
        scores: ``(batch, max_len)`` predicted scores.
        labels: ``(batch, max_len)`` relevance labels; ``> 0`` means relevant.
        lengths: Number of real (unpadded) items in each row.
        k: Cut-off rank.
        min_group_size: Groups shorter than this are ignored.

    Returns:
        A 0-d tensor; 0.0 when no group is eligible.
    """
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        l = lengths[i]
        if l < min_group_size:
            continue

        s = scores[i][:l]
        y = labels[i][:l]

        _, idx_pred = torch.topk(s, min(k, l))
        hits = (y[idx_pred] > 0).float()
        n_hits = hits.sum()

        if n_hits > 0:
            ranks = torch.arange(1, len(hits) + 1, device=hits.device).float()
            # precision@j at every rank, kept only where rank j is a hit.
            precision_at_hits = hits.cumsum(0) / ranks * hits
            total_ap = total_ap + precision_at_hits.sum() / n_hits

        valid_count += 1

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ap / valid_count



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise RankNet loss, summed per group and averaged over the batch.

    For every ordered pair ``(a, b)`` inside a group with
    ``label[a] > label[b]`` the term ``-logsigmoid(score[a] - score[b])`` is
    added.  Only the first ``lengths[i]`` items of row ``i`` take part.
    """
    n_groups = scores.size(0)
    loss_sum = 0.0

    for row in range(n_groups):
        size = lengths[row]
        row_scores = scores[row][:size]
        row_labels = labels[row][:size]

        # Entry [a, b] is 1.0 where item a should rank above item b.
        should_outrank = (row_labels[:, None] > row_labels[None, :]).float()
        score_gap = row_scores[:, None] - row_scores[None, :]

        loss_sum += (-should_outrank * F.logsigmoid(score_gap)).sum()

    return loss_sum / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """
    Weighted pairwise logistic loss restricted to pairs touching the label top-K.

    A pair ``(a, b)`` counts when ``label[a] > label[b]`` and at least one of
    the two items is among the ``focus_topk`` highest labels of its group.
    Its weight is ``|label[a] - label[b]|`` and its loss is
    ``-logsigmoid((score[a] - score[b] - margin) / temperature)``.  The result
    is the weighted sum divided by the total weight over the whole batch.
    Groups with fewer than two items are skipped; when no pair qualifies, a
    zero tensor with ``requires_grad=True`` is returned.
    """
    loss_sum = 0.0
    weight_sum = 0

    for row in range(scores.size(0)):
        size = lengths[row]
        if size < 2:
            continue

        row_scores = scores[row][:size]
        row_labels = labels[row][:size]

        # Mark the items holding the highest ground-truth labels.
        in_focus = torch.zeros_like(row_labels, dtype=torch.bool)
        _, focus_idx = torch.topk(row_labels, min(focus_topk, size))
        in_focus[focus_idx] = True

        label_gap = row_labels.unsqueeze(1) - row_labels.unsqueeze(0)
        eligible = (label_gap > 0) & (in_focus.unsqueeze(1) | in_focus.unsqueeze(0))
        if eligible.sum() == 0:
            continue

        weights = (label_gap.abs() * eligible).float()

        score_gap = row_scores.unsqueeze(1) - row_scores.unsqueeze(0)
        if margin > 0.0:
            score_gap = score_gap - margin
        score_gap = score_gap / temperature

        loss_sum += (-F.logsigmoid(score_gap) * weights).sum()
        weight_sum += weights.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Each batch is scored by ``model`` and optimised with ``xranknet_loss``
    (temperature 1.0, margin 0.2, focus on the top 3 labels).  The scheduler
    is stepped once per batch, right after the optimiser.
    """
    model.train()
    running_loss = 0.0

    for cats, nums, targets, sizes in tqdm(loader, desc="Training", leave=False):
        nums = nums.to(device)
        targets = targets.to(device)
        # Accept either a tensor or a plain sequence of group sizes.
        sizes = sizes if isinstance(sizes, torch.Tensor) else torch.tensor(sizes)
        sizes = sizes.detach().clone().to(device)
        cats = {name: values.to(device) for name, values in cats.items()}

        optimizer.zero_grad()
        batch_loss = xranknet_loss(
            model(cats, nums), targets, sizes, temperature=1.0, margin=0.2, focus_topk=3
        )
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    return running_loss / len(loader)


def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    The loss is averaged over all groups.  HitRate@3, NDCG@3 and MAP@3 are
    each a per-batch mean over the groups with at least ``min_group_size``
    items, so they are weighted by the number of such groups in the batch
    rather than by the batch size.  Weighting by batch size dragged the
    metrics towards zero whenever a batch contained small groups.

    Args:
        model: Scoring model called as ``model(x_cat, x_num)``.
        loader: Iterable yielding ``(x_cat, x_num, y, lengths)`` batches.
        device: Device the batch tensors are moved to.
        min_group_size: Minimum group size for the ranking metrics.

    Returns:
        ``(loss, hitrate, ndcg, map)``: ``loss`` is a float, the three
        metrics are 0-d tensors.  All are 0 when nothing could be evaluated.
    """
    model.eval()
    hit_sum = torch.zeros((), device=device)
    ndcg_sum = torch.zeros((), device=device)
    map_sum = torch.zeros((), device=device)
    loss_sum = 0.0
    n_groups = 0
    n_eligible = 0

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):
            x_num, y = x_num.to(device), y.to(device)
            # Same handling as train_one_epoch: do not re-wrap an existing
            # tensor with torch.tensor().
            if not isinstance(lengths, torch.Tensor):
                lengths = torch.tensor(lengths)
            lengths = lengths.detach().clone().to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)
            batch_size = scores.size(0)

            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)
            loss_sum += loss.item() * batch_size
            n_groups += batch_size

            eligible = int((lengths >= min_group_size).sum())
            if eligible == 0:
                continue  # no group in this batch qualifies for the metrics

            hit_sum += hitrate_at_k(scores, y, lengths, k=3, min_group_size=min_group_size) * eligible
            ndcg_sum += ndcg_at_k(scores, y, lengths, k=3, min_group_size=min_group_size) * eligible
            map_sum += map_at_k(scores, y, lengths, k=3, min_group_size=min_group_size) * eligible
            n_eligible += eligible

    # max(..., 1) keeps an empty loader or a run with no eligible group from
    # dividing by zero.
    loss_den = max(n_groups, 1)
    metric_den = max(n_eligible, 1)
    return loss_sum / loss_den, hit_sum / metric_den, ndcg_sum / metric_den, map_sum / metric_den


def build_cat_dims(X, cat_features):
    """Return ``{feature: embedding table size}`` for each categorical column.

    The size is the column maximum plus one; a negative maximum is treated
    as 0, giving a size of 1.
    """
    return {name: max(X[name].max(), 0) + 1 for name in cat_features}


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardise each numeric column using statistics of the training rows.

    For every name in ``numeric_features`` the mean and standard deviation
    are computed over the rows selected by ``train_mask`` and applied to the
    whole column.  A standard deviation that is not positive is replaced by
    1.0.  Returns the frame with those columns replaced.
    """
    for name in numeric_features:
        column = X[name].to_numpy()
        reference = column[train_mask]
        center = reference.mean()
        spread = reference.std()
        if not spread > 0:
            spread = 1.0
        scaled = (column - center) / spread
        X = X.with_columns(pl.Series(name, scaled).alias(name))
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Return the feature-engineered train+test frame plus the test identifiers.

    If ``cache_file`` exists in ``data_dir`` it is loaded as-is.  Otherwise
    the raw train and test parquet files are concatenated (the test rows get
    a zero ``selected`` column), passed through ``feature_engineering``,
    null-filled (0 for numeric columns, "missing" for string columns) and
    written to the cache.  In both cases the ``Id`` and ``ranker_id`` columns
    are read from the raw test file.

    Returns:
        ``(df, test_ids, test_rankers)``.
    """
    cache_path = os.path.join(data_dir, cache_file)
    test_path = os.path.join(data_dir, test_file)

    # Fast path: the cached frame already exists.
    if os.path.exists(cache_path):
        print(f"Loading cached feature-engineered data from {cache_path} ...")
        df = pl.read_parquet(cache_path)
        raw_test = pl.read_parquet(test_path).drop("__index_level_0__")
        return df, raw_test["Id"], raw_test["ranker_id"]

    print("Cached file not found, processing raw data ...")

    train = pl.read_parquet(os.path.join(data_dir, train_file)).drop("__index_level_0__")
    raw_test = pl.read_parquet(test_path).drop("__index_level_0__")
    test = raw_test.with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))

    test_ids = test["Id"]
    test_rankers = test["ranker_id"]

    df = feature_engineering(pl.concat((train, test)), full=True)

    numeric_cols = df.select(pl.selectors.numeric()).columns
    string_cols = df.select(pl.selectors.string()).columns
    zero_fills = [pl.col(name).fill_null(0) for name in numeric_cols]
    text_fills = [pl.col(name).fill_null("missing") for name in string_cols]
    df = df.with_columns(zero_fills + text_fills)

    os.makedirs(data_dir, exist_ok=True)
    df.write_parquet(cache_path)
    print(f"Saved feature-engineered data to {cache_path}")

    return df, test_ids, test_rankers


def prepare_data(data_dir, train_size=18145372):
    """Load the data and turn it into model-ready features.

    Steps: load (or build) the feature-engineered frame, run
    ``feature_selection``, re-attach the ``Id`` column, dense-rank encode the
    categorical columns (nulls become -1) and standardise the numeric columns
    using only the first ``train_size`` rows.

    Args:
        data_dir: Directory holding the parquet files and the cache.
        train_size: Number of leading rows that belong to the training set.
            Defaults to the row count previously hard-coded here; pass another
            value when working with a different or sub-sampled dataset.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size, test_ids,
        test_rankers)``.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    # Dense rank gives consecutive codes starting at 0 for each category.
    X = X.with_columns(
        [
            (pl.col(c).rank("dense") - 1).fill_null(-1).cast(pl.Int32)
            for c in cat_features
        ]
    )

    # Rows [0, train_size) are training rows; only they feed the statistics.
    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Signal a stop after ``patience`` consecutive steps without improvement.

    The tracked metric is maximised: a step counts as an improvement only
    when the metric exceeds the best value seen so far by more than
    ``min_delta``.
    """

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric`` and return True when training should stop."""
        improved = metric > self.best + self.min_delta
        if improved:
            self.best = metric
        self.counter = 0 if improved else self.counter + 1
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a 100-bin, log-frequency histogram of the dataset's group sizes.

    The figure is written to ``save_path`` when one is given and shown
    interactively otherwise; it is closed in both cases.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Distribution of Group Sizes")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def _save_curve(epochs, series, ylabel, title, out_path, color=None):
    """Plot each ``(label, values)`` pair in ``series`` against ``epochs`` and save it."""
    plt.figure()
    for label, values in series:
        if color is None:
            plt.plot(epochs, values, label=label)
        else:
            plt.plot(epochs, values, label=label, color=color)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.savefig(out_path)
    plt.close()


def _predict_row_scores(model, loader, dataset, device, desc):
    """Score every row of ``dataset`` and return the scores in row order.

    ``loader`` must iterate ``dataset`` unshuffled.  Each group's scores are
    written back to the row positions recorded in
    ``dataset.group_to_indices``, so the result lines up with the source
    frame even when the rows of a group are not contiguous.
    """
    n_rows = len(dataset.groups)
    row_scores = np.empty(n_rows, dtype=np.float32)
    group_keys = iter(dataset.group_keys)
    filled = 0

    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(loader, desc=desc):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num).cpu().numpy()

            for i, size in enumerate(lengths.tolist()):
                rows = dataset.group_to_indices[next(group_keys)]
                row_scores[rows] = scores[i, :size]  # drop the padded tail
                filled += size

    if filled != n_rows:
        raise RuntimeError(f"Mismatch: {filled} scores vs {n_rows} rows")
    return row_scores


def main():
    """Train the DLRanker, validate it each epoch, then write predictions.

    Outputs: the best checkpoint, metric plots, validation group statistics
    and a full validation prediction CSV under ``model/``, plus a submission
    CSV under ``submission/``.
    """
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    best_val_loss = float("inf")
    best_hitrate = 0.0
    checkpoint_saved = False
    model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_losses = []
    val_losses = []
    val_hitrates = []
    val_ndcgs = []
    val_maps = []
    learning_rates = []

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    early_stopper = EarlyStopping(patience=3, min_delta=0.001)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Replace NaN with 0 in every Float64 column.
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    for col in float_cols:
        X = X.with_columns(
            pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
        )

    # Row-position split: [0, n1) train, [n1, n2) validation, [n2, end) test.
    # NOTE(review): n1 is a hard-coded row offset; confirm it falls on a
    # ranker_id boundary so no group is shared by train and validation.
    n1 = 16487352
    n2 = train_size

    X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
    y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
    groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

    # Dump size and first/last row position of every validation group.
    group_to_rows = defaultdict(list)
    for i, g in enumerate(groups_va["ranker_id"].to_numpy()):
        group_to_rows[g].append(i)

    print("\n📋 Validation group statistics:")
    group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
    with open(group_info_path, "w", encoding="utf-8") as f:
        f.write(f"Total groups: {len(group_to_rows)}\n\n")
        f.write("GroupID\tSize\tStartIdx\tEndIdx\n")
        for group_id, indices in sorted(group_to_rows.items()):
            f.write(f"{group_id}\t{len(indices)}\t{indices[0]}\t{indices[-1]}\n")

    print(f"📁 Saved validation group info to {group_info_path}")

    # Embedding sizes come from the full frame so validation/test codes fit.
    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    # The validation and test loaders must stay unshuffled: prediction relies
    # on groups arriving in dataset order.
    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    num_epochs = 10
    num_training_steps = num_epochs * len(train_loader)
    num_warmup_steps = int(0.1 * num_training_steps)

    learning_rate = 1e-3

    model = DLRanker(cat_dims, num_numeric_feats).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
        val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)
        # Work with plain floats from here on (validate returns 0-d tensors).
        val_hitrate = float(val_hitrate)
        val_ndcg = float(val_ndcg)
        val_map = float(val_map)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_hitrates.append(val_hitrate)
        val_ndcgs.append(val_ndcg)
        val_maps.append(val_map)

        current_lr = scheduler.get_last_lr()
        learning_rates.append(current_lr[0])

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"Val Loss={val_loss:.4f}, "
            f"HitRate@3={val_hitrate:.4f}, "
            f"NDCG@3={val_ndcg:.4f}, "
            f"MAP@3={val_map:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )

        if val_loss < best_val_loss and val_hitrate > best_hitrate:
            best_val_loss = val_loss
            best_hitrate = val_hitrate
            torch.save(model.state_dict(), model_path)
            checkpoint_saved = True
            print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

        if early_stopper.step(-val_loss):
            print("⛔ Early stopping triggered.")
            break

    # Metric plots.
    epochs = list(range(1, len(train_losses) + 1))
    _save_curve(
        epochs,
        [("Train Loss", train_losses), ("Val Loss", val_losses)],
        "Loss",
        "Training & Validation Loss",
        os.path.join(MODEL_DIR, "loss_curve.png"),
    )
    _save_curve(
        epochs, [("Val HitRate@3", val_hitrates)], "HitRate@3", "Validation HitRate@3",
        os.path.join(MODEL_DIR, "hitrate_curve.png"), color="green",
    )
    _save_curve(
        epochs, [("Learning Rate", learning_rates)], "LR", "Learning Rate Schedule",
        os.path.join(MODEL_DIR, "lr_curve.png"), color="orange",
    )
    _save_curve(
        epochs, [("Val NDCG@3", val_ndcgs)], "NDCG@3", "Validation NDCG@3",
        os.path.join(MODEL_DIR, "ndcg_curve.png"), color="purple",
    )
    _save_curve(
        epochs, [("Val MAP@3", val_maps)], "MAP@3", "Validation MAP@3",
        os.path.join(MODEL_DIR, "map_curve.png"), color="red",
    )

    print("📊 Saved metric plots to model/")

    # Restore the best checkpoint. If the save condition never fired in this
    # run, persist the final weights first so a stale file left by an earlier
    # run is not loaded by accident.
    if not checkpoint_saved:
        torch.save(model.state_dict(), model_path)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    # Validation predictions, saved together with the features.
    val_scores = _predict_row_scores(model, val_loader, val_dataset, device, "Predicting validation set")

    val_df = X_va.with_columns([
        pl.Series("Id", X_va["Id"]),
        pl.Series("ranker_id", groups_va["ranker_id"]),
        pl.Series("score", val_scores),
        pl.Series("label", y_va)
    ])

    # Rank 1 = highest score within each ranker_id.
    val_df = val_df.with_columns(
        pl.col("score")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
    val_df.write_csv(val_save_path)
    print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

    # Test predictions and submission file.
    test_scores = _predict_row_scores(model, test_loader, test_dataset, device, "Predicting test set")

    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": test_scores
    })

    submission = (
        submission_df
        .with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )
        .select(["Id", "ranker_id", "score", "selected"])
    )

    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
    submission.write_csv(submission_path)
    print(f"✅ Submission saved to {submission_path}")


# Call main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

Overwriting deeprec_validate.py


<h2>DLRanker with Attention</h2>

In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter

class RankDataset(Dataset):
    """Group-wise ranking dataset: one item is every row sharing a ``ranker_id``.

    All columns are converted to NumPy arrays once in ``__init__``;
    ``__getitem__`` then gathers the rows of a single group and returns them
    as tensors.  Groups are numbered in order of first appearance in
    ``groups``.
    """

    def __init__(
        self,
        X: "pl.DataFrame",
        y: "pl.Series",
        groups: "pl.DataFrame",
        cat_features: list[str],
        num_features: list[str],
    ):
        """Index the rows of ``X`` by group.

        Args:
            X: Feature table containing every name in ``cat_features`` and
                ``num_features``.
            y: Per-row labels (any object exposing ``to_numpy()``).
            groups: Table with a ``ranker_id`` column giving each row's group.
            cat_features: Names of the integer-coded categorical columns.
            num_features: Names of the numeric columns.
        """
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # group id -> row positions, in row order; dict order = first appearance.
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of groups (not rows)."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        ``x_cat`` maps each categorical feature to an int64 tensor of shape
        ``(length,)`` in which negative codes are replaced by 0; ``x_num`` is
        a float32 tensor of shape ``(length, len(num_features))``; ``y`` is a
        flat float32 tensor of the group's labels.
        """
        inds = self.group_to_indices[self.group_keys[idx]]

        x_cat = {}
        for c in self.cat_features:
            codes = self.X_cat[c][inds]
            # Vectorised clamp: negative codes map to index 0.
            clamped = np.where(codes >= 0, codes, 0).astype(np.int64)
            x_cat[c] = torch.from_numpy(clamped)

        x_num = torch.as_tensor(self.X_num[inds], dtype=torch.float32)
        y = torch.as_tensor(self.y[inds], dtype=torch.float32).reshape(-1)

        return x_cat, x_num, y, len(inds)

def collate_fn(batch):
    """Pad a list of variable-length group samples into dense batch tensors.

    Args:
        batch: List of ``(x_cat, x_num, y, length)`` tuples, where ``x_cat`` is a
            dict of 1-D long tensors, ``x_num`` is ``[length, num_dim]`` and
            ``y`` is ``[length]``.

    Returns:
        ``(padded_x_cat, padded_x_num, padded_y, lengths)``: every tensor is
        right-padded with zeros to the longest group in the batch, and
        ``lengths`` is a long tensor with the true group sizes.
    """
    x_cats, x_nums, ys, lengths = zip(*batch)
    pad = torch.nn.utils.rnn.pad_sequence  # zero-pads along the item axis

    padded_x_cat = {
        key: pad([sample[key] for sample in x_cats], batch_first=True)
        for key in x_cats[0]
    }
    padded_x_num = pad(list(x_nums), batch_first=True)
    padded_y = pad(list(ys), batch_first=True)
    lengths = torch.tensor(lengths, dtype=torch.long)

    return padded_x_cat, padded_x_num, padded_y, lengths

class DLRankerWithAttention(nn.Module):
    """Wide-and-deep ranker with self-attention across the items of each group.

    The score of an item is the sum of a linear ("wide") term over categorical
    and numeric features and a deep term: per-item MLP features that are mixed
    across the group by multi-head self-attention with a residual LayerNorm.

    Args:
        cat_dims: Mapping from categorical feature name to vocabulary size.
        num_numeric_feats: Number of numeric features per item.
        emb_dim: Embedding width for each categorical feature.
        hidden: MLP layer widths; the last width is also the attention width
            and must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
    """

    def __init__(self, cat_dims, num_numeric_feats, emb_dim=64, hidden=[512, 256, 128, 64], num_heads=4):
        super().__init__()

        self.embeddings = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], emb_dim, padding_idx=0) for f in cat_dims
        })
        self.linear_cat = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], 1, padding_idx=0) for f in cat_dims
        })

        for emb in list(self.embeddings.values()) + list(self.linear_cat.values()):
            nn.init.xavier_uniform_(emb.weight)
            # Xavier init also overwrites the padding row, which never receives
            # gradients; restore it to zero so padded slots contribute nothing.
            with torch.no_grad():
                emb.weight[emb.padding_idx].zero_()

        self.linear_num = nn.Linear(num_numeric_feats, 1)
        nn.init.xavier_uniform_(self.linear_num.weight)

        # Deep input: concatenated embeddings plus raw numeric features.
        input_dim = emb_dim * len(cat_dims) + num_numeric_feats
        layers = []
        for h in hidden:
            layers.append(nn.Linear(input_dim, h))
            layers.append(nn.BatchNorm1d(h))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.1))
            input_dim = h
        self.mlp = nn.Sequential(*layers)

        # Attention block over the items of a group.
        self.attn = nn.MultiheadAttention(embed_dim=input_dim, num_heads=num_heads, batch_first=True)
        self.attn_norm = nn.LayerNorm(input_dim)

        self.output = nn.Linear(input_dim, 1)

    def forward(self, x_cat, x_num, lengths=None):
        """Score every item of every group.

        Args:
            x_cat: Dict of long tensors ``[B, G]``, one per categorical feature.
            x_num: Float tensor ``[B, G, num_numeric_feats]``.
            lengths: Optional true group sizes (``[B]``). When given, padded
                slots are masked out as attention keys so real items are not
                influenced by padding. When omitted, attention runs over all
                ``G`` slots (the previous behaviour).

        Returns:
            Float tensor ``[B, G]`` of scores; entries at padded slots are
            meaningless and should be sliced off by the caller.
        """
        batch_size, group_size = x_num.shape[:2]

        # Linear ("wide") term.
        linear_cat_sum = torch.stack([
            self.linear_cat[f](x_cat[f]).squeeze(-1)
            for f in self.linear_cat
        ], dim=0).sum(dim=0)
        linear_out = linear_cat_sum + self.linear_num(x_num).squeeze(-1)

        # Deep per-item features.
        embs = torch.stack([
            self.embeddings[f](x_cat[f]) for f in self.embeddings
        ], dim=2)  # [B, G, F, D]
        embs_cat = embs.reshape(batch_size * group_size, -1)
        x_num_flat = x_num.reshape(batch_size * group_size, -1)
        deep_input = torch.cat([embs_cat, x_num_flat], dim=1)
        # NOTE: BatchNorm runs over all B*G rows, so in train mode its batch
        # statistics include padded rows.
        deep_out = self.mlp(deep_input).view(batch_size, group_size, -1)  # [B, G, H]

        # True marks padded key positions that attention must ignore.
        key_padding_mask = None
        if lengths is not None:
            positions = torch.arange(group_size, device=x_num.device)
            sizes = torch.as_tensor(lengths, device=x_num.device)
            key_padding_mask = positions.unsqueeze(0) >= sizes.unsqueeze(1)

        # Self-attention over the group, with residual connection.
        attn_out, _ = self.attn(deep_out, deep_out, deep_out, key_padding_mask=key_padding_mask)
        attn_out = self.attn_norm(attn_out + deep_out)

        final_scores = self.output(attn_out).squeeze(-1)  # [B, G]

        return linear_out + final_scores


"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Fraction of eligible groups with a positive label among the top-k scores.

    Only the first ``lengths[i]`` entries of row ``i`` are considered, and
    groups smaller than ``min_group_size`` are ignored. Returns a zero tensor
    when no group is eligible.
    """
    n_hits = 0
    n_groups = 0
    for row in range(scores.size(0)):
        size = lengths[row]
        if size >= min_group_size:
            top = torch.topk(scores[row][:size], min(size, k)).indices
            n_hits += (labels[row][:size][top] > 0).any().float()
            n_groups += 1

    if n_groups == 0:
        return torch.tensor(0.0, device=scores.device)
    return n_hits / n_groups


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@k over groups with at least ``min_group_size`` items.

    Gains are the raw label values and the discount at rank ``r`` (1-based) is
    ``log2(r + 1)``. A group whose ideal DCG is zero contributes 0 but still
    counts towards the mean.

    Returns:
        A 0-dim tensor on ``scores.device``. The result is always a tensor,
        including when every eligible group has zero ideal DCG, so callers can
        rely on ``.item()``.
    """
    total = torch.zeros((), device=scores.device)
    n_groups = 0

    for i in range(scores.size(0)):
        n = int(lengths[i])
        if n < min_group_size:
            continue

        s = scores[i][:n]
        y = labels[i][:n]
        top = min(k, n)

        discounts = torch.log2(torch.arange(2, 2 + top, device=y.device).float())
        dcg = (y[torch.topk(s, top).indices] / discounts).sum()
        idcg = (torch.topk(y, top).values / discounts).sum()

        if idcg > 0:
            total = total + dcg / idcg
        n_groups += 1

    if n_groups == 0:
        return torch.tensor(0.0, device=scores.device)
    return total / n_groups


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision at k over groups with at least ``min_group_size`` items.

    For each eligible group, AP@k is the mean of precision@r over the ranks
    ``r <= k`` whose item has a positive label. A group with no positive item
    in its top-k has AP = 0 and is included in the mean (skipping such groups
    would inflate the metric).

    Returns:
        A 0-dim tensor on ``scores.device``; zero when no group is eligible.
    """
    total_ap = torch.zeros((), device=scores.device)
    n_groups = 0

    for i in range(scores.size(0)):
        n = int(lengths[i])
        if n < min_group_size:
            continue

        top = torch.topk(scores[i][:n], min(k, n)).indices
        hits = (labels[i][:n][top] > 0).float()
        n_groups += 1

        n_hits = hits.sum()
        if n_hits == 0:
            continue  # AP is 0 for this group, but it still counts

        ranks = torch.arange(1, hits.numel() + 1, device=hits.device)
        precision_at_rank = hits.cumsum(0) / ranks
        total_ap = total_ap + (precision_at_rank * hits).sum() / n_hits

    if n_groups == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ap / n_groups



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise RankNet loss summed within each group and averaged over the batch.

    For every ordered pair ``(a, b)`` with ``labels[a] > labels[b]`` inside the
    first ``lengths[i]`` items of group ``i``, the term
    ``-logsigmoid(score_a - score_b)`` is added.
    """
    n_rows = scores.size(0)
    loss_sum = 0.0

    for row in range(n_rows):
        size = lengths[row]
        s = scores[row][:size]
        y = labels[row][:size]

        # prefers[a, b] is 1 where item a should outrank item b.
        prefers = (y.unsqueeze(1) > y.unsqueeze(0)).float()
        log_prob = torch.nn.functional.logsigmoid(s.unsqueeze(1) - s.unsqueeze(0))

        loss_sum += (-prefers * log_prob).sum()

    return loss_sum / n_rows

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """
    Margin RankNet restricted to pairs that touch the label-based top-K.

    A pair ``(a, b)`` is used when ``labels[a] > labels[b]`` and at least one of
    the two items is among the ``focus_topk`` highest labels of its group. Each
    pair is weighted by its absolute label gap; the result is the weighted sum
    of ``-logsigmoid((score_a - score_b - margin) / temperature)`` divided by
    the total weight. Groups with fewer than two items are skipped, and a zero
    tensor (with ``requires_grad=True``) is returned when no pair qualifies.
    """
    loss_sum = 0.0
    weight_sum = 0

    for row in range(scores.size(0)):
        size = lengths[row]
        if size < 2:
            continue

        s = scores[row][:size]
        y = labels[row][:size]

        # Mark the items holding the K largest ground-truth labels.
        in_topk = torch.zeros_like(y, dtype=torch.bool)
        in_topk[torch.topk(y, min(focus_topk, size)).indices] = True

        label_gap = y.unsqueeze(1) - y.unsqueeze(0)
        eligible = (label_gap > 0) & (in_topk.unsqueeze(1) | in_topk.unsqueeze(0))
        if eligible.sum() == 0:
            continue

        weights = (label_gap.abs() * eligible).float()

        logits = s.unsqueeze(1) - s.unsqueeze(0)
        if margin > 0.0:
            logits = logits - margin
        logits = logits / temperature

        loss_sum += (-F.logsigmoid(logits) * weights).sum()
        weight_sum += weights.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    For every batch the tensors are moved to ``device``, the xRankNet loss is
    back-propagated, and both the optimizer and the scheduler are stepped (the
    scheduler advances once per batch, not once per epoch).
    """
    model.train()
    running_loss = 0.0

    progress = tqdm(loader, desc="Training", leave=False)
    for x_cat, x_num, y, lengths in progress:
        x_num = x_num.to(device)
        y = y.to(device)
        # Accept plain sequences as well as tensors for the group sizes.
        if not isinstance(lengths, torch.Tensor):
            lengths = torch.tensor(lengths)
        lengths = lengths.detach().clone().to(device)
        x_cat = {name: codes.to(device) for name, codes in x_cat.items()}

        optimizer.zero_grad()
        batch_loss = xranknet_loss(
            model(x_cat, x_num), y, lengths, temperature=1.0, margin=0.2, focus_topk=3
        )
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    return running_loss / len(loader)


def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader`` without gradients.

    Args:
        model: Ranker called as ``model(x_cat, x_num)``.
        loader: Yields ``(x_cat, x_num, y, lengths)`` batches.
        device: Device the batch tensors are moved to.
        min_group_size: Groups smaller than this are excluded from the ranking
            metrics (they still contribute to the loss).

    Returns:
        ``(loss, hitrate@3, ndcg@3, map@3)``. ``loss`` is a float averaged over
        all groups; the three metrics are 0-dim CPU tensors averaged over the
        groups that the metric functions actually score.
    """
    model.eval()
    loss_sum = 0.0
    hit_sum = 0.0
    ndcg_sum = 0.0
    map_sum = 0.0
    n_groups = 0
    n_scored_groups = 0

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):
            x_num, y = x_num.to(device), y.to(device)
            # Avoid copy-constructing a tensor from a tensor (emits a warning).
            if not isinstance(lengths, torch.Tensor):
                lengths = torch.tensor(lengths)
            lengths = lengths.detach().clone().to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            ndcg = ndcg_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            map3 = map_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)

            batch_size = scores.size(0)
            # The metric functions return per-batch means over groups with at
            # least min_group_size items, so weight them by that count rather
            # than by batch_size; otherwise batches with few eligible groups
            # drag the average towards zero.
            n_eligible = int((lengths >= min_group_size).sum())

            hit_sum += float(hitrate) * n_eligible
            ndcg_sum += float(ndcg) * n_eligible
            map_sum += float(map3) * n_eligible
            loss_sum += loss.item() * batch_size
            n_groups += batch_size
            n_scored_groups += n_eligible

    denom = max(n_scored_groups, 1)  # metrics are 0 when nothing was scored
    return (
        loss_sum / n_groups,
        torch.tensor(hit_sum / denom),
        torch.tensor(ndcg_sum / denom),
        torch.tensor(map_sum / denom),
    )


def build_cat_dims(X, cat_features):
    """Return ``{feature: vocabulary size}`` for each categorical column.

    The size is the column maximum plus one; a negative maximum is treated as
    zero, giving a size of one.
    """
    return {c: max(X[c].max(), 0) + 1 for c in cat_features}


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardise numeric columns using statistics of the training rows only.

    Args:
        X: Frame containing the columns named in ``numeric_features``.
        numeric_features: Columns to standardise.
        train_mask: Boolean mask over the rows of ``X`` selecting the training
            rows used to compute the mean and standard deviation.

    Returns:
        ``X`` with each listed column replaced by ``(value - mean) / std``.
        NaNs in the training rows are ignored when computing the statistics, so
        a single NaN no longer turns the whole column into NaN. A
        non-positive or undefined std falls back to 1.0 and an undefined mean
        to 0.0.
    """
    normalized = []
    for col in numeric_features:
        values = X[col].to_numpy()
        train_vals = values[train_mask]

        mean = np.nanmean(train_vals)
        if not np.isfinite(mean):
            mean = 0.0
        std = np.nanstd(train_vals)
        if not std > 0:  # also catches NaN
            std = 1.0

        normalized.append(pl.Series(col, (values - mean) / std))

    # Columns are independent, so replace them all in one pass.
    return X.with_columns(normalized) if normalized else X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Return ``(df, test_ids, test_rankers)``, using a parquet cache when present.

    If ``data_dir/cache_file`` exists it is loaded as ``df``. Otherwise the raw
    train and test files are concatenated (test rows get ``selected = 0``),
    passed through ``feature_engineering``, null-filled (0 for numeric columns,
    ``"missing"`` for string columns) and written to the cache. In both cases
    ``test_ids`` and ``test_rankers`` are the ``Id`` and ``ranker_id`` columns
    of the raw test file.
    """

    def _read_raw(name):
        # Raw files carry a pandas index column that is dropped on load.
        return pl.read_parquet(os.path.join(data_dir, name)).drop("__index_level_0__")

    cache_path = os.path.join(data_dir, cache_file)

    if os.path.exists(cache_path):
        print(f"Loading cached feature-engineered data from {cache_path} ...")
        df = pl.read_parquet(cache_path)
        test = _read_raw(test_file)
        return df, test["Id"], test["ranker_id"]

    print("Cached file not found, processing raw data ...")

    train = _read_raw(train_file)
    test = _read_raw(test_file).with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))
    test_ids = test["Id"]
    test_rankers = test["ranker_id"]

    df = feature_engineering(pl.concat((train, test)), full=True)

    numeric_cols = df.select(pl.selectors.numeric()).columns
    string_cols = df.select(pl.selectors.string()).columns
    fill_exprs = [pl.col(c).fill_null(0) for c in numeric_cols]
    fill_exprs += [pl.col(c).fill_null("missing") for c in string_cols]
    df = df.with_columns(fill_exprs)

    os.makedirs(data_dir, exist_ok=True)
    df.write_parquet(cache_path)
    print(f"Saved feature-engineered data to {cache_path}")

    return df, test_ids, test_rankers


def prepare_data(data_dir, train_size=18145372):
    """Load features, integer-encode categoricals and standardise numerics.

    Args:
        data_dir: Directory passed to ``load_or_prepare_data``.
        train_size: Number of leading rows that belong to the training set;
            only these rows are used for the normalisation statistics.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size, test_ids,
        test_rankers)``.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    # Encode categories as 1..K and reserve 0 for nulls and padding. Index 0 is
    # the embedding padding_idx downstream, whose row never receives gradient
    # updates, so no real category may be mapped onto it.
    X = X.with_columns(
        [
            pl.col(c).rank("dense").fill_null(0).cast(pl.Int32)
            for c in cat_features
        ]
    )

    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Patience-based stopper for a metric where larger is better.

    ``step`` returns True once the metric has failed to beat the best value by
    more than ``min_delta`` for ``patience`` consecutive calls.
    """

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience, self.min_delta = patience, min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric`` and report whether training should stop."""
        improved = metric > self.best + self.min_delta
        if improved:
            self.best, self.counter = metric, 0
        else:
            self.counter += 1
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scale histogram of items per group for ``dataset``.

    The figure is written to ``save_path`` when one is given and shown
    interactively otherwise; it is closed in both cases.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Distribution of Group Sizes")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def _save_curve(path, epochs, series, ylabel, title):
    """Plot one or more per-epoch series on a single figure and save it to ``path``.

    ``series`` is a list of ``(values, label, color)`` tuples; a ``color`` of
    None leaves the colour to matplotlib's default cycle.
    """
    plt.figure()
    for values, label, color in series:
        style = {} if color is None else {"color": color}
        plt.plot(epochs, values, label=label, **style)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.savefig(path)
    plt.close()


def _predict_in_row_order(model, loader, dataset, device, desc):
    """Score every row of ``dataset`` and return the scores in its row order.

    ``loader`` must iterate ``dataset`` unshuffled. Batches arrive group by
    group (in ``dataset.group_keys`` order), which equals the row order only
    when each group's rows are contiguous, so the scores are scattered back
    through ``dataset.group_to_indices``.
    """
    per_group_scores = []
    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(loader, desc=desc):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)

            for i in range(scores.size(0)):
                # Drop the padded tail of each group.
                per_group_scores.append(scores[i, : int(lengths[i])].cpu().numpy())

    flat_scores = np.concatenate(per_group_scores)
    row_order = np.concatenate(
        [np.asarray(dataset.group_to_indices[g]) for g in dataset.group_keys]
    )
    aligned = np.empty_like(flat_scores)
    aligned[row_order] = flat_scores
    return aligned


def main():
    """Train the ranker, save metric plots, and write validation and test predictions.

    Rows ``[0, n1)`` are used for training, ``[n1, train_size)`` for validation
    and the remainder as the test set. The checkpoint with the lowest
    validation loss that also improves HitRate@3 is kept and used for the final
    predictions.
    """
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    best_val_loss = float("inf")
    best_hitrate = 0.0
    checkpoint_saved = False
    model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_losses = []
    val_losses = []
    val_hitrates = []
    val_ndcgs = []
    val_maps = []
    learning_rates = []

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    early_stopper = EarlyStopping(patience=3, min_delta=0.001)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Replace NaNs in Float64 columns with 0.
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    if float_cols:
        X = X.with_columns([
            pl.when(pl.col(c).is_nan()).then(0).otherwise(pl.col(c)).alias(c)
            for c in float_cols
        ])

    n1 = 16487352
    # n1 = 17487300
    n2 = train_size

    X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
    y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
    groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

    # Inspect group sizes in the validation set.
    val_rankers = groups_va["ranker_id"].to_numpy()
    group_to_rows = defaultdict(list)
    for i, g in enumerate(val_rankers):
        group_to_rows[g].append(i)

    print("\n📋 Validation group statistics:")
    # Save detailed group info to file.
    group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
    with open(group_info_path, "w") as f:
        f.write(f"Total groups: {len(group_to_rows)}\n\n")
        f.write("GroupID\tSize\tStartIdx\tEndIdx\n")
        for group_id, indices in sorted(group_to_rows.items()):
            f.write(f"{group_id}\t{len(indices)}\t{indices[0]}\t{indices[-1]}\n")

    print(f"📁 Saved validation group info to {group_info_path}")

    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    num_epochs = 10
    num_training_steps = num_epochs * len(train_loader)
    num_warmup_steps = int(0.1 * num_training_steps)

    learning_rate = 1e-3

    model = DLRankerWithAttention(
        cat_dims=cat_dims,
        num_numeric_feats=num_numeric_feats,
        emb_dim=64,
        hidden=[512, 256, 128, 64],
        num_heads=4
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
        val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)
        # Work with plain floats from here on (the metrics may come back as tensors).
        val_hitrate, val_ndcg, val_map = float(val_hitrate), float(val_ndcg), float(val_map)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_hitrates.append(val_hitrate)
        val_ndcgs.append(val_ndcg)
        val_maps.append(val_map)

        current_lr = scheduler.get_last_lr()
        learning_rates.append(current_lr[0])

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"Val Loss={val_loss:.4f}, "
            f"HitRate@3={val_hitrate:.4f}, "
            f"NDCG@3={val_ndcg:.4f}, "
            f"MAP@3={val_map:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )

        if val_loss < best_val_loss and val_hitrate > best_hitrate:
            best_val_loss = val_loss
            best_hitrate = val_hitrate
            torch.save(model.state_dict(), model_path)
            checkpoint_saved = True
            print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

        # Early stopping tracks validation loss (negated: the stopper maximises).
        if early_stopper.step(-val_loss):
            print("⛔ Early stopping triggered.")
            break

    # Plot metrics.
    epochs = list(range(1, len(train_losses) + 1))

    _save_curve(
        os.path.join(MODEL_DIR, "loss_curve.png"),
        epochs,
        [(train_losses, "Train Loss", None), (val_losses, "Val Loss", None)],
        ylabel="Loss",
        title="Training & Validation Loss",
    )
    _save_curve(
        os.path.join(MODEL_DIR, "hitrate_curve.png"),
        epochs,
        [(val_hitrates, "Val HitRate@3", "green")],
        ylabel="HitRate@3",
        title="Validation HitRate@3",
    )
    _save_curve(
        os.path.join(MODEL_DIR, "lr_curve.png"),
        epochs,
        [(learning_rates, "Learning Rate", "orange")],
        ylabel="LR",
        title="Learning Rate Schedule",
    )
    _save_curve(
        os.path.join(MODEL_DIR, "ndcg_curve.png"),
        epochs,
        [(val_ndcgs, "Val NDCG@3", "purple")],
        ylabel="NDCG@3",
        title="Validation NDCG@3",
    )
    _save_curve(
        os.path.join(MODEL_DIR, "map_curve.png"),
        epochs,
        [(val_maps, "Val MAP@3", "red")],
        ylabel="MAP@3",
        title="Validation MAP@3",
    )

    print("📊 Saved metric plots to model/")

    # If the save condition never fired, model_path is either missing or left
    # over from an earlier run; fall back to this run's final weights so the
    # predictions below always come from the model that was just trained.
    if not checkpoint_saved:
        torch.save(model.state_dict(), model_path)
        print("⚠️ No checkpoint met the save criteria; using final-epoch weights.")

    # Predict with the best checkpoint.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    val_ids = X_va["Id"]
    val_rankers = groups_va["ranker_id"]
    val_labels = y_va

    all_val_scores = _predict_in_row_order(
        model, val_loader, val_dataset, device, desc="Predicting validation set"
    )

    assert len(all_val_scores) == X_va.shape[0], \
        f"Mismatch: {len(all_val_scores)} scores vs {X_va.shape[0]} rows"

    val_df = X_va.with_columns([
        pl.Series("Id", val_ids),
        pl.Series("ranker_id", val_rankers),
        pl.Series("score", all_val_scores),
        pl.Series("label", val_labels)
    ])

    # Rank 1 is the highest score within each ranker_id.
    val_df = val_df.with_columns(
        pl.col("score")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
    val_df.write_csv(val_save_path)
    print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

    all_scores = _predict_in_row_order(
        model, test_loader, test_dataset, device, desc="Predicting test set"
    )

    assert len(all_scores) == X_te.shape[0], \
        f"Mismatch: {len(all_scores)} scores vs {X_te.shape[0]} rows in X_te"

    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": all_scores
    })

    submission = (
        submission_df
        .with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )
        .select(["Id", "ranker_id", "score", "selected"])
    )

    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
    submission.write_csv(submission_path)
    print(f"✅ Submission saved to {submission_path}")


if __name__ == "__main__":
    main()


Overwriting deeprec_validate.py


**DLRanker v1 full training**

In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import torch.nn as nn
import torch.nn.functional as F_torch
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter

class RankDataset(Dataset):
    """Group-wise ranking dataset: one sample is every row that shares a ``ranker_id``.

    Columns are converted to NumPy arrays once in ``__init__``. Samples are
    ordered by the first appearance of each group id in ``groups``.

    Args:
        X: Frame holding the categorical and numeric feature columns.
        y: Per-row labels.
        groups: Frame with a ``ranker_id`` column giving the group of each row.
        cat_features: Names of the categorical (integer-coded) columns.
        num_features: Names of the numeric columns.
    """

    def __init__(
        self,
        X: pl.DataFrame,
        y: pl.Series,
        groups: pl.DataFrame,
        cat_features: list[str],
        num_features: list[str],
    ):
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # Bucket row positions by group id; dict insertion order fixes the sample order.
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of groups."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        ``x_cat`` maps each categorical feature to a long tensor of shape
        ``[length]`` with negative codes clamped to 0; ``x_num`` is a float
        tensor of shape ``[length, len(num_features)]``; ``y`` is a flat float
        tensor of labels.
        """
        inds = self.group_to_indices[self.group_keys[idx]]

        # Vectorized clamp: one NumPy call per feature instead of a Python-level
        # comparison per row.
        x_cat = {
            c: torch.as_tensor(np.maximum(self.X_cat[c][inds], 0), dtype=torch.long)
            for c in self.cat_features
        }
        x_num = torch.as_tensor(self.X_num[inds], dtype=torch.float32)
        y = torch.as_tensor(self.y[inds], dtype=torch.float32).reshape(-1)

        return x_cat, x_num, y, len(inds)

def collate_fn(batch):
    """Pad a list of variable-length group samples into dense batch tensors.

    Args:
        batch: List of ``(x_cat, x_num, y, length)`` tuples, where ``x_cat`` is a
            dict of 1-D long tensors, ``x_num`` is ``[length, num_dim]`` and
            ``y`` is ``[length]``.

    Returns:
        ``(padded_x_cat, padded_x_num, padded_y, lengths)``: every tensor is
        right-padded with zeros to the longest group in the batch, and
        ``lengths`` is a long tensor with the true group sizes.
    """
    x_cats, x_nums, ys, lengths = zip(*batch)
    pad = torch.nn.utils.rnn.pad_sequence  # zero-pads along the item axis

    padded_x_cat = {
        key: pad([sample[key] for sample in x_cats], batch_first=True)
        for key in x_cats[0]
    }
    padded_x_num = pad(list(x_nums), batch_first=True)
    padded_y = pad(list(ys), batch_first=True)
    lengths = torch.tensor(lengths, dtype=torch.long)

    return padded_x_cat, padded_x_num, padded_y, lengths

class FiBiNetRanker(nn.Module):
    """FiBiNET-style ranker: SENet field re-weighting, shared bilinear interaction, MLP.

    The score of an item is the sum of a linear term over categorical and
    numeric features and an MLP applied to the field-averaged bilinear
    interaction vector concatenated with the numeric features. Items are scored
    independently of the other items in their group.

    Args:
        cat_dims: Mapping from categorical feature name to vocabulary size.
        num_numeric_feats: Number of numeric features per item.
        emb_dim: Embedding width for each categorical feature.
        hidden: MLP layer widths.
        reduction: Bottleneck ratio of the SENet block.
        use_senet: Whether to apply the SENet re-weighting.
    """

    def __init__(self, cat_dims, num_numeric_feats, emb_dim=64, hidden=[512, 256, 128, 64], reduction=4, use_senet=True):
        super().__init__()
        self.cat_fields = list(cat_dims.keys())
        self.emb_dim = emb_dim
        self.num_fields = len(self.cat_fields)
        self.use_senet = use_senet

        # Embedding layers
        self.embeddings = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], emb_dim, padding_idx=0) for f in self.cat_fields
        })
        self.linear_cat = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], 1, padding_idx=0) for f in self.cat_fields
        })
        self.linear_num = nn.Linear(num_numeric_feats, 1)

        for emb in list(self.embeddings.values()) + list(self.linear_cat.values()):
            nn.init.xavier_uniform_(emb.weight)
        nn.init.xavier_uniform_(self.linear_num.weight)

        # SENet: squeeze to a bottleneck, then excite back to one gate per embedding entry.
        if self.use_senet:
            self.se_fc1 = nn.Linear(self.num_fields * emb_dim, self.num_fields * emb_dim // reduction)
            self.se_fc2 = nn.Linear(self.num_fields * emb_dim // reduction, self.num_fields * emb_dim)

        # Bilinear layer (one weight matrix shared by all fields)
        self.bilinear = nn.Linear(emb_dim, emb_dim, bias=False)

        # MLP over [interaction vector, numeric features]
        input_dim = emb_dim + num_numeric_feats
        layers = []
        for h in hidden:
            layers.append(nn.Linear(input_dim, h))
            layers.append(nn.LayerNorm(h))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.2))
            input_dim = h
        self.mlp = nn.Sequential(*layers)
        self.output = nn.Linear(input_dim, 1)

    def senet(self, embs):
        """Scale ``embs`` ([N, F, D]) element-wise by sigmoid gates computed from all fields."""
        n_rows, n_fields, dim = embs.shape
        z = embs.reshape(n_rows, -1)
        a = F_torch.relu(self.se_fc1(z))
        s = torch.sigmoid(self.se_fc2(a)).reshape(n_rows, n_fields, dim)
        return embs * s

    def bilinear_interaction(self, embs):
        """Return the mean over fields of ``emb * W(emb)`` for ``embs`` of shape [N, F, D]."""
        transformed = self.bilinear(embs)  # [B*G, F, D]
        interaction = (embs * transformed).mean(dim=1)  # [B*G, D]
        return interaction

    def forward(self, x_cat, x_num):
        """Score every item.

        Args:
            x_cat: Dict of long tensors ``[B, G]``, one per categorical feature.
            x_num: Float tensor ``[B, G, num_numeric_feats]``; it does not need
                to be contiguous.

        Returns:
            Float tensor ``[B, G]`` of scores.
        """
        B, G = x_num.shape[:2]
        n_fields, dim = self.num_fields, self.emb_dim

        # Embedding stacking
        embs = torch.stack([self.embeddings[f](x_cat[f]) for f in self.cat_fields], dim=2)  # [B, G, F, D]

        # Linear logits
        linear_cat_sum = torch.stack([
            self.linear_cat[f](x_cat[f]).squeeze(-1) for f in self.cat_fields
        ], dim=0).sum(dim=0)  # [B, G]

        linear_num_out = self.linear_num(x_num).squeeze(-1)  # [B, G]
        linear_out = linear_cat_sum + linear_num_out  # [B, G]

        # Feature interactions. reshape (not view) so non-contiguous inputs work.
        embs = embs.reshape(B * G, n_fields, dim)
        if self.use_senet:
            embs = self.senet(embs)
        bi = self.bilinear_interaction(embs)  # [B*G, D]

        # Deep part
        x_num_flat = x_num.reshape(B * G, -1)
        deep_input = torch.cat([bi, x_num_flat], dim=1)
        deep_out = self.mlp(deep_input)
        scores = self.output(deep_out).reshape(B, G)

        return scores + linear_out  # [B, G]



"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Share of sufficiently large groups with a positive label in the top-k.

    Args:
        scores: ``[B, G]`` predicted scores (padded along G).
        labels: ``[B, G]`` labels; an item counts as a hit when its label > 0.
        lengths: per-group number of real (unpadded) items.
        k: number of top-scored items inspected per group.
        min_group_size: groups shorter than this are ignored.

    Returns:
        0-dim tensor: hits divided by the number of groups that were evaluated,
        or ``0.0`` when no group reaches ``min_group_size``.
    """
    group_hits = []
    for g, row_scores in enumerate(scores):
        n = lengths[g]
        if n < min_group_size:
            continue
        top = torch.topk(row_scores[:n], min(n, k)).indices
        group_hits.append((labels[g][:n][top] > 0).any().float())

    if not group_hits:
        return torch.tensor(0.0, device=scores.device)
    return sum(group_hits) / len(group_hits)


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@k over groups with at least ``min_group_size`` items.

    The raw label value is used as the gain and ``log2(rank + 1)`` as the
    discount. A group whose ideal DCG is 0 (no positive label) contributes an
    NDCG of 0 but still counts towards the mean.

    Args:
        scores: ``[B, G]`` predicted scores (padded along G).
        labels: ``[B, G]`` relevance labels.
        lengths: per-group number of real (unpadded) items.
        k: cut-off rank.
        min_group_size: groups shorter than this are ignored.

    Returns:
        0-dim tensor. It is always a tensor, including the case where every
        evaluated group has an ideal DCG of 0, so callers can rely on
        ``.item()``.
    """
    # Accumulate in a tensor so the return type does not depend on the data.
    total_ndcg = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        n = int(lengths[i])
        if n < min_group_size:
            continue

        top = min(k, n)
        s = scores[i, :n]
        y = labels[i, :n]

        # Same discount vector for the predicted and the ideal ordering.
        discount = torch.log2(
            torch.arange(2, 2 + top, device=y.device, dtype=torch.float32)
        )
        dcg = (y[torch.topk(s, top).indices] / discount).sum()
        idcg = (torch.topk(y, top).values / discount).sum()

        if idcg > 0:
            total_ndcg = total_ndcg + dcg / idcg
        valid_count += 1

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ndcg / valid_count


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision@k over groups with at least ``min_group_size`` items.

    For each group the average precision is the mean of precision@j over the
    ranks ``j <= k`` that hold a positive label (label > 0). A group with no
    positive label in its top-k has an average precision of 0 and is included
    in the mean; leaving such groups out would report the precision of the
    successful groups only and overstate the metric.

    Args:
        scores: ``[B, G]`` predicted scores (padded along G).
        labels: ``[B, G]`` labels.
        lengths: per-group number of real (unpadded) items.
        k: cut-off rank.
        min_group_size: groups shorter than this are ignored.

    Returns:
        0-dim tensor; ``0.0`` when no group reaches ``min_group_size``.
    """
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        n = int(lengths[i])
        if n < min_group_size:
            continue
        valid_count += 1

        top_idx = torch.topk(scores[i, :n], min(k, n)).indices
        hits = (labels[i, :n][top_idx] > 0).float()
        n_hits = hits.sum()
        if n_hits == 0:
            continue  # AP = 0: adds nothing to the sum but is already counted

        # precision@j for every rank j, kept only at the ranks that are hits.
        ranks = torch.arange(1, hits.numel() + 1, device=hits.device, dtype=hits.dtype)
        precision_at_rank = hits.cumsum(dim=0) / ranks
        total_ap = total_ap + (precision_at_rank * hits).sum() / n_hits

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ap / valid_count



# Disabled ListNet-style loss (cross-entropy between softmax(labels) and
# log_softmax(scores) per group). It lives inside a bare module-level string
# literal, so it is never executed.
"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise RankNet loss, summed per group and averaged over the batch.

    For every ordered pair ``(a, b)`` inside a group with ``label_a > label_b``
    the term ``-logsigmoid(score_a - score_b)`` is added.

    Args:
        scores: ``[B, G]`` predicted scores (padded along G).
        labels: ``[B, G]`` labels.
        lengths: per-group number of real (unpadded) items.

    Returns:
        Sum of all pair terms divided by the number of groups ``B``.
    """
    n_groups = scores.size(0)
    accumulated = 0.0

    for g in range(n_groups):
        n = lengths[g]
        group_scores = scores[g][:n]
        group_labels = labels[g][:n]

        # margin[a, b] = s_a - s_b; prefer[a, b] = 1 where a should outrank b.
        margin = group_scores[:, None] - group_scores[None, :]
        prefer = (group_labels[:, None] > group_labels[None, :]).float()

        accumulated += (-prefer * torch.nn.functional.logsigmoid(margin)).sum()

    return accumulated / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """Margin RankNet loss restricted to pairs that touch the top-K labelled items.

    A pair ``(a, b)`` is used when ``label_a > label_b`` and at least one of
    the two items is among the ``focus_topk`` highest labels of its group.
    Each pair contributes
    ``-logsigmoid((score_a - score_b - margin) / temperature)`` weighted by
    ``|label_a - label_b|``.

    Args:
        scores: ``[B, G]`` predicted scores (padded along G).
        labels: ``[B, G]`` labels.
        lengths: per-group number of real (unpadded) items; groups with fewer
            than two items are skipped.
        temperature: divisor applied to the (margin-shifted) score difference.
        margin: subtracted from the score difference when positive.
        focus_topk: how many top-labelled items define the pairs of interest.

    Returns:
        Weighted pair loss divided by the total pair weight over the batch, or
        a zero tensor with ``requires_grad=True`` when no pair qualifies.
    """
    loss_sum = 0.0
    weight_sum = 0

    for g in range(scores.size(0)):
        n = lengths[g]
        if n < 2:
            continue

        s = scores[g][:n]
        y = labels[g][:n]

        # Membership mask of the K best items according to the labels.
        in_topk = torch.zeros_like(y, dtype=torch.bool)
        in_topk[torch.topk(y, min(focus_topk, n)).indices] = True

        label_gap = y[:, None] - y[None, :]
        selected = (label_gap > 0) & (in_topk[:, None] | in_topk[None, :])
        if not selected.any():
            continue

        weights = (label_gap.abs() * selected).float()

        logits = s[:, None] - s[None, :]
        if margin > 0.0:
            logits = logits - margin
        logits = logits / temperature

        loss_sum += (-F_torch.logsigmoid(logits) * weights).sum()
        weight_sum += weights.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum


# Disabled BPR loss (mean -logsigmoid over every positive/negative score pair
# of a group). It lives inside a bare module-level string literal, so it is
# never executed.
# NOTE(review): the body calls `F.logsigmoid`, whereas the live losses in this
# file refer to torch.nn.functional as `F_torch`; presumably this would raise
# NameError if re-enabled as-is -- confirm against the file's imports.
"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

# Disabled LambdaRank-style loss. It lives inside a bare module-level string
# literal, so it is never executed.
# NOTE(review), should this ever be re-enabled:
#   * `rank_order` holds the item indices returned by `torch.sort`, i.e.
#     `rank_order[p]` is the item placed at position p; the body indexes it by
#     item (`rank_order[i_idx]`) and uses the result as that item's rank --
#     looks like the inverse permutation was intended; confirm.
#   * the body calls `F.logsigmoid`, whereas the live losses in this file
#     refer to torch.nn.functional as `F_torch`.
#   * the delta matrix is filled by a nested Python loop over all item pairs.
"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` using ``xranknet_loss``.

    The scheduler is stepped after every batch, not once per epoch.

    Args:
        model: ranker called as ``model(x_cat, x_num)``.
        loader: yields ``(x_cat, x_num, y, lengths)`` batches.
        optimizer: optimiser stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: learning-rate scheduler stepped once per batch.

    Returns:
        Mean of the per-batch loss values (``sum / len(loader)``).
    """
    model.train()
    running_loss = 0.0

    progress = tqdm(loader, desc="Training", leave=False)
    for x_cat, x_num, y, lengths in progress:
        x_cat = {name: idx.to(device) for name, idx in x_cat.items()}
        x_num = x_num.to(device)
        y = y.to(device)
        # Accept plain sequences as well as tensors for the group lengths.
        lengths = lengths if isinstance(lengths, torch.Tensor) else torch.tensor(lengths)
        lengths = lengths.detach().clone().to(device)

        optimizer.zero_grad()
        loss = xranknet_loss(
            model(x_cat, x_num), y, lengths, temperature=1.0, margin=0.2, focus_topk=3
        )
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()

    return running_loss / len(loader)


def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader``: loss, HitRate@3, NDCG@3 and MAP@3.

    The loss is averaged over all groups, weighting each batch by its number of
    groups. The ranking metrics are reported by the metric functions as means
    over the groups of a batch that have at least ``min_group_size`` items, so
    each batch is weighted by that number of groups rather than by the full
    batch size. Weighting by batch size would pull the averages towards zero
    whenever a batch contains small groups. (This weighting is exact for a
    metric that averages over every group of at least ``min_group_size``
    items.)

    Args:
        model: ranker called as ``model(x_cat, x_num)``.
        loader: yields ``(x_cat, x_num, y, lengths)`` batches.
        device: device the batch tensors are moved to.
        min_group_size: minimum group size for a group to enter the metrics.

    Returns:
        ``(avg_loss, hitrate, ndcg, map)``; ``avg_loss`` is a float and the
        three metrics are 0-dim tensors (``0.0`` if no group qualified).
    """
    model.eval()
    total_hitrate = 0.0
    total_ndcg = 0.0
    total_map = 0.0
    total_loss = 0.0
    group_count = 0
    valid_count = 0

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):
            x_num, y = x_num.to(device), y.to(device)
            # as_tensor avoids copy-constructing (and the warning that comes
            # with it) when the loader already delivers a tensor.
            lengths = torch.as_tensor(lengths).to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            ndcg = ndcg_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            map3 = map_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)

            batch_size = scores.size(0)
            n_valid = int((lengths >= min_group_size).sum())

            total_hitrate += hitrate * n_valid
            total_ndcg += ndcg * n_valid
            total_map += map3 * n_valid
            total_loss += loss.item() * batch_size
            group_count += batch_size
            valid_count += n_valid

    def _metric_mean(total):
        # Always hand back a tensor: the caller reads the metrics via .item().
        mean = total / valid_count if valid_count > 0 else 0.0
        return torch.as_tensor(mean, device=device)

    return (
        total_loss / group_count,
        _metric_mean(total_hitrate),
        _metric_mean(total_ndcg),
        _metric_mean(total_map),
    )


def build_cat_dims(X, cat_features):
    """Map each categorical column to its table size: ``max(code, 0) + 1``.

    Args:
        X: column container supporting ``X[name].max()``.
        cat_features: names of the categorical columns.

    Returns:
        Dict ``{column: size}`` in the order of ``cat_features``. A column
        whose maximum is negative gets size 1.
    """

    def _table_size(column):
        largest = column.max()
        return (0 if largest < 0 else largest) + 1

    return {name: _table_size(X[name]) for name in cat_features}


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardise numeric columns using statistics of the training rows only.

    Mean and standard deviation are computed with the NaN-aware NumPy
    reductions. With the plain reductions a single NaN among the training rows
    makes the mean NaN and therefore turns the entire column into NaN. NaN
    entries themselves stay NaN in the output.

    Args:
        X: polars DataFrame holding the columns in ``numeric_features``.
        numeric_features: names of the columns to standardise.
        train_mask: boolean mask over the rows selecting the training rows.

    Returns:
        DataFrame with every listed column replaced by ``(x - mean) / std``;
        ``std`` falls back to 1.0 when it is zero or not a positive number.
    """
    normalized = []
    for col in numeric_features:
        values = X[col].to_numpy()
        train_vals = values[train_mask]
        mean = np.nanmean(train_vals)
        std = np.nanstd(train_vals)  # computed once, reused for the guard
        if not std > 0:  # also catches NaN (e.g. no usable training value)
            std = 1.0
        normalized.append(pl.Series(col, (values - mean) / std))

    if normalized:
        X = X.with_columns(normalized)
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Return the feature-engineered frame plus the test ids and ranker ids.

    If ``cache_file`` exists in ``data_dir`` it is loaded as-is. Otherwise the
    train and test parquet files are concatenated (the test rows get a constant
    ``selected = 0``), run through ``feature_engineering(..., full=True)``,
    null-filled (0 for numeric columns, ``"missing"`` for string columns) and
    written to the cache path. The test file is read in both cases to obtain
    ``Id`` and ``ranker_id``.

    Returns:
        ``(df, test_ids, test_rankers)``.
    """
    index_col = "__index_level_0__"
    cache_path = os.path.join(data_dir, cache_file)

    if not os.path.exists(cache_path):
        print("Cached file not found, processing raw data ...")

        train = pl.read_parquet(os.path.join(data_dir, train_file)).drop(index_col)
        test = pl.read_parquet(os.path.join(data_dir, test_file)).drop(index_col)
        test = test.with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))

        test_ids, test_rankers = test["Id"], test["ranker_id"]

        df = feature_engineering(pl.concat((train, test)), full=True)

        numeric_cols = df.select(pl.selectors.numeric()).columns
        string_cols = df.select(pl.selectors.string()).columns
        fills = [pl.col(c).fill_null(0) for c in numeric_cols]
        fills += [pl.col(c).fill_null("missing") for c in string_cols]
        df = df.with_columns(fills)

        os.makedirs(data_dir, exist_ok=True)
        df.write_parquet(cache_path)
        print(f"Saved feature-engineered data to {cache_path}")
        return df, test_ids, test_rankers

    print(f"Loading cached feature-engineered data from {cache_path} ...")
    df = pl.read_parquet(cache_path)

    test = pl.read_parquet(os.path.join(data_dir, test_file)).drop(index_col)
    return df, test["Id"], test["ranker_id"]


def prepare_data(data_dir, train_size=18145372):
    """Load the engineered data and turn it into model-ready features.

    Steps: load (or build) the engineered frame, run ``feature_selection``,
    re-attach the ``Id`` column, dense-rank every categorical column into
    integer codes starting at 0 (nulls become -1), and standardise the numeric
    columns using the first ``train_size`` rows as the training rows.

    Args:
        data_dir: directory containing the parquet files / cache.
        train_size: number of leading rows that belong to the training set.
            The default is the row count previously hard-coded here; pass a
            different value when working with another data snapshot.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size, test_ids,
        test_rankers)``.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    X = X.with_columns(
        [
            (pl.col(c).rank("dense") - 1).fill_null(-1).cast(pl.Int32)
            for c in cat_features
        ]
    )

    # Rows [0, train_size) provide the normalisation statistics.
    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Patience counter for a metric where larger values are better.

    ``step`` resets the counter when the metric beats the best value so far by
    more than ``min_delta``; otherwise the counter grows by one.
    """

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience, self.min_delta = patience, min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric``; return True once ``patience`` non-improving steps accrued."""
        improved = metric > self.best + self.min_delta
        if improved:
            self.best = metric
        self.counter = 0 if improved else self.counter + 1
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Histogram (log-scaled counts) of the group sizes in ``dataset``.

    Args:
        dataset: object exposing ``group_to_indices``, a mapping whose values
            are the row-index collections of each group.
        save_path: where to save the figure; when falsy the figure is shown
            instead.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.title("Distribution of Group Sizes")
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def _save_curves(epochs, curves, ylabel, title, path):
    """Plot per-epoch curves against ``epochs`` and save the figure to ``path``.

    ``curves`` is a list of ``(values, label, color)`` tuples; ``color=None``
    keeps matplotlib's default colour cycle.
    """
    plt.figure()
    for values, label, color in curves:
        if color is None:
            plt.plot(epochs, values, label=label)
        else:
            plt.plot(epochs, values, label=label, color=color)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.savefig(path)
    plt.close()


def _predict_scores(model, loader, device, desc):
    """Score every group of ``loader``; return the unpadded scores concatenated in loader order."""
    chunks = []
    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(loader, desc=desc):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)

            for i in range(scores.size(0)):
                # Drop the padded tail of each group.
                chunks.append(scores[i, : int(lengths[i])].cpu().numpy())
    return np.concatenate(chunks)


def main():
    """Train the FiBiNet ranker, plot the training curves and write predictions.

    Pipeline: prepare the data, split it by row position into train /
    validation / test, train with early stopping on the validation loss,
    checkpoint the model whenever validation loss and HitRate@3 both improve,
    save metric plots, then reload the checkpoint and write validation
    predictions and the test submission as CSV files.
    """
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    best_val_loss = float("inf")
    best_hitrate = 0.0
    checkpoint_saved = False
    model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_losses = []
    val_losses = []
    val_hitrates = []
    val_ndcgs = []
    val_maps = []
    learning_rates = []

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    early_stopper = EarlyStopping(patience=3, min_delta=0.001)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Fill missing values: NaNs in Float64 columns become 0.
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    if float_cols:
        X = X.with_columns(
            [
                pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
                for col in float_cols
            ]
        )

    # Positional split: [0, n1) train, [n1, n2) validation, [n2, end) test.
    n1 = 16487352
    # n1 = 17487300
    n2 = train_size

    X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
    y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
    groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

    # Inspect group sizes in validation set
    group_to_rows = defaultdict(list)
    for i, g in enumerate(groups_va["ranker_id"].to_numpy()):
        group_to_rows[g].append(i)

    print("\n📋 Validation group statistics:")
    # Save detailed group info to file
    group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
    with open(group_info_path, "w", encoding="utf-8") as f:
        f.write(f"Total groups: {len(group_to_rows)}\n\n")
        f.write("GroupID\tSize\tStartIdx\tEndIdx\n")

        for group_id, indices in sorted(group_to_rows.items()):
            f.write(f"{group_id}\t{len(indices)}\t{indices[0]}\t{indices[-1]}\n")

    print(f"📁 Saved validation group info to {group_info_path}")

    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    num_epochs = 10
    # The scheduler is stepped per batch, so its horizon is counted in batches.
    num_training_steps = num_epochs * len(train_loader)
    num_warmup_steps = int(0.1 * num_training_steps)

    learning_rate = 7e-4

    model = FiBiNetRanker(
        cat_dims=cat_dims,
        num_numeric_feats=num_numeric_feats,
        emb_dim=64,
        hidden=[512, 256, 128, 64]
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
        val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

        # Convert the metrics to plain floats once; they are used for logging,
        # plotting and the checkpoint comparison below.
        val_hitrate = val_hitrate.item()
        val_ndcg = val_ndcg.item()
        val_map = val_map.item()

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_hitrates.append(val_hitrate)
        val_ndcgs.append(val_ndcg)
        val_maps.append(val_map)

        current_lr = scheduler.get_last_lr()
        learning_rates.append(current_lr[0])

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"Val Loss={val_loss:.4f}, "
            f"HitRate@3={val_hitrate:.4f}, "
            f"NDCG@3={val_ndcg:.4f}, "
            f"MAP@3={val_map:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )

        if val_loss < best_val_loss and val_hitrate > best_hitrate:
            best_val_loss = val_loss
            best_hitrate = val_hitrate
            torch.save(model.state_dict(), model_path)
            checkpoint_saved = True
            print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

        if early_stopper.step(-val_loss):
            print("⛔ Early stopping triggered.")
            break

    # Plot metrics
    epochs = list(range(1, len(train_losses) + 1))

    _save_curves(
        epochs,
        [(train_losses, "Train Loss", None), (val_losses, "Val Loss", None)],
        "Loss",
        "Training & Validation Loss",
        os.path.join(MODEL_DIR, "loss_curve.png"),
    )
    _save_curves(
        epochs,
        [(val_hitrates, "Val HitRate@3", "green")],
        "HitRate@3",
        "Validation HitRate@3",
        os.path.join(MODEL_DIR, "hitrate_curve.png"),
    )
    _save_curves(
        epochs,
        [(learning_rates, "Learning Rate", "orange")],
        "LR",
        "Learning Rate Schedule",
        os.path.join(MODEL_DIR, "lr_curve.png"),
    )
    _save_curves(
        epochs,
        [(val_ndcgs, "Val NDCG@3", "purple")],
        "NDCG@3",
        "Validation NDCG@3",
        os.path.join(MODEL_DIR, "ndcg_curve.png"),
    )
    _save_curves(
        epochs,
        [(val_maps, "Val MAP@3", "red")],
        "MAP@3",
        "Validation MAP@3",
        os.path.join(MODEL_DIR, "map_curve.png"),
    )

    print("📊 Saved metric plots to model/")

    # Predict with the best checkpoint. If no epoch met the save criterion,
    # persist the final weights first: otherwise the reload would either fail
    # (no file) or silently pick up a checkpoint left by an earlier run.
    if not checkpoint_saved:
        torch.save(model.state_dict(), model_path)
        print("⚠️ No epoch met the checkpoint criterion; saved final weights instead.")
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    # NOTE(review): scores come back group by group in loader order and are
    # attached to rows by position below. That presumably relies on the rows of
    # each ranker_id being contiguous in X_va / X_te -- confirm.
    all_val_scores = _predict_scores(model, val_loader, device, "Predicting validation set")

    assert len(all_val_scores) == X_va.shape[0], \
        f"Mismatch: {len(all_val_scores)} scores vs {X_va.shape[0]} rows"

    val_df = X_va.with_columns([
        pl.Series("Id", X_va["Id"]),
        pl.Series("ranker_id", groups_va["ranker_id"]),
        pl.Series("score", all_val_scores),
        pl.Series("label", y_va)
    ])

    # Rank 1 = highest score within each ranker_id.
    val_df = val_df.with_columns(
        pl.col("score")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
    val_df.write_csv(val_save_path)
    print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

    all_scores = _predict_scores(model, test_loader, device, "Predicting test set")

    assert len(all_scores) == X_te.shape[0], \
        f"Mismatch: {len(all_scores)} scores vs {X_te.shape[0]} rows in X_te"

    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": all_scores
    })

    submission = (
        submission_df
        .with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )
        .select(["Id", "ranker_id", "score", "selected"])
    )

    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
    submission.write_csv(submission_path)
    print(f"✅ Submission saved to {submission_path}")


# Run the training / prediction pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()


In [None]:
%%writefile deeprec_full_training.py
import os
import time
import random
import torch
import torch.nn as nn
import torch.nn.functional as F_torch
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler

class RankDataset(Dataset):
    """Dataset that yields one ranking group (one ``ranker_id``) per item.

    Rows are grouped by ``groups["ranker_id"]``; groups are ordered by the
    first appearance of their id, and rows keep their original order inside a
    group.

    Args:
        X: feature frame containing the columns in ``cat_features`` and
            ``num_features``.
        y: labels, one per row of ``X``.
        groups: frame with a ``ranker_id`` column, one entry per row of ``X``.
        cat_features: names of the categorical (integer-coded) columns.
        num_features: names of the numeric columns.
    """

    # The polars annotations are quoted so they are not evaluated when the
    # class is defined.
    def __init__(
        self,
        X: "pl.DataFrame",
        y: "pl.Series",
        groups: "pl.DataFrame",
        cat_features: list[str],
        num_features: list[str],
    ):
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # group id -> row positions, in order of appearance.
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Number of groups."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        ``x_cat`` maps each categorical column to a ``long`` tensor with
        negative codes replaced by 0; ``x_num`` is ``float32`` of shape
        ``[length, len(num_features)]``; ``y`` is a flat ``float32`` tensor.
        """
        inds = self.group_to_indices[self.group_keys[idx]]

        # Clamp negative codes to 0 with one vectorised call per column
        # instead of a Python-level loop over every row of the group.
        x_cat = {
            c: torch.from_numpy(np.maximum(self.X_cat[c][inds], 0).astype(np.int64))
            for c in self.cat_features
        }

        x_num = torch.as_tensor(self.X_num[inds], dtype=torch.float32)
        y = torch.as_tensor(self.y[inds], dtype=torch.float32).reshape(-1)

        return x_cat, x_num, y, len(inds)

def collate_fn(batch):
    """Zero-pad a list of groups to a common length and stack them.

    Args:
        batch: list of ``(x_cat, x_num, y, length)`` items, where ``x_cat`` is
            a dict of 1-D index tensors, ``x_num`` is ``[length, num_dim]`` and
            ``y`` is ``[length]``.

    Returns:
        ``(padded_x_cat, padded_x_num, padded_y, lengths)`` with shapes
        ``{key: [B, max_len]}``, ``[B, max_len, num_dim]``, ``[B, max_len]``
        and ``[B]`` (``long``).
    """
    cat_seqs, num_seqs, label_seqs, lengths = zip(*batch)
    pad = torch.nn.utils.rnn.pad_sequence

    padded_x_cat = {
        key: pad([fields[key] for fields in cat_seqs], batch_first=True, padding_value=0)
        for key in cat_seqs[0]
    }
    padded_x_num = pad(list(num_seqs), batch_first=True, padding_value=0)
    padded_y = pad(list(label_seqs), batch_first=True, padding_value=0)

    return padded_x_cat, padded_x_num, padded_y, torch.tensor(lengths, dtype=torch.long)


class FiBiNetRanker(nn.Module):
    """FiBiNET-style ranker that scores every item of a group.

    The score is the sum of a first-order part (one scalar weight per
    categorical value plus a linear map of the numeric features) and a deep
    part (optional SENet re-weighting of the field embeddings, a shared
    bilinear interaction pooled over fields, then an MLP).

    Args:
        cat_dims: mapping ``{field name: number of embedding rows}``.
        num_numeric_feats: number of numeric features per item.
        emb_dim: embedding size of each categorical field.
        hidden: sizes of the MLP layers (any iterable of ints).
        reduction: SENet bottleneck reduction factor.
        use_senet: whether to apply the SENet gate.
    """

    def __init__(self, cat_dims, num_numeric_feats, emb_dim=64, hidden=(512, 256, 128, 64), reduction=4, use_senet=True):
        super().__init__()
        self.cat_fields = list(cat_dims.keys())
        self.emb_dim = emb_dim
        self.num_fields = len(self.cat_fields)
        self.use_senet = use_senet

        # Embedding layers
        self.embeddings = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], emb_dim, padding_idx=0) for f in self.cat_fields
        })
        self.linear_cat = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], 1, padding_idx=0) for f in self.cat_fields
        })
        self.linear_num = nn.Linear(num_numeric_feats, 1)

        for emb in list(self.embeddings.values()) + list(self.linear_cat.values()):
            nn.init.xavier_uniform_(emb.weight)
            # Xavier init also overwrites the all-zero row that nn.Embedding
            # sets up for padding_idx. That row receives no gradient, so
            # restore it; otherwise index 0 would map to a fixed random vector.
            with torch.no_grad():
                emb.weight[emb.padding_idx].zero_()
        nn.init.xavier_uniform_(self.linear_num.weight)

        # SENet
        if self.use_senet:
            self.se_fc1 = nn.Linear(self.num_fields * emb_dim, self.num_fields * emb_dim // reduction)
            self.se_fc2 = nn.Linear(self.num_fields * emb_dim // reduction, self.num_fields * emb_dim)

        # Bilinear layer (shared across fields)
        self.bilinear = nn.Linear(emb_dim, emb_dim, bias=False)

        # MLP on [pooled interaction, numeric features]
        input_dim = emb_dim + num_numeric_feats
        layers = []
        for h in hidden:
            layers.append(nn.Linear(input_dim, h))
            layers.append(nn.LayerNorm(h))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.2))
            input_dim = h
        self.mlp = nn.Sequential(*layers)
        self.output = nn.Linear(input_dim, 1)

    def senet(self, embs):
        """Gate the embeddings ``[N, F, D]`` with a squeeze-and-excitation block."""
        B, F, D = embs.shape
        z = embs.reshape(B, -1)
        a = F_torch.relu(self.se_fc1(z))
        s = torch.sigmoid(self.se_fc2(a)).view(B, F, D)
        return embs * s

    def bilinear_interaction(self, embs):
        """Return ``embs * bilinear(embs)`` averaged over the field axis: ``[N, D]``."""
        transformed = self.bilinear(embs)  # [B*G, F, D]
        interaction = (embs * transformed).mean(dim=1)  # [B*G, D]
        return interaction

    def forward(self, x_cat, x_num):
        """Score items.

        Args:
            x_cat: dict ``{field: LongTensor [B, G]}`` of category indices.
            x_num: ``[B, G, num_numeric_feats]`` numeric features.

        Returns:
            ``[B, G]`` scores.
        """
        B, G = x_num.shape[:2]
        F, D = self.num_fields, self.emb_dim

        # Embedding stacking
        embs = torch.stack([self.embeddings[f](x_cat[f]) for f in self.cat_fields], dim=2)  # [B, G, F, D]

        # Linear logits
        linear_cat_sum = torch.stack([
            self.linear_cat[f](x_cat[f]).squeeze(-1) for f in self.cat_fields
        ], dim=0).sum(dim=0)  # [B, G]

        linear_num_out = self.linear_num(x_num).squeeze(-1)  # [B, G]
        linear_out = linear_cat_sum + linear_num_out  # [B, G]

        # Feature interactions (reshape tolerates non-contiguous inputs)
        embs = embs.reshape(B * G, F, D)
        if self.use_senet:
            embs = self.senet(embs)
        bi = self.bilinear_interaction(embs)  # [B*G, D]

        # Deep part
        x_num_flat = x_num.reshape(B * G, -1)
        deep_input = torch.cat([bi, x_num_flat], dim=1)
        deep_out = self.mlp(deep_input)
        scores = self.output(deep_out).view(B, G)

        return scores + linear_out  # [B, G]

# Disabled ListMLE loss: the definition below sits inside a bare module-level
# string literal, so it is never executed.
# NOTE(review): before re-enabling, check the masking. `triu` zero-fills the
# lower triangle, and those zeros still enter `logsumexp` (each as exp(0))
# instead of being excluded from the sum.
"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3):
    """Share of groups with at least one positive label among the top-k scores.

    Args:
        scores: ``[B, G]`` predicted scores (padded along G).
        labels: ``[B, G]`` labels; an item counts as a hit when its label > 0.
        lengths: per-group number of real (unpadded) items.
        k: number of top-scored items inspected per group.

    Returns:
        0-dim tensor: number of groups with a hit divided by ``B``.
    """
    n_groups = scores.size(0)
    flags = []
    for g in range(n_groups):
        n = lengths[g]
        top = torch.topk(scores[g][:n], min(n, k)).indices
        flags.append((labels[g][:n][top] > 0).any().float())
    return sum(flags) / n_groups

# Disabled ListNet-style loss (cross-entropy between softmax(labels) and
# log_softmax(scores) per group). It lives inside a bare module-level string
# literal, so it is never executed.
"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise RankNet loss, summed per group and averaged over the batch.

    For every ordered pair ``(a, b)`` inside a group with ``label_a > label_b``
    the term ``-logsigmoid(score_a - score_b)`` is added.

    Args:
        scores: ``[B, G]`` predicted scores (padded along G).
        labels: ``[B, G]`` labels.
        lengths: per-group number of real (unpadded) items.

    Returns:
        Sum of all pair terms divided by the number of groups ``B``.
    """
    n_groups = scores.size(0)
    accumulated = 0.0

    for g in range(n_groups):
        n = lengths[g]
        group_scores = scores[g][:n]
        group_labels = labels[g][:n]

        # margin[a, b] = s_a - s_b; prefer[a, b] = 1 where a should outrank b.
        margin = group_scores[:, None] - group_scores[None, :]
        prefer = (group_labels[:, None] > group_labels[None, :]).float()

        accumulated += (-prefer * torch.nn.functional.logsigmoid(margin)).sum()

    return accumulated / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """Margin RankNet loss restricted to pairs that touch the top-K labelled items.

    A pair ``(a, b)`` is used when ``label_a > label_b`` and at least one of
    the two items is among the ``focus_topk`` highest labels of its group.
    Each pair contributes
    ``-logsigmoid((score_a - score_b - margin) / temperature)`` weighted by
    ``|label_a - label_b|``.

    Args:
        scores: ``[B, G]`` predicted scores (padded along G).
        labels: ``[B, G]`` labels.
        lengths: per-group number of real (unpadded) items; groups with fewer
            than two items are skipped.
        temperature: divisor applied to the (margin-shifted) score difference.
        margin: subtracted from the score difference when positive.
        focus_topk: how many top-labelled items define the pairs of interest.

    Returns:
        Weighted pair loss divided by the total pair weight over the batch, or
        a zero tensor with ``requires_grad=True`` when no pair qualifies.
    """
    loss_sum = 0.0
    weight_sum = 0

    for g in range(scores.size(0)):
        n = lengths[g]
        if n < 2:
            continue

        s = scores[g][:n]
        y = labels[g][:n]

        # Membership mask of the K best items according to the labels.
        in_topk = torch.zeros_like(y, dtype=torch.bool)
        in_topk[torch.topk(y, min(focus_topk, n)).indices] = True

        label_gap = y[:, None] - y[None, :]
        selected = (label_gap > 0) & (in_topk[:, None] | in_topk[None, :])
        if not selected.any():
            continue

        weights = (label_gap.abs() * selected).float()

        logits = s[:, None] - s[None, :]
        if margin > 0.0:
            logits = logits - margin
        logits = logits / temperature

        loss_sum += (-F_torch.logsigmoid(logits) * weights).sum()
        weight_sum += weights.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` using ``xranknet_loss``.

    The scheduler is stepped after every batch, not once per epoch.

    Args:
        model: ranker called as ``model(x_cat, x_num)``.
        loader: yields ``(x_cat, x_num, y, lengths)`` batches.
        optimizer: optimiser stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: learning-rate scheduler stepped once per batch.

    Returns:
        Mean of the per-batch loss values (``sum / len(loader)``).
    """
    model.train()
    running_loss = 0.0

    progress = tqdm(loader, desc="Training", leave=False)
    for x_cat, x_num, y, lengths in progress:
        x_cat = {name: idx.to(device) for name, idx in x_cat.items()}
        x_num = x_num.to(device)
        y = y.to(device)
        # Accept plain sequences as well as tensors for the group lengths.
        lengths = lengths if isinstance(lengths, torch.Tensor) else torch.tensor(lengths)
        lengths = lengths.detach().clone().to(device)

        optimizer.zero_grad()
        loss = xranknet_loss(
            model(x_cat, x_num), y, lengths, temperature=1.0, margin=0.2, focus_topk=3
        )
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()

    return running_loss / len(loader)


def validate(model, loader, device):
    """Evaluate ``model`` on ``loader``: average loss and HitRate@3.

    Both quantities are per-batch means re-weighted by the number of groups in
    the batch, so the result is an average over all groups.

    Args:
        model: ranker called as ``model(x_cat, x_num)``.
        loader: yields ``(x_cat, x_num, y, lengths)`` batches.
        device: device the batch tensors are moved to.

    Returns:
        ``(avg_loss, avg_hitrate)``.
    """
    model.eval()
    total_hitrate = 0
    total_loss = 0
    count = 0

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):
            x_num, y = x_num.to(device), y.to(device)
            # collate_fn already returns ``lengths`` as a tensor; as_tensor
            # accepts both tensors and plain sequences without
            # copy-constructing a tensor from a tensor (which warns).
            lengths = torch.as_tensor(lengths).to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)

            batch_size = scores.size(0)
            total_hitrate += hitrate * batch_size
            total_loss += loss.item() * batch_size
            count += batch_size

    avg_hitrate = total_hitrate / count
    avg_loss = total_loss / count
    return avg_loss, avg_hitrate


def build_cat_dims(X, cat_features):
    """Return the embedding-table size needed for each categorical column.

    The size is ``max(code) + 1`` so that every non-negative code is a valid
    row index. Columns whose maximum is negative, or that have no maximum at
    all (``max()`` returned ``None``, e.g. an empty or all-null column), get
    size 1.

    Args:
        X: Mapping-like table where ``X[c].max()`` gives the largest code of
            column ``c``.
        cat_features: Names of the categorical columns.

    Returns:
        Dict mapping column name to a plain ``int`` table size.
    """
    cat_dims = {}
    for c in cat_features:
        max_val = X[c].max()
        # None means "no values"; comparing None < 0 would raise TypeError.
        if max_val is None or max_val < 0:
            max_val = 0
        # int() strips NumPy scalar types so the sizes are plain Python ints.
        cat_dims[c] = int(max_val) + 1
    return cat_dims


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardise numeric columns with statistics taken from training rows.

    For every column the mean and standard deviation are computed over the
    rows selected by ``train_mask`` (ignoring NaNs) and then applied to all
    rows. NaN entries stay NaN in the output so the caller can impute them.

    Args:
        X: Frame holding the columns; a new frame is returned for each
            replaced column via ``with_columns``.
        numeric_features: Names of the columns to standardise.
        train_mask: Boolean NumPy mask selecting the training rows.

    Returns:
        The frame with every listed column replaced by its standardised values.
    """
    for col in numeric_features:
        values = X[col].to_numpy()
        train_vals = values[train_mask]
        # A single NaN would make mean/std NaN and turn the whole column into
        # NaN, so the statistics are computed on the non-NaN training values.
        train_vals = train_vals[~np.isnan(train_vals)]
        if train_vals.size > 0:
            mean = train_vals.mean()
            std = train_vals.std()
        else:
            # No usable training value: leave the column unscaled.
            mean, std = 0.0, 1.0
        # Constant columns (std == 0) are only centred, never divided by zero.
        if not std > 0:
            std = 1.0
        norm_vals = (values - mean) / std
        X = X.with_columns(pl.Series(col, norm_vals).alias(col))
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Return the feature-engineered frame plus the test ``Id``/``ranker_id`` columns.

    If ``cache_file`` exists inside ``data_dir`` it is loaded directly;
    otherwise train and test are read, concatenated, passed through
    ``feature_engineering``, null-filled, and written to the cache.

    Args:
        data_dir: Directory holding the parquet files and the cache.
        train_file: File name of the training parquet.
        test_file: File name of the test parquet.
        cache_file: File name of the feature-engineered cache.

    Returns:
        Tuple ``(df, test_ids, test_rankers)``.
    """
    cache_path = os.path.join(data_dir, cache_file)
    test_path = os.path.join(data_dir, test_file)

    if not os.path.exists(cache_path):
        print("Cached file not found, processing raw data ...")

        train = pl.read_parquet(os.path.join(data_dir, train_file)).drop("__index_level_0__")
        raw_test = pl.read_parquet(test_path).drop("__index_level_0__")
        # Test rows get a placeholder label so both frames share one schema.
        test = raw_test.with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))

        test_ids = test["Id"]
        test_rankers = test["ranker_id"]

        df = feature_engineering(pl.concat((train, test)), full=True)

        numeric_columns = df.select(pl.selectors.numeric()).columns
        string_columns = df.select(pl.selectors.string()).columns
        numeric_fills = [pl.col(name).fill_null(0) for name in numeric_columns]
        string_fills = [pl.col(name).fill_null("missing") for name in string_columns]
        df = df.with_columns(numeric_fills + string_fills)

        os.makedirs(data_dir, exist_ok=True)
        df.write_parquet(cache_path)
        print(f"Saved feature-engineered data to {cache_path}")
        return df, test_ids, test_rankers

    print(f"Loading cached feature-engineered data from {cache_path} ...")
    df = pl.read_parquet(cache_path)

    # The cache holds features only; identifiers are re-read from the raw test file.
    test = pl.read_parquet(test_path).drop("__index_level_0__")
    return df, test["Id"], test["ranker_id"]


def prepare_data(data_dir, train_size=18145372):
    """Load the data, integer-encode categoricals and standardise numerics.

    Categorical columns are replaced by their dense rank, which starts at 1;
    nulls become 0. Code 0 is therefore reserved for "missing" and never
    shared with a real category. (The previous 0-based encoding put the
    smallest category on code 0 and nulls on -1.)

    Args:
        data_dir: Directory passed to ``load_or_prepare_data``.
        train_size: Number of leading rows that belong to the training set;
            only these rows contribute to the normalisation statistics.
            Defaults to the row count previously hard-coded here.

    Returns:
        Tuple ``(X, y, groups, cat_features, num_features, train_size,
        test_ids, test_rankers)``.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    # Dense rank is 1-based, so real categories map to 1..K and 0 stays free
    # for nulls.
    # NOTE(review): downstream code is assumed to treat index 0 as the
    # padding/missing slot — confirm against the dataset and model in use.
    X = X.with_columns(
        [
            pl.col(c).rank("dense").fill_null(0).cast(pl.Int32)
            for c in cat_features
        ]
    )

    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Signal a stop once a maximised metric stalls for ``patience`` steps.

    A step counts as an improvement only when the metric exceeds the best
    value seen so far by more than ``min_delta``.
    """

    def __init__(self, patience=3, min_delta=0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric`` and return True when training should stop."""
        has_improved = metric > self.best + self.min_delta
        if has_improved:
            # New best: remember it and restart the patience window.
            self.best, self.counter = metric, 0
        else:
            self.counter += 1
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scale histogram of how many items each group contains.

    Args:
        dataset: Object exposing ``group_to_indices``, a mapping whose values
            are the per-group index collections.
        save_path: Where to save the figure; when falsy the figure is shown
            instead.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Distribution of Group Sizes")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def main():
    """Train the FiBiNet ranker on the full training set and write a submission.

    Steps: prepare the data, replace NaNs in Float64 columns with 0, split
    the frame at ``train_size`` into train and test parts, train for a fixed
    number of epochs with AdamW and a linear warmup/decay schedule (saving a
    checkpoint after every epoch), score the test groups, and write per-group
    ranks to ``submission/submission_dl_ranker.parquet``.

    Raises:
        RuntimeError: If the scored groups do not line up with the test rows.
    """
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_losses = []
    learning_rates = []

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Replace NaNs in all Float64 columns in a single pass.
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    if float_cols:
        X = X.with_columns(
            [
                pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
                for col in float_cols
            ]
        )

    n2 = train_size

    X_tr, X_te = X[:n2], X[n2:]
    y_tr, y_te = y[:n2], y[n2:]
    groups_tr, groups_te = groups[:n2], groups[n2:]

    # Embedding sizes come from the full frame so test-only codes are covered.
    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    num_epochs = 10
    num_training_steps = num_epochs * len(train_loader)
    num_warmup_steps = int(0.1 * num_training_steps)

    learning_rate = 7e-4

    model = FiBiNetRanker(
        cat_dims=cat_dims,
        num_numeric_feats=num_numeric_feats,
        emb_dim=64,
        hidden=[512, 256, 128, 64]
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.05)

    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)

        train_losses.append(train_loss)

        current_lr = scheduler.get_last_lr()
        learning_rates.append(current_lr[0])

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )

        # There is no validation split in this script, so the checkpoint is
        # written after every epoch; the message no longer claims an
        # improvement that was never measured.
        torch.save(model.state_dict(), model_path)
        print("💾 Checkpoint saved.")

    # Predict test set
    model.load_state_dict(torch.load(model_path))
    model.eval()

    # Scores are produced group by group, while the submission is built in
    # test-row order. Writing each group's scores back to its own row
    # positions keeps both aligned even if a ranker_id's rows are not
    # contiguous in the test data.
    # NOTE(review): assumes group_to_indices preserves the order in which the
    # dataset serves groups (unshuffled loader) — the size check below guards it.
    group_rows = list(test_dataset.group_to_indices.values())
    row_scores = np.empty(X_te.shape[0], dtype=np.float32)
    group_pos = 0
    n_scored = 0

    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(test_loader, desc="Predicting test set"):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num).cpu().numpy()

            for i in range(scores.shape[0]):
                l = int(lengths[i])
                rows = group_rows[group_pos]
                if len(rows) != l:
                    raise RuntimeError(
                        f"Group {group_pos} has {len(rows)} rows but {l} scores"
                    )
                row_scores[rows] = scores[i, :l]
                group_pos += 1
                n_scored += l

    if n_scored != X_te.shape[0]:
        raise RuntimeError(
            f"Mismatch: {n_scored} scores vs {X_te.shape[0]} rows in X_te"
        )

    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": row_scores
    })

    # Rank 1 goes to the highest score within each ranker_id.
    submission = (
        submission_df
        .with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )
        .select(["Id", "ranker_id", "selected"])
    )


    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.parquet")
    submission.write_parquet(submission_path)
    print(f"✅ Submission saved to {submission_path}")


if __name__ == "__main__":
    main()

Writing deeprec_full_training.py


In [None]:
%%writefile deeprec_full_training.py
import os
import time
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler

# Seed every random number generator the script uses (Python, NumPy,
# PyTorch CPU and CUDA) with the same value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Request deterministic cuDNN kernels and switch off the cuDNN benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

class RankDataset(Dataset):
    """Dataset that serves one ranking group (one ``ranker_id``) per item.

    Rows are bucketed by ``ranker_id`` in order of first appearance;
    ``group_to_indices`` maps each id to the positions of its rows and
    ``group_keys`` lists the ids in serving order.
    """

    def __init__(
        self,
        X: "pl.DataFrame",
        y: "pl.Series",
        groups: "pl.DataFrame",
        cat_features: list[str],
        num_features: list[str],
    ):
        """Convert the inputs to NumPy arrays and index rows by group.

        Args:
            X: Frame containing the categorical and numeric feature columns.
            y: Labels, one per row of ``X``.
            groups: Frame with a ``ranker_id`` column, one entry per row.
            cat_features: Names of the integer-coded categorical columns.
            num_features: Names of the numeric columns.
        """
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of groups."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        ``x_cat`` maps each categorical feature to an int64 tensor in which
        negative codes have been replaced by 0; ``x_num`` and ``y`` are
        float32 tensors; ``length`` is the number of rows in the group.
        """
        g = self.group_keys[idx]
        inds = self.group_to_indices[g]
        length = len(inds)

        # One vectorised gather + clamp per feature instead of a Python-level
        # loop over every row of the group.
        x_cat = {
            c: torch.as_tensor(np.maximum(self.X_cat[c][inds], 0), dtype=torch.long)
            for c in self.cat_features
        }

        x_num = torch.as_tensor(self.X_num[inds], dtype=torch.float32)
        y = torch.as_tensor(self.y[inds], dtype=torch.float32).reshape(-1)

        return x_cat, x_num, y, length

def collate_fn(batch):
    """Pad a list of variable-length groups to a common length and stack them.

    Args:
        batch: List of ``(x_cat, x_num, y, length)`` samples, where ``x_cat``
            is a dict of 1-D long tensors, ``x_num`` a 2-D tensor and ``y`` a
            1-D tensor.

    Returns:
        ``(x_cat, x_num, y, lengths)``: every tensor gains a leading batch
        dimension, shorter groups are right-padded with zeros, and
        ``lengths`` is a long tensor holding the unpadded group sizes.
    """
    lengths = [sample[3] for sample in batch]
    longest = max(lengths)
    cat_keys = list(batch[0][0].keys())
    feature_dim = batch[0][1].shape[1]

    cat_rows = {key: [] for key in cat_keys}
    num_rows = []
    label_rows = []

    for cats, nums, labels, n_items in batch:
        shortfall = longest - n_items
        if shortfall > 0:
            # Right-pad every field of this group with zeros.
            cats = {
                key: torch.cat([cats[key], torch.zeros(shortfall, dtype=torch.long)])
                for key in cat_keys
            }
            nums = torch.cat([nums, torch.zeros(shortfall, feature_dim)])
            labels = torch.cat([labels, torch.zeros(shortfall)])

        for key in cat_keys:
            cat_rows[key].append(cats[key])
        num_rows.append(nums)
        label_rows.append(labels)

    return (
        {key: torch.stack(rows) for key, rows in cat_rows.items()},
        torch.stack(num_rows),
        torch.stack(label_rows),
        torch.tensor(lengths, dtype=torch.long),
    )


class DLRanker(nn.Module):
    """Wide-and-deep style ranker over categorical and numeric features.

    The score of an item is the sum of a linear part (one scalar embedding
    per categorical feature plus a linear layer over the numerics) and a deep
    part (an MLP over the concatenated feature embeddings and numerics).
    The factorisation-machine interaction term is disabled and contributes
    nothing to the score.
    """

    def __init__(self, cat_dims, num_numeric_feats, emb_dim=64, hidden=(512, 256, 128, 64)):
        """Build the embedding tables, linear part and MLP.

        Args:
            cat_dims: Mapping from categorical feature name to table size.
            num_numeric_feats: Number of numeric input features.
            emb_dim: Width of each categorical embedding.
            hidden: Widths of the MLP layers, in order. An immutable default
                is used so the default cannot be shared and mutated.
        """
        super().__init__()

        self.embeddings = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], emb_dim, padding_idx=0) for f in cat_dims
        })
        self.linear_cat = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], 1, padding_idx=0) for f in cat_dims
        })

        # Xavier initialisation is applied to the whole table, including row 0.
        for emb in list(self.embeddings.values()) + list(self.linear_cat.values()):
            nn.init.xavier_uniform_(emb.weight)

        self.linear_num = nn.Linear(num_numeric_feats, 1)
        nn.init.xavier_uniform_(self.linear_num.weight)

        input_dim = emb_dim * len(cat_dims) + num_numeric_feats
        layers = []
        for h in hidden:
            layers.append(nn.Linear(input_dim, h))
            layers.append(nn.BatchNorm1d(h))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.1))
            input_dim = h
        self.mlp = nn.Sequential(*layers)
        self.output = nn.Linear(input_dim, 1)

    def forward(self, x_cat, x_num):
        """Score every item of every group.

        Args:
            x_cat: Dict mapping feature name to a ``(batch, group)`` long tensor.
            x_num: Float tensor of shape ``(batch, group, num_numeric_feats)``.

        Returns:
            Float tensor of shape ``(batch, group)`` with one score per item.
        """
        batch_size, group_size = x_num.shape[:2]

        # Linear ("wide") part: per-feature scalar weights plus numeric projection.
        linear_cat_sum = torch.stack([
            self.linear_cat[f](x_cat[f]).squeeze(-1)
            for f in self.linear_cat
        ], dim=0).sum(dim=0)

        linear_num_out = self.linear_num(x_num).squeeze(-1)
        linear_out = linear_cat_sum + linear_num_out

        # (batch, group, n_features, emb_dim)
        embs = torch.stack([
            self.embeddings[f](x_cat[f]) for f in self.embeddings
        ], dim=2)

        # Deep part: items are flattened to rows because BatchNorm1d expects
        # a 2-D (rows, features) input.
        embs_cat = embs.reshape(batch_size * group_size, -1)
        x_num_flat = x_num.reshape(batch_size * group_size, -1)
        deep_input = torch.cat([embs_cat, x_num_flat], dim=1)
        deep_out = self.mlp(deep_input)
        deep_out = self.output(deep_out).view(batch_size, group_size)

        # The FM pairwise-interaction term was switched off (it added 0.0), so
        # its intermediate tensors are no longer computed.
        return linear_out + deep_out

"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3):
    """Return the fraction of groups with a positive label among the top ``k`` scores.

    Args:
        scores: ``(batch, max_len)`` tensor of predicted scores.
        labels: ``(batch, max_len)`` tensor; values above 0 count as positive.
        lengths: Per-group number of valid (unpadded) positions.
        k: Number of top-scored items inspected; capped at the group length.

    Returns:
        Hits divided by the number of groups in the batch.
    """
    n_groups = scores.size(0)
    hit_total = 0
    for row in range(n_groups):
        n_items = lengths[row]
        depth = min(n_items, k)
        group_scores = scores[row][:n_items]
        group_labels = labels[row][:n_items]
        _, best = torch.topk(group_scores, depth)
        hit_total += (group_labels[best] > 0).any().float()
    return hit_total / n_groups

"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise RankNet loss summed over ordered pairs and averaged over groups.

    For every pair ``(i, j)`` inside a group with ``label_i > label_j`` the
    loss adds ``-logsigmoid(score_i - score_j)``.

    Args:
        scores: ``(batch, max_len)`` tensor of predicted scores.
        labels: ``(batch, max_len)`` tensor of relevance labels.
        lengths: Per-group number of valid (unpadded) positions.

    Returns:
        Total pair loss divided by the number of groups in the batch.
    """
    n_groups = scores.size(0)
    loss_sum = 0.0

    for row in range(n_groups):
        n_items = lengths[row]
        group_scores = scores[row][:n_items]
        group_labels = labels[row][:n_items]

        # Entry [i, j] compares item i against item j.
        score_gap = group_scores.unsqueeze(1) - group_scores.unsqueeze(0)
        prefers_i = (group_labels.unsqueeze(1) > group_labels.unsqueeze(0)).float()

        per_pair = -prefers_i * torch.nn.functional.logsigmoid(score_gap)
        loss_sum += per_pair.sum()

    return loss_sum / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """Weighted pairwise logistic loss restricted to pairs touching the top-K labels.

    A pair ``(i, j)`` counts when ``label_i > label_j`` and at least one of
    the two items is among the ``focus_topk`` highest-labelled items of the
    group. Each pair is weighted by ``|label_i - label_j|`` and the total is
    normalised by the sum of those weights.

    Args:
        scores: ``(batch, max_len)`` tensor of predicted scores.
        labels: ``(batch, max_len)`` tensor of relevance labels.
        lengths: Per-group number of valid (unpadded) positions.
        temperature: Divisor applied to the (margin-shifted) score gaps.
        margin: Subtracted from the score gaps when positive.
        focus_topk: Number of top-labelled items that define the focus set.

    Returns:
        The normalised loss, or a zero tensor with ``requires_grad=True``
        when the batch contains no valid pair.
    """
    loss_sum = 0.0
    weight_sum = 0

    for row in range(scores.size(0)):
        n_items = lengths[row]
        if n_items < 2:
            # A single item cannot form a pair.
            continue

        group_scores = scores[row][:n_items]
        group_labels = labels[row][:n_items]

        # Mark the items with the highest ground-truth labels.
        n_focus = min(focus_topk, n_items)
        _, focus_idx = torch.topk(group_labels, n_focus)
        in_focus = torch.zeros_like(group_labels, dtype=torch.bool)
        in_focus[focus_idx] = True

        # Entry [i, j] compares item i against item j.
        score_gap = group_scores.unsqueeze(1) - group_scores.unsqueeze(0)
        label_gap = group_labels.unsqueeze(1) - group_labels.unsqueeze(0)

        touches_focus = in_focus.unsqueeze(1) | in_focus.unsqueeze(0)
        pair_ok = (label_gap > 0) & touches_focus

        if pair_ok.sum() == 0:
            continue

        pair_weight = (label_gap.abs() * pair_ok).float()

        if margin > 0.0:
            score_gap = score_gap - margin

        logits = score_gap / temperature
        per_pair = -F.logsigmoid(logits) * pair_weight

        loss_sum += per_pair.sum()
        weight_sum += pair_weight.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    For every batch the inputs are moved to ``device``, the model is scored,
    ``xranknet_loss`` (temperature 1.0, margin 0.0, focus on top 3) is
    back-propagated, and both the optimizer and the scheduler are stepped.

    Args:
        model: Module called as ``model(x_cat, x_num)``.
        loader: Iterable yielding ``(x_cat, x_num, y, lengths)`` batches.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.

    Returns:
        Sum of per-batch loss values divided by ``len(loader)``.
    """
    model.train()
    running_loss = 0.0

    progress = tqdm(loader, desc="Training", leave=False)
    for cat_inputs, num_inputs, targets, group_lengths in progress:
        num_inputs = num_inputs.to(device)
        targets = targets.to(device)

        # Lengths may arrive as a plain sequence; the loss indexes a tensor.
        if not isinstance(group_lengths, torch.Tensor):
            group_lengths = torch.tensor(group_lengths)
        group_lengths = group_lengths.detach().clone().to(device)

        cat_inputs = {name: tensor.to(device) for name, tensor in cat_inputs.items()}

        optimizer.zero_grad()
        predictions = model(cat_inputs, num_inputs)
        batch_loss = xranknet_loss(
            predictions,
            targets,
            group_lengths,
            temperature=1.0,
            margin=0.0,
            focus_topk=3,
        )
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    return running_loss / len(loader)


def validate(model, loader, device):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Each batch contributes its hit-rate@3 and its ``xranknet_loss``
    (temperature 1.0, margin 0.0, focus on top 3), weighted by the number of
    groups in the batch (``scores.size(0)``), so the returned values are
    per-group averages.

    Args:
        model: Module called as ``model(x_cat, x_num)``.
        loader: Iterable yielding ``(x_cat, x_num, y, lengths)`` batches.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, avg_hitrate)``.

    Raises:
        ZeroDivisionError: If ``loader`` yields no batches.
    """
    model.eval()
    total_hitrate = 0
    total_loss = 0
    count = 0

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):

            x_num, y = x_num.to(device), y.to(device)
            # Same guard as train_one_epoch: only wrap plain sequences.
            # Re-wrapping an existing tensor with torch.tensor() makes a
            # needless copy and emits a copy-construct warning on every batch.
            if not isinstance(lengths, torch.Tensor):
                lengths = torch.tensor(lengths)
            lengths = lengths.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.0, focus_topk=3)

            # Weight by group count so a short final batch is not over-represented.
            total_hitrate += hitrate * scores.size(0)
            total_loss += loss.item() * scores.size(0)
            count += scores.size(0)

    avg_hitrate = total_hitrate / count
    avg_loss = total_loss / count
    return avg_loss, avg_hitrate


def build_cat_dims(X, cat_features):
    """Return the embedding-table size needed for each categorical column.

    The size is ``max(code) + 1`` so that every non-negative code is a valid
    row index. Columns whose maximum is negative, or that have no maximum at
    all (``max()`` returned ``None``, e.g. an empty or all-null column), get
    size 1.

    Args:
        X: Mapping-like table where ``X[c].max()`` gives the largest code of
            column ``c``.
        cat_features: Names of the categorical columns.

    Returns:
        Dict mapping column name to a plain ``int`` table size.
    """
    cat_dims = {}
    for c in cat_features:
        max_val = X[c].max()
        # None means "no values"; comparing None < 0 would raise TypeError.
        if max_val is None or max_val < 0:
            max_val = 0
        # int() strips NumPy scalar types so the sizes are plain Python ints.
        cat_dims[c] = int(max_val) + 1
    return cat_dims


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardise numeric columns with statistics taken from training rows.

    For every column the mean and standard deviation are computed over the
    rows selected by ``train_mask`` (ignoring NaNs) and then applied to all
    rows. NaN entries stay NaN in the output so the caller can impute them.

    Args:
        X: Frame holding the columns; a new frame is returned for each
            replaced column via ``with_columns``.
        numeric_features: Names of the columns to standardise.
        train_mask: Boolean NumPy mask selecting the training rows.

    Returns:
        The frame with every listed column replaced by its standardised values.
    """
    for col in numeric_features:
        values = X[col].to_numpy()
        train_vals = values[train_mask]
        # A single NaN would make mean/std NaN and turn the whole column into
        # NaN, so the statistics are computed on the non-NaN training values.
        train_vals = train_vals[~np.isnan(train_vals)]
        if train_vals.size > 0:
            mean = train_vals.mean()
            std = train_vals.std()
        else:
            # No usable training value: leave the column unscaled.
            mean, std = 0.0, 1.0
        # Constant columns (std == 0) are only centred, never divided by zero.
        if not std > 0:
            std = 1.0
        norm_vals = (values - mean) / std
        X = X.with_columns(pl.Series(col, norm_vals).alias(col))
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Return the feature-engineered frame plus the test ``Id``/``ranker_id`` columns.

    If ``cache_file`` exists inside ``data_dir`` it is loaded directly;
    otherwise train and test are read, concatenated, passed through
    ``feature_engineering``, null-filled, and written to the cache.

    Args:
        data_dir: Directory holding the parquet files and the cache.
        train_file: File name of the training parquet.
        test_file: File name of the test parquet.
        cache_file: File name of the feature-engineered cache.

    Returns:
        Tuple ``(df, test_ids, test_rankers)``.
    """
    cache_path = os.path.join(data_dir, cache_file)
    test_path = os.path.join(data_dir, test_file)

    if not os.path.exists(cache_path):
        print("Cached file not found, processing raw data ...")

        train = pl.read_parquet(os.path.join(data_dir, train_file)).drop("__index_level_0__")
        raw_test = pl.read_parquet(test_path).drop("__index_level_0__")
        # Test rows get a placeholder label so both frames share one schema.
        test = raw_test.with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))

        test_ids = test["Id"]
        test_rankers = test["ranker_id"]

        df = feature_engineering(pl.concat((train, test)), full=True)

        numeric_columns = df.select(pl.selectors.numeric()).columns
        string_columns = df.select(pl.selectors.string()).columns
        numeric_fills = [pl.col(name).fill_null(0) for name in numeric_columns]
        string_fills = [pl.col(name).fill_null("missing") for name in string_columns]
        df = df.with_columns(numeric_fills + string_fills)

        os.makedirs(data_dir, exist_ok=True)
        df.write_parquet(cache_path)
        print(f"Saved feature-engineered data to {cache_path}")
        return df, test_ids, test_rankers

    print(f"Loading cached feature-engineered data from {cache_path} ...")
    df = pl.read_parquet(cache_path)

    # The cache holds features only; identifiers are re-read from the raw test file.
    test = pl.read_parquet(test_path).drop("__index_level_0__")
    return df, test["Id"], test["ranker_id"]


def prepare_data(data_dir, train_size=18145372):
    """Load the data, integer-encode categoricals and standardise numerics.

    Categorical columns are replaced by their dense rank, which starts at 1;
    nulls become 0. Code 0 is therefore reserved for "missing"/padding and is
    never shared with a real category. With the previous 0-based encoding the
    smallest category of every feature landed on index 0, the same index
    ``RankDataset`` uses for negative codes and padding and that ``DLRanker``
    declares as ``padding_idx`` (a row that receives no gradient updates).

    Args:
        data_dir: Directory passed to ``load_or_prepare_data``.
        train_size: Number of leading rows that belong to the training set;
            only these rows contribute to the normalisation statistics.
            Defaults to the row count previously hard-coded here.

    Returns:
        Tuple ``(X, y, groups, cat_features, num_features, train_size,
        test_ids, test_rankers)``.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    # Dense rank is 1-based, so real categories map to 1..K and 0 stays free
    # for nulls and padding.
    X = X.with_columns(
        [
            pl.col(c).rank("dense").fill_null(0).cast(pl.Int32)
            for c in cat_features
        ]
    )

    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Signal a stop once a maximised metric stalls for ``patience`` steps.

    A step counts as an improvement only when the metric exceeds the best
    value seen so far by more than ``min_delta``.
    """

    def __init__(self, patience=3, min_delta=0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric`` and return True when training should stop."""
        has_improved = metric > self.best + self.min_delta
        if has_improved:
            # New best: remember it and restart the patience window.
            self.best, self.counter = metric, 0
        else:
            self.counter += 1
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scale histogram of how many items each group contains.

    Args:
        dataset: Object exposing ``group_to_indices``, a mapping whose values
            are the per-group index collections.
        save_path: Where to save the figure; when falsy the figure is shown
            instead.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Distribution of Group Sizes")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def main():
    """Train ``DLRanker`` on the full training set and write a submission.

    Steps: prepare the data, replace NaNs in Float64 columns with 0, split
    the frame at ``train_size`` into train and test parts, train for a fixed
    number of epochs with AdamW and a linear warmup/decay schedule (saving a
    checkpoint after every epoch), score the test groups, and write per-group
    ranks to ``submission/submission_dl_ranker.parquet``.

    Raises:
        RuntimeError: If the scored groups do not line up with the test rows.
    """
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_losses = []
    learning_rates = []

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Replace NaNs in all Float64 columns in a single pass.
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    if float_cols:
        X = X.with_columns(
            [
                pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
                for col in float_cols
            ]
        )

    n2 = train_size

    X_tr, X_te = X[:n2], X[n2:]
    y_tr, y_te = y[:n2], y[n2:]
    groups_tr, groups_te = groups[:n2], groups[n2:]

    # Embedding sizes come from the full frame so test-only codes are covered.
    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    num_epochs = 10
    num_training_steps = num_epochs * len(train_loader)
    num_warmup_steps = int(0.1 * num_training_steps)

    learning_rate = 1e-3

    model = DLRanker(cat_dims, num_numeric_feats).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.05)

    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)

        train_losses.append(train_loss)

        current_lr = scheduler.get_last_lr()
        learning_rates.append(current_lr[0])

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )

        # There is no validation split in this script, so the checkpoint is
        # written after every epoch; the message no longer claims an
        # improvement that was never measured.
        torch.save(model.state_dict(), model_path)
        print("💾 Checkpoint saved.")

    # Predict test set
    model.load_state_dict(torch.load(model_path))
    model.eval()

    # The unshuffled loader serves groups in ``group_keys`` order, while the
    # submission is built in test-row order. Writing each group's scores back
    # to its own row positions keeps both aligned even if a ranker_id's rows
    # are not contiguous in the test data.
    group_rows = [test_dataset.group_to_indices[g] for g in test_dataset.group_keys]
    row_scores = np.empty(X_te.shape[0], dtype=np.float32)
    group_pos = 0
    n_scored = 0

    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(test_loader, desc="Predicting test set"):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num).cpu().numpy()

            for i in range(scores.shape[0]):
                l = int(lengths[i])
                rows = group_rows[group_pos]
                if len(rows) != l:
                    raise RuntimeError(
                        f"Group {group_pos} has {len(rows)} rows but {l} scores"
                    )
                row_scores[rows] = scores[i, :l]
                group_pos += 1
                n_scored += l

    if n_scored != X_te.shape[0]:
        raise RuntimeError(
            f"Mismatch: {n_scored} scores vs {X_te.shape[0]} rows in X_te"
        )

    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": row_scores
    })

    # Rank 1 goes to the highest score within each ranker_id.
    submission = (
        submission_df
        .with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )
        .select(["Id", "ranker_id", "selected"])
    )


    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.parquet")
    submission.write_parquet(submission_path)
    print(f"✅ Submission saved to {submission_path}")


if __name__ == "__main__":
    main()


Writing deeprec_full_training.py


In [None]:
--- Epoch 1/10 ---
Epoch 1: Train Loss=0.3564, LR=0.001000 (569.4s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 2/10 ---
Epoch 2: Train Loss=0.2492, LR=0.000889 (573.5s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 3/10 ---
Epoch 3: Train Loss=0.2312, LR=0.000778 (573.1s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 4/10 ---
Epoch 4: Train Loss=0.2243, LR=0.000667 (574.3s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 5/10 ---
Epoch 5: Train Loss=0.2171, LR=0.000556 (571.1s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 6/10 ---
Epoch 6: Train Loss=0.2105, LR=0.000444 (574.2s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 7/10 ---
Epoch 7: Train Loss=0.2025, LR=0.000333 (568.6s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 8/10 ---
Epoch 8: Train Loss=0.1936, LR=0.000222 (571.2s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 9/10 ---
Epoch 9: Train Loss=0.1815, LR=0.000111 (569.9s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 10/10 ---
Epoch 10: Train Loss=0.1667, LR=0.000000 (566.6s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

In [None]:
--- Epoch 1/10 ---
Epoch 1: Train Loss=0.3411, LR=0.001000 (1224.7s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 2/10 ---
Epoch 2: Train Loss=0.2722, LR=0.000889 (1229.7s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 3/10 ---
Epoch 3: Train Loss=0.2459, LR=0.000778 (1225.9s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 4/10 ---
Epoch 4: Train Loss=0.2291, LR=0.000667 (1225.6s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 5/10 ---
Epoch 5: Train Loss=0.2150, LR=0.000556 (1229.9s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 6/10 ---
Epoch 6: Train Loss=0.2010, LR=0.000444 (1222.5s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 7/10 ---
Epoch 7: Train Loss=0.1888, LR=0.000333 (1226.2s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 8/10 ---
Epoch 8: Train Loss=0.1746, LR=0.000222 (1231.7s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 9/10 ---
Epoch 9: Train Loss=0.1594, LR=0.000111 (1226.1s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 10/10 ---
Epoch 10: Train Loss=0.1468, LR=0.000000 (1229.2s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter

class RankDataset(Dataset):
    """Dataset that serves one ranking group (one ``ranker_id``) per item.

    Rows are bucketed by ``ranker_id`` in order of first appearance;
    ``group_to_indices`` maps each id to the positions of its rows and
    ``group_keys`` lists the ids in serving order.
    """

    def __init__(
        self,
        X: "pl.DataFrame",
        y: "pl.Series",
        groups: "pl.DataFrame",
        cat_features: list[str],
        num_features: list[str],
    ):
        """Convert the inputs to NumPy arrays and index rows by group.

        Args:
            X: Frame containing the categorical and numeric feature columns.
            y: Labels, one per row of ``X``.
            groups: Frame with a ``ranker_id`` column, one entry per row.
            cat_features: Names of the integer-coded categorical columns.
            num_features: Names of the numeric columns.
        """
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of groups."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        ``x_cat`` maps each categorical feature to an int64 tensor in which
        negative codes have been replaced by 0; ``x_num`` and ``y`` are
        float32 tensors; ``length`` is the number of rows in the group.
        """
        g = self.group_keys[idx]
        inds = self.group_to_indices[g]
        length = len(inds)

        # One vectorised gather + clamp per feature instead of a Python-level
        # loop over every row of the group.
        x_cat = {
            c: torch.as_tensor(np.maximum(self.X_cat[c][inds], 0), dtype=torch.long)
            for c in self.cat_features
        }

        x_num = torch.as_tensor(self.X_num[inds], dtype=torch.float32)
        y = torch.as_tensor(self.y[inds], dtype=torch.float32).reshape(-1)

        return x_cat, x_num, y, length

def collate_fn(batch):
    """Pad a list of variable-length groups to a common length and stack them.

    Args:
        batch: List of ``(x_cat, x_num, y, length)`` samples, where ``x_cat``
            is a dict of 1-D long tensors, ``x_num`` a 2-D tensor and ``y`` a
            1-D tensor.

    Returns:
        ``(x_cat, x_num, y, lengths)``: every tensor gains a leading batch
        dimension, shorter groups are right-padded with zeros, and
        ``lengths`` is a long tensor holding the unpadded group sizes.
    """
    lengths = [sample[3] for sample in batch]
    longest = max(lengths)
    cat_keys = list(batch[0][0].keys())
    feature_dim = batch[0][1].shape[1]

    cat_rows = {key: [] for key in cat_keys}
    num_rows = []
    label_rows = []

    for cats, nums, labels, n_items in batch:
        shortfall = longest - n_items
        if shortfall > 0:
            # Right-pad every field of this group with zeros.
            cats = {
                key: torch.cat([cats[key], torch.zeros(shortfall, dtype=torch.long)])
                for key in cat_keys
            }
            nums = torch.cat([nums, torch.zeros(shortfall, feature_dim)])
            labels = torch.cat([labels, torch.zeros(shortfall)])

        for key in cat_keys:
            cat_rows[key].append(cats[key])
        num_rows.append(nums)
        label_rows.append(labels)

    return (
        {key: torch.stack(rows) for key, rows in cat_rows.items()},
        torch.stack(num_rows),
        torch.stack(label_rows),
        torch.tensor(lengths, dtype=torch.long),
    )

class DLRanker(nn.Module):
    """Wide-and-deep style ranker over categorical and numeric features.

    The score of an item is the sum of a linear part (one scalar embedding
    per categorical feature plus a linear layer over the numerics) and a deep
    part (an MLP over the concatenated feature embeddings and numerics).
    The factorisation-machine interaction term is disabled and contributes
    nothing to the score.
    """

    def __init__(self, cat_dims, num_numeric_feats, emb_dim=64, hidden=(512, 256, 128, 64)):
        """Build the embedding tables, linear part and MLP.

        Args:
            cat_dims: Mapping from categorical feature name to table size.
            num_numeric_feats: Number of numeric input features.
            emb_dim: Width of each categorical embedding.
            hidden: Widths of the MLP layers, in order. An immutable default
                is used so the default cannot be shared and mutated.
        """
        super().__init__()

        self.embeddings = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], emb_dim, padding_idx=0) for f in cat_dims
        })
        self.linear_cat = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], 1, padding_idx=0) for f in cat_dims
        })

        # Xavier initialisation is applied to the whole table, including row 0.
        for emb in list(self.embeddings.values()) + list(self.linear_cat.values()):
            nn.init.xavier_uniform_(emb.weight)

        self.linear_num = nn.Linear(num_numeric_feats, 1)
        nn.init.xavier_uniform_(self.linear_num.weight)

        input_dim = emb_dim * len(cat_dims) + num_numeric_feats
        layers = []
        for h in hidden:
            layers.append(nn.Linear(input_dim, h))
            layers.append(nn.BatchNorm1d(h))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.1))
            input_dim = h
        self.mlp = nn.Sequential(*layers)
        self.output = nn.Linear(input_dim, 1)

    def forward(self, x_cat, x_num):
        """Score every item of every group.

        Args:
            x_cat: Dict mapping feature name to a ``(batch, group)`` long tensor.
            x_num: Float tensor of shape ``(batch, group, num_numeric_feats)``.

        Returns:
            Float tensor of shape ``(batch, group)`` with one score per item.
        """
        batch_size, group_size = x_num.shape[:2]

        # Linear ("wide") part: per-feature scalar weights plus numeric projection.
        linear_cat_sum = torch.stack([
            self.linear_cat[f](x_cat[f]).squeeze(-1)
            for f in self.linear_cat
        ], dim=0).sum(dim=0)

        linear_num_out = self.linear_num(x_num).squeeze(-1)
        linear_out = linear_cat_sum + linear_num_out

        # (batch, group, n_features, emb_dim)
        embs = torch.stack([
            self.embeddings[f](x_cat[f]) for f in self.embeddings
        ], dim=2)

        # Deep part: items are flattened to rows because BatchNorm1d expects
        # a 2-D (rows, features) input.
        embs_cat = embs.reshape(batch_size * group_size, -1)
        x_num_flat = x_num.reshape(batch_size * group_size, -1)
        deep_input = torch.cat([embs_cat, x_num_flat], dim=1)
        deep_out = self.mlp(deep_input)
        deep_out = self.output(deep_out).view(batch_size, group_size)

        # The FM pairwise-interaction term was switched off (it added 0.0), so
        # its intermediate tensors are no longer computed.
        return linear_out + deep_out


"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Fraction of eligible groups with a positive label among their top-k scores.

    Only the first ``lengths[i]`` entries of row ``i`` are looked at, so
    padded positions never influence the result.  Groups shorter than
    ``min_group_size`` are ignored.

    Returns:
        0-dim tensor; ``0.0`` when no group is eligible.
    """
    hit_total = 0
    eligible = 0

    for row in range(scores.size(0)):
        n_items = lengths[row]
        if n_items < min_group_size:
            continue
        eligible += 1

        group_scores = scores[row][:n_items]
        group_labels = labels[row][:n_items]
        top_positions = torch.topk(group_scores, min(n_items, k)).indices
        hit_total += (group_labels[top_positions] > 0).any().float()

    if eligible > 0:
        return hit_total / eligible
    return torch.tensor(0.0, device=scores.device)


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@k over groups with at least ``min_group_size`` items.

    Gains are the raw label values; the discount for rank ``r`` (1-based)
    is ``log2(r + 1)``.  A group whose ideal DCG is 0 contributes 0.

    Args:
        scores: ``[batch, max_len]`` predicted scores.
        labels: ``[batch, max_len]`` relevance labels.
        lengths: true (unpadded) length of each row.
        k: ranking cut-off.
        min_group_size: rows shorter than this are skipped.

    Returns:
        0-dim tensor (always a tensor, so callers can safely use ``.item()``);
        ``0.0`` when no group is eligible.
    """
    # Accumulate in a tensor: adding only Python floats (every group with
    # IDCG == 0) used to make the function return a float instead.
    total_ndcg = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        l = int(lengths[i])
        if l < min_group_size:
            continue

        s = scores[i][:l]
        y = labels[i][:l]
        top = min(k, l)

        idx_pred = torch.topk(s, top).indices
        ideal_gains = torch.topk(y, top).values
        # Same discount vector serves both DCG and IDCG.
        discounts = torch.log2(torch.arange(2, 2 + top, device=y.device).float())

        dcg = (y[idx_pred] / discounts).sum()
        idcg = (ideal_gains / discounts).sum()

        if idcg > 0:
            total_ndcg = total_ndcg + dcg / idcg
        valid_count += 1

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ndcg / valid_count


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision at k over groups with >= ``min_group_size`` items.

    For each eligible group, AP is the mean of precision@r taken at every
    rank ``r <= k`` that holds a positive label (``label > 0``).  A group
    with no positive in its top-k has AP = 0 and still counts in the mean;
    previously such groups were dropped from the denominator, which
    overstated the metric.

    Returns:
        0-dim tensor; ``0.0`` when no group is eligible.
    """
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        l = int(lengths[i])
        if l < min_group_size:
            continue
        valid_count += 1

        s = scores[i][:l]
        y = labels[i][:l]
        top = min(k, l)

        idx_pred = torch.topk(s, top).indices
        hits = (y[idx_pred] > 0).float()
        n_hits = hits.sum()
        if n_hits == 0:
            continue  # AP is 0 for this group; it is already counted above

        ranks = torch.arange(1, top + 1, device=hits.device)
        precision_at_rank = hits.cumsum(dim=0) / ranks
        # Average precision only over the ranks that are hits.
        total_ap = total_ap + (precision_at_rank * hits).sum() / n_hits

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ap / valid_count



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise RankNet loss, summed per group and averaged over the batch.

    For every ordered pair ``(a, b)`` inside a group with
    ``label[a] > label[b]`` the penalty is ``-logsigmoid(score[a] - score[b])``.
    Only the first ``lengths[i]`` entries of row ``i`` take part.
    """
    logsigmoid = torch.nn.functional.logsigmoid
    n_groups = scores.size(0)
    loss_sum = 0.0

    for g in range(n_groups):
        n_items = lengths[g]
        group_scores = scores[g][:n_items]
        group_labels = labels[g][:n_items]

        # prefers[a, b] == 1 where item a should rank above item b.
        prefers = (group_labels.unsqueeze(1) > group_labels.unsqueeze(0)).float()
        margins = group_scores.unsqueeze(1) - group_scores.unsqueeze(0)

        loss_sum += (-prefers * logsigmoid(margins)).sum()

    return loss_sum / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """
    Weighted pairwise logistic loss focused on the top-K items by label.

    A pair ``(a, b)`` is used when ``label[a] > label[b]`` and at least one
    of the two items is among the ``focus_topk`` highest labels of its
    group.  Each pair is weighted by ``|label[a] - label[b]|``; the result
    is the weighted sum divided by the total weight across the batch.
    Groups with fewer than two items are skipped.
    """
    loss_sum = 0.0
    weight_sum = 0

    for g in range(scores.size(0)):
        n_items = lengths[g]
        if n_items < 2:
            continue

        group_scores = scores[g][:n_items]
        group_labels = labels[g][:n_items]

        # Mark the items holding the focus_topk largest ground-truth labels.
        in_topk = torch.zeros_like(group_labels, dtype=torch.bool)
        in_topk[torch.topk(group_labels, min(focus_topk, n_items)).indices] = True

        label_gap = group_labels.unsqueeze(1) - group_labels.unsqueeze(0)
        touches_topk = in_topk.unsqueeze(1) | in_topk.unsqueeze(0)
        selected = (label_gap > 0) & touches_topk
        if selected.sum() == 0:
            continue

        pair_weights = (label_gap.abs() * selected).float()

        logits = group_scores.unsqueeze(1) - group_scores.unsqueeze(0)
        if margin > 0.0:
            logits = logits - margin
        logits = logits / temperature

        loss_sum += (-F.logsigmoid(logits) * pair_weights).sum()
        weight_sum += pair_weights.sum()

    if weight_sum == 0:
        # No usable pair in the whole batch: return a zero that still
        # supports backward().
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    The scheduler is stepped once per batch, right after the optimizer.
    """
    model.train()
    running_loss = 0.0

    progress = tqdm(loader, desc="Training", leave=False)
    for cat_batch, num_batch, targets, lengths in progress:
        num_batch = num_batch.to(device)
        targets = targets.to(device)
        # The collate function may hand back a plain list of lengths.
        if not isinstance(lengths, torch.Tensor):
            lengths = torch.tensor(lengths)
        lengths = lengths.detach().clone().to(device)
        cat_batch = {name: tensor.to(device) for name, tensor in cat_batch.items()}

        optimizer.zero_grad()
        batch_scores = model(cat_batch, num_batch)
        batch_loss = xranknet_loss(
            batch_scores, targets, lengths,
            temperature=1.0, margin=0.0, focus_topk=3,
        )
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    return running_loss / len(loader)


def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader``.

    Returns:
        ``(mean_loss, hitrate@3, ndcg@3, map@3)``.  The loss is a float
        averaged over all groups; the three ranking metrics are 0-dim
        tensors averaged over the groups that have at least
        ``min_group_size`` items.

    The metric helpers return a per-batch mean over *eligible* groups, so
    each batch is weighted by its number of eligible groups.  Weighting by
    the full batch size (as before) let ineligible groups, and batches with
    no eligible group at all, pull the reported metrics towards zero.
    """
    model.eval()
    hit_sum = 0.0
    ndcg_sum = 0.0
    map_sum = 0.0
    loss_sum = 0.0
    n_groups = 0
    n_eligible = 0

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):
            x_num, y = x_num.to(device), y.to(device)
            # as_tensor accepts both a list and an existing tensor without
            # the copy-construct warning torch.tensor(tensor) emits.
            lengths = torch.as_tensor(lengths).to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            ndcg = ndcg_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            map3 = map_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.0, focus_topk=3)

            batch_size = scores.size(0)
            eligible = int((lengths >= min_group_size).sum())

            hit_sum += float(hitrate) * eligible
            ndcg_sum += float(ndcg) * eligible
            map_sum += float(map3) * eligible
            loss_sum += loss.item() * batch_size

            n_groups += batch_size
            n_eligible += eligible

    # Guard the empty cases instead of dividing by zero.
    mean_loss = loss_sum / n_groups if n_groups > 0 else 0.0
    denom = max(n_eligible, 1)

    # Metrics stay tensors because callers use ``.item()`` on them.
    return (
        mean_loss,
        torch.tensor(hit_sum / denom),
        torch.tensor(ndcg_sum / denom),
        torch.tensor(map_sum / denom),
    )


def build_cat_dims(X, cat_features):
    """Return ``{feature: embedding table size}`` for every categorical feature.

    The size is the column's largest code plus one; a negative maximum is
    treated as 0, which gives a table of size 1.
    """
    return {c: max(X[c].max(), 0) + 1 for c in cat_features}


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardise each numeric column with statistics of the training rows.

    Mean and standard deviation are computed on ``values[train_mask]`` only
    and then applied to the whole column.  A standard deviation that is not
    strictly positive is replaced by 1.0 so the column is only centred.
    """
    for name in numeric_features:
        column = X[name].to_numpy()
        fit_rows = column[train_mask]

        center = fit_rows.mean()
        spread = fit_rows.std()
        if not spread > 0:
            spread = 1.0

        scaled = (column - center) / spread
        X = X.with_columns(pl.Series(name, scaled).alias(name))
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Return ``(df, test_ids, test_rankers)``, using a parquet cache if present.

    When ``cache_file`` exists in ``data_dir`` it is loaded as ``df``.
    Otherwise train and test are concatenated (test gets a constant
    ``selected = 0`` column), run through ``feature_engineering``, nulls
    are filled (0 for numeric columns, "missing" for string columns) and
    the result is written to the cache.  ``test_ids`` / ``test_rankers``
    are always read from the raw test file.
    """
    cache_path = os.path.join(data_dir, cache_file)
    test_path = os.path.join(data_dir, test_file)

    if not os.path.exists(cache_path):
        print("Cached file not found, processing raw data ...")

        train = pl.read_parquet(os.path.join(data_dir, train_file)).drop("__index_level_0__")
        test = pl.read_parquet(test_path).drop("__index_level_0__")
        test = test.with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))

        test_ids = test["Id"]
        test_rankers = test["ranker_id"]

        df = feature_engineering(pl.concat((train, test)), full=True)

        numeric_cols = df.select(pl.selectors.numeric()).columns
        string_cols = df.select(pl.selectors.string()).columns
        fill_exprs = [pl.col(c).fill_null(0) for c in numeric_cols]
        fill_exprs += [pl.col(c).fill_null("missing") for c in string_cols]
        df = df.with_columns(fill_exprs)

        os.makedirs(data_dir, exist_ok=True)
        df.write_parquet(cache_path)
        print(f"Saved feature-engineered data to {cache_path}")

        return df, test_ids, test_rankers

    print(f"Loading cached feature-engineered data from {cache_path} ...")
    df = pl.read_parquet(cache_path)

    # Only the identifiers are needed from the raw test file here.
    test = pl.read_parquet(test_path).drop("__index_level_0__")
    return df, test["Id"], test["ranker_id"]


def prepare_data(data_dir, train_size=18145372):
    """Load the data, integer-encode categoricals and normalise numerics.

    Args:
        data_dir: directory passed on to ``load_or_prepare_data``.
        train_size: number of leading rows that belong to the training
            data; numeric normalisation statistics are fitted on these
            rows only.  Defaults to the previously hard-coded value.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size,
        test_ids, test_rankers)``.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    # Dense rank starts at 1, so real categories get codes 1..K and code 0
    # is left for nulls.  The models use index 0 as the embedding padding
    # index (and the dataset clamps negatives to 0); with the previous
    # 0-based codes the lowest category of every feature shared that
    # never-trained padding row.
    X = X.with_columns(
        [
            pl.col(c).rank("dense").fill_null(0).cast(pl.Int32)
            for c in cat_features
        ]
    )

    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Signal a stop after ``patience`` consecutive steps without improvement.

    The tracked metric is maximised: a step counts as an improvement only
    when it exceeds the best value seen so far by more than ``min_delta``.
    """

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience, self.min_delta = patience, min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric``; return True once patience is exhausted."""
        improved = metric > self.best + self.min_delta
        if not improved:
            self.counter += 1
        else:
            self.best = metric
            self.counter = 0
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Histogram (log-scaled counts) of the group sizes in ``dataset``.

    Group sizes are read from ``dataset.group_to_indices``.  The figure is
    written to ``save_path`` when given, otherwise shown interactively,
    and is closed afterwards in both cases.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Distribution of Group Sizes")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def _write_group_info(rankers, path):
    """Write one line per group (id, size, first row, last row) to ``path``."""
    group_to_rows = defaultdict(list)
    for i, g in enumerate(rankers):
        group_to_rows[g].append(i)

    with open(path, "w", encoding="utf-8") as f:
        f.write(f"Total groups: {len(group_to_rows)}\n\n")
        f.write("GroupID\tSize\tStartIdx\tEndIdx\n")
        for group_id, indices in sorted(group_to_rows.items()):
            f.write(f"{group_id}\t{len(indices)}\t{indices[0]}\t{indices[-1]}\n")


def _plot_curve(epochs, curves, ylabel, title, path):
    """Save a per-epoch line chart; ``curves`` is a list of (values, label, plot kwargs)."""
    plt.figure()
    for values, label, kwargs in curves:
        plt.plot(epochs, values, label=label, **kwargs)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.savefig(path)
    plt.close()


def _predict_scores(model, loader, device, desc):
    """Score every unpadded item of ``loader``; results are in loader (group) order."""
    chunks = []
    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(loader, desc=desc):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)

            for i in range(scores.size(0)):
                l = lengths[i]
                chunks.append(scores[i, :l].cpu().numpy())
    return np.concatenate(chunks)


def _to_row_order(group_ordered_scores, dataset):
    """Scatter group-ordered scores back to the row order of the dataset's frame.

    NOTE(review): assumes an unshuffled loader yields groups in the
    insertion order of ``dataset.group_to_indices`` -- confirm against
    RankDataset.  When every group occupies contiguous rows this is a no-op.
    """
    row_ids = np.concatenate(
        [np.asarray(rows, dtype=np.int64) for rows in dataset.group_to_indices.values()]
    )
    aligned = np.empty_like(group_ordered_scores)
    aligned[row_ids] = group_ordered_scores
    return aligned


def main():
    """Train the ranker, plot the metrics, and write validation/test predictions."""
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    best_val_loss = float("inf")
    best_hitrate = 0.0
    # Tracks whether THIS run wrote a checkpoint, so a stale file from an
    # earlier run is never loaded by accident.
    checkpoint_saved = False
    model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_losses = []
    val_losses = []
    val_hitrates = []
    val_ndcgs = []
    val_maps = []
    learning_rates = []

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    early_stopper = EarlyStopping(patience=3, min_delta=0.001)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Replace NaN with 0 in every Float64 column
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    for col in float_cols:
        X = X.with_columns(
            pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
        )

    # Row-based split: [0, n1) train, [n1, n2) validation, [n2, end) test.
    n1 = 16487352
    # n1 = 17487300
    n2 = train_size

    X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
    y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
    groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

    print("\n📊 Distribuția is_one_way în setul de antrenare:")
    print(X_tr["is_one_way"].value_counts().sort("is_one_way"))

    print("\n📊 Distribuția is_one_way în setul de validare:")
    print(X_va["is_one_way"].value_counts().sort("is_one_way"))

    print("\n📊 Distribuția is_one_way în setul de test:")
    print(X_te["is_one_way"].value_counts().sort("is_one_way"))

    # Inspect group sizes in validation set and save the details to a file
    print("\n📋 Validation group statistics:")
    group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
    _write_group_info(groups_va["ranker_id"].to_numpy(), group_info_path)
    print(f"📁 Saved validation group info to {group_info_path}")

    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    num_epochs = 10
    num_training_steps = num_epochs * len(train_loader)
    num_warmup_steps = int(0.1 * num_training_steps)

    learning_rate = 1e-3

    model = DLRanker(cat_dims, num_numeric_feats).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
        val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

        # Work with plain floats from here on (validate returns 0-dim tensors).
        val_hitrate = float(val_hitrate)
        val_ndcg = float(val_ndcg)
        val_map = float(val_map)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_hitrates.append(val_hitrate)
        val_ndcgs.append(val_ndcg)
        val_maps.append(val_map)

        current_lr = scheduler.get_last_lr()
        learning_rates.append(current_lr[0])

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"Val Loss={val_loss:.4f}, "
            f"HitRate@3={val_hitrate:.4f}, "
            f"NDCG@3={val_ndcg:.4f}, "
            f"MAP@3={val_map:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )

        # A checkpoint is written only when BOTH loss and hit rate improve.
        if val_loss < best_val_loss and val_hitrate > best_hitrate:
            best_val_loss = val_loss
            best_hitrate = val_hitrate
            torch.save(model.state_dict(), model_path)
            checkpoint_saved = True
            print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

        # Early stopping maximises its metric, hence the negated loss.
        if early_stopper.step(-val_loss):
            print("⛔ Early stopping triggered.")
            break

    # Plot metrics
    epochs = list(range(1, len(train_losses) + 1))

    _plot_curve(
        epochs,
        [(train_losses, "Train Loss", {}), (val_losses, "Val Loss", {})],
        "Loss",
        "Training & Validation Loss",
        os.path.join(MODEL_DIR, "loss_curve.png"),
    )
    _plot_curve(
        epochs,
        [(val_hitrates, "Val HitRate@3", {"color": "green"})],
        "HitRate@3",
        "Validation HitRate@3",
        os.path.join(MODEL_DIR, "hitrate_curve.png"),
    )
    _plot_curve(
        epochs,
        [(learning_rates, "Learning Rate", {"color": "orange"})],
        "LR",
        "Learning Rate Schedule",
        os.path.join(MODEL_DIR, "lr_curve.png"),
    )
    _plot_curve(
        epochs,
        [(val_ndcgs, "Val NDCG@3", {"color": "purple"})],
        "NDCG@3",
        "Validation NDCG@3",
        os.path.join(MODEL_DIR, "ndcg_curve.png"),
    )
    _plot_curve(
        epochs,
        [(val_maps, "Val MAP@3", {"color": "red"})],
        "MAP@3",
        "Validation MAP@3",
        os.path.join(MODEL_DIR, "map_curve.png"),
    )

    print("📊 Saved metric plots to model/")

    # Restore the best weights of this run, if any were saved.
    if checkpoint_saved:
        model.load_state_dict(torch.load(model_path, map_location=device))
    else:
        print("⚠️ No checkpoint was saved in this run; predicting with the final weights.")
    model.eval()

    # Validation predictions
    val_ids = X_va["Id"]
    val_rankers = groups_va["ranker_id"]
    val_labels = y_va

    all_val_scores = _predict_scores(model, val_loader, device, "Predicting validation set")

    assert len(all_val_scores) == X_va.shape[0], \
        f"Mismatch: {len(all_val_scores)} scores vs {X_va.shape[0]} rows"

    # Equal length does not guarantee equal order: realign scores to rows.
    all_val_scores = _to_row_order(all_val_scores, val_dataset)

    val_df = X_va.with_columns([
        pl.Series("Id", val_ids),
        pl.Series("ranker_id", val_rankers),
        pl.Series("score", all_val_scores),
        pl.Series("label", val_labels)
    ])

    val_df = val_df.with_columns(
        pl.col("score")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.parquet")
    val_df.write_parquet(val_save_path)
    print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

    # Test predictions
    all_scores = _predict_scores(model, test_loader, device, "Predicting test set")

    assert len(all_scores) == X_te.shape[0], \
        f"Mismatch: {len(all_scores)} scores vs {X_te.shape[0]} rows in X_te"

    all_scores = _to_row_order(all_scores, test_dataset)

    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": all_scores
    })

    submission = (
        submission_df
        .with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )
        .select(["Id", "ranker_id", "score", "selected"])
    )

    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.parquet")
    submission.write_parquet(submission_path)
    print(f"✅ Submission saved to {submission_path}")


if __name__ == "__main__":
    main()

Overwriting deeprec_validate.py


# **FiBiNET**

In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import torch.nn as nn
import torch.nn.functional as F_torch
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter

class RankDataset(Dataset):
    """Dataset whose items are whole ranking groups (all rows of one ranker_id).

    Rows are grouped by ``groups["ranker_id"]``; item ``idx`` is the
    ``idx``-th distinct group in order of first appearance.
    """

    def __init__(
        self,
        X: pl.DataFrame,
        y: pl.Series,
        groups: pl.DataFrame,
        cat_features: list[str],
        num_features: list[str],
    ):
        """Materialise features as numpy arrays and index rows by group.

        Args:
            X: feature frame containing the columns named in
                ``cat_features`` and ``num_features``.
            y: one label per row of ``X``.
            groups: frame with a ``ranker_id`` column, one entry per row.
            cat_features: names of the integer-encoded categorical columns.
            num_features: names of the numeric columns.
        """
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # group id -> row positions, in order of first appearance
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Number of groups."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for group number ``idx``.

        ``x_cat`` maps each categorical feature to a long tensor in which
        negative codes are clamped to 0; ``x_num`` and ``y`` are float32
        tensors; ``length`` is the number of rows in the group.
        """
        inds = self.group_to_indices[self.group_keys[idx]]

        # Vectorised clamp: this runs for every group of every batch, so a
        # per-element Python loop here is a real cost.
        x_cat = {
            c: torch.as_tensor(np.maximum(self.X_cat[c][inds], 0), dtype=torch.long)
            for c in self.cat_features
        }

        x_num = torch.as_tensor(self.X_num[inds], dtype=torch.float32)
        y = torch.as_tensor(self.y[inds], dtype=torch.float32).reshape(-1)

        return x_cat, x_num, y, len(inds)

def collate_fn(batch):
    """Pad a list of groups to a common length and stack them into a batch.

    Args:
        batch: list of ``(x_cat, x_num, y, length)`` tuples where ``x_cat``
            is a dict of 1-D long tensors, ``x_num`` is ``[length, n_num]``
            and ``y`` is ``[length]``.

    Returns:
        ``(padded_x_cat, padded_x_num, padded_y, lengths)``; every padded
        position is filled with 0 and ``lengths`` is a long tensor holding
        the true group sizes.
    """
    # Library padding replaces the hand-written cat-with-zeros logic.
    pad = torch.nn.utils.rnn.pad_sequence

    cat_items, num_items, label_items, lengths = zip(*batch)

    padded_x_cat = {
        key: pad([item[key] for item in cat_items], batch_first=True)
        for key in cat_items[0]
    }
    padded_x_num = pad(list(num_items), batch_first=True)
    padded_y = pad(list(label_items), batch_first=True)
    lengths = torch.tensor(lengths, dtype=torch.long)

    return padded_x_cat, padded_x_num, padded_y, lengths

class FiBiNetRanker(nn.Module):
    """Group-wise ranker with optional SE-style gating and a bilinear feature term.

    Score = linear term (per-category weights + linear map of numeric
    features) + MLP applied to [bilinear embedding summary, numeric features].
    """

    def __init__(self, cat_dims, num_numeric_feats, emb_dim=16, hidden=(256, 128), reduction=4, use_senet=True):
        """
        Args:
            cat_dims: mapping from categorical feature name to the number
                of rows of its embedding table.
            num_numeric_feats: number of numeric input features.
            emb_dim: width of every categorical embedding.
            hidden: sizes of the MLP hidden layers, in order.
            reduction: bottleneck ratio of the gating network.
            use_senet: whether embeddings are re-weighted by the gating
                network before the bilinear term.
        """
        super().__init__()
        self.cat_fields = list(cat_dims.keys())
        self.emb_dim = emb_dim
        self.num_fields = len(self.cat_fields)
        self.use_senet = use_senet

        self.embeddings = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], emb_dim, padding_idx=0) for f in self.cat_fields
        })

        self.linear_cat = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], 1, padding_idx=0) for f in self.cat_fields
        })

        self.linear_num = nn.Linear(num_numeric_feats, 1)

        for emb in list(self.embeddings.values()) + list(self.linear_cat.values()):
            nn.init.xavier_uniform_(emb.weight)
            # xavier_uniform_ also overwrites the padding row.  That row never
            # receives gradients, so it would stay a frozen random vector;
            # restore the all-zero padding vector nn.Embedding starts with.
            with torch.no_grad():
                emb.weight[emb.padding_idx].zero_()
        nn.init.xavier_uniform_(self.linear_num.weight)

        if self.use_senet:
            flat_dim = self.num_fields * emb_dim
            # Keep at least one hidden unit when flat_dim < reduction.
            se_dim = max(1, flat_dim // reduction)
            self.se_fc1 = nn.Linear(flat_dim, se_dim)
            self.se_fc2 = nn.Linear(se_dim, flat_dim)

        self.bilinear = nn.Linear(emb_dim, emb_dim, bias=False)

        input_dim = emb_dim + num_numeric_feats
        layers = []
        for h in hidden:
            layers.append(nn.Linear(input_dim, h))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.1))
            input_dim = h

        self.mlp = nn.Sequential(*layers)
        self.output = nn.Linear(input_dim, 1)

    def senet(self, embs):
        """Gate every embedding element with a sigmoid weight from a 2-layer net.

        ``embs`` is ``[rows, fields, dim]``; the result has the same shape.
        """
        rows, n_fields, dim = embs.shape
        z = embs.reshape(rows, -1)
        a = F_torch.relu(self.se_fc1(z))
        s = torch.sigmoid(self.se_fc2(a)).view(rows, n_fields, dim)
        return embs * s

    def bilinear_interaction(self, embs):
        """Mean over fields of ``emb * W(emb)`` with one shared matrix ``W``."""
        transformed = self.bilinear(embs)  # [B*G, F, D]
        interaction = (embs * transformed).mean(dim=1)  # [B*G, D]
        return interaction

    def forward(self, x_cat, x_num):
        """Return a ``[batch, group]`` score tensor.

        Args:
            x_cat: dict of index tensors, one per categorical field
                (stacked along a new field axis, so all share one shape).
            x_num: numeric features; first two dims are ``(batch, group)``.
        """
        B, G = x_num.shape[:2]
        n_fields, D = self.num_fields, self.emb_dim

        embs = torch.stack([self.embeddings[f](x_cat[f]) for f in self.cat_fields], dim=2)  # [B, G, F, D]

        linear_cat_sum = torch.stack([
            self.linear_cat[f](x_cat[f]).squeeze(-1) for f in self.cat_fields
        ], dim=0).sum(dim=0)  # [B, G]

        linear_num_out = self.linear_num(x_num).squeeze(-1)  # [B, G]
        linear_out = linear_cat_sum + linear_num_out  # [B, G]

        embs = embs.reshape(B * G, n_fields, D)

        if self.use_senet:
            embs = self.senet(embs)

        bi = self.bilinear_interaction(embs)  # [B*G, D]

        # reshape (not view) so non-contiguous numeric inputs are accepted.
        x_num_flat = x_num.reshape(B * G, -1)
        deep_input = torch.cat([bi, x_num_flat], dim=1)  # [B*G, D+N]
        deep_out = self.mlp(deep_input)
        scores = self.output(deep_out).view(B, G)  # [B, G]

        return scores + linear_out  # [B, G]


"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Fraction of eligible groups with a positive label among their top-k scores.

    Only the first ``lengths[i]`` entries of row ``i`` are looked at, so
    padded positions never influence the result.  Groups shorter than
    ``min_group_size`` are ignored.

    Returns:
        0-dim tensor; ``0.0`` when no group is eligible.
    """
    hit_total = 0
    eligible = 0

    for row in range(scores.size(0)):
        n_items = lengths[row]
        if n_items < min_group_size:
            continue
        eligible += 1

        group_scores = scores[row][:n_items]
        group_labels = labels[row][:n_items]
        top_positions = torch.topk(group_scores, min(n_items, k)).indices
        hit_total += (group_labels[top_positions] > 0).any().float()

    if eligible > 0:
        return hit_total / eligible
    return torch.tensor(0.0, device=scores.device)


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@k over groups with at least ``min_group_size`` items.

    Gains are the raw label values; the discount for rank ``r`` (1-based)
    is ``log2(r + 1)``.  A group whose ideal DCG is 0 contributes 0.

    Args:
        scores: ``[batch, max_len]`` predicted scores.
        labels: ``[batch, max_len]`` relevance labels.
        lengths: true (unpadded) length of each row.
        k: ranking cut-off.
        min_group_size: rows shorter than this are skipped.

    Returns:
        0-dim tensor (always a tensor, so callers can safely use ``.item()``);
        ``0.0`` when no group is eligible.
    """
    # Accumulate in a tensor: adding only Python floats (every group with
    # IDCG == 0) used to make the function return a float instead.
    total_ndcg = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        l = int(lengths[i])
        if l < min_group_size:
            continue

        s = scores[i][:l]
        y = labels[i][:l]
        top = min(k, l)

        idx_pred = torch.topk(s, top).indices
        ideal_gains = torch.topk(y, top).values
        # Same discount vector serves both DCG and IDCG.
        discounts = torch.log2(torch.arange(2, 2 + top, device=y.device).float())

        dcg = (y[idx_pred] / discounts).sum()
        idcg = (ideal_gains / discounts).sum()

        if idcg > 0:
            total_ndcg = total_ndcg + dcg / idcg
        valid_count += 1

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ndcg / valid_count


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision at k over groups with >= ``min_group_size`` items.

    For each eligible group, AP is the mean of precision@r taken at every
    rank ``r <= k`` that holds a positive label (``label > 0``).  A group
    with no positive in its top-k has AP = 0 and still counts in the mean;
    previously such groups were dropped from the denominator, which
    overstated the metric.

    Returns:
        0-dim tensor; ``0.0`` when no group is eligible.
    """
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        l = int(lengths[i])
        if l < min_group_size:
            continue
        valid_count += 1

        s = scores[i][:l]
        y = labels[i][:l]
        top = min(k, l)

        idx_pred = torch.topk(s, top).indices
        hits = (y[idx_pred] > 0).float()
        n_hits = hits.sum()
        if n_hits == 0:
            continue  # AP is 0 for this group; it is already counted above

        ranks = torch.arange(1, top + 1, device=hits.device)
        precision_at_rank = hits.cumsum(dim=0) / ranks
        # Average precision only over the ranks that are hits.
        total_ap = total_ap + (precision_at_rank * hits).sum() / n_hits

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ap / valid_count



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise RankNet loss, summed per group and averaged over the batch.

    For every ordered pair ``(a, b)`` inside a group with
    ``label[a] > label[b]`` the penalty is ``-logsigmoid(score[a] - score[b])``.
    Only the first ``lengths[i]`` entries of row ``i`` take part.
    """
    logsigmoid = torch.nn.functional.logsigmoid
    n_groups = scores.size(0)
    loss_sum = 0.0

    for g in range(n_groups):
        n_items = lengths[g]
        group_scores = scores[g][:n_items]
        group_labels = labels[g][:n_items]

        # prefers[a, b] == 1 where item a should rank above item b.
        prefers = (group_labels.unsqueeze(1) > group_labels.unsqueeze(0)).float()
        margins = group_scores.unsqueeze(1) - group_scores.unsqueeze(0)

        loss_sum += (-prefers * logsigmoid(margins)).sum()

    return loss_sum / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """
    Weighted pairwise logistic loss restricted to pairs touching the top items.

    Within each group the ``focus_topk`` items with the highest *labels* are
    flagged.  A pair ``(a, b)`` is used when ``label_a > label_b`` and at least
    one of the two items is flagged; it is weighted by ``|label_a - label_b|``.
    The pair loss is ``-logsigmoid((score_a - score_b - margin) / temperature)``
    (the margin is only subtracted when it is positive).

    Groups with fewer than two items or without any usable pair are skipped.

    Returns:
        Sum of weighted pair losses divided by the sum of pair weights over the
        whole batch, or a fresh zero tensor (``requires_grad=True``) when the
        batch produced no usable pair.
    """
    weighted_loss = 0.0
    weight_total = 0

    for row in range(scores.size(0)):
        n_items = lengths[row]
        if n_items < 2:
            continue

        group_scores = scores[row][:n_items]
        group_labels = labels[row][:n_items]

        # Flag the items holding the largest ground-truth labels.
        in_focus = torch.zeros_like(group_labels, dtype=torch.bool)
        in_focus[torch.topk(group_labels, min(focus_topk, n_items)).indices] = True

        label_gap = group_labels.unsqueeze(1) - group_labels.unsqueeze(0)
        touches_focus = in_focus.unsqueeze(1) | in_focus.unsqueeze(0)
        eligible = (label_gap > 0) & touches_focus
        if not eligible.any():
            continue

        pair_weight = (label_gap.abs() * eligible).float()

        score_gap = group_scores.unsqueeze(1) - group_scores.unsqueeze(0)
        if margin > 0.0:
            score_gap = score_gap - margin
        score_gap = score_gap / temperature

        weighted_loss += (-F_torch.logsigmoid(score_gap) * pair_weight).sum()
        weight_total += pair_weight.sum()

    if weight_total == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return weighted_loss / weight_total


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Each batch is moved to ``device``, scored by ``model`` and optimised with
    ``xranknet_loss`` (temperature 1.0, margin 0.2, focus on the top 3).  The
    optimizer and the scheduler are both stepped once per batch.

    Returns:
        Sum of per-batch loss values divided by ``len(loader)``.
    """
    model.train()
    running_loss = 0.0

    progress = tqdm(loader, desc="Training", leave=False)
    for cat_inputs, num_inputs, targets, lengths in progress:
        num_inputs = num_inputs.to(device)
        targets = targets.to(device)

        # lengths may arrive as a plain sequence or as a tensor
        if not isinstance(lengths, torch.Tensor):
            lengths = torch.tensor(lengths)
        lengths = lengths.detach().clone().to(device)

        cat_inputs = {name: tensor.to(device) for name, tensor in cat_inputs.items()}

        optimizer.zero_grad()
        predictions = model(cat_inputs, num_inputs)
        batch_loss = xranknet_loss(
            predictions, targets, lengths, temperature=1.0, margin=0.2, focus_topk=3
        )
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    return running_loss / len(loader)


def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: ranking model called as ``model(x_cat, x_num)``.
        loader: yields ``(x_cat, x_num, y, lengths)`` batches.
        device: device the batch tensors are moved to.
        min_group_size: forwarded to the metric functions; smaller groups are
            not scored by them.

    Returns:
        ``(loss, hitrate@3, ndcg@3, map@3)``.  ``loss`` is a Python float
        averaged over all groups; the three metrics are 0-dim tensors.

    The metric functions return a mean over the groups of one batch that have
    at least ``min_group_size`` items, so each batch value is weighted by the
    number of such groups.  Weighting by the batch size instead would drag the
    averages down for batches with few or no large groups.
    """
    model.eval()
    total_hitrate = 0.0
    total_ndcg = 0.0
    total_map = 0.0
    total_loss = 0.0
    group_count = 0   # every group; weights the loss
    scored_count = 0  # groups with >= min_group_size items; weights the metrics

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):
            x_num, y = x_num.to(device), y.to(device)
            # as_tensor reuses an existing tensor instead of copy-constructing
            # it, which is what triggered torch's UserWarning here.
            lengths = torch.as_tensor(lengths).to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            ndcg = ndcg_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            map3 = map_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)

            batch_size = scores.size(0)
            n_scored = int((lengths >= min_group_size).sum())

            total_hitrate += hitrate * n_scored
            total_ndcg += ndcg * n_scored
            total_map += map3 * n_scored
            total_loss += loss.item() * batch_size

            group_count += batch_size
            scored_count += n_scored

    mean_loss = total_loss / group_count if group_count > 0 else 0.0

    if scored_count == 0:
        # Nothing was large enough to score: report zeros instead of dividing by 0.
        return (
            mean_loss,
            torch.tensor(0.0, device=device),
            torch.tensor(0.0, device=device),
            torch.tensor(0.0, device=device),
        )

    # as_tensor keeps the return type stable even if a metric produced floats.
    return (
        mean_loss,
        torch.as_tensor(total_hitrate / scored_count, device=device),
        torch.as_tensor(total_ndcg / scored_count, device=device),
        torch.as_tensor(total_map / scored_count, device=device),
    )


def build_cat_dims(X, cat_features):
    """Map each categorical column to ``max(column) + 1``.

    A column whose maximum is negative is treated as if its maximum were 0,
    so every returned size is at least 1.

    Returns:
        ``{column_name: size}`` in the order of ``cat_features``.
    """
    dims = {}
    for name in cat_features:
        largest = X[name].max()
        dims[name] = (0 if largest < 0 else largest) + 1
    return dims


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardise the given columns with statistics of the masked rows.

    For each column the mean and standard deviation are computed over the rows
    selected by ``train_mask`` only and then applied to every row.  A standard
    deviation that is not strictly positive is replaced by 1.0.

    Returns:
        ``X`` with each listed column replaced by its standardised values.
    """
    for name in numeric_features:
        column = X[name].to_numpy()
        reference = column[train_mask]

        center = reference.mean()
        scale = reference.std()
        if not scale > 0:
            scale = 1.0

        standardized = (column - center) / scale
        X = X.with_columns(pl.Series(name, standardized).alias(name))
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Return the feature-engineered frame plus the test ids and ranker ids.

    When ``cache_file`` exists inside ``data_dir`` it is loaded as is.
    Otherwise train and test are read, the test rows get a constant
    ``selected = 0`` column, both are concatenated and passed through
    ``feature_engineering(..., full=True)``, nulls are filled (0 for numeric
    columns, ``"missing"`` for string columns) and the result is written to
    the cache path.

    In both cases the test parquet is read to obtain its ``Id`` and
    ``ranker_id`` columns.

    Returns:
        ``(df, test_ids, test_rankers)``.
    """
    cache_path = os.path.join(data_dir, cache_file)
    test_path = os.path.join(data_dir, test_file)

    if not os.path.exists(cache_path):
        print("Cached file not found, processing raw data ...")

        train = pl.read_parquet(os.path.join(data_dir, train_file)).drop("__index_level_0__")
        test = pl.read_parquet(test_path).drop("__index_level_0__")
        test = test.with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))

        test_ids = test["Id"]
        test_rankers = test["ranker_id"]

        df = feature_engineering(pl.concat((train, test)), full=True)

        numeric_cols = df.select(pl.selectors.numeric()).columns
        string_cols = df.select(pl.selectors.string()).columns
        df = df.with_columns(
            [pl.col(c).fill_null(0) for c in numeric_cols]
            + [pl.col(c).fill_null("missing") for c in string_cols]
        )

        os.makedirs(data_dir, exist_ok=True)
        df.write_parquet(cache_path)
        print(f"Saved feature-engineered data to {cache_path}")

        return df, test_ids, test_rankers

    print(f"Loading cached feature-engineered data from {cache_path} ...")
    df = pl.read_parquet(cache_path)

    test = pl.read_parquet(test_path).drop("__index_level_0__")
    return df, test["Id"], test["ranker_id"]


def prepare_data(data_dir, train_size=18145372):
    """Load the engineered data and turn it into model-ready features.

    Args:
        data_dir: directory handed to ``load_or_prepare_data``.
        train_size: number of leading rows that belong to the training data.
            Those rows alone provide the normalisation statistics.  The
            default is the value that used to be hard-coded here.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size, test_ids,
        test_rankers)``.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    # Attach the first X.height ids to X.
    # NOTE(review): presumes feature_selection keeps the row order of df -- confirm.
    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    # Encode each categorical column as a dense rank starting at 0; nulls become -1.
    X = X.with_columns(
        [
            (pl.col(c).rank("dense") - 1).fill_null(-1).cast(pl.Int32)
            for c in cat_features
        ]
    )

    # Only the leading train_size rows contribute to the mean/std.
    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Patience counter for a metric where larger values are better.

    ``step`` resets the counter whenever the metric beats the best value seen
    so far by more than ``min_delta`` and increments it otherwise.
    """

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric``; return True once ``patience`` is exhausted."""
        threshold = self.best + self.min_delta
        if not metric > threshold:
            # no sufficient improvement this round
            self.counter += 1
            return self.counter >= self.patience

        self.best = metric
        self.counter = 0
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scaled histogram of the group sizes in ``dataset``.

    Group sizes are the lengths of the values of ``dataset.group_to_indices``.
    The figure is written to ``save_path`` when one is given and shown
    interactively otherwise; it is closed afterwards in both cases.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.title("Distribution of Group Sizes")
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def main():
    """Train the FiBiNet ranker, plot its metrics and write predictions.

    Steps, in order:
      1. load and preprocess the data (``prepare_data``);
      2. split rows into train / validation / test by fixed row offsets;
      3. dump per-group statistics of the validation split to a text file;
      4. train with ``xranknet_loss``, validating after every epoch, saving
         the checkpoint on improvement and stopping early on stalled loss;
      5. save loss / metric / learning-rate curves as PNG files;
      6. reload the saved checkpoint and write validation predictions and the
         test submission as CSV files.
    """
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    best_val_loss = float("inf")
    best_hitrate = 0.0
    model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Per-epoch history used for the plots below.
    train_losses = []
    val_losses = []
    val_hitrates = []
    val_ndcgs = []
    val_maps = []
    learning_rates = []

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    early_stopper = EarlyStopping(patience=3, min_delta=0.001)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Replace NaN with 0 in every Float64 column
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    for col in float_cols:
        X = X.with_columns(
            pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
        )

    # Row offsets of the split: [0, n1) train, [n1, n2) validation, [n2, end) test.
    # NOTE(review): n1 presumably falls on a ranker_id boundary -- confirm.
    n1 = 16487352
    # n1 = 17487300
    n2 = train_size

    X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
    y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
    groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

    # Inspect group sizes in validation set
    val_rankers = groups_va["ranker_id"].to_numpy()
    val_group_counts = Counter(val_rankers)

    print("\n📋 Validation group statistics:")
    # Save detailed group info to file
    group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
    with open(group_info_path, "w") as f:
        f.write(f"Total groups: {len(val_group_counts)}\n\n")
        f.write("GroupID\tSize\tStartIdx\tEndIdx\n")

        # Row positions (relative to the validation split) of every group
        group_to_rows = defaultdict(list)
        for i, g in enumerate(val_rankers):
            group_to_rows[g].append(i)

        # One line per group, sorted by group id: size and first/last row position
        for group_id, indices in sorted(group_to_rows.items()):
            start_idx = indices[0]
            end_idx = indices[-1]
            size = len(indices)
            f.write(f"{group_id}\t{size}\t{start_idx}\t{end_idx}\n")

    print(f"📁 Saved validation group info to {group_info_path}")


    # Embedding sizes are derived from the full frame (train + validation + test).
    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    # batch_size counts groups (one dataset item is one ranker_id group).
    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    # The scheduler is stepped once per batch, so its horizon is in batches;
    # the first 10% of the steps are warm-up.
    num_epochs = 10
    num_training_steps = num_epochs * len(train_loader)
    num_warmup_steps = int(0.1 * num_training_steps)

    learning_rate = 1e-3

    model = FiBiNetRanker(
        cat_dims=cat_dims,
        num_numeric_feats=num_numeric_feats,
        emb_dim=16,
        hidden=[512, 256, 128]
    ).to(device)


    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    best_hitrate = 0
    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
        val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

        # val_loss is a float; the three metrics are tensors, hence .item()
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_hitrates.append(val_hitrate.item())
        val_ndcgs.append(val_ndcg.item())
        val_maps.append(val_map.item())

        current_lr = scheduler.get_last_lr()
        learning_rates.append(current_lr[0])

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"Val Loss={val_loss:.4f}, "
            f"HitRate@3={val_hitrate:.4f}, "
            f"NDCG@3={val_ndcg:.4f}, "
            f"MAP@3={val_map:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )


        # A checkpoint is written only when the loss decreased AND the hit
        # rate increased; an epoch improving just one of them is not saved.
        if val_loss < best_val_loss and val_hitrate > best_hitrate:
            best_val_loss = val_loss
            best_hitrate = val_hitrate
            torch.save(model.state_dict(), model_path)
            print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

        # EarlyStopping treats larger as better, so the loss is negated.
        if early_stopper.step(-val_loss):
            print("⛔ Early stopping triggered.")
            break

    # Plot metrics
    epochs = list(range(1, len(train_losses) + 1))

    # Plot Loss Curve
    plt.figure()
    plt.plot(epochs, train_losses, label="Train Loss")
    plt.plot(epochs, val_losses, label="Val Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training & Validation Loss")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "loss_curve.png"))
    plt.close()

    # Plot HitRate@3
    plt.figure()
    plt.plot(epochs, val_hitrates, label="Val HitRate@3", color="green")
    plt.xlabel("Epoch")
    plt.ylabel("HitRate@3")
    plt.title("Validation HitRate@3")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "hitrate_curve.png"))
    plt.close()

    # Plot Learning Rate (value read at the end of each epoch)
    plt.figure()
    plt.plot(epochs, learning_rates, label="Learning Rate", color="orange")
    plt.xlabel("Epoch")
    plt.ylabel("LR")
    plt.title("Learning Rate Schedule")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "lr_curve.png"))
    plt.close()

    # Plot NDCG@3
    plt.figure()
    plt.plot(epochs, val_ndcgs, label="Val NDCG@3", color="purple")
    plt.xlabel("Epoch")
    plt.ylabel("NDCG@3")
    plt.title("Validation NDCG@3")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "ndcg_curve.png"))
    plt.close()

    # Plot MAP@3
    plt.figure()
    plt.plot(epochs, val_maps, label="Val MAP@3", color="red")
    plt.xlabel("Epoch")
    plt.ylabel("MAP@3")
    plt.title("Validation MAP@3")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "map_curve.png"))
    plt.close()

    print("📊 Saved metric plots to model/")

    # Reload the last saved checkpoint, then score the validation and test sets
    model.load_state_dict(torch.load(model_path))
    model.eval()

    val_ids = X_va["Id"]
    val_rankers = groups_va["ranker_id"]
    val_labels = y_va

    all_val_scores = []

    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(val_loader, desc="Predicting validation set"):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)

            # Keep only the real (non-padded) items of every group
            for i in range(scores.size(0)):
                l = lengths[i]
                all_val_scores.append(scores[i, :l].cpu().numpy())

    all_val_scores = np.concatenate(all_val_scores)

    # Only the total count is checked here.
    # NOTE(review): attaching the scores row-wise below presumes the loader
    # yields groups in the same order as the rows of X_va -- confirm.
    assert len(all_val_scores) == X_va.shape[0], \
        f"Mismatch: {len(all_val_scores)} scores vs {X_va.shape[0]} rows"

    val_df = X_va.with_columns([
        pl.Series("Id", val_ids),
        pl.Series("ranker_id", val_rankers),
        pl.Series("score", all_val_scores),
        pl.Series("label", val_labels)
    ])

    # "selected" holds the rank of each row inside its ranker_id (1 = highest score)
    val_df = val_df.with_columns(
        pl.col("score")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
    val_df.write_csv(val_save_path)
    print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

    all_scores = []

    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(test_loader, desc="Predicting test set"):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)

            # Keep only the real (non-padded) items of every group
            for i in range(scores.size(0)):
                l = lengths[i]
                all_scores.append(scores[i, :l].cpu().numpy())

    all_scores = np.concatenate(all_scores)

    assert len(all_scores) == X_te.shape[0], \
        f"Mismatch: {len(all_scores)} scores vs {X_te.shape[0]} rows in X_te"

    # NOTE(review): scores are paired with test_ids / test_rankers by position;
    # presumes the test rows kept their original order -- confirm.
    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": all_scores
    })

    # Turn scores into per-ranker_id ranks (1 = highest score)
    submission = (
        submission_df
        .with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )
        .select(["Id", "ranker_id", "score", "selected"])
    )

    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
    submission.write_csv(submission_path)
    print(f"✅ Submission saved to {submission_path}")


# Run the training pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()


In [None]:
--- Epoch 1/10 ---
Validating:   0%|          | 0/2636 [00:00<?, ?it/s]/home/ionut/test_whisper/test2/deeprec_validate.py:458: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  lengths = torch.tensor(lengths).to(device)
Epoch 1: Train Loss=0.3967, Val Loss=0.3252, HitRate@3=0.4560, NDCG@3=0.3643, MAP@3=0.6130, LR=0.001000 (446.5s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 2/10 ---
Epoch 2: Train Loss=0.2916, Val Loss=0.2868, HitRate@3=0.4839, NDCG@3=0.3881, MAP@3=0.6312, LR=0.000889 (444.9s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 3/10 ---
Epoch 3: Train Loss=0.2761, Val Loss=0.3020, HitRate@3=0.4949, NDCG@3=0.4003, MAP@3=0.6450, LR=0.000778 (444.7s)

--- Epoch 4/10 ---
Epoch 4: Train Loss=0.2681, Val Loss=0.2786, HitRate@3=0.4958, NDCG@3=0.3996, MAP@3=0.6447, LR=0.000667 (444.7s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 5/10 ---
Epoch 5: Train Loss=0.2663, Val Loss=0.3014, HitRate@3=0.5050, NDCG@3=0.4056, MAP@3=0.6463, LR=0.000556 (444.6s)

--- Epoch 6/10 ---
Epoch 6: Train Loss=0.2586, Val Loss=0.2871, HitRate@3=0.5046, NDCG@3=0.4038, MAP@3=0.6426, LR=0.000444 (443.7s)

In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import torch.nn as nn
import torch.nn.functional as F_torch
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter

class RankDataset(Dataset):
    """Dataset whose items are whole ranking groups (one per ``ranker_id``).

    Rows are grouped by the ``ranker_id`` column of ``groups``; groups are
    ordered by first appearance.  Item ``idx`` returns every row of the
    ``idx``-th group.

    Attributes set in ``__init__``:
        X_cat: ``{column: numpy array}`` for the categorical columns.
        X_num: 2-D numpy array of the numeric columns.
        y: numpy array of labels.
        group_to_indices: ``{ranker_id: [row positions]}``.
        group_keys: group ids in order of first appearance.
    """

    # Annotations are quoted so they are not evaluated when the class is defined.
    def __init__(
        self,
        X: "pl.DataFrame",
        y: "pl.Series",
        groups: "pl.DataFrame",
        cat_features: "list[str]",
        num_features: "list[str]",
    ):
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Number of groups."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        ``x_cat`` maps each categorical column to a long tensor in which
        negative codes are replaced by 0; ``x_num`` and ``y`` are float
        tensors; ``length`` is the number of rows in the group.
        """
        g = self.group_keys[idx]
        inds = self.group_to_indices[g]
        length = len(inds)

        x_cat = {}
        for c in self.cat_features:
            codes = self.X_cat[c][inds]
            # Vectorised clamp: any code that is not >= 0 becomes 0, exactly as
            # the former per-element Python loop did, without that loop's cost.
            x_cat[c] = torch.as_tensor(np.where(codes >= 0, codes, 0), dtype=torch.long)

        x_num = torch.FloatTensor(self.X_num[inds])
        y = torch.FloatTensor(self.y[inds]).reshape(-1)

        return x_cat, x_num, y, length

def collate_fn(batch):
    """Zero-pad a list of group samples to a common length and stack them.

    Each sample is ``(x_cat, x_num, y, length)``.  Every tensor is padded with
    zeros at the end up to the longest group in the batch.

    Returns:
        ``(x_cat, x_num, y, lengths)`` where ``x_cat`` maps each key to a
        ``[batch, max_len]`` long tensor, ``x_num`` is ``[batch, max_len,
        num_dim]``, ``y`` is ``[batch, max_len]`` and ``lengths`` is a long
        tensor with the original group sizes.
    """
    lengths = [sample[3] for sample in batch]
    longest = max(lengths)
    feat_dim = batch[0][1].shape[1]
    cat_keys = list(batch[0][0].keys())

    cat_rows = {key: [] for key in cat_keys}
    num_rows = []
    label_rows = []

    for cat_part, num_part, label_part, length in batch:
        missing = longest - length
        if missing > 0:
            # append zero rows so this group reaches the common length
            cat_part = {
                key: torch.cat([cat_part[key], torch.zeros(missing, dtype=torch.long)])
                for key in cat_keys
            }
            num_part = torch.cat([num_part, torch.zeros(missing, feat_dim)])
            label_part = torch.cat([label_part, torch.zeros(missing)])

        for key in cat_keys:
            cat_rows[key].append(cat_part[key])
        num_rows.append(num_part)
        label_rows.append(label_part)

    return (
        {key: torch.stack(rows) for key, rows in cat_rows.items()},
        torch.stack(num_rows),
        torch.stack(label_rows),
        torch.tensor(lengths, dtype=torch.long),
    )

class FiBiNetRanker(nn.Module):
    """Ranker combining a linear part with a SENet + bilinear + MLP part.

    Inputs to ``forward`` are a dict of ``[B, G]`` categorical index tensors
    (one per field) and a ``[B, G, num_numeric_feats]`` numeric tensor; the
    output is a ``[B, G]`` score tensor.

    Args:
        cat_dims: ``{field: number of embedding rows}``.
        num_numeric_feats: width of the numeric input.
        emb_dim: embedding width per categorical field.
        hidden: widths of the MLP layers.
        reduction: bottleneck divisor of the SENet gate.
        use_senet: whether the SENet gate is built and applied.
    """

    def __init__(self, cat_dims, num_numeric_feats, emb_dim=64, hidden=[512, 256, 128, 64], reduction=4, use_senet=True):
        super().__init__()
        self.cat_fields = list(cat_dims.keys())
        self.emb_dim = emb_dim
        self.num_fields = len(self.cat_fields)
        self.use_senet = use_senet

        # One embedding table per field for the interaction part ...
        self.embeddings = nn.ModuleDict(
            {field: nn.Embedding(cat_dims[field], emb_dim, padding_idx=0) for field in self.cat_fields}
        )
        # ... and one scalar weight per category for the linear part.
        self.linear_cat = nn.ModuleDict(
            {field: nn.Embedding(cat_dims[field], 1, padding_idx=0) for field in self.cat_fields}
        )
        self.linear_num = nn.Linear(num_numeric_feats, 1)

        # Xavier init rewrites the whole weight matrix of every table.
        for table in [*self.embeddings.values(), *self.linear_cat.values()]:
            nn.init.xavier_uniform_(table.weight)
        nn.init.xavier_uniform_(self.linear_num.weight)

        # SENet gate over the flattened field embeddings
        if self.use_senet:
            flat_dim = self.num_fields * emb_dim
            self.se_fc1 = nn.Linear(flat_dim, flat_dim // reduction)
            self.se_fc2 = nn.Linear(flat_dim // reduction, flat_dim)

        # A single bilinear map shared by all fields
        self.bilinear = nn.Linear(emb_dim, emb_dim, bias=False)

        # MLP over [interaction vector, numeric features]
        blocks = []
        width = emb_dim + num_numeric_feats
        for h in hidden:
            blocks.extend([nn.Linear(width, h), nn.LayerNorm(h), nn.ReLU(), nn.Dropout(0.2)])
            width = h
        self.mlp = nn.Sequential(*blocks)
        self.output = nn.Linear(width, 1)

    def senet(self, embs):
        """Scale ``embs`` ([N, F, D]) element-wise by a learned sigmoid gate."""
        rows, n_fields, dim = embs.shape
        squeezed = embs.view(rows, -1)
        hidden_act = F_torch.relu(self.se_fc1(squeezed))
        gate = torch.sigmoid(self.se_fc2(hidden_act))
        return embs * gate.view(rows, n_fields, dim)

    def bilinear_interaction(self, embs):
        """Multiply each field embedding by its bilinear image; average over fields."""
        return (embs * self.bilinear(embs)).mean(dim=1)  # [N, D]

    def forward(self, x_cat, x_num):
        batch, group = x_num.shape[:2]
        flat = batch * group

        # [B, G, F, D] stack of per-field embeddings
        field_embs = torch.stack(
            [self.embeddings[field](x_cat[field]) for field in self.cat_fields], dim=2
        )

        # Linear part: per-category scalars summed over fields, plus numeric linear term
        wide_cat = torch.stack(
            [self.linear_cat[field](x_cat[field]).squeeze(-1) for field in self.cat_fields], dim=0
        ).sum(dim=0)  # [B, G]
        wide = wide_cat + self.linear_num(x_num).squeeze(-1)  # [B, G]

        # Interaction part, computed per item
        field_embs = field_embs.view(flat, self.num_fields, self.emb_dim)
        if self.use_senet:
            field_embs = self.senet(field_embs)
        interaction = self.bilinear_interaction(field_embs)  # [B*G, D]

        # Deep part
        deep_in = torch.cat([interaction, x_num.view(flat, -1)], dim=1)
        deep = self.output(self.mlp(deep_in)).view(batch, group)

        return deep + wide  # [B, G]



"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Share of groups with a relevant item among their ``k`` best-scored items.

    Only the first ``lengths[i]`` items of row ``i`` are considered, and
    groups with fewer than ``min_group_size`` items are left out entirely.
    An item is relevant when its label is ``> 0``.

    Returns:
        Hits divided by the number of evaluated groups, or a zero tensor on
        ``scores.device`` when no group was evaluated.
    """
    hit_total = 0
    evaluated = 0

    for row in range(scores.size(0)):
        n_items = lengths[row]
        if n_items < min_group_size:
            continue

        group_scores = scores[row][:n_items]
        group_labels = labels[row][:n_items]

        top_positions = torch.topk(group_scores, min(n_items, k)).indices
        hit_total += (group_labels[top_positions] > 0).any().float()
        evaluated += 1

    if evaluated > 0:
        return hit_total / evaluated
    return torch.tensor(0.0, device=scores.device)


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@k over the groups with at least ``min_group_size`` items.

    Gains are the raw label values and the discount at rank ``r`` (1-based)
    is ``log2(r + 1)``.  A group whose ideal DCG is 0 (no positive label)
    contributes an NDCG of 0 and is still counted.

    Args:
        scores: ``[batch, max_len]`` tensor of predicted scores (padded).
        labels: ``[batch, max_len]`` tensor of relevance labels.
        lengths: per-group number of real (non-padded) items.
        k: number of top-ranked items taken into account.
        min_group_size: smaller groups are excluded from the mean.

    Returns:
        Always a 0-dim tensor on ``scores.device``.  The result used to be a
        Python float when no evaluated group had a positive label, which broke
        callers relying on tensor methods such as ``.item()``.
    """
    total_ndcg = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        n_items = int(lengths[i])
        if n_items < min_group_size:
            continue

        s = scores[i][:n_items]
        y = labels[i][:n_items]

        top = min(k, n_items)
        # log2(2), log2(3), ... shared by the predicted and the ideal ranking
        discounts = torch.log2(torch.arange(2, 2 + top, device=y.device).float())

        dcg = (y[torch.topk(s, top).indices] / discounts).sum()
        idcg = (torch.topk(y, top).values / discounts).sum()

        valid_count += 1
        if idcg > 0:
            total_ndcg = total_ndcg + dcg / idcg

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ndcg / valid_count


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision over the top-``k`` predictions of each group.

    Args:
        scores: ``[batch, max_len]`` tensor of predicted scores (padded).
        labels: ``[batch, max_len]`` tensor of relevance labels; an item is
            relevant when its label is ``> 0``.
        lengths: per-group number of real (non-padded) items.
        k: number of top-scored items inspected per group.
        min_group_size: groups with fewer items are excluded from the mean.

    Returns:
        A 0-dim tensor on ``scores.device``: the mean AP@k over all groups
        with at least ``min_group_size`` items (``0.0`` if there is none).

    A group whose top-``k`` contains no relevant item contributes an AP of 0
    and is still counted.  Skipping such groups would average only over
    groups that already contain a hit and let MAP@k exceed HitRate@k.
    """
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        n_items = int(lengths[i])
        if n_items < min_group_size:
            continue

        s = scores[i][:n_items]
        y = labels[i][:n_items]

        _, idx_pred = torch.topk(s, min(k, n_items))
        hits = (y[idx_pred] > 0).float()

        # Every evaluated group counts, whether or not it produced a hit.
        valid_count += 1

        n_hits = hits.sum()
        if n_hits > 0:
            ranks = torch.arange(
                1, hits.numel() + 1, device=hits.device, dtype=hits.dtype
            )
            # precision@j at every rank j, kept only where rank j is a hit
            precision_at_rank = hits.cumsum(0) / ranks
            total_ap = total_ap + (precision_at_rank * hits).sum() / n_hits

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ap / valid_count



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise RankNet loss, summed per group and averaged over the batch.

    For every ordered pair ``(a, b)`` inside a group where item ``a`` has a
    strictly larger label than item ``b`` the loss adds
    ``-logsigmoid(score_a - score_b)``.  Only the first ``lengths[i]`` items
    of row ``i`` are used.

    Returns:
        The summed pair losses divided by the number of rows in ``scores``.
    """
    n_groups = scores.size(0)
    accumulated = 0.0

    for row in range(n_groups):
        n_items = lengths[row]
        group_scores = scores[row][:n_items]
        group_labels = labels[row][:n_items]

        # prefers[a, b] == 1.0 where item a is labelled strictly above item b
        prefers = (group_labels.unsqueeze(1) > group_labels.unsqueeze(0)).float()
        margins = group_scores.unsqueeze(1) - group_scores.unsqueeze(0)

        accumulated += (-prefers * torch.nn.functional.logsigmoid(margins)).sum()

    return accumulated / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """
    Weighted pairwise logistic loss restricted to pairs touching the top items.

    Within each group the ``focus_topk`` items with the highest *labels* are
    flagged.  A pair ``(a, b)`` is used when ``label_a > label_b`` and at least
    one of the two items is flagged; it is weighted by ``|label_a - label_b|``.
    The pair loss is ``-logsigmoid((score_a - score_b - margin) / temperature)``
    (the margin is only subtracted when it is positive).

    Groups with fewer than two items or without any usable pair are skipped.

    Returns:
        Sum of weighted pair losses divided by the sum of pair weights over the
        whole batch, or a fresh zero tensor (``requires_grad=True``) when the
        batch produced no usable pair.
    """
    weighted_loss = 0.0
    weight_total = 0

    for row in range(scores.size(0)):
        n_items = lengths[row]
        if n_items < 2:
            continue

        group_scores = scores[row][:n_items]
        group_labels = labels[row][:n_items]

        # Flag the items holding the largest ground-truth labels.
        in_focus = torch.zeros_like(group_labels, dtype=torch.bool)
        in_focus[torch.topk(group_labels, min(focus_topk, n_items)).indices] = True

        label_gap = group_labels.unsqueeze(1) - group_labels.unsqueeze(0)
        touches_focus = in_focus.unsqueeze(1) | in_focus.unsqueeze(0)
        eligible = (label_gap > 0) & touches_focus
        if not eligible.any():
            continue

        pair_weight = (label_gap.abs() * eligible).float()

        score_gap = group_scores.unsqueeze(1) - group_scores.unsqueeze(0)
        if margin > 0.0:
            score_gap = score_gap - margin
        score_gap = score_gap / temperature

        weighted_loss += (-F_torch.logsigmoid(score_gap) * pair_weight).sum()
        weight_total += pair_weight.sum()

    if weight_total == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return weighted_loss / weight_total


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Each batch is moved to ``device``, scored by ``model`` and optimised
    against ``xranknet_loss``.  ``optimizer`` and ``scheduler`` are both
    stepped once per batch.  The result is the sum of the per-batch loss
    values divided by ``len(loader)``.
    """
    model.train()
    running_loss = 0.0

    progress = tqdm(loader, desc="Training", leave=False)
    for cat_inputs, num_inputs, targets, group_lengths in progress:
        num_inputs = num_inputs.to(device)
        targets = targets.to(device)
        # Lengths may arrive as a plain sequence; turn them into a tensor first.
        if not isinstance(group_lengths, torch.Tensor):
            group_lengths = torch.tensor(group_lengths)
        group_lengths = group_lengths.detach().clone().to(device)
        cat_inputs = {name: values.to(device) for name, values in cat_inputs.items()}

        optimizer.zero_grad()
        predictions = model(cat_inputs, num_inputs)
        batch_loss = xranknet_loss(
            predictions,
            targets,
            group_lengths,
            temperature=1.0,
            margin=0.2,
            focus_topk=3,
        )
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    return running_loss / len(loader)


def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: Ranker called as ``model(x_cat, x_num)``.
        loader: Iterable of ``(x_cat, x_num, y, lengths)`` batches.
        device: Device the batch tensors are moved to.
        min_group_size: Forwarded to the ranking-metric helpers.

    Returns:
        ``(loss, hitrate@3, ndcg@3, map@3)``, each averaged over batches and
        weighted by the number of groups in the batch.
    """
    model.eval()
    total_hitrate = 0
    total_ndcg = 0
    total_map = 0
    total_loss = 0
    count = 0

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):
            x_num, y = x_num.to(device), y.to(device)
            # torch.tensor(<tensor>) copy-constructs and emits a UserWarning;
            # only build a new tensor when lengths is a plain sequence
            # (same handling as train_one_epoch).
            if not isinstance(lengths, torch.Tensor):
                lengths = torch.tensor(lengths)
            lengths = lengths.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            ndcg = ndcg_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            map3 = map_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)

            # NOTE(review): every metric is weighted by the full batch size;
            # this is exact only if the metric helpers average over all groups
            # of the batch -- confirm against their min_group_size handling.
            batch_size = scores.size(0)
            total_hitrate += hitrate * batch_size
            total_ndcg += ndcg * batch_size
            total_map += map3 * batch_size
            total_loss += loss.item() * batch_size
            count += batch_size

    return total_loss / count, total_hitrate / count, total_ndcg / count, total_map / count


def build_cat_dims(X, cat_features):
    """Map each categorical feature to ``max(code) + 1``.

    A column whose maximum is negative is treated as if its maximum were 0,
    so every feature gets a size of at least 1.
    """
    return {name: max(X[name].max(), 0) + 1 for name in cat_features}


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardise numeric columns with statistics taken from the masked rows.

    For every column in ``numeric_features`` the mean and standard deviation
    are computed on the rows selected by ``train_mask`` and then applied to
    the whole column.  A standard deviation that is not positive is replaced
    by 1.0 so the column is only centred.  Returns the updated frame.
    """
    for name in numeric_features:
        column = X[name].to_numpy()
        fit_rows = column[train_mask]
        center = fit_rows.mean()
        spread = fit_rows.std()
        scale = spread if spread > 0 else 1.0
        standardized = (column - center) / scale
        X = X.with_columns(pl.Series(name, standardized).alias(name))
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Return the feature-engineered frame plus the test ``Id`` / ``ranker_id``.

    If ``cache_file`` exists in ``data_dir`` it is loaded directly and only
    the raw test file is read to recover the identifier columns.  Otherwise
    train and test are concatenated (test gets a zero ``selected`` column),
    passed through ``feature_engineering``, null-filled (0 for numeric
    columns, ``"missing"`` for string columns) and written to the cache.
    """
    cache_path = os.path.join(data_dir, cache_file)
    test_path = os.path.join(data_dir, test_file)

    # Fast path: reuse the cached features.
    if os.path.exists(cache_path):
        print(f"Loading cached feature-engineered data from {cache_path} ...")
        df = pl.read_parquet(cache_path)
        raw_test = pl.read_parquet(test_path).drop("__index_level_0__")
        return df, raw_test["Id"], raw_test["ranker_id"]

    print("Cached file not found, processing raw data ...")

    raw_train = pl.read_parquet(os.path.join(data_dir, train_file)).drop("__index_level_0__")
    raw_test = pl.read_parquet(test_path).drop("__index_level_0__")
    # Test rows carry a placeholder label so both frames share one schema.
    raw_test = raw_test.with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))

    test_ids = raw_test["Id"]
    test_rankers = raw_test["ranker_id"]

    df = feature_engineering(pl.concat((raw_train, raw_test)), full=True)

    numeric_cols = df.select(pl.selectors.numeric()).columns
    string_cols = df.select(pl.selectors.string()).columns
    null_fills = [pl.col(name).fill_null(0) for name in numeric_cols]
    null_fills += [pl.col(name).fill_null("missing") for name in string_cols]
    df = df.with_columns(null_fills)

    os.makedirs(data_dir, exist_ok=True)
    df.write_parquet(cache_path)
    print(f"Saved feature-engineered data to {cache_path}")

    return df, test_ids, test_rankers


def prepare_data(data_dir, train_size=18145372):
    """Load the features and make them model-ready.

    Categorical columns are replaced by dense-rank codes starting at 0
    (nulls become -1) and cast to Int32.  Numeric columns are standardised
    with statistics from the first ``train_size`` rows.

    Args:
        data_dir: Directory handed to ``load_or_prepare_data``.
        train_size: Number of leading rows treated as training data when
            computing normalisation statistics.  It used to be hard-coded in
            the body; the default keeps that value.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size, test_ids,
        test_rankers)``.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    # Re-attach the row identifier next to the selected features.
    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    X = X.with_columns(
        [
            (pl.col(c).rank("dense") - 1).fill_null(-1).cast(pl.Int32)
            for c in cat_features
        ]
    )

    # Only the leading train_size rows contribute to mean / std.
    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Signal a stop once a maximised metric has stalled for ``patience`` steps."""

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric`` and return True when training should stop.

        A value counts as an improvement only if it exceeds the best value
        seen so far by more than ``min_delta``; an improvement resets the
        stall counter, anything else increments it.
        """
        improved = metric > self.best + self.min_delta
        if improved:
            self.best, self.counter = metric, 0
        else:
            self.counter += 1
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scale histogram of the group sizes in ``dataset``.

    Sizes are the lengths of the values of ``dataset.group_to_indices``.
    The figure is written to ``save_path`` when one is given, otherwise it
    is shown; it is closed afterwards in both cases.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Distribution of Group Sizes")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def main():
    """Train the ranker, track validation metrics, and write predictions.

    Steps, in order: prepare the features, split the rows into train /
    validation / test by fixed row offsets, dump validation group statistics,
    train with per-epoch validation, checkpointing and early stopping, save
    the metric plots, then reload the saved checkpoint to score the
    validation and test sets and write both to CSV.
    """
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    best_val_loss = float("inf")
    best_hitrate = 0.0
    model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Per-epoch history used for the plots at the end.
    train_losses = []
    val_losses = []
    val_hitrates = []
    val_ndcgs = []
    val_maps = []
    learning_rates = []

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    early_stopper = EarlyStopping(patience=3, min_delta=0.001)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Replace NaN with 0 in every Float64 column.
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    for col in float_cols:
        X = X.with_columns(
            pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
        )

    # Row offsets of the split: [0, n1) train, [n1, n2) validation, [n2, end) test.
    # NOTE(review): n1 is a hard-coded row index; presumably it falls on a
    # ranker_id boundary so no group is split across train/validation -- confirm.
    n1 = 16487352
    # n1 = 17487300
    n2 = train_size

    X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
    y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
    groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

    # Inspect group sizes in validation set
    val_rankers = groups_va["ranker_id"].to_numpy()
    val_group_counts = Counter(val_rankers)

    print("\n📋 Validation group statistics:")
    # Save detailed group info to file
    group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
    with open(group_info_path, "w") as f:
        f.write(f"Total groups: {len(val_group_counts)}\n\n")
        f.write("GroupID\tSize\tStartIdx\tEndIdx\n")

        # Row positions (relative to the validation slice) of every group.
        group_to_rows = defaultdict(list)
        for i, g in enumerate(val_rankers):
            group_to_rows[g].append(i)

        # StartIdx / EndIdx are the first and last row of the group; they
        # bound a contiguous range only if the group's rows are adjacent.
        for group_id, indices in sorted(group_to_rows.items()):
            start_idx = indices[0]
            end_idx = indices[-1]
            size = len(indices)
            f.write(f"{group_id}\t{size}\t{start_idx}\t{end_idx}\n")

    print(f"📁 Saved validation group info to {group_info_path}")


    # Embedding sizes are derived from the full frame (train + val + test).
    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    # Only the training loader is shuffled; validation and test keep dataset order.
    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    # The scheduler is stepped once per batch, so its horizon is
    # epochs * batches-per-epoch, with the first 10% used for warm-up.
    num_epochs = 10
    num_training_steps = num_epochs * len(train_loader)
    num_warmup_steps = int(0.1 * num_training_steps)

    learning_rate = 7e-4

    # NOTE(review): FiBiNetRanker is not defined in the visible part of this
    # file -- confirm it exists under this name.
    model = FiBiNetRanker(
        cat_dims=cat_dims,
        num_numeric_feats=num_numeric_feats,
        emb_dim=64,
        hidden=[512, 256, 128, 64]
    ).to(device)


    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    best_hitrate = 0
    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
        val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

        # The ranking metrics are converted with .item() for plotting.
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_hitrates.append(val_hitrate.item())
        val_ndcgs.append(val_ndcg.item())
        val_maps.append(val_map.item())

        # Learning rate in effect at the end of the epoch.
        current_lr = scheduler.get_last_lr()
        learning_rates.append(current_lr[0])

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"Val Loss={val_loss:.4f}, "
            f"HitRate@3={val_hitrate:.4f}, "
            f"NDCG@3={val_ndcg:.4f}, "
            f"MAP@3={val_map:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )


        # A checkpoint is written only when BOTH the validation loss drops
        # and the hit rate rises relative to the last saved checkpoint.
        if val_loss < best_val_loss and val_hitrate > best_hitrate:
            best_val_loss = val_loss
            best_hitrate = val_hitrate
            torch.save(model.state_dict(), model_path)
            print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

        # EarlyStopping tracks a maximum, so the loss is negated.
        if early_stopper.step(-val_loss):
            print("⛔ Early stopping triggered.")
            break

    # Plot metrics
    epochs = list(range(1, len(train_losses) + 1))

    # Plot Loss Curve
    plt.figure()
    plt.plot(epochs, train_losses, label="Train Loss")
    plt.plot(epochs, val_losses, label="Val Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training & Validation Loss")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "loss_curve.png"))
    plt.close()

    # Plot HitRate@3
    plt.figure()
    plt.plot(epochs, val_hitrates, label="Val HitRate@3", color="green")
    plt.xlabel("Epoch")
    plt.ylabel("HitRate@3")
    plt.title("Validation HitRate@3")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "hitrate_curve.png"))
    plt.close()

    # Plot Learning Rate
    plt.figure()
    plt.plot(epochs, learning_rates, label="Learning Rate", color="orange")
    plt.xlabel("Epoch")
    plt.ylabel("LR")
    plt.title("Learning Rate Schedule")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "lr_curve.png"))
    plt.close()

    # Plot NDCG@3
    plt.figure()
    plt.plot(epochs, val_ndcgs, label="Val NDCG@3", color="purple")
    plt.xlabel("Epoch")
    plt.ylabel("NDCG@3")
    plt.title("Validation NDCG@3")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "ndcg_curve.png"))
    plt.close()

    # Plot MAP@3
    plt.figure()
    plt.plot(epochs, val_maps, label="Val MAP@3", color="red")
    plt.xlabel("Epoch")
    plt.ylabel("MAP@3")
    plt.title("Validation MAP@3")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "map_curve.png"))
    plt.close()

    print("📊 Saved metric plots to model/")

    # Reload the checkpoint written by the training loop before predicting.
    # NOTE(review): if no epoch met the save condition above, model_path is
    # either missing or left over from an earlier run -- confirm.
    model.load_state_dict(torch.load(model_path))
    model.eval()

    # --- Validation predictions ---
    val_ids = X_va["Id"]
    val_rankers = groups_va["ranker_id"]
    val_labels = y_va

    all_val_scores = []

    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(val_loader, desc="Predicting validation set"):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)

            # Keep only the first `l` scores of each group; the rest is padding.
            for i in range(scores.size(0)):
                l = lengths[i]
                all_val_scores.append(scores[i, :l].cpu().numpy())

    all_val_scores = np.concatenate(all_val_scores)

    # NOTE(review): this checks the count only. The scores are attached to
    # X_va by position, which assumes the loader yields rows in the same
    # order as X_va -- confirm.
    assert len(all_val_scores) == X_va.shape[0], \
        f"Mismatch: {len(all_val_scores)} scores vs {X_va.shape[0]} rows"

    val_df = X_va.with_columns([
        pl.Series("Id", val_ids),
        pl.Series("ranker_id", val_rankers),
        pl.Series("score", all_val_scores),
        pl.Series("label", val_labels)
    ])

    # "selected" holds the within-group rank of the score (1 = highest).
    val_df = val_df.with_columns(
        pl.col("score")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
    val_df.write_csv(val_save_path)
    print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

    # --- Test predictions ---
    all_scores = []

    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(test_loader, desc="Predicting test set"):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)

            for i in range(scores.size(0)):
                l = lengths[i]
                all_scores.append(scores[i, :l].cpu().numpy())

    all_scores = np.concatenate(all_scores)

    assert len(all_scores) == X_te.shape[0], \
        f"Mismatch: {len(all_scores)} scores vs {X_te.shape[0]} rows in X_te"

    # NOTE(review): test_ids / test_rankers come from the raw test file and
    # are paired with the scores by position -- assumes the feature pipeline
    # kept the test rows in their original order; confirm.
    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": all_scores
    })

    submission = (
        submission_df
        .with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )
        .select(["Id", "ranker_id", "score", "selected"])
    )

    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
    submission.write_csv(submission_path)
    print(f"✅ Submission saved to {submission_path}")


# Run the training / prediction pipeline only when executed as a script.
if __name__ == "__main__":
    main()


Writing deeprec_validate.py


In [None]:
--- Epoch 1/10 ---
Validating:   0%|          | 0/2636 [00:00<?, ?it/s]/home/ionut/test_whisper/test2/deeprec_validate.py:462: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  lengths = torch.tensor(lengths).to(device)
Epoch 1: Train Loss=0.3718, Val Loss=0.3083, HitRate@3=0.4674, NDCG@3=0.3711, MAP@3=0.6171, LR=0.001000 (727.9s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 2/10 ---
Epoch 2: Train Loss=0.2933, Val Loss=0.2905, HitRate@3=0.4715, NDCG@3=0.3777, MAP@3=0.6251, LR=0.000889 (726.1s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 3/10 ---
Epoch 3: Train Loss=0.2770, Val Loss=0.2795, HitRate@3=0.4868, NDCG@3=0.3905, MAP@3=0.6359, LR=0.000778 (724.0s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 4/10 ---
Epoch 4: Train Loss=0.2668, Val Loss=0.2731, HitRate@3=0.4884, NDCG@3=0.3969, MAP@3=0.6470, LR=0.000667 (727.3s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 5/10 ---
Epoch 5: Train Loss=0.2613, Val Loss=0.2762, HitRate@3=0.4931, NDCG@3=0.3990, MAP@3=0.6451, LR=0.000556 (728.7s)

--- Epoch 6/10 ---
Epoch 6: Train Loss=0.2558, Val Loss=0.2726, HitRate@3=0.4994, NDCG@3=0.4032, MAP@3=0.6499, LR=0.000444 (725.2s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 7/10 ---
Epoch 7: Train Loss=0.2508, Val Loss=0.2646, HitRate@3=0.5053, NDCG@3=0.4106, MAP@3=0.6560, LR=0.000333 (730.9s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 8/10 ---
Epoch 8: Train Loss=0.2453, Val Loss=0.2647, HitRate@3=0.5029, NDCG@3=0.4092, MAP@3=0.6563, LR=0.000222 (728.2s)

--- Epoch 9/10 ---
Epoch 9: Train Loss=0.2399, Val Loss=0.2643, HitRate@3=0.4992, NDCG@3=0.4064, MAP@3=0.6541, LR=0.000111 (727.0s)

--- Epoch 10/10 ---
Epoch 10: Train Loss=0.2346, Val Loss=0.2648, HitRate@3=0.5003, NDCG@3=0.4065, MAP@3=0.6563, LR=0.000000 (726.9s)

In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import torch.nn as nn
import torch.nn.functional as F_torch
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter

class RankDataset(Dataset):
    """Dataset that yields one ``ranker_id`` group per item.

    The feature columns are converted to numpy arrays once.  Rows are then
    bucketed by ``ranker_id`` in order of first appearance, and each item
    returns all rows of one bucket.
    """

    def __init__(
        self,
        X: "pl.DataFrame",
        y: "pl.Series",
        groups: "pl.DataFrame",
        cat_features: list[str],
        num_features: list[str],
    ):
        """Cache the columns as numpy arrays and index rows by group.

        Args:
            X: Frame holding every column named in ``cat_features`` and
                ``num_features``.
            y: Labels, one per row of ``X``.
            groups: Frame with a ``ranker_id`` column, one entry per row.
            cat_features: Names of the categorical (integer-coded) columns.
            num_features: Names of the numeric columns.
        """
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # ranker_id -> list of row positions, in row order.
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of distinct groups."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        ``x_cat`` maps each categorical feature to an int64 tensor in which
        negative codes are clamped to 0.  ``x_num`` and ``y`` are float32
        tensors, and ``length`` is the number of rows in the group.
        """
        inds = self.group_to_indices[self.group_keys[idx]]
        length = len(inds)

        # One vectorised gather + clamp per feature instead of a Python-level
        # loop with two numpy scalar lookups per element.
        x_cat = {
            c: torch.as_tensor(np.maximum(self.X_cat[c][inds], 0), dtype=torch.long)
            for c in self.cat_features
        }

        x_num = torch.as_tensor(self.X_num[inds], dtype=torch.float32)
        y = torch.as_tensor(self.y[inds], dtype=torch.float32).reshape(-1)

        return x_cat, x_num, y, length

def collate_fn(batch):
    """Pad a list of groups to a common length and stack them.

    Groups with fewer than two items are dropped; if none remain, ``None``
    is returned.  Otherwise every tensor is right-padded with zeros up to
    the longest remaining group, and the result is
    ``(x_cat, x_num, y, lengths)`` with ``lengths`` as an int64 tensor of
    the unpadded sizes.
    """
    kept = [item for item in batch if item[3] >= 2]
    if not kept:
        return None

    lengths = [item[3] for item in kept]
    longest = max(lengths)
    feat_dim = kept[0][1].shape[1]
    cat_keys = list(kept[0][0].keys())

    cat_rows = {key: [] for key in cat_keys}
    num_rows = []
    label_rows = []

    for cats, nums, targets, size in kept:
        missing = longest - size
        if missing > 0:
            # Extend every tensor of a short group with zero padding.
            cats = {
                key: torch.cat([cats[key], torch.zeros(missing, dtype=torch.long)])
                for key in cat_keys
            }
            nums = torch.cat([nums, torch.zeros(missing, feat_dim)])
            targets = torch.cat([targets, torch.zeros(missing)])

        for key in cat_keys:
            cat_rows[key].append(cats[key])
        num_rows.append(nums)
        label_rows.append(targets)

    stacked_cat = {key: torch.stack(rows) for key, rows in cat_rows.items()}
    stacked_num = torch.stack(num_rows)
    stacked_y = torch.stack(label_rows)
    length_tensor = torch.tensor(lengths, dtype=torch.long)

    return stacked_cat, stacked_num, stacked_y, length_tensor


class FiBiDLRanker(nn.Module):
    """Group-wise ranker combining a linear term with a gated-embedding MLP.

    Each categorical field has an ``emb_dim``-wide embedding and a 1-wide
    linear embedding.  The field embeddings are optionally re-weighted by a
    squeeze/excitation gate, passed through a shared bilinear layer, and
    averaged over fields.  That vector is concatenated with the numeric
    features and fed to an MLP.  The MLP score and the linear score are
    summed.
    """

    def __init__(self, cat_dims, num_numeric_feats, emb_dim=64, hidden=[512, 256, 128, 64], reduction=4, use_senet=True):
        super().__init__()
        self.cat_fields = list(cat_dims.keys())
        self.emb_dim = emb_dim
        self.num_fields = len(self.cat_fields)
        self.use_senet = use_senet

        # Per-field embedding tables (index 0 is registered as padding).
        self.embeddings = nn.ModuleDict(
            {field: nn.Embedding(cat_dims[field], emb_dim, padding_idx=0) for field in self.cat_fields}
        )
        # First-order (linear) weights for categorical and numeric inputs.
        self.linear_cat = nn.ModuleDict(
            {field: nn.Embedding(cat_dims[field], 1, padding_idx=0) for field in self.cat_fields}
        )
        self.linear_num = nn.Linear(num_numeric_feats, 1)

        for table in [*self.embeddings.values(), *self.linear_cat.values()]:
            nn.init.xavier_uniform_(table.weight)
        nn.init.xavier_uniform_(self.linear_num.weight)

        # Squeeze/excitation gate over the flattened field embeddings.
        if self.use_senet:
            flat_width = self.num_fields * emb_dim
            squeezed_width = flat_width // reduction
            self.se_fc1 = nn.Linear(flat_width, squeezed_width)
            self.se_ln1 = nn.LayerNorm(squeezed_width)
            self.se_dropout = nn.Dropout(0.1)
            self.se_fc2 = nn.Linear(squeezed_width, flat_width)

        # One bilinear projection shared by all fields.
        self.bilinear = nn.Linear(emb_dim, emb_dim, bias=False)

        # Deep tower: Linear -> LayerNorm -> ReLU -> Dropout per hidden width.
        widths = [emb_dim + num_numeric_feats, *hidden]
        tower = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            tower += [nn.Linear(fan_in, fan_out), nn.LayerNorm(fan_out), nn.ReLU(), nn.Dropout(0.2)]
        self.mlp = nn.Sequential(*tower)
        self.output = nn.Linear(widths[-1], 1)

    def senet(self, embs):
        """Scale ``embs`` element-wise by a sigmoid gate computed from itself."""
        n_rows, n_fields, width = embs.shape
        gate = self.se_fc1(embs.view(n_rows, -1))
        gate = self.se_dropout(F_torch.relu(self.se_ln1(gate)))
        gate = torch.sigmoid(self.se_fc2(gate)).view(n_rows, n_fields, width)
        return embs * gate

    def bilinear_interaction(self, embs):
        """Multiply each field embedding by its projection and average over fields."""
        return (embs * self.bilinear(embs)).mean(dim=1)

    def forward(self, x_cat, x_num):
        """Score every item; the result has the first two dims of ``x_num``."""
        n_groups, group_len = x_num.shape[:2]
        n_rows = n_groups * group_len

        # Stack the per-field embeddings along a new field axis (dim 2).
        field_embs = [self.embeddings[field](x_cat[field]) for field in self.cat_fields]
        embs = torch.stack(field_embs, dim=2)

        # First-order term: summed categorical weights plus numeric projection.
        first_order_cat = torch.stack(
            [self.linear_cat[field](x_cat[field]).squeeze(-1) for field in self.cat_fields],
            dim=0,
        ).sum(dim=0)
        wide = first_order_cat + self.linear_num(x_num).squeeze(-1)

        # Interaction term, computed per item with groups flattened away.
        embs = embs.view(n_rows, self.num_fields, self.emb_dim)
        if self.use_senet:
            embs = self.senet(embs)
        crossed = self.bilinear_interaction(embs)

        # Deep term on the interaction vector plus the raw numeric features.
        deep_in = torch.cat([crossed, x_num.view(n_rows, -1)], dim=1)
        deep = self.output(self.mlp(deep_in)).view(n_groups, group_len)

        return deep + wide


"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Fraction of eligible groups with a positive label among the top-``k`` scores.

    Only the first ``lengths[i]`` entries of row ``i`` are considered, and
    groups shorter than ``min_group_size`` are ignored.  If no group is
    eligible, a zero tensor on ``scores.device`` is returned.
    """
    hit_total = 0
    n_eligible = 0

    for row, row_scores in enumerate(scores):
        size = lengths[row]
        if size >= min_group_size:
            top_n = min(size, k)
            group_scores = row_scores[:size]
            group_labels = labels[row][:size]
            best = torch.topk(group_scores, top_n).indices
            hit_total += (group_labels[best] > 0).any().float()
            n_eligible += 1

    if n_eligible > 0:
        return hit_total / n_eligible
    return torch.tensor(0.0, device=scores.device)


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@``k`` over groups with at least ``min_group_size`` items.

    The labels are used directly as gains with a ``log2(rank + 1)``
    discount.  A group whose ideal DCG is not positive contributes 0.0 but
    still counts.  If no group is eligible, a zero tensor on
    ``scores.device`` is returned.
    """
    ndcg_sum = 0.0
    n_eligible = 0

    for row in range(scores.size(0)):
        size = lengths[row]
        if size < min_group_size:
            continue

        group_scores = scores[row][:size]
        gains = labels[row][:size]

        cutoff = min(k, size)
        predicted = torch.topk(group_scores, cutoff).indices
        ideal = torch.topk(gains, cutoff).indices

        # Both rankings have the same length, so one discount vector serves both.
        discount = torch.log2(torch.arange(2, 2 + len(predicted), device=gains.device).float())
        dcg = (gains[predicted] / discount).sum()
        idcg = (gains[ideal] / discount).sum()

        if idcg > 0:
            ndcg_sum += dcg / idcg
        else:
            ndcg_sum += 0.0
        n_eligible += 1

    if n_eligible > 0:
        return ndcg_sum / n_eligible
    return torch.tensor(0.0, device=scores.device)


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision at ``k`` over all eligible groups.

    A group is eligible when it has at least ``min_group_size`` items.  Its
    average precision is the mean of precision@rank taken at the ranks (within
    the top ``k`` scores) that hold a positive label.  A group with no
    positive label in its top ``k`` has average precision 0 and still counts
    in the denominator.  Skipping such groups would average over successful
    groups only and overstate the metric.

    Args:
        scores: ``(batch, max_len)`` predicted scores.
        labels: ``(batch, max_len)`` relevance labels; ``> 0`` is a hit.
        lengths: Number of real (unpadded) items per row.
        k: Ranking cutoff.
        min_group_size: Smallest group size that is evaluated.

    Returns:
        A 0-dim tensor on ``scores.device``; zero when no group is eligible.
    """
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        l = lengths[i]
        if l < min_group_size:
            continue

        s = scores[i][:l]
        y = labels[i][:l]

        _, idx_pred = torch.topk(s, min(k, l))
        hits = (y[idx_pred] > 0).float()

        # Count the group before looking at its hits so a miss contributes 0.
        valid_count += 1
        n_hits = hits.sum()
        if n_hits == 0:
            continue

        # precision@j for every rank j, kept only where rank j is a hit.
        ranks = torch.arange(1, hits.numel() + 1, device=hits.device, dtype=hits.dtype)
        precision_at_rank = torch.cumsum(hits, dim=0) / ranks
        total_ap = total_ap + (precision_at_rank * hits).sum() / n_hits

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ap / valid_count



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise logistic loss summed over ordered pairs, averaged over groups.

    For every pair ``(i, j)`` inside a group where ``labels[i] > labels[j]``
    the term ``-logsigmoid(score_i - score_j)`` is added.  The total over
    all groups is divided by the number of rows in ``scores``.
    """
    n_groups = scores.size(0)
    loss_sum = 0.0

    for row in range(n_groups):
        size = lengths[row]
        group_scores = scores[row][:size]
        group_labels = labels[row][:size]

        # Entry [i, j] compares item i against item j.
        margins = group_scores.unsqueeze(1) - group_scores.unsqueeze(0)
        prefers = (group_labels.unsqueeze(1) > group_labels.unsqueeze(0)).float()

        pair_terms = -prefers * torch.nn.functional.logsigmoid(margins)
        loss_sum += pair_terms.sum()

    return loss_sum / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """
    Margin-based pairwise logistic loss restricted to pairs touching the
    label-wise top-K items.

    A pair ``(i, j)`` contributes when ``labels[i] > labels[j]`` and at
    least one of the two items is among the ``focus_topk`` highest labels
    of its group.  Each term is weighted by the absolute label difference,
    and the result is the weighted sum divided by the total weight.  Groups
    with fewer than two items are skipped.  When nothing contributes, a zero
    tensor with ``requires_grad=True`` is returned.
    """
    loss_sum = 0.0
    weight_sum = 0

    for row in range(scores.size(0)):
        size = lengths[row]
        if size < 2:
            continue

        group_scores = scores[row][:size]
        group_labels = labels[row][:size]

        # Flag the items holding the highest ground-truth labels.
        n_top = min(focus_topk, size)
        top_positions = torch.topk(group_labels, n_top).indices
        in_top = torch.zeros_like(group_labels, dtype=torch.bool)
        in_top[top_positions] = True

        # Entry [i, j] compares item i against item j.
        score_gap = group_scores.unsqueeze(1) - group_scores.unsqueeze(0)
        label_gap = group_labels.unsqueeze(1) - group_labels.unsqueeze(0)

        touches_top = in_top.unsqueeze(1) | in_top.unsqueeze(0)
        eligible = (label_gap > 0) & touches_top
        if eligible.sum() == 0:
            continue

        weights = (label_gap.abs() * eligible).float()

        # Require the preferred item to win by at least `margin`.
        if margin > 0.0:
            score_gap = score_gap - margin
        score_gap = score_gap / temperature

        loss_sum += (-F_torch.logsigmoid(score_gap) * weights).sum()
        weight_sum += weights.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Train for one pass over ``loader`` and return the mean loss per step.

    Batches that arrive as ``None`` are skipped.  Every other batch is moved
    to ``device``, optimised against ``xranknet_loss`` and followed by one
    ``optimizer`` and one ``scheduler`` step.  Returns ``float("inf")`` when
    no batch was processed.
    """
    model.train()
    loss_sum = 0.0
    steps_done = 0

    for batch in tqdm(loader, desc="Training", leave=False):
        if batch is not None:
            cat_inputs, num_inputs, targets, group_lengths = batch

            num_inputs = num_inputs.to(device)
            targets = targets.to(device)
            group_lengths = group_lengths.to(device)
            cat_inputs = {name: values.to(device) for name, values in cat_inputs.items()}

            optimizer.zero_grad()
            predictions = model(cat_inputs, num_inputs)
            step_loss = xranknet_loss(
                predictions,
                targets,
                group_lengths,
                temperature=1.0,
                margin=0.2,
                focus_topk=3,
            )
            step_loss.backward()
            optimizer.step()
            scheduler.step()

            loss_sum += step_loss.item()
            steps_done += 1

    if steps_done > 0:
        return loss_sum / steps_done
    return float("inf")



def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: Ranker called as ``model(x_cat, x_num)``.
        loader: Iterable of ``(x_cat, x_num, y, lengths)`` batches.  ``None``
            batches, which ``collate_fn`` returns when every group of a batch
            is too small, are skipped as in ``train_one_epoch``.
        device: Device the batch tensors are moved to.
        min_group_size: Smallest group size included in the ranking metrics.

    Returns:
        ``(loss, hitrate@3, ndcg@3, map@3)``.  The loss is averaged over all
        groups.  The ranking metrics are averaged over groups with at least
        ``min_group_size`` items.  If no batch was processed the loss is
        ``float("inf")``, and if no group was eligible the metrics are zero
        tensors.
    """
    model.eval()
    total_hitrate = 0
    total_ndcg = 0
    total_map = 0
    total_loss = 0
    loss_count = 0
    metric_count = 0

    with torch.no_grad():
        for batch in tqdm(loader, desc="Validating", leave=False):
            if batch is None:
                continue
            x_cat, x_num, y, lengths = batch

            x_num, y = x_num.to(device), y.to(device)
            # torch.tensor(<tensor>) copy-constructs and emits a UserWarning;
            # only build a tensor when lengths is a plain sequence.
            if not isinstance(lengths, torch.Tensor):
                lengths = torch.tensor(lengths)
            lengths = lengths.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            ndcg = ndcg_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            map3 = map_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)

            batch_size = scores.size(0)
            # The metric helpers ignore groups shorter than min_group_size,
            # so their batch means are weighted by the number of eligible
            # groups.  Weighting by batch_size would count ignored groups
            # as misses.
            eligible = int((lengths >= min_group_size).sum())
            total_hitrate += hitrate * eligible
            total_ndcg += ndcg * eligible
            total_map += map3 * eligible
            metric_count += eligible

            total_loss += loss.item() * batch_size
            loss_count += batch_size

    mean_loss = total_loss / loss_count if loss_count > 0 else float("inf")
    if metric_count == 0:
        return mean_loss, torch.tensor(0.0), torch.tensor(0.0), torch.tensor(0.0)

    return (
        mean_loss,
        total_hitrate / metric_count,
        total_ndcg / metric_count,
        total_map / metric_count,
    )


def build_cat_dims(X, cat_features):
    """Return the embedding-table size for every categorical feature.

    The size of a feature is ``max(column) + 1`` so that every code present
    in the column is a valid index. A column whose maximum is negative, or
    for which ``max()`` returns ``None`` (no non-null values), gets size 1.

    Args:
        X: Table indexable by column name; each column must expose ``max()``.
        cat_features: Names of the categorical columns.

    Returns:
        Dict mapping each feature name to its table size as a plain ``int``.
    """
    cat_dims = {}
    for c in cat_features:
        max_val = X[c].max()
        if max_val is None or max_val < 0:
            max_val = 0
        cat_dims[c] = int(max_val) + 1
    return cat_dims


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardise numeric columns using statistics of the training rows only.

    For each column the mean and standard deviation are taken over the rows
    selected by ``train_mask`` and then applied to every row, so rows outside
    the mask do not influence the scaling.

    Args:
        X: Polars DataFrame holding the columns.
        numeric_features: Names of the columns to standardise.
        train_mask: Boolean mask (NumPy-indexable) selecting the training rows.

    Returns:
        A DataFrame with the listed columns replaced by their standardised
        values. A column whose training std is not positive is only centred.
    """
    for col in numeric_features:
        values = X[col].to_numpy()
        train_vals = values[train_mask]
        mean = train_vals.mean()
        # Compute the std once: it is a full pass over the training slice.
        std = train_vals.std()
        if not std > 0:
            std = 1.0
        X = X.with_columns(pl.Series(col, (values - mean) / std))
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Load the feature-engineered table, building and caching it if needed.

    If ``data_dir/cache_file`` exists it is read directly. Otherwise the raw
    train and test parquet files are concatenated (test rows get a constant
    ``selected`` = 0 column), passed through ``feature_engineering``, have
    their numeric nulls filled with 0 and string nulls with ``"missing"``,
    and the result is written to the cache path.

    Args:
        data_dir: Directory containing the parquet files and the cache.
        train_file: File name of the raw training data.
        test_file: File name of the raw test data.
        cache_file: File name of the cached feature-engineered table.

    Returns:
        ``(df, test_ids, test_rankers)`` where the last two are the ``Id`` and
        ``ranker_id`` columns of the raw test file.
    """
    cache_path = os.path.join(data_dir, cache_file)
    test_path = os.path.join(data_dir, test_file)

    if os.path.exists(cache_path):
        print(f"Loading cached feature-engineered data from {cache_path} ...")
        df = pl.read_parquet(cache_path)

        # Only two columns of the raw test file are needed here, so avoid
        # loading (and then dropping parts of) the whole file.
        test = pl.read_parquet(test_path, columns=["Id", "ranker_id"])
        test_ids = test["Id"]
        test_rankers = test["ranker_id"]
    else:
        print("Cached file not found, processing raw data ...")

        train = pl.read_parquet(os.path.join(data_dir, train_file)).drop("__index_level_0__")
        test = (
            pl.read_parquet(test_path)
            .drop("__index_level_0__")
            .with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))
        )

        test_ids = test["Id"]
        test_rankers = test["ranker_id"]

        df = pl.concat((train, test))
        df = feature_engineering(df, full=True)

        df = df.with_columns(
            [pl.col(c).fill_null(0) for c in df.select(pl.selectors.numeric()).columns] +
            [pl.col(c).fill_null("missing") for c in df.select(pl.selectors.string()).columns]
        )

        os.makedirs(data_dir, exist_ok=True)
        df.write_parquet(cache_path)
        print(f"Saved feature-engineered data to {cache_path}")

    return df, test_ids, test_rankers


def prepare_data(data_dir, train_size=18145372):
    """Load the data and turn it into model-ready features.

    Steps: load (or build) the feature table, run ``feature_selection``,
    re-attach the ``Id`` column, integer-encode the categorical columns and
    standardise the numeric columns with statistics from the first
    ``train_size`` rows.

    Categorical encoding: each column is replaced by its dense rank minus one
    (so codes start at 0), nulls become -1, and the result is cast to Int32.

    Args:
        data_dir: Directory passed on to ``load_or_prepare_data``.
        train_size: Number of leading rows treated as training data. The
            default is the value that used to be hard-coded here.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size, test_ids,
        test_rankers)``.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    # NOTE(review): assumes feature_selection keeps the row order of df, so
    # the first X.shape[0] ids line up with the rows of X — confirm.
    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    X = X.with_columns(
        [
            (pl.col(c).rank("dense") - 1).fill_null(-1).cast(pl.Int32)
            for c in cat_features
        ]
    )

    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Patience-based stopping rule for a metric where larger is better.

    ``best`` holds the highest metric seen so far and ``counter`` the number
    of consecutive calls to ``step`` that failed to beat it by more than
    ``min_delta``.
    """

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience, self.min_delta = patience, min_delta
        self.counter = 0
        self.best = float("-inf")

    def step(self, metric):
        """Record ``metric``; return True once patience is exhausted."""
        improved = metric > self.best + self.min_delta
        if not improved:
            self.counter += 1
            return self.counter >= self.patience

        self.best = metric
        self.counter = 0
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scaled histogram of group sizes for ``dataset``.

    Group sizes are the lengths of the index lists stored in
    ``dataset.group_to_indices``. The figure is written to ``save_path`` when
    one is given, otherwise it is shown; it is closed afterwards either way.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.title("Distribution of Group Sizes")
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def _save_curve(epochs, curves, ylabel, title, save_path):
    """Plot per-epoch curves on one figure and write it to ``save_path``.

    ``curves`` is a list of ``(values, label, color)`` tuples; a ``color`` of
    ``None`` leaves the colour choice to matplotlib.
    """
    plt.figure()
    for values, label, color in curves:
        style = {} if color is None else {"color": color}
        plt.plot(epochs, values, label=label, **style)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.savefig(save_path)
    plt.close()


def _predict_group_scores(model, dataset, loader, device, desc, min_group_size=2):
    """Score ``dataset`` group by group and return one score per row, in row order.

    Scores are scattered back through the row indices stored in
    ``dataset.group_to_indices``, so the result lines up with the source
    table even when the rows of a group are not contiguous. Rows of groups
    that never reach the model keep a score of 0.0.

    Assumes ``loader`` iterates ``dataset`` in order (no shuffling) and that
    its collate function drops groups with fewer than ``min_group_size`` rows,
    yielding ``None`` for a batch left empty. A length mismatch between a
    scored group and the expected one raises ``RuntimeError``.
    """
    row_scores = np.zeros(len(dataset.groups), dtype=np.float32)
    expected_groups = (
        inds for inds in dataset.group_to_indices.values() if len(inds) >= min_group_size
    )

    model.eval()
    with torch.no_grad():
        for batch in tqdm(loader, desc=desc):
            if batch is None:
                continue
            x_cat, x_num, _, lengths = batch
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            batch_scores = model(x_cat, x_num).cpu().numpy()

            for group_scores, length in zip(batch_scores, (int(n) for n in lengths)):
                inds = next(expected_groups)
                if len(inds) != length:
                    raise RuntimeError(
                        f"Group size mismatch while predicting: expected {len(inds)}, got {length}"
                    )
                row_scores[inds] = group_scores[:length]

    return row_scores


def main():
    """Train the ranker once per learning rate and write predictions.

    The data is prepared a single time, because none of it depends on the
    learning rate. For every learning rate a fresh model is trained with
    early stopping on the validation loss, metric curves and full validation
    predictions are written to ``model_<run_name>/``, and a test submission
    is written to ``submission/``.
    """
    DATA_DIR = "./data"
    SUBMIT_DIR = "submission"
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    learning_rates_to_try = [7e-4, 5e-4, 3e-4]
    num_epochs = 10
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # ---- Data preparation (identical for every learning rate) ----
    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Replace NaNs in Float64 columns with 0.
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    if float_cols:
        X = X.with_columns([pl.col(c).fill_nan(0.0) for c in float_cols])

    # Rows [0, n1) train, [n1, n2) validate, [n2, end) test.
    # NOTE(review): n1 is a fixed row offset (17487300 was an earlier
    # choice); confirm it falls on a ranker_id boundary.
    n1 = 16487352
    n2 = train_size

    X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
    y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
    groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

    # Row positions of every validation group, computed once and written
    # into each run directory below.
    group_to_rows = defaultdict(list)
    for i, g in enumerate(groups_va["ranker_id"].to_numpy()):
        group_to_rows[g].append(i)
    group_info_rows = [
        (group_id, len(indices), indices[0], indices[-1])
        for group_id, indices in sorted(group_to_rows.items())
    ]

    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    num_training_steps = num_epochs * len(train_loader)
    num_warmup_steps = int(0.1 * num_training_steps)

    for lr in learning_rates_to_try:
        run_name = f"lr_{lr:.0e}".replace("-", "")
        MODEL_DIR = f"model_{run_name}"
        os.makedirs(MODEL_DIR, exist_ok=True)

        print(f"\n\n🚀 Starting training with learning rate = {lr}\nSaved to: {MODEL_DIR}")

        best_val_loss = float("inf")
        best_hitrate = 0.0
        saved_checkpoint = False
        model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

        train_losses = []
        val_losses = []
        val_hitrates = []
        val_ndcgs = []
        val_maps = []
        learning_rates = []

        early_stopper = EarlyStopping(patience=3, min_delta=0.00005)

        print("\n📋 Validation group statistics:")
        group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
        with open(group_info_path, "w", encoding="utf-8") as f:
            f.write(f"Total groups: {len(group_info_rows)}\n\n")
            f.write("GroupID\tSize\tStartIdx\tEndIdx\n")
            for group_id, size, start_idx, end_idx in group_info_rows:
                f.write(f"{group_id}\t{size}\t{start_idx}\t{end_idx}\n")
        print(f"📁 Saved validation group info to {group_info_path}")

        plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
        print("📊 Group size distribution saved.")

        model = FiBiDLRanker(
            cat_dims=cat_dims,
            num_numeric_feats=num_numeric_feats,
            emb_dim=64,
            hidden=[512, 256, 128, 64]
        ).to(device)

        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)

        scheduler = get_scheduler(
            "linear",
            optimizer=optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps
        )

        for epoch in range(num_epochs):
            print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
            start_time = time.time()

            train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
            val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

            train_losses.append(train_loss)
            val_losses.append(val_loss)
            val_hitrates.append(val_hitrate.item())
            val_ndcgs.append(val_ndcg.item())
            val_maps.append(val_map.item())

            current_lr = scheduler.get_last_lr()
            learning_rates.append(current_lr[0])

            elapsed = time.time() - start_time
            print(
                f"Epoch {epoch+1}: "
                f"Train Loss={train_loss:.4f}, "
                f"Val Loss={val_loss:.4f}, "
                f"HitRate@3={val_hitrate:.4f}, "
                f"NDCG@3={val_ndcg:.4f}, "
                f"MAP@3={val_map:.4f}, "
                f"LR={current_lr[0]:.6f} "
                f"({elapsed:.1f}s)"
            )

            # A checkpoint is written only when loss and hit rate improve together.
            if val_loss < best_val_loss and val_hitrate > best_hitrate:
                best_val_loss = val_loss
                best_hitrate = val_hitrate
                torch.save(model.state_dict(), model_path)
                saved_checkpoint = True
                print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

            # EarlyStopping maximises its metric, hence the negated loss.
            if early_stopper.step(-val_loss):
                print("⛔ Early stopping triggered.")
                break

        # ---- Metric plots ----
        epochs = list(range(1, len(train_losses) + 1))

        _save_curve(
            epochs,
            [(train_losses, "Train Loss", None), (val_losses, "Val Loss", None)],
            "Loss",
            "Training & Validation Loss",
            os.path.join(MODEL_DIR, "loss_curve.png"),
        )
        _save_curve(
            epochs,
            [(val_hitrates, "Val HitRate@3", "green")],
            "HitRate@3",
            "Validation HitRate@3",
            os.path.join(MODEL_DIR, "hitrate_curve.png"),
        )
        _save_curve(
            epochs,
            [(learning_rates, "Learning Rate", "orange")],
            "LR",
            "Learning Rate Schedule",
            os.path.join(MODEL_DIR, "lr_curve.png"),
        )
        _save_curve(
            epochs,
            [(val_ndcgs, "Val NDCG@3", "purple")],
            "NDCG@3",
            "Validation NDCG@3",
            os.path.join(MODEL_DIR, "ndcg_curve.png"),
        )
        _save_curve(
            epochs,
            [(val_maps, "Val MAP@3", "red")],
            "MAP@3",
            "Validation MAP@3",
            os.path.join(MODEL_DIR, "map_curve.png"),
        )

        print(f"📊 Saved metric plots to {MODEL_DIR}/")

        # ---- Predictions ----
        # Only load a checkpoint written by this run; a file left over from an
        # earlier invocation would silently belong to a different model.
        if saved_checkpoint:
            model.load_state_dict(torch.load(model_path, map_location=device))
        else:
            print("⚠️ No checkpoint was saved in this run; predicting with the final weights.")
        model.eval()

        all_val_scores = _predict_group_scores(
            model, val_dataset, val_loader, device, "Predicting validation set"
        )

        val_df = X_va.with_columns([
            pl.Series("Id", X_va["Id"]),
            pl.Series("ranker_id", groups_va["ranker_id"]),
            pl.Series("score", all_val_scores),
            pl.Series("label", y_va)
        ])

        # Rank 1 is the highest score within each ranker_id.
        val_df = val_df.with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )

        val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
        val_df.write_csv(val_save_path)
        print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

        all_scores = _predict_group_scores(
            model, test_dataset, test_loader, device, "Predicting test set"
        )

        submission_df = pl.DataFrame({
            "Id": test_ids,
            "ranker_id": test_rankers,
            "score": all_scores
        })

        submission = (
            submission_df
            .with_columns(
                pl.col("score")
                .rank(method="ordinal", descending=True)
                .over("ranker_id")
                .cast(pl.Int32)
                .alias("selected")
            )
            .select(["Id", "ranker_id", "score", "selected"])
        )

        # Keep one submission per learning rate; the shared file name is still
        # written as before and holds the most recent run.
        run_submission_path = os.path.join(SUBMIT_DIR, f"submission_dl_ranker_{run_name}.csv")
        submission.write_csv(run_submission_path)
        submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
        submission.write_csv(submission_path)
        print(f"✅ Submission saved to {submission_path} (per-run copy: {run_submission_path})")


# Script entry point: run the pipeline only when the file is executed directly.
if __name__ == "__main__":
    main()

Overwriting deeprec_validate.py


lr = 7e-4, emb_dim=64, hidden=[512, 256, 128]

In [None]:
--- Epoch 1/10 ---
Validating:   0%|          | 0/2636 [00:00<?, ?it/s]/home/ionut/test_whisper/test2/deeprec_validate.py:477: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  lengths = torch.tensor(lengths).to(device)
Epoch 1: Train Loss=0.3929, Val Loss=0.3008, HitRate@3=0.4760, NDCG@3=0.3810, MAP@3=0.6308, LR=0.000700 (734.3s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 2/10 ---
Epoch 2: Train Loss=0.2802, Val Loss=0.2804, HitRate@3=0.4886, NDCG@3=0.3959, MAP@3=0.6438, LR=0.000622 (735.4s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 3/10 ---
Epoch 3: Train Loss=0.2565, Val Loss=0.2692, HitRate@3=0.5046, NDCG@3=0.4088, MAP@3=0.6580, LR=0.000544 (734.8s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 4/10 ---
Epoch 4: Train Loss=0.2436, Val Loss=0.2642, HitRate@3=0.5121, NDCG@3=0.4132, MAP@3=0.6552, LR=0.000467 (734.4s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 5/10 ---
Epoch 5: Train Loss=0.2308, Val Loss=0.2568, HitRate@3=0.5065, NDCG@3=0.4089, MAP@3=0.6576, LR=0.000389 (734.8s)

--- Epoch 6/10 ---
Epoch 6: Train Loss=0.2172, Val Loss=0.2580, HitRate@3=0.5116, NDCG@3=0.4140, MAP@3=0.6602, LR=0.000311 (735.1s)

--- Epoch 7/10 ---
Epoch 7: Train Loss=0.2031, Val Loss=0.2569, HitRate@3=0.5158, NDCG@3=0.4193, MAP@3=0.6657, LR=0.000233 (734.9s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 8/10 ---
Epoch 8: Train Loss=0.1898, Val Loss=0.2544, HitRate@3=0.5179, NDCG@3=0.4227, MAP@3=0.6722, LR=0.000156 (735.6s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 9/10 ---
Epoch 9: Train Loss=0.1732, Val Loss=0.2569, HitRate@3=0.5188, NDCG@3=0.4229, MAP@3=0.6711, LR=0.000078 (735.9s)

--- Epoch 10/10 ---
Epoch 10: Train Loss=0.1575, Val Loss=0.2608, HitRate@3=0.5182, NDCG@3=0.4219, MAP@3=0.6718, LR=0.000000 (734.3s)

In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import torch.nn as nn
import torch.nn.functional as F_torch
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter
from sklearn.decomposition import PCA

class RankDataset(Dataset):
    """Dataset whose items are whole groups of rows sharing a ``ranker_id``.

    The feature table is converted to NumPy arrays once. ``group_to_indices``
    maps every ``ranker_id`` to the list of its row positions, and
    ``group_keys`` lists the ids in order of first appearance; item ``idx``
    is the group ``group_keys[idx]``.
    """

    def __init__(
        self,
        X: pl.DataFrame,
        y: pl.Series,
        groups: pl.DataFrame,
        cat_features: list[str],
        num_features: list[str],
    ):
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of distinct groups."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        ``x_cat`` maps each categorical feature to an int64 tensor in which
        negative codes are replaced by 0; ``x_num`` and ``y`` are float
        tensors; ``length`` is the number of rows in the group.
        """
        inds = self.group_to_indices[self.group_keys[idx]]

        # Vectorised clamp of negative codes to 0, replacing a per-element
        # Python loop that ran for every feature of every group.
        x_cat = {
            c: torch.from_numpy(np.maximum(self.X_cat[c][inds], 0).astype(np.int64))
            for c in self.cat_features
        }

        x_num = torch.FloatTensor(self.X_num[inds])
        y = torch.FloatTensor(self.y[inds]).reshape(-1)

        return x_cat, x_num, y, len(inds)

def collate_fn(batch):
    """Zero-pad a list of group samples into rectangular batch tensors.

    Each sample is ``(x_cat, x_num, y, length)``. Samples with fewer than two
    rows are discarded; if none remain, ``None`` is returned. The remaining
    samples are padded with zeros up to the longest group in the batch.

    Returns:
        ``(x_cat, x_num, y, lengths)`` where ``x_cat`` maps each key to a
        ``[batch, max_len]`` tensor, ``x_num`` is ``[batch, max_len, num_dim]``,
        ``y`` is ``[batch, max_len]`` and ``lengths`` is an int64 tensor with
        the unpadded group sizes.
    """
    kept = [sample for sample in batch if sample[3] >= 2]
    if not kept:
        return None

    group_lengths = [sample[3] for sample in kept]
    n_groups, widest = len(kept), max(group_lengths)
    first_cat, first_num, first_y, _ = kept[0]

    # Allocate the zero-filled outputs up front and copy each group in.
    cat_out = {
        key: first_cat[key].new_zeros((n_groups, widest)) for key in first_cat
    }
    num_out = first_num.new_zeros((n_groups, widest, first_num.shape[1]))
    y_out = first_y.new_zeros((n_groups, widest))

    for row, (x_cat, x_num, y, length) in enumerate(kept):
        for key, dest in cat_out.items():
            dest[row, :length] = x_cat[key]
        num_out[row, :length] = x_num
        y_out[row, :length] = y

    return cat_out, num_out, y_out, torch.tensor(group_lengths, dtype=torch.long)


class FiBiDLRanker(nn.Module):
    """Ranking model with a linear term, SENet-reweighted bilinear interactions and an MLP.

    Inputs are batches of groups: ``x_cat[f]`` has shape ``[B, G]`` for every
    categorical field ``f`` and ``x_num`` has shape ``[B, G, num_numeric_feats]``.
    The output is one score per item, shape ``[B, G]``.

    Args:
        cat_dims: Mapping from categorical field name to its table size.
        num_numeric_feats: Number of numeric features per item.
        emb_dim: Embedding width of every categorical field.
        hidden: Widths of the MLP layers, in order.
        reduction: Bottleneck divisor of the SENet block.
        use_senet: Whether embeddings are re-weighted by the SENet block.
    """

    def __init__(self, cat_dims, num_numeric_feats, emb_dim=64, hidden=(512, 256, 128, 64), reduction=4, use_senet=True):
        # ``hidden`` defaults to a tuple rather than a list so the default
        # object cannot be mutated between instantiations.
        super().__init__()
        self.cat_fields = list(cat_dims.keys())
        self.emb_dim = emb_dim
        self.num_fields = len(self.cat_fields)
        self.use_senet = use_senet

        # Embedding layers
        self.embeddings = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], emb_dim, padding_idx=0) for f in self.cat_fields
        })
        self.linear_cat = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], 1, padding_idx=0) for f in self.cat_fields
        })
        self.linear_num = nn.Linear(num_numeric_feats, 1)

        # NOTE(review): xavier_uniform_ re-initialises the whole weight
        # matrix, including the padding_idx=0 row — confirm a non-zero,
        # never-updated row 0 is intended.
        for emb in list(self.embeddings.values()) + list(self.linear_cat.values()):
            nn.init.xavier_uniform_(emb.weight)
        nn.init.xavier_uniform_(self.linear_num.weight)

        # SENet (with LayerNorm)
        if self.use_senet:
            self.se_fc1 = nn.Linear(self.num_fields * emb_dim, self.num_fields * emb_dim // reduction)
            self.se_ln1 = nn.LayerNorm(self.num_fields * emb_dim // reduction)
            self.se_dropout = nn.Dropout(0.1)
            self.se_fc2 = nn.Linear(self.num_fields * emb_dim // reduction, self.num_fields * emb_dim)

        # Bilinear layer (shared across fields)
        self.bilinear = nn.Linear(emb_dim, emb_dim, bias=False)

        # MLP: Linear -> LayerNorm -> ReLU -> Dropout per hidden width
        input_dim = emb_dim + num_numeric_feats
        layers = []
        for h in hidden:
            layers.append(nn.Linear(input_dim, h))
            layers.append(nn.LayerNorm(h))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.2))
            input_dim = h
        self.mlp = nn.Sequential(*layers)
        self.output = nn.Linear(input_dim, 1)

    def senet(self, embs):
        """Re-weight ``embs`` ([B, F, D]) element-wise with sigmoid gates."""
        B, F, D = embs.shape
        z = embs.view(B, -1)  # flatten to [B, F*D]
        a = self.se_fc1(z)
        a = self.se_ln1(a)
        a = F_torch.relu(a)
        a = self.se_dropout(a)
        s = torch.sigmoid(self.se_fc2(a)).view(B, F, D)
        return embs * s

    def bilinear_interaction(self, embs):
        """Return the field-averaged product of ``embs`` with its bilinear projection."""
        transformed = self.bilinear(embs)
        interaction = (embs * transformed).mean(dim=1)
        return interaction

    def forward(self, x_cat, x_num):
        """Score every item of every group; returns a ``[B, G]`` tensor."""
        B, G = x_num.shape[:2]
        F, D = self.num_fields, self.emb_dim

        # Embeddings [B, G, F, D]
        embs = torch.stack([self.embeddings[f](x_cat[f]) for f in self.cat_fields], dim=2)

        # Linear term: per-field scalar embeddings plus a linear map of the numerics.
        linear_cat_sum = torch.stack([
            self.linear_cat[f](x_cat[f]).squeeze(-1) for f in self.cat_fields
        ], dim=0).sum(dim=0)
        linear_num_out = self.linear_num(x_num).squeeze(-1)
        linear_out = linear_cat_sum + linear_num_out  # [B, G]

        # Feature interactions, computed per item (groups flattened).
        embs = embs.view(B * G, F, D)
        if self.use_senet:
            embs = self.senet(embs)
        bi = self.bilinear_interaction(embs)  # [B*G, D]

        # Deep part
        x_num_flat = x_num.view(B * G, -1)
        deep_input = torch.cat([bi, x_num_flat], dim=1)
        deep_out = self.mlp(deep_input)
        scores = self.output(deep_out).view(B, G)

        return scores + linear_out


"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Share of groups that have a positive label among their top-``k`` scores.

    Only the first ``lengths[i]`` entries of row ``i`` are considered, and
    groups shorter than ``min_group_size`` are ignored. Returns a zero tensor
    on ``scores.device`` when no group qualifies.
    """
    hit_total = 0
    groups_seen = 0

    for row in range(scores.size(0)):
        n_items = lengths[row]
        if n_items < min_group_size:
            continue

        group_scores = scores[row][:n_items]
        group_labels = labels[row][:n_items]
        top_indices = torch.topk(group_scores, min(n_items, k)).indices
        hit_total += (group_labels[top_indices] > 0).any().float()
        groups_seen += 1

    if groups_seen > 0:
        return hit_total / groups_seen
    return torch.tensor(0.0, device=scores.device)


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@``k`` over the groups with at least ``min_group_size`` items.

    For each qualifying group the DCG of the top-``k`` items by score is
    divided by the DCG of the top-``k`` items by label; the label itself is
    the gain and the discount at rank ``r`` (1-based) is ``log2(r + 1)``.
    A group whose ideal DCG is 0 contributes 0 but is still counted.

    Returns:
        Always a tensor on ``scores.device`` (zero when no group qualifies).
        The accumulator is a tensor so the result never degrades to a plain
        Python float, which would break callers using ``.item()``.
    """
    total_ndcg = torch.tensor(0.0, device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        l = int(lengths[i])
        if l < min_group_size:
            continue

        s = scores[i][:l]
        y = labels[i][:l]

        top = min(k, l)
        _, idx_pred = torch.topk(s, top)
        _, idx_ideal = torch.topk(y, top)

        # Both rankings have the same length, so one discount vector serves both.
        discounts = torch.log2(torch.arange(2, 2 + top, device=y.device).float())
        dcg = (y[idx_pred] / discounts).sum()
        idcg = (y[idx_ideal] / discounts).sum()

        if idcg > 0:
            total_ndcg = total_ndcg + dcg / idcg
        valid_count += 1

    if valid_count == 0:
        return total_ndcg
    return total_ndcg / valid_count


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision at ``k`` over groups with at least ``min_group_size`` items.

    The average precision of a group is the mean of precision@rank over the
    ranks (within the top ``k`` by score) that hold a positive label. A group
    with no positive label in its top ``k`` has average precision 0 and is
    included in the mean; leaving such groups out would inflate the metric
    above the hit rate.

    Returns:
        A tensor on ``scores.device`` (zero when no group qualifies).
    """
    total_ap = torch.tensor(0.0, device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        l = int(lengths[i])
        if l < min_group_size:
            continue

        s = scores[i][:l]
        y = labels[i][:l]

        _, idx_pred = torch.topk(s, min(k, l))
        hits = (y[idx_pred] > 0).float()
        valid_count += 1

        n_hits = hits.sum()
        if n_hits == 0:
            continue  # contributes an average precision of 0

        ranks = torch.arange(1, hits.numel() + 1, device=hits.device, dtype=hits.dtype)
        precision_at_rank = hits.cumsum(0) / ranks
        # Average precision@rank over the ranks that are hits.
        total_ap = total_ap + (precision_at_rank * hits).sum() / n_hits

    if valid_count == 0:
        return total_ap
    return total_ap / valid_count



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise logistic ranking loss, summed per group and averaged over the batch.

    For every ordered pair ``(a, b)`` inside a group with
    ``label[a] > label[b]`` the term ``-logsigmoid(score[a] - score[b])`` is
    added. Only the first ``lengths[i]`` entries of row ``i`` are used. The
    summed loss is divided by the number of rows in ``scores``.
    """
    n_groups = scores.size(0)
    accumulated = 0.0

    for row in range(n_groups):
        n_items = lengths[row]
        group_scores = scores[row][:n_items]
        group_labels = labels[row][:n_items]

        # Entry [a, b] compares item a against item b.
        score_gap = group_scores.unsqueeze(1) - group_scores.unsqueeze(0)
        a_beats_b = (group_labels.unsqueeze(1) > group_labels.unsqueeze(0)).float()

        per_pair = -a_beats_b * torch.nn.functional.logsigmoid(score_gap)
        accumulated += per_pair.sum()

    return accumulated / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """
    Pairwise logistic loss restricted to pairs that involve a top-K item,
    where the top-K items are chosen by ground-truth label (not by score).

    A pair (a, b) of a group contributes when label[a] > label[b] and at
    least one of the two is among the ``focus_topk`` highest-labelled items.
    Its term is ``-logsigmoid((score[a] - score[b] - margin) / temperature)``
    weighted by ``|label[a] - label[b]|``; the margin is subtracted only when
    it is positive. Groups with fewer than two items, or without any
    contributing pair, are skipped.

    Returns the weighted loss sum divided by the weight sum, or a zero tensor
    with ``requires_grad=True`` when nothing contributed.
    """
    loss_sum = 0.0
    weight_sum = 0

    for row in range(scores.size(0)):
        n_items = lengths[row]
        if n_items < 2:
            continue

        group_scores = scores[row][:n_items]
        group_labels = labels[row][:n_items]

        # Mark the items with the highest ground-truth labels.
        _, top_idx = torch.topk(group_labels, min(focus_topk, n_items))
        in_topk = torch.zeros_like(group_labels, dtype=torch.bool)
        in_topk[top_idx] = True

        # Entry [a, b] compares item a against item b.
        score_gap = group_scores.unsqueeze(1) - group_scores.unsqueeze(0)
        label_gap = group_labels.unsqueeze(1) - group_labels.unsqueeze(0)

        touches_topk = in_topk.unsqueeze(1) | in_topk.unsqueeze(0)
        selected = (label_gap > 0) & touches_topk
        if selected.sum() == 0:
            continue

        weights = (label_gap.abs() * selected).float()

        if margin > 0.0:
            score_gap = score_gap - margin
        score_gap = score_gap / temperature

        loss_sum += (-F_torch.logsigmoid(score_gap) * weights).sum()
        weight_sum += weights.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Batches that the loader yields as ``None`` are skipped. Every other batch
    is moved to ``device``, scored by ``model`` and optimised against
    ``xranknet_loss``. The scheduler is stepped once per batch, right after
    the optimizer.

    Returns:
        The mean of the per-batch loss values, or ``float("inf")`` when no
        batch was processed.
    """
    model.train()
    running_loss, processed = 0.0, 0

    for batch in tqdm(loader, desc="Training", leave=False):
        if batch is None:
            continue

        cat_inputs, num_inputs, targets, group_lengths = batch
        cat_inputs = {name: tensor.to(device) for name, tensor in cat_inputs.items()}
        num_inputs = num_inputs.to(device)
        targets = targets.to(device)
        group_lengths = group_lengths.to(device)

        optimizer.zero_grad()
        predictions = model(cat_inputs, num_inputs)
        loss = xranknet_loss(
            predictions,
            targets,
            group_lengths,
            temperature=1.0,
            margin=0.2,
            focus_topk=3,
        )
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()
        processed += 1

    if processed > 0:
        return running_loss / processed
    return float("inf")



def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    For every batch the loss and the HitRate@3 / NDCG@3 / MAP@3 metrics are
    computed and accumulated, each weighted by the number of groups in the
    batch (``scores.size(0)``).

    Args:
        model: Ranking model called as ``model(x_cat, x_num)``.
        loader: Iterable of ``(x_cat, x_num, y, lengths)`` batches. A batch
            may be ``None`` (as the training loop also allows); such batches
            are skipped instead of crashing on unpacking.
        device: Device the batch tensors are moved to.
        min_group_size: Forwarded to the metric helpers.

    Returns:
        ``(loss, hitrate, ndcg, map)``. When no batch could be evaluated the
        loss is ``float("inf")`` (mirroring ``train_one_epoch``) and the
        metrics are zero tensors, so callers can still call ``.item()``.
    """
    model.eval()
    total_hitrate = 0
    total_ndcg = 0
    total_map = 0
    total_loss = 0
    count = 0

    with torch.no_grad():
        for batch in tqdm(loader, desc="Validating", leave=False):
            if batch is None:
                continue

            x_cat, x_num, y, lengths = batch
            x_num, y = x_num.to(device), y.to(device)
            # as_tensor passes an existing tensor through unchanged, which
            # avoids the copy-construct UserWarning raised by torch.tensor().
            lengths = torch.as_tensor(lengths).to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            ndcg = ndcg_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            map3 = map_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)

            # NOTE(review): the weight is the full batch size, even if the
            # metric helpers ignored some groups of the batch — confirm this
            # weighting is intended.
            batch_size = scores.size(0)
            total_hitrate += hitrate * batch_size
            total_ndcg += ndcg * batch_size
            total_map += map3 * batch_size
            total_loss += loss.item() * batch_size
            count += batch_size

    if count == 0:
        zero = torch.tensor(0.0, device=device)
        return float("inf"), zero, zero, zero

    return total_loss / count, total_hitrate / count, total_ndcg / count, total_map / count


def build_cat_dims(X, cat_features):
    """Return the embedding-table size for every categorical feature.

    The size of a feature is ``max(column) + 1`` so that every code present
    in the column is a valid index. A column whose maximum is negative, or
    for which ``max()`` returns ``None`` (no non-null values), gets size 1.

    Args:
        X: Table indexable by column name; each column must expose ``max()``.
        cat_features: Names of the categorical columns.

    Returns:
        Dict mapping each feature name to its table size as a plain ``int``.
    """
    cat_dims = {}
    for c in cat_features:
        max_val = X[c].max()
        if max_val is None or max_val < 0:
            max_val = 0
        cat_dims[c] = int(max_val) + 1
    return cat_dims


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardise numeric columns using statistics of the training rows only.

    For each column the mean and standard deviation are taken over the rows
    selected by ``train_mask`` and then applied to every row, so rows outside
    the mask do not influence the scaling.

    Args:
        X: Polars DataFrame holding the columns.
        numeric_features: Names of the columns to standardise.
        train_mask: Boolean mask (NumPy-indexable) selecting the training rows.

    Returns:
        A DataFrame with the listed columns replaced by their standardised
        values. A column whose training std is not positive is only centred.
    """
    for col in numeric_features:
        values = X[col].to_numpy()
        train_vals = values[train_mask]
        mean = train_vals.mean()
        # Compute the std once: it is a full pass over the training slice.
        std = train_vals.std()
        if not std > 0:
            std = 1.0
        X = X.with_columns(pl.Series(col, (values - mean) / std))
    return X

def apply_pca(X, numeric_features, train_mask, explained_variance=0.975):
    """Replace the numeric columns of ``X`` by their PCA projection.

    The PCA is fitted on the rows selected by ``train_mask`` and then applied
    to every row. ``explained_variance`` is passed to ``PCA`` as
    ``n_components``.

    Returns:
        ``(X, pca_feature_names)`` where ``X`` has the original numeric
        columns dropped and ``pca_0 .. pca_{k-1}`` appended.
    """
    matrix = X[numeric_features].to_numpy()

    reducer = PCA(n_components=explained_variance, svd_solver='full')
    reducer.fit(matrix[train_mask])
    projected = reducer.transform(matrix)
    n_components = projected.shape[1]

    print(f"📐 PCA: {len(numeric_features)} features → {n_components} components (explained variance ≥ {explained_variance})")

    pca_feature_names = [f'pca_{i}' for i in range(n_components)]
    components = pl.DataFrame(projected, schema=pca_feature_names)

    remaining = X.drop(numeric_features)
    X = pl.concat([remaining, components], how="horizontal")

    return X, pca_feature_names

def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Load the feature-engineered table, building and caching it if needed.

    If ``cache_file`` exists in ``data_dir`` it is read directly. Otherwise
    the raw train and test files are concatenated (test rows get
    ``selected = 0``), run through ``feature_engineering``, null-filled
    (0 for numeric columns, ``"missing"`` for string columns) and written to
    the cache.

    Returns:
        ``(df, test_ids, test_rankers)`` where the last two are the ``Id`` and
        ``ranker_id`` columns of the raw test file.
    """

    def read_raw(file_name):
        # "__index_level_0__" is dropped from every raw file
        # (presumably a leftover pandas index column).
        return pl.read_parquet(os.path.join(data_dir, file_name)).drop("__index_level_0__")

    cache_path = os.path.join(data_dir, cache_file)

    if not os.path.exists(cache_path):
        print("Cached file not found, processing raw data ...")

        train = read_raw(train_file)
        test = read_raw(test_file).with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))

        df = feature_engineering(pl.concat((train, test)), full=True)

        numeric_cols = df.select(pl.selectors.numeric()).columns
        string_cols = df.select(pl.selectors.string()).columns
        fills = [pl.col(c).fill_null(0) for c in numeric_cols]
        fills += [pl.col(c).fill_null("missing") for c in string_cols]
        df = df.with_columns(fills)

        os.makedirs(data_dir, exist_ok=True)
        df.write_parquet(cache_path)
        print(f"Saved feature-engineered data to {cache_path}")
    else:
        print(f"Loading cached feature-engineered data from {cache_path} ...")
        df = pl.read_parquet(cache_path)
        test = read_raw(test_file)

    return df, test["Id"], test["ranker_id"]


def prepare_data(data_dir, train_size=18145372):
    """Build the model-ready feature table from the data in ``data_dir``.

    Steps: load (or build) the feature-engineered table, run
    ``feature_selection``, re-attach the ``Id`` column, dense-rank encode the
    categorical columns (nulls become -1), zero-fill NaNs in the numeric
    columns, standardise them with train-row statistics and replace them by
    their PCA projection.

    Args:
        data_dir: Directory holding the raw parquet files and the cache.
        train_size: Number of leading rows that belong to the training set;
            only these rows are used to fit the scaler and the PCA. Defaults
            to the value that was previously hard-coded.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size, test_ids,
        test_rankers)`` where ``num_features`` lists the PCA column names.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    # Dense rank gives contiguous codes starting at 0; nulls are marked -1.
    X = X.with_columns(
        [
            (pl.col(c).rank("dense") - 1).fill_null(-1).cast(pl.Int32)
            for c in cat_features
        ]
    )

    train_mask = np.arange(X.height) < train_size

    # NaNs would poison the mean/std and the PCA fit, so clear them first.
    for col in num_features:
        nans = X.select(pl.col(col).is_nan().sum()).item()
        if nans > 0:
            print(f"⚠️ Found {nans} NaNs in column '{col}', filling with 0.")
            X = X.with_columns(
                pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
            )

    X = normalize_numeric_features(X, num_features, train_mask)
    X, pca_features = apply_pca(X, num_features, train_mask, explained_variance=0.975)
    num_features = pca_features

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers



class EarlyStopping:
    """Signal a stop after ``patience`` consecutive steps without improvement.

    The tracked metric is maximised: a step counts as an improvement only
    when it beats the best value seen so far by more than ``min_delta``.
    """

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric`` and return True once patience is exhausted."""
        threshold = self.best + self.min_delta
        if metric > threshold:
            # New best: remember it and restart the patience window.
            self.best, self.counter = metric, 0
        else:
            self.counter += 1
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scaled histogram of the group sizes in ``dataset``.

    Group sizes are the lengths of the values of
    ``dataset.group_to_indices``. The figure is written to ``save_path`` when
    one is given, otherwise it is shown interactively; it is closed either way.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Distribution of Group Sizes")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def _save_curve(epochs, series, ylabel, title, path):
    """Plot per-epoch series and save the figure to ``path``.

    ``series`` is a list of ``(values, label, color)`` tuples; ``color`` may
    be ``None`` to use matplotlib's default colour cycle.
    """
    plt.figure()
    for values, label, color in series:
        if color is None:
            plt.plot(epochs, values, label=label)
        else:
            plt.plot(epochs, values, label=label, color=color)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.savefig(path)
    plt.close()


def _write_group_info(group_to_rows, path):
    """Write one line per group (id, size, first row, last row) to ``path``."""
    with open(path, "w") as f:
        f.write(f"Total groups: {len(group_to_rows)}\n\n")
        f.write("GroupID\tSize\tStartIdx\tEndIdx\n")
        for group_id, indices in sorted(group_to_rows.items()):
            f.write(f"{group_id}\t{len(indices)}\t{indices[0]}\t{indices[-1]}\n")


def _predict_scores(model, loader, device, n_rows, desc):
    """Score every row served by ``loader`` and return them in row order.

    Scores are scattered back to the row indices stored in
    ``loader.dataset.group_to_indices``, so the result lines up with the
    source rows even when a group's rows are not contiguous. Rows of groups
    that the collate function dropped (and whole ``None`` batches) keep the
    default score 0.0 instead of shifting every later score.

    Requires a loader that iterates the dataset sequentially
    (``shuffle=False``).
    """
    group_rows = list(loader.dataset.group_to_indices.values())
    batch_size = loader.batch_size
    out = np.zeros(n_rows, dtype=np.float32)

    with torch.no_grad():
        for batch_no, batch in enumerate(tqdm(loader, desc=desc)):
            if batch is None:
                continue

            x_cat, x_num, _, lengths = batch
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            batch_scores = model(x_cat, x_num).cpu().numpy()

            # Groups this batch was drawn from, in order. The collate step
            # may have removed some of them, so match the survivors by length.
            candidates = iter(group_rows[batch_no * batch_size:(batch_no + 1) * batch_size])
            for i, length in enumerate(lengths):
                length = int(length)
                rows = next((r for r in candidates if len(r) == length), None)
                if rows is None:
                    raise RuntimeError(
                        f"Could not match batch {batch_no} back to its dataset groups"
                    )
                out[rows] = batch_scores[i, :length]

    return out


def main():
    """Train one FiBiDLRanker per learning rate and export its predictions.

    Data is prepared once and shared by all runs. Each run writes its
    checkpoint, validation-group report, metric plots and validation
    predictions to ``model_<run_name>/`` and its submission to
    ``submission/``.
    """
    DATA_DIR = "./data"
    SUBMIT_DIR = "submission"
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    learning_rates_to_try = [1e-3, 7e-4, 5e-4]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Data preparation is independent of the learning rate, so it is done
    # once here instead of once per run.
    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Fill missing values
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    if float_cols:
        X = X.with_columns(
            [
                pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
                for col in float_cols
            ]
        )

    n1 = 16487352  # alternative split that was tried: 17487300
    n2 = train_size

    X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
    y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
    groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    for lr in learning_rates_to_try:
        run_name = f"lr_{lr:.0e}".replace("-", "")
        MODEL_DIR = f"model_{run_name}"
        os.makedirs(MODEL_DIR, exist_ok=True)

        print(f"\n\n🚀 Starting training with learning rate = {lr}\nSaved to: {MODEL_DIR}")

        model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

        # Inspect group sizes in validation set
        print("\n📋 Validation group statistics:")
        group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
        _write_group_info(val_dataset.group_to_indices, group_info_path)
        print(f"📁 Saved validation group info to {group_info_path}")

        plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
        print("📊 Group size distribution saved.")

        train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
        val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
        test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

        print("✅ Data loaders created.")

        num_epochs = 10
        num_training_steps = num_epochs * len(train_loader)
        num_warmup_steps = int(0.1 * num_training_steps)

        model = FiBiDLRanker(
            cat_dims=cat_dims,
            num_numeric_feats=num_numeric_feats,
            emb_dim=32,
            hidden=[512, 256, 128, 64]
        ).to(device)

        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)

        scheduler = get_scheduler(
            "linear",
            optimizer=optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps
        )

        early_stopper = EarlyStopping(patience=3, min_delta=0.00005)

        train_losses = []
        val_losses = []
        val_hitrates = []
        val_ndcgs = []
        val_maps = []
        learning_rates = []

        best_val_loss = float("inf")
        best_hitrate = 0.0
        checkpoint_saved = False

        for epoch in range(num_epochs):
            print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
            start_time = time.time()

            train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
            val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

            # The metrics may arrive as 0-dim tensors or plain floats;
            # float() handles both, unlike .item().
            val_hitrate = float(val_hitrate)
            val_ndcg = float(val_ndcg)
            val_map = float(val_map)

            train_losses.append(train_loss)
            val_losses.append(val_loss)
            val_hitrates.append(val_hitrate)
            val_ndcgs.append(val_ndcg)
            val_maps.append(val_map)

            current_lr = scheduler.get_last_lr()
            learning_rates.append(current_lr[0])

            elapsed = time.time() - start_time
            print(
                f"Epoch {epoch+1}: "
                f"Train Loss={train_loss:.4f}, "
                f"Val Loss={val_loss:.4f}, "
                f"HitRate@3={val_hitrate:.4f}, "
                f"NDCG@3={val_ndcg:.4f}, "
                f"MAP@3={val_map:.4f}, "
                f"LR={current_lr[0]:.6f} "
                f"({elapsed:.1f}s)"
            )

            # A checkpoint is written only when BOTH criteria improve.
            if val_loss < best_val_loss and val_hitrate > best_hitrate:
                best_val_loss = val_loss
                best_hitrate = val_hitrate
                torch.save(model.state_dict(), model_path)
                checkpoint_saved = True
                print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

            # EarlyStopping maximises its metric, hence the negated loss.
            if early_stopper.step(-val_loss):
                print("⛔ Early stopping triggered.")
                break

        # Plot metrics
        epochs = list(range(1, len(train_losses) + 1))

        _save_curve(
            epochs,
            [(train_losses, "Train Loss", None), (val_losses, "Val Loss", None)],
            "Loss",
            "Training & Validation Loss",
            os.path.join(MODEL_DIR, "loss_curve.png"),
        )
        _save_curve(
            epochs,
            [(val_hitrates, "Val HitRate@3", "green")],
            "HitRate@3",
            "Validation HitRate@3",
            os.path.join(MODEL_DIR, "hitrate_curve.png"),
        )
        _save_curve(
            epochs,
            [(learning_rates, "Learning Rate", "orange")],
            "LR",
            "Learning Rate Schedule",
            os.path.join(MODEL_DIR, "lr_curve.png"),
        )
        _save_curve(
            epochs,
            [(val_ndcgs, "Val NDCG@3", "purple")],
            "NDCG@3",
            "Validation NDCG@3",
            os.path.join(MODEL_DIR, "ndcg_curve.png"),
        )
        _save_curve(
            epochs,
            [(val_maps, "Val MAP@3", "red")],
            "MAP@3",
            "Validation MAP@3",
            os.path.join(MODEL_DIR, "map_curve.png"),
        )

        print(f"📊 Saved metric plots to {MODEL_DIR}/")

        # Restore the best checkpoint of THIS run. A file left over from an
        # earlier execution must not be loaded by accident.
        if checkpoint_saved:
            model.load_state_dict(torch.load(model_path, map_location=device))
        else:
            print("⚠️ No checkpoint was saved in this run; predicting with the final-epoch weights.")
        model.eval()

        val_ids = X_va["Id"]
        val_rankers = groups_va["ranker_id"]
        val_labels = y_va

        all_val_scores = _predict_scores(
            model, val_loader, device, X_va.shape[0], "Predicting validation set"
        )

        val_df = X_va.with_columns([
            pl.Series("Id", val_ids),
            pl.Series("ranker_id", val_rankers),
            pl.Series("score", all_val_scores),
            pl.Series("label", val_labels)
        ])

        val_df = val_df.with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )

        val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
        val_df.write_csv(val_save_path)
        print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

        # Predict test set
        all_scores = _predict_scores(
            model, test_loader, device, X_te.shape[0], "Predicting test set"
        )

        submission_df = pl.DataFrame({
            "Id": test_ids,
            "ranker_id": test_rankers,
            "score": all_scores
        })

        submission = (
            submission_df
            .with_columns(
                pl.col("score")
                .rank(method="ordinal", descending=True)
                .over("ranker_id")
                .cast(pl.Int32)
                .alias("selected")
            )
            .select(["Id", "ranker_id", "score", "selected"])
        )

        # One file per learning rate, so later runs no longer erase the
        # results of earlier ones.
        run_submission_path = os.path.join(SUBMIT_DIR, f"submission_dl_ranker_{run_name}.csv")
        submission.write_csv(run_submission_path)
        print(f"✅ Submission saved to {run_submission_path}")

        # The legacy path is still refreshed on every run, so it ends up
        # holding the last learning rate's predictions exactly as before.
        submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
        submission.write_csv(submission_path)
        print(f"✅ Submission saved to {submission_path}")


if __name__ == "__main__":
    main()

Overwriting deeprec_validate.py


In [None]:
--- Epoch 1/10 ---
Validating:   0%|          | 0/2636 [00:00<?, ?it/s]/home/ionut/test_whisper/test2/deeprec_validate.py:477: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  lengths = torch.tensor(lengths).to(device)
Epoch 1: Train Loss=0.3914, Val Loss=0.3045, HitRate@3=0.4658, NDCG@3=0.3745, MAP@3=0.6268, LR=0.001000 (676.1s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 2/10 ---
Epoch 2: Train Loss=0.2728, Val Loss=0.2828, HitRate@3=0.4873, NDCG@3=0.3919, MAP@3=0.6403, LR=0.000889 (677.7s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 3/10 ---
Epoch 3: Train Loss=0.2507, Val Loss=0.2729, HitRate@3=0.4975, NDCG@3=0.3989, MAP@3=0.6378, LR=0.000778 (678.2s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 4/10 ---
Epoch 4: Train Loss=0.2382, Val Loss=0.2738, HitRate@3=0.4914, NDCG@3=0.3931, MAP@3=0.6366, LR=0.000667 (678.7s)

--- Epoch 5/10 ---
Epoch 5: Train Loss=0.2266, Val Loss=0.2711, HitRate@3=0.5003, NDCG@3=0.4040, MAP@3=0.6473, LR=0.000556 (677.3s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 6/10 ---
Epoch 6: Train Loss=0.2158, Val Loss=0.2698, HitRate@3=0.5047, NDCG@3=0.4069, MAP@3=0.6491, LR=0.000444 (677.0s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 7/10 ---
Epoch 7: Train Loss=0.2054, Val Loss=0.2748, HitRate@3=0.5084, NDCG@3=0.4101, MAP@3=0.6543, LR=0.000333 (677.3s)

--- Epoch 8/10 ---
Epoch 8: Train Loss=0.1920, Val Loss=0.2720, HitRate@3=0.5144, NDCG@3=0.4137, MAP@3=0.6528, LR=0.000222 (675.0s)

--- Epoch 9/10 ---
Epoch 9: Train Loss=0.1792, Val Loss=0.2725, HitRate@3=0.5120, NDCG@3=0.4133, MAP@3=0.6567, LR=0.000111 (674.7s)
⛔ Early stopping triggered.

In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import torch.nn as nn
import torch.nn.functional as F_torch
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter

class RankDataset(Dataset):
    """Dataset that serves one ranking group (one ``ranker_id``) per item.

    Groups are numbered in order of first appearance of their ``ranker_id``;
    the rows of a group keep their original relative order.
    """

    def __init__(
        self,
        X: pl.DataFrame,
        y: pl.Series,
        groups: pl.DataFrame,
        cat_features: list[str],
        num_features: list[str],
    ):
        """Materialise the features as NumPy arrays and index rows by group.

        Args:
            X: Feature table holding every categorical and numeric column.
            y: Per-row labels.
            groups: Table with a ``ranker_id`` column aligned with ``X``.
            cat_features: Names of the categorical columns.
            num_features: Names of the numeric columns.
        """
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # ranker_id -> row positions, in order of first appearance.
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of groups."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        ``x_cat`` maps each categorical feature to an int64 tensor in which
        negative codes are clamped to 0; ``x_num`` and ``y`` are float
        tensors; ``length`` is the number of rows in the group.
        """
        inds = self.group_to_indices[self.group_keys[idx]]

        # Vectorised clamp: one NumPy gather per feature instead of a
        # Python-level loop over every row of the group.
        x_cat = {
            c: torch.as_tensor(np.maximum(self.X_cat[c][inds], 0), dtype=torch.long)
            for c in self.cat_features
        }

        x_num = torch.FloatTensor(self.X_num[inds])
        y = torch.FloatTensor(self.y[inds]).reshape(-1)

        return x_cat, x_num, y, len(inds)

def collate_fn(batch):
    """Zero-pad a list of groups to a common length and stack them.

    Groups with fewer than two rows are discarded. If nothing is left,
    ``None`` is returned and the caller has to skip the batch.

    Args:
        batch: List of ``(x_cat, x_num, y, length)`` items.

    Returns:
        ``(x_cat, x_num, y, lengths)`` with shapes ``[B, L]`` per categorical
        field, ``[B, L, num_dim]``, ``[B, L]`` and ``[B]`` (int64), where
        ``L`` is the longest kept group; or ``None``.
    """
    kept = [sample for sample in batch if sample[3] >= 2]
    if not kept:
        return None

    # pad_sequence right-pads with zeros and stacks along a new batch axis.
    pad = torch.nn.utils.rnn.pad_sequence

    cat_batch = {
        field: pad([sample[0][field] for sample in kept], batch_first=True)
        for field in kept[0][0]
    }
    num_batch = pad([sample[1] for sample in kept], batch_first=True)
    label_batch = pad([sample[2] for sample in kept], batch_first=True)
    lengths = torch.tensor([sample[3] for sample in kept], dtype=torch.long)

    return cat_batch, num_batch, label_batch, lengths


class FiBiDLRanker(nn.Module):
    """Group-wise ranker that adds a linear term to a deep interaction term.

    Categorical fields are embedded, optionally re-weighted by an SE block,
    pooled through a shared bilinear interaction, concatenated with the
    numeric features and fed to an MLP. The final score is the MLP output
    plus a first-order (linear) term over all inputs.
    """

    def __init__(self, cat_dims, num_numeric_feats, emb_dim=64, hidden=[512, 256, 128, 64], reduction=4, use_senet=True):
        """Create all sub-modules.

        Args:
            cat_dims: Mapping from categorical field name to vocabulary size.
            num_numeric_feats: Number of numeric input features.
            emb_dim: Embedding width per categorical field.
            hidden: Widths of the MLP layers.
            reduction: Bottleneck divisor of the SE block.
            use_senet: Whether to build and apply the SE block.
        """
        super().__init__()
        self.cat_fields = list(cat_dims.keys())
        self.emb_dim = emb_dim
        self.num_fields = len(self.cat_fields)
        self.use_senet = use_senet

        def make_tables(width):
            # One lookup table per field; index 0 is the padding index.
            return nn.ModuleDict({
                field: nn.Embedding(cat_dims[field], width, padding_idx=0)
                for field in self.cat_fields
            })

        # Second-order (vector) and first-order (scalar) embeddings.
        self.embeddings = make_tables(emb_dim)
        self.linear_cat = make_tables(1)
        self.linear_num = nn.Linear(num_numeric_feats, 1)

        for table in (*self.embeddings.values(), *self.linear_cat.values()):
            nn.init.xavier_uniform_(table.weight)
        nn.init.xavier_uniform_(self.linear_num.weight)

        # SE block: squeeze to flat_dim // reduction, then excite back.
        if self.use_senet:
            flat_dim = self.num_fields * emb_dim
            squeezed_dim = flat_dim // reduction
            self.se_fc1 = nn.Linear(flat_dim, squeezed_dim)
            self.se_ln1 = nn.LayerNorm(squeezed_dim)
            self.se_dropout = nn.Dropout(0.1)
            self.se_fc2 = nn.Linear(squeezed_dim, flat_dim)

        # A single bilinear map shared by every field.
        self.bilinear = nn.Linear(emb_dim, emb_dim, bias=False)

        # MLP of Linear -> LayerNorm -> ReLU -> Dropout stages.
        widths = [emb_dim + num_numeric_feats, *hidden]
        stages = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stages += [
                nn.Linear(fan_in, fan_out),
                nn.LayerNorm(fan_out),
                nn.ReLU(),
                nn.Dropout(0.2),
            ]
        self.mlp = nn.Sequential(*stages)
        self.output = nn.Linear(widths[-1], 1)

    def senet(self, embs):
        """Scale ``embs`` ([N, F, D]) element-wise by learned sigmoid gates."""
        n_rows, n_fields, width = embs.shape
        squeezed = self.se_fc1(embs.view(n_rows, -1))
        squeezed = self.se_dropout(F_torch.relu(self.se_ln1(squeezed)))
        gates = torch.sigmoid(self.se_fc2(squeezed)).view(n_rows, n_fields, width)
        return embs * gates

    def bilinear_interaction(self, embs):
        """Return the field-wise mean of ``embs * bilinear(embs)`` ([N, D])."""
        return (embs * self.bilinear(embs)).mean(dim=1)

    def forward(self, x_cat, x_num):
        """Score every item of every group.

        Args:
            x_cat: Dict of ``[B, G]`` integer tensors, one per field.
            x_num: ``[B, G, num_numeric_feats]`` float tensor.

        Returns:
            ``[B, G]`` tensor of scores.
        """
        n_groups, group_len = x_num.shape[:2]
        n_items = n_groups * group_len

        # [B, G, F, D]
        field_embs = torch.stack(
            [self.embeddings[f](x_cat[f]) for f in self.cat_fields], dim=2
        )

        # First-order term, [B, G].
        first_order_cat = torch.stack(
            [self.linear_cat[f](x_cat[f]).squeeze(-1) for f in self.cat_fields],
            dim=0,
        ).sum(dim=0)
        first_order = first_order_cat + self.linear_num(x_num).squeeze(-1)

        # Feature interactions on the flattened items, [B*G, D].
        field_embs = field_embs.view(n_items, self.num_fields, self.emb_dim)
        if self.use_senet:
            field_embs = self.senet(field_embs)
        pooled = self.bilinear_interaction(field_embs)

        # Deep part.
        mlp_in = torch.cat([pooled, x_num.view(n_items, -1)], dim=1)
        deep = self.output(self.mlp(mlp_in)).view(n_groups, group_len)

        return deep + first_order


"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Fraction of groups with a positive label among their top-``k`` scores.

    Only the first ``lengths[i]`` entries of row ``i`` are real items, and
    groups shorter than ``min_group_size`` are ignored. Returns a 0-dim
    tensor (0.0 when no group qualifies).
    """
    hits = 0
    evaluated = 0
    for row in range(scores.size(0)):
        size = lengths[row]
        if size < min_group_size:
            continue

        group_scores = scores[row][:size]
        group_labels = labels[row][:size]

        _, top_positions = torch.topk(group_scores, min(size, k))
        hits += (group_labels[top_positions] > 0).any().float()
        evaluated += 1

    if evaluated == 0:
        return torch.tensor(0.0, device=scores.device)
    return hits / evaluated


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@``k`` over the groups with at least ``min_group_size`` items.

    Gains are the raw labels and the discount is ``log2(rank + 1)``. A group
    whose ideal DCG is zero contributes 0. Returns 0.0 as a tensor when no
    group qualifies.
    """

    def discounted_gain(gains):
        # Ranks 1..n map to the denominators log2(2)..log2(n + 1).
        ranks = torch.arange(2, 2 + len(gains), device=gains.device).float()
        return (gains / torch.log2(ranks)).sum()

    ndcg_sum = 0.0
    evaluated = 0

    for row in range(scores.size(0)):
        size = lengths[row]
        if size < min_group_size:
            continue

        group_scores = scores[row][:size]
        group_labels = labels[row][:size]
        depth = min(k, size)

        _, predicted_order = torch.topk(group_scores, depth)
        _, ideal_order = torch.topk(group_labels, depth)

        dcg = discounted_gain(group_labels[predicted_order])
        idcg = discounted_gain(group_labels[ideal_order])

        if idcg > 0:
            ndcg_sum += dcg / idcg
        else:
            ndcg_sum += 0.0
        evaluated += 1

    if evaluated == 0:
        return torch.tensor(0.0, device=scores.device)
    return ndcg_sum / evaluated


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision over the top-``k`` predictions of each group.

    For a group, average precision is the mean of precision@j taken at every
    rank ``j <= k`` that holds a positive label. A group with no positive in
    its top-``k`` has average precision 0 and still counts in the mean.
    (Such groups used to be dropped from the denominator, which inflated the
    score above HitRate@k.)

    Args:
        scores: ``[B, L]`` predicted scores.
        labels: ``[B, L]`` labels; values > 0 are positives.
        lengths: Number of real (non-padded) items per row.
        k: Ranking cut-off.
        min_group_size: Groups shorter than this are ignored entirely.

    Returns:
        0-dim tensor holding the mean over the evaluated groups, or 0.0 when
        no group qualifies.
    """
    # Tensor accumulator so the result is a tensor even if no group has a hit.
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        l = lengths[i]
        if l < min_group_size:
            continue

        s = scores[i][:l]
        y = labels[i][:l]

        _, idx_pred = torch.topk(s, min(k, l))
        y_true = y[idx_pred] > 0

        # Every evaluated group counts, hit or not.
        valid_count += 1
        if y_true.sum() == 0:
            continue

        precisions = [(y_true[:j+1].float().sum() / (j+1)) for j in range(len(y_true)) if y_true[j]]
        total_ap = total_ap + torch.stack(precisions).mean()

    return total_ap / valid_count if valid_count > 0 else torch.tensor(0.0, device=scores.device)



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise RankNet loss, summed per group and averaged over the batch.

    Within each group, every ordered pair ``(i, j)`` with
    ``label_i > label_j`` contributes ``-logsigmoid(score_i - score_j)``.
    Padded positions beyond ``lengths[row]`` are ignored.
    """
    n_groups = scores.size(0)
    loss_sum = 0.0

    for row in range(n_groups):
        size = lengths[row]
        group_scores = scores[row][:size]
        group_labels = labels[row][:size]

        # prefers[i, j] is 1 where item i should outrank item j.
        prefers = (group_labels.unsqueeze(1) > group_labels.unsqueeze(0)).float()
        margins = group_scores.unsqueeze(1) - group_scores.unsqueeze(0)

        per_pair = -prefers * torch.nn.functional.logsigmoid(margins)
        loss_sum += per_pair.sum()

    return loss_sum / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """
    Weighted pairwise ranking loss focused on the top-K items by label.

    A pair ``(i, j)`` is used when ``label_i > label_j`` and at least one of
    the two items is among the ``focus_topk`` highest labels of its group.
    Its loss is ``-logsigmoid((score_i - score_j - margin) / temperature)``,
    weighted by ``|label_i - label_j|``. The result is the weighted mean over
    all pairs in the batch; with no usable pair a zero tensor that requires
    grad is returned.
    """
    loss_sum = 0.0
    weight_sum = 0

    for row in range(scores.size(0)):
        size = lengths[row]
        if size < 2:
            continue

        group_scores = scores[row][:size]
        group_labels = labels[row][:size]

        # Mark the items holding the top-K ground-truth labels.
        in_topk = torch.zeros_like(group_labels, dtype=torch.bool)
        in_topk[torch.topk(group_labels, min(focus_topk, size)).indices] = True

        label_gap = group_labels.unsqueeze(1) - group_labels.unsqueeze(0)
        eligible = (label_gap > 0) & (in_topk.unsqueeze(1) | in_topk.unsqueeze(0))
        if not eligible.any():
            continue

        pair_weight = (label_gap.abs() * eligible).float()

        score_gap = group_scores.unsqueeze(1) - group_scores.unsqueeze(0)
        if margin > 0.0:
            # Require the preferred item to win by at least ``margin``.
            score_gap = score_gap - margin
        score_gap = score_gap / temperature

        loss_sum += (-F_torch.logsigmoid(score_gap) * pair_weight).sum()
        weight_sum += pair_weight.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean loss.

    Every non-``None`` batch is moved to ``device``, scored, and optimised
    with the xRankNet loss; the scheduler is stepped once per batch. Returns
    ``inf`` when no batch was processed.
    """
    model.train()
    loss_sum = 0.0
    n_steps = 0

    for batch in tqdm(loader, desc="Training", leave=False):
        # The collate function yields None when it kept no group.
        if batch is None:
            continue

        x_cat, x_num, y, lengths = batch
        x_cat = {name: tensor.to(device) for name, tensor in x_cat.items()}
        x_num = x_num.to(device)
        y = y.to(device)
        lengths = lengths.to(device)

        optimizer.zero_grad()
        loss = xranknet_loss(
            model(x_cat, x_num), y, lengths, temperature=1.0, margin=0.2, focus_topk=3
        )
        loss.backward()
        optimizer.step()
        scheduler.step()

        loss_sum += loss.item()
        n_steps += 1

    if n_steps == 0:
        return float("inf")
    return loss_sum / n_steps



def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    For every batch the xRankNet loss and HitRate@3 / NDCG@3 / MAP@3 are
    computed, weighted by the number of groups in the batch, and averaged
    over all evaluated batches.

    Args:
        model: Ranking model called as ``model(x_cat, x_num)``.
        loader: Iterable of ``(x_cat, x_num, y, lengths)`` batches. ``None``
            batches (which ``collate_fn`` returns when it keeps no group) are
            skipped instead of crashing on tuple unpacking.
        device: Device the batch tensors are moved to.
        min_group_size: Forwarded to the metric helpers, which ignore
            shorter groups.

    Returns:
        ``(loss, hitrate, ndcg, map)``. If no batch could be evaluated the
        loss is ``inf`` and the three metrics are zero tensors.
    """
    model.eval()
    total_hitrate = 0
    total_ndcg = 0
    total_map = 0
    total_loss = 0
    count = 0

    with torch.no_grad():
        for batch in tqdm(loader, desc="Validating", leave=False):
            if batch is None:
                continue

            x_cat, x_num, y, lengths = batch
            x_num, y = x_num.to(device), y.to(device)
            # ``collate_fn`` already returns a tensor; ``as_tensor`` passes it
            # through untouched (no copy-construct UserWarning) and still
            # accepts a plain list.
            lengths = torch.as_tensor(lengths).to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            ndcg = ndcg_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            map3 = map_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)

            # NOTE(review): the metric helpers average over the groups that
            # pass ``min_group_size`` but are weighted here by the full batch
            # size; kept as-is so reported numbers stay comparable.
            batch_size = scores.size(0)
            total_hitrate += hitrate * batch_size
            total_ndcg += ndcg * batch_size
            total_map += map3 * batch_size
            total_loss += loss.item() * batch_size
            count += batch_size

    if count == 0:
        # Nothing to evaluate: report the worst loss and zero metrics rather
        # than raising ZeroDivisionError.
        return (
            float("inf"),
            torch.tensor(0.0, device=device),
            torch.tensor(0.0, device=device),
            torch.tensor(0.0, device=device),
        )

    return total_loss / count, total_hitrate / count, total_ndcg / count, total_map / count


def build_cat_dims(X, cat_features):
    """Return the embedding-table size for every categorical feature.

    The size is ``max(code) + 1`` so the largest code is a valid index.
    A column whose maximum is negative or missing gets size 1.

    Args:
        X: Table indexable by column name; each column must expose ``max()``.
        cat_features: Names of the categorical columns.

    Returns:
        Dict mapping each feature name to an ``int`` table size.
    """
    cat_dims = {}
    for c in cat_features:
        max_val = X[c].max()
        # polars returns None for the max of an empty / all-null column;
        # comparing None with 0 would raise TypeError.
        if max_val is None or max_val < 0:
            max_val = 0
        cat_dims[c] = int(max_val) + 1
    return cat_dims


def normalize_numeric_features(X, numeric_features, train_mask):
    """Z-score the given columns using statistics from the training rows only.

    Args:
        X: polars DataFrame holding the columns.
        numeric_features: Names of the columns to standardise.
        train_mask: Boolean mask selecting the rows used to fit mean and std.

    Returns:
        A DataFrame in which every listed column is replaced by
        ``(value - train_mean) / train_std``; a non-positive std is treated
        as 1.0 so constant columns do not divide by zero.
    """
    for name in numeric_features:
        column = X[name].to_numpy()
        fit_rows = column[train_mask]

        center = fit_rows.mean()
        spread = fit_rows.std()
        if not spread > 0:
            spread = 1.0

        scaled = (column - center) / spread
        X = X.with_columns(pl.Series(name, scaled).alias(name))
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Load the feature-engineered frame, building and caching it if needed.

    Args:
        data_dir: Directory containing the parquet files (and the cache).
        train_file: Raw training parquet file name inside ``data_dir``.
        test_file: Raw test parquet file name inside ``data_dir``.
        cache_file: File name of the cached feature-engineered frame.

    Returns:
        Tuple ``(df, test_ids, test_rankers)``: the feature-engineered frame
        (train rows followed by test rows when built here) and the ``Id`` /
        ``ranker_id`` columns of the raw test file.
    """
    cache_path = os.path.join(data_dir, cache_file)
    test_path = os.path.join(data_dir, test_file)

    if os.path.exists(cache_path):
        print(f"Loading cached feature-engineered data from {cache_path} ...")
        df = pl.read_parquet(cache_path)

        # Only the two identifier columns are needed on this path; projecting
        # them avoids materialising every raw test feature.
        test_keys = pl.read_parquet(test_path, columns=["Id", "ranker_id"])
        return df, test_keys["Id"], test_keys["ranker_id"]

    print("Cached file not found, processing raw data ...")

    train = pl.read_parquet(os.path.join(data_dir, train_file)).drop("__index_level_0__")
    test = (
        pl.read_parquet(test_path)
        .drop("__index_level_0__")
        # The test split has no label; add a dummy so the frames can be stacked.
        .with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))
    )

    test_ids = test["Id"]
    test_rankers = test["ranker_id"]

    df = pl.concat((train, test))
    df = feature_engineering(df, full=True)

    df = df.with_columns(
        [pl.col(c).fill_null(0) for c in df.select(pl.selectors.numeric()).columns] +
        [pl.col(c).fill_null("missing") for c in df.select(pl.selectors.string()).columns]
    )

    os.makedirs(data_dir, exist_ok=True)
    df.write_parquet(cache_path)
    print(f"Saved feature-engineered data to {cache_path}")

    return df, test_ids, test_rankers


def prepare_data(data_dir, train_size=18145372):
    """Load the data and turn it into model-ready features.

    Args:
        data_dir: Directory passed on to ``load_or_prepare_data``.
        train_size: Number of leading rows that belong to the labelled
            training data; only these rows are used to fit the numeric
            normalisation. Defaults to the previously hard-coded value.

    Returns:
        Tuple ``(X, y, groups, cat_features, num_features, train_size,
        test_ids, test_rankers)``.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    # Dense ranks start at 1, so code 0 stays reserved for nulls and for the
    # padding rows added when groups are batched. With the former 0-based
    # codes the first real category shared index 0, which the ranker declares
    # as ``padding_idx`` (an embedding row that never receives gradients).
    X = X.with_columns(
        [
            pl.col(c).rank("dense").fill_null(0).cast(pl.Int32)
            for c in cat_features
        ]
    )

    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Patience counter for a metric where larger values are better."""

    def __init__(self, patience=3, min_delta=0.00005):
        # Best value seen so far; -inf guarantees the first finite value wins.
        self.best = float("-inf")
        # Consecutive calls to step() that did not beat ``best + min_delta``.
        self.counter = 0
        self.patience = patience
        self.min_delta = min_delta

    def step(self, metric):
        """Record ``metric`` and return True once patience is exhausted."""
        threshold = self.best + self.min_delta
        if metric > threshold:
            self.best, self.counter = metric, 0
        else:
            self.counter = self.counter + 1
        exhausted = self.counter >= self.patience
        return exhausted


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scale histogram of the group sizes in ``dataset``.

    Sizes are read from ``dataset.group_to_indices``. The figure is written to
    ``save_path`` when one is given and shown interactively otherwise; it is
    closed afterwards in both cases.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.title("Distribution of Group Sizes")
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def main():
    """Train, validate and predict with ``FiBiDLRanker`` once per learning rate.

    For every value in ``learning_rates_to_try`` this:
      1. creates a ``model_<run_name>`` directory,
      2. loads the data through ``prepare_data`` and splits the rows
         positionally into train ``[:n1]``, validation ``[n1:train_size]``
         and test ``[train_size:]``,
      3. trains for up to 10 epochs with AdamW and a linear warmup/decay
         schedule, checkpointing when validation loss and HitRate@3 both
         improve and early-stopping on validation loss,
      4. saves metric plots, the scored validation rows and a submission CSV.
    """
    DATA_DIR = "./data"
    SUBMIT_DIR = "submission"
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    learning_rates_to_try = [1e-3, 7e-4, 5e-4]

    for lr in learning_rates_to_try:
        # e.g. 1e-3 -> "lr_1e03": the minus sign is stripped from the exponent.
        run_name = f"lr_{lr:.0e}".replace("-", "")
        MODEL_DIR = f"model_{run_name}"
        os.makedirs(MODEL_DIR, exist_ok=True)

        print(f"\n\n🚀 Starting training with learning rate = {lr}\nSaved to: {MODEL_DIR}")

        best_val_loss = float("inf")
        best_hitrate = 0.0
        model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Per-epoch history, used only for the plots written after training.
        train_losses = []
        val_losses = []
        val_hitrates = []
        val_ndcgs = []
        val_maps = []
        learning_rates = []

        os.makedirs(MODEL_DIR, exist_ok=True)
        os.makedirs(SUBMIT_DIR, exist_ok=True)

        early_stopper = EarlyStopping(patience=3, min_delta=0.00005)

        # NOTE(review): the data does not depend on ``lr`` but is reloaded and
        # re-encoded on every iteration of the learning-rate loop.
        X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

        # Fill missing values
        float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
        for col in float_cols:
            X = X.with_columns(
                pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
            )

        # Row offsets of the split: [0, n1) train, [n1, n2) validation, [n2, end) test.
        n1 = 16487352
        # n1 = 17487300
        n2 = train_size

        X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
        y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
        groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

        # Inspect group sizes in validation set
        val_rankers = groups_va["ranker_id"].to_numpy()
        val_group_counts = Counter(val_rankers)

        print("\n📋 Validation group statistics:")
        # Save detailed group info to file
        group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
        with open(group_info_path, "w") as f:
            f.write(f"Total groups: {len(val_group_counts)}\n\n")
            f.write("GroupID\tSize\tStartIdx\tEndIdx\n")

            # Row positions (relative to the validation slice) of every group.
            group_to_rows = defaultdict(list)
            for i, g in enumerate(val_rankers):
                group_to_rows[g].append(i)

            for group_id, indices in sorted(group_to_rows.items()):
                start_idx = indices[0]
                end_idx = indices[-1]
                size = len(indices)
                f.write(f"{group_id}\t{size}\t{start_idx}\t{end_idx}\n")

        print(f"📁 Saved validation group info to {group_info_path}")


        # Embedding sizes come from the full frame so validation/test codes are covered.
        cat_dims = build_cat_dims(X, cat_features)
        num_numeric_feats = len(num_features)

        train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
        val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
        test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

        plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
        print("📊 Group size distribution saved.")

        # batch_size counts groups (one dataset item per ranker_id), not rows.
        train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
        val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
        test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

        print("✅ Data loaders created.")

        num_epochs = 10
        # The scheduler is stepped once per batch, so its horizon is in batches.
        num_training_steps = num_epochs * len(train_loader)
        num_warmup_steps = int(0.1 * num_training_steps)

        model = FiBiDLRanker(
            cat_dims=cat_dims,
            num_numeric_feats=num_numeric_feats,
            emb_dim=32,
            hidden=[512, 256, 128, 64]
        ).to(device)

        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)

        scheduler = get_scheduler(
            "linear",
            optimizer=optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps
        )

        best_hitrate = 0
        for epoch in range(num_epochs):
            print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
            start_time = time.time()

            train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
            val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

            train_losses.append(train_loss)
            val_losses.append(val_loss)
            # .item() requires the metrics to be tensors, not plain floats.
            val_hitrates.append(val_hitrate.item())
            val_ndcgs.append(val_ndcg.item())
            val_maps.append(val_map.item())

            current_lr = scheduler.get_last_lr()
            learning_rates.append(current_lr[0])

            elapsed = time.time() - start_time
            print(
                f"Epoch {epoch+1}: "
                f"Train Loss={train_loss:.4f}, "
                f"Val Loss={val_loss:.4f}, "
                f"HitRate@3={val_hitrate:.4f}, "
                f"NDCG@3={val_ndcg:.4f}, "
                f"MAP@3={val_map:.4f}, "
                f"LR={current_lr[0]:.6f} "
                f"({elapsed:.1f}s)"
            )


            # A checkpoint is written only when BOTH criteria improve; an epoch
            # with a better hit rate but a slightly worse loss is not saved.
            if val_loss < best_val_loss and val_hitrate > best_hitrate:
                best_val_loss = val_loss
                best_hitrate = val_hitrate
                torch.save(model.state_dict(), model_path)
                print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

            # EarlyStopping maximises its metric, hence the negated loss.
            if early_stopper.step(-val_loss):
                print("⛔ Early stopping triggered.")
                break

        # Plot metrics
        epochs = list(range(1, len(train_losses) + 1))

        # Plot Loss Curve
        plt.figure()
        plt.plot(epochs, train_losses, label="Train Loss")
        plt.plot(epochs, val_losses, label="Val Loss")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.title("Training & Validation Loss")
        plt.legend()
        plt.grid(True)
        plt.savefig(os.path.join(MODEL_DIR, "loss_curve.png"))
        plt.close()

        # Plot HitRate@3
        plt.figure()
        plt.plot(epochs, val_hitrates, label="Val HitRate@3", color="green")
        plt.xlabel("Epoch")
        plt.ylabel("HitRate@3")
        plt.title("Validation HitRate@3")
        plt.legend()
        plt.grid(True)
        plt.savefig(os.path.join(MODEL_DIR, "hitrate_curve.png"))
        plt.close()

        # Plot Learning Rate
        plt.figure()
        plt.plot(epochs, learning_rates, label="Learning Rate", color="orange")
        plt.xlabel("Epoch")
        plt.ylabel("LR")
        plt.title("Learning Rate Schedule")
        plt.legend()
        plt.grid(True)
        plt.savefig(os.path.join(MODEL_DIR, "lr_curve.png"))
        plt.close()

        # Plot NDCG@3
        plt.figure()
        plt.plot(epochs, val_ndcgs, label="Val NDCG@3", color="purple")
        plt.xlabel("Epoch")
        plt.ylabel("NDCG@3")
        plt.title("Validation NDCG@3")
        plt.legend()
        plt.grid(True)
        plt.savefig(os.path.join(MODEL_DIR, "ndcg_curve.png"))
        plt.close()

        # Plot MAP@3
        plt.figure()
        plt.plot(epochs, val_maps, label="Val MAP@3", color="red")
        plt.xlabel("Epoch")
        plt.ylabel("MAP@3")
        plt.title("Validation MAP@3")
        plt.legend()
        plt.grid(True)
        plt.savefig(os.path.join(MODEL_DIR, "map_curve.png"))
        plt.close()

        # NOTE(review): the plots above are written to MODEL_DIR, not "model/".
        print("📊 Saved metric plots to model/")

        # Predict test set
        # NOTE(review): if no epoch met the save condition, model_path is either
        # missing or left over from an earlier run of the same learning rate.
        model.load_state_dict(torch.load(model_path))
        model.eval()

        val_ids = X_va["Id"]
        val_rankers = groups_va["ranker_id"]
        val_labels = y_va

        all_val_scores = []

        with torch.no_grad():
            for x_cat, x_num, _, lengths in tqdm(val_loader, desc="Predicting validation set"):
                x_num = x_num.to(device)
                x_cat = {k: v.to(device) for k, v in x_cat.items()}
                scores = model(x_cat, x_num)

                # Drop the padded tail of every group before collecting scores.
                for i in range(scores.size(0)):
                    l = lengths[i]
                    all_val_scores.append(scores[i, :l].cpu().numpy())

        all_val_scores = np.concatenate(all_val_scores)

        # NOTE(review): scores are emitted group by group in the dataset's group
        # order; aligning them with the frame rows assumes each ranker_id's rows
        # are contiguous and that the collate function dropped no group — confirm.
        assert len(all_val_scores) == X_va.shape[0], \
            f"Mismatch: {len(all_val_scores)} scores vs {X_va.shape[0]} rows"

        val_df = X_va.with_columns([
            pl.Series("Id", val_ids),
            pl.Series("ranker_id", val_rankers),
            pl.Series("score", all_val_scores),
            pl.Series("label", val_labels)
        ])

        # "selected" holds the within-group rank of the score (1 = highest).
        val_df = val_df.with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )

        val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
        val_df.write_csv(val_save_path)
        print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

        all_scores = []

        with torch.no_grad():
            for x_cat, x_num, _, lengths in tqdm(test_loader, desc="Predicting test set"):
                x_num = x_num.to(device)
                x_cat = {k: v.to(device) for k, v in x_cat.items()}
                scores = model(x_cat, x_num)

                for i in range(scores.size(0)):
                    l = lengths[i]
                    all_scores.append(scores[i, :l].cpu().numpy())

        all_scores = np.concatenate(all_scores)

        assert len(all_scores) == X_te.shape[0], \
            f"Mismatch: {len(all_scores)} scores vs {X_te.shape[0]} rows in X_te"

        submission_df = pl.DataFrame({
            "Id": test_ids,
            "ranker_id": test_rankers,
            "score": all_scores
        })

        submission = (
            submission_df
            .with_columns(
                pl.col("score")
                .rank(method="ordinal", descending=True)
                .over("ranker_id")
                .cast(pl.Int32)
                .alias("selected")
            )
            .select(["Id", "ranker_id", "score", "selected"])
        )

        # NOTE(review): the file name does not include run_name, so every
        # learning rate overwrites the submission of the previous one.
        submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
        submission.write_csv(submission_path)
        print(f"✅ Submission saved to {submission_path}")


# Run the full train / validate / predict pipeline only when executed as a script.
if __name__ == "__main__":
    main()

Overwriting deeprec_validate.py


In [None]:
--- Epoch 1/10 ---
Validating:   0%|          | 0/2636 [00:00<?, ?it/s]/home/ionut/test_whisper/test2/deeprec_validate.py:476: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  lengths = torch.tensor(lengths).to(device)
Epoch 1: Train Loss=0.4424, Val Loss=0.3009, HitRate@3=0.4628, NDCG@3=0.3676, MAP@3=0.6147, LR=0.001000 (677.8s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 2/10 ---
Epoch 2: Train Loss=0.2698, Val Loss=0.2787, HitRate@3=0.4965, NDCG@3=0.3992, MAP@3=0.6466, LR=0.000889 (675.4s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 3/10 ---
Epoch 3: Train Loss=0.2496, Val Loss=0.2811, HitRate@3=0.4974, NDCG@3=0.4020, MAP@3=0.6474, LR=0.000778 (677.2s)

--- Epoch 4/10 ---
Epoch 4: Train Loss=0.2372, Val Loss=0.2765, HitRate@3=0.4972, NDCG@3=0.3997, MAP@3=0.6492, LR=0.000667 (676.6s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 5/10 ---
Epoch 5: Train Loss=0.2285, Val Loss=0.2752, HitRate@3=0.5032, NDCG@3=0.4051, MAP@3=0.6495, LR=0.000556 (679.4s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 6/10 ---
Epoch 6: Train Loss=0.2158, Val Loss=0.2673, HitRate@3=0.5006, NDCG@3=0.4048, MAP@3=0.6558, LR=0.000444 (677.1s)

--- Epoch 7/10 ---
Epoch 7: Train Loss=0.2041, Val Loss=0.2659, HitRate@3=0.5060, NDCG@3=0.4099, MAP@3=0.6584, LR=0.000333 (677.9s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 8/10 ---
Epoch 8: Train Loss=0.1934, Val Loss=0.2687, HitRate@3=0.5033, NDCG@3=0.4066, MAP@3=0.6519, LR=0.000222 (682.1s)

--- Epoch 9/10 ---
Epoch 9: Train Loss=0.1804, Val Loss=0.2782, HitRate@3=0.5061, NDCG@3=0.4094, MAP@3=0.6585, LR=0.000111 (682.8s)

--- Epoch 10/10 ---
Epoch 10: Train Loss=0.1679, Val Loss=0.2728, HitRate@3=0.5081, NDCG@3=0.4104, MAP@3=0.6588, LR=0.000000 (677.5s)
⛔ Early stopping triggered.

In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import torch.nn as nn
import torch.nn.functional as F_torch
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter

class RankDataset(Dataset):
    """Dataset that yields one ranking group (all rows of a ``ranker_id``) per item.

    Groups are numbered in order of first appearance of their ``ranker_id``;
    the rows inside a group keep their original order.
    """

    def __init__(
        self,
        X: "pl.DataFrame",
        y: "pl.Series",
        groups: "pl.DataFrame",
        cat_features: list[str],
        num_features: list[str],
    ):
        """Cache the features as NumPy arrays and index the rows by group.

        Args:
            X: Feature frame containing ``cat_features`` and ``num_features``.
            y: Labels, one per row of ``X``.
            groups: Frame with a ``ranker_id`` column, one entry per row.
            cat_features: Names of the integer-coded categorical columns.
            num_features: Names of the numeric columns.
        """
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # ranker_id -> row positions, in first-seen order.
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of groups."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        ``x_cat`` maps each categorical feature to a long tensor of codes,
        ``x_num`` is a float32 tensor of shape ``(length, n_numeric)``, ``y``
        is a flat float32 tensor and ``length`` is the number of rows.
        """
        inds = self.group_to_indices[self.group_keys[idx]]

        x_cat = {}
        for c in self.cat_features:
            codes = self.X_cat[c][inds]
            # Vectorised replacement for the per-element loop: anything that is
            # not a non-negative code (missing marker, NaN) falls back to 0.
            x_cat[c] = torch.as_tensor(np.where(codes >= 0, codes, 0), dtype=torch.long)

        x_num = torch.as_tensor(self.X_num[inds], dtype=torch.float32)
        y = torch.as_tensor(self.y[inds], dtype=torch.float32).reshape(-1)

        return x_cat, x_num, y, len(inds)

def collate_fn(batch):
    """Zero-pad a list of groups to a common length and stack them.

    Each element of ``batch`` is ``(x_cat, x_num, y, length)``. Groups with
    fewer than two rows are discarded; if nothing remains, ``None`` is
    returned. Otherwise the result is ``(x_cat, x_num, y, lengths)`` where
    ``x_cat`` maps feature name -> ``(groups, max_len)`` long tensor, ``x_num``
    is ``(groups, max_len, n_numeric)``, ``y`` is ``(groups, max_len)`` and
    ``lengths`` is a long tensor with the unpadded size of every group.
    """
    kept = [sample for sample in batch if sample[3] >= 2]
    if not kept:
        return None

    sizes = [sample[3] for sample in kept]
    n_groups, width = len(kept), max(sizes)
    first_cat, first_num, first_y, _ = kept[0]

    # Pre-allocate zero-filled outputs and copy each group into its leading
    # rows; whatever is not overwritten is the padding.
    cat_out = {
        name: torch.zeros(n_groups, width, dtype=torch.long) for name in first_cat.keys()
    }
    num_out = torch.zeros(n_groups, width, first_num.shape[1], dtype=first_num.dtype)
    y_out = torch.zeros(n_groups, width, dtype=first_y.dtype)

    for row, (x_cat, x_num, y, length) in enumerate(kept):
        for name, target in cat_out.items():
            target[row, :length] = x_cat[name]
        num_out[row, :length] = x_num
        y_out[row, :length] = y

    return cat_out, num_out, y_out, torch.tensor(sizes, dtype=torch.long)


class FiBiDLRanker(nn.Module):
    """Ranker combining a linear term with SENet-reweighted bilinear embeddings and an MLP.

    Every categorical field gets a ``emb_dim``-wide embedding plus a scalar
    (first-order) embedding. The field embeddings are optionally re-weighted
    by a squeeze-and-excitation block, passed through a shared bilinear
    interaction, concatenated with the numeric features and fed to an MLP.
    The MLP score and the linear term are summed.
    """

    def __init__(self, cat_dims, num_numeric_feats, emb_dim=64, hidden=(512, 256, 128, 64), reduction=4, use_senet=True):
        """Build the layers.

        Args:
            cat_dims: Mapping field name -> number of embedding rows.
            num_numeric_feats: Number of numeric input features.
            emb_dim: Width of every field embedding.
            hidden: Sizes of the MLP hidden layers (any iterable of ints).
            reduction: Bottleneck ratio of the SENet block.
            use_senet: Whether to apply the SENet re-weighting.
        """
        super().__init__()
        self.cat_fields = list(cat_dims.keys())
        self.emb_dim = emb_dim
        self.num_fields = len(self.cat_fields)
        self.use_senet = use_senet

        # Embedding layers
        self.embeddings = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], emb_dim, padding_idx=0) for f in self.cat_fields
        })
        self.linear_cat = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], 1, padding_idx=0) for f in self.cat_fields
        })
        self.linear_num = nn.Linear(num_numeric_feats, 1)

        for emb in list(self.embeddings.values()) + list(self.linear_cat.values()):
            nn.init.xavier_uniform_(emb.weight)
            # Xavier init also overwrites the padding row, which never receives
            # gradients; put it back to zero so padding contributes nothing.
            with torch.no_grad():
                emb.weight[emb.padding_idx].zero_()
        nn.init.xavier_uniform_(self.linear_num.weight)

        # SENet (with LayerNorm)
        if self.use_senet:
            squeezed = self.num_fields * emb_dim // reduction
            self.se_fc1 = nn.Linear(self.num_fields * emb_dim, squeezed)
            self.se_ln1 = nn.LayerNorm(squeezed)
            self.se_dropout = nn.Dropout(0.1)
            self.se_fc2 = nn.Linear(squeezed, self.num_fields * emb_dim)

        # Bilinear layer (shared across fields)
        self.bilinear = nn.Linear(emb_dim, emb_dim, bias=False)

        # MLP (LayerNorm instead of BatchNorm)
        input_dim = emb_dim + num_numeric_feats
        layers = []
        for h in hidden:
            layers.append(nn.Linear(input_dim, h))
            layers.append(nn.LayerNorm(h))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.2))
            input_dim = h
        self.mlp = nn.Sequential(*layers)
        self.output = nn.Linear(input_dim, 1)

    def senet(self, embs):
        """Re-weight ``embs`` of shape ``[N, F, D]`` with element-wise sigmoid gates."""
        B, F, D = embs.shape
        z = embs.reshape(B, -1)  # flatten to [N, F*D]
        a = self.se_fc1(z)
        a = self.se_ln1(a)
        a = F_torch.relu(a)
        a = self.se_dropout(a)
        s = torch.sigmoid(self.se_fc2(a)).view(B, F, D)
        return embs * s

    def bilinear_interaction(self, embs):
        """Return the field-averaged product of ``embs`` with its bilinear transform, ``[N, D]``."""
        transformed = self.bilinear(embs)
        interaction = (embs * transformed).mean(dim=1)
        return interaction

    def forward(self, x_cat, x_num):
        """Score every item of every group.

        Args:
            x_cat: Mapping field name -> long tensor of shape ``[B, G]``.
            x_num: Float tensor of shape ``[B, G, num_numeric_feats]``.

        Returns:
            Scores of shape ``[B, G]``.
        """
        B, G = x_num.shape[:2]
        F, D = self.num_fields, self.emb_dim

        # Embeddings [B, G, F, D]
        embs = torch.stack([self.embeddings[f](x_cat[f]) for f in self.cat_fields], dim=2)

        # Linear (first-order) output
        linear_cat_sum = torch.stack([
            self.linear_cat[f](x_cat[f]).squeeze(-1) for f in self.cat_fields
        ], dim=0).sum(dim=0)
        linear_num_out = self.linear_num(x_num).squeeze(-1)
        linear_out = linear_cat_sum + linear_num_out  # [B, G]

        # Feature interactions; reshape (not view) also accepts non-contiguous input
        embs = embs.reshape(B * G, F, D)
        if self.use_senet:
            embs = self.senet(embs)
        bi = self.bilinear_interaction(embs)  # [B*G, D]

        # Deep part
        x_num_flat = x_num.reshape(B * G, -1)
        deep_input = torch.cat([bi, x_num_flat], dim=1)
        deep_out = self.mlp(deep_input)
        scores = self.output(deep_out).view(B, G)

        return scores + linear_out  # final output


"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Fraction of eligible groups with a positive label among the top-``k`` scores.

    Only the first ``lengths[i]`` entries of row ``i`` are real items; groups
    smaller than ``min_group_size`` are ignored. Returns a zero tensor on
    ``scores.device`` when no group is eligible.
    """
    hit_total = 0
    n_scored = 0

    for row in range(scores.size(0)):
        size = lengths[row]
        if size < min_group_size:
            continue

        row_scores = scores[row][:size]
        row_labels = labels[row][:size]
        top_positions = torch.topk(row_scores, min(size, k)).indices

        hit_total += (row_labels[top_positions] > 0).any().float()
        n_scored += 1

    if n_scored > 0:
        return hit_total / n_scored
    return torch.tensor(0.0, device=scores.device)


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@``k`` over the groups with at least ``min_group_size`` items.

    Labels are used directly as gains with a ``log2(rank + 1)`` discount. A
    group whose ideal DCG is zero contributes 0. Returns a zero tensor on
    ``scores.device`` when no group is eligible.
    """
    ndcg_sum = 0.0
    n_scored = 0

    for row in range(scores.size(0)):
        size = lengths[row]
        if size < min_group_size:
            continue

        row_scores = scores[row][:size]
        row_labels = labels[row][:size]

        depth = min(k, size)
        predicted = torch.topk(row_scores, depth).indices
        ideal = torch.topk(row_labels, depth).indices

        pred_discount = torch.log2(
            torch.arange(2, 2 + len(predicted), device=row_labels.device).float()
        )
        ideal_discount = torch.log2(
            torch.arange(2, 2 + len(ideal), device=row_labels.device).float()
        )
        dcg = (row_labels[predicted] / pred_discount).sum()
        idcg = (row_labels[ideal] / ideal_discount).sum()

        if idcg > 0:
            ndcg_sum += dcg / idcg
        else:
            ndcg_sum += 0.0
        n_scored += 1

    if n_scored > 0:
        return ndcg_sum / n_scored
    return torch.tensor(0.0, device=scores.device)


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision at ``k`` over the eligible groups.

    For each group with at least ``min_group_size`` items, the average
    precision is the mean of precision@j over the top-``k`` ranks ``j`` that
    hold a positive label. A group without any positive label in its top-``k``
    scores 0 and still counts towards the mean; previously such groups were
    dropped from the denominator, which inflated the metric above the hit rate.

    Args:
        scores: Tensor ``[B, G]`` of predicted scores.
        labels: Tensor ``[B, G]`` of relevance labels (positive = relevant).
        lengths: Per-row number of real (unpadded) items.
        k: Cut-off rank.
        min_group_size: Groups smaller than this are ignored.

    Returns:
        0-dim tensor on ``scores.device``; 0 when no group is eligible.
    """
    batch_size = scores.size(0)
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(batch_size):
        l = lengths[i]
        if l < min_group_size:
            continue

        s = scores[i][:l]
        y = labels[i][:l]

        _, idx_pred = torch.topk(s, min(k, l))
        hits = (y[idx_pred] > 0).float()
        valid_count += 1

        n_hits = hits.sum()
        if n_hits == 0:
            continue  # average precision of a miss is 0

        ranks = torch.arange(1, hits.numel() + 1, device=hits.device).float()
        precision_at_rank = torch.cumsum(hits, dim=0) / ranks
        total_ap = total_ap + (precision_at_rank * hits).sum() / n_hits

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ap / valid_count



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise RankNet loss summed over ordered pairs, averaged over groups.

    For every pair ``(i, j)`` inside a group with ``label_i > label_j`` the
    term ``-logsigmoid(score_i - score_j)`` is added. Only the first
    ``lengths[g]`` items of row ``g`` are used. The sum over all groups is
    divided by the number of rows in ``scores``.
    """
    n_groups = scores.size(0)
    accumulated = 0.0

    for g in range(n_groups):
        size = lengths[g]
        group_scores = scores[g][:size]
        group_labels = labels[g][:size]

        # [i, j] is 1.0 where item i must be ranked above item j.
        should_outrank = (group_labels.unsqueeze(1) > group_labels.unsqueeze(0)).float()
        margins = group_scores.unsqueeze(1) - group_scores.unsqueeze(0)

        accumulated += (-should_outrank * torch.nn.functional.logsigmoid(margins)).sum()

    return accumulated / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """Margin-based pairwise ranking loss restricted to pairs touching the label top-K.

    Within each group a pair ``(i, j)`` counts when ``label_i > label_j`` and
    at least one of the two items is among the ``focus_topk`` highest labels.
    Each counted pair contributes
    ``-logsigmoid((score_i - score_j - margin) / temperature)`` weighted by
    ``|label_i - label_j|``. The result is the weighted sum divided by the
    total weight; when no pair counts, a zero tensor that requires grad is
    returned so that ``backward()`` can still be called.
    """
    weighted_sum = 0.0
    weight_total = 0

    for g in range(scores.size(0)):
        size = lengths[g]
        if size < 2:
            continue

        group_scores = scores[g][:size]
        group_labels = labels[g][:size]

        # Mark the items holding the highest ground-truth labels.
        n_focus = min(focus_topk, size)
        focus_positions = torch.topk(group_labels, n_focus).indices
        in_focus = torch.zeros_like(group_labels, dtype=torch.bool)
        in_focus[focus_positions] = True

        # Pairwise difference matrices, entry [i, j] = value_i - value_j.
        label_gap = group_labels.unsqueeze(1) - group_labels.unsqueeze(0)
        score_gap = group_scores.unsqueeze(1) - group_scores.unsqueeze(0)

        touches_focus = in_focus.unsqueeze(1) | in_focus.unsqueeze(0)
        pair_mask = (label_gap > 0) & touches_focus
        if pair_mask.sum() == 0:
            continue

        pair_weight = (label_gap.abs() * pair_mask).float()

        if margin > 0.0:
            score_gap = score_gap - margin
        score_gap = score_gap / temperature

        weighted_sum += (-F_torch.logsigmoid(score_gap) * pair_weight).sum()
        weight_total += pair_weight.sum()

    if weight_total == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return weighted_sum / weight_total


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Batches that arrive as ``None`` are skipped. The scheduler is stepped
    after every optimiser step. Returns ``float("inf")`` when no batch was
    processed.
    """
    model.train()
    loss_sum = 0.0
    n_steps = 0

    for batch in tqdm(loader, desc="Training", leave=False):
        if batch is None:
            continue

        cat_inputs, num_inputs, targets, group_lengths = batch
        num_inputs = num_inputs.to(device)
        targets = targets.to(device)
        group_lengths = group_lengths.to(device)
        cat_inputs = {name: tensor.to(device) for name, tensor in cat_inputs.items()}

        optimizer.zero_grad()
        predictions = model(cat_inputs, num_inputs)
        batch_loss = xranknet_loss(
            predictions, targets, group_lengths, temperature=1.0, margin=0.2, focus_topk=3
        )
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        loss_sum += batch_loss.item()
        n_steps += 1

    if n_steps > 0:
        return loss_sum / n_steps
    return float("inf")



def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader`` with gradients disabled.

    For every batch the xRankNet loss and HitRate@3 / NDCG@3 / MAP@3 are
    computed, weighted by the number of groups in the batch and averaged over
    all evaluated batches.

    Args:
        model: Module called as ``model(x_cat, x_num)`` to produce scores.
        loader: Iterable of ``(x_cat, x_num, y, lengths)`` batches. A batch may
            be ``None`` (``collate_fn`` returns it when every group was
            filtered out); such batches are skipped, as in ``train_one_epoch``.
        device: Device the batch tensors are moved to.
        min_group_size: Forwarded to the three metric functions.

    Returns:
        Tuple ``(loss, hitrate, ndcg, map)``. ``loss`` is a float; the metrics
        are whatever the metric functions accumulate to (0-dim tensors in the
        normal case). If no batch was evaluated the result is
        ``(inf, 0, 0, 0)`` with zero tensors instead of a ZeroDivisionError.
    """
    model.eval()
    total_hitrate = 0
    total_ndcg = 0
    total_map = 0
    total_loss = 0
    count = 0

    with torch.no_grad():
        for batch in tqdm(loader, desc="Validating", leave=False):
            if batch is None:
                continue

            x_cat, x_num, y, lengths = batch
            x_num, y = x_num.to(device), y.to(device)
            # collate_fn already returns a tensor; as_tensor reuses it instead
            # of copy-constructing (which triggered a UserWarning) and still
            # accepts a plain list of ints.
            lengths = torch.as_tensor(lengths).to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            ndcg = ndcg_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            map3 = map_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)

            # NOTE(review): metrics are weighted by the batch size, not by the
            # number of groups that passed ``min_group_size``; a batch with no
            # eligible group therefore pulls the averages towards 0.
            batch_size = scores.size(0)
            total_hitrate += hitrate * batch_size
            total_ndcg += ndcg * batch_size
            total_map += map3 * batch_size
            total_loss += loss.item() * batch_size
            count += batch_size

    if count == 0:
        zero = torch.tensor(0.0, device=device)
        return float("inf"), zero, zero.clone(), zero.clone()

    return total_loss / count, total_hitrate / count, total_ndcg / count, total_map / count


def build_cat_dims(X, cat_features):
    """Return the embedding-table size for every categorical column.

    Args:
        X: Frame indexable by column name whose columns expose ``.max()``.
        cat_features: Names of the integer-coded categorical columns.

    Returns:
        Dict mapping each column name to ``max(code, 0) + 1`` as a plain
        ``int``. Columns without a usable maximum get size 1.
    """
    cat_dims = {}
    for c in cat_features:
        max_val = X[c].max()
        # An empty / all-null column has no maximum: the frame library hands
        # back None or NaN, neither of which can be compared or used as a size.
        if max_val is None or max_val != max_val or max_val < 0:
            max_val = 0
        cat_dims[c] = int(max_val) + 1
    return cat_dims


def normalize_numeric_features(X, numeric_features, train_mask):
    """Z-score the given columns using statistics of the training rows only.

    Args:
        X: polars DataFrame holding the columns.
        numeric_features: Names of the columns to standardise.
        train_mask: Boolean mask (one entry per row) selecting the rows the
            mean and standard deviation are estimated from.

    Returns:
        ``X`` with every listed column replaced by ``(value - mean) / std``.
        Non-finite entries are ignored when estimating the statistics, so a
        single NaN no longer turns the whole column into NaN; the non-finite
        entries themselves are left as they are. A zero or undefined spread
        falls back to ``std = 1``.
    """
    for col in numeric_features:
        values = X[col].to_numpy()
        train_vals = values[train_mask]
        finite_vals = train_vals[np.isfinite(train_vals)]

        if finite_vals.size:
            mean = finite_vals.mean()
            std = finite_vals.std()
        else:
            mean, std = 0.0, 1.0
        if not std > 0:
            std = 1.0

        norm_vals = (values - mean) / std
        # Replace one column at a time so only one extra column is alive.
        X = X.with_columns(pl.Series(col, norm_vals))
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Load the feature-engineered frame, building and caching it if needed.

    Args:
        data_dir: Directory containing the parquet files (and the cache).
        train_file: Raw training parquet file name inside ``data_dir``.
        test_file: Raw test parquet file name inside ``data_dir``.
        cache_file: File name of the cached feature-engineered frame.

    Returns:
        Tuple ``(df, test_ids, test_rankers)``: the feature-engineered frame
        (train rows followed by test rows when built here) and the ``Id`` /
        ``ranker_id`` columns of the raw test file.
    """
    cache_path = os.path.join(data_dir, cache_file)
    test_path = os.path.join(data_dir, test_file)

    if os.path.exists(cache_path):
        print(f"Loading cached feature-engineered data from {cache_path} ...")
        df = pl.read_parquet(cache_path)

        # Only the two identifier columns are needed on this path; projecting
        # them avoids materialising every raw test feature.
        test_keys = pl.read_parquet(test_path, columns=["Id", "ranker_id"])
        return df, test_keys["Id"], test_keys["ranker_id"]

    print("Cached file not found, processing raw data ...")

    train = pl.read_parquet(os.path.join(data_dir, train_file)).drop("__index_level_0__")
    test = (
        pl.read_parquet(test_path)
        .drop("__index_level_0__")
        # The test split has no label; add a dummy so the frames can be stacked.
        .with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))
    )

    test_ids = test["Id"]
    test_rankers = test["ranker_id"]

    df = pl.concat((train, test))
    df = feature_engineering(df, full=True)

    df = df.with_columns(
        [pl.col(c).fill_null(0) for c in df.select(pl.selectors.numeric()).columns] +
        [pl.col(c).fill_null("missing") for c in df.select(pl.selectors.string()).columns]
    )

    os.makedirs(data_dir, exist_ok=True)
    df.write_parquet(cache_path)
    print(f"Saved feature-engineered data to {cache_path}")

    return df, test_ids, test_rankers


def prepare_data(data_dir, train_size=18145372):
    """Build the model-ready feature frame from the (cached) engineered data.

    Args:
        data_dir: directory passed through to ``load_or_prepare_data``.
        train_size: number of leading rows that belong to the training set.
            Only these rows are used to compute normalization statistics.
            Defaults to the row count previously hard-coded in this function.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size, test_ids,
        test_rankers)``.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    # Re-attach the row identifier so predictions can be joined back later.
    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    # Encode every categorical column as zero-based dense-rank codes;
    # nulls become -1.
    X = X.with_columns(
        [
            (pl.col(c).rank("dense") - 1).fill_null(-1).cast(pl.Int32)
            for c in cat_features
        ]
    )

    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Patience-based early stopping for a metric that should increase.

    ``step`` returns True once the metric has failed to beat the best value
    seen so far by more than ``min_delta`` for ``patience`` consecutive calls.
    """

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric`` and report whether training should stop."""
        improved = metric > self.best + self.min_delta
        if improved:
            self.best, self.counter = metric, 0
        else:
            self.counter = self.counter + 1
        should_stop = self.counter >= self.patience
        return should_stop


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scale histogram of the number of rows per group.

    Args:
        dataset: object exposing ``group_to_indices`` (group -> row indices).
        save_path: where to save the figure; when falsy the figure is shown
            interactively instead.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.title("Distribution of Group Sizes")
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def _save_curve(epochs, series, ylabel, title, save_path):
    """Plot ``(label, values, color)`` series against ``epochs`` and save the figure."""
    plt.figure()
    for label, values, color in series:
        style = {} if color is None else {"color": color}
        plt.plot(epochs, values, label=label, **style)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.savefig(save_path)
    plt.close()


def _predict_scores(model, loader, device, desc):
    """Return the un-padded scores of every group yielded by ``loader``, concatenated in loader order."""
    chunks = []
    with torch.no_grad():
        for batch in tqdm(loader, desc=desc):
            # The training loop in this notebook skips batches that the
            # collate function returns as None; do the same here instead of
            # failing on tuple unpacking.
            if batch is None:
                continue
            x_cat, x_num, _, lengths = batch
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)
            for i in range(scores.size(0)):
                chunks.append(scores[i, :lengths[i]].cpu().numpy())
    return np.concatenate(chunks) if chunks else np.empty(0, dtype=np.float32)


def _selected_rank_expr():
    """Polars expression ranking ``score`` within each ``ranker_id`` (1 = best) as ``selected``."""
    return (
        pl.col("score")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )


def main():
    """Train, validate and predict with ``FiBiDLRanker`` once per learning rate.

    For every learning rate the run writes into ``model_<run_name>/``: the
    best checkpoint, a validation-group summary, the group-size histogram,
    the metric curves and the full validation predictions. The test-set
    ranking is written to ``submission/submission_dl_ranker.csv`` (last run
    wins, as before) and additionally to a per-run file so that earlier runs
    are not lost.
    """
    DATA_DIR = "./data"
    SUBMIT_DIR = "submission"
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    learning_rates_to_try = [7e-4, 5e-4]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Data preparation does not depend on the learning rate, so it is done
    # once here instead of being repeated for every run.
    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Fill missing values
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    if float_cols:
        X = X.with_columns(
            [
                pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
                for col in float_cols
            ]
        )

    n1 = 16487352
    # n1 = 17487300
    n2 = train_size

    # Row-position split: [0, n1) train, [n1, n2) validation, [n2, end) test.
    X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
    y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
    groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

    val_ids = X_va["Id"]
    val_rankers = groups_va["ranker_id"]
    val_labels = y_va

    # Row positions (within the validation slice) of every validation group.
    group_to_rows = defaultdict(list)
    for i, g in enumerate(val_rankers.to_numpy()):
        group_to_rows[g].append(i)

    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    for lr in learning_rates_to_try:
        run_name = f"lr_{lr:.0e}".replace("-", "")
        MODEL_DIR = f"model_{run_name}"
        os.makedirs(MODEL_DIR, exist_ok=True)

        print(f"\n\n🚀 Starting training with learning rate = {lr}\nSaved to: {MODEL_DIR}")

        best_val_loss = float("inf")
        best_hitrate = 0.0
        checkpoint_saved = False
        model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

        train_losses = []
        val_losses = []
        val_hitrates = []
        val_ndcgs = []
        val_maps = []
        learning_rates = []

        early_stopper = EarlyStopping(patience=3, min_delta=0.00005)

        print("\n📋 Validation group statistics:")
        # Save detailed group info to file
        group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
        with open(group_info_path, "w") as f:
            f.write(f"Total groups: {len(group_to_rows)}\n\n")
            f.write("GroupID\tSize\tStartIdx\tEndIdx\n")
            for group_id, indices in sorted(group_to_rows.items()):
                f.write(f"{group_id}\t{len(indices)}\t{indices[0]}\t{indices[-1]}\n")

        print(f"📁 Saved validation group info to {group_info_path}")

        plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
        print("📊 Group size distribution saved.")

        train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
        val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
        test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

        print("✅ Data loaders created.")

        num_epochs = 10
        num_training_steps = num_epochs * len(train_loader)
        num_warmup_steps = int(0.1 * num_training_steps)

        model = FiBiDLRanker(
            cat_dims=cat_dims,
            num_numeric_feats=num_numeric_feats,
            emb_dim=48,
            hidden=[1024, 512, 256, 128, 64]
        ).to(device)

        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)

        scheduler = get_scheduler(
            "linear",
            optimizer=optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps
        )

        for epoch in range(num_epochs):
            print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
            start_time = time.time()

            train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
            val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

            # Work with plain floats from here on (the metrics arrive as
            # 0-dim tensors).
            val_hitrate = float(val_hitrate)
            val_ndcg = float(val_ndcg)
            val_map = float(val_map)

            train_losses.append(train_loss)
            val_losses.append(val_loss)
            val_hitrates.append(val_hitrate)
            val_ndcgs.append(val_ndcg)
            val_maps.append(val_map)

            current_lr = scheduler.get_last_lr()
            learning_rates.append(current_lr[0])

            elapsed = time.time() - start_time
            print(
                f"Epoch {epoch+1}: "
                f"Train Loss={train_loss:.4f}, "
                f"Val Loss={val_loss:.4f}, "
                f"HitRate@3={val_hitrate:.4f}, "
                f"NDCG@3={val_ndcg:.4f}, "
                f"MAP@3={val_map:.4f}, "
                f"LR={current_lr[0]:.6f} "
                f"({elapsed:.1f}s)"
            )

            # A checkpoint is kept only when both criteria improve.
            if val_loss < best_val_loss and val_hitrate > best_hitrate:
                best_val_loss = val_loss
                best_hitrate = val_hitrate
                torch.save(model.state_dict(), model_path)
                checkpoint_saved = True
                print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

            # EarlyStopping maximises its metric, hence the negated loss.
            if early_stopper.step(-val_loss):
                print("⛔ Early stopping triggered.")
                break

        # Plot metrics
        epochs = list(range(1, len(train_losses) + 1))

        _save_curve(
            epochs,
            [("Train Loss", train_losses, None), ("Val Loss", val_losses, None)],
            "Loss",
            "Training & Validation Loss",
            os.path.join(MODEL_DIR, "loss_curve.png"),
        )
        _save_curve(
            epochs,
            [("Val HitRate@3", val_hitrates, "green")],
            "HitRate@3",
            "Validation HitRate@3",
            os.path.join(MODEL_DIR, "hitrate_curve.png"),
        )
        _save_curve(
            epochs,
            [("Learning Rate", learning_rates, "orange")],
            "LR",
            "Learning Rate Schedule",
            os.path.join(MODEL_DIR, "lr_curve.png"),
        )
        _save_curve(
            epochs,
            [("Val NDCG@3", val_ndcgs, "purple")],
            "NDCG@3",
            "Validation NDCG@3",
            os.path.join(MODEL_DIR, "ndcg_curve.png"),
        )
        _save_curve(
            epochs,
            [("Val MAP@3", val_maps, "red")],
            "MAP@3",
            "Validation MAP@3",
            os.path.join(MODEL_DIR, "map_curve.png"),
        )

        print(f"📊 Saved metric plots to {MODEL_DIR}/")

        # Predict with the best checkpoint of THIS run. If no epoch ever
        # satisfied the save criterion there is nothing (or only a stale file
        # from an earlier execution) on disk, so keep the final weights and
        # persist them instead.
        if checkpoint_saved:
            model.load_state_dict(torch.load(model_path, map_location=device))
        else:
            print("⚠️ No checkpoint was saved during this run; using the final weights.")
            torch.save(model.state_dict(), model_path)
        model.eval()

        # NOTE(review): scores are concatenated in loader (group) order and
        # attached to rows by position. This assumes the rows of each
        # ranker_id are contiguous and that the loader yields every row —
        # TODO confirm; the length checks below only catch dropped rows.
        all_val_scores = _predict_scores(model, val_loader, device, "Predicting validation set")

        if len(all_val_scores) != X_va.shape[0]:
            raise ValueError(f"Mismatch: {len(all_val_scores)} scores vs {X_va.shape[0]} rows")

        val_df = X_va.with_columns([
            pl.Series("Id", val_ids),
            pl.Series("ranker_id", val_rankers),
            pl.Series("score", all_val_scores),
            pl.Series("label", val_labels)
        ])

        val_df = val_df.with_columns(_selected_rank_expr())

        val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
        val_df.write_csv(val_save_path)
        print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

        all_scores = _predict_scores(model, test_loader, device, "Predicting test set")

        if len(all_scores) != X_te.shape[0]:
            raise ValueError(f"Mismatch: {len(all_scores)} scores vs {X_te.shape[0]} rows in X_te")

        submission_df = pl.DataFrame({
            "Id": test_ids,
            "ranker_id": test_rankers,
            "score": all_scores
        })

        submission = (
            submission_df
            .with_columns(_selected_rank_expr())
            .select(["Id", "ranker_id", "score", "selected"])
        )

        submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
        submission.write_csv(submission_path)
        print(f"✅ Submission saved to {submission_path}")

        # The shared path above is overwritten by every learning rate; keep a
        # per-run copy as well so that all runs remain available.
        run_submission_path = os.path.join(SUBMIT_DIR, f"submission_dl_ranker_{run_name}.csv")
        submission.write_csv(run_submission_path)
        print(f"✅ Per-run submission saved to {run_submission_path}")


# Run the training / prediction pipeline only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()

Overwriting deeprec_validate.py


In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import torch.nn as nn
import torch.nn.functional as F_torch
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter

class RankDataset(Dataset):
    """Group-wise ranking dataset: one item is all rows sharing a ``ranker_id``.

    Args:
        X: feature frame; must contain every column named in
            ``cat_features`` and ``num_features``.
        y: per-row labels.
        groups: frame with a ``ranker_id`` column aligned with ``X``.
        cat_features: names of the categorical (integer-coded) columns.
        num_features: names of the numeric columns.

    ``__getitem__`` returns ``(x_cat, x_num, y, length)`` where ``x_cat`` maps
    each categorical column to a ``LongTensor`` of shape ``[length]``,
    ``x_num`` is a ``FloatTensor`` of shape ``[length, len(num_features)]``,
    ``y`` is a ``FloatTensor`` of shape ``[length]`` and ``length`` is the
    number of rows in the group.
    """

    # The polars annotations are quoted so they are not evaluated when the
    # class is defined.
    def __init__(
        self,
        X: "pl.DataFrame",
        y: "pl.Series",
        groups: "pl.DataFrame",
        cat_features: list[str],
        num_features: list[str],
    ):
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # Row positions of every group, in order of first appearance.
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Number of groups (not rows)."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        g = self.group_keys[idx]
        inds = self.group_to_indices[g]
        length = len(inds)

        # Negative codes (used for missing categories) are mapped to index 0.
        # Done with one vectorised numpy call per column instead of a Python
        # loop over every row of the group.
        x_cat = {}
        for c in self.cat_features:
            codes = self.X_cat[c][inds]
            codes = np.where(codes >= 0, codes, 0).astype(np.int64)
            x_cat[c] = torch.from_numpy(codes)

        x_num = torch.as_tensor(self.X_num[inds], dtype=torch.float32)
        y = torch.as_tensor(self.y[inds], dtype=torch.float32).reshape(-1)

        return x_cat, x_num, y, length

def collate_fn(batch):
    """Pad a list of ``(x_cat, x_num, y, length)`` groups into batch tensors.

    Groups with fewer than two rows are dropped. Returns ``None`` when no
    group is left; otherwise ``(x_cat, x_num, y, lengths)`` where every tensor
    is zero-padded along the item axis to the longest remaining group and
    ``lengths`` holds the true group sizes as a ``LongTensor``.
    """
    kept = [item for item in batch if item[3] >= 2]
    if not kept:
        return None

    pad = torch.nn.utils.rnn.pad_sequence

    field_names = list(kept[0][0].keys())
    padded_x_cat = {
        name: pad([item[0][name] for item in kept], batch_first=True)
        for name in field_names
    }
    padded_x_num = pad([item[1] for item in kept], batch_first=True)
    padded_y = pad([item[2] for item in kept], batch_first=True)
    lengths = torch.tensor([item[3] for item in kept], dtype=torch.long)

    return padded_x_cat, padded_x_num, padded_y, lengths


class SelfAttentionLayer(nn.Module):
    """Multi-head self-attention over the field axis followed by mean pooling.

    Input ``x`` is ``[B, F, D]``; the output is ``[B, D]``.
    """

    def __init__(self, emb_dim, n_heads=4):
        super().__init__()
        self.attention = nn.MultiheadAttention(emb_dim, n_heads, batch_first=True)

    def forward(self, x):
        # Queries, keys and values are all the same sequence of field embeddings.
        attended = self.attention(x, x, x)[0]
        # Average over the field axis: [B, F, D] -> [B, D].
        return torch.mean(attended, dim=1)


class FiBiDLRanker(nn.Module):
    """Group-wise ranking network combining a linear part and a deep part.

    The linear part sums one scalar embedding per categorical field with a
    linear projection of the numeric features. The deep part embeds every
    categorical field, optionally re-weights the embeddings with an SENet-style
    gate, pools them with ``SelfAttentionLayer``, concatenates the numeric
    features and feeds the result through an MLP. The final score is the sum
    of both parts.

    Args:
        cat_dims: mapping field name -> number of embedding rows.
        num_numeric_feats: number of numeric features per item.
        emb_dim: embedding size of every categorical field.
        hidden: widths of the MLP layers.
        use_senet: whether to apply the SENet gate to the field embeddings.

    NOTE(review): all embedding tables use ``padding_idx=0`` and are then
    fully re-initialised with Xavier-uniform, so row 0 is not zero; whether
    index 0 is meant to be a real category or pure padding should be
    confirmed against the encoding used by the caller.
    """

    def __init__(self, cat_dims, num_numeric_feats, emb_dim=64, hidden=[512, 256, 128, 64], use_senet=True):
        super().__init__()
        self.cat_fields = list(cat_dims.keys())
        self.emb_dim = emb_dim
        self.num_fields = len(self.cat_fields)
        self.use_senet = use_senet

        # Dense embeddings (deep part) and scalar embeddings (linear part),
        # one table per categorical field.
        dense_tables = {}
        for field in self.cat_fields:
            dense_tables[field] = nn.Embedding(cat_dims[field], emb_dim, padding_idx=0)
        self.embeddings = nn.ModuleDict(dense_tables)

        scalar_tables = {}
        for field in self.cat_fields:
            scalar_tables[field] = nn.Embedding(cat_dims[field], 1, padding_idx=0)
        self.linear_cat = nn.ModuleDict(scalar_tables)

        self.linear_num = nn.Linear(num_numeric_feats, 1)

        # Xavier initialisation for all embedding tables and the numeric projection.
        for table in (*self.embeddings.values(), *self.linear_cat.values()):
            nn.init.xavier_uniform_(table.weight)
        nn.init.xavier_uniform_(self.linear_num.weight)

        # SENet gate (optional): squeeze to a quarter of the width, then expand back.
        if self.use_senet:
            flat_width = self.num_fields * emb_dim
            squeezed_width = flat_width // 4
            self.se_fc1 = nn.Linear(flat_width, squeezed_width)
            self.se_ln1 = nn.LayerNorm(squeezed_width)
            self.se_dropout = nn.Dropout(0.1)
            self.se_fc2 = nn.Linear(squeezed_width, flat_width)

        # Self-attention pooling over the field embeddings.
        self.attention = SelfAttentionLayer(emb_dim, n_heads=4)

        # MLP: Linear -> LayerNorm -> ReLU -> Dropout for every hidden width.
        widths = [emb_dim + num_numeric_feats, *hidden]
        blocks = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            blocks += [
                nn.Linear(fan_in, fan_out),
                nn.LayerNorm(fan_out),
                nn.ReLU(),
                nn.Dropout(0.2),
            ]
        self.mlp = nn.Sequential(*blocks)
        self.output = nn.Linear(widths[-1], 1)

    def senet(self, embs):
        """Re-weight ``embs`` (``[N, F, D]``) element-wise with a learned sigmoid gate."""
        n_rows, n_fields, dim = embs.shape
        flat = embs.view(n_rows, -1)
        squeezed = self.se_dropout(F_torch.relu(self.se_ln1(self.se_fc1(flat))))
        gate = torch.sigmoid(self.se_fc2(squeezed)).view(n_rows, n_fields, dim)
        return embs * gate

    def forward(self, x_cat, x_num):
        """Score every item.

        ``x_cat`` maps each field to ``[B, G]`` indices, ``x_num`` is
        ``[B, G, num_numeric_feats]``; returns scores of shape ``[B, G]``.
        """
        n_batch, n_items = x_num.shape[:2]
        n_fields, dim = self.num_fields, self.emb_dim

        field_embs = [self.embeddings[f](x_cat[f]) for f in self.cat_fields]
        embs = torch.stack(field_embs, dim=2)  # [B, G, F, D]

        # Linear (first-order) part.
        scalar_terms = [self.linear_cat[f](x_cat[f]).squeeze(-1) for f in self.cat_fields]
        first_order_cat = torch.stack(scalar_terms, dim=0).sum(dim=0)  # [B, G]
        first_order = first_order_cat + self.linear_num(x_num).squeeze(-1)

        # Deep part: every item of every group is processed independently.
        embs = embs.view(n_batch * n_items, n_fields, dim)
        if self.use_senet:
            embs = self.senet(embs)

        pooled = self.attention(embs)  # [B*G, D]
        numeric_flat = x_num.view(n_batch * n_items, -1)
        deep_in = torch.cat([pooled, numeric_flat], dim=1)
        deep_scores = self.output(self.mlp(deep_in)).view(n_batch, n_items)

        return deep_scores + first_order



"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Fraction of eligible groups with at least one positive label in the top-k scores.

    Groups shorter than ``min_group_size`` are ignored. Returns a 0-dim tensor
    (``0.0`` when no group is eligible).
    """
    hit_total = 0
    n_scored = 0
    for row in range(scores.size(0)):
        n_items = lengths[row]
        if n_items >= min_group_size:
            group_scores = scores[row][:n_items]
            group_labels = labels[row][:n_items]
            top_idx = torch.topk(group_scores, min(n_items, k)).indices
            found = (group_labels[top_idx] > 0).any()
            hit_total += found.float()
            n_scored += 1
    if n_scored > 0:
        return hit_total / n_scored
    return torch.tensor(0.0, device=scores.device)


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@k over the groups with at least ``min_group_size`` items.

    Args:
        scores: ``[B, G]`` predicted scores (padded).
        labels: ``[B, G]`` relevance labels (padded).
        lengths: true size of every group.
        k: cut-off rank.
        min_group_size: groups shorter than this are ignored.

    Returns:
        A 0-dim tensor on ``scores.device``. The result is always a tensor,
        also when every eligible group has no relevant item (previously a
        plain Python float was returned in that case, which breaks callers
        that use ``.item()``).
    """
    total_ndcg = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        l = int(lengths[i])
        if l < min_group_size:
            continue

        s = scores[i][:l]
        y = labels[i][:l]

        top = min(k, l)
        idx_pred = torch.topk(s, top).indices
        idx_ideal = torch.topk(y, top).indices

        # Position discounts log2(rank + 1) for ranks 1..top.
        discounts = torch.log2(torch.arange(2, 2 + top, device=y.device).float())
        dcg = (y[idx_pred] / discounts).sum()
        idcg = (y[idx_ideal] / discounts).sum()

        # A group without any relevant item contributes an NDCG of 0.
        if idcg > 0:
            total_ndcg = total_ndcg + dcg / idcg
        valid_count += 1

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ndcg / valid_count


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision at ``k`` over the eligible groups.

    Args:
        scores: ``[B, G]`` predicted scores (padded).
        labels: ``[B, G]`` relevance labels (padded); ``> 0`` means relevant.
        lengths: true size of every group.
        k: cut-off rank.
        min_group_size: groups shorter than this are ignored.

    Returns:
        A 0-dim tensor on ``scores.device``.

    A group whose top-k contains no relevant item counts with an average
    precision of 0. Such groups used to be dropped from the mean, which
    inflated the reported MAP.
    """
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        l = int(lengths[i])
        if l < min_group_size:
            continue

        s = scores[i][:l]
        y = labels[i][:l]

        idx_pred = torch.topk(s, min(k, l)).indices
        hits = (y[idx_pred] > 0).float()

        valid_count += 1
        n_hits = hits.sum()
        if n_hits == 0:
            continue  # AP = 0 for this group

        # precision@j for every rank j, averaged over the ranks that are hits.
        ranks = torch.arange(1, hits.numel() + 1, device=hits.device).float()
        precision_at_rank = hits.cumsum(dim=0) / ranks
        total_ap = total_ap + (precision_at_rank * hits).sum() / n_hits

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ap / valid_count



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise RankNet loss summed over all ordered pairs, averaged over groups.

    For every group the loss adds ``-logsigmoid(s_i - s_j)`` for each pair
    ``(i, j)`` with ``y_i > y_j``; the grand total is divided by the number
    of groups in the batch.
    """
    n_groups = scores.size(0)
    loss_sum = 0.0

    for row in range(n_groups):
        n_items = lengths[row]
        group_scores = scores[row][:n_items]
        group_labels = labels[row][:n_items]

        # 1.0 where item i should be ranked above item j, else 0.0.
        should_win = (group_labels.unsqueeze(1) > group_labels.unsqueeze(0)).float()
        margin_matrix = group_scores.unsqueeze(1) - group_scores.unsqueeze(0)

        per_pair = -should_win * torch.nn.functional.logsigmoid(margin_matrix)
        loss_sum += per_pair.sum()

    return loss_sum / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """
    Pairwise logistic loss restricted to pairs that touch the top-K items
    (top-K is determined by the ground-truth labels, not by the scores).

    A pair ``(i, j)`` contributes when ``y_i > y_j`` and at least one of the
    two items is among the ``focus_topk`` highest labels of its group. Each
    pair is weighted by ``|y_i - y_j|``; the score difference is reduced by
    ``margin`` (if positive) and divided by ``temperature``. The result is the
    weighted sum divided by the total weight, or a zero tensor that requires
    grad when the batch holds no valid pair.
    """
    loss_sum = 0.0
    weight_sum = 0

    for row in range(scores.size(0)):
        n_items = lengths[row]
        if n_items < 2:
            continue

        group_scores = scores[row][:n_items]
        group_labels = labels[row][:n_items]

        # Mark the items holding the top-K ground-truth labels.
        in_topk = torch.zeros_like(group_labels, dtype=torch.bool)
        in_topk[torch.topk(group_labels, min(focus_topk, n_items)).indices] = True

        label_gap = group_labels.unsqueeze(1) - group_labels.unsqueeze(0)
        touches_topk = in_topk.unsqueeze(1) | in_topk.unsqueeze(0)
        eligible = (label_gap > 0) & touches_topk

        if eligible.sum() == 0:
            continue

        pair_weight = (label_gap.abs() * eligible).float()

        score_gap = group_scores.unsqueeze(1) - group_scores.unsqueeze(0)
        if margin > 0.0:
            score_gap = score_gap - margin
        score_gap = score_gap / temperature

        loss_sum += (-F_torch.logsigmoid(score_gap) * pair_weight).sum()
        weight_sum += pair_weight.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Batches delivered as ``None`` are skipped. The scheduler is stepped once
    per optimiser step. Returns ``inf`` when no batch was processed.
    """
    model.train()
    running_loss, steps = 0.0, 0

    for batch in tqdm(loader, desc="Training", leave=False):
        if batch is None:
            continue

        cat_inputs, num_inputs, targets, group_lengths = batch

        num_inputs = num_inputs.to(device)
        targets = targets.to(device)
        group_lengths = group_lengths.to(device)
        cat_inputs = {name: tensor.to(device) for name, tensor in cat_inputs.items()}

        optimizer.zero_grad()
        predictions = model(cat_inputs, num_inputs)
        batch_loss = xranknet_loss(
            predictions, targets, group_lengths, temperature=1.0, margin=0.2, focus_topk=3
        )
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()
        steps += 1

    if steps == 0:
        return float("inf")
    return running_loss / steps



def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader``.

    Args:
        model: ranking model called as ``model(x_cat, x_num)``.
        loader: yields ``(x_cat, x_num, y, lengths)`` batches, or ``None`` for
            batches that the collate function emptied.
        device: device the batch tensors are moved to.
        min_group_size: groups shorter than this do not count towards the
            ranking metrics.

    Returns:
        ``(loss, hitrate@3, ndcg@3, map@3)``. The loss is a float (``inf`` if
        the loader produced no batch); the three metrics are 0-dim tensors.

    The ranking metrics are averaged over the eligible groups themselves.
    Weighting the per-batch means by the full batch size, as was done before,
    counted ineligible groups as misses and biased all metrics downwards.
    """
    model.eval()
    total_hitrate = torch.zeros((), device=device)
    total_ndcg = torch.zeros((), device=device)
    total_map = torch.zeros((), device=device)
    total_loss = 0.0
    count = 0      # groups contributing to the loss
    eligible = 0   # groups contributing to the ranking metrics

    with torch.no_grad():
        for batch in tqdm(loader, desc="Validating", leave=False):
            # collate_fn returns None when every group of a batch was dropped.
            if batch is None:
                continue

            x_cat, x_num, y, lengths = batch
            x_num, y = x_num.to(device), y.to(device)
            # as_tensor avoids the copy-construct warning of torch.tensor(tensor).
            lengths = torch.as_tensor(lengths).to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)

            batch_size = scores.size(0)
            total_loss += loss.item() * batch_size
            count += batch_size

            # Score every eligible group on its own so that each one carries
            # exactly the same weight in the final averages.
            for i in torch.nonzero(lengths >= min_group_size).flatten().tolist():
                s_i, y_i, l_i = scores[i:i + 1], y[i:i + 1], lengths[i:i + 1]
                total_hitrate = total_hitrate + hitrate_at_k(s_i, y_i, l_i, k=3, min_group_size=min_group_size)
                total_ndcg = total_ndcg + ndcg_at_k(s_i, y_i, l_i, k=3, min_group_size=min_group_size)
                total_map = total_map + map_at_k(s_i, y_i, l_i, k=3, min_group_size=min_group_size)
                eligible += 1

    mean_loss = total_loss / count if count > 0 else float("inf")
    denom = max(eligible, 1)  # metrics stay 0 when no group was eligible
    return mean_loss, total_hitrate / denom, total_ndcg / denom, total_map / denom


def build_cat_dims(X, cat_features):
    """Map every categorical column to its embedding-table size.

    The size is the column maximum plus one; a negative maximum is treated
    as 0, so the size is never smaller than 1.
    """
    return {name: max(X[name].max(), 0) + 1 for name in cat_features}


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardize numeric columns using statistics of the training rows only.

    Args:
        X: polars DataFrame holding the columns to normalize.
        numeric_features: names of the columns to standardize.
        train_mask: boolean numpy mask over the rows of ``X`` selecting the
            rows whose mean / standard deviation are used.

    Returns:
        ``X`` with every listed column replaced by ``(value - mean) / std``.

    The statistics ignore NaN entries. With plain ``mean()`` / ``std()`` a
    single NaN among the training rows turns the mean into NaN and therefore
    the whole normalized column into NaN. Rows that are NaN stay NaN and can
    be imputed by the caller afterwards.
    """
    for col in numeric_features:
        values = X[col].to_numpy()
        train_vals = values[train_mask]
        mean = np.nanmean(train_vals)
        std = np.nanstd(train_vals)
        # Constant (std == 0) or all-NaN (std is NaN) columns: divide by 1.
        if not std > 0:
            std = 1.0
        X = X.with_columns(pl.Series(col, (values - mean) / std))
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Load the feature-engineered train+test frame, building the cache if needed.

    Args:
        data_dir: directory containing the raw parquet files and the cache.
        train_file: file name of the raw training parquet inside ``data_dir``.
        test_file: file name of the raw test parquet inside ``data_dir``.
        cache_file: file name of the cached feature-engineered parquet.

    Returns:
        ``(df, test_ids, test_rankers)`` where ``df`` is the feature-engineered
        concatenation of train and test, and the other two are the ``Id`` and
        ``ranker_id`` columns of the raw test file.
    """
    cache_path = os.path.join(data_dir, cache_file)
    test_path = os.path.join(data_dir, test_file)

    if os.path.exists(cache_path):
        print(f"Loading cached feature-engineered data from {cache_path} ...")
        df = pl.read_parquet(cache_path)

        # Only the two key columns are needed here, so project them at read
        # time instead of loading (and then discarding) the whole test file.
        test_keys = pl.read_parquet(test_path, columns=["Id", "ranker_id"])
        return df, test_keys["Id"], test_keys["ranker_id"]

    print("Cached file not found, processing raw data ...")

    train = pl.read_parquet(os.path.join(data_dir, train_file)).drop("__index_level_0__")
    # The test set has no label; add a dummy "selected" column so that it can
    # be concatenated with the training frame.
    test = (
        pl.read_parquet(test_path)
        .drop("__index_level_0__")
        .with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))
    )

    test_ids = test["Id"]
    test_rankers = test["ranker_id"]

    df = pl.concat((train, test))
    df = feature_engineering(df, full=True)

    # Replace nulls: 0 for numeric columns, the literal "missing" for strings.
    df = df.with_columns(
        [pl.col(c).fill_null(0) for c in df.select(pl.selectors.numeric()).columns] +
        [pl.col(c).fill_null("missing") for c in df.select(pl.selectors.string()).columns]
    )

    os.makedirs(data_dir, exist_ok=True)
    df.write_parquet(cache_path)
    print(f"Saved feature-engineered data to {cache_path}")

    return df, test_ids, test_rankers


def prepare_data(data_dir, train_size=18145372):
    """Build the model-ready feature frame from the (cached) engineered data.

    Args:
        data_dir: directory passed through to ``load_or_prepare_data``.
        train_size: number of leading rows that belong to the training set.
            Only these rows are used to compute normalization statistics.
            Defaults to the row count previously hard-coded in this function.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size, test_ids,
        test_rankers)``.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    # Re-attach the row identifier so predictions can be joined back later.
    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    # Encode every categorical column as zero-based dense-rank codes;
    # nulls become -1.
    X = X.with_columns(
        [
            (pl.col(c).rank("dense") - 1).fill_null(-1).cast(pl.Int32)
            for c in cat_features
        ]
    )

    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Patience-based early stopping for a metric that should increase.

    ``step`` returns True once the metric has failed to beat the best value
    seen so far by more than ``min_delta`` for ``patience`` consecutive calls.
    """

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric`` and report whether training should stop."""
        improved = metric > self.best + self.min_delta
        if improved:
            self.best, self.counter = metric, 0
        else:
            self.counter = self.counter + 1
        should_stop = self.counter >= self.patience
        return should_stop


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scale histogram of the number of rows per group.

    Args:
        dataset: object exposing ``group_to_indices`` (group -> row indices).
        save_path: where to save the figure; when falsy the figure is shown
            interactively instead.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.title("Distribution of Group Sizes")
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def main():
    """Train, validate and predict once per candidate learning rate.

    For every learning rate in ``learning_rates_to_try`` this function:
    prepares the data, splits it by row position into train / validation /
    test, trains a ``FiBiDLRanker`` with AdamW and a linear warmup schedule,
    checkpoints the model, plots the metric curves, writes the scored
    validation rows to CSV and finally writes a submission CSV.

    Artefacts of a run go to ``model_<run_name>/``; the submission goes to
    ``submission/submission_dl_ranker.csv``.
    """
    DATA_DIR = "./data"
    SUBMIT_DIR = "submission"
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    learning_rates_to_try = [1e-3, 7e-4]

    for lr in learning_rates_to_try:
        # The "-" of the exponent is stripped, e.g. 1e-3 -> "lr_1e03".
        run_name = f"lr_{lr:.0e}".replace("-", "")
        MODEL_DIR = f"model_{run_name}"
        os.makedirs(MODEL_DIR, exist_ok=True)

        print(f"\n\n🚀 Starting training with learning rate = {lr}\nSaved to: {MODEL_DIR}")

        best_val_loss = float("inf")
        best_hitrate = 0.0
        model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Per-epoch history, used for the plots after training.
        train_losses = []
        val_losses = []
        val_hitrates = []
        val_ndcgs = []
        val_maps = []
        learning_rates = []

        # Both directories were already created above; these calls are no-ops.
        os.makedirs(MODEL_DIR, exist_ok=True)
        os.makedirs(SUBMIT_DIR, exist_ok=True)

        early_stopper = EarlyStopping(patience=3, min_delta=0.00005)

        # NOTE(review): the data is re-loaded and re-processed for every
        # learning rate although nothing in this call depends on `lr`;
        # consider hoisting it above the loop.
        X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

        # Fill missing values
        # Only Float64 columns are checked; NaN entries become 0.
        float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
        for col in float_cols:
            X = X.with_columns(
                pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
            )

        # Positional split: rows [0, n1) train, [n1, n2) validation, [n2, end) test.
        # NOTE(review): n1 is a fixed row offset — presumably chosen so that no
        # ranker_id group straddles the boundary; confirm against the data.
        n1 = 16487352
        # n1 = 17487300
        n2 = train_size

        X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
        y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
        groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

        # Inspect group sizes in validation set
        val_rankers = groups_va["ranker_id"].to_numpy()
        val_group_counts = Counter(val_rankers)

        print("\n📋 Validation group statistics:")
        # Save detailed group info to file
        group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
        with open(group_info_path, "w") as f:
            f.write(f"Total groups: {len(val_group_counts)}\n\n")
            f.write("GroupID\tSize\tStartIdx\tEndIdx\n")

            # Row positions are relative to the start of the validation slice.
            group_to_rows = defaultdict(list)
            for i, g in enumerate(val_rankers):
                group_to_rows[g].append(i)

            for group_id, indices in sorted(group_to_rows.items()):
                start_idx = indices[0]
                end_idx = indices[-1]
                size = len(indices)
                f.write(f"{group_id}\t{size}\t{start_idx}\t{end_idx}\n")

        print(f"📁 Saved validation group info to {group_info_path}")


        # Embedding table sizes are derived from the full frame (train + val + test).
        cat_dims = build_cat_dims(X, cat_features)
        num_numeric_feats = len(num_features)

        train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
        val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
        test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

        plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
        print("📊 Group size distribution saved.")

        # One dataset item is a whole group, so batch_size counts groups, not rows.
        # Validation and test loaders are not shuffled; the prediction code
        # below relies on that to line scores up with the original rows.
        train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
        val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
        test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

        print("✅ Data loaders created.")

        # The scheduler is stepped once per batch, so its horizon is measured
        # in optimizer steps; the first 10% of them are warmup.
        num_epochs = 10
        num_training_steps = num_epochs * len(train_loader)
        num_warmup_steps = int(0.1 * num_training_steps)

        model = FiBiDLRanker(
            cat_dims=cat_dims,
            num_numeric_feats=num_numeric_feats,
            emb_dim=16,
            hidden=[256, 128]
        ).to(device)

        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)

        scheduler = get_scheduler(
            "linear",
            optimizer=optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps
        )

        best_hitrate = 0
        for epoch in range(num_epochs):
            print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
            start_time = time.time()

            train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
            val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

            # The ranking metrics are expected to be tensors here (`.item()`).
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            val_hitrates.append(val_hitrate.item())
            val_ndcgs.append(val_ndcg.item())
            val_maps.append(val_map.item())

            # Learning rate as it stands after this epoch's scheduler steps.
            current_lr = scheduler.get_last_lr()
            learning_rates.append(current_lr[0])

            elapsed = time.time() - start_time
            print(
                f"Epoch {epoch+1}: "
                f"Train Loss={train_loss:.4f}, "
                f"Val Loss={val_loss:.4f}, "
                f"HitRate@3={val_hitrate:.4f}, "
                f"NDCG@3={val_ndcg:.4f}, "
                f"MAP@3={val_map:.4f}, "
                f"LR={current_lr[0]:.6f} "
                f"({elapsed:.1f}s)"
            )


            # A checkpoint is written only when BOTH validation loss and
            # hit rate beat their previous best values.
            if val_loss < best_val_loss and val_hitrate > best_hitrate:
                best_val_loss = val_loss
                best_hitrate = val_hitrate
                torch.save(model.state_dict(), model_path)
                print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

            # EarlyStopping treats larger values as better, hence the negated loss.
            if early_stopper.step(-val_loss):
                print("⛔ Early stopping triggered.")
                break

        # Plot metrics
        epochs = list(range(1, len(train_losses) + 1))

        # Plot Loss Curve
        plt.figure()
        plt.plot(epochs, train_losses, label="Train Loss")
        plt.plot(epochs, val_losses, label="Val Loss")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.title("Training & Validation Loss")
        plt.legend()
        plt.grid(True)
        plt.savefig(os.path.join(MODEL_DIR, "loss_curve.png"))
        plt.close()

        # Plot HitRate@3
        plt.figure()
        plt.plot(epochs, val_hitrates, label="Val HitRate@3", color="green")
        plt.xlabel("Epoch")
        plt.ylabel("HitRate@3")
        plt.title("Validation HitRate@3")
        plt.legend()
        plt.grid(True)
        plt.savefig(os.path.join(MODEL_DIR, "hitrate_curve.png"))
        plt.close()

        # Plot Learning Rate
        plt.figure()
        plt.plot(epochs, learning_rates, label="Learning Rate", color="orange")
        plt.xlabel("Epoch")
        plt.ylabel("LR")
        plt.title("Learning Rate Schedule")
        plt.legend()
        plt.grid(True)
        plt.savefig(os.path.join(MODEL_DIR, "lr_curve.png"))
        plt.close()

        # Plot NDCG@3
        plt.figure()
        plt.plot(epochs, val_ndcgs, label="Val NDCG@3", color="purple")
        plt.xlabel("Epoch")
        plt.ylabel("NDCG@3")
        plt.title("Validation NDCG@3")
        plt.legend()
        plt.grid(True)
        plt.savefig(os.path.join(MODEL_DIR, "ndcg_curve.png"))
        plt.close()

        # Plot MAP@3
        plt.figure()
        plt.plot(epochs, val_maps, label="Val MAP@3", color="red")
        plt.xlabel("Epoch")
        plt.ylabel("MAP@3")
        plt.title("Validation MAP@3")
        plt.legend()
        plt.grid(True)
        plt.savefig(os.path.join(MODEL_DIR, "map_curve.png"))
        plt.close()

        # NOTE(review): the plots are written to MODEL_DIR ("model_<run_name>"),
        # not to "model/" as this message says.
        print("📊 Saved metric plots to model/")

        # Predict test set
        # NOTE(review): this assumes a checkpoint was written during this run.
        # If the save condition above never held, the file is either missing
        # or left over from an earlier run in the same directory.
        model.load_state_dict(torch.load(model_path))
        model.eval()

        val_ids = X_va["Id"]
        val_rankers = groups_va["ranker_id"]
        val_labels = y_va

        all_val_scores = []

        with torch.no_grad():
            for x_cat, x_num, _, lengths in tqdm(val_loader, desc="Predicting validation set"):
                x_num = x_num.to(device)
                x_cat = {k: v.to(device) for k, v in x_cat.items()}
                scores = model(x_cat, x_num)

                # Drop the padded tail of every group before collecting scores.
                for i in range(scores.size(0)):
                    l = lengths[i]
                    all_val_scores.append(scores[i, :l].cpu().numpy())

        all_val_scores = np.concatenate(all_val_scores)

        # The flattened scores follow dataset group order; attaching them to
        # X_va by position assumes each group's rows are contiguous and that
        # groups appear in the same order as in the frame — TODO confirm.
        assert len(all_val_scores) == X_va.shape[0], \
            f"Mismatch: {len(all_val_scores)} scores vs {X_va.shape[0]} rows"

        val_df = X_va.with_columns([
            pl.Series("Id", val_ids),
            pl.Series("ranker_id", val_rankers),
            pl.Series("score", all_val_scores),
            pl.Series("label", val_labels)
        ])

        # Rank 1 is the highest score within each ranker_id.
        val_df = val_df.with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )

        val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
        val_df.write_csv(val_save_path)
        print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

        all_scores = []

        with torch.no_grad():
            for x_cat, x_num, _, lengths in tqdm(test_loader, desc="Predicting test set"):
                x_num = x_num.to(device)
                x_cat = {k: v.to(device) for k, v in x_cat.items()}
                scores = model(x_cat, x_num)

                for i in range(scores.size(0)):
                    l = lengths[i]
                    all_scores.append(scores[i, :l].cpu().numpy())

        all_scores = np.concatenate(all_scores)

        assert len(all_scores) == X_te.shape[0], \
            f"Mismatch: {len(all_scores)} scores vs {X_te.shape[0]} rows in X_te"

        # Scores are paired with the ids read from the raw test file by
        # position; same ordering assumption as for the validation frame.
        submission_df = pl.DataFrame({
            "Id": test_ids,
            "ranker_id": test_rankers,
            "score": all_scores
        })

        submission = (
            submission_df
            .with_columns(
                pl.col("score")
                .rank(method="ordinal", descending=True)
                .over("ranker_id")
                .cast(pl.Int32)
                .alias("selected")
            )
            .select(["Id", "ranker_id", "score", "selected"])
        )

        # NOTE(review): the file name does not contain run_name, so every
        # learning rate overwrites the submission of the previous one and
        # only the last run's predictions survive.
        submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
        submission.write_csv(submission_path)
        print(f"✅ Submission saved to {submission_path}")


# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Overwriting deeprec_validate.py


# **New Model**

In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter

class RankDataset(Dataset):
    """Group-wise dataset: one item is every row that shares a ``ranker_id``.

    The feature columns are converted to NumPy once in the constructor, and
    the row positions of each group are indexed so that ``__getitem__`` can
    return a whole group at a time.
    """

    def __init__(
        self,
        X: pl.DataFrame,
        y: pl.Series,
        groups: pl.DataFrame,
        cat_features: list[str],
        num_features: list[str],
    ):
        """Materialise the columns and build the group -> row-index map.

        Args:
            X: Feature frame containing every name in ``cat_features`` and
                ``num_features``.
            y: Label series aligned row-by-row with ``X``.
            groups: Frame with a ``ranker_id`` column aligned with ``X``.
            cat_features: Names of the integer-coded categorical columns.
            num_features: Names of the numeric columns.
        """
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # Groups keep the order in which their first row appears.
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of groups, not the number of rows."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        ``x_cat`` maps each categorical feature to a 1-D long tensor in which
        negative codes are clamped to 0; ``x_num`` is a float tensor with one
        row per item; ``y`` is a flat float tensor; ``length`` is the number
        of items in the group.
        """
        g = self.group_keys[idx]
        inds = self.group_to_indices[g]
        length = len(inds)

        # Gather and clamp in NumPy in one shot instead of building a Python
        # list element by element for every feature of every group.
        x_cat = {
            c: torch.as_tensor(np.maximum(self.X_cat[c][inds], 0), dtype=torch.long)
            for c in self.cat_features
        }

        x_num = torch.FloatTensor(self.X_num[inds])
        y = torch.FloatTensor(self.y[inds]).reshape(-1)

        return x_cat, x_num, y, length

def collate_fn(batch):
    """Zero-pad a list of variable-length groups into batch tensors.

    Args:
        batch: Sequence of ``(x_cat, x_num, y, length)`` items, where
            ``x_cat`` is a dict of 1-D long tensors, ``x_num`` is a 2-D
            tensor and ``y`` is a 1-D tensor.

    Returns:
        ``(padded_x_cat, padded_x_num, padded_y, lengths)``. Every group is
        padded with zeros up to the longest group in the batch and
        ``lengths`` is a long tensor holding the original sizes.
    """
    lengths = [item[3] for item in batch]
    longest = max(lengths)
    cat_keys = list(batch[0][0].keys())
    feat_width = batch[0][1].shape[1]

    cat_rows = {key: [] for key in cat_keys}
    num_rows, label_rows = [], []

    for x_cat, x_num, y, length in batch:
        missing = longest - length
        needs_pad = missing > 0

        for key in cat_keys:
            column = x_cat[key]
            if needs_pad:
                column = torch.cat([column, torch.zeros(missing, dtype=torch.long)])
            cat_rows[key].append(column)

        if needs_pad:
            x_num = torch.cat([x_num, torch.zeros(missing, feat_width)])
            y = torch.cat([y, torch.zeros(missing)])

        num_rows.append(x_num)
        label_rows.append(y)

    return (
        {key: torch.stack(rows) for key, rows in cat_rows.items()},
        torch.stack(num_rows),
        torch.stack(label_rows),
        torch.tensor(lengths, dtype=torch.long),
    )

class SEModule(nn.Module):
    """Squeeze-and-excitation style gate over the last dimension.

    The input is averaged over dimension 1, passed through a bottleneck MLP
    ending in a sigmoid, and the resulting per-channel weights rescale the
    input.
    """

    def __init__(self, channel, reduction=4):
        """Build the bottleneck MLP ``channel -> channel // reduction -> channel``."""
        super().__init__()
        squeezed = channel // reduction
        stages = [
            nn.Linear(channel, squeezed),
            nn.ReLU(inplace=True),
            nn.Linear(squeezed, channel),
            nn.Sigmoid(),
        ]
        self.fc = nn.Sequential(*stages)

    def forward(self, x):
        """Scale ``x`` of shape [B, N, C] by one gate per (batch, channel)."""
        pooled = x.mean(dim=1)               # [B, C]
        gate = self.fc(pooled).unsqueeze(1)  # [B, 1, C], broadcast over N
        return x * gate


class DLRankerAttention(nn.Module):
    """Two-branch attention ranker that scores every item of a group.

    Categorical features are embedded, concatenated, projected and passed
    through a one-layer transformer encoder. Numeric features are projected,
    passed through their own one-layer encoder, gated by an ``SEModule`` and
    projected again. Both branches are concatenated, refined by a stack of
    encoder layers and mapped to one score per item by an MLP.

    ``num_heads`` and ``num_layers`` configure the fusion stack only; the two
    branch encoders always use 4 heads and a single layer.
    """

    def __init__(
        self,
        cat_dims,
        num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64,
    ):
        """Create the embeddings, both branches, the fusion stack and the head.

        Args:
            cat_dims: Mapping from categorical feature name to vocabulary size.
            num_numeric_feats: Number of numeric input features.
            emb_dropout: Dropout applied to the concatenated embeddings.
            dropout: Dropout used inside the encoders and the MLP.
            num_heads: Attention heads of the fusion encoder layers.
            num_layers: Number of fusion encoder layers.
            dim_feedforward: Feed-forward width of every encoder layer.
            proj_dim: Width of each branch; the fused width is twice that.
        """
        super().__init__()

        def make_encoder_layer(width, heads):
            # Every encoder layer in the model shares these settings.
            return nn.TransformerEncoderLayer(
                d_model=width,
                nhead=heads,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            )

        # Embedding width grows with log2 of the vocabulary, clamped to [4, 50].
        self.emb_dims = {}
        for f in cat_dims:
            self.emb_dims[f] = min(50, max(4, round(6 * math.log2(cat_dims[f] + 1))))

        # Index 0 is declared as the padding index of every embedding table.
        tables = {}
        for f in cat_dims:
            tables[f] = nn.Embedding(cat_dims[f], self.emb_dims[f], padding_idx=0)
        self.embeddings = nn.ModuleDict(tables)
        self.emb_dropout = nn.Dropout(emb_dropout)

        total_emb_dim = sum(self.emb_dims.values())
        print(
            f"Categorial emb dim sum: {total_emb_dim}, Numerical dim: {num_numeric_feats}"
        )

        # Categorical branch.
        self.cat_proj = nn.Linear(total_emb_dim, proj_dim)
        self.cat_transformer = nn.TransformerEncoder(
            make_encoder_layer(proj_dim, 4),
            num_layers=1,
        )

        # Numeric branch.
        self.num_proj_in = nn.Linear(num_numeric_feats, proj_dim)
        self.num_transformer = nn.TransformerEncoder(
            make_encoder_layer(proj_dim, 4),
            num_layers=1,
        )
        self.num_se = SEModule(proj_dim)
        self.num_proj_out = nn.Sequential(
            nn.Linear(proj_dim, proj_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
        )

        # Fusion stack over the concatenated branches.
        fused_dim = 2 * proj_dim
        self.transformer_layers = nn.ModuleList(
            [make_encoder_layer(fused_dim, num_heads) for _ in range(num_layers)]
        )

        # Scoring head.
        self.mlp = nn.Sequential(
            nn.Linear(fused_dim, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(64, 1),
        )

    def forward(self, x_cat: dict, x_num: torch.Tensor, attn_mask=None):
        """Return one score per item, shape [B, N].

        Args:
            x_cat: Dict of [B, N] index tensors, one per embedding table.
            x_num: [B, N, F_num] numeric features.
            attn_mask: Optional key-padding mask. It is forwarded to the
                fusion layers only; the two branch encoders run unmasked.
        """
        B, N, _ = x_num.shape  # also enforces that x_num is 3-dimensional

        # Categorical branch: embed -> concat -> dropout -> project -> encode.
        embedded = [table(x_cat[name]) for name, table in self.embeddings.items()]
        cat_feat = torch.cat(embedded, dim=-1)   # [B, N, total_emb_dim]
        cat_feat = self.emb_dropout(cat_feat)
        cat_feat = self.cat_proj(cat_feat)       # [B, N, proj_dim]
        cat_feat = self.cat_transformer(cat_feat)

        # Numeric branch: project -> encode -> SE gate -> project.
        num_feat = x_num
        for stage in (
            self.num_proj_in,
            self.num_transformer,
            self.num_se,
            self.num_proj_out,
        ):
            num_feat = stage(num_feat)           # [B, N, proj_dim] after each stage

        fused = torch.cat([cat_feat, num_feat], dim=-1)  # [B, N, 2*proj_dim]
        for block in self.transformer_layers:
            fused = block(fused, src_key_padding_mask=attn_mask)

        return self.mlp(fused).squeeze(-1)


"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Fraction of eligible groups with a positive label among the top-k scores.

    Args:
        scores: [B, N] predicted scores (padded).
        labels: [B, N] labels; a value > 0 marks a relevant item.
        lengths: Real size of every group; padding beyond it is ignored.
        k: Cut-off rank.
        min_group_size: Groups smaller than this are left out entirely.

    Returns:
        A scalar tensor; ``0.0`` when no group is eligible.
    """
    hit_total = 0
    n_eligible = 0

    for row in range(scores.size(0)):
        size = lengths[row]
        if size >= min_group_size:
            group_scores = scores[row][:size]
            group_labels = labels[row][:size]
            top_positions = torch.topk(group_scores, min(size, k)).indices
            hit_total += (group_labels[top_positions] > 0).any().float()
            n_eligible += 1

    if n_eligible > 0:
        return hit_total / n_eligible
    return torch.tensor(0.0, device=scores.device)


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@k over the groups that have at least ``min_group_size`` items.

    Args:
        scores: [B, N] predicted scores (padded).
        labels: [B, N] relevance labels, used directly as gains.
        lengths: Real size of every group; padding beyond it is ignored.
        k: Cut-off rank.
        min_group_size: Groups smaller than this are left out entirely.

    Returns:
        A scalar tensor in every case. A group whose ideal DCG is zero
        contributes 0 but still counts in the denominator; ``0.0`` is
        returned when no group is eligible.
    """
    batch_size = scores.size(0)
    # Accumulate in a tensor so the result type does not depend on the data:
    # with a float accumulator, a batch in which every group had IDCG == 0
    # produced a Python float and broke callers that use `.item()`.
    total_ndcg = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(batch_size):
        l = lengths[i]
        if l < min_group_size:
            continue

        s = scores[i][:l]
        y = labels[i][:l]

        depth = min(k, l)
        _, idx_pred = torch.topk(s, depth)
        _, idx_ideal = torch.topk(y, depth)

        # Both rankings have the same length, so they share the discounts.
        discounts = torch.log2(
            torch.arange(2, 2 + len(idx_pred), device=y.device).float()
        )
        dcg = (y[idx_pred] / discounts).sum()
        idcg = (y[idx_ideal] / discounts).sum()

        if idcg > 0:
            total_ndcg = total_ndcg + dcg / idcg
        valid_count += 1

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ndcg / valid_count


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision at ``k`` over the eligible groups.

    The average precision of a group is the mean of precision@rank taken at
    every rank within the top-k that holds a relevant item (label > 0).

    Args:
        scores: [B, N] predicted scores (padded).
        labels: [B, N] labels; a value > 0 marks a relevant item.
        lengths: Real size of every group; padding beyond it is ignored.
        k: Cut-off rank.
        min_group_size: Groups smaller than this are left out entirely.

    Returns:
        A scalar tensor in every case. A group without any relevant item in
        its top-k has average precision 0 and is included in the mean;
        skipping such groups would report the precision of successful groups
        only and overstate the metric. ``0.0`` is returned when no group is
        eligible.
    """
    batch_size = scores.size(0)
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(batch_size):
        l = lengths[i]
        if l < min_group_size:
            continue

        s = scores[i][:l]
        y = labels[i][:l]

        _, idx_pred = torch.topk(s, min(k, l))
        y_true = y[idx_pred] > 0

        # Count the group before looking at its hits: a miss is an AP of 0,
        # not a reason to drop the group from the denominator.
        valid_count += 1
        if y_true.sum() == 0:
            continue

        precisions = [
            y_true[: j + 1].float().sum() / (j + 1)
            for j in range(len(y_true))
            if y_true[j]
        ]
        total_ap = total_ap + torch.stack(precisions).mean()

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ap / valid_count



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise logistic (RankNet) loss summed per group, averaged over groups.

    For every ordered pair (a, b) of a group with ``label[a] > label[b]`` the
    term ``-logsigmoid(score[a] - score[b])`` is added. Positions beyond a
    group's length are ignored.

    Returns:
        The sum over all groups divided by the number of groups in the batch.
    """
    n_groups = scores.size(0)
    log_sigmoid = torch.nn.functional.logsigmoid

    def group_loss(i):
        size = lengths[i]
        s = scores[i][:size]
        y = labels[i][:size]
        margins = s.unsqueeze(1) - s.unsqueeze(0)           # margins[a, b] = s[a] - s[b]
        wins = (y.unsqueeze(1) > y.unsqueeze(0)).float()    # 1 where a outranks b
        return (-wins * log_sigmoid(margins)).sum()

    return sum((group_loss(i) for i in range(n_groups)), 0.0) / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """Weighted pairwise logistic loss restricted to pairs touching the label top-k.

    A pair (a, b) of a group is used when ``label[a] > label[b]`` and at
    least one of the two items is among the ``focus_topk`` items with the
    highest labels. Its loss ``-logsigmoid((s[a] - s[b] - margin) /
    temperature)`` is weighted by ``|label[a] - label[b]|``. Groups with
    fewer than two items or without usable pairs are skipped.

    Returns:
        Weighted loss sum divided by the weight sum over the whole batch, or
        a zero tensor with ``requires_grad=True`` when no pair was usable.
    """
    loss_sum = 0.0
    weight_sum = 0

    for i in range(scores.size(0)):
        size = lengths[i]
        if size < 2:
            continue

        s = scores[i][:size]
        y = labels[i][:size]

        # Mark the items holding the highest ground-truth labels.
        anchor = torch.zeros_like(y, dtype=torch.bool)
        anchor[torch.topk(y, min(focus_topk, size)).indices] = True

        gap_scores = s.unsqueeze(1) - s.unsqueeze(0)
        gap_labels = y.unsqueeze(1) - y.unsqueeze(0)

        eligible = (gap_labels > 0) & (anchor.unsqueeze(1) | anchor.unsqueeze(0))
        if eligible.sum() == 0:
            continue

        weights = (gap_labels.abs() * eligible).float()

        # The margin asks the better item to win by a gap, not merely win.
        if margin > 0.0:
            gap_scores = gap_scores - margin

        per_pair = -F.logsigmoid(gap_scores / temperature) * weights

        loss_sum += per_pair.sum()
        weight_sum += weights.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)
    return loss_sum / weight_sum


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Each batch is moved to ``device``, scored by the model and optimised
    against ``xranknet_loss``; the scheduler is stepped after every
    optimizer step.
    """
    model.train()
    running_loss = 0.0

    progress = tqdm(loader, desc="Training", leave=False)
    for x_cat, x_num, y, lengths in progress:
        x_num = x_num.to(device)
        y = y.to(device)
        # Accept plain sequences as well as tensors for the group sizes.
        lengths = lengths if isinstance(lengths, torch.Tensor) else torch.tensor(lengths)
        lengths = lengths.detach().clone().to(device)
        x_cat = {name: values.to(device) for name, values in x_cat.items()}

        optimizer.zero_grad()
        loss = xranknet_loss(
            model(x_cat, x_num),
            y,
            lengths,
            temperature=1.0,
            margin=0.2,
            focus_topk=3,
        )
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()

    return running_loss / len(loader)


def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: Ranking model called as ``model(x_cat, x_num)``.
        loader: Iterable of ``(x_cat, x_num, y, lengths)`` batches.
        device: Device the batches are moved to.
        min_group_size: Groups smaller than this are excluded from the
            ranking metrics (the loss still uses every group).

    Returns:
        ``(loss, hitrate, ndcg, map)``. ``loss`` is a float: the batch loss
        averaged over groups. The three ranking metrics are scalar tensors
        averaged over the groups that have at least ``min_group_size`` items,
        or ``0.0`` when there is no such group.

    Raises:
        ZeroDivisionError: If ``loader`` yields no batches.
    """
    model.eval()
    total_hitrate = 0
    total_ndcg = 0
    total_map = 0
    total_loss = 0
    count = 0
    scored_groups = 0

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):
            x_num, y = x_num.to(device), y.to(device)
            # Same handling as in training: avoids rebuilding a tensor from a
            # tensor (which PyTorch warns about) while still accepting lists.
            if not isinstance(lengths, torch.Tensor):
                lengths = torch.tensor(lengths)
            lengths = lengths.detach().clone().to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            ndcg = ndcg_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            map3 = map_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)

            # The metric helpers average over eligible groups only, so each
            # batch mean must be weighted by the number of eligible groups.
            # Weighting by the full batch size lets batches with few or no
            # eligible groups pull the overall metric towards zero.
            n_scored = int((lengths >= min_group_size).sum())
            total_hitrate += hitrate * n_scored
            total_ndcg += ndcg * n_scored
            total_map += map3 * n_scored
            scored_groups += n_scored

            batch_size = scores.size(0)
            total_loss += loss.item() * batch_size
            count += batch_size

    def _group_mean(total):
        # Always hand back a tensor so callers can rely on `.item()`.
        if scored_groups == 0:
            return torch.tensor(0.0, device=device)
        return torch.as_tensor(total / scored_groups, device=device)

    return (
        total_loss / count,
        _group_mean(total_hitrate),
        _group_mean(total_ndcg),
        _group_mean(total_map),
    )


def build_cat_dims(X, cat_features):
    """Map every categorical feature to its vocabulary size.

    The size is the column maximum plus one; a negative maximum is treated
    as 0, so the size is never smaller than 1.
    """
    return {name: max(X[name].max(), 0) + 1 for name in cat_features}


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardise the numeric columns with statistics from the masked rows.

    For every listed column, the mean and standard deviation are computed on
    the rows selected by ``train_mask`` and then applied to all rows. A
    standard deviation that is not greater than zero is replaced by 1.0, so
    such a column is only shifted.

    Returns:
        The frame with the listed columns replaced by their normalised values.
    """
    for name in numeric_features:
        column = X[name].to_numpy()
        reference = column[train_mask]
        center = reference.mean()
        scale = reference.std()
        if not scale > 0:
            scale = 1.0
        X = X.with_columns(pl.Series(name, (column - center) / scale))
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Return the feature-engineered frame plus the test ids and ranker ids.

    When ``cache_file`` exists in ``data_dir`` it is loaded as is. Otherwise
    the raw train and test files are concatenated (test rows get a
    placeholder ``selected`` of 0), passed through ``feature_engineering``,
    null-filled and written to the cache.

    Args:
        data_dir: Directory holding the parquet files and the cache.
        train_file: Raw training file name (only read on a cache miss).
        test_file: Raw test file name.
        cache_file: File name of the cached feature-engineered frame.

    Returns:
        ``(df, test_ids, test_rankers)`` where the last two are the ``Id``
        and ``ranker_id`` columns of the raw test file.
    """
    cache_path = os.path.join(data_dir, cache_file)
    test_path = os.path.join(data_dir, test_file)

    if os.path.exists(cache_path):
        print(f"Loading cached feature-engineered data from {cache_path} ...")
        df = pl.read_parquet(cache_path)

        # Only two columns of the raw test file are needed here, so read just
        # those instead of materialising the whole file.
        test_keys = pl.read_parquet(test_path, columns=["Id", "ranker_id"])
        return df, test_keys["Id"], test_keys["ranker_id"]

    print("Cached file not found, processing raw data ...")

    train = pl.read_parquet(os.path.join(data_dir, train_file)).drop("__index_level_0__")
    test = (
        pl.read_parquet(test_path)
        .drop("__index_level_0__")
        .with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))
    )

    test_ids = test["Id"]
    test_rankers = test["ranker_id"]

    # Train rows come first; later code splits the frame by row position.
    df = pl.concat((train, test))
    df = feature_engineering(df, full=True)

    # Nulls become 0 in numeric columns and "missing" in string columns.
    df = df.with_columns(
        [pl.col(c).fill_null(0) for c in df.select(pl.selectors.numeric()).columns] +
        [pl.col(c).fill_null("missing") for c in df.select(pl.selectors.string()).columns]
    )

    os.makedirs(data_dir, exist_ok=True)
    df.write_parquet(cache_path)
    print(f"Saved feature-engineered data to {cache_path}")

    return df, test_ids, test_rankers


def prepare_data(data_dir, train_size=18145372):
    """Load the feature table and convert it into model-ready inputs.

    Args:
        data_dir: Directory handed to ``load_or_prepare_data``.
        train_size: Number of leading rows that belong to the training file.
            Rows at or beyond this position are treated as test rows. The
            default is the value that used to be hard-coded here; pass a
            different value when working with another dataset or a sample.

    Returns:
        Tuple ``(X, y, groups, cat_features, num_features, train_size,
        test_ids, test_rankers)``. ``X`` carries an ``Id`` column, its
        categorical columns are dense-rank encoded as ``Int32`` (nulls become
        ``-1``), and its numeric columns are standardised with statistics
        taken from the first ``train_size`` rows only.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    # Attach the row identifier to the selected features so predictions can
    # be written out next to it later.
    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    # Dense rank starts at 1, so subtracting 1 yields zero-based codes.
    X = X.with_columns(
        [
            (pl.col(c).rank("dense") - 1).fill_null(-1).cast(pl.Int32)
            for c in cat_features
        ]
    )

    # Only training rows contribute to the normalisation statistics.
    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Stop signal for a maximised metric that has stopped improving."""

    def __init__(self, patience=3, min_delta=0.00005):
        """Remember how many stale steps are tolerated and what counts as a gain."""
        self.patience = patience
        self.min_delta = min_delta
        self.best, self.counter = float("-inf"), 0

    def step(self, metric):
        """Feed one metric value; return True when training should stop.

        The stale counter is reset whenever ``metric`` exceeds the best value
        so far by more than ``min_delta`` and is incremented otherwise.
        """
        threshold = self.best + self.min_delta
        if metric > threshold:
            self.best = metric
            self.counter = 0
            return self.counter >= self.patience
        self.counter += 1
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Plot how many groups exist for each group size (log-scaled counts).

    Args:
        dataset: Object exposing ``group_to_indices``, a mapping whose values
            are the row-index collections of each group.
        save_path: Destination of the figure; when falsy the figure is shown
            interactively instead of being saved.
    """
    row_lists = dataset.group_to_indices.values()
    group_sizes = [len(rows) for rows in row_lists]

    plt.figure()
    plt.hist(group_sizes, bins=100, log=True)
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Distribution of Group Sizes")
    plt.grid(True)

    emit = (lambda: plt.savefig(save_path)) if save_path else plt.show
    emit()
    plt.close()


def _save_line_plot(x, series, ylabel, title, path):
    """Plot each ``(values, label, color)`` series against ``x`` and save to ``path``.

    ``color`` may be None to use matplotlib's default colour cycle.
    """
    plt.figure()
    for values, label, color in series:
        if color is None:
            plt.plot(x, values, label=label)
        else:
            plt.plot(x, values, label=label, color=color)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.savefig(path)
    plt.close()


def _predict_scores(model, loader, device, desc):
    """Score every group yielded by ``loader`` and return one flat numpy array.

    Padded positions are dropped using the per-group lengths. Scores are
    concatenated in the order the loader yields groups.
    """
    chunks = []
    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(loader, desc=desc):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)
            for i in range(scores.size(0)):
                chunks.append(scores[i, : lengths[i]].cpu().numpy())
    return np.concatenate(chunks)


def main():
    """Train the attention ranker, track validation metrics, and write predictions.

    Side effects: creates ``model/`` and ``submission/``, writes the best
    checkpoint, a validation group report, metric plots, a CSV of validation
    predictions and the submission CSV.

    NOTE(review): predictions are re-attached to rows by simple concatenation
    in loader order. That lines up with the frame's row order only if the rows
    of each ranker_id are contiguous -- confirm for the input data.
    """
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    best_val_loss = float("inf")
    best_hitrate = 0.0
    model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_losses = []
    val_losses = []
    val_hitrates = []
    val_ndcgs = []
    val_maps = []
    learning_rates = []

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    early_stopper = EarlyStopping(patience=3, min_delta=0.001)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Replace NaN by 0 in every Float64 column.
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    if float_cols:
        X = X.with_columns(
            [
                pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
                for col in float_cols
            ]
        )

    # Row-index split: [0, n1) train, [n1, n2) validation, [n2, end) test.
    n1 = 16487352
    n2 = train_size

    X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
    y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
    groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

    # Inspect group sizes in validation set
    val_rankers = groups_va["ranker_id"].to_numpy()
    group_to_rows = defaultdict(list)
    for i, g in enumerate(val_rankers):
        group_to_rows[g].append(i)

    print("\n📋 Validation group statistics:")
    # Save detailed group info to file
    group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
    with open(group_info_path, "w") as f:
        f.write(f"Total groups: {len(group_to_rows)}\n\n")
        f.write("GroupID\tSize\tStartIdx\tEndIdx\n")
        for group_id, indices in sorted(group_to_rows.items()):
            f.write(f"{group_id}\t{len(indices)}\t{indices[0]}\t{indices[-1]}\n")

    print(f"📁 Saved validation group info to {group_info_path}")

    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    num_epochs = 10
    num_training_steps = num_epochs * len(train_loader)
    num_warmup_steps = int(0.1 * num_training_steps)

    learning_rate = 1e-4

    model = DLRankerAttention(
        cat_dims=cat_dims,
        num_numeric_feats=num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
        val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

        # float() accepts both 0-dim tensors and plain Python numbers, so the
        # bookkeeping below never holds on to tensors.
        val_hitrate = float(val_hitrate)
        val_ndcg = float(val_ndcg)
        val_map = float(val_map)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_hitrates.append(val_hitrate)
        val_ndcgs.append(val_ndcg)
        val_maps.append(val_map)

        current_lr = scheduler.get_last_lr()
        learning_rates.append(current_lr[0])

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"Val Loss={val_loss:.4f}, "
            f"HitRate@3={val_hitrate:.4f}, "
            f"NDCG@3={val_ndcg:.4f}, "
            f"MAP@3={val_map:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )

        # A checkpoint is written only when BOTH the loss and the hit rate
        # beat the values of the last saved checkpoint.
        if val_loss < best_val_loss and val_hitrate > best_hitrate:
            best_val_loss = val_loss
            best_hitrate = val_hitrate
            torch.save(model.state_dict(), model_path)
            print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

        # The stopper maximises its metric, hence the negated loss.
        if early_stopper.step(-val_loss):
            print("⛔ Early stopping triggered.")
            break

    # Plot metrics
    epochs = list(range(1, len(train_losses) + 1))

    _save_line_plot(
        epochs,
        [(train_losses, "Train Loss", None), (val_losses, "Val Loss", None)],
        "Loss",
        "Training & Validation Loss",
        os.path.join(MODEL_DIR, "loss_curve.png"),
    )
    _save_line_plot(
        epochs,
        [(val_hitrates, "Val HitRate@3", "green")],
        "HitRate@3",
        "Validation HitRate@3",
        os.path.join(MODEL_DIR, "hitrate_curve.png"),
    )
    _save_line_plot(
        epochs,
        [(learning_rates, "Learning Rate", "orange")],
        "LR",
        "Learning Rate Schedule",
        os.path.join(MODEL_DIR, "lr_curve.png"),
    )
    _save_line_plot(
        epochs,
        [(val_ndcgs, "Val NDCG@3", "purple")],
        "NDCG@3",
        "Validation NDCG@3",
        os.path.join(MODEL_DIR, "ndcg_curve.png"),
    )
    _save_line_plot(
        epochs,
        [(val_maps, "Val MAP@3", "red")],
        "MAP@3",
        "Validation MAP@3",
        os.path.join(MODEL_DIR, "map_curve.png"),
    )

    print("📊 Saved metric plots to model/")

    # Reload the best checkpoint; map_location keeps the load working when the
    # weights were saved from a different device.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    # Predict validation set
    all_val_scores = _predict_scores(model, val_loader, device, "Predicting validation set")

    assert len(all_val_scores) == X_va.shape[0], \
        f"Mismatch: {len(all_val_scores)} scores vs {X_va.shape[0]} rows"

    val_df = X_va.with_columns([
        pl.Series("Id", X_va["Id"]),
        pl.Series("ranker_id", groups_va["ranker_id"]),
        pl.Series("score", all_val_scores),
        pl.Series("label", y_va)
    ])

    # Rank 1 = highest score within each ranker_id.
    val_df = val_df.with_columns(
        pl.col("score")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
    val_df.write_csv(val_save_path)
    print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

    # Predict test set
    all_scores = _predict_scores(model, test_loader, device, "Predicting test set")

    assert len(all_scores) == X_te.shape[0], \
        f"Mismatch: {len(all_scores)} scores vs {X_te.shape[0]} rows in X_te"

    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": all_scores
    })

    submission = (
        submission_df
        .with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )
        .select(["Id", "ranker_id", "score", "selected"])
    )

    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
    submission.write_csv(submission_path)
    print(f"✅ Submission saved to {submission_path}")


# Run the full train / validate / predict pipeline only when this file is
# executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

Writing deeprec_validate.py


In [None]:
--- Epoch 1/10 ---
Validating:   0%|          | 0/2636 [00:00<?, ?it/s]/home/ionut/test_whisper/test2/deeprec_validate.py:507: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  lengths = torch.tensor(lengths).to(device)
Epoch 1: Train Loss=0.3812, Val Loss=0.2907, HitRate@3=0.4687, NDCG@3=0.3718, MAP@3=0.6123, LR=0.000100 (600.4s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 2/10 ---
Epoch 2: Train Loss=0.2850, Val Loss=0.2799, HitRate@3=0.4891, NDCG@3=0.3904, MAP@3=0.6328, LR=0.000089 (601.3s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 3/10 ---
Epoch 3: Train Loss=0.2639, Val Loss=0.2602, HitRate@3=0.5026, NDCG@3=0.4052, MAP@3=0.6515, LR=0.000078 (602.0s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 4/10 ---
Epoch 4: Train Loss=0.2495, Val Loss=0.2569, HitRate@3=0.5059, NDCG@3=0.4057, MAP@3=0.6464, LR=0.000067 (602.4s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 5/10 ---
Epoch 5: Train Loss=0.2383, Val Loss=0.2471, HitRate@3=0.5104, NDCG@3=0.4107, MAP@3=0.6527, LR=0.000056 (602.1s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 6/10 ---
Epoch 6: Train Loss=0.2275, Val Loss=0.2495, HitRate@3=0.5132, NDCG@3=0.4138, MAP@3=0.6553, LR=0.000044 (602.8s)

--- Epoch 7/10 ---
Epoch 7: Train Loss=0.2204, Val Loss=0.2442, HitRate@3=0.5157, NDCG@3=0.4142, MAP@3=0.6537, LR=0.000033 (602.4s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 8/10 ---
Epoch 8: Train Loss=0.2116, Val Loss=0.2445, HitRate@3=0.5193, NDCG@3=0.4168, MAP@3=0.6530, LR=0.000022 (603.0s)

--- Epoch 9/10 ---
Epoch 9: Train Loss=0.2049, Val Loss=0.2437, HitRate@3=0.5210, NDCG@3=0.4191, MAP@3=0.6604, LR=0.000011 (603.7s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 10/10 ---
Epoch 10: Train Loss=0.1996, Val Loss=0.2446, HitRate@3=0.5203, NDCG@3=0.4186, MAP@3=0.6592, LR=0.000000 (603.8s)

In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter

class RankDataset(Dataset):
    """Dataset whose items are whole ranking groups (all rows of one ranker_id).

    The constructor converts the feature columns to numpy once and records,
    for every distinct ``ranker_id``, the list of row positions belonging to
    it. Groups are indexed in order of first appearance.
    """

    def __init__(
        self,
        X: "pl.DataFrame",
        y: "pl.Series",
        groups: "pl.DataFrame",
        cat_features: list[str],
        num_features: list[str],
    ):
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # ranker_id -> row positions, in row order.
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Number of groups (not rows)."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        ``x_cat`` maps each categorical feature to a long tensor of codes,
        ``x_num`` is a float tensor of the numeric features, ``y`` a flat float
        tensor of labels and ``length`` the number of rows in the group.
        """
        inds = self.group_to_indices[self.group_keys[idx]]

        x_cat = {}
        for c in self.cat_features:
            codes = self.X_cat[c][inds]
            # Anything that is not >= 0 (i.e. the -1 "missing" code) maps to 0.
            # One vectorised pass replaces a per-element Python loop.
            x_cat[c] = torch.as_tensor(np.where(codes >= 0, codes, 0), dtype=torch.long)

        x_num = torch.as_tensor(self.X_num[inds], dtype=torch.float32)
        y = torch.as_tensor(self.y[inds], dtype=torch.float32).reshape(-1)

        return x_cat, x_num, y, len(inds)

def collate_fn(batch):
    """Stack variable-length groups into zero-padded batch tensors.

    ``batch`` is a list of ``(x_cat, x_num, y, length)`` tuples. Every
    sequence is right-padded with zeros to the longest group in the batch.

    Returns:
        ``(x_cat, x_num, y, lengths)`` where ``x_cat`` maps feature name to a
        ``[B, max_len]`` long tensor, ``x_num`` is ``[B, max_len, F]``, ``y`` is
        ``[B, max_len]`` and ``lengths`` is a long tensor of group sizes.
    """
    pad = torch.nn.utils.rnn.pad_sequence
    feature_names = list(batch[0][0].keys())

    cat_out = {
        name: pad([item[0][name] for item in batch], batch_first=True)
        for name in feature_names
    }
    num_out = pad([item[1] for item in batch], batch_first=True)
    label_out = pad([item[2] for item in batch], batch_first=True)
    length_out = torch.tensor([item[3] for item in batch], dtype=torch.long)

    return cat_out, num_out, label_out, length_out

class SEModule(nn.Module):
    """Squeeze-and-excitation style channel gate for ``[B, N, C]`` inputs.

    The input is averaged over dim 1, passed through a two-layer bottleneck
    (``C -> C // reduction -> C``) ending in a sigmoid, and the resulting
    per-channel weights rescale every position of the input.
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        bottleneck = channel // reduction
        gate_layers = [
            nn.Linear(channel, bottleneck),
            nn.ReLU(inplace=True),
            nn.Linear(bottleneck, channel),
            nn.Sigmoid(),
        ]
        self.fc = nn.Sequential(*gate_layers)

    def forward(self, x):
        """Return ``x`` scaled channel-wise by gates computed from its mean over dim 1."""
        pooled = x.mean(dim=1)  # [B, C]
        gates = self.fc(pooled).unsqueeze(1)  # [B, 1, C]
        return x * gates


class DLRankerAttention(nn.Module):
    """Two-branch transformer that scores every item of a ranking group.

    Categorical features are embedded, concatenated and projected to
    ``proj_dim``; numeric features are projected to ``proj_dim``. Each branch
    runs through its own single-layer transformer encoder (the numeric branch
    additionally through an SE gate and an output projection). The branches
    are concatenated, passed through ``num_layers`` further encoder layers and
    an MLP head that emits one score per item.

    Args:
        cat_dims: mapping feature name -> number of embedding rows.
        num_numeric_feats: width of the numeric feature vector.
        emb_dropout: dropout applied to the concatenated embeddings.
        dropout: dropout used inside the encoders, projection and MLP.
        num_heads: attention heads of the joint encoder layers. The two branch
            encoders are fixed at 4 heads and 1 layer.
        num_layers: number of joint encoder layers.
        dim_feedforward: feed-forward width of every encoder layer.
        proj_dim: width of each branch; the joint width is ``2 * proj_dim``.

    NOTE(review): embeddings use ``padding_idx=0``, so index 0 always maps to
    a zero vector that is never trained. If 0 is also a real category code in
    the inputs, that category shares the padding embedding -- confirm.
    """

    def __init__(
        self,
        cat_dims,
        num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64,
    ):
        super().__init__()

        # Embedding width grows with log2 of the cardinality, clamped to [4, 50].
        self.emb_dims = {
            f: min(50, max(4, round(6 * math.log2(cat_dims[f] + 1)))) for f in cat_dims
        }
        self.embeddings = nn.ModuleDict(
            {
                f: nn.Embedding(cat_dims[f], self.emb_dims[f], padding_idx=0)
                for f in cat_dims
            }
        )
        self.emb_dropout = nn.Dropout(emb_dropout)

        total_emb_dim = sum(self.emb_dims.values())
        print(
            f"Categorial emb dim sum: {total_emb_dim}, Numerical dim: {num_numeric_feats}"
        )

        # Categorical branch.
        self.cat_proj = nn.Linear(total_emb_dim, proj_dim)
        self.cat_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=4,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=1,
        )

        # Numeric branch.
        self.num_proj_in = nn.Linear(num_numeric_feats, proj_dim)
        self.num_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=4,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=1,
        )
        self.num_se = SEModule(proj_dim)
        self.num_proj_out = nn.Sequential(
            nn.Linear(proj_dim, proj_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
        )

        # Joint encoder over the concatenated branches.
        self.transformer_layers = nn.ModuleList(
            [
                nn.TransformerEncoderLayer(
                    d_model=2 * proj_dim,
                    nhead=num_heads,
                    dim_feedforward=dim_feedforward,
                    dropout=dropout,
                    batch_first=True,
                )
                for _ in range(num_layers)
            ]
        )

        # Scoring head: one scalar per item.
        self.mlp = nn.Sequential(
            nn.Linear(2 * proj_dim, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(64, 1),
        )

    def forward(self, x_cat: dict, x_num: torch.Tensor, attn_mask=None):
        """Score every item of every group.

        Args:
            x_cat: mapping feature name -> ``[B, N]`` long tensor of codes.
            x_num: ``[B, N, F_num]`` float tensor.
            attn_mask: optional ``[B, N]`` key-padding mask, passed to the
                encoders as ``src_key_padding_mask`` (True / non-zero marks a
                padded position). When given it is applied to both branch
                encoders, the SE pooling and the joint layers, so padded rows
                cannot influence real items. ``None`` keeps every position.

        Returns:
            ``[B, N]`` tensor of scores.

        Raises:
            ValueError: if ``x_num`` is not 3-dimensional.
        """
        if x_num.dim() != 3:
            raise ValueError(
                f"x_num must be [B, N, F_num], got shape {tuple(x_num.shape)}"
            )

        cat_embs = torch.cat(
            [emb_layer(x_cat[f]) for f, emb_layer in self.embeddings.items()],
            dim=-1,
        )  # each embedding is [B, N, emb_dim]; concatenated: [B, N, total_emb_dim]
        cat_embs = self.emb_dropout(cat_embs)
        cat_feat = self.cat_proj(cat_embs)  # [B, N, proj_dim]
        cat_feat = self.cat_transformer(
            cat_feat, src_key_padding_mask=attn_mask
        )  # [B, N, proj_dim]

        num_feat = self.num_proj_in(x_num)  # [B, N, proj_dim]
        num_feat = self.num_transformer(
            num_feat, src_key_padding_mask=attn_mask
        )  # [B, N, proj_dim]

        if attn_mask is None:
            num_feat = self.num_se(num_feat)  # [B, N, proj_dim]
        else:
            # Same gate as SEModule, but the mean is taken over real items only.
            padded = attn_mask if attn_mask.dtype == torch.bool else attn_mask != 0
            keep = (~padded).unsqueeze(-1).to(num_feat.dtype)  # [B, N, 1]
            pooled = (num_feat * keep).sum(dim=1) / keep.sum(dim=1).clamp(min=1.0)
            num_feat = num_feat * self.num_se.fc(pooled).unsqueeze(1)
        num_feat = self.num_proj_out(num_feat)  # [B, N, proj_dim]

        x = torch.cat([cat_feat, num_feat], dim=-1)  # [B, N, 2*proj_dim]

        for layer in self.transformer_layers:
            x = layer(x, src_key_padding_mask=attn_mask)

        scores = self.mlp(x).squeeze(-1)  # [B, N]

        return scores


"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Fraction of eligible groups with a positive label among the top-``k`` scores.

    Only the first ``lengths[i]`` entries of row ``i`` are real items. Groups
    with fewer than ``min_group_size`` items are ignored. Returns a 0-dim
    tensor; it is 0.0 when no group is eligible.
    """
    group_hits = []
    for i, row_scores in enumerate(scores):
        n_items = int(lengths[i])
        if n_items < min_group_size:
            continue
        top_idx = torch.topk(row_scores[:n_items], min(n_items, k)).indices
        found = (labels[i][:n_items][top_idx] > 0).any()
        group_hits.append(found.float())

    if not group_hits:
        return torch.tensor(0.0, device=scores.device)
    return torch.stack(group_hits).sum() / len(group_hits)


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@``k`` over eligible groups, using raw labels as gains.

    Only the first ``lengths[i]`` entries of row ``i`` are real items. Groups
    with fewer than ``min_group_size`` items are ignored; an eligible group
    whose ideal DCG is 0 (no positive label) contributes an NDCG of 0.

    Returns:
        A 0-dim tensor on ``scores.device`` in every case (including "no
        eligible group" and "all groups without positives"), so callers can
        always use ``.item()``.
    """
    total_ndcg = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        n_items = int(lengths[i])
        if n_items < min_group_size:
            continue

        s = scores[i][:n_items]
        y = labels[i][:n_items]

        top_n = min(k, n_items)
        # Position discounts log2(rank + 1) for ranks 1..top_n.
        discounts = torch.log2(torch.arange(2, 2 + top_n, device=y.device).float())

        dcg = (y[torch.topk(s, top_n).indices] / discounts).sum()
        idcg = (torch.topk(y, top_n).values / discounts).sum()

        if idcg > 0:
            total_ndcg = total_ndcg + dcg / idcg
        valid_count += 1

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ndcg / valid_count


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision at ``k`` over eligible groups.

    Only the first ``lengths[i]`` entries of row ``i`` are real items. Groups
    with fewer than ``min_group_size`` items are ignored. For an eligible
    group, AP is the mean of precision@j over the top-``k`` positions ``j``
    that hold a positive label; a group with no positive in its top-``k``
    counts as AP = 0. Such groups must stay in the denominator, otherwise the
    result is only "AP given a hit" and can exceed the hit rate.

    Returns:
        A 0-dim tensor on ``scores.device``; 0.0 when no group is eligible.
    """
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        n_items = int(lengths[i])
        if n_items < min_group_size:
            continue

        s = scores[i][:n_items]
        y = labels[i][:n_items]

        top_idx = torch.topk(s, min(k, n_items)).indices
        hits = (y[top_idx] > 0).float()
        n_hits = hits.sum()

        if n_hits > 0:
            ranks = torch.arange(1, hits.numel() + 1, device=hits.device)
            precision_at_rank = hits.cumsum(dim=0) / ranks
            total_ap = total_ap + (precision_at_rank * hits).sum() / n_hits
        # Misses add 0 to the sum but still count as an evaluated group.
        valid_count += 1

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ap / valid_count



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise RankNet loss, summed over pairs and averaged over the batch.

    For every ordered pair ``(a, b)`` within a group where ``labels[a] >
    labels[b]`` the term ``-logsigmoid(score_a - score_b)`` is added. Only the
    first ``lengths[i]`` entries of row ``i`` are used. The total is divided
    by the number of rows in ``scores``.
    """
    n_groups = scores.size(0)
    accumulated = 0.0

    for row in range(n_groups):
        n_items = int(lengths[row])
        s = scores[row, :n_items]
        y = labels[row, :n_items]

        # prefers[a, b] is 1 where item a should be ranked above item b.
        prefers = (y[:, None] > y[None, :]).float()
        margins = s[:, None] - s[None, :]

        accumulated += (-prefers * F.logsigmoid(margins)).sum()

    return accumulated / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """
    Margin-based pairwise loss restricted to pairs touching the label top-K.

    A pair ``(a, b)`` contributes when ``labels[a] > labels[b]`` and at least
    one of the two items is among the ``focus_topk`` highest *labels* of its
    group. Its term is ``-logsigmoid((s_a - s_b - margin) / temperature)``
    weighted by ``|labels[a] - labels[b]|``. The result is the weighted sum
    divided by the total weight over the whole batch. Groups with fewer than
    two items are skipped; with no contributing pair at all a zero tensor
    with ``requires_grad=True`` is returned.
    """
    weighted_losses = []
    pair_weights = []

    for row in range(scores.size(0)):
        n_items = int(lengths[row])
        if n_items < 2:
            continue

        s = scores[row, :n_items]
        y = labels[row, :n_items]

        # Flag the items holding the top-K ground-truth labels.
        in_topk = torch.zeros_like(y, dtype=torch.bool)
        in_topk[torch.topk(y, min(focus_topk, n_items)).indices] = True

        label_gap = y[:, None] - y[None, :]
        touches_topk = in_topk[:, None] | in_topk[None, :]
        eligible = (label_gap > 0) & touches_topk
        if eligible.sum() == 0:
            continue

        weights = (label_gap.abs() * eligible).float()

        logits = s[:, None] - s[None, :]
        if margin > 0.0:
            logits = logits - margin
        logits = logits / temperature

        weighted_losses.append((-F.logsigmoid(logits) * weights).sum())
        pair_weights.append(weights.sum())

    if not pair_weights:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return sum(weighted_losses) / sum(pair_weights)


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Each batch is scored by ``model`` and optimised with ``xranknet_loss``;
    both the optimizer and the scheduler are stepped once per batch.
    """
    model.train()
    running_loss = 0.0

    progress = tqdm(loader, desc="Training", leave=False)
    for cat_batch, num_batch, targets, lengths in progress:
        num_batch = num_batch.to(device)
        targets = targets.to(device)
        cat_batch = {name: codes.to(device) for name, codes in cat_batch.items()}
        # Accept list or tensor lengths; work on a detached copy on the device.
        lengths = torch.as_tensor(lengths).detach().clone().to(device)

        optimizer.zero_grad()
        batch_scores = model(cat_batch, num_batch)
        batch_loss = xranknet_loss(
            batch_scores, targets, lengths, temperature=1.0, margin=0.2, focus_topk=3
        )
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    return running_loss / len(loader)


def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    The ranking metrics (HitRate@3, NDCG@3, MAP@3) are computed per batch by
    the metric helpers, which only look at groups with at least
    ``min_group_size`` items. Each batch value is therefore weighted by the
    number of such eligible groups in the batch (not by the batch size), so
    batches with few or no eligible groups do not distort the average. The
    loss is averaged over all groups.

    Returns:
        ``(mean_loss, hitrate, ndcg, map)`` where ``mean_loss`` is a float and
        the three metrics are 0-dim tensors on ``device`` (0.0 when no group
        is eligible).
    """
    model.eval()
    hitrate_sum = 0.0
    ndcg_sum = 0.0
    map_sum = 0.0
    loss_sum = 0.0
    n_groups = 0
    n_eligible = 0

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):
            x_num, y = x_num.to(device), y.to(device)
            # as_tensor accepts lists and tensors alike and avoids the
            # copy-construct warning torch.tensor() raises on tensor input.
            lengths = torch.as_tensor(lengths).to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            ndcg = ndcg_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            map3 = map_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)

            batch_size = scores.size(0)
            eligible = int((lengths >= min_group_size).sum())

            # float() handles both tensor and plain-number metric values.
            hitrate_sum += float(hitrate) * eligible
            ndcg_sum += float(ndcg) * eligible
            map_sum += float(map3) * eligible
            loss_sum += loss.item() * batch_size

            n_groups += batch_size
            n_eligible += eligible

    def _mean_metric(total):
        value = total / n_eligible if n_eligible else 0.0
        return torch.tensor(value, device=device)

    mean_loss = loss_sum / n_groups if n_groups else 0.0
    return mean_loss, _mean_metric(hitrate_sum), _mean_metric(ndcg_sum), _mean_metric(map_sum)


def build_cat_dims(X, cat_features):
    """Return ``{feature: max code + 1}`` for every categorical feature.

    A column whose maximum is negative is treated as having maximum 0, so the
    reported size is never below 1.
    """
    return {name: max(X[name].max(), 0) + 1 for name in cat_features}


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardise numeric columns using statistics of the training rows only.

    For each column in ``numeric_features`` the mean and standard deviation
    are computed over the rows selected by the boolean ``train_mask`` and then
    applied to every row as ``(value - mean) / std``.

    NaN entries are ignored when computing the statistics. A plain
    ``mean()``/``std()`` would turn NaN as soon as one training value is NaN
    and wipe out the entire column. NaN entries themselves stay NaN. A
    standard deviation that is zero (or not a number) is replaced by 1.0, so
    constant columns are only centred.

    Returns:
        ``X`` with the listed columns replaced by their normalised values.
    """
    normalized = []
    for col in numeric_features:
        values = X[col].to_numpy()
        train_vals = values[train_mask]

        mean = np.nanmean(train_vals)
        std = np.nanstd(train_vals)
        if not std > 0:  # also catches NaN
            std = 1.0

        normalized.append(pl.Series(col, (values - mean) / std))

    # Replace all columns in one call instead of rebuilding the frame per column.
    return X.with_columns(normalized) if normalized else X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Return the feature-engineered frame plus the test ``Id`` / ``ranker_id`` columns.

    If ``data_dir/cache_file`` exists it is read as-is. Otherwise train and
    test are read, the test rows get a constant ``selected = 0`` column, both
    are concatenated (train first), passed through
    ``feature_engineering(df, full=True)``, nulls are filled (0 for numeric
    columns, "missing" for string columns) and the result is written to the
    cache path. In both cases the test file is read to obtain its ids.

    Returns:
        ``(df, test_ids, test_rankers)``.
    """
    cache_path = os.path.join(data_dir, cache_file)
    test_path = os.path.join(data_dir, test_file)

    # Fast path: reuse the cached features, only the test ids are re-read.
    if os.path.exists(cache_path):
        print(f"Loading cached feature-engineered data from {cache_path} ...")
        df = pl.read_parquet(cache_path)
        raw_test = pl.read_parquet(test_path).drop("__index_level_0__")
        return df, raw_test["Id"], raw_test["ranker_id"]

    print("Cached file not found, processing raw data ...")

    train = pl.read_parquet(os.path.join(data_dir, train_file)).drop("__index_level_0__")
    test = pl.read_parquet(test_path).drop("__index_level_0__")
    # Test rows carry no target; add a constant one so the schemas line up.
    test = test.with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))

    test_ids = test["Id"]
    test_rankers = test["ranker_id"]

    df = feature_engineering(pl.concat((train, test)), full=True)

    numeric_cols = df.select(pl.selectors.numeric()).columns
    string_cols = df.select(pl.selectors.string()).columns
    null_fills = [pl.col(c).fill_null(0) for c in numeric_cols]
    null_fills += [pl.col(c).fill_null("missing") for c in string_cols]
    df = df.with_columns(null_fills)

    os.makedirs(data_dir, exist_ok=True)
    df.write_parquet(cache_path)
    print(f"Saved feature-engineered data to {cache_path}")

    return df, test_ids, test_rankers


def prepare_data(data_dir):
    """Load the feature table and turn it into model-ready inputs.

    Steps: load (or build) the feature-engineered frame, run
    ``feature_selection`` on it, re-attach the ``Id`` column, replace every
    categorical column by its zero-based dense rank (nulls become -1, dtype
    Int32), and normalise the numeric columns via
    ``normalize_numeric_features``.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size, test_ids,
        test_rankers)``.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    # Keep the row identifier next to the features; it is trimmed to X's height.
    n_rows = X.shape[0]
    X = X.with_columns(pl.Series("Id", id_column[:n_rows]))

    # Dense-rank encoding: smallest category -> 0, next -> 1, ...; nulls -> -1.
    encoders = []
    for name in cat_features:
        dense_code = pl.col(name).rank("dense") - 1
        encoders.append(dense_code.fill_null(-1).cast(pl.Int32))
    X = X.with_columns(encoders)

    # Hard-coded number of leading rows treated as training data; only these
    # rows feed the normalisation statistics.
    train_size = 18145372
    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return (
        X,
        y,
        groups,
        cat_features,
        num_features,
        train_size,
        test_ids,
        test_rankers,
    )


class EarlyStopping:
    """Signal a stop once a maximised metric has stalled for ``patience`` steps.

    A step counts as an improvement only when the metric exceeds the best
    value seen so far by more than ``min_delta``.
    """

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric`` (higher is better); return True when training should stop."""
        improved = metric > self.best + self.min_delta
        if improved:
            self.best = metric
        # The stall counter resets on improvement and grows otherwise.
        self.counter = 0 if improved else self.counter + 1
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scale histogram of the number of items per group.

    Group sizes are read from ``dataset.group_to_indices``. The figure is
    written to ``save_path`` when one is given, otherwise shown interactively;
    it is closed afterwards either way.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.title("Distribution of Group Sizes")
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def _save_line_plot(epochs, curves, ylabel, title, path):
    """Plot per-epoch curves on one figure and save it to ``path``.

    ``curves`` is a list of ``(values, label, color)`` tuples; ``color`` may
    be None to let matplotlib pick the colour.
    """
    plt.figure()
    for values, label, color in curves:
        plt.plot(epochs, values, label=label, color=color)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.savefig(path)
    plt.close()


def _predict_row_scores(model, loader, dataset, device, desc):
    """Score every row of ``dataset`` and return the scores in row order.

    ``loader`` must iterate ``dataset`` without shuffling, so that the i-th
    group it yields is ``dataset.group_keys[i]``. The model emits scores group
    by group; they are scattered back through ``dataset.group_to_indices`` so
    the result lines up with the original rows even when the rows of one
    group are not contiguous.

    Raises:
        RuntimeError: if the number of scores differs from the number of rows.
    """
    chunks = []
    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(loader, desc=desc):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num).cpu().numpy()
            # Drop the padded tail of every group in the batch.
            for padded, n in zip(scores, (int(v) for v in lengths)):
                chunks.append(padded[:n])

    group_ordered = np.concatenate(chunks)
    row_positions = np.concatenate(
        [np.asarray(dataset.group_to_indices[g]) for g in dataset.group_keys]
    )
    if len(group_ordered) != len(row_positions):
        raise RuntimeError(
            f"Mismatch: {len(group_ordered)} scores vs {len(row_positions)} rows"
        )

    row_ordered = np.empty_like(group_ordered)
    row_ordered[row_positions] = group_ordered
    return row_ordered


def main():
    """Train the attention ranker and write validation / test predictions.

    The function prepares the data, splits it by row position into train,
    validation and test parts, trains for up to 10 epochs with early stopping
    on the validation loss, saves metric plots, and finally writes a CSV with
    validation predictions and the submission CSV.

    A checkpoint is written only for epochs in which the validation loss and
    HitRate@3 both improve. If no epoch of this run qualified, prediction
    uses the weights currently in memory instead of loading a file that this
    run did not produce.
    """
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    best_val_loss = float("inf")
    best_hitrate = 0.0
    checkpoint_saved = False
    model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_losses = []
    val_losses = []
    val_hitrates = []
    val_ndcgs = []
    val_maps = []
    learning_rates = []

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    early_stopper = EarlyStopping(patience=3, min_delta=0.001)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Replace NaN by 0 in every Float64 column.
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    for col in float_cols:
        X = X.with_columns(
            pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
        )

    # Positional split: [0, n1) train, [n1, n2) validation, [n2, end) test.
    n1 = 16487352
    # n1 = 17487300
    n2 = train_size

    X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
    y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
    groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

    # Dump size and first/last row position of every validation group.
    val_rankers = groups_va["ranker_id"].to_numpy()
    group_to_rows = defaultdict(list)
    for i, g in enumerate(val_rankers):
        group_to_rows[g].append(i)

    print("\n📋 Validation group statistics:")
    group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
    with open(group_info_path, "w", encoding="utf-8") as f:
        f.write(f"Total groups: {len(group_to_rows)}\n\n")
        f.write("GroupID\tSize\tStartIdx\tEndIdx\n")
        for group_id, indices in sorted(group_to_rows.items()):
            f.write(f"{group_id}\t{len(indices)}\t{indices[0]}\t{indices[-1]}\n")

    print(f"📁 Saved validation group info to {group_info_path}")

    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    # Validation and test loaders must stay unshuffled: prediction relies on
    # batches following the datasets' group order.
    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    num_epochs = 10
    num_training_steps = num_epochs * len(train_loader)
    num_warmup_steps = int(0.1 * num_training_steps)

    learning_rate = 1e-4

    model = DLRankerAttention(
        cat_dims=cat_dims,
        num_numeric_feats=num_numeric_feats,
        emb_dropout=0.2,
        dropout=0.2,
        num_heads=8,
        num_layers=3,
        dim_feedforward=256,
        proj_dim=64
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
        val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_hitrates.append(val_hitrate.item())
        val_ndcgs.append(val_ndcg.item())
        val_maps.append(val_map.item())

        current_lr = scheduler.get_last_lr()
        learning_rates.append(current_lr[0])

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"Val Loss={val_loss:.4f}, "
            f"HitRate@3={val_hitrate:.4f}, "
            f"NDCG@3={val_ndcg:.4f}, "
            f"MAP@3={val_map:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )

        # Checkpoint only when both criteria improve on the last saved epoch.
        if val_loss < best_val_loss and val_hitrate > best_hitrate:
            best_val_loss = val_loss
            best_hitrate = val_hitrate
            torch.save(model.state_dict(), model_path)
            checkpoint_saved = True
            print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

        # The stopper maximises its metric, hence the negated loss.
        if early_stopper.step(-val_loss):
            print("⛔ Early stopping triggered.")
            break

    # Plot metrics
    epochs = list(range(1, len(train_losses) + 1))

    _save_line_plot(
        epochs,
        [(train_losses, "Train Loss", None), (val_losses, "Val Loss", None)],
        "Loss",
        "Training & Validation Loss",
        os.path.join(MODEL_DIR, "loss_curve.png"),
    )
    _save_line_plot(
        epochs,
        [(val_hitrates, "Val HitRate@3", "green")],
        "HitRate@3",
        "Validation HitRate@3",
        os.path.join(MODEL_DIR, "hitrate_curve.png"),
    )
    _save_line_plot(
        epochs,
        [(learning_rates, "Learning Rate", "orange")],
        "LR",
        "Learning Rate Schedule",
        os.path.join(MODEL_DIR, "lr_curve.png"),
    )
    _save_line_plot(
        epochs,
        [(val_ndcgs, "Val NDCG@3", "purple")],
        "NDCG@3",
        "Validation NDCG@3",
        os.path.join(MODEL_DIR, "ndcg_curve.png"),
    )
    _save_line_plot(
        epochs,
        [(val_maps, "Val MAP@3", "red")],
        "MAP@3",
        "Validation MAP@3",
        os.path.join(MODEL_DIR, "map_curve.png"),
    )

    print("📊 Saved metric plots to model/")

    # Restore the best checkpoint written by this run, if there is one.
    if checkpoint_saved:
        model.load_state_dict(torch.load(model_path, map_location=device))
    else:
        print("No checkpoint was saved during this run; predicting with the final weights.")
    model.eval()

    # Validation predictions (kept together with the features for inspection).
    all_val_scores = _predict_row_scores(
        model, val_loader, val_dataset, device, "Predicting validation set"
    )

    val_df = X_va.with_columns([
        pl.Series("Id", X_va["Id"]),
        pl.Series("ranker_id", groups_va["ranker_id"]),
        pl.Series("score", all_val_scores),
        pl.Series("label", y_va)
    ])

    # Rank 1 = highest score within each ranker_id.
    val_df = val_df.with_columns(
        pl.col("score")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
    val_df.write_csv(val_save_path)
    print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

    # Test predictions and submission file.
    all_scores = _predict_row_scores(
        model, test_loader, test_dataset, device, "Predicting test set"
    )

    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": all_scores
    })

    submission = (
        submission_df
        .with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )
        .select(["Id", "ranker_id", "score", "selected"])
    )

    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
    submission.write_csv(submission_path)
    print(f"✅ Submission saved to {submission_path}")


# Run the full pipeline only when this file is executed as a script, not
# when it is imported.
if __name__ == "__main__":
    main()

Overwriting deeprec_validate.py


In [None]:
--- Epoch 1/10 ---
Validating:   0%|          | 0/2636 [00:00<?, ?it/s]/home/ionut/test_whisper/test2/deeprec_validate.py:507: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  lengths = torch.tensor(lengths).to(device)
Epoch 1: Train Loss=0.4052, Val Loss=0.3081, HitRate@3=0.4668, NDCG@3=0.3688, MAP@3=0.6106, LR=0.000100 (712.8s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 2/10 ---
Epoch 2: Train Loss=0.2998, Val Loss=0.2927, HitRate@3=0.4705, NDCG@3=0.3740, MAP@3=0.6192, LR=0.000089 (713.7s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 3/10 ---
Epoch 3: Train Loss=0.2819, Val Loss=0.2735, HitRate@3=0.4969, NDCG@3=0.3980, MAP@3=0.6364, LR=0.000078 (714.7s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 4/10 ---
Epoch 4: Train Loss=0.2688, Val Loss=0.2676, HitRate@3=0.4995, NDCG@3=0.4034, MAP@3=0.6480, LR=0.000067 (713.3s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 5/10 ---
Epoch 5: Train Loss=0.2604, Val Loss=0.2637, HitRate@3=0.4975, NDCG@3=0.3973, MAP@3=0.6355, LR=0.000056 (715.3s)

--- Epoch 6/10 ---
Epoch 6: Train Loss=0.2508, Val Loss=0.2598, HitRate@3=0.5024, NDCG@3=0.4057, MAP@3=0.6475, LR=0.000044 (716.7s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 7/10 ---
Epoch 7: Train Loss=0.2424, Val Loss=0.2582, HitRate@3=0.5061, NDCG@3=0.4069, MAP@3=0.6486, LR=0.000033 (714.8s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 8/10 ---
Epoch 8: Train Loss=0.2370, Val Loss=0.2533, HitRate@3=0.5108, NDCG@3=0.4109, MAP@3=0.6505, LR=0.000022 (714.3s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 9/10 ---
Epoch 9: Train Loss=0.2311, Val Loss=0.2529, HitRate@3=0.5183, NDCG@3=0.4153, MAP@3=0.6502, LR=0.000011 (712.8s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 10/10 ---
Epoch 10: Train Loss=0.2269, Val Loss=0.2518, HitRate@3=0.5139, NDCG@3=0.4138, MAP@3=0.6534, LR=0.000000 (716.0s)

In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter

class RankDataset(Dataset):
    """Dataset whose items are whole ranking groups (one ``ranker_id`` each).

    Rows are grouped by the ``ranker_id`` column of ``groups``; groups are
    numbered in order of first appearance. Item ``idx`` returns every row of
    group ``idx``.

    Args:
        X: feature table; must contain all ``cat_features`` and
            ``num_features`` columns.
        y: per-row label.
        groups: table with a ``ranker_id`` column aligned with ``X``.
        cat_features: names of the integer-coded categorical columns.
        num_features: names of the numeric columns.
    """

    def __init__(
        self,
        X: "pl.DataFrame",
        y: "pl.Series",
        groups: "pl.DataFrame",
        cat_features: list[str],
        num_features: list[str],
    ):
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # group key -> positions of its rows, in row order
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Number of groups."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for group ``idx``.

        ``x_cat`` maps each categorical feature to a long tensor of shape
        ``[length]`` (negative codes are replaced by 0), ``x_num`` is a float
        tensor of shape ``[length, len(num_features)]``, ``y`` is a float
        tensor of shape ``[length]`` and ``length`` is the group size.
        """
        g = self.group_keys[idx]
        inds = self.group_to_indices[g]
        length = len(inds)

        # Vectorised gather + clamp instead of a per-element Python loop:
        # this runs for every feature of every group on every epoch.
        x_cat = {
            c: torch.as_tensor(np.maximum(self.X_cat[c][inds], 0), dtype=torch.long)
            for c in self.cat_features
        }

        x_num = torch.as_tensor(self.X_num[inds], dtype=torch.float32)
        y = torch.as_tensor(self.y[inds], dtype=torch.float32).reshape(-1)

        return x_cat, x_num, y, length

def collate_fn(batch):
    """Zero-pad a list of groups to a common length and stack them.

    Each batch element is ``(x_cat, x_num, y, length)`` where ``x_cat`` maps
    feature name to a 1-D long tensor, ``x_num`` is ``[length, F]`` and ``y``
    is ``[length]``. Assumes ``length`` equals the first dimension of those
    tensors, as ``RankDataset.__getitem__`` produces.

    Returns:
        ``(padded_x_cat, padded_x_num, padded_y, lengths)`` with shapes
        ``{name: [B, L]}``, ``[B, L, F]``, ``[B, L]`` and ``[B]``, where
        ``L`` is the longest group in the batch.
    """
    pad_to_longest = torch.nn.utils.rnn.pad_sequence

    cat_parts, num_parts, label_parts, sizes = zip(*batch)

    padded_x_cat = {
        name: pad_to_longest([part[name] for part in cat_parts], batch_first=True)
        for name in cat_parts[0].keys()
    }
    padded_x_num = pad_to_longest(list(num_parts), batch_first=True)
    padded_y = pad_to_longest(list(label_parts), batch_first=True)
    lengths = torch.tensor(list(sizes), dtype=torch.long)

    return padded_x_cat, padded_x_num, padded_y, lengths

class SEModule(nn.Module):
    """Channel gate: rescale each channel by a weight in (0, 1).

    The weights come from a two-layer bottleneck MLP (``channel`` ->
    ``channel // reduction`` -> ``channel``, sigmoid output) applied to the
    input averaged over dimension 1.
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        bottleneck = channel // reduction
        layers = [
            nn.Linear(channel, bottleneck),
            nn.ReLU(inplace=True),
            nn.Linear(bottleneck, channel),
            nn.Sigmoid(),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Gate ``x`` of shape [B, N, C]; the output has the same shape."""
        pooled = torch.mean(x, dim=1)        # [B, C]
        gate = self.fc(pooled)               # [B, C]
        return x * gate[:, None, :]          # broadcast over N


class DLRankerAttention(nn.Module):
    """Listwise scorer: every item of a group attends to the other items.

    Categorical features are embedded, concatenated, projected to
    ``proj_dim`` and passed through a one-layer transformer encoder. Numeric
    features are projected to ``proj_dim``, passed through their own one-layer
    encoder, gated by an ``SEModule`` and projected again. Both halves are
    concatenated and refined by ``num_layers`` encoder layers of width
    ``2 * proj_dim`` before an MLP produces one score per item.

    Args:
        cat_dims: mapping feature name -> number of embedding rows.
        num_numeric_feats: width of the numeric input.
        emb_dropout: dropout applied to the concatenated embeddings.
        dropout: dropout used in the encoders and the MLP.
        num_heads: attention heads of the joint encoder layers (the two
            per-branch encoders use a fixed 4 heads).
        num_layers: number of joint encoder layers.
        dim_feedforward: feed-forward width of every encoder layer.
        proj_dim: width of each branch.
    """

    def __init__(
        self,
        cat_dims,
        num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64,
    ):
        super().__init__()

        # Embedding width grows with log2 of the cardinality, clipped to [4, 50].
        self.emb_dims = {
            f: min(50, max(4, round(6 * math.log2(cat_dims[f] + 1)))) for f in cat_dims
        }
        # Index 0 is the padding index of every embedding table.
        self.embeddings = nn.ModuleDict(
            {
                f: nn.Embedding(cat_dims[f], self.emb_dims[f], padding_idx=0)
                for f in cat_dims
            }
        )
        self.emb_dropout = nn.Dropout(emb_dropout)

        total_emb_dim = sum(self.emb_dims.values())
        print(
            f"Categorial emb dim sum: {total_emb_dim}, Numerical dim: {num_numeric_feats}"
        )

        self.cat_proj = nn.Linear(total_emb_dim, proj_dim)
        self.cat_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=4,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=1,
        )


        self.num_proj_in = nn.Linear(num_numeric_feats, proj_dim)
        self.num_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=4,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=1,
        )
        self.num_se = SEModule(proj_dim)
        self.num_proj_out = nn.Sequential(
            nn.Linear(proj_dim, proj_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
        )

        self.transformer_layers = nn.ModuleList(
            [
                nn.TransformerEncoderLayer(
                    d_model=2 * proj_dim,
                    nhead=num_heads,
                    dim_feedforward=dim_feedforward,
                    dropout=dropout,
                    batch_first=True,
                )
                for _ in range(num_layers)
            ]
        )

        self.mlp = nn.Sequential(
            nn.Linear(2 * proj_dim, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(64, 1),
        )

    def forward(self, x_cat: dict, x_num: torch.Tensor, attn_mask=None):
        """Score every item of every group.

        Args:
            x_cat: mapping feature name -> long tensor ``[B, N]``.
            x_num: float tensor ``[B, N, F_num]``.
            attn_mask: optional key-padding mask ``[B, N]`` in which True
                marks padded positions. When given it is applied to every
                attention block and padded rows are left out of the SE
                pooling, so real items are not influenced by padding. When
                None the whole ``N`` axis is treated as real items.

        Returns:
            Tensor ``[B, N]`` of scores (entries at padded positions carry no
            meaning).
        """
        cat_emb_list = []
        for f, emb_layer in self.embeddings.items():
            emb = emb_layer(x_cat[f])  # [B, N, emb_dim_f]
            cat_emb_list.append(emb)
        cat_embs = torch.cat(cat_emb_list, dim=-1)  # [B, N, total_emb_dim]
        cat_embs = self.emb_dropout(cat_embs)
        cat_feat = self.cat_proj(cat_embs)  # [B, N, proj_dim]
        cat_feat = self.cat_transformer(
            cat_feat, src_key_padding_mask=attn_mask
        )  # [B, N, proj_dim]

        num_feat = self.num_proj_in(x_num)  # [B, N, proj_dim]
        num_feat = self.num_transformer(
            num_feat, src_key_padding_mask=attn_mask
        )  # [B, N, proj_dim]
        if attn_mask is None:
            num_feat = self.num_se(num_feat)  # [B, N, proj_dim]
        else:
            # Same gate as SEModule, but pooled over the real items only.
            real = (~attn_mask.to(torch.bool)).unsqueeze(-1).to(num_feat.dtype)  # [B, N, 1]
            pooled = (num_feat * real).sum(dim=1) / real.sum(dim=1).clamp(min=1.0)
            num_feat = num_feat * self.num_se.fc(pooled).unsqueeze(1)
        num_feat = self.num_proj_out(num_feat)  # [B, N, proj_dim]

        x = torch.cat([cat_feat, num_feat], dim=-1)  # [B, N, 2*proj_dim]

        for layer in self.transformer_layers:
            x = layer(x, src_key_padding_mask=attn_mask)

        scores = self.mlp(x).squeeze(-1)  # [B, N]

        return scores


"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Fraction of groups with a positive label among their top-``k`` scores.

    Only the first ``lengths[i]`` entries of row ``i`` are used, and groups
    shorter than ``min_group_size`` are ignored. Returns a scalar tensor
    (0.0 when no group qualifies).
    """
    hit_sum = 0
    n_counted = 0
    for row in range(scores.size(0)):
        size = lengths[row]
        if size < min_group_size:
            continue
        depth = min(size, k)
        group_scores = scores[row][:size]
        group_labels = labels[row][:size]
        best_positions = torch.topk(group_scores, depth).indices
        # 1.0 if any of the best-scored items is relevant, else 0.0
        hit_sum += (group_labels[best_positions] > 0).any().float()
        n_counted += 1

    if n_counted > 0:
        return hit_sum / n_counted
    return torch.tensor(0.0, device=scores.device)


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@``k`` over the groups with at least ``min_group_size`` items.

    Gains are the raw labels and the discount at rank r (1-based) is
    ``log2(r + 1)``. A group whose ideal DCG is 0 contributes 0. Only the
    first ``lengths[i]`` entries of row ``i`` are used.
    """
    ndcg_sum = 0.0
    n_counted = 0

    for row in range(scores.size(0)):
        size = lengths[row]
        if size < min_group_size:
            continue

        group_scores = scores[row][:size]
        group_labels = labels[row][:size]
        depth = min(k, size)

        predicted = torch.topk(group_scores, depth).indices
        ideal = torch.topk(group_labels, depth).indices

        # Both rankings have the same depth, so they share the discounts.
        discounts = torch.log2(
            torch.arange(2, 2 + len(predicted), device=group_labels.device).float()
        )
        dcg = (group_labels[predicted] / discounts).sum()
        idcg = (group_labels[ideal] / discounts).sum()

        if idcg > 0:
            ndcg_sum += dcg / idcg
        n_counted += 1

    if n_counted > 0:
        return ndcg_sum / n_counted
    return torch.tensor(0.0, device=scores.device)


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision at ``k`` over the eligible groups.

    For each group with at least ``min_group_size`` items, the top-``k``
    scored items are inspected and the average precision is the mean of
    precision@r over the ranks r that hold a relevant item (label > 0). A
    group with no relevant item in its top-``k`` counts as AP = 0; leaving
    such groups out would average over successful groups only and overstate
    the metric.

    Args:
        scores: ``[B, N]`` predicted scores.
        labels: ``[B, N]`` relevance labels.
        lengths: per-row number of real (unpadded) items.
        k: cut-off rank.
        min_group_size: groups shorter than this are ignored.

    Returns:
        Scalar tensor on ``scores.device`` (0.0 when no group is eligible).
    """
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        n = int(lengths[i])
        if n < min_group_size:
            continue
        valid_count += 1

        s = scores[i][:n]
        y = labels[i][:n]

        top_idx = torch.topk(s, min(k, n)).indices
        hits = (y[top_idx] > 0).float()
        n_hits = hits.sum()
        if n_hits == 0:
            # Miss: contributes 0 to the sum but still counts in the mean.
            continue

        ranks = torch.arange(1, hits.numel() + 1, device=hits.device, dtype=hits.dtype)
        precision_at_rank = hits.cumsum(dim=0) / ranks
        total_ap = total_ap + (precision_at_rank * hits).sum() / n_hits

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ap / valid_count



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise logistic (RankNet) loss, summed per group, averaged per batch.

    For every ordered pair (a, b) inside a group with ``label[a] > label[b]``
    the term ``-logsigmoid(score[a] - score[b])`` is added. Only the first
    ``lengths[i]`` entries of row ``i`` take part. The total is divided by the
    number of rows in the batch.
    """
    n_rows = scores.size(0)
    loss_sum = 0.0

    for row in range(n_rows):
        size = lengths[row]
        group_scores = scores[row][:size]
        group_labels = labels[row][:size]

        # entry [a, b] compares item a against item b
        score_gap = group_scores[:, None] - group_scores[None, :]
        a_beats_b = (group_labels[:, None] > group_labels[None, :]).float()

        loss_sum += (-a_beats_b * F.logsigmoid(score_gap)).sum()

    return loss_sum / n_rows

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """Weighted pairwise logistic loss restricted to pairs touching the top-K.

    Within each group (first ``lengths[i]`` entries of row ``i``; groups with
    fewer than two items are skipped) the ``focus_topk`` items with the
    largest labels form the focus set. A pair (a, b) is used when
    ``label[a] > label[b]`` and at least one of the two items is in the focus
    set. Its loss is ``-logsigmoid((score[a] - score[b] - margin) /
    temperature)`` weighted by ``|label[a] - label[b]|`` (the margin is only
    subtracted when it is positive). The result is the weighted sum divided
    by the total weight over the batch, or a zero tensor that requires grad
    when no pair qualifies.
    """
    loss_sum = 0.0
    weight_sum = 0

    for row in range(scores.size(0)):
        size = lengths[row]
        if size < 2:
            continue

        group_scores = scores[row][:size]
        group_labels = labels[row][:size]

        # Focus set: positions of the highest ground-truth labels.
        n_focus = min(focus_topk, size)
        focus_positions = torch.topk(group_labels, n_focus).indices
        in_focus = torch.zeros_like(group_labels, dtype=torch.bool)
        in_focus[focus_positions] = True

        # entry [a, b] compares item a against item b
        score_gap = group_scores.unsqueeze(1) - group_scores.unsqueeze(0)
        label_gap = group_labels.unsqueeze(1) - group_labels.unsqueeze(0)

        touches_focus = in_focus.unsqueeze(1) | in_focus.unsqueeze(0)
        usable = (label_gap > 0) & touches_focus
        if usable.sum() == 0:
            continue

        weights = (label_gap.abs() * usable).float()

        if margin > 0.0:
            score_gap = score_gap - margin
        score_gap = score_gap / temperature

        loss_sum += (-F.logsigmoid(score_gap) * weights).sum()
        weight_sum += weights.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader``.

    Every batch is moved to ``device``, scored by ``model`` and optimised
    against ``xranknet_loss`` (temperature 1.0, margin 0.2, focus on the top
    3). The scheduler is stepped once per batch.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    running_loss = 0.0

    progress = tqdm(loader, desc="Training", leave=False)
    for cat_batch, num_batch, targets, lengths in progress:
        num_batch = num_batch.to(device)
        targets = targets.to(device)
        if not isinstance(lengths, torch.Tensor):
            lengths = torch.tensor(lengths)
        lengths = lengths.detach().clone().to(device)
        cat_batch = {name: column.to(device) for name, column in cat_batch.items()}

        optimizer.zero_grad()
        predictions = model(cat_batch, num_batch)
        batch_loss = xranknet_loss(
            predictions, targets, lengths, temperature=1.0, margin=0.2, focus_topk=3
        )
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    return running_loss / len(loader)


def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: ranker called as ``model(x_cat, x_num)``.
        loader: yields ``(x_cat, x_num, y, lengths)`` batches.
        device: device the batches are moved to.
        min_group_size: forwarded to the metric helpers, which ignore groups
            with fewer items than this.

    Returns:
        ``(loss, hitrate@3, ndcg@3, map@3)``. ``loss`` is a float: the
        per-batch ``xranknet_loss`` averaged with batch-size weights. The
        three metrics are scalar tensors averaged over batches, each batch
        weighted by the number of its groups that reach ``min_group_size``.
        All values are 0 when the loader is empty or no group is eligible.
    """
    model.eval()
    total_hitrate = torch.zeros((), device=device)
    total_ndcg = torch.zeros((), device=device)
    total_map = torch.zeros((), device=device)
    total_loss = 0.0
    n_groups = 0
    n_eligible = 0

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):
            x_num, y = x_num.to(device), y.to(device)
            # `lengths` normally already is a tensor; re-wrapping a tensor in
            # torch.tensor() copies it and emits a UserWarning.
            if not isinstance(lengths, torch.Tensor):
                lengths = torch.tensor(lengths)
            lengths = lengths.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            ndcg = ndcg_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            map3 = map_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)

            batch_size = scores.size(0)
            # The metric helpers skip groups below min_group_size, so a batch
            # must weigh in with its eligible groups only; weighting by the
            # full batch size would let batches with few or no eligible
            # groups pull the averages towards 0.
            eligible = int((lengths >= min_group_size).sum())

            total_hitrate = total_hitrate + hitrate * eligible
            total_ndcg = total_ndcg + ndcg * eligible
            total_map = total_map + map3 * eligible
            total_loss += loss.item() * batch_size
            n_groups += batch_size
            n_eligible += eligible

    metric_weight = max(n_eligible, 1)
    return (
        total_loss / max(n_groups, 1),
        total_hitrate / metric_weight,
        total_ndcg / metric_weight,
        total_map / metric_weight,
    )


def build_cat_dims(X, cat_features):
    """Return ``{feature: largest code + 1}`` for each categorical column.

    A column whose largest value is negative is treated as if its maximum
    were 0, so every feature gets a size of at least 1.
    """
    return {name: max(X[name].max(), 0) + 1 for name in cat_features}


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardise the given columns with statistics from the masked rows.

    For every column in ``numeric_features`` the mean and standard deviation
    are computed on the rows selected by ``train_mask`` and then applied to
    all rows. A standard deviation that is not greater than 0 is replaced by
    1.0 so the column is only centred.

    Returns:
        ``X`` with the listed columns replaced by their standardised values.
    """
    for name in numeric_features:
        column = X[name].to_numpy()
        reference = column[train_mask]

        center = reference.mean()
        spread = reference.std()
        if not spread > 0:
            spread = 1.0

        scaled = (column - center) / spread
        X = X.with_columns(pl.Series(name, scaled).alias(name))
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Return the feature-engineered table plus the test identifiers.

    If ``cache_file`` exists in ``data_dir`` it is loaded as is. Otherwise the
    raw train and test files are read, the test rows get a ``selected``
    column filled with 0, both are concatenated (train first) and passed to
    ``feature_engineering``; remaining nulls become 0 in numeric columns and
    ``"missing"`` in string columns, and the result is written to the cache.

    The ``Id`` and ``ranker_id`` columns of the raw test file are always read
    from ``test_file``.

    Returns:
        ``(df, test_ids, test_rankers)``.
    """
    cache_path = os.path.join(data_dir, cache_file)
    test_path = os.path.join(data_dir, test_file)

    # Fast path: reuse the cached features.
    if os.path.exists(cache_path):
        print(f"Loading cached feature-engineered data from {cache_path} ...")
        df = pl.read_parquet(cache_path)

        raw_test = pl.read_parquet(test_path).drop("__index_level_0__")
        return df, raw_test["Id"], raw_test["ranker_id"]

    print("Cached file not found, processing raw data ...")

    train = pl.read_parquet(os.path.join(data_dir, train_file)).drop("__index_level_0__")
    raw_test = pl.read_parquet(test_path).drop("__index_level_0__")
    # Give the test rows the label column the train rows already have.
    test = raw_test.with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))

    test_ids = test["Id"]
    test_rankers = test["ranker_id"]

    df = feature_engineering(pl.concat((train, test)), full=True)

    numeric_fills = [
        pl.col(c).fill_null(0) for c in df.select(pl.selectors.numeric()).columns
    ]
    string_fills = [
        pl.col(c).fill_null("missing") for c in df.select(pl.selectors.string()).columns
    ]
    df = df.with_columns(numeric_fills + string_fills)

    os.makedirs(data_dir, exist_ok=True)
    df.write_parquet(cache_path)
    print(f"Saved feature-engineered data to {cache_path}")

    return df, test_ids, test_rankers


def prepare_data(data_dir):
    """Load the feature table and turn it into model-ready inputs.

    Steps performed here:
      * load (or build) the feature-engineered frame via
        ``load_or_prepare_data``;
      * split it with ``feature_selection`` and re-attach the ``Id`` column;
      * replace every categorical column by its dense rank minus one
        (nulls become -1), stored as Int32;
      * standardise numeric columns with statistics taken from the rows whose
        position is below the hard-coded ``train_size``.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size, test_ids,
        test_rankers)``.
    """
    frame, test_ids, test_rankers = load_or_prepare_data(data_dir)

    row_ids = frame["Id"]
    X, y, groups, cat_features, num_features = feature_selection(frame)

    # Put the identifier back next to the selected features.
    n_rows = X.shape[0]
    X = X.with_columns(pl.Series("Id", row_ids[:n_rows]))

    # Dense integer codes for the categorical columns.
    dense_codes = []
    for name in cat_features:
        zero_based_rank = pl.col(name).rank("dense") - 1
        dense_codes.append(zero_based_rank.fill_null(-1).cast(pl.Int32))
    X = X.with_columns(dense_codes)

    # Rows before this position are the training rows; only they contribute
    # to the normalisation statistics.
    train_size = 18145372
    is_train_row = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, is_train_row)

    return (
        X,
        y,
        groups,
        cat_features,
        num_features,
        train_size,
        test_ids,
        test_rankers,
    )


class EarlyStopping:
    """Stop signal for a metric that should increase.

    ``step`` is called once per evaluation. A value counts as an improvement
    only if it beats the best value seen so far by more than ``min_delta``;
    otherwise an internal counter grows. ``step`` returns True once the
    counter reaches ``patience``.
    """

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric`` and report whether training should stop."""
        threshold = self.best + self.min_delta
        if metric > threshold:
            # New best: remember it and restart the patience window.
            self.best, self.counter = metric, 0
        else:
            self.counter = self.counter + 1
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Histogram (log-scaled counts) of the number of rows per group.

    Args:
        dataset: object exposing ``group_to_indices``, a mapping from group
            key to the list of row indices in that group.
        save_path: where to write the figure; when falsy the figure is shown
            interactively instead.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Distribution of Group Sizes")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def main():
    """Train the deep ranker, plot its metrics and write predictions.

    Steps, in order:
      1. Build model inputs with ``prepare_data`` and replace NaN by 0 in
         every Float64 column.
      2. Split rows positionally: ``[0, n1)`` train, ``[n1, train_size)``
         validation, ``[train_size, end)`` test.
      3. Write per-group statistics of the validation slice to a text file.
      4. Train ``DLRankerAttention`` for up to ``num_epochs`` epochs with
         AdamW and a "linear" scheduler (10% of the steps as warm-up),
         checkpointing and early-stopping along the way.
      5. Save loss / HitRate@3 / LR / NDCG@3 / MAP@3 curves under ``MODEL_DIR``.
      6. Reload the saved checkpoint, score the validation and test loaders,
         rank rows inside each ``ranker_id`` and write both results as CSV.
    """
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    # Best values seen at the last checkpoint save.
    best_val_loss = float("inf")
    best_hitrate = 0.0
    model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Per-epoch history used for the plots after training.
    train_losses = []
    val_losses = []
    val_hitrates = []
    val_ndcgs = []
    val_maps = []
    learning_rates = []

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    early_stopper = EarlyStopping(patience=3, min_delta=0.001)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Replace NaN with 0 in every Float64 column (nulls are not touched here).
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    for col in float_cols:
        X = X.with_columns(
            pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
        )

    # Positional split boundaries: rows before n1 train, n1..n2 validate, rest test.
    # NOTE(review): n1 is a hard-coded row index; presumably chosen so that no
    # ranker_id group straddles the boundary - confirm against the data.
    n1 = 16487352
    # n1 = 17487300
    n2 = train_size

    X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
    y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
    groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

    # Inspect group sizes in validation set
    val_rankers = groups_va["ranker_id"].to_numpy()
    val_group_counts = Counter(val_rankers)

    print("\n📋 Validation group statistics:")
    # Save detailed group info to file
    group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
    with open(group_info_path, "w") as f:
        f.write(f"Total groups: {len(val_group_counts)}\n\n")
        f.write("GroupID\tSize\tStartIdx\tEndIdx\n")

        # Row positions (relative to the validation slice) of every group.
        group_to_rows = defaultdict(list)
        for i, g in enumerate(val_rankers):
            group_to_rows[g].append(i)

        # One line per group, sorted by group id; start/end are the first and
        # last row positions collected for that group.
        for group_id, indices in sorted(group_to_rows.items()):
            start_idx = indices[0]
            end_idx = indices[-1]
            size = len(indices)
            f.write(f"{group_id}\t{size}\t{start_idx}\t{end_idx}\n")

    print(f"📁 Saved validation group info to {group_info_path}")


    # Embedding table sizes are derived from the full table (train + val + test).
    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    # batch_size counts groups (one dataset item per group), not rows.
    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    # The scheduler is sized for the full run: one step per training batch.
    num_epochs = 10
    num_training_steps = num_epochs * len(train_loader)
    num_warmup_steps = int(0.1 * num_training_steps)

    learning_rate = 1e-4

    model = DLRankerAttention(
        cat_dims=cat_dims,
        num_numeric_feats=num_numeric_feats,
        emb_dropout=0.25,
        dropout=0.25,
        num_heads=8,
        num_layers=4,
        dim_feedforward=512,
        proj_dim=128
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    # NOTE(review): best_hitrate was already initialised to 0.0 above; this
    # second assignment is redundant.
    best_hitrate = 0
    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
        val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

        # .item() is called on the ranking metrics, so validate() is expected
        # to return them as tensors.
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_hitrates.append(val_hitrate.item())
        val_ndcgs.append(val_ndcg.item())
        val_maps.append(val_map.item())

        # Learning rate as reported by the scheduler at the end of the epoch.
        current_lr = scheduler.get_last_lr()
        learning_rates.append(current_lr[0])

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"Val Loss={val_loss:.4f}, "
            f"HitRate@3={val_hitrate:.4f}, "
            f"NDCG@3={val_ndcg:.4f}, "
            f"MAP@3={val_map:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )


        # Checkpoint only when BOTH the loss is lower and the hit rate is
        # higher than at the previously saved checkpoint.
        if val_loss < best_val_loss and val_hitrate > best_hitrate:
            best_val_loss = val_loss
            best_hitrate = val_hitrate
            torch.save(model.state_dict(), model_path)
            print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

        # EarlyStopping treats larger values as better, hence the negated loss.
        if early_stopper.step(-val_loss):
            print("⛔ Early stopping triggered.")
            break

    # Plot metrics
    epochs = list(range(1, len(train_losses) + 1))

    # Plot Loss Curve
    plt.figure()
    plt.plot(epochs, train_losses, label="Train Loss")
    plt.plot(epochs, val_losses, label="Val Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training & Validation Loss")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "loss_curve.png"))
    plt.close()

    # Plot HitRate@3
    plt.figure()
    plt.plot(epochs, val_hitrates, label="Val HitRate@3", color="green")
    plt.xlabel("Epoch")
    plt.ylabel("HitRate@3")
    plt.title("Validation HitRate@3")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "hitrate_curve.png"))
    plt.close()

    # Plot Learning Rate
    plt.figure()
    plt.plot(epochs, learning_rates, label="Learning Rate", color="orange")
    plt.xlabel("Epoch")
    plt.ylabel("LR")
    plt.title("Learning Rate Schedule")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "lr_curve.png"))
    plt.close()

    # Plot NDCG@3
    plt.figure()
    plt.plot(epochs, val_ndcgs, label="Val NDCG@3", color="purple")
    plt.xlabel("Epoch")
    plt.ylabel("NDCG@3")
    plt.title("Validation NDCG@3")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "ndcg_curve.png"))
    plt.close()

    # Plot MAP@3
    plt.figure()
    plt.plot(epochs, val_maps, label="Val MAP@3", color="red")
    plt.xlabel("Epoch")
    plt.ylabel("MAP@3")
    plt.title("Validation MAP@3")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "map_curve.png"))
    plt.close()

    print("📊 Saved metric plots to model/")

    # Reload the saved checkpoint before scoring validation and test data.
    # NOTE(review): model_path is only written when the save condition in the
    # training loop was met at least once; torch.load is called without
    # map_location - confirm both are acceptable.
    model.load_state_dict(torch.load(model_path))
    model.eval()

    val_ids = X_va["Id"]
    val_rankers = groups_va["ranker_id"]
    val_labels = y_va

    all_val_scores = []

    # Score the validation loader; only the first `lengths[i]` entries of each
    # padded row are kept.
    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(val_loader, desc="Predicting validation set"):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)

            for i in range(scores.size(0)):
                l = lengths[i]
                all_val_scores.append(scores[i, :l].cpu().numpy())

    # NOTE(review): scores are concatenated in loader (group) order and then
    # attached to X_va by position; this presumably relies on every ranker_id
    # occupying a contiguous run of rows - confirm.
    all_val_scores = np.concatenate(all_val_scores)

    assert len(all_val_scores) == X_va.shape[0], \
        f"Mismatch: {len(all_val_scores)} scores vs {X_va.shape[0]} rows"

    val_df = X_va.with_columns([
        pl.Series("Id", val_ids),
        pl.Series("ranker_id", val_rankers),
        pl.Series("score", all_val_scores),
        pl.Series("label", val_labels)
    ])

    # Rank 1 = highest score inside each ranker_id; "ordinal" gives tied scores
    # distinct ranks.
    val_df = val_df.with_columns(
        pl.col("score")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
    val_df.write_csv(val_save_path)
    print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

    all_scores = []

    # Same scoring procedure for the test loader.
    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(test_loader, desc="Predicting test set"):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)

            for i in range(scores.size(0)):
                l = lengths[i]
                all_scores.append(scores[i, :l].cpu().numpy())

    all_scores = np.concatenate(all_scores)

    assert len(all_scores) == X_te.shape[0], \
        f"Mismatch: {len(all_scores)} scores vs {X_te.shape[0]} rows in X_te"

    # NOTE(review): test_ids / test_rankers come from prepare_data and are
    # paired with the scores by position; presumably they share X_te's row
    # order - confirm.
    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": all_scores
    })

    submission = (
        submission_df
        .with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )
        .select(["Id", "ranker_id", "score", "selected"])
    )

    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
    submission.write_csv(submission_path)
    print(f"✅ Submission saved to {submission_path}")


# Script entry point: run the pipeline only when the file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()

Writing deeprec_validate.py


In [None]:
--- Epoch 1/10 ---
Validating:   0%|          | 0/2636 [00:00<?, ?it/s]/home/ionut/test_whisper/test2/deeprec_validate.py:507: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  lengths = torch.tensor(lengths).to(device)
Epoch 1: Train Loss=0.3997, Val Loss=0.3201, HitRate@3=0.4613, NDCG@3=0.3643, MAP@3=0.6071, LR=0.000100 (864.1s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 2/10 ---
Epoch 2: Train Loss=0.3148, Val Loss=0.2894, HitRate@3=0.4814, NDCG@3=0.3850, MAP@3=0.6309, LR=0.000089 (862.7s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 3/10 ---
Epoch 3: Train Loss=0.2954, Val Loss=0.2887, HitRate@3=0.4845, NDCG@3=0.3857, MAP@3=0.6258, LR=0.000078 (864.2s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 4/10 ---
Epoch 4: Train Loss=0.2842, Val Loss=0.2762, HitRate@3=0.4923, NDCG@3=0.3928, MAP@3=0.6363, LR=0.000067 (865.1s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 5/10 ---
Epoch 5: Train Loss=0.2741, Val Loss=0.2660, HitRate@3=0.4970, NDCG@3=0.3983, MAP@3=0.6404, LR=0.000056 (864.5s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 6/10 ---
Epoch 6: Train Loss=0.2656, Val Loss=0.2749, HitRate@3=0.5043, NDCG@3=0.4017, MAP@3=0.6417, LR=0.000044 (864.4s)

--- Epoch 7/10 ---
Epoch 7: Train Loss=0.2557, Val Loss=0.2626, HitRate@3=0.5066, NDCG@3=0.4066, MAP@3=0.6499, LR=0.000033 (865.9s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 8/10 ---
Epoch 8: Train Loss=0.2512, Val Loss=0.2622, HitRate@3=0.5017, NDCG@3=0.4032, MAP@3=0.6473, LR=0.000022 (865.0s)

--- Epoch 9/10 ---
Epoch 9: Train Loss=0.2433, Val Loss=0.2569, HitRate@3=0.5095, NDCG@3=0.4094, MAP@3=0.6544, LR=0.000011 (862.8s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 10/10 ---
Epoch 10: Train Loss=0.2376, Val Loss=0.2574, HitRate@3=0.5089, NDCG@3=0.4087, MAP@3=0.6519, LR=0.000000 (865.3s)

In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter
from torch.optim.lr_scheduler import OneCycleLR

class RankDataset(Dataset):
    """Group-wise dataset: one item is every row that shares a ``ranker_id``.

    Args:
        X: Feature table containing all ``cat_features`` and ``num_features``.
        y: Labels, one per row of ``X``.
        groups: Table with a ``ranker_id`` column, one entry per row of ``X``.
        cat_features: Names of integer-coded categorical columns.
        num_features: Names of numeric columns.

    Attributes:
        group_to_indices: Maps each ``ranker_id`` to the list of its row
            positions, in order of appearance.
        group_keys: ``ranker_id`` values in order of first appearance; item
            ``idx`` of the dataset is the group ``group_keys[idx]``.
    """

    def __init__(
        self,
        X: pl.DataFrame,
        y: pl.Series,
        groups: pl.DataFrame,
        cat_features: list[str],
        num_features: list[str],
    ):
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of groups (not rows)."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        ``x_cat`` maps each categorical feature to a LongTensor of codes with
        negative codes replaced by 0; ``x_num`` and ``y`` are FloatTensors;
        ``length`` is the number of rows in the group.
        """
        g = self.group_keys[idx]
        inds = self.group_to_indices[g]
        length = len(inds)

        # Vectorised clamp of negative codes to 0: same values as checking each
        # element in a Python loop, without the per-row interpreter overhead.
        x_cat = {
            c: torch.as_tensor(np.maximum(self.X_cat[c][inds], 0), dtype=torch.long)
            for c in self.cat_features
        }

        x_num = torch.FloatTensor(self.X_num[inds])
        y = torch.FloatTensor(self.y[inds]).reshape(-1)

        return x_cat, x_num, y, length

def collate_fn(batch):
    """Zero-pad a list of variable-length groups into batch tensors.

    Args:
        batch: Sequence of ``(x_cat, x_num, y, length)`` items where ``x_cat``
            is a dict of 1-D long tensors, ``x_num`` a 2-D tensor
            ``[length, num_dim]`` and ``y`` a 1-D tensor.

    Returns:
        ``(padded_x_cat, padded_x_num, padded_y, lengths)``: every tensor is
        padded with zeros along the item axis up to the longest group in the
        batch; ``lengths`` is a long tensor with the original group sizes.

    Raises:
        ValueError: If ``batch`` is empty.
    """
    cat_parts, num_parts, label_parts, lengths = zip(*batch)
    pad = torch.nn.utils.rnn.pad_sequence

    # Keys are taken from the first item, as every item shares the same features.
    padded_x_cat = {
        key: pad([part[key] for part in cat_parts], batch_first=True, padding_value=0)
        for key in cat_parts[0].keys()
    }
    padded_x_num = pad(list(num_parts), batch_first=True, padding_value=0.0)
    padded_y = pad(list(label_parts), batch_first=True, padding_value=0.0)
    lengths = torch.tensor(lengths, dtype=torch.long)

    return padded_x_cat, padded_x_num, padded_y, lengths

class SEModule(nn.Module):
    """Squeeze-and-excitation style gate over the channel axis.

    The input is averaged over dim 1, passed through a two-layer bottleneck
    (``channel -> channel // reduction -> channel``) ending in a sigmoid, and
    the resulting per-channel weights rescale every position of the input.
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        bottleneck = channel // reduction
        gate_layers = [
            nn.Linear(channel, bottleneck),
            nn.ReLU(inplace=True),
            nn.Linear(bottleneck, channel),
            nn.Sigmoid(),
        ]
        self.fc = nn.Sequential(*gate_layers)

    def forward(self, x):
        """Rescale ``x`` (``[B, N, C]``) by gates computed from its mean over N."""
        pooled = x.mean(dim=1)  # [B, C]
        gate = self.fc(pooled).unsqueeze(1)  # [B, 1, C], broadcast over N
        return x * gate


class DLRankerAttention(nn.Module):
    """Two-branch attention ranker scoring every item of a group.

    Categorical codes are embedded, concatenated, projected to ``proj_dim``
    and passed through a one-layer transformer encoder. Numeric features are
    projected, passed through their own one-layer encoder, an ``SEModule``
    gate and a small projection head. Both branches are concatenated
    (``2 * proj_dim``), refined by ``num_layers`` transformer encoder layers
    with ``num_heads`` heads, and reduced to one score per item by an MLP.

    Args:
        cat_dims: Mapping feature name -> embedding table size.
        num_numeric_feats: Number of numeric input features.
        emb_dropout: Dropout applied to the concatenated embeddings.
        dropout: Dropout used in the encoders and the MLP heads.
        num_heads: Attention heads of the fusion layers (the per-branch
            encoders always use 4).
        num_layers: Number of fusion encoder layers.
        dim_feedforward: Feed-forward width of every encoder layer.
        proj_dim: Width of each branch representation.
    """

    def __init__(
        self,
        cat_dims,
        num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64,
    ):
        super().__init__()

        def make_layer(width, heads):
            # All encoder layers share feed-forward width, dropout and layout.
            return nn.TransformerEncoderLayer(
                d_model=width,
                nhead=heads,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            )

        # Embedding width grows with log2 of the table size, clipped to [4, 50].
        self.emb_dims = {}
        for name in cat_dims:
            width = round(6 * math.log2(cat_dims[name] + 1))
            self.emb_dims[name] = min(50, max(4, width))

        # NOTE(review): padding_idx=0 pins the vector of code 0 to zeros; if a
        # real category is also coded 0 it shares that vector - confirm the
        # encoding upstream.
        self.embeddings = nn.ModuleDict(
            {
                name: nn.Embedding(cat_dims[name], self.emb_dims[name], padding_idx=0)
                for name in cat_dims
            }
        )
        self.emb_dropout = nn.Dropout(emb_dropout)

        total_emb_dim = sum(self.emb_dims.values())
        print(
            f"Categorial emb dim sum: {total_emb_dim}, Numerical dim: {num_numeric_feats}"
        )

        # Categorical branch.
        self.cat_proj = nn.Linear(total_emb_dim, proj_dim)
        self.cat_transformer = nn.TransformerEncoder(make_layer(proj_dim, 4), num_layers=1)

        # Numeric branch.
        self.num_proj_in = nn.Linear(num_numeric_feats, proj_dim)
        self.num_transformer = nn.TransformerEncoder(make_layer(proj_dim, 4), num_layers=1)
        self.num_se = SEModule(proj_dim)
        self.num_proj_out = nn.Sequential(
            nn.Linear(proj_dim, proj_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
        )

        # Fusion stack over the concatenated branches.
        fused_dim = 2 * proj_dim
        self.transformer_layers = nn.ModuleList(
            [make_layer(fused_dim, num_heads) for _ in range(num_layers)]
        )

        # Scoring head: one logit per item.
        self.mlp = nn.Sequential(
            nn.Linear(fused_dim, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(64, 1),
        )

    def forward(self, x_cat: dict, x_num: torch.Tensor, attn_mask=None):
        """Score every item of every group.

        Args:
            x_cat: Mapping feature name -> long tensor ``[B, N]`` of codes.
            x_num: Float tensor ``[B, N, F_num]``.
            attn_mask: Optional mask passed as ``src_key_padding_mask`` to the
                fusion layers only; the per-branch encoders do not receive it.

        Returns:
            Tensor ``[B, N]`` of scores.
        """
        # Unpacking doubles as a check that x_num has exactly three dimensions.
        _groups, _items, _feats = x_num.shape

        # Categorical branch: embed -> concat -> dropout -> project -> encode.
        embedded = [layer(x_cat[name]) for name, layer in self.embeddings.items()]
        cat_feat = torch.cat(embedded, dim=-1)  # [B, N, total_emb_dim]
        cat_feat = self.cat_proj(self.emb_dropout(cat_feat))  # [B, N, proj_dim]
        cat_feat = self.cat_transformer(cat_feat)

        # Numeric branch: project -> encode -> channel gate -> project.
        num_feat = self.num_transformer(self.num_proj_in(x_num))  # [B, N, proj_dim]
        num_feat = self.num_proj_out(self.num_se(num_feat))

        fused = torch.cat([cat_feat, num_feat], dim=-1)  # [B, N, 2*proj_dim]
        for block in self.transformer_layers:
            fused = block(fused, src_key_padding_mask=attn_mask)

        return self.mlp(fused).squeeze(-1)  # [B, N]


"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Fraction of eligible groups with a positive label among the top-k scores.

    Args:
        scores: Padded scores ``[B, N]``.
        labels: Padded labels ``[B, N]``; an item counts as relevant if > 0.
        lengths: Number of real (unpadded) items of each group.
        k: Cut-off; capped at the group size.
        min_group_size: Groups with fewer items are ignored.

    Returns:
        0-dim tensor with the mean hit indicator over eligible groups, or a
        zero tensor on ``scores.device`` if no group is eligible.
    """
    hit_flags = []
    for i, row in enumerate(scores):
        n = lengths[i]
        if n < min_group_size:
            continue
        top_positions = torch.topk(row[:n], min(n, k)).indices
        relevant_in_top = labels[i][:n][top_positions] > 0
        hit_flags.append(relevant_in_top.any().float())

    if not hit_flags:
        return torch.tensor(0.0, device=scores.device)
    return torch.stack(hit_flags).sum() / len(hit_flags)


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@k over groups with at least ``min_group_size`` items.

    Gains are the raw label values and the discount of rank ``r`` (1-based)
    is ``log2(r + 1)``. A group whose ideal DCG is 0 contributes 0 but still
    counts in the denominator.

    Args:
        scores: Padded scores ``[B, N]``.
        labels: Padded labels ``[B, N]``.
        lengths: Number of real (unpadded) items of each group.
        k: Cut-off; capped at the group size.
        min_group_size: Groups with fewer items are ignored.

    Returns:
        0-dim tensor on ``scores.device`` (always a tensor, so callers may use
        ``.item()`` even when no group has a positive label).
    """
    # Tensor accumulator: keeps the return type stable when every IDCG is 0.
    total_ndcg = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        l = lengths[i]
        if l < min_group_size:
            continue

        s = scores[i][:l]
        y = labels[i][:l]

        top_n = min(k, l)
        _, idx_pred = torch.topk(s, top_n)
        _, idx_ideal = torch.topk(y, top_n)

        # Predicted and ideal lists have the same length, so one discount
        # vector serves both.
        discounts = torch.log2(
            torch.arange(2, 2 + len(idx_pred), device=y.device).float()
        )
        dcg = (y[idx_pred] / discounts).sum()
        idcg = (y[idx_ideal] / discounts).sum()

        if idcg > 0:
            total_ndcg = total_ndcg + dcg / idcg
        valid_count += 1

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ndcg / valid_count


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision at k over groups with >= ``min_group_size`` items.

    For each eligible group the top-k scored items are inspected; the average
    precision is the mean of ``precision@rank`` over the ranks holding a
    relevant item (label > 0). A group with no relevant item in its top-k has
    average precision 0 and is included in the mean - leaving such groups out
    would inflate the metric.

    Args:
        scores: Padded scores ``[B, N]``.
        labels: Padded labels ``[B, N]``.
        lengths: Number of real (unpadded) items of each group.
        k: Cut-off; capped at the group size.
        min_group_size: Groups with fewer items are ignored.

    Returns:
        0-dim tensor on ``scores.device``; zero if no group is eligible.
    """
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        l = lengths[i]
        if l < min_group_size:
            continue

        s = scores[i][:l]
        y = labels[i][:l]

        _, idx_pred = torch.topk(s, min(k, l))
        hits = (y[idx_pred] > 0).float()

        # Every eligible group counts, including those that miss entirely.
        valid_count += 1

        n_hits = hits.sum()
        if n_hits == 0:
            continue

        ranks = torch.arange(1, hits.numel() + 1, device=hits.device)
        precision_at_rank = hits.cumsum(0) / ranks
        total_ap = total_ap + (precision_at_rank * hits).sum() / n_hits

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ap / valid_count



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise logistic (RankNet-style) loss averaged over the batch.

    For every group, each ordered pair ``(i, j)`` with ``label_i > label_j``
    contributes ``-logsigmoid(score_i - score_j)``. Per-group sums are added
    up and divided by the number of groups in the batch.

    Args:
        scores: Padded scores ``[B, N]``.
        labels: Padded labels ``[B, N]``.
        lengths: Number of real (unpadded) items of each group.
    """
    group_count = scores.size(0)
    accumulated = 0.0

    for i, row in enumerate(scores):
        n = lengths[i]
        s = row[:n]
        y = labels[i][:n]

        score_gap = s[:, None] - s[None, :]
        should_outrank = (y[:, None] > y[None, :]).float()

        accumulated += (-should_outrank * F.logsigmoid(score_gap)).sum()

    return accumulated / group_count

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """Pairwise logistic loss restricted to pairs touching the label top-k.

    Within each group with at least two items, a pair ``(i, j)`` is used when
    ``label_i > label_j`` and at least one of the two items is among the
    ``focus_topk`` highest labels (selected from labels, not scores). Each
    pair contributes ``-logsigmoid((s_i - s_j - margin) / temperature)``
    weighted by ``|label_i - label_j|``; the margin is only subtracted when it
    is positive. The result is the weighted sum divided by the total weight
    over the whole batch.

    Returns:
        Scalar tensor; a zero tensor with ``requires_grad=True`` when the
        batch contains no usable pair.
    """
    weighted_loss = 0.0
    weight_total = 0

    for i, row in enumerate(scores):
        n = lengths[i]
        if n < 2:
            continue

        s = row[:n]
        y = labels[i][:n]

        # Flag the items holding the highest ground-truth labels.
        in_topk = torch.zeros_like(y, dtype=torch.bool)
        in_topk[torch.topk(y, min(focus_topk, n)).indices] = True

        # Entry [a, b] compares item a against item b.
        score_gap = s[:, None] - s[None, :]
        label_gap = y[:, None] - y[None, :]

        touches_topk = in_topk[:, None] | in_topk[None, :]
        usable = (label_gap > 0) & touches_topk
        if usable.sum() == 0:
            continue

        pair_weight = (label_gap.abs() * usable).float()

        shifted_gap = score_gap - margin if margin > 0.0 else score_gap
        per_pair = -F.logsigmoid(shifted_gap / temperature) * pair_weight

        weighted_loss += per_pair.sum()
        weight_total += pair_weight.sum()

    if weight_total == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return weighted_loss / weight_total


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    The scheduler is stepped after every optimiser step, i.e. once per batch.

    Args:
        model: Ranker called as ``model(x_cat, x_num)``.
        loader: Iterable of ``(x_cat, x_num, y, lengths)`` batches.
        optimizer: Optimiser updating ``model``.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped per batch.
    """
    model.train()
    running_loss = 0.0

    progress = tqdm(loader, desc="Training", leave=False)
    for x_cat, x_num, y, lengths in progress:
        x_num = x_num.to(device)
        y = y.to(device)
        # Accept tensors and plain sequences alike; always work on a copy.
        lengths = torch.as_tensor(lengths).detach().clone().to(device)
        x_cat = {name: tensor.to(device) for name, tensor in x_cat.items()}

        optimizer.zero_grad()
        batch_loss = xranknet_loss(
            model(x_cat, x_num), y, lengths, temperature=1.0, margin=0.2, focus_topk=3
        )
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    return running_loss / len(loader)


def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    The loss is averaged over all groups (each batch's loss weighted by its
    number of groups). The ranking metrics returned by the helpers are
    per-batch means over the groups with at least ``min_group_size`` items,
    so they are weighted by that eligible-group count. Weighting them by the
    full batch size would let batches with few or no eligible groups pull the
    averages towards zero.

    Args:
        model: Ranker called as ``model(x_cat, x_num)``.
        loader: Iterable of ``(x_cat, x_num, y, lengths)`` batches.
        device: Device the batch tensors are moved to.
        min_group_size: Minimum group size for the ranking metrics.

    Returns:
        ``(loss, hitrate, ndcg, map)``: ``loss`` is a float, the three metrics
        are 0-dim tensors (zero when no group is eligible).
    """
    model.eval()
    total_hitrate = torch.zeros((), device=device)
    total_ndcg = torch.zeros((), device=device)
    total_map = torch.zeros((), device=device)
    total_loss = 0.0
    count = 0
    eligible_count = 0

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):
            x_num, y = x_num.to(device), y.to(device)
            # Same handling as in training; avoids re-wrapping a tensor with
            # torch.tensor(), which emits a copy-construct UserWarning.
            if not isinstance(lengths, torch.Tensor):
                lengths = torch.tensor(lengths)
            lengths = lengths.detach().clone().to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            ndcg = ndcg_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            map3 = map_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)

            batch_size = scores.size(0)
            eligible = int((lengths >= min_group_size).sum())

            total_hitrate = total_hitrate + hitrate * eligible
            total_ndcg = total_ndcg + ndcg * eligible
            total_map = total_map + map3 * eligible
            total_loss += loss.item() * batch_size
            count += batch_size
            eligible_count += eligible

    # With no eligible group the totals are zero; dividing by 1 keeps them so.
    metric_denominator = max(eligible_count, 1)
    return (
        total_loss / count,
        total_hitrate / metric_denominator,
        total_ndcg / metric_denominator,
        total_map / metric_denominator,
    )


def build_cat_dims(X, cat_features):
    """Return the embedding table size needed for each categorical feature.

    The size is ``max(column maximum, 0) + 1`` so that a column holding only
    negative codes still gets a table with one row.

    Args:
        X: Table indexable by column name; each column exposes ``.max()``.
        cat_features: Names of the categorical columns.

    Returns:
        Dict mapping feature name -> table size, in ``cat_features`` order.
    """
    return {name: max(X[name].max(), 0) + 1 for name in cat_features}


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardise numeric columns with statistics from the training rows.

    Mean and standard deviation are computed over the rows selected by
    ``train_mask`` while ignoring NaN, then applied to every row. Ignoring NaN
    matters: a single NaN among the training rows would otherwise make the
    mean NaN and turn the entire column into NaN. NaN entries themselves stay
    NaN in the output.

    Args:
        X: Table holding the columns listed in ``numeric_features``.
        numeric_features: Names of the columns to standardise.
        train_mask: Boolean array over the rows of ``X`` marking training rows.

    Returns:
        ``X`` with each listed column replaced by ``(value - mean) / std``;
        ``std`` falls back to 1.0 when it is not strictly positive.
    """
    for col in numeric_features:
        values = X[col].to_numpy()
        train_vals = values[train_mask]

        mean = np.nanmean(train_vals)
        std = np.nanstd(train_vals)
        if not std > 0:  # zero spread, or NaN when no finite training value exists
            std = 1.0

        X = X.with_columns(pl.Series(col, (values - mean) / std).alias(col))
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Return the feature-engineered table plus the raw test identifiers.

    When ``cache_file`` exists in ``data_dir`` it is loaded as-is. Otherwise
    train and test parquet files are read, test rows get a ``selected`` column
    of zeros, both are concatenated and passed through
    ``feature_engineering``, nulls are filled (0 for numeric columns,
    ``"missing"`` for string columns) and the result is written to the cache.

    Returns:
        ``(df, test_ids, test_rankers)`` where the last two are the ``Id`` and
        ``ranker_id`` columns of the test parquet file.
    """
    cache_path = os.path.join(data_dir, cache_file)

    def read_raw(file_name):
        # Raw files carry a "__index_level_0__" column that is dropped.
        return pl.read_parquet(os.path.join(data_dir, file_name)).drop("__index_level_0__")

    if not os.path.exists(cache_path):
        print("Cached file not found, processing raw data ...")

        train = read_raw(train_file)
        test = read_raw(test_file).with_columns(
            pl.lit(0, dtype=pl.Int64).alias("selected")
        )
        test_ids, test_rankers = test["Id"], test["ranker_id"]

        df = feature_engineering(pl.concat((train, test)), full=True)

        numeric_columns = df.select(pl.selectors.numeric()).columns
        string_columns = df.select(pl.selectors.string()).columns
        null_fills = [pl.col(c).fill_null(0) for c in numeric_columns]
        null_fills += [pl.col(c).fill_null("missing") for c in string_columns]
        df = df.with_columns(null_fills)

        os.makedirs(data_dir, exist_ok=True)
        df.write_parquet(cache_path)
        print(f"Saved feature-engineered data to {cache_path}")
        return df, test_ids, test_rankers

    print(f"Loading cached feature-engineered data from {cache_path} ...")
    df = pl.read_parquet(cache_path)

    # The test file is still read to recover the identifiers.
    test = read_raw(test_file)
    return df, test["Id"], test["ranker_id"]


def prepare_data(data_dir):
    """Load the feature table and turn it into model-ready inputs.

    Args:
        data_dir: Directory handed to ``load_or_prepare_data``.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size, test_ids,
        test_rankers)``. ``X`` carries an ``Id`` column, categorical columns
        replaced by Int32 dense-rank codes (rank - 1, nulls mapped to -1), and
        numeric columns standardised with statistics from the first
        ``train_size`` rows.
    """
    frame, test_ids, test_rankers = load_or_prepare_data(data_dir)
    ids = frame["Id"]
    X, y, groups, cat_features, num_features = feature_selection(frame)

    # Attach the Id column, truncated to the number of rows in X.
    row_count = X.shape[0]
    X = X.with_columns(pl.Series("Id", ids[:row_count]))

    # Dense-rank each categorical column into zero-based integer codes.
    encoded_columns = []
    for name in cat_features:
        zero_based_rank = pl.col(name).rank("dense") - 1
        encoded_columns.append(zero_based_rank.fill_null(-1).cast(pl.Int32))
    X = X.with_columns(encoded_columns)

    # Hard-coded number of leading rows treated as training data.
    train_size = 18145372
    is_train_row = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, is_train_row)

    return (
        X,
        y,
        groups,
        cat_features,
        num_features,
        train_size,
        test_ids,
        test_rankers,
    )


class EarlyStopping:
    """Patience-based stopper for a metric where larger values are better.

    Args:
        patience: Number of consecutive non-improving steps that triggers a stop.
        min_delta: Margin by which a value must exceed the best one to count
            as an improvement.
    """

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric`` and return True once patience is exhausted."""
        threshold = self.best + self.min_delta
        improved = metric > threshold
        if not improved:
            # Covers NaN as well: any comparison with NaN is False.
            self.counter += 1
        else:
            self.best, self.counter = metric, 0
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scale histogram of the number of rows in each group.

    Args:
        dataset: Object exposing ``group_to_indices``, a mapping whose values
            are the row-index collections of each group.
        save_path: Where to save the figure; when falsy the figure is shown
            instead.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Distribution of Group Sizes")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def _predict_scores(model, loader, device, n_rows, desc):
    """Score every row served by ``loader`` and return the scores in row order.

    The loader yields whole groups, in the order of ``dataset.group_keys``,
    so the raw model output is group-ordered. Each group's scores are written
    back to that group's own row indices (``dataset.group_to_indices``), which
    keeps the result aligned with the source table even when the rows of a
    group are not stored contiguously.

    The loader must iterate sequentially (``shuffle=False``).

    Raises:
        RuntimeError: If the number of scores produced differs from ``n_rows``.
    """
    dataset = loader.dataset
    row_scores = np.empty(n_rows, dtype=np.float32)
    group_keys = iter(dataset.group_keys)
    n_filled = 0

    model.eval()
    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(loader, desc=desc):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            batch_scores = model(x_cat, x_num).cpu().numpy()

            for i, length in enumerate(lengths.tolist()):
                rows = dataset.group_to_indices[next(group_keys)]
                # Drop the padded tail and scatter back to original positions.
                row_scores[rows] = batch_scores[i, :length]
                n_filled += length

    if n_filled != n_rows:
        raise RuntimeError(f"Mismatch: {n_filled} scores vs {n_rows} rows")
    return row_scores


def _save_line_plot(epochs, series, ylabel, title, path):
    """Plot per-epoch curves and write the figure to ``path``.

    ``series`` is a list of ``(values, label, color)`` tuples; a ``None``
    color leaves the choice to matplotlib.
    """
    plt.figure()
    for values, label, color in series:
        style = {} if color is None else {"color": color}
        plt.plot(epochs, values, label=label, **style)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.savefig(path)
    plt.close()


def main():
    """Train the ranker, plot its metrics and export validation/test predictions."""
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    best_val_loss = float("inf")
    best_hitrate = 0.0
    # Tracks whether this run wrote a checkpoint, so a stale file left by an
    # earlier run is never loaded by mistake.
    checkpoint_saved = False
    model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_losses = []
    val_losses = []
    val_hitrates = []
    val_ndcgs = []
    val_maps = []
    learning_rates = []

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    early_stopper = EarlyStopping(patience=3, min_delta=0.001)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Fill missing values: replace NaN with 0 in every Float64 column.
    # Done one column at a time to keep peak memory low on the full table.
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    for col in float_cols:
        X = X.with_columns(
            pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
        )

    # Row boundaries: [0, n1) train, [n1, n2) validation, [n2, end) test.
    n1 = 16487352
    # n1 = 17487300
    n2 = train_size

    X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
    y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
    groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

    # Inspect group sizes in validation set
    group_to_rows = defaultdict(list)
    for i, g in enumerate(groups_va["ranker_id"].to_numpy()):
        group_to_rows[g].append(i)

    print("\n📋 Validation group statistics:")
    # Save detailed group info to file
    group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
    with open(group_info_path, "w", encoding="utf-8") as f:
        f.write(f"Total groups: {len(group_to_rows)}\n\n")
        f.write("GroupID\tSize\tStartIdx\tEndIdx\n")
        for group_id, indices in sorted(group_to_rows.items()):
            f.write(f"{group_id}\t{len(indices)}\t{indices[0]}\t{indices[-1]}\n")

    print(f"📁 Saved validation group info to {group_info_path}")

    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    # Validation and test loaders must stay unshuffled: the prediction helper
    # relies on groups arriving in dataset order.
    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    num_epochs = 10
    learning_rate = 3e-4

    model = DLRankerAttention(
        cat_dims=cat_dims,
        num_numeric_feats=num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    scheduler = OneCycleLR(
        optimizer,
        max_lr=learning_rate,
        steps_per_epoch=len(train_loader),
        epochs=num_epochs,
        pct_start=0.3,
    )

    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
        val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_hitrates.append(val_hitrate.item())
        val_ndcgs.append(val_ndcg.item())
        val_maps.append(val_map.item())

        current_lr = scheduler.get_last_lr()
        learning_rates.append(current_lr[0])

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"Val Loss={val_loss:.4f}, "
            f"HitRate@3={val_hitrate:.4f}, "
            f"NDCG@3={val_ndcg:.4f}, "
            f"MAP@3={val_map:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )

        # A checkpoint is written only when both criteria improve together.
        if val_loss < best_val_loss and val_hitrate > best_hitrate:
            best_val_loss = val_loss
            best_hitrate = val_hitrate
            torch.save(model.state_dict(), model_path)
            checkpoint_saved = True
            print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

        # The stopper maximizes its input, so feed it the negated loss.
        if early_stopper.step(-val_loss):
            print("⛔ Early stopping triggered.")
            break

    # Plot metrics
    epochs = list(range(1, len(train_losses) + 1))

    _save_line_plot(
        epochs,
        [(train_losses, "Train Loss", None), (val_losses, "Val Loss", None)],
        "Loss",
        "Training & Validation Loss",
        os.path.join(MODEL_DIR, "loss_curve.png"),
    )
    _save_line_plot(
        epochs,
        [(val_hitrates, "Val HitRate@3", "green")],
        "HitRate@3",
        "Validation HitRate@3",
        os.path.join(MODEL_DIR, "hitrate_curve.png"),
    )
    _save_line_plot(
        epochs,
        [(learning_rates, "Learning Rate", "orange")],
        "LR",
        "Learning Rate Schedule",
        os.path.join(MODEL_DIR, "lr_curve.png"),
    )
    _save_line_plot(
        epochs,
        [(val_ndcgs, "Val NDCG@3", "purple")],
        "NDCG@3",
        "Validation NDCG@3",
        os.path.join(MODEL_DIR, "ndcg_curve.png"),
    )
    _save_line_plot(
        epochs,
        [(val_maps, "Val MAP@3", "red")],
        "MAP@3",
        "Validation MAP@3",
        os.path.join(MODEL_DIR, "map_curve.png"),
    )

    print("📊 Saved metric plots to model/")

    # Restore the best weights of this run before predicting.
    if checkpoint_saved:
        # map_location keeps the load working if the checkpoint device differs.
        model.load_state_dict(torch.load(model_path, map_location=device))
    else:
        print("⚠️ No checkpoint was saved during this run; using the final-epoch weights.")
    model.eval()

    # Validation predictions, exported together with the features.
    all_val_scores = _predict_scores(
        model, val_loader, device, X_va.shape[0], "Predicting validation set"
    )

    val_df = X_va.with_columns([
        pl.Series("Id", X_va["Id"]),
        pl.Series("ranker_id", groups_va["ranker_id"]),
        pl.Series("score", all_val_scores),
        pl.Series("label", y_va)
    ])

    # Rank 1 is the highest score within each ranker_id group.
    val_df = val_df.with_columns(
        pl.col("score")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
    val_df.write_csv(val_save_path)
    print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

    # Test predictions and submission file.
    all_scores = _predict_scores(
        model, test_loader, device, X_te.shape[0], "Predicting test set"
    )

    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": all_scores
    })

    submission = (
        submission_df
        .with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )
        .select(["Id", "ranker_id", "score", "selected"])
    )

    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
    submission.write_csv(submission_path)
    print(f"✅ Submission saved to {submission_path}")


if __name__ == "__main__":
    main()

Writing deeprec_validate.py


In [None]:
--- Epoch 1/10 ---
Validating:   0%|          | 0/2636 [00:00<?, ?it/s]/home/ionut/test_whisper/test2/deeprec_validate.py:508: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  lengths = torch.tensor(lengths).to(device)
Epoch 1: Train Loss=0.3545, Val Loss=0.2895, HitRate@3=0.4698, NDCG@3=0.3729, MAP@3=0.6141, LR=0.000084 (525.7s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 2/10 ---
Epoch 2: Train Loss=0.2918, Val Loss=0.2843, HitRate@3=0.4837, NDCG@3=0.3828, MAP@3=0.6231, LR=0.000228 (526.9s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 3/10 ---
Epoch 3: Train Loss=0.2820, Val Loss=0.2823, HitRate@3=0.4911, NDCG@3=0.3908, MAP@3=0.6333, LR=0.000300 (526.3s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 4/10 ---
Epoch 4: Train Loss=0.2728, Val Loss=0.2706, HitRate@3=0.4956, NDCG@3=0.3971, MAP@3=0.6382, LR=0.000285 (526.6s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 5/10 ---
Epoch 5: Train Loss=0.2612, Val Loss=0.2626, HitRate@3=0.5017, NDCG@3=0.4021, MAP@3=0.6423, LR=0.000244 (527.3s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 6/10 ---
Epoch 6: Train Loss=0.2475, Val Loss=0.2570, HitRate@3=0.5114, NDCG@3=0.4110, MAP@3=0.6468, LR=0.000183 (527.9s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 7/10 ---
Epoch 7: Train Loss=0.2366, Val Loss=0.2457, HitRate@3=0.5101, NDCG@3=0.4140, MAP@3=0.6579, LR=0.000117 (526.7s)

--- Epoch 8/10 ---
Epoch 8: Train Loss=0.2253, Val Loss=0.2430, HitRate@3=0.5141, NDCG@3=0.4162, MAP@3=0.6548, LR=0.000056 (526.6s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 9/10 ---
Epoch 9: Train Loss=0.2166, Val Loss=0.2372, HitRate@3=0.5215, NDCG@3=0.4208, MAP@3=0.6594, LR=0.000015 (527.6s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 10/10 ---
Epoch 10: Train Loss=0.2094, Val Loss=0.2379, HitRate@3=0.5202, NDCG@3=0.4192, MAP@3=0.6588, LR=0.000000 (526.7s)

In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter
from torch.optim.lr_scheduler import OneCycleLR

class RankDataset(Dataset):
    """Dataset that serves one whole ranking group (one ``ranker_id``) per item.

    Args:
        X: Feature table; must contain every name in ``cat_features`` and
            ``num_features``.
        y: Per-row labels, aligned with ``X``.
        groups: Table with a ``ranker_id`` column, aligned with ``X``.
        cat_features: Names of the integer-coded categorical columns.
        num_features: Names of the numeric columns.
    """

    def __init__(
        self,
        X: "pl.DataFrame",
        y: "pl.Series",
        groups: "pl.DataFrame",
        cat_features: list[str],
        num_features: list[str],
    ):
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # Map each group key to its row positions, in first-seen order.
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of groups (not rows)."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        ``x_cat`` maps feature name to an int64 tensor of shape ``[length]``,
        ``x_num`` is float32 ``[length, n_numeric]`` and ``y`` is float32
        ``[length]``.
        """
        rows = np.asarray(self.group_to_indices[self.group_keys[idx]])
        length = len(rows)

        # Negative codes (missing values) are clamped to 0. This is done with
        # one vectorized NumPy call per feature rather than a Python loop over
        # every row, which is far cheaper for large groups.
        x_cat = {
            c: torch.as_tensor(np.maximum(self.X_cat[c][rows], 0), dtype=torch.long)
            for c in self.cat_features
        }

        x_num = torch.as_tensor(self.X_num[rows], dtype=torch.float32)
        y = torch.as_tensor(self.y[rows], dtype=torch.float32).reshape(-1)

        return x_cat, x_num, y, length

def collate_fn(batch):
    """Pad a list of variable-length groups into dense batch tensors.

    Each sample is ``(x_cat, x_num, y, length)``. Every per-item tensor is
    right-padded with zeros up to the longest group in the batch.

    Returns:
        ``(padded_x_cat, padded_x_num, padded_y, lengths)`` where
        ``padded_x_cat`` maps feature name to ``[B, max_len]``,
        ``padded_x_num`` is ``[B, max_len, F]``, ``padded_y`` is
        ``[B, max_len]`` and ``lengths`` is an int64 tensor of true sizes.
    """
    lengths = [sample[3] for sample in batch]
    longest = max(lengths)
    zero_pad = torch.nn.functional.pad

    padded_x_cat = {
        key: torch.stack(
            [zero_pad(sample[0][key], (0, longest - sample[3])) for sample in batch]
        )
        for key in batch[0][0].keys()
    }
    # Pad only along the item axis; the feature axis is left untouched.
    padded_x_num = torch.stack(
        [zero_pad(sample[1], (0, 0, 0, longest - sample[3])) for sample in batch]
    )
    padded_y = torch.stack(
        [zero_pad(sample[2], (0, longest - sample[3])) for sample in batch]
    )

    return padded_x_cat, padded_x_num, padded_y, torch.tensor(lengths, dtype=torch.long)

class SEModule(nn.Module):
    """Channel gating driven by an attention-pooled summary of the item axis.

    A small MLP scores every item, the scores are softmax-normalized over the
    items, the items are averaged with those weights, and the sigmoid of that
    summary rescales each channel of every item.
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        hidden = channel // reduction
        self.attn_weights = nn.Sequential(
            nn.Linear(channel, hidden),
            nn.ReLU(inplace=True),
            nn.Linear(hidden, 1),
        )
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Gate ``x`` of shape ``[B, N, C]``; the output has the same shape."""
        # One logit per item, normalized across the N items: [B, N, 1]
        item_weights = self.attn_weights(x).softmax(dim=1)

        # Attention-weighted summary of the group: [B, 1, C]
        summary = torch.sum(item_weights * x, dim=1, keepdim=True)

        # Per-channel gate, broadcast over every item.
        gate = self.sigmoid(summary)
        return x * gate


class DLRankerAttention(nn.Module):
    """Listwise ranker that scores every item of a group with self-attention.

    Categorical features are embedded, concatenated and projected; numeric
    features are projected separately. Each branch runs through its own
    single-layer Transformer encoder, the two branches are concatenated, a
    stack of encoder layers mixes them across the items of the group, and an
    MLP head emits one score per item.

    Args:
        cat_dims: Mapping from categorical feature name to its number of
            embedding rows.
        num_numeric_feats: Width of the numeric feature vector.
        emb_dropout: Dropout applied to the concatenated embeddings.
        dropout: Dropout used inside the encoders, the numeric output
            projection and the MLP head.
        num_heads: Attention heads of the fused encoder layers.
        num_layers: Number of fused encoder layers.
        dim_feedforward: Feed-forward width of every encoder layer.
        proj_dim: Width of each branch after projection; the fused
            representation is ``2 * proj_dim`` wide.
    """

    def __init__(
        self,
        cat_dims,
        num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64,
    ):
        super().__init__()

        # Embedding width grows with log2 of the cardinality, clamped to [4, 50].
        self.emb_dims = {
            f: min(50, max(4, round(6 * math.log2(cat_dims[f] + 1)))) for f in cat_dims
        }
        # Index 0 is the padding row of every embedding table.
        # NOTE(review): if the categorical codes fed in are 0-based, the first
        # real category shares this padding row - confirm against the encoder.
        self.embeddings = nn.ModuleDict(
            {
                f: nn.Embedding(cat_dims[f], self.emb_dims[f], padding_idx=0)
                for f in cat_dims
            }
        )
        self.emb_dropout = nn.Dropout(emb_dropout)

        total_emb_dim = sum(self.emb_dims.values())
        print(
            f"Categorial emb dim sum: {total_emb_dim}, Numerical dim: {num_numeric_feats}"
        )

        # Categorical branch: projection followed by one encoder layer.
        # NOTE(review): nhead and num_layers are hard-coded here (4 and 1);
        # the num_heads / num_layers arguments only affect the fused layers.
        self.cat_proj = nn.Linear(total_emb_dim, proj_dim)
        self.cat_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=4,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=1,
        )


        # Numeric branch: projection, one encoder layer, channel gating and
        # an output projection.
        self.num_proj_in = nn.Linear(num_numeric_feats, proj_dim)
        self.num_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=4,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=1,
        )
        self.num_se = SEModule(proj_dim)
        self.num_proj_out = nn.Sequential(
            nn.Linear(proj_dim, proj_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
        )

        # Fused encoder layers operating on the concatenated branches.
        self.transformer_layers = nn.ModuleList(
            [
                nn.TransformerEncoderLayer(
                    d_model=2 * proj_dim,
                    nhead=num_heads,
                    dim_feedforward=dim_feedforward,
                    dropout=dropout,
                    batch_first=True,
                )
                for _ in range(num_layers)
            ]
        )

        # Scoring head: one scalar per item.
        self.mlp = nn.Sequential(
            nn.Linear(2 * proj_dim, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(64, 1),
        )

    def forward(self, x_cat: dict, x_num: torch.Tensor, attn_mask=None):
        """Score every item of every group in the batch.

        Args:
            x_cat: Mapping from categorical feature name to an index tensor
                of shape ``[B, N]``.
            x_num: Numeric features of shape ``[B, N, F_num]``.
            attn_mask: Optional key-padding mask forwarded to the fused
                encoder layers as ``src_key_padding_mask``.

        Returns:
            Tensor of shape ``[B, N]`` with one score per item.
        """
        B, N, _ = x_num.shape  # x_num: [B, N, F_num]

        cat_emb_list = []
        for f, emb_layer in self.embeddings.items():
            emb = emb_layer(x_cat[f])  # [B, N, emb_dim]
            cat_emb_list.append(emb)
        cat_embs = torch.cat(cat_emb_list, dim=-1)  # [B, N, total_emb_dim]
        cat_embs = self.emb_dropout(cat_embs)
        cat_feat = self.cat_proj(cat_embs)  # [B, N, proj_dim]
        # NOTE(review): attn_mask is not passed to the two branch encoders or
        # to num_se below, so padded items still take part in those steps.
        cat_feat = self.cat_transformer(cat_feat)  # [B, N, proj_dim]

        num_feat = self.num_proj_in(x_num)  # [B, N, proj_dim]
        num_feat = self.num_transformer(num_feat)  # [B, N, proj_dim]
        num_feat = self.num_se(num_feat)  # [B, N, proj_dim]
        num_feat = self.num_proj_out(num_feat)  # [B, N, proj_dim]

        x = torch.cat([cat_feat, num_feat], dim=-1)  # [B, N, 2*proj_dim]

        for layer in self.transformer_layers:
            x = layer(x, src_key_padding_mask=attn_mask)

        scores = self.mlp(x).squeeze(-1)  # [B, N]

        return scores


"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Return the share of groups with a positive label among the top-``k`` scores.

    Groups shorter than ``min_group_size`` are ignored. When no group
    qualifies, a zero tensor on the device of ``scores`` is returned.
    """
    hit_flags = []
    for i in range(scores.size(0)):
        n = lengths[i]
        if n < min_group_size:
            continue

        # Only the first n positions of a row are real items; the rest is padding.
        top = torch.topk(scores[i][:n], min(n, k)).indices
        hit_flags.append((labels[i][:n][top] > 0).any().float())

    if not hit_flags:
        return torch.tensor(0.0, device=scores.device)
    return torch.stack(hit_flags).sum() / len(hit_flags)


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Return the mean NDCG@``k`` over the qualifying groups as a tensor.

    Args:
        scores: Predicted scores, ``[B, N]`` (padded).
        labels: Relevance labels, ``[B, N]`` (padded).
        lengths: True size of each group.
        k: Cut-off rank.
        min_group_size: Groups with fewer items are ignored.

    Returns:
        A 0-dim tensor on the device of ``scores``. Groups without any
        positive label contribute an NDCG of 0. The result is always a
        tensor, so callers can rely on ``.item()`` even when no group had a
        relevant item.
    """
    total_ndcg = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        n = int(lengths[i])
        if n < min_group_size:
            continue
        valid_count += 1

        s = scores[i, :n]
        y = labels[i, :n]
        top = min(k, n)

        # Positional discount log2(rank + 1) for ranks 1..top.
        discounts = torch.log2(torch.arange(2, 2 + top, device=y.device).float())

        idcg = (torch.topk(y, top).values / discounts).sum()
        if idcg > 0:
            dcg = (y[torch.topk(s, top).indices] / discounts).sum()
            total_ndcg = total_ndcg + dcg / idcg

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ndcg / valid_count


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Return the mean average precision at ``k`` over the qualifying groups.

    Args:
        scores: Predicted scores, ``[B, N]`` (padded).
        labels: Relevance labels, ``[B, N]`` (padded); values > 0 are relevant.
        lengths: True size of each group.
        k: Cut-off rank.
        min_group_size: Groups with fewer items are ignored.

    Returns:
        A 0-dim tensor on the device of ``scores``.

    A group whose top-``k`` holds no relevant item contributes an average
    precision of 0 and still counts in the denominator. Leaving such groups
    out would average over successful groups only and inflate the metric.
    Within a group, the average precision is normalized by the number of
    relevant items found in the top-``k``.
    """
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        n = int(lengths[i])
        if n < min_group_size:
            continue
        valid_count += 1

        s = scores[i, :n]
        y = labels[i, :n]

        top_idx = torch.topk(s, min(k, n)).indices
        relevant = (y[top_idx] > 0).float()
        n_hits = relevant.sum()
        if n_hits == 0:
            # Miss: the group adds 0 to the sum but stays in the average.
            continue

        ranks = torch.arange(1, relevant.numel() + 1, device=relevant.device, dtype=relevant.dtype)
        precision_at_rank = torch.cumsum(relevant, dim=0) / ranks
        total_ap = total_ap + (precision_at_rank * relevant).sum() / n_hits

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ap / valid_count



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise RankNet loss, summed within each group and averaged over groups.

    For every ordered pair ``(a, b)`` inside a group with
    ``label[a] > label[b]`` the term ``-logsigmoid(score[a] - score[b])`` is
    added. Padding beyond ``lengths[i]`` is ignored.
    """
    n_groups = scores.size(0)
    accumulated = 0.0

    for i in range(n_groups):
        n = lengths[i]
        s = scores[i][:n]
        y = labels[i][:n]

        # Entry [a, b] compares item a against item b.
        pair_margin = s[:, None] - s[None, :]
        prefers = (y[:, None] > y[None, :]).float()

        accumulated += (-prefers * torch.nn.functional.logsigmoid(pair_margin)).sum()

    return accumulated / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """
    Margin-based pairwise ranking loss that concentrates on the top-K items.

    "Top-K" is defined by the ground-truth labels, not by the predicted
    scores. A pair ``(a, b)`` contributes when ``label[a] > label[b]`` and at
    least one of the two items is among the ``focus_topk`` highest labels.
    Each pair is weighted by the absolute label difference, and the result is
    the weighted mean over all contributing pairs in the batch. Groups with
    fewer than two items are skipped.

    Returns a zero tensor (with ``requires_grad=True``) when no pair
    contributes.
    """
    loss_sum = 0.0
    weight_sum = 0

    for i in range(scores.size(0)):
        n = lengths[i]
        if n < 2:
            continue

        s = scores[i][:n]
        y = labels[i][:n]

        # Flag the items holding the K largest labels.
        in_topk = torch.zeros_like(y, dtype=torch.bool)
        in_topk[torch.topk(y, min(focus_topk, n)).indices] = True

        # Entry [a, b] compares item a against item b.
        label_gap = y[:, None] - y[None, :]
        touches_topk = in_topk[:, None] | in_topk[None, :]
        selected = (label_gap > 0) & touches_topk

        if selected.sum() == 0:
            continue

        weights = (label_gap.abs() * selected).float()

        score_gap = s[:, None] - s[None, :]
        if margin > 0.0:
            # Require the preferred item to win by at least the margin.
            score_gap = score_gap - margin
        score_gap = score_gap / temperature

        loss_sum += (-F.logsigmoid(score_gap) * weights).sum()
        weight_sum += weights.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimization pass over ``loader`` and return the mean batch loss.

    The scheduler is stepped after every batch, not once per epoch.
    """
    model.train()
    running_loss = 0.0

    for x_cat, x_num, y, lengths in tqdm(loader, desc="Training", leave=False):
        x_cat = {name: tensor.to(device) for name, tensor in x_cat.items()}
        x_num = x_num.to(device)
        y = y.to(device)

        # Accept both tensor and plain-sequence lengths.
        lengths = lengths if isinstance(lengths, torch.Tensor) else torch.tensor(lengths)
        lengths = lengths.detach().clone().to(device)

        optimizer.zero_grad()
        loss = xranknet_loss(
            model(x_cat, x_num), y, lengths, temperature=1.0, margin=0.2, focus_topk=3
        )
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()

    return running_loss / len(loader)


def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader``.

    Args:
        model: Ranker called as ``model(x_cat, x_num)``.
        loader: Yields ``(x_cat, x_num, y, lengths)`` batches.
        device: Device the batch tensors are moved to.
        min_group_size: Groups with fewer items are excluded from the
            ranking metrics (they still count towards the loss).

    Returns:
        ``(loss, hitrate, ndcg, map)``. ``loss`` is a float averaged over all
        groups; the three metrics are 0-dim tensors averaged over the groups
        that meet ``min_group_size``.

    The metric helpers return a mean over the qualifying groups of a batch,
    so each batch mean is weighted by its number of qualifying groups.
    Weighting by the full batch size would count batches with no qualifying
    group as zeros and give every other batch equal weight regardless of how
    many groups it represents.
    """
    model.eval()
    total_hitrate = 0.0
    total_ndcg = 0.0
    total_map = 0.0
    total_loss = 0.0
    group_count = 0
    valid_count = 0

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):
            x_num, y = x_num.to(device), y.to(device)
            # as_tensor avoids the copy-construct warning when lengths is
            # already a tensor.
            lengths = torch.as_tensor(lengths).to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            ndcg = ndcg_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            map3 = map_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)

            n_groups = scores.size(0)
            n_valid = int((lengths >= min_group_size).sum())

            total_hitrate += hitrate * n_valid
            total_ndcg += ndcg * n_valid
            total_map += map3 * n_valid
            total_loss += loss.item() * n_groups
            group_count += n_groups
            valid_count += n_valid

    mean_loss = total_loss / group_count

    if valid_count == 0:
        zero = torch.tensor(0.0, device=device)
        return mean_loss, zero, zero.clone(), zero.clone()

    # as_tensor guarantees tensor outputs even if a helper returned a float.
    return (
        mean_loss,
        torch.as_tensor(total_hitrate / valid_count),
        torch.as_tensor(total_ndcg / valid_count),
        torch.as_tensor(total_map / valid_count),
    )


def build_cat_dims(X, cat_features):
    """Return the embedding-table size needed for each categorical column.

    The size is the column maximum plus one; a negative maximum is treated
    as 0, so every table has at least one row.
    """
    return {c: max(X[c].max(), 0) + 1 for c in cat_features}


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardize numeric columns using statistics of the training rows only.

    Args:
        X: Feature table.
        numeric_features: Names of the columns to standardize.
        train_mask: Boolean array selecting the rows used to compute each
            column's mean and standard deviation.

    Returns:
        ``X`` with every listed column replaced by ``(value - mean) / std``.

    NaN values are ignored when computing the statistics. With plain
    ``mean`` / ``std`` a single NaN in the training rows would turn the whole
    column into NaN. NaN entries themselves stay NaN in the output. A
    missing, zero or NaN standard deviation falls back to 1.0 and a NaN mean
    to 0.0.
    """
    for col in numeric_features:
        values = X[col].to_numpy()
        train_vals = values[train_mask]

        if train_vals.size:
            with np.errstate(all="ignore"):
                mean = np.nanmean(train_vals)
                std = np.nanstd(train_vals)
        else:
            mean, std = 0.0, 1.0

        if np.isnan(mean):
            mean = 0.0
        if not std > 0:
            std = 1.0

        # Replace one column at a time so the old column can be released
        # before the next one is materialized.
        X = X.with_columns(pl.Series(col, (values - mean) / std))
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Return the feature-engineered table plus the test ``Id`` / ``ranker_id`` columns.

    If the cache file exists in ``data_dir`` it is loaded as-is. Otherwise
    train and test are read, concatenated (test gets a dummy ``selected`` of
    0), passed through ``feature_engineering``, null-filled and written to
    the cache.

    Returns:
        ``(df, test_ids, test_rankers)``.
    """
    cache_path = os.path.join(data_dir, cache_file)
    test_path = os.path.join(data_dir, test_file)

    if os.path.exists(cache_path):
        print(f"Loading cached feature-engineered data from {cache_path} ...")
        df = pl.read_parquet(cache_path)

        # The raw test file is still needed for the submission identifiers.
        test = pl.read_parquet(test_path).drop("__index_level_0__")
        return df, test["Id"], test["ranker_id"]

    print("Cached file not found, processing raw data ...")

    train = pl.read_parquet(os.path.join(data_dir, train_file)).drop("__index_level_0__")
    test = pl.read_parquet(test_path).drop("__index_level_0__")
    # Give test rows a placeholder label so both frames share one schema.
    test = test.with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))

    test_ids = test["Id"]
    test_rankers = test["ranker_id"]

    df = feature_engineering(pl.concat((train, test)), full=True)

    # Nulls become 0 in numeric columns and "missing" in string columns.
    numeric_cols = df.select(pl.selectors.numeric()).columns
    string_cols = df.select(pl.selectors.string()).columns
    fill_exprs = [pl.col(c).fill_null(0) for c in numeric_cols]
    fill_exprs += [pl.col(c).fill_null("missing") for c in string_cols]
    df = df.with_columns(fill_exprs)

    os.makedirs(data_dir, exist_ok=True)
    df.write_parquet(cache_path)
    print(f"Saved feature-engineered data to {cache_path}")

    return df, test_ids, test_rankers


def prepare_data(data_dir, train_size=18145372):
    """Load the feature table and turn it into model-ready inputs.

    Args:
        data_dir: Directory passed through to ``load_or_prepare_data``.
        train_size: Number of leading rows that belong to the labelled
            training set. Only these rows are used to compute the
            normalization statistics. Defaults to the row count that was
            previously hard-coded here.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size, test_ids,
        test_rankers)`` where ``X`` carries an ``Id`` column, integer-coded
        categorical columns and normalized numeric columns.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    # Re-attach the row identifier so predictions can be exported with it.
    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    # Dense-rank every categorical column into 0-based integer codes;
    # nulls become -1.
    # NOTE(review): the smallest category receives code 0, while the
    # embeddings in this file use padding_idx=0 and collate_fn pads with 0,
    # so that category shares the padding slot - confirm this is intended.
    X = X.with_columns(
        [
            (pl.col(c).rank("dense") - 1).fill_null(-1).cast(pl.Int32)
            for c in cat_features
        ]
    )

    # Fit normalization on the training rows only to avoid leaking test data.
    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Patience-based stopper for a metric where larger values are better.

    A call to ``step`` counts as an improvement only when the metric beats
    the best value seen so far by more than ``min_delta``.
    """

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric`` and return True once patience is exhausted."""
        threshold = self.best + self.min_delta
        if not metric > threshold:
            # No sufficient improvement: one more stale step.
            self.counter += 1
            return self.counter >= self.patience

        self.best = metric
        self.counter = 0
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scale histogram of how many rows each group contains.

    Args:
        dataset: Object exposing ``group_to_indices``, a mapping from group
            key to the list of row indices in that group.
        save_path: When truthy the figure is written there; otherwise it is
            shown interactively.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Distribution of Group Sizes")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def _save_metric_plot(path, epochs, series, ylabel, title):
    """Plot per-epoch curves and save the figure to ``path``.

    ``series`` is a list of ``(values, plot_kwargs)`` pairs; every pair is
    drawn as one line against ``epochs``.
    """
    plt.figure()
    for values, plot_kwargs in series:
        plt.plot(epochs, values, **plot_kwargs)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.savefig(path)
    plt.close()


def _predict_in_row_order(model, loader, dataset, device, desc):
    """Score every group of ``loader`` and return one score per dataset row.

    The loader yields whole groups in ``dataset.group_keys`` order, so the
    concatenated scores are in *group* order.  They are scattered back through
    ``dataset.group_to_indices`` so that position ``i`` of the result belongs
    to row ``i`` even when the rows of a group are not contiguous.  ``loader``
    must iterate ``dataset`` without shuffling.
    """
    group_scores = []
    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(loader, desc=desc):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)

            # Drop the padded tail of every group.
            for row, length in zip(scores, lengths):
                group_scores.append(row[:length].cpu().numpy())

    flat_scores = np.concatenate(group_scores)
    row_order = np.concatenate(
        [
            np.asarray(dataset.group_to_indices[g], dtype=np.int64)
            for g in dataset.group_keys
        ]
    )
    if len(flat_scores) != len(row_order):
        raise ValueError(
            f"Mismatch: {len(flat_scores)} scores vs {len(row_order)} dataset rows"
        )

    aligned = np.empty_like(flat_scores)
    aligned[row_order] = flat_scores
    return aligned


def main():
    """Train the attention ranker, plot its metrics and write predictions.

    Steps: prepare the data, split it by row position into train / validation
    / test, dump validation group statistics, train with a one-cycle schedule
    and early stopping on validation loss, save metric plots, then score the
    validation and test sets with the best saved weights and write both to
    CSV.
    """
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    best_val_loss = float("inf")
    best_hitrate = 0.0
    checkpoint_saved = False
    model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_losses = []
    val_losses = []
    val_hitrates = []
    val_ndcgs = []
    val_maps = []
    learning_rates = []

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    early_stopper = EarlyStopping(patience=3, min_delta=0.001)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Fill missing values: replace NaN with 0 in every Float64 column.
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    X = X.with_columns(
        [
            pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
            for col in float_cols
        ]
    )

    # Row-position split: [0, n1) train, [n1, n2) validation, [n2, end) test.
    n1 = 16487352
    # n1 = 17487300
    n2 = train_size

    X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
    y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
    groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

    # Inspect group sizes in validation set
    val_ranker_array = groups_va["ranker_id"].to_numpy()
    val_group_counts = Counter(val_ranker_array)

    print("\n📋 Validation group statistics:")
    # Save detailed group info to file
    group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
    with open(group_info_path, "w") as f:
        f.write(f"Total groups: {len(val_group_counts)}\n\n")
        f.write("GroupID\tSize\tStartIdx\tEndIdx\n")

        group_to_rows = defaultdict(list)
        for i, g in enumerate(val_ranker_array):
            group_to_rows[g].append(i)

        for group_id, indices in sorted(group_to_rows.items()):
            f.write(f"{group_id}\t{len(indices)}\t{indices[0]}\t{indices[-1]}\n")

    print(f"📁 Saved validation group info to {group_info_path}")

    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    # Batch sizes count groups, not rows.  The evaluation loaders must stay
    # unshuffled so predictions can be mapped back to rows.
    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    num_epochs = 10
    learning_rate = 3e-4

    model = DLRankerAttention(
        cat_dims=cat_dims,
        num_numeric_feats=num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    # The schedule is stepped once per batch inside train_one_epoch.
    scheduler = OneCycleLR(
        optimizer,
        max_lr=learning_rate,
        steps_per_epoch=len(train_loader),
        epochs=num_epochs,
        pct_start=0.3,
    )

    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
        val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_hitrates.append(val_hitrate.item())
        val_ndcgs.append(val_ndcg.item())
        val_maps.append(val_map.item())

        current_lr = scheduler.get_last_lr()
        learning_rates.append(current_lr[0])

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"Val Loss={val_loss:.4f}, "
            f"HitRate@3={val_hitrate:.4f}, "
            f"NDCG@3={val_ndcg:.4f}, "
            f"MAP@3={val_map:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )

        # A checkpoint is written only when BOTH loss and hit rate improve.
        if val_loss < best_val_loss and val_hitrate > best_hitrate:
            best_val_loss = val_loss
            best_hitrate = val_hitrate
            torch.save(model.state_dict(), model_path)
            checkpoint_saved = True
            print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

        # The stopper maximises its input, so feed it the negated loss.
        if early_stopper.step(-val_loss):
            print("⛔ Early stopping triggered.")
            break

    # Plot metrics
    epochs = list(range(1, len(train_losses) + 1))

    _save_metric_plot(
        os.path.join(MODEL_DIR, "loss_curve.png"),
        epochs,
        [(train_losses, {"label": "Train Loss"}), (val_losses, {"label": "Val Loss"})],
        "Loss",
        "Training & Validation Loss",
    )
    _save_metric_plot(
        os.path.join(MODEL_DIR, "hitrate_curve.png"),
        epochs,
        [(val_hitrates, {"label": "Val HitRate@3", "color": "green"})],
        "HitRate@3",
        "Validation HitRate@3",
    )
    _save_metric_plot(
        os.path.join(MODEL_DIR, "lr_curve.png"),
        epochs,
        [(learning_rates, {"label": "Learning Rate", "color": "orange"})],
        "LR",
        "Learning Rate Schedule",
    )
    _save_metric_plot(
        os.path.join(MODEL_DIR, "ndcg_curve.png"),
        epochs,
        [(val_ndcgs, {"label": "Val NDCG@3", "color": "purple"})],
        "NDCG@3",
        "Validation NDCG@3",
    )
    _save_metric_plot(
        os.path.join(MODEL_DIR, "map_curve.png"),
        epochs,
        [(val_maps, {"label": "Val MAP@3", "color": "red"})],
        "MAP@3",
        "Validation MAP@3",
    )

    print("📊 Saved metric plots to model/")

    # Predict with the best weights of THIS run.  If nothing was saved, do not
    # pick up a stale file from an earlier run; keep the in-memory weights.
    if checkpoint_saved:
        model.load_state_dict(torch.load(model_path, map_location=device))
    else:
        print("⚠️ No checkpoint was saved during this run; predicting with the final weights.")
    model.eval()

    val_ids = X_va["Id"]
    val_rankers = groups_va["ranker_id"]
    val_labels = y_va

    all_val_scores = _predict_in_row_order(
        model, val_loader, val_dataset, device, "Predicting validation set"
    )

    assert len(all_val_scores) == X_va.shape[0], \
        f"Mismatch: {len(all_val_scores)} scores vs {X_va.shape[0]} rows"

    val_df = X_va.with_columns([
        pl.Series("Id", val_ids),
        pl.Series("ranker_id", val_rankers),
        pl.Series("score", all_val_scores),
        pl.Series("label", val_labels)
    ])

    # Rank 1 is the highest score within each ranker_id.
    val_df = val_df.with_columns(
        pl.col("score")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
    val_df.write_csv(val_save_path)
    print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

    all_scores = _predict_in_row_order(
        model, test_loader, test_dataset, device, "Predicting test set"
    )

    assert len(all_scores) == X_te.shape[0], \
        f"Mismatch: {len(all_scores)} scores vs {X_te.shape[0]} rows in X_te"

    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": all_scores
    })

    submission = (
        submission_df
        .with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )
        .select(["Id", "ranker_id", "score", "selected"])
    )

    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
    submission.write_csv(submission_path)
    print(f"✅ Submission saved to {submission_path}")


# Run the training / prediction pipeline only when this file is executed as a
# script, not when it is imported.
if __name__ == "__main__":
    main()

Overwriting deeprec_validate.py


In [None]:
--- Epoch 1/10 ---
Validating:   0%|          | 0/2636 [00:00<?, ?it/s]/home/ionut/test_whisper/test2/deeprec_validate.py:514: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  lengths = torch.tensor(lengths).to(device)
Epoch 1: Train Loss=0.3545, Val Loss=0.2954, HitRate@3=0.4755, NDCG@3=0.3781, MAP@3=0.6190, LR=0.000084 (528.2s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 2/10 ---
Epoch 2: Train Loss=0.2903, Val Loss=0.2838, HitRate@3=0.4892, NDCG@3=0.3928, MAP@3=0.6393, LR=0.000228 (528.2s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 3/10 ---
Epoch 3: Train Loss=0.2820, Val Loss=0.2683, HitRate@3=0.4981, NDCG@3=0.3975, MAP@3=0.6394, LR=0.000300 (526.6s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 4/10 ---
Epoch 4: Train Loss=0.2730, Val Loss=0.2664, HitRate@3=0.4992, NDCG@3=0.3984, MAP@3=0.6379, LR=0.000285 (527.2s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 5/10 ---
Epoch 5: Train Loss=0.2615, Val Loss=0.2689, HitRate@3=0.5097, NDCG@3=0.4082, MAP@3=0.6508, LR=0.000244 (527.0s)

--- Epoch 6/10 ---
Epoch 6: Train Loss=0.2497, Val Loss=0.2477, HitRate@3=0.5105, NDCG@3=0.4106, MAP@3=0.6481, LR=0.000183 (526.8s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 7/10 ---
Epoch 7: Train Loss=0.2383, Val Loss=0.2443, HitRate@3=0.5101, NDCG@3=0.4102, MAP@3=0.6529, LR=0.000117 (526.7s)

--- Epoch 8/10 ---
Epoch 8: Train Loss=0.2265, Val Loss=0.2414, HitRate@3=0.5136, NDCG@3=0.4158, MAP@3=0.6597, LR=0.000056 (529.3s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 9/10 ---
Epoch 9: Train Loss=0.2172, Val Loss=0.2373, HitRate@3=0.5187, NDCG@3=0.4203, MAP@3=0.6603, LR=0.000015 (528.3s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 10/10 ---
Epoch 10: Train Loss=0.2116, Val Loss=0.2382, HitRate@3=0.5174, NDCG@3=0.4185, MAP@3=0.6578, LR=0.000000 (527.6s)

In [None]:
%%writefile deeprec_validate.py
import math
import os
import random
import time
from collections import defaultdict
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import torch
import torch.nn as nn
import torch.nn.functional as F_torch
from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import get_scheduler

from src.feature import feature_engineering, feature_selection

class RankDataset(Dataset):
    """Dataset whose items are whole ranking groups (one per ``ranker_id``).

    Rows are bucketed by ``groups["ranker_id"]`` in order of first appearance;
    ``dataset[i]`` returns every row of the ``i``-th group.
    """

    def __init__(
        self,
        X: pl.DataFrame,
        y: pl.Series,
        groups: pl.DataFrame,
        cat_features: list[str],
        num_features: list[str],
    ):
        """Materialise the columns as NumPy arrays and index rows by group.

        Args:
            X: Feature frame containing every name in ``cat_features`` and
                ``num_features``.
            y: Per-row labels.
            groups: Frame with a ``ranker_id`` column aligned with ``X``.
            cat_features: Names of the categorical (integer-coded) columns.
            num_features: Names of the numeric columns.
        """
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # group key -> list of row positions, keys kept in first-seen order
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of groups."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        ``x_cat`` maps each categorical feature to a ``LongTensor`` of codes
        with negative codes replaced by 0, ``x_num`` is a ``FloatTensor`` of
        the numeric rows, ``y`` is a flat ``FloatTensor`` of labels and
        ``length`` is the number of rows in the group.
        """
        inds = self.group_to_indices[self.group_keys[idx]]
        length = len(inds)

        # Gather and clamp the whole group in NumPy instead of looping over
        # single elements in Python; the resulting tensors are the same.
        x_cat = {
            c: torch.from_numpy(
                np.maximum(self.X_cat[c][inds], 0).astype(np.int64)
            )
            for c in self.cat_features
        }

        x_num = torch.FloatTensor(self.X_num[inds])
        y = torch.FloatTensor(self.y[inds]).reshape(-1)

        return x_cat, x_num, y, length

def collate_fn(batch):
    """Zero-pad a list of ``(x_cat, x_num, y, length)`` groups to one batch.

    Every group is padded at the end to the longest group in ``batch``.
    Returns ``(x_cat, x_num, y, lengths)`` where ``x_cat`` maps each feature
    to a stacked long tensor, ``x_num`` and ``y`` are stacked float tensors
    and ``lengths`` is a long tensor holding the unpadded group sizes.
    """
    lengths = [sample[3] for sample in batch]
    longest = max(lengths)

    cat_keys = list(batch[0][0].keys())
    num_dim = batch[0][1].shape[1]

    cat_columns = {key: [] for key in cat_keys}
    num_rows = []
    label_rows = []

    for cats, nums, labels, n_items in batch:
        missing = longest - n_items
        if missing > 0:
            # Append zeros so every group reaches the common length.
            cats = {
                key: torch.cat([cats[key], torch.zeros(missing, dtype=torch.long)])
                for key in cat_keys
            }
            nums = torch.cat([nums, torch.zeros(missing, num_dim)])
            labels = torch.cat([labels, torch.zeros(missing)])

        for key in cat_keys:
            cat_columns[key].append(cats[key])
        num_rows.append(nums)
        label_rows.append(labels)

    stacked_cat = {key: torch.stack(column) for key, column in cat_columns.items()}
    return (
        stacked_cat,
        torch.stack(num_rows),
        torch.stack(label_rows),
        torch.tensor(lengths, dtype=torch.long),
    )

class SEModule(nn.Module):
    """Squeeze-and-excitation style channel gate for ``[B, N, C]`` inputs.

    The input is averaged over dimension 1, passed through a bottleneck MLP
    ending in a sigmoid, and the resulting per-channel weights rescale every
    position of the input.
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        hidden = channel // reduction
        layers = [
            nn.Linear(channel, hidden),
            nn.ReLU(inplace=True),
            nn.Linear(hidden, channel),
            nn.Sigmoid(),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return ``x`` scaled channel-wise by gates computed from its mean."""
        pooled = x.mean(dim=1)  # [B, C]
        gate = self.fc(pooled)  # [B, C]
        return x * gate.unsqueeze(1)  # broadcast over the N positions


class DLRankerAttention(nn.Module):
    """Attention-based scorer for groups of candidate items.

    Categorical features are embedded, concatenated and projected to
    ``proj_dim``; numeric features are projected to ``proj_dim``.  Each branch
    runs through its own one-layer transformer encoder (the numeric branch is
    additionally gated by ``SEModule`` and a small MLP).  The two branches are
    concatenated, passed through ``num_layers`` further encoder layers and an
    MLP head that emits one score per item.
    """

    def __init__(
        self,
        cat_dims,
        num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64,
    ):
        """Build the network.

        Args:
            cat_dims: Mapping from categorical feature name to its number of
                distinct codes (the embedding table size).
            num_numeric_feats: Width of the numeric input.
            emb_dropout: Dropout applied to the concatenated embeddings.
            dropout: Dropout used inside the encoders and MLPs.
            num_heads: Attention heads of the fused encoder layers (the two
                branch encoders always use 4 heads).
            num_layers: Number of fused encoder layers.
            dim_feedforward: Feed-forward width of every encoder layer.
            proj_dim: Width of each branch; the fused width is ``2 * proj_dim``.
        """
        super().__init__()

        # Embedding width grows with log2 of the table size, clamped to [4, 50].
        self.emb_dims = {
            f: min(50, max(4, round(6 * math.log2(cat_dims[f] + 1)))) for f in cat_dims
        }
        self.embeddings = nn.ModuleDict(
            {
                f: nn.Embedding(cat_dims[f], self.emb_dims[f], padding_idx=0)
                for f in cat_dims
            }
        )
        self.emb_dropout = nn.Dropout(emb_dropout)

        total_emb_dim = sum(self.emb_dims.values())
        print(
            f"Categorial emb dim sum: {total_emb_dim}, Numerical dim: {num_numeric_feats}"
        )

        self.cat_proj = nn.Linear(total_emb_dim, proj_dim)
        self.cat_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=4,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=1,
        )

        self.num_proj_in = nn.Linear(num_numeric_feats, proj_dim)
        self.num_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=4,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=1,
        )
        self.num_se = SEModule(proj_dim)
        self.num_proj_out = nn.Sequential(
            nn.Linear(proj_dim, proj_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
        )

        self.transformer_layers = nn.ModuleList(
            [
                nn.TransformerEncoderLayer(
                    d_model=2 * proj_dim,
                    nhead=num_heads,
                    dim_feedforward=dim_feedforward,
                    dropout=dropout,
                    batch_first=True,
                )
                for _ in range(num_layers)
            ]
        )

        self.mlp = nn.Sequential(
            nn.Linear(2 * proj_dim, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(64, 1),
        )

    def forward(self, x_cat: dict, x_num: torch.Tensor, attn_mask=None):
        """Score every item of every group.

        Args:
            x_cat: Mapping from feature name to a ``[B, N]`` long tensor.
            x_num: ``[B, N, F_num]`` float tensor.
            attn_mask: Optional ``[B, N]`` key-padding mask (``True`` marks a
                padded position, following ``src_key_padding_mask``).  It is
                applied to the branch encoders as well as the fused layers;
                ``None`` (the default) disables masking everywhere.

        Returns:
            ``[B, N]`` tensor of scores.
        """
        _batch, _items, _ = x_num.shape  # x_num must be 3-D: [B, N, F_num]

        cat_emb_list = []
        for f, emb_layer in self.embeddings.items():
            emb = emb_layer(x_cat[f])  # [B, N, emb_dims[f]]
            cat_emb_list.append(emb)
        cat_embs = torch.cat(cat_emb_list, dim=-1)  # [B, N, total_emb_dim]
        cat_embs = self.emb_dropout(cat_embs)
        cat_feat = self.cat_proj(cat_embs)  # [B, N, proj_dim]
        cat_feat = self.cat_transformer(
            cat_feat, src_key_padding_mask=attn_mask
        )  # [B, N, proj_dim]

        num_feat = self.num_proj_in(x_num)  # [B, N, proj_dim]
        num_feat = self.num_transformer(
            num_feat, src_key_padding_mask=attn_mask
        )  # [B, N, proj_dim]
        num_feat = self.num_se(num_feat)  # [B, N, proj_dim]
        num_feat = self.num_proj_out(num_feat)  # [B, N, proj_dim]

        x = torch.cat([cat_feat, num_feat], dim=-1)  # [B, N, 2*proj_dim]

        for layer in self.transformer_layers:
            x = layer(x, src_key_padding_mask=attn_mask)

        scores = self.mlp(x).squeeze(-1)  # [B, N]

        return scores


"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Fraction of scored groups with a positive label among the top-``k``.

    Only the first ``lengths[i]`` entries of row ``i`` are real; groups
    shorter than ``min_group_size`` are ignored.  Returns a tensor, which is
    0.0 when no group qualifies.
    """
    hit_total = 0
    scored_groups = 0

    for i in range(scores.size(0)):
        size = lengths[i]
        if size < min_group_size:
            continue

        top_n = min(size, k)
        group_scores = scores[i][:size]
        group_labels = labels[i][:size]

        best_idx = torch.topk(group_scores, top_n).indices
        hit_total += (group_labels[best_idx] > 0).any().float()
        scored_groups += 1

    if scored_groups > 0:
        return hit_total / scored_groups
    return torch.tensor(0.0, device=scores.device)


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@``k`` over groups with at least ``min_group_size`` items.

    Gains are the raw labels and the discount is ``log2(rank + 1)``.  A group
    whose ideal DCG is 0 (no positive label) contributes 0 but is still
    counted.  Only the first ``lengths[i]`` entries of row ``i`` are used.

    Returns:
        A 0-dim tensor in every case (0.0 when no group qualifies), so callers
        can rely on tensor methods such as ``.item()``.
    """
    batch_size = scores.size(0)
    # Accumulate in a tensor so the result never degrades to a Python float.
    total_ndcg = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(batch_size):
        l = lengths[i]
        if l < min_group_size:
            continue

        s = scores[i][:l]
        y = labels[i][:l]

        top_n = min(k, l)
        _, idx_pred = torch.topk(s, top_n)
        _, idx_ideal = torch.topk(y, top_n)

        # Both rankings have the same length, so one discount vector serves both.
        discount = torch.log2(
            torch.arange(2, 2 + len(idx_pred), device=y.device).float()
        )
        dcg = (y[idx_pred] / discount).sum()
        idcg = (y[idx_ideal] / discount).sum()

        if idcg > 0:
            total_ndcg = total_ndcg + dcg / idcg
        valid_count += 1

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ndcg / valid_count


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision at ``k`` over the scored groups.

    For each group with at least ``min_group_size`` items, the average
    precision is the mean of precision@rank taken at every top-``k`` rank that
    holds a positive label.  A group with no positive label in its top-``k``
    has average precision 0 and IS included in the mean; leaving such groups
    out would report the precision of successful groups only.

    Returns:
        A 0-dim tensor (0.0 when no group qualifies).
    """
    batch_size = scores.size(0)
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(batch_size):
        l = lengths[i]
        if l < min_group_size:
            continue

        s = scores[i][:l]
        y = labels[i][:l]

        _, idx_pred = torch.topk(s, min(k, l))
        hits = (y[idx_pred] > 0).float()

        # Count the group before looking at its hits: a miss is AP = 0.
        valid_count += 1

        n_hits = hits.sum()
        if n_hits == 0:
            continue

        ranks = torch.arange(1, len(hits) + 1, device=hits.device, dtype=hits.dtype)
        precision_at_rank = hits.cumsum(dim=0) / ranks
        total_ap = total_ap + (precision_at_rank * hits).sum() / n_hits

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ap / valid_count



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise logistic ranking loss, summed per group and averaged over groups.

    For every ordered pair ``(a, b)`` inside a group where ``labels[a]`` is
    strictly greater than ``labels[b]`` the term
    ``-logsigmoid(scores[a] - scores[b])`` is added.  Only the first
    ``lengths[i]`` entries of row ``i`` are used.
    """
    n_groups = scores.size(0)
    loss_sum = 0.0

    for g in range(n_groups):
        size = lengths[g]
        group_scores = scores[g][:size]
        group_labels = labels[g][:size]

        # prefers[a, b] is 1.0 where item a should rank above item b
        prefers = (group_labels[:, None] > group_labels[None, :]).float()
        margins = group_scores[:, None] - group_scores[None, :]

        loss_sum += (-prefers * torch.nn.functional.logsigmoid(margins)).sum()

    return loss_sum / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """
    Margin/temperature pairwise logistic loss focused on the top-K items.

    A pair ``(a, b)`` is used when ``labels[a] > labels[b]`` and at least one
    of the two items is among the ``focus_topk`` highest *labels* of its
    group.  Each pair is weighted by the absolute label gap and the result is
    the weighted mean over all used pairs of the batch.  Groups with fewer
    than two items are skipped; when no pair qualifies a zero tensor with
    ``requires_grad=True`` is returned.
    """
    loss_sum = 0.0
    weight_sum = 0

    for g in range(scores.size(0)):
        n_items = lengths[g]
        if n_items < 2:
            continue

        group_scores = scores[g][:n_items]
        group_labels = labels[g][:n_items]

        # Mark the items holding the highest ground-truth labels.
        n_top = min(focus_topk, n_items)
        top_idx = torch.topk(group_labels, n_top).indices
        in_topk = torch.zeros_like(group_labels, dtype=torch.bool)
        in_topk[top_idx] = True

        label_gap = group_labels[:, None] - group_labels[None, :]
        touches_topk = in_topk[:, None] | in_topk[None, :]
        usable = (label_gap > 0) & touches_topk

        if usable.sum() == 0:
            continue

        weights = (label_gap.abs() * usable).float()

        logits = group_scores[:, None] - group_scores[None, :]
        if margin > 0.0:
            logits = logits - margin
        logits = logits / temperature

        loss_sum += (-F_torch.logsigmoid(logits) * weights).sum()
        weight_sum += weights.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    For every batch the model is scored, ``xranknet_loss`` is back-propagated,
    and both the optimizer and the scheduler are stepped (the scheduler
    advances once per batch).
    """
    model.train()
    running_loss = 0.0

    progress = tqdm(loader, desc="Training", leave=False)
    for cat_batch, num_batch, targets, lengths in progress:
        num_batch = num_batch.to(device)
        targets = targets.to(device)

        # Accept plain sequences as well as tensors for the group sizes.
        if not isinstance(lengths, torch.Tensor):
            lengths = torch.tensor(lengths)
        lengths = lengths.detach().clone().to(device)

        cat_batch = {name: column.to(device) for name, column in cat_batch.items()}

        optimizer.zero_grad()
        predictions = model(cat_batch, num_batch)
        loss = xranknet_loss(
            predictions, targets, lengths, temperature=1.0, margin=0.2, focus_topk=3
        )
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()

    return running_loss / len(loader)


def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: Ranker called as ``model(x_cat, x_num)``.
        loader: Yields ``(x_cat, x_num, y, lengths)`` batches.
        device: Device the batches are moved to.
        min_group_size: Groups with fewer items are excluded from the ranking
            metrics (they still contribute to the loss).

    Returns:
        ``(loss, hitrate, ndcg, map)``.  ``loss`` is a float averaged over all
        groups; the three metrics are 0-dim tensors.  Each per-batch metric is
        weighted by the number of groups in that batch that reach
        ``min_group_size``, so batches with no such group no longer pull the
        mean towards zero.
        NOTE(review): this weighting assumes each metric helper averages over
        exactly those groups - confirm for ``map_at_k``.
    """
    model.eval()
    loss_sum = 0.0
    group_count = 0

    # Tensor accumulators keep the return type stable even if nothing is scored.
    hitrate_sum = torch.zeros((), device=device)
    ndcg_sum = torch.zeros((), device=device)
    map_sum = torch.zeros((), device=device)
    scored_count = 0

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):
            x_num, y = x_num.to(device), y.to(device)
            # Same conversion as in train_one_epoch; avoids the copy-construct
            # warning raised by torch.tensor(<tensor>).
            if not isinstance(lengths, torch.Tensor):
                lengths = torch.tensor(lengths)
            lengths = lengths.detach().clone().to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            ndcg = ndcg_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            map3 = map_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)

            batch_size = scores.size(0)
            loss_sum += loss.item() * batch_size
            group_count += batch_size

            scored = int((lengths >= min_group_size).sum())
            if scored > 0:
                hitrate_sum = hitrate_sum + hitrate * scored
                ndcg_sum = ndcg_sum + ndcg * scored
                map_sum = map_sum + map3 * scored
                scored_count += scored

    # Avoid 0/0 when no group was large enough; the sums are 0 in that case.
    metric_count = max(scored_count, 1)
    return (
        loss_sum / group_count,
        hitrate_sum / metric_count,
        ndcg_sum / metric_count,
        map_sum / metric_count,
    )


def build_cat_dims(X, cat_features):
    """Map each categorical column to ``max(code, 0) + 1``.

    The result is the embedding-table size needed to index every
    non-negative code found in ``X[c]``; a column whose maximum is negative
    gets size 1.
    """
    return {c: max(X[c].max(), 0) + 1 for c in cat_features}


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardise numeric columns using statistics of the training rows.

    Mean and standard deviation are computed over ``values[train_mask]`` with
    NaN entries ignored and then applied to every row.  Ignoring NaN matters:
    with a plain mean a single NaN among the training rows would make the
    mean NaN and turn the whole column into NaN.  NaN entries themselves stay
    NaN in the output.

    Args:
        X: Frame holding the columns to normalise.
        numeric_features: Names of the columns to standardise.
        train_mask: Boolean array selecting the rows used to fit the stats.

    Returns:
        ``X`` with the listed columns replaced by their standardised values.
    """
    for col in numeric_features:
        values = X[col].to_numpy()
        train_vals = values[train_mask]

        mean = np.nanmean(train_vals)
        std = np.nanstd(train_vals)
        # Constant (or entirely NaN) columns: skip the scaling.
        if not std > 0:
            std = 1.0

        X = X.with_columns(pl.Series(col, (values - mean) / std))
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Return the feature-engineered frame plus the test ids and ranker ids.

    When ``cache_file`` exists in ``data_dir`` it is read directly.  Otherwise
    train and test are concatenated (test gets a constant ``selected`` = 0),
    passed through ``feature_engineering``, nulls are filled (0 for numeric
    columns, ``"missing"`` for string columns) and the result is written to
    the cache.  ``Id`` and ``ranker_id`` of the test file are returned in
    both cases.
    """
    cache_path = os.path.join(data_dir, cache_file)
    test_path = os.path.join(data_dir, test_file)

    if not os.path.exists(cache_path):
        print("Cached file not found, processing raw data ...")

        train = pl.read_parquet(os.path.join(data_dir, train_file)).drop("__index_level_0__")
        test = pl.read_parquet(test_path).drop("__index_level_0__")
        # Test rows carry no label; use a constant so the schemas line up.
        test = test.with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))

        test_ids, test_rankers = test["Id"], test["ranker_id"]

        df = feature_engineering(pl.concat((train, test)), full=True)

        numeric_cols = df.select(pl.selectors.numeric()).columns
        string_cols = df.select(pl.selectors.string()).columns
        fill_exprs = [pl.col(c).fill_null(0) for c in numeric_cols]
        fill_exprs += [pl.col(c).fill_null("missing") for c in string_cols]
        df = df.with_columns(fill_exprs)

        os.makedirs(data_dir, exist_ok=True)
        df.write_parquet(cache_path)
        print(f"Saved feature-engineered data to {cache_path}")

        return df, test_ids, test_rankers

    print(f"Loading cached feature-engineered data from {cache_path} ...")
    df = pl.read_parquet(cache_path)

    # The cache is read as-is; ids come from the raw test file.
    test = pl.read_parquet(test_path).drop("__index_level_0__")
    return df, test["Id"], test["ranker_id"]


def prepare_data(data_dir, train_size=18145372):
    """Load the feature table and turn it into model-ready inputs.

    The cached (or freshly built) feature-engineered frame is passed through
    ``feature_selection``, the ``Id`` column is attached to the result, every
    categorical column is replaced by a 0-based dense-rank code and every
    numeric column is standardised with statistics taken from the first
    ``train_size`` rows only.

    Args:
        data_dir: Directory handed to ``load_or_prepare_data``.
        train_size: Number of leading rows used to fit the normalisation
            statistics.  It is also returned so the caller can split the
            frame at the same position.  The default is the value that used
            to be hard-coded in this function.

    Returns:
        Tuple ``(X, y, groups, cat_features, num_features, train_size,
        test_ids, test_rankers)``.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    # Attach "Id" to the selected features (truncated to X's height).
    # NOTE(review): presumably feature_selection drops it - confirm.
    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    # Dense rank starts at 1, so subtracting 1 gives 0-based codes; nulls
    # become -1.
    X = X.with_columns(
        [
            (pl.col(c).rank("dense") - 1).fill_null(-1).cast(pl.Int32)
            for c in cat_features
        ]
    )

    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Patience-based stopper for a metric where larger values are better.

    ``step`` returns ``True`` once the metric has failed to beat the best
    value seen so far by more than ``min_delta`` for ``patience`` calls in a
    row.
    """

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric`` and report whether training should stop."""
        improved = metric > self.best + self.min_delta
        if improved:
            # New best: remember it and restart the patience window.
            self.best, self.counter = metric, 0
        else:
            self.counter = self.counter + 1
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scale histogram of how many rows each group contains.

    ``dataset`` must expose ``group_to_indices`` (group key -> row indices).
    The figure is written to ``save_path`` when one is given, otherwise it is
    shown; it is closed afterwards either way.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Distribution of Group Sizes")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def _save_line_plot(path, epochs, curves, ylabel, title):
    """Save one per-epoch line chart to ``path``.

    ``curves`` is an iterable of ``(values, label, color)`` tuples; a ``None``
    color leaves the line color to matplotlib.
    """
    plt.figure()
    for values, label, color in curves:
        style = {} if color is None else {"color": color}
        plt.plot(epochs, values, label=label, **style)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.savefig(path)
    plt.close()


def _predict_row_scores(model, loader, device, n_rows, desc):
    """Score every row served by ``loader`` and return scores in row order.

    The loader must walk its dataset sequentially (``shuffle=False``). Each
    group's scores are written back to the row positions recorded in the
    dataset's ``group_to_indices`` (looked up through ``group_keys``), so the
    result lines up with the source frame even when the rows of a group are
    not stored next to each other.

    Raises:
        ValueError: If the number of scored rows differs from ``n_rows``.
    """
    dataset = loader.dataset
    row_scores = np.zeros(n_rows, dtype=np.float32)
    group_pos = 0
    n_written = 0

    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(loader, desc=desc):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num).cpu().numpy()

            for i, length in enumerate(lengths):
                length = int(length)
                rows = dataset.group_to_indices[dataset.group_keys[group_pos]]
                # Drop the padded tail before scattering back to the rows.
                row_scores[rows] = scores[i, :length]
                group_pos += 1
                n_written += length

    if n_written != n_rows:
        raise ValueError(f"Mismatch: {n_written} scores vs {n_rows} rows")
    return row_scores


def main():
    """Train the attention ranker, log its metrics and write predictions.

    Steps: prepare the data, split it by row position into train / validation
    / test, train with per-epoch validation, checkpointing and early stopping,
    save metric plots, then write validation predictions and the submission
    file.
    """
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    best_val_loss = float("inf")
    best_hitrate = 0.0
    checkpoint_saved = False
    model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_losses = []
    val_losses = []
    val_hitrates = []
    val_ndcgs = []
    val_maps = []
    learning_rates = []

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    early_stopper = EarlyStopping(patience=3, min_delta=0.001)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Replace NaNs in every Float64 column with 0, in a single pass.
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    if float_cols:
        X = X.with_columns(
            [
                pl.when(pl.col(c).is_nan()).then(0).otherwise(pl.col(c)).alias(c)
                for c in float_cols
            ]
        )

    # Row-position split: [0, n1) train, [n1, n2) validation, [n2, end) test.
    n1 = 16487352
    # n1 = 17487300
    n2 = train_size

    X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
    y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
    groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

    # Inspect group sizes in validation set
    val_rankers = groups_va["ranker_id"].to_numpy()
    group_to_rows = defaultdict(list)
    for i, g in enumerate(val_rankers):
        group_to_rows[g].append(i)

    print("\n📋 Validation group statistics:")
    # Save detailed group info to file
    group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
    with open(group_info_path, "w", encoding="utf-8") as f:
        f.write(f"Total groups: {len(group_to_rows)}\n\n")
        f.write("GroupID\tSize\tStartIdx\tEndIdx\n")

        for group_id, indices in sorted(group_to_rows.items()):
            f.write(f"{group_id}\t{len(indices)}\t{indices[0]}\t{indices[-1]}\n")

    print(f"📁 Saved validation group info to {group_info_path}")

    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    num_epochs = 10
    learning_rate = 3e-4

    model = DLRankerAttention(
        cat_dims=cat_dims,
        num_numeric_feats=num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    # The schedule is stepped once per batch inside train_one_epoch.
    scheduler = OneCycleLR(
        optimizer,
        max_lr=learning_rate,
        steps_per_epoch=len(train_loader),
        epochs=num_epochs,
        pct_start=0.3,
    )

    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
        val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

        # float() accepts 0-dim tensors as well as plain Python numbers, so
        # logging does not depend on which of the two a metric comes back as.
        val_hitrate = float(val_hitrate)
        val_ndcg = float(val_ndcg)
        val_map = float(val_map)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_hitrates.append(val_hitrate)
        val_ndcgs.append(val_ndcg)
        val_maps.append(val_map)

        current_lr = scheduler.get_last_lr()
        learning_rates.append(current_lr[0])

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"Val Loss={val_loss:.4f}, "
            f"HitRate@3={val_hitrate:.4f}, "
            f"NDCG@3={val_ndcg:.4f}, "
            f"MAP@3={val_map:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )

        # A checkpoint is written only when loss and hit rate both improve.
        if val_loss < best_val_loss and val_hitrate > best_hitrate:
            best_val_loss = val_loss
            best_hitrate = val_hitrate
            torch.save(model.state_dict(), model_path)
            checkpoint_saved = True
            print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

        # EarlyStopping maximizes its metric, hence the negated loss.
        if early_stopper.step(-val_loss):
            print("⛔ Early stopping triggered.")
            break

    # Plot metrics
    epochs = list(range(1, len(train_losses) + 1))

    _save_line_plot(
        os.path.join(MODEL_DIR, "loss_curve.png"),
        epochs,
        [(train_losses, "Train Loss", None), (val_losses, "Val Loss", None)],
        "Loss",
        "Training & Validation Loss",
    )
    _save_line_plot(
        os.path.join(MODEL_DIR, "hitrate_curve.png"),
        epochs,
        [(val_hitrates, "Val HitRate@3", "green")],
        "HitRate@3",
        "Validation HitRate@3",
    )
    _save_line_plot(
        os.path.join(MODEL_DIR, "lr_curve.png"),
        epochs,
        [(learning_rates, "Learning Rate", "orange")],
        "LR",
        "Learning Rate Schedule",
    )
    _save_line_plot(
        os.path.join(MODEL_DIR, "ndcg_curve.png"),
        epochs,
        [(val_ndcgs, "Val NDCG@3", "purple")],
        "NDCG@3",
        "Validation NDCG@3",
    )
    _save_line_plot(
        os.path.join(MODEL_DIR, "map_curve.png"),
        epochs,
        [(val_maps, "Val MAP@3", "red")],
        "MAP@3",
        "Validation MAP@3",
    )

    print("📊 Saved metric plots to model/")

    # Restore the best checkpoint written by this run. If none was written,
    # keep the in-memory weights rather than loading a file left by an
    # earlier run (or failing because the file does not exist).
    if checkpoint_saved:
        model.load_state_dict(torch.load(model_path, map_location=device))
    else:
        print("⚠️ No checkpoint was saved during this run; predicting with the final weights.")
    model.eval()

    # Predict validation set
    all_val_scores = _predict_row_scores(
        model, val_loader, device, X_va.shape[0], "Predicting validation set"
    )

    val_df = X_va.with_columns([
        pl.Series("Id", X_va["Id"]),
        pl.Series("ranker_id", groups_va["ranker_id"]),
        pl.Series("score", all_val_scores),
        pl.Series("label", y_va)
    ])

    # Rank 1 is the highest score inside each ranker_id group.
    val_df = val_df.with_columns(
        pl.col("score")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
    val_df.write_csv(val_save_path)
    print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

    # Predict test set
    all_scores = _predict_row_scores(
        model, test_loader, device, X_te.shape[0], "Predicting test set"
    )

    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": all_scores
    })

    submission = (
        submission_df
        .with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )
        .select(["Id", "ranker_id", "score", "selected"])
    )

    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
    submission.write_csv(submission_path)
    print(f"✅ Submission saved to {submission_path}")


# Run the training pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()


Writing deeprec_validate.py


In [None]:
--- Epoch 1/10 ---
Validating:   0%|          | 0/2636 [00:00<?, ?it/s]/home/ionut/test_whisper/test2/deeprec_validate.py:508: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  lengths = torch.tensor(lengths).to(device)
Epoch 1: Train Loss=0.3580, Val Loss=0.3073, HitRate@3=0.4722, NDCG@3=0.3754, MAP@3=0.6183, LR=0.000084 (526.0s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 2/10 ---
Epoch 2: Train Loss=0.2921, Val Loss=0.2864, HitRate@3=0.4816, NDCG@3=0.3847, MAP@3=0.6289, LR=0.000228 (523.1s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 3/10 ---
Epoch 3: Train Loss=0.2842, Val Loss=0.2741, HitRate@3=0.4945, NDCG@3=0.3944, MAP@3=0.6335, LR=0.000300 (524.0s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 4/10 ---
Epoch 4: Train Loss=0.2726, Val Loss=0.2650, HitRate@3=0.5035, NDCG@3=0.4064, MAP@3=0.6556, LR=0.000285 (525.6s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 5/10 ---
Epoch 5: Train Loss=0.2621, Val Loss=0.2646, HitRate@3=0.5002, NDCG@3=0.4039, MAP@3=0.6522, LR=0.000244 (523.4s)

--- Epoch 6/10 ---
Epoch 6: Train Loss=0.2503, Val Loss=0.2512, HitRate@3=0.5078, NDCG@3=0.4116, MAP@3=0.6629, LR=0.000183 (525.8s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 7/10 ---
Epoch 7: Train Loss=0.2384, Val Loss=0.2425, HitRate@3=0.5176, NDCG@3=0.4197, MAP@3=0.6664, LR=0.000117 (524.7s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 8/10 ---
Epoch 8: Train Loss=0.2274, Val Loss=0.2395, HitRate@3=0.5143, NDCG@3=0.4200, MAP@3=0.6729, LR=0.000056 (524.9s)

--- Epoch 9/10 ---
Epoch 9: Train Loss=0.2173, Val Loss=0.2351, HitRate@3=0.5193, NDCG@3=0.4224, MAP@3=0.6707, LR=0.000015 (524.9s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 10/10 ---
Epoch 10: Train Loss=0.2132, Val Loss=0.2355, HitRate@3=0.5183, NDCG@3=0.4220, MAP@3=0.6721, LR=0.000000 (524.9s)

In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter
from torch.optim.lr_scheduler import OneCycleLR

class RankDataset(Dataset):
    """Dataset that serves one ranking group (one ``ranker_id``) per item.

    Args:
        X: Feature table; columns are read with ``X[name].to_numpy()``.
        y: Label series aligned row-for-row with ``X``.
        groups: Table with a ``ranker_id`` column aligned with ``X``.
        cat_features: Names of the integer-coded categorical columns.
        num_features: Names of the numeric columns.

    Attributes set here and read elsewhere:
        group_to_indices: group id -> list of row positions, in first-seen
            group order.
        group_keys: Group ids in the order used by ``__getitem__``.
    """

    def __init__(
        self,
        X: "pl.DataFrame",
        y: "pl.Series",
        groups: "pl.DataFrame",
        cat_features: list[str],
        num_features: list[str],
    ):
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of groups, not the number of rows."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        ``x_cat`` maps each categorical feature to an int64 tensor of shape
        ``[length]`` with negative codes replaced by 0. ``x_num`` is a
        float32 tensor of shape ``[length, len(num_features)]`` and ``y`` is
        a float32 tensor of shape ``[length]``.
        """
        rows = np.asarray(self.group_to_indices[self.group_keys[idx]], dtype=np.int64)

        # Gather and clamp each column with NumPy instead of a per-element
        # Python loop; this method runs once per group per epoch.
        x_cat = {}
        for c in self.cat_features:
            codes = self.X_cat[c][rows]
            codes = np.where(codes >= 0, codes, 0).astype(np.int64, copy=False)
            x_cat[c] = torch.from_numpy(codes)

        x_num = torch.as_tensor(self.X_num[rows], dtype=torch.float32)
        y = torch.as_tensor(self.y[rows], dtype=torch.float32).reshape(-1)

        return x_cat, x_num, y, len(rows)

def collate_fn(batch):
    """Zero-pad a list of group samples to a common length and stack them.

    Each sample is ``(x_cat, x_num, y, length)``. Returns the stacked
    categorical dict, numeric tensor and label tensor, plus an int64 tensor
    holding the original (unpadded) lengths.
    """
    lengths = [sample[3] for sample in batch]
    longest = max(lengths)

    cat_names = list(batch[0][0].keys())
    num_dim = batch[0][1].shape[1]

    cat_columns = {name: [] for name in cat_names}
    num_rows = []
    label_rows = []

    for cats, nums, labels, size in batch:
        missing = longest - size
        needs_padding = missing > 0

        for name in cat_names:
            column = cats[name]
            if needs_padding:
                # Index 0 doubles as the padding code for categorical inputs.
                column = torch.cat([column, torch.zeros(missing, dtype=torch.long)])
            cat_columns[name].append(column)

        if needs_padding:
            nums = torch.cat([nums, torch.zeros(missing, num_dim)])
            labels = torch.cat([labels, torch.zeros(missing)])

        num_rows.append(nums)
        label_rows.append(labels)

    stacked_cat = {name: torch.stack(cols) for name, cols in cat_columns.items()}
    stacked_num = torch.stack(num_rows)
    stacked_y = torch.stack(label_rows)
    length_tensor = torch.tensor(lengths, dtype=torch.long)

    return stacked_cat, stacked_num, stacked_y, length_tensor

class SEModule(nn.Module):
    """Squeeze-and-excitation style channel gate for ``[B, N, C]`` inputs.

    The input is averaged over dimension 1, passed through a two-layer
    bottleneck ending in a sigmoid, and the resulting per-channel weights
    rescale every position of the input.
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        bottleneck = channel // reduction
        gate_layers = [
            nn.Linear(channel, bottleneck),
            nn.ReLU(inplace=True),
            nn.Linear(bottleneck, channel),
            nn.Sigmoid(),
        ]
        self.fc = nn.Sequential(*gate_layers)

    def forward(self, x):
        """Return ``x`` scaled by gates computed from its mean over dim 1."""
        pooled = x.mean(dim=1)          # [B, C]
        gates = self.fc(pooled)         # [B, C]
        return x * gates[:, None, :]    # broadcast over N


class DLRankerAttention(nn.Module):
    """Attention-based scorer for the items of a ranking group.

    Categorical features are embedded, projected and passed through a
    one-layer transformer encoder. Numeric features are projected, encoded,
    gated by an SE module and projected again. The categorical stream then
    attends over the numeric stream, both streams are concatenated, passed
    through ``num_layers`` encoder layers and scored by an MLP.

    Args:
        cat_dims: Mapping feature name -> number of embedding rows.
        num_numeric_feats: Width of the numeric input.
        emb_dropout: Dropout applied to the concatenated embeddings.
        dropout: Dropout used in the encoders, cross-attention and MLP.
        num_heads: Heads of the final encoder layers. The two branch
            encoders and the cross-attention always use 4 heads.
        num_layers: Number of final encoder layers. The branch encoders
            always have one layer.
        dim_feedforward: Feed-forward width of every encoder layer.
        proj_dim: Width of each stream; the fused stream is ``2 * proj_dim``.
    """

    def __init__(
        self,
        cat_dims,
        num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64,
    ):
        super().__init__()

        # Embedding width grows with log2 of the cardinality, within [4, 50].
        self.emb_dims = {
            f: min(50, max(4, round(6 * math.log2(cat_dims[f] + 1)))) for f in cat_dims
        }
        # Index 0 is the padding index: its embedding row stays at zero.
        self.embeddings = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], self.emb_dims[f], padding_idx=0)
            for f in cat_dims
        })
        self.emb_dropout = nn.Dropout(emb_dropout)

        total_emb_dim = sum(self.emb_dims.values())
        print(f"Categorial emb dim sum: {total_emb_dim}, Numerical dim: {num_numeric_feats}")

        self.cat_proj = nn.Linear(total_emb_dim, proj_dim)
        self.cat_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=4,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=1,
        )

        self.num_proj_in = nn.Linear(num_numeric_feats, proj_dim)
        self.num_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=4,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=1,
        )
        self.num_se = SEModule(proj_dim)
        self.num_proj_out = nn.Sequential(
            nn.Linear(proj_dim, proj_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
        )

        # Cross-attention: cat attends to num
        self.cross_attn_cat2num = nn.MultiheadAttention(
            embed_dim=proj_dim,
            num_heads=4,
            dropout=dropout,
            batch_first=True,
        )

        self.transformer_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=2 * proj_dim,
                nhead=num_heads,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            )
            for _ in range(num_layers)
        ])

        self.mlp = nn.Sequential(
            nn.Linear(2 * proj_dim, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(64, 1),
        )

    def forward(self, x_cat: dict, x_num: torch.Tensor, attn_mask=None):
        """Score every item of every group in the batch.

        Args:
            x_cat: Mapping feature name -> integer tensor ``[B, N]``.
            x_num: Float tensor ``[B, N, F_num]``.
            attn_mask: Optional boolean tensor ``[B, N]`` that is True at
                padded positions. When given, padding is excluded as an
                attention key in every attention stage and from the SE
                pooling, so scores of real items do not depend on how much
                padding a batch carries. ``None`` applies no masking.

        Returns:
            Tensor ``[B, N]`` of scores. Values at padded positions carry no
            meaning.
        """
        # Categorical embeddings
        cat_embs = torch.cat([self.embeddings[f](x_cat[f]) for f in x_cat], dim=-1)
        cat_embs = self.emb_dropout(cat_embs)
        cat_feat = self.cat_proj(cat_embs)  # [B, N, D]
        cat_feat = self.cat_transformer(cat_feat, src_key_padding_mask=attn_mask)

        # Numeric features
        num_feat = self.num_proj_in(x_num)
        num_feat = self.num_transformer(num_feat, src_key_padding_mask=attn_mask)
        if attn_mask is None:
            num_feat = self.num_se(num_feat)
        else:
            # Same gate as SEModule.forward, but pooled over real items only.
            pad = attn_mask.bool().unsqueeze(-1)                     # [B, N, 1]
            n_valid = (~pad).sum(dim=1).clamp(min=1).to(num_feat.dtype)
            pooled = num_feat.masked_fill(pad, 0.0).sum(dim=1) / n_valid
            num_feat = num_feat * self.num_se.fc(pooled).unsqueeze(1)
        num_feat = self.num_proj_out(num_feat)

        # Cross-attention: categorical features attend over numeric ones
        cat_feat_attn, _ = self.cross_attn_cat2num(
            query=cat_feat, key=num_feat, value=num_feat,
            key_padding_mask=attn_mask,
        )
        cat_feat = cat_feat + cat_feat_attn  # residual connection

        # Final concatenation
        x = torch.cat([cat_feat, num_feat], dim=-1)

        for layer in self.transformer_layers:
            x = layer(x, src_key_padding_mask=attn_mask)

        scores = self.mlp(x).squeeze(-1)  # [B, N]
        return scores



"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Fraction of evaluated groups with a positive label in the top ``k``.

    ``scores`` and ``labels`` are padded ``[B, N]`` tensors and ``lengths``
    holds the true size of each group. Groups smaller than
    ``min_group_size`` are ignored. Returns a 0-dim tensor, which is 0.0
    when no group is evaluated.
    """
    hit_total = 0
    evaluated = 0

    for row in range(scores.size(0)):
        size = lengths[row]
        if size < min_group_size:
            continue

        group_scores = scores[row][:size]
        group_labels = labels[row][:size]

        top_positions = torch.topk(group_scores, min(size, k)).indices
        hit_total += (group_labels[top_positions] > 0).any().float()
        evaluated += 1

    if evaluated > 0:
        return hit_total / evaluated
    return torch.tensor(0.0, device=scores.device)


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@k over the groups that have at least ``min_group_size`` items.

    ``scores`` and ``labels`` are padded ``[B, N]`` tensors and ``lengths``
    holds the true size of each group. A group whose ideal DCG is 0 (no
    positive label) contributes 0 to the mean.

    Returns:
        Always a 0-dim tensor, including when every evaluated group has an
        ideal DCG of 0, so callers may use ``.item()`` unconditionally. The
        value is 0.0 when no group is evaluated.
    """
    total_ndcg = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        size = int(lengths[i])
        if size < min_group_size:
            continue

        s = scores[i][:size]
        y = labels[i][:size]

        top = min(k, size)
        _, idx_pred = torch.topk(s, top)
        _, idx_ideal = torch.topk(y, top)

        # log2(rank + 1) discounts for ranks 1..top, shared by DCG and IDCG.
        discounts = torch.log2(torch.arange(2, 2 + top, device=y.device).float())
        dcg = (y[idx_pred] / discounts).sum()
        idcg = (y[idx_ideal] / discounts).sum()

        if idcg > 0:
            total_ndcg = total_ndcg + dcg / idcg
        valid_count += 1

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ndcg / valid_count


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision at ``k`` over the evaluated groups.

    ``scores`` and ``labels`` are padded ``[B, N]`` tensors and ``lengths``
    holds the true size of each group. Groups smaller than
    ``min_group_size`` are ignored. For each remaining group the average
    precision is the mean of precision@rank over the top-``k`` ranks that
    hold a positive label. A group with no positive label in its top ``k``
    counts as 0 instead of being dropped, so the result is a mean over all
    evaluated groups rather than only over the groups that had a hit.

    Returns:
        A 0-dim tensor; 0.0 when no group is evaluated.
    """
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        size = int(lengths[i])
        if size < min_group_size:
            continue

        s = scores[i][:size]
        y = labels[i][:size]

        _, idx_pred = torch.topk(s, min(k, size))
        hits = (y[idx_pred] > 0).float()
        valid_count += 1

        n_hits = hits.sum()
        if n_hits > 0:
            ranks = torch.arange(1, hits.numel() + 1, device=hits.device)
            precision_at_rank = torch.cumsum(hits, dim=0) / ranks
            total_ap = total_ap + (precision_at_rank * hits).sum() / n_hits

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ap / valid_count



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise RankNet loss, summed per group and averaged over the batch.

    For every ordered pair ``(a, b)`` inside a group with
    ``label[a] > label[b]`` the term ``-logsigmoid(score[a] - score[b])`` is
    added. Padding beyond ``lengths[i]`` is ignored.
    """
    n_groups = scores.size(0)
    accumulated = 0.0

    for row in range(n_groups):
        size = lengths[row]
        group_scores = scores[row][:size]
        group_labels = labels[row][:size]

        # [size, size] matrices: entry (a, b) compares item a against item b.
        margins = group_scores[:, None] - group_scores[None, :]
        preferred = (group_labels[:, None] > group_labels[None, :]).float()

        pair_terms = -preferred * torch.nn.functional.logsigmoid(margins)
        accumulated += pair_terms.sum()

    return accumulated / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """
    Margin-based pairwise loss restricted to pairs that touch the label top-K.

    A pair ``(a, b)`` is used when ``label[a] > label[b]`` and at least one
    of the two items is among the ``focus_topk`` highest labels of its group.
    Each pair is weighted by its absolute label gap, and the weighted sum of
    ``-logsigmoid((score[a] - score[b] - margin) / temperature)`` is divided
    by the total weight across the batch. Groups with fewer than two items
    are skipped. When no pair qualifies a zero tensor that requires grad is
    returned.
    """
    loss_sum = 0.0
    weight_sum = 0

    for row in range(scores.size(0)):
        size = lengths[row]
        if size < 2:
            continue

        group_scores = scores[row][:size]
        group_labels = labels[row][:size]

        # Mark the items holding the highest ground-truth labels.
        in_top = torch.zeros_like(group_labels, dtype=torch.bool)
        in_top[torch.topk(group_labels, min(focus_topk, size)).indices] = True

        # Entry (a, b) of each matrix compares item a against item b.
        margins = group_scores[:, None] - group_scores[None, :]
        label_gaps = group_labels[:, None] - group_labels[None, :]

        eligible = (label_gaps > 0) & (in_top[:, None] | in_top[None, :])
        if eligible.sum() == 0:
            continue

        weights = (label_gaps.abs() * eligible).float()

        if margin > 0.0:
            margins = margins - margin
        margins = margins / temperature

        loss_sum += (-F.logsigmoid(margins) * weights).sum()
        weight_sum += weights.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimization pass over ``loader`` and return the mean batch loss.

    The model is put in training mode. For every batch the ``xranknet_loss``
    is back-propagated, then the optimizer and the scheduler are each
    stepped once.
    """
    model.train()
    running_loss = 0.0

    progress = tqdm(loader, desc="Training", leave=False)
    for cat_batch, num_batch, targets, lengths in progress:
        num_batch = num_batch.to(device)
        targets = targets.to(device)

        # Accept lengths as a list or a tensor; copy before moving so the
        # object produced by the loader is left untouched.
        length_tensor = lengths if isinstance(lengths, torch.Tensor) else torch.tensor(lengths)
        length_tensor = length_tensor.detach().clone().to(device)

        cat_batch = {name: tensor.to(device) for name, tensor in cat_batch.items()}

        optimizer.zero_grad()
        predictions = model(cat_batch, num_batch)
        batch_loss = xranknet_loss(
            predictions, targets, length_tensor, temperature=1.0, margin=0.2, focus_topk=3
        )
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    return running_loss / len(loader)


def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Returns:
        ``(loss, hitrate, ndcg, map)``. ``loss`` is a float: the per-batch
        ``xranknet_loss`` averaged with each batch weighted by its number of
        groups. The three ranking metrics are 0-dim tensors: each per-batch
        value is weighted by the number of groups in that batch with at
        least ``min_group_size`` items, which are the only groups the metric
        functions score. Batches without such groups therefore no longer
        pull the averages towards zero. The metrics are 0.0 when no group
        qualifies.
    """
    model.eval()
    total_hitrate = torch.zeros((), device=device)
    total_ndcg = torch.zeros((), device=device)
    total_map = torch.zeros((), device=device)
    total_loss = 0
    count = 0
    eligible_groups = 0

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):
            x_num, y = x_num.to(device), y.to(device)
            # Same handling as train_one_epoch: wrapping an existing tensor
            # in torch.tensor() emits a copy-construct UserWarning.
            if not isinstance(lengths, torch.Tensor):
                lengths = torch.tensor(lengths)
            lengths = lengths.detach().clone().to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            ndcg = ndcg_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            map3 = map_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)

            # Weight each batch metric by the groups it was computed over.
            n_eligible = int((lengths >= min_group_size).sum())
            if n_eligible > 0:
                total_hitrate += hitrate * n_eligible
                total_ndcg += ndcg * n_eligible
                total_map += map3 * n_eligible
                eligible_groups += n_eligible

            batch_size = scores.size(0)
            total_loss += loss.item() * batch_size
            count += batch_size

    mean_loss = total_loss / count
    if eligible_groups == 0:
        return mean_loss, total_hitrate, total_ndcg, total_map

    return (
        mean_loss,
        total_hitrate / eligible_groups,
        total_ndcg / eligible_groups,
        total_map / eligible_groups,
    )


def build_cat_dims(X, cat_features):
    """Return the embedding table size needed for each categorical column.

    The size is the largest code in the column plus one; a negative maximum
    is treated as 0, giving a size of 1.
    """
    return {
        name: max(X[name].max(), 0) + 1
        for name in cat_features
    }


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardize numeric columns with statistics taken from training rows.

    Args:
        X: Frame whose columns are read with ``X[col].to_numpy()`` and
            replaced through ``X.with_columns``.
        numeric_features: Names of the columns to standardize.
        train_mask: Boolean array selecting the rows that define mean/std.

    Returns:
        The frame with every listed column replaced by ``(value - mean) / std``.

    NaN entries are left out of the statistics. Without that, one NaN among
    the training rows would make the mean NaN and turn the whole column into
    NaN. NaN entries themselves stay NaN in the output. A standard deviation
    that is not positive is replaced by 1.0; a column with no non-NaN
    training value is centred on 0 with scale 1.
    """
    for col in numeric_features:
        values = X[col].to_numpy()
        train_vals = values[train_mask]

        nan_rows = np.isnan(train_vals)
        observed = train_vals[~nan_rows] if nan_rows.any() else train_vals

        if observed.size:
            mean = observed.mean()
            std = observed.std()
        else:
            mean, std = 0.0, 1.0
        if not std > 0:
            std = 1.0

        norm_vals = (values - mean) / std
        X = X.with_columns(pl.Series(col, norm_vals).alias(col))
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Return the feature-engineered frame plus the test ids and ranker ids.

    When ``cache_file`` exists in ``data_dir`` it is loaded as the feature
    frame. Otherwise the train and test parquet files are concatenated (the
    test rows get a ``selected`` column of zeros), passed through
    ``feature_engineering``, null-filled (0 for numeric columns,
    ``"missing"`` for string columns) and written to the cache. In both
    cases ``Id`` and ``ranker_id`` are taken from the test parquet file.
    """
    cache_path = os.path.join(data_dir, cache_file)
    test_path = os.path.join(data_dir, test_file)

    if not os.path.exists(cache_path):
        print("Cached file not found, processing raw data ...")

        train = pl.read_parquet(os.path.join(data_dir, train_file)).drop("__index_level_0__")
        test = pl.read_parquet(test_path).drop("__index_level_0__")
        # The test split has no target; add a zero column so the frames stack.
        test = test.with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))

        test_ids = test["Id"]
        test_rankers = test["ranker_id"]

        df = feature_engineering(pl.concat((train, test)), full=True)

        numeric_cols = df.select(pl.selectors.numeric()).columns
        string_cols = df.select(pl.selectors.string()).columns
        fills = [pl.col(c).fill_null(0) for c in numeric_cols]
        fills += [pl.col(c).fill_null("missing") for c in string_cols]
        df = df.with_columns(fills)

        os.makedirs(data_dir, exist_ok=True)
        df.write_parquet(cache_path)
        print(f"Saved feature-engineered data to {cache_path}")

        return df, test_ids, test_rankers

    print(f"Loading cached feature-engineered data from {cache_path} ...")
    df = pl.read_parquet(cache_path)

    test = pl.read_parquet(test_path).drop("__index_level_0__")
    return df, test["Id"], test["ranker_id"]


def prepare_data(data_dir, train_size=18145372):
    """Load the feature table and convert it into model-ready inputs.

    Args:
        data_dir: Directory handed to ``load_or_prepare_data``.
        train_size: Number of leading rows treated as training data when the
            normalization statistics are computed. Defaults to the row count
            that used to be hard-coded here.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size, test_ids,
        test_rankers)``. ``X`` carries the ``Id`` column, integer-encoded
        categorical columns and normalized numeric columns.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    # NOTE(review): assumes feature_selection keeps the row order of df, so
    # that the first X.shape[0] ids still line up with X - confirm.
    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    # Dense ranks start at 1. Keeping them 1-based (instead of subtracting 1)
    # reserves code 0 for missing values and for the zero padding that
    # collate_fn appends, so a real category never shares the index that the
    # embeddings (padding_idx=0) keep frozen at zero.
    X = X.with_columns(
        [
            pl.col(c).rank("dense").fill_null(0).cast(pl.Int32)
            for c in cat_features
        ]
    )

    # Only the first train_size rows contribute to the mean/std statistics.
    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Patience counter for a metric where larger values are better.

    A value counts as an improvement only when it beats the best value seen
    so far by more than ``min_delta``.
    """

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric`` and return True once patience is exhausted."""
        threshold = self.best + self.min_delta
        if metric > threshold:
            # Improvement: remember it and restart the patience window.
            self.best, self.counter = metric, 0
        else:
            self.counter = self.counter + 1
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scaled histogram of the number of rows per group.

    Args:
        dataset: Object exposing ``group_to_indices``, a mapping whose values
            are the row-index collections of each group.
        save_path: Where to write the figure. When falsy the figure is shown
            instead of being saved.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Distribution of Group Sizes")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def _predict_row_scores(model, loader, device, desc):
    """Score every row served by ``loader`` and return the scores in row order.

    The loader must iterate its dataset sequentially (``shuffle=False``) so the
    i-th group it yields is the i-th entry of ``dataset.group_to_indices``.
    Each group's scores are written back to that group's own row positions, so
    the result lines up with the source frame even when a group's rows are not
    stored contiguously.
    """
    # NOTE(review): relies on the dataset serving groups in the insertion
    # order of ``group_to_indices`` — confirm against RankDataset.
    group_rows = iter(loader.dataset.group_to_indices.values())
    score_chunks = []
    row_chunks = []

    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(loader, desc=desc):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)

            for i in range(scores.size(0)):
                n_items = int(lengths[i])
                # Drop the padded tail; keep only the group's real items.
                score_chunks.append(scores[i, :n_items].cpu().numpy())
                row_chunks.append(np.asarray(next(group_rows), dtype=np.int64))

    flat_scores = np.concatenate(score_chunks)
    flat_rows = np.concatenate(row_chunks)
    aligned = np.empty_like(flat_scores)
    aligned[flat_rows] = flat_scores
    return aligned


def _save_line_plot(epochs, curves, ylabel, title, path):
    """Write one metric-vs-epoch figure to ``path``.

    ``curves`` is a list of ``(values, label, color)`` tuples; a ``None`` color
    leaves the choice to matplotlib's default color cycle.
    """
    plt.figure()
    for values, label, color in curves:
        style = {} if color is None else {"color": color}
        plt.plot(epochs, values, label=label, **style)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.savefig(path)
    plt.close()


def main():
    """Train the ranker with a hold-out split, then export predictions.

    Steps: prepare the data, split the training rows into train/validation,
    train for up to 10 epochs (checkpointing when validation loss drops and
    HitRate@3 rises, early-stopping on validation loss), save metric plots,
    and finally write ranked validation predictions and the test submission.
    """
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    best_val_loss = float("inf")
    best_hitrate = 0.0
    model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_losses = []
    val_losses = []
    val_hitrates = []
    val_ndcgs = []
    val_maps = []
    learning_rates = []

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    early_stopper = EarlyStopping(patience=3, min_delta=0.001)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Replace NaN with 0 in every Float64 column, in a single pass.
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    if float_cols:
        X = X.with_columns(
            [
                pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
                for col in float_cols
            ]
        )

    # Rows [0, n1) train, [n1, n2) validation, [n2, end) test.
    n1 = 16487352
    # n1 = 17487300
    n2 = train_size

    X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
    y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
    groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

    # Inspect group sizes in validation set
    group_to_rows = defaultdict(list)
    for i, g in enumerate(groups_va["ranker_id"].to_numpy()):
        group_to_rows[g].append(i)

    print("\n📋 Validation group statistics:")
    # Save detailed group info to file
    group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
    with open(group_info_path, "w") as f:
        f.write(f"Total groups: {len(group_to_rows)}\n\n")
        f.write("GroupID\tSize\tStartIdx\tEndIdx\n")

        for group_id, indices in sorted(group_to_rows.items()):
            start_idx = indices[0]
            end_idx = indices[-1]
            size = len(indices)
            f.write(f"{group_id}\t{size}\t{start_idx}\t{end_idx}\n")

    print(f"📁 Saved validation group info to {group_info_path}")

    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    num_epochs = 10
    learning_rate = 3e-4

    model = DLRankerAttention(
        cat_dims=cat_dims,
        num_numeric_feats=num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    # The scheduler is stepped once per batch inside train_one_epoch.
    scheduler = OneCycleLR(
        optimizer,
        max_lr=learning_rate,
        steps_per_epoch=len(train_loader),
        epochs=num_epochs,
        pct_start=0.3,
    )

    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
        val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_hitrates.append(val_hitrate.item())
        val_ndcgs.append(val_ndcg.item())
        val_maps.append(val_map.item())

        current_lr = scheduler.get_last_lr()
        learning_rates.append(current_lr[0])

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"Val Loss={val_loss:.4f}, "
            f"HitRate@3={val_hitrate:.4f}, "
            f"NDCG@3={val_ndcg:.4f}, "
            f"MAP@3={val_map:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )

        # Checkpoint only when BOTH criteria improve on the stored best.
        if val_loss < best_val_loss and val_hitrate > best_hitrate:
            best_val_loss = val_loss
            best_hitrate = val_hitrate
            torch.save(model.state_dict(), model_path)
            print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

        # EarlyStopping maximises its metric, so feed it the negated loss.
        if early_stopper.step(-val_loss):
            print("⛔ Early stopping triggered.")
            break

    # Plot metrics
    epochs = list(range(1, len(train_losses) + 1))

    _save_line_plot(
        epochs,
        [(train_losses, "Train Loss", None), (val_losses, "Val Loss", None)],
        "Loss",
        "Training & Validation Loss",
        os.path.join(MODEL_DIR, "loss_curve.png"),
    )
    _save_line_plot(
        epochs,
        [(val_hitrates, "Val HitRate@3", "green")],
        "HitRate@3",
        "Validation HitRate@3",
        os.path.join(MODEL_DIR, "hitrate_curve.png"),
    )
    _save_line_plot(
        epochs,
        [(learning_rates, "Learning Rate", "orange")],
        "LR",
        "Learning Rate Schedule",
        os.path.join(MODEL_DIR, "lr_curve.png"),
    )
    _save_line_plot(
        epochs,
        [(val_ndcgs, "Val NDCG@3", "purple")],
        "NDCG@3",
        "Validation NDCG@3",
        os.path.join(MODEL_DIR, "ndcg_curve.png"),
    )
    _save_line_plot(
        epochs,
        [(val_maps, "Val MAP@3", "red")],
        "MAP@3",
        "Validation MAP@3",
        os.path.join(MODEL_DIR, "map_curve.png"),
    )

    print("📊 Saved metric plots to model/")

    # Restore the best checkpoint before producing any predictions.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    # Validation predictions (saved together with the features).
    all_val_scores = _predict_row_scores(model, val_loader, device, "Predicting validation set")

    assert len(all_val_scores) == X_va.shape[0], \
        f"Mismatch: {len(all_val_scores)} scores vs {X_va.shape[0]} rows"

    val_df = X_va.with_columns([
        pl.Series("Id", X_va["Id"]),
        pl.Series("ranker_id", groups_va["ranker_id"]),
        pl.Series("score", all_val_scores),
        pl.Series("label", y_va)
    ])

    # Rank 1 = highest score within each ranker_id group.
    val_df = val_df.with_columns(
        pl.col("score")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
    val_df.write_csv(val_save_path)
    print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

    # Test predictions / submission.
    all_scores = _predict_row_scores(model, test_loader, device, "Predicting test set")

    assert len(all_scores) == X_te.shape[0], \
        f"Mismatch: {len(all_scores)} scores vs {X_te.shape[0]} rows in X_te"

    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": all_scores
    })

    submission = (
        submission_df
        .with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )
        .select(["Id", "ranker_id", "score", "selected"])
    )

    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
    submission.write_csv(submission_path)
    print(f"✅ Submission saved to {submission_path}")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Writing deeprec_validate.py


In [None]:
--- Epoch 1/10 ---
Validating:   0%|          | 0/2636 [00:00<?, ?it/s]/home/ionut/test_whisper/test2/deeprec_validate.py:514: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  lengths = torch.tensor(lengths).to(device)
Epoch 1: Train Loss=0.3553, Val Loss=0.2916, HitRate@3=0.4702, NDCG@3=0.3754, MAP@3=0.6230, LR=0.000084 (603.4s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 2/10 ---
Epoch 2: Train Loss=0.2888, Val Loss=0.2995, HitRate@3=0.4790, NDCG@3=0.3844, MAP@3=0.6299, LR=0.000228 (606.1s)

--- Epoch 3/10 ---
Epoch 3: Train Loss=0.2827, Val Loss=0.2797, HitRate@3=0.4871, NDCG@3=0.3905, MAP@3=0.6381, LR=0.000300 (605.5s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 4/10 ---
Epoch 4: Train Loss=0.2730, Val Loss=0.2689, HitRate@3=0.4972, NDCG@3=0.4001, MAP@3=0.6466, LR=0.000285 (605.4s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 5/10 ---
Epoch 5: Train Loss=0.2611, Val Loss=0.2590, HitRate@3=0.5090, NDCG@3=0.4130, MAP@3=0.6582, LR=0.000244 (606.7s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 6/10 ---
Epoch 6: Train Loss=0.2488, Val Loss=0.2531, HitRate@3=0.5133, NDCG@3=0.4112, MAP@3=0.6480, LR=0.000183 (606.2s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 7/10 ---
Epoch 7: Train Loss=0.2380, Val Loss=0.2465, HitRate@3=0.5169, NDCG@3=0.4165, MAP@3=0.6582, LR=0.000117 (605.7s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 8/10 ---
Epoch 8: Train Loss=0.2273, Val Loss=0.2419, HitRate@3=0.5184, NDCG@3=0.4189, MAP@3=0.6609, LR=0.000056 (607.1s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 9/10 ---
Epoch 9: Train Loss=0.2176, Val Loss=0.2353, HitRate@3=0.5224, NDCG@3=0.4229, MAP@3=0.6617, LR=0.000015 (606.9s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 10/10 ---
Epoch 10: Train Loss=0.2144, Val Loss=0.2350, HitRate@3=0.5216, NDCG@3=0.4224, MAP@3=0.6614, LR=0.000000 (605.8s)

**0.49242**

In [None]:
%%writefile deeprec_full_training.py
import os
import time
import random
import torch
import math
import torch.nn as nn
import torch.nn.functional as F_torch
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter
from torch.optim.lr_scheduler import OneCycleLR

class RankDataset(Dataset):
    """Dataset that serves one ``ranker_id`` group per item.

    Rows are bucketed by ``ranker_id`` in order of first appearance; item
    ``idx`` returns every row of the ``idx``-th group.

    Attributes set by ``__init__``:
        X_cat: feature name -> numpy array of category codes (one per row).
        X_num: 2-D numpy array of the numeric feature columns.
        y: numpy array of labels.
        groups: numpy array of ``ranker_id`` values.
        group_to_indices: group id -> list of row positions.
        group_keys: group ids in first-appearance order.
    """

    def __init__(
        self,
        X: "pl.DataFrame",
        y: "pl.Series",
        groups: "pl.DataFrame",
        cat_features: list[str],
        num_features: list[str],
    ):
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of groups (not rows)."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        ``x_cat`` maps each categorical feature to an int64 tensor of codes
        with negative codes clamped to 0, ``x_num`` is a float tensor of the
        numeric rows, ``y`` is a flat float tensor of labels and ``length``
        is the number of rows in the group.
        """
        inds = self.group_to_indices[self.group_keys[idx]]
        length = len(inds)

        # Vectorised clamp: one fancy-index plus np.maximum per feature
        # instead of a Python-level check for every single row.
        x_cat = {
            c: torch.from_numpy(
                np.maximum(self.X_cat[c][inds], 0).astype(np.int64)
            )
            for c in self.cat_features
        }

        x_num = torch.FloatTensor(self.X_num[inds])
        y = torch.FloatTensor(self.y[inds]).reshape(-1)

        return x_cat, x_num, y, length

def collate_fn(batch):
    """Zero-pad a list of group samples to a common length and stack them.

    Args:
        batch: Sequence of ``(x_cat, x_num, y, length)`` samples, where
            ``x_cat`` is a dict of 1-D tensors, ``x_num`` is 2-D and ``y``
            is 1-D.

    Returns:
        ``(padded_x_cat, padded_x_num, padded_y, lengths)``; every tensor has
        the batch as its first dimension and ``lengths`` is an int64 tensor
        holding the unpadded size of each sample.
    """
    pad = torch.nn.utils.rnn.pad_sequence
    cat_parts, num_parts, label_parts, sizes = zip(*batch)

    # pad_sequence right-pads with zeros up to the longest sample.
    padded_x_cat = {
        name: pad([part[name] for part in cat_parts], batch_first=True)
        for name in cat_parts[0]
    }
    padded_x_num = pad(list(num_parts), batch_first=True)
    padded_y = pad(list(label_parts), batch_first=True)
    lengths = torch.tensor(list(sizes), dtype=torch.long)

    return padded_x_cat, padded_x_num, padded_y, lengths

class SEModule(nn.Module):
    """Squeeze-and-excitation style channel gate for ``[B, N, C]`` inputs.

    The input is averaged over dim 1, passed through a bottleneck MLP ending
    in a sigmoid, and the resulting per-channel weights rescale the input.
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        bottleneck = channel // reduction
        stages = [
            nn.Linear(channel, bottleneck),
            nn.ReLU(inplace=True),
            nn.Linear(bottleneck, channel),
            nn.Sigmoid(),
        ]
        self.fc = nn.Sequential(*stages)

    def forward(self, x):
        """Return ``x`` scaled channel-wise; output has the shape of ``x``."""
        pooled = x.mean(dim=1)              # [B, C]
        gate = self.fc(pooled)[:, None, :]  # [B, 1, C], broadcast over N
        return x * gate


class DLRankerAttention(nn.Module):
    """Two-branch attention ranker that scores every item of a group.

    Categorical features are embedded, concatenated, projected to
    ``proj_dim`` and passed through a one-layer transformer encoder.  Numeric
    features are projected, encoded the same way, gated by ``SEModule`` and
    passed through a small feed-forward head.  The categorical branch then
    attends over the numeric branch (residual cross-attention), both branches
    are concatenated and refined by ``num_layers`` encoder layers, and an MLP
    produces one score per item.

    Note: the two branch encoders and the cross-attention always use 4 heads;
    ``num_heads`` only configures the final fused encoder layers.
    """

    def __init__(
        self,
        cat_dims,
        num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64,
    ):
        super().__init__()

        def encoder_layer(width, heads):
            # Shared recipe for every transformer encoder layer in the model.
            return nn.TransformerEncoderLayer(
                d_model=width,
                nhead=heads,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            )

        # Embedding width grows with log2 of the cardinality, kept in [4, 50].
        self.emb_dims = {}
        for name in cat_dims:
            width = round(6 * math.log2(cat_dims[name] + 1))
            self.emb_dims[name] = min(50, max(4, width))
        self.embeddings = nn.ModuleDict({
            name: nn.Embedding(cat_dims[name], self.emb_dims[name], padding_idx=0)
            for name in cat_dims
        })
        self.emb_dropout = nn.Dropout(emb_dropout)

        total_emb_dim = sum(self.emb_dims.values())
        print(f"Categorial emb dim sum: {total_emb_dim}, Numerical dim: {num_numeric_feats}")

        # Categorical branch.
        self.cat_proj = nn.Linear(total_emb_dim, proj_dim)
        self.cat_transformer = nn.TransformerEncoder(
            encoder_layer(proj_dim, 4),
            num_layers=1,
        )

        # Numeric branch.
        self.num_proj_in = nn.Linear(num_numeric_feats, proj_dim)
        self.num_transformer = nn.TransformerEncoder(
            encoder_layer(proj_dim, 4),
            num_layers=1,
        )
        self.num_se = SEModule(proj_dim)
        self.num_proj_out = nn.Sequential(
            nn.Linear(proj_dim, proj_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
        )

        # Cross-attention: categorical queries over numeric keys/values.
        self.cross_attn_cat2num = nn.MultiheadAttention(
            embed_dim=proj_dim,
            num_heads=4,
            dropout=dropout,
            batch_first=True,
        )

        # Fused stack operating on the concatenated [cat | num] features.
        fused_dim = 2 * proj_dim
        self.transformer_layers = nn.ModuleList(
            [encoder_layer(fused_dim, num_heads) for _ in range(num_layers)]
        )

        self.mlp = nn.Sequential(
            nn.Linear(fused_dim, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(64, 1),
        )

    def forward(self, x_cat: dict, x_num: torch.Tensor, attn_mask=None):
        """Score every item of every group in the batch.

        Args:
            x_cat: Feature name -> integer code tensor; keys must be
                embedding names.
            x_num: Numeric features, ``[B, N, F_num]``.
            attn_mask: Optional key-padding mask.  It is passed only to the
                final fused encoder layers (as ``src_key_padding_mask``); the
                branch encoders and the cross-attention run without a mask.

        Returns:
            Tensor of scores with shape ``[B, N]``.
        """
        _batch, _items, _ = x_num.shape  # fails early unless x_num is 3-D

        # Categorical branch -> [B, N, D]
        embedded = [self.embeddings[name](x_cat[name]) for name in x_cat]
        cat_branch = self.emb_dropout(torch.cat(embedded, dim=-1))
        cat_branch = self.cat_transformer(self.cat_proj(cat_branch))

        # Numeric branch -> [B, N, D]
        num_branch = self.num_transformer(self.num_proj_in(x_num))
        num_branch = self.num_proj_out(self.num_se(num_branch))

        # Residual cross-attention from the categorical to the numeric branch.
        attended, _ = self.cross_attn_cat2num(
            query=cat_branch, key=num_branch, value=num_branch
        )
        cat_branch = cat_branch + attended

        fused = torch.cat([cat_branch, num_branch], dim=-1)
        for block in self.transformer_layers:
            fused = block(fused, src_key_padding_mask=attn_mask)

        return self.mlp(fused).squeeze(-1)  # [B, N]

"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3):
    """Fraction of groups with a positive label among their top-``k`` scores.

    Args:
        scores: ``[B, N]`` padded score tensor.
        labels: ``[B, N]`` padded label tensor; values > 0 count as positive.
        lengths: Number of real (unpadded) items per group.
        k: Cut-off; groups shorter than ``k`` are evaluated on all items.

    Returns:
        Hit count divided by the batch size.
    """
    batch_size = scores.size(0)
    hits = 0
    for row_scores, row_labels, size in zip(scores, labels, lengths):
        # Only the first ``size`` entries are real items; the rest is padding.
        valid_scores = row_scores[:size]
        valid_labels = row_labels[:size]
        top_idx = torch.topk(valid_scores, min(size, k)).indices
        hits += (valid_labels[top_idx] > 0).any().float()
    return hits / batch_size

"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise logistic ranking loss summed per group, averaged over groups.

    For every ordered pair (i, j) inside a group with ``label_i > label_j``
    the term ``-logsigmoid(score_i - score_j)`` is added.  Padding beyond each
    group's length is ignored.

    Args:
        scores: ``[B, N]`` padded scores.
        labels: ``[B, N]`` padded labels.
        lengths: Number of real items per group.

    Returns:
        Sum of all pair terms divided by the batch size.
    """
    batch_size = scores.size(0)
    total_loss = 0.0

    for row_scores, row_labels, size in zip(scores, labels, lengths):
        s = row_scores[:size]
        y = row_labels[:size]

        # prefers[i, j] == 1 where item i should outrank item j.
        prefers = (y[:, None] > y[None, :]).float()
        margins = s[:, None] - s[None, :]

        pair_terms = -prefers * torch.nn.functional.logsigmoid(margins)
        total_loss += pair_terms.sum()

    return total_loss / batch_size

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """Margin-based pairwise loss restricted to pairs touching the label top-K.

    A pair (i, j) contributes when ``label_i > label_j`` and at least one of
    the two items is among the ``focus_topk`` highest labels of its group.
    Each pair is weighted by ``|label_i - label_j|`` and the result is the
    weighted mean over all contributing pairs in the batch.

    Args:
        scores: ``[B, N]`` padded scores.
        labels: ``[B, N]`` padded labels.
        lengths: Number of real items per group; groups with fewer than two
            items are skipped.
        temperature: Divisor applied to the (margin-shifted) score gaps.
        margin: Subtracted from each score gap when positive.
        focus_topk: Size of the label-based top-K set.

    Returns:
        Weighted mean pair loss, or a zero tensor (``requires_grad=True``)
        when no pair contributes.
    """
    batch_size = scores.size(0)
    loss_sum = 0.0
    weight_sum = 0

    for i in range(batch_size):
        size = lengths[i]
        if size < 2:
            continue

        s = scores[i][:size]
        y = labels[i][:size]

        # Mark the items holding the top-K ground-truth labels.
        in_topk = torch.zeros_like(y, dtype=torch.bool)
        in_topk[torch.topk(y, min(focus_topk, size)).indices] = True

        label_gap = y[:, None] - y[None, :]
        eligible = (label_gap > 0) & (in_topk[:, None] | in_topk[None, :])
        if eligible.sum() == 0:
            continue

        weights = (label_gap.abs() * eligible).float()

        score_gap = s[:, None] - s[None, :]
        if margin > 0.0:
            score_gap = score_gap - margin
        score_gap = score_gap / temperature

        loss_sum += (-F_torch.logsigmoid(score_gap) * weights).sum()
        weight_sum += weights.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean loss.

    Both the optimizer and the scheduler are stepped once per batch.

    Args:
        model: Network called as ``model(x_cat, x_num)``.
        loader: Iterable of ``(x_cat, x_num, y, lengths)`` batches.
        optimizer: Optimizer updating ``model``.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler.

    Returns:
        Sum of the per-batch losses divided by ``len(loader)``.
    """
    model.train()
    running_loss = 0.0

    progress = tqdm(loader, desc="Training", leave=False)
    for x_cat, x_num, y, lengths in progress:
        x_num = x_num.to(device)
        y = y.to(device)
        # Accept plain sequences as well as tensors for the group lengths.
        if isinstance(lengths, torch.Tensor):
            lengths = lengths.detach().clone().to(device)
        else:
            lengths = torch.tensor(lengths).detach().clone().to(device)
        x_cat = {name: tensor.to(device) for name, tensor in x_cat.items()}

        optimizer.zero_grad()
        batch_scores = model(x_cat, x_num)
        loss = xranknet_loss(
            batch_scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3
        )
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()

    return running_loss / len(loader)


def validate(model, loader, device):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: Network called as ``model(x_cat, x_num)``.
        loader: Iterable of ``(x_cat, x_num, y, lengths)`` batches.
        device: Device the batch tensors are moved to.

    Returns:
        ``(avg_loss, avg_hitrate)``: the xRankNet loss and HitRate@3, each
        averaged over batches with every batch weighted by its number of
        groups.
    """
    model.eval()
    total_hitrate = 0
    total_loss = 0
    count = 0

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):

            x_num, y = x_num.to(device), y.to(device)
            # ``lengths`` normally arrives as a tensor already; wrapping a
            # tensor in torch.tensor() copies it and emits a UserWarning, so
            # convert only when it is a plain sequence.
            if not isinstance(lengths, torch.Tensor):
                lengths = torch.tensor(lengths)
            lengths = lengths.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)

            # Weight by batch size so a smaller final batch is not over-counted.
            total_hitrate += hitrate * scores.size(0)
            total_loss += loss.item() * scores.size(0)
            count += scores.size(0)

    avg_hitrate = total_hitrate / count
    avg_loss = total_loss / count
    return avg_loss, avg_hitrate


def build_cat_dims(X, cat_features):
    """Return the vocabulary size of every categorical feature.

    The size is the column maximum (floored at 0) plus one, so codes
    ``0..max`` are all valid indices.

    Args:
        X: Column container supporting ``X[name].max()``.
        cat_features: Names of the categorical columns.

    Returns:
        Dict mapping feature name -> number of distinct code slots.
    """
    return {name: max(X[name].max(), 0) + 1 for name in cat_features}


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardize numeric columns using statistics of the training rows.

    Args:
        X: Frame holding the numeric columns.
        numeric_features: Names of the columns to standardize.
        train_mask: Boolean mask selecting the rows that supply the mean and
            standard deviation.

    Returns:
        Frame with each listed column replaced by ``(value - mean) / std``.
        A standard deviation that is not positive is replaced by 1.0 so
        constant columns do not cause a division by zero.
    """
    for col in numeric_features:
        values = X[col].to_numpy()
        train_vals = values[train_mask]
        mean = train_vals.mean()
        # Compute the deviation once; it is a full pass over the training rows.
        std = train_vals.std()
        if not std > 0:
            std = 1.0
        X = X.with_columns(pl.Series(col, (values - mean) / std))
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Return the feature-engineered frame, building and caching it if needed.

    When the cache file exists it is read as-is.  Otherwise train and test are
    concatenated (test gets a placeholder ``selected`` column of zeros), run
    through ``feature_engineering``, null-filled (0 for numeric columns,
    ``"missing"`` for string columns) and written to the cache.

    Args:
        data_dir: Directory containing the parquet files and the cache.
        train_file: Training parquet file name.
        test_file: Test parquet file name.
        cache_file: File name of the cached feature frame.

    Returns:
        ``(df, test_ids, test_rankers)`` where the last two are the ``Id`` and
        ``ranker_id`` columns of the raw test file.
    """
    cache_path = os.path.join(data_dir, cache_file)

    def read_raw(file_name):
        # Raw files carry a leftover pandas index column that is not a feature.
        return pl.read_parquet(os.path.join(data_dir, file_name)).drop("__index_level_0__")

    if not os.path.exists(cache_path):
        print("Cached file not found, processing raw data ...")

        train = read_raw(train_file)
        test = read_raw(test_file).with_columns(
            pl.lit(0, dtype=pl.Int64).alias("selected")
        )

        test_ids = test["Id"]
        test_rankers = test["ranker_id"]

        df = feature_engineering(pl.concat((train, test)), full=True)

        numeric_cols = df.select(pl.selectors.numeric()).columns
        string_cols = df.select(pl.selectors.string()).columns
        null_fills = [pl.col(c).fill_null(0) for c in numeric_cols]
        null_fills += [pl.col(c).fill_null("missing") for c in string_cols]
        df = df.with_columns(null_fills)

        os.makedirs(data_dir, exist_ok=True)
        df.write_parquet(cache_path)
        print(f"Saved feature-engineered data to {cache_path}")

        return df, test_ids, test_rankers

    print(f"Loading cached feature-engineered data from {cache_path} ...")
    df = pl.read_parquet(cache_path)

    # The ids still come from the raw test file, not from the cache.
    test = read_raw(test_file)
    return df, test["Id"], test["ranker_id"]


def prepare_data(data_dir):
    """Load the feature table and convert it into model-ready inputs.

    Categorical columns are replaced by zero-based dense-rank codes (nulls
    become -1) and numeric columns are standardized using the first
    ``train_size`` rows as the reference set.

    Args:
        data_dir: Directory forwarded to ``load_or_prepare_data``.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size, test_ids,
        test_rankers)``.
    """
    frame, test_ids, test_rankers = load_or_prepare_data(data_dir)

    # Grab the ids before feature selection so they can be re-attached to X.
    all_ids = frame["Id"]
    X, y, groups, cat_features, num_features = feature_selection(frame)

    row_count = X.shape[0]
    X = X.with_columns(pl.Series("Id", all_ids[:row_count]))

    # Dense rank starts at 1, hence the -1 shift to get zero-based codes.
    category_codes = [
        (pl.col(name).rank("dense") - 1).fill_null(-1).cast(pl.Int32)
        for name in cat_features
    ]
    X = X.with_columns(category_codes)

    # Rows [0, train_size) are the training part of the concatenated frame.
    train_size = 18145372
    is_train_row = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, is_train_row)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Patience counter for a metric where larger values are better.

    A step counts as an improvement only when the metric exceeds the best
    value seen so far by more than ``min_delta``.
    """

    def __init__(self, patience=3, min_delta=0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric`` and return True once patience is exhausted."""
        improved = metric > self.best + self.min_delta
        if improved:
            self.best, self.counter = metric, 0
        else:
            self.counter += 1
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scaled histogram of the dataset's group sizes.

    Args:
        dataset: Object exposing ``group_to_indices`` (group -> row indices).
        save_path: When truthy the figure is written there; otherwise it is
            shown interactively.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Distribution of Group Sizes")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def _predict_row_scores(model, loader, device, desc):
    """Score every row served by ``loader`` and return the scores in row order.

    The loader must iterate its dataset sequentially (``shuffle=False``) so the
    i-th group it yields is the i-th entry of ``dataset.group_to_indices``
    (RankDataset builds ``group_keys`` from that dict's key order).  Each
    group's scores are written back to that group's own row positions, so the
    result lines up with the source frame even when a group's rows are not
    stored contiguously.
    """
    group_rows = iter(loader.dataset.group_to_indices.values())
    score_chunks = []
    row_chunks = []

    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(loader, desc=desc):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)

            for i in range(scores.size(0)):
                n_items = int(lengths[i])
                # Drop the padded tail; keep only the group's real items.
                score_chunks.append(scores[i, :n_items].cpu().numpy())
                row_chunks.append(np.asarray(next(group_rows), dtype=np.int64))

    flat_scores = np.concatenate(score_chunks)
    flat_rows = np.concatenate(row_chunks)
    aligned = np.empty_like(flat_scores)
    aligned[flat_rows] = flat_scores
    return aligned


def main():
    """Train the ranker on all training rows and write the test submission.

    There is no validation split in this script: the model is trained for a
    fixed number of epochs, the checkpoint is overwritten after every epoch,
    and the final weights are used to rank the test groups.
    """
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Replace NaN with 0 in every Float64 column, in a single pass.
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    if float_cols:
        X = X.with_columns(
            [
                pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
                for col in float_cols
            ]
        )

    # Rows [0, n2) are training data, the remainder is the test set.
    n2 = train_size

    X_tr, X_te = X[:n2], X[n2:]
    y_tr, y_te = y[:n2], y[n2:]
    groups_tr, groups_te = groups[:n2], groups[n2:]

    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    num_epochs = 10
    learning_rate = 3e-4

    model = DLRankerAttention(
        cat_dims=cat_dims,
        num_numeric_feats=num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.05)

    # The scheduler is stepped once per batch inside train_one_epoch.
    scheduler = OneCycleLR(
        optimizer,
        max_lr=learning_rate,
        steps_per_epoch=len(train_loader),
        epochs=num_epochs,
        pct_start=0.3,
    )

    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
        current_lr = scheduler.get_last_lr()

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )

        # No validation here, so the checkpoint is simply refreshed each epoch.
        torch.save(model.state_dict(), model_path)
        print("💾 Checkpoint saved.")

    # Predict test set
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    all_scores = _predict_row_scores(model, test_loader, device, "Predicting test set")

    assert len(all_scores) == X_te.shape[0], \
        f"Mismatch: {len(all_scores)} scores vs {X_te.shape[0]} rows in X_te"

    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": all_scores
    })

    # Rank 1 = highest score within each ranker_id group.
    submission = (
        submission_df
        .with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )
        .select(["Id", "ranker_id", "selected"])
    )

    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.parquet")
    submission.write_parquet(submission_path)
    print(f"✅ Submission saved to {submission_path}")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Writing deeprec_full_training.py


In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter
from torch.optim.lr_scheduler import OneCycleLR

class RankDataset(Dataset):
    """Ranking dataset that yields one whole ``ranker_id`` group per item.

    Rows are bucketed by the value of ``groups["ranker_id"]``; groups are
    ordered by first appearance and the rows inside a group keep their
    original order.

    Args:
        X: Feature frame; must contain every column named in
            ``cat_features`` and ``num_features``.
        y: Per-row relevance labels, aligned with ``X``.
        groups: Frame with a ``ranker_id`` column, aligned with ``X``.
        cat_features: Names of the integer-encoded categorical columns.
        num_features: Names of the numeric columns.
    """

    def __init__(
        self,
        X: "pl.DataFrame",
        y: "pl.Series",
        groups: "pl.DataFrame",
        cat_features: list[str],
        num_features: list[str],
    ):
        self.cat_features = cat_features
        self.num_features = num_features
        # Keep plain NumPy copies so __getitem__ only does array indexing.
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # ranker_id -> list of row positions belonging to that group.
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of groups (not the number of rows)."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        Returns:
            x_cat: dict mapping each categorical feature to a LongTensor of
                shape ``[length]``.
            x_num: FloatTensor of shape ``[length, len(num_features)]``.
            y: FloatTensor of shape ``[length]``.
            length: number of rows in the group.
        """
        inds = self.group_to_indices[self.group_keys[idx]]

        # Vectorised clamp instead of a per-element Python loop.
        # NOTE: negative codes are mapped to 0, so they share index 0 with
        # any real category encoded as 0.
        x_cat = {
            c: torch.as_tensor(np.maximum(self.X_cat[c][inds], 0), dtype=torch.long)
            for c in self.cat_features
        }

        x_num = torch.as_tensor(self.X_num[inds], dtype=torch.float32)
        y = torch.as_tensor(self.y[inds], dtype=torch.float32).reshape(-1)

        return x_cat, x_num, y, len(inds)

def collate_fn(batch):
    """Collate variable-length groups into right-padded batch tensors.

    Args:
        batch: list of ``(x_cat, x_num, y, length)`` tuples where ``x_cat``
            is a dict of 1-D LongTensors, ``x_num`` is ``[length, F]`` and
            ``y`` is ``[length]``.

    Returns:
        ``(padded_x_cat, padded_x_num, padded_y, lengths)`` where every
        tensor is padded with zeros up to the longest group in the batch and
        ``lengths`` is a LongTensor with the true group sizes.

    Raises:
        ValueError: if ``batch`` is empty.
    """
    if not batch:
        raise ValueError("collate_fn received an empty batch")

    pad = torch.nn.utils.rnn.pad_sequence

    lengths = torch.tensor([item[3] for item in batch], dtype=torch.long)

    # Index 0 doubles as the padding id for every categorical feature.
    padded_x_cat = {
        key: pad([item[0][key] for item in batch], batch_first=True, padding_value=0)
        for key in batch[0][0]
    }
    padded_x_num = pad([item[1] for item in batch], batch_first=True, padding_value=0.0)
    padded_y = pad([item[2] for item in batch], batch_first=True, padding_value=0.0)

    return padded_x_cat, padded_x_num, padded_y, lengths

class SEModule(nn.Module):
    """Squeeze-and-excitation gate over the channel dimension.

    The input ``[B, N, C]`` is mean-pooled over ``N``, passed through a
    bottleneck MLP ending in a sigmoid, and the resulting ``[B, 1, C]`` gate
    rescales every item.
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel),
            nn.Sigmoid(),
        )

    def forward(self, x, mask=None):
        """Apply the channel gate.

        Args:
            x: tensor of shape ``[B, N, C]``.
            mask: optional bool tensor ``[B, N]`` with ``True`` at padded
                positions; those positions are excluded from the pooled
                mean. ``None`` pools over every position.

        Returns:
            Tensor with the same shape as ``x``.
        """
        if mask is None:
            pooled = x.mean(dim=1)
        else:
            pad = mask.bool().unsqueeze(-1)  # [B, N, 1]
            valid = (~pad).to(x.dtype).sum(dim=1).clamp(min=1.0)  # [B, 1]
            pooled = x.masked_fill(pad, 0.0).sum(dim=1) / valid
        w = self.fc(pooled).unsqueeze(1)  # [B, 1, C]
        return x * w


class DLRankerAttention(nn.Module):
    """Two-branch attention ranker producing one score per item in a group.

    Categorical embeddings and numeric features are encoded by separate
    single-layer transformer encoders, the categorical branch cross-attends
    to the numeric branch, both are concatenated, passed through the fusion
    transformer layers and an MLP head.

    Args:
        cat_dims: mapping feature name -> number of embedding rows.
        num_numeric_feats: width of the numeric feature vector.
        emb_dropout: dropout applied to the concatenated embeddings.
        dropout: dropout used in the attention layers and the MLP.
        num_heads: attention heads of the fusion layers.
        num_layers: number of fusion layers.
        dim_feedforward: feed-forward width of every transformer layer.
        proj_dim: width of each branch (fusion width is ``2 * proj_dim``).

    Note:
        The branch encoders and the cross-attention use a fixed 4 heads and
        a single layer; ``num_heads`` / ``num_layers`` only configure the
        fusion stack.
    """

    def __init__(
        self,
        cat_dims,
        num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64,
    ):
        super().__init__()

        # Embedding width grows with log2 of the cardinality, within [4, 50].
        self.emb_dims = {
            f: min(50, max(4, round(6 * math.log2(cat_dims[f] + 1)))) for f in cat_dims
        }
        self.embeddings = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], self.emb_dims[f], padding_idx=0)
            for f in cat_dims
        })
        self.emb_dropout = nn.Dropout(emb_dropout)

        total_emb_dim = sum(self.emb_dims.values())
        print(f"Categorial emb dim sum: {total_emb_dim}, Numerical dim: {num_numeric_feats}")

        self.cat_proj = nn.Linear(total_emb_dim, proj_dim)
        self.cat_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=4,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=1,
        )

        self.num_proj_in = nn.Linear(num_numeric_feats, proj_dim)
        self.num_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=4,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=1,
        )
        self.num_se = SEModule(proj_dim)
        self.num_proj_out = nn.Sequential(
            nn.Linear(proj_dim, proj_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
        )

        # Cross-attention: cat attends to num
        self.cross_attn_cat2num = nn.MultiheadAttention(
            embed_dim=proj_dim,
            num_heads=4,
            dropout=dropout,
            batch_first=True,
        )

        self.transformer_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=2 * proj_dim,
                nhead=num_heads,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            )
            for _ in range(num_layers)
        ])

        self.mlp = nn.Sequential(
            nn.Linear(2 * proj_dim, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(64, 1),
        )

    def forward(self, x_cat: dict, x_num: torch.Tensor, attn_mask=None):
        """Score every item of every group in the batch.

        Args:
            x_cat: dict feature name -> LongTensor ``[B, N]``; must contain
                every feature the model was built with.
            x_num: FloatTensor ``[B, N, F_num]``.
            attn_mask: optional bool tensor ``[B, N]`` with ``True`` at
                padded positions. When given it is applied to every
                attention operation and to the SE pooling, so the scores of
                real items do not depend on how much padding the batch
                carries. ``None`` treats every position as real.

        Returns:
            FloatTensor ``[B, N]`` of scores (entries at padded positions are
            meaningless).
        """
        if attn_mask is not None:
            attn_mask = attn_mask.to(device=x_num.device, dtype=torch.bool)

        # Categorical branch. Iterate in the module's own feature order so
        # the concatenation layout never depends on the input dict order.
        cat_embs = torch.cat(
            [self.embeddings[f](x_cat[f]) for f in self.embeddings], dim=-1
        )
        cat_feat = self.cat_proj(self.emb_dropout(cat_embs))  # [B, N, D]
        cat_feat = self.cat_transformer(cat_feat, src_key_padding_mask=attn_mask)

        # Numeric branch.
        num_feat = self.num_proj_in(x_num)
        num_feat = self.num_transformer(num_feat, src_key_padding_mask=attn_mask)
        num_feat = self.num_se(num_feat, attn_mask)
        num_feat = self.num_proj_out(num_feat)

        # Cross-attention: categorical features attend over numeric ones.
        cat_feat_attn, _ = self.cross_attn_cat2num(
            query=cat_feat,
            key=num_feat,
            value=num_feat,
            key_padding_mask=attn_mask,
        )
        cat_feat = cat_feat + cat_feat_attn  # residual connection

        # Fusion over the concatenated branches.
        x = torch.cat([cat_feat, num_feat], dim=-1)
        for layer in self.transformer_layers:
            x = layer(x, src_key_padding_mask=attn_mask)

        return self.mlp(x).squeeze(-1)  # [B, N]



"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Fraction of eligible groups with a positive label among the top-k scores.

    Args:
        scores: ``[B, N]`` predicted scores (right-padded).
        labels: ``[B, N]`` relevance labels; ``> 0`` counts as positive.
        lengths: true size of each group.
        k: cut-off; clipped to the group size.
        min_group_size: groups smaller than this are ignored.

    Returns:
        0-dim tensor with the hit rate, or a zero tensor on ``scores.device``
        when no group is eligible.
    """
    n_hits = 0
    n_groups = 0
    for row in range(scores.size(0)):
        size = lengths[row]
        if size >= min_group_size:
            cutoff = min(size, k)
            group_scores = scores[row][:size]
            group_labels = labels[row][:size]
            best = torch.topk(group_scores, cutoff)[1]
            n_hits += (group_labels[best] > 0).any().float()
            n_groups += 1

    if n_groups == 0:
        return torch.tensor(0.0, device=scores.device)
    return n_hits / n_groups


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@k (linear gain) over groups with at least ``min_group_size`` items.

    Args:
        scores: ``[B, N]`` predicted scores (right-padded).
        labels: ``[B, N]`` relevance labels, used directly as gains.
        lengths: true size of each group.
        k: cut-off; clipped to the group size.
        min_group_size: groups smaller than this are ignored.

    Returns:
        Always a 0-dim tensor. Groups whose ideal DCG is zero contribute 0;
        with no eligible group the result is a zero tensor on
        ``scores.device``.
    """
    # Accumulate in a tensor so the return type never degrades to a Python
    # float (callers invoke ``.item()`` on the result).
    total_ndcg = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        size = int(lengths[i])
        if size < min_group_size:
            continue

        cutoff = min(k, size)
        s = scores[i, :size]
        y = labels[i, :size]

        # Positions 1..cutoff are discounted by log2(2..cutoff + 1).
        discounts = torch.log2(torch.arange(2, 2 + cutoff, device=y.device).float())
        dcg = (y[torch.topk(s, cutoff).indices] / discounts).sum()
        idcg = (torch.topk(y, cutoff).values / discounts).sum()

        if idcg > 0:
            total_ndcg = total_ndcg + (dcg / idcg).to(total_ndcg.device)
        valid_count += 1

    if valid_count == 0:
        return total_ndcg
    return total_ndcg / valid_count


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision at ``k`` over eligible groups.

    For each group the average precision is the mean of precision@rank over
    the ranks (within the top-k) that hold a positive label. A group with no
    positive in its top-k contributes an AP of 0 and still counts in the
    denominator; skipping such groups would report MAP only over the groups
    that were already hits and overstate the metric.

    Args:
        scores: ``[B, N]`` predicted scores (right-padded).
        labels: ``[B, N]`` relevance labels; ``> 0`` counts as positive.
        lengths: true size of each group.
        k: cut-off; clipped to the group size.
        min_group_size: groups smaller than this are ignored.

    Returns:
        0-dim tensor; zero on ``scores.device`` when no group is eligible.
    """
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        size = int(lengths[i])
        if size < min_group_size:
            continue
        valid_count += 1

        cutoff = min(k, size)
        top = torch.topk(scores[i, :size], cutoff).indices
        hits = (labels[i, :size][top] > 0).float()
        n_hits = hits.sum()
        if n_hits == 0:
            continue  # AP = 0 for this group

        ranks = torch.arange(1, cutoff + 1, device=hits.device, dtype=hits.dtype)
        precision_at_rank = hits.cumsum(0) / ranks
        ap = (precision_at_rank * hits).sum() / n_hits
        total_ap = total_ap + ap.to(total_ap.device)

    if valid_count == 0:
        return total_ap
    return total_ap / valid_count



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise RankNet loss summed per group and averaged over the batch.

    For every ordered pair ``(i, j)`` inside a group with
    ``label_i > label_j`` the term ``-logsigmoid(score_i - score_j)`` is
    added. Padded positions beyond each group's length are ignored.

    Args:
        scores: ``[B, N]`` predicted scores.
        labels: ``[B, N]`` relevance labels.
        lengths: true size of each group.

    Returns:
        Sum of pair losses divided by the batch size.
    """
    n_groups = scores.size(0)
    loss_sum = 0.0

    for row_scores, row_labels, size in zip(scores, labels, lengths):
        s = row_scores[:size]
        y = row_labels[:size]

        # prefers[i, j] == 1 when item i should rank above item j.
        prefers = (y[:, None] > y[None, :]).float()
        log_prob = torch.nn.functional.logsigmoid(s[:, None] - s[None, :])

        loss_sum += (-prefers * log_prob).sum()

    return loss_sum / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """
    Margin/temperature RankNet variant restricted to pairs touching the
    label-based top-K.

    A pair ``(i, j)`` contributes when ``label_i > label_j`` and at least one
    of the two items is among the ``focus_topk`` highest labels of its group.
    Each pair is weighted by ``|label_i - label_j|`` and the total is divided
    by the sum of those weights across the batch.

    Args:
        scores: ``[B, N]`` predicted scores.
        labels: ``[B, N]`` relevance labels.
        lengths: true size of each group; groups shorter than 2 are skipped.
        temperature: divisor applied to the (margin-shifted) score gap.
        margin: subtracted from the score gap when positive.
        focus_topk: number of top-labelled items that anchor valid pairs.

    Returns:
        Weighted mean pair loss, or a zero tensor with ``requires_grad=True``
        when no pair contributes.
    """
    loss_sum = 0.0
    weight_sum = 0

    for row in range(scores.size(0)):
        size = lengths[row]
        if size < 2:
            continue

        s = scores[row][:size]
        y = labels[row][:size]

        # Mark the items holding the highest ground-truth labels.
        in_topk = torch.zeros_like(y, dtype=torch.bool)
        in_topk[torch.topk(y, min(focus_topk, size))[1]] = True

        gap = y[:, None] - y[None, :]
        eligible = (gap > 0) & (in_topk[:, None] | in_topk[None, :])
        if eligible.sum() == 0:
            continue

        weights = (gap.abs() * eligible).float()

        logits = s[:, None] - s[None, :]
        if margin > 0.0:
            logits = logits - margin
        logits = logits / temperature

        loss_sum += (-F.logsigmoid(logits) * weights).sum()
        weight_sum += weights.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    The scheduler is stepped once per batch, right after the optimizer.

    Args:
        model: ranker called as ``model(x_cat, x_num)``.
        loader: yields ``(x_cat, x_num, y, lengths)`` batches.
        optimizer: optimizer over the model parameters.
        device: device the batch tensors are moved to.
        scheduler: per-step learning-rate scheduler.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0.0

    progress = tqdm(loader, desc="Training", leave=False)
    for cat_batch, num_batch, targets, group_sizes in progress:
        num_batch = num_batch.to(device)
        targets = targets.to(device)

        # Accept both a tensor and a plain sequence of group sizes.
        if not isinstance(group_sizes, torch.Tensor):
            group_sizes = torch.tensor(group_sizes)
        group_sizes = group_sizes.detach().clone().to(device)

        cat_batch = {name: column.to(device) for name, column in cat_batch.items()}

        optimizer.zero_grad()
        predictions = model(cat_batch, num_batch)
        batch_loss = xranknet_loss(
            predictions, targets, group_sizes, temperature=1.0, margin=0.2, focus_topk=3
        )
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    return running_loss / len(loader)


def validate(model, loader, device, min_group_size=10):
    """Evaluate the model on ``loader``.

    Ranking metrics are averaged over the groups that have at least
    ``min_group_size`` items: every batch-level metric is weighted by the
    number of such groups in the batch, so batches containing few or no
    eligible groups no longer drag the average down as if they were misses.
    The loss keeps its original weighting by batch size.

    NOTE(review): the weighting is exact only when a metric helper averages
    over every eligible group in the batch — confirm this for ``map_at_k``.

    Args:
        model: ranker called as ``model(x_cat, x_num)``.
        loader: yields ``(x_cat, x_num, y, lengths)`` batches.
        device: device the batch tensors are moved to.
        min_group_size: minimum group size for the ranking metrics.

    Returns:
        ``(loss, hitrate@3, ndcg@3, map@3)`` — ``loss`` is a float, the three
        metrics are 0-dim tensors.
    """
    model.eval()
    hitrate_sum = torch.zeros((), device=device)
    ndcg_sum = torch.zeros((), device=device)
    map_sum = torch.zeros((), device=device)
    total_loss = 0.0
    count = 0          # groups seen (loss weighting)
    ranked_count = 0   # groups large enough to be scored by the metrics

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):
            x_num, y = x_num.to(device), y.to(device)
            # as_tensor avoids the copy-construct UserWarning that
            # torch.tensor(tensor) raises when lengths is already a tensor.
            lengths = torch.as_tensor(lengths).to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            ndcg = ndcg_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            map3 = map_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)

            n_ranked = int((lengths >= min_group_size).sum())
            if n_ranked > 0:
                hitrate_sum += torch.as_tensor(hitrate, device=device) * n_ranked
                ndcg_sum += torch.as_tensor(ndcg, device=device) * n_ranked
                map_sum += torch.as_tensor(map3, device=device) * n_ranked
                ranked_count += n_ranked

            batch_size = scores.size(0)
            total_loss += loss.item() * batch_size
            count += batch_size

    denom = max(ranked_count, 1)  # metrics stay 0 when nothing was eligible
    return total_loss / count, hitrate_sum / denom, ndcg_sum / denom, map_sum / denom


def build_cat_dims(X, cat_features):
    """Return ``{feature: embedding table size}`` for the categorical columns.

    The size is the column maximum plus one; a negative maximum is treated
    as 0, so every feature gets at least one row.
    """
    return {name: max(X[name].max(), 0) + 1 for name in cat_features}


def normalize_numeric_features(X, numeric_features, train_mask):
    """Z-score the numeric columns using statistics of the training rows only.

    Mean and standard deviation are computed over the finite training values
    of each column, so a single NaN/inf no longer turns the whole column into
    NaN. Non-finite entries themselves are left as they are for the caller to
    handle. A column with zero variance, or with no finite training value,
    is only shifted (std treated as 1, mean as 0 when undefined).

    Args:
        X: frame holding the columns to normalise.
        numeric_features: names of the columns to normalise.
        train_mask: boolean array selecting the training rows.

    Returns:
        The frame with each listed column replaced by its normalised values.
    """
    for col in numeric_features:
        values = X[col].to_numpy()
        train_vals = np.asarray(values[train_mask], dtype=np.float64)
        finite_vals = train_vals[np.isfinite(train_vals)]

        if finite_vals.size:
            mean = finite_vals.mean()
            std = finite_vals.std()
        else:
            mean, std = 0.0, 1.0
        if not std > 0:
            std = 1.0

        # Replace column by column to keep peak memory at one extra column.
        X = X.with_columns(pl.Series(col, (values - mean) / std).alias(col))
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Load the feature-engineered frame, building and caching it if absent.

    When ``data_dir/cache_file`` exists it is read as is. Otherwise the raw
    train and test parquet files are concatenated (the test part receives a
    constant ``selected`` column of 0), passed through
    ``feature_engineering``, null-filled (0 for numeric, ``"missing"`` for
    string columns) and written to the cache path.

    Returns:
        ``(df, test_ids, test_rankers)`` where the last two are the ``Id``
        and ``ranker_id`` columns of the raw test file.
    """
    cache_path = os.path.join(data_dir, cache_file)
    test_path = os.path.join(data_dir, test_file)

    # Fast path: the engineered frame is already on disk.
    if os.path.exists(cache_path):
        print(f"Loading cached feature-engineered data from {cache_path} ...")
        df = pl.read_parquet(cache_path)
        raw_test = pl.read_parquet(test_path).drop("__index_level_0__")
        return df, raw_test["Id"], raw_test["ranker_id"]

    print("Cached file not found, processing raw data ...")

    raw_train = pl.read_parquet(os.path.join(data_dir, train_file)).drop("__index_level_0__")
    raw_test = pl.read_parquet(test_path).drop("__index_level_0__")
    # The test rows need a label column so they can be stacked under train.
    raw_test = raw_test.with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))

    test_ids = raw_test["Id"]
    test_rankers = raw_test["ranker_id"]

    df = feature_engineering(pl.concat((raw_train, raw_test)), full=True)

    numeric_fill = [
        pl.col(name).fill_null(0)
        for name in df.select(pl.selectors.numeric()).columns
    ]
    string_fill = [
        pl.col(name).fill_null("missing")
        for name in df.select(pl.selectors.string()).columns
    ]
    df = df.with_columns(numeric_fill + string_fill)

    os.makedirs(data_dir, exist_ok=True)
    df.write_parquet(cache_path)
    print(f"Saved feature-engineered data to {cache_path}")

    return df, test_ids, test_rankers


def prepare_data(data_dir, train_size=18145372):
    """Build the model-ready feature frame and its metadata.

    Steps: load (or build) the engineered frame, run ``feature_selection``,
    re-attach the ``Id`` column, dense-rank every categorical column into
    integer codes (nulls become -1) and z-score the numeric columns with
    statistics of the first ``train_size`` rows.

    Args:
        data_dir: directory holding the parquet files / cache.
        train_size: number of leading rows that belong to the training set;
            the remaining rows are the test set. Defaults to the size of the
            competition training file used by this script.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size, test_ids,
        test_rankers)``.

    Raises:
        ValueError: if the frame has fewer than ``train_size`` rows, since
            the train/test split would then be meaningless.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    if X.height < train_size:
        raise ValueError(
            f"train_size={train_size} exceeds the number of available rows ({X.height})"
        )

    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    # Dense rank gives contiguous codes starting at 0; nulls are marked -1.
    X = X.with_columns(
        [
            (pl.col(c).rank("dense") - 1).fill_null(-1).cast(pl.Int32)
            for c in cat_features
        ]
    )

    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Patience-based stopper for a metric where larger is better.

    ``step`` returns ``True`` once the metric has failed to exceed the best
    value by more than ``min_delta`` for ``patience`` consecutive calls.
    """

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience, self.min_delta = patience, min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric`` and report whether training should stop."""
        improved = metric > self.best + self.min_delta
        if improved:
            self.best, self.counter = metric, 0
        else:
            self.counter += 1
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scale histogram of the group sizes of ``dataset``.

    Args:
        dataset: object exposing ``group_to_indices`` (group -> row indices).
        save_path: file to write the figure to; when falsy the figure is
            shown interactively instead.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Distribution of Group Sizes")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def _predict_scores(model, loader, device, desc):
    """Score every row of ``loader.dataset`` and return them in row order.

    The loader must iterate the dataset sequentially (``shuffle=False``) so
    that the n-th group produced corresponds to ``dataset.group_keys[n]``.
    Each group's scores are written back at the row positions recorded in
    ``dataset.group_to_indices``, so the result lines up with the source
    frame even when the rows of a group are not contiguous.
    """
    dataset = loader.dataset
    row_scores = np.empty(len(dataset.y), dtype=np.float32)
    group_pos = 0

    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(loader, desc=desc):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num).cpu().numpy()

            for row, size in enumerate(lengths.tolist()):
                rows = dataset.group_to_indices[dataset.group_keys[group_pos]]
                row_scores[rows] = scores[row, :size]
                group_pos += 1

    if group_pos != len(dataset.group_keys):
        raise RuntimeError(
            f"Scored {group_pos} groups but the dataset has {len(dataset.group_keys)}"
        )
    return row_scores


def _save_curve(path, epochs, curves, ylabel, title):
    """Save a per-epoch line plot; ``curves`` holds ``(values, label, color)``."""
    plt.figure()
    for values, label, color in curves:
        style = {"color": color} if color else {}
        plt.plot(epochs, values, label=label, **style)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.savefig(path)
    plt.close()


def main():
    """Train the ranker with a hold-out split, then score validation and test.

    Side effects: writes the best checkpoint, metric plots, a validation
    group summary and full validation predictions under ``model/``, and the
    test ranking under ``submission/``.
    """
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    best_val_loss = float("inf")
    best_hitrate = 0.0
    checkpoint_saved = False
    model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_losses = []
    val_losses = []
    val_hitrates = []
    val_ndcgs = []
    val_maps = []
    learning_rates = []

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    early_stopper = EarlyStopping(patience=3, min_delta=0.001)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Replace NaN in float columns with 0 (one column at a time to limit
    # peak memory).
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    for col in float_cols:
        X = X.with_columns(
            pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
        )

    # Rows [0, n1) train, [n1, n2) validation, [n2, end) test.
    n1 = 16487352
    # n1 = 17487300
    n2 = train_size

    X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
    y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
    groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

    # Inspect group sizes in validation set
    val_rankers = groups_va["ranker_id"].to_numpy()
    val_group_counts = Counter(val_rankers)

    print("\n📋 Validation group statistics:")
    # Save detailed group info to file
    group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
    with open(group_info_path, "w") as f:
        f.write(f"Total groups: {len(val_group_counts)}\n\n")
        f.write("GroupID\tSize\tStartIdx\tEndIdx\n")

        group_to_rows = defaultdict(list)
        for i, g in enumerate(val_rankers):
            group_to_rows[g].append(i)

        for group_id, indices in sorted(group_to_rows.items()):
            f.write(f"{group_id}\t{len(indices)}\t{indices[0]}\t{indices[-1]}\n")

    print(f"📁 Saved validation group info to {group_info_path}")

    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    # Validation/test loaders must stay unshuffled: _predict_scores relies
    # on the sequential group order.
    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    num_epochs = 10
    learning_rate = 3e-4

    model = DLRankerAttention(
        cat_dims=cat_dims,
        num_numeric_feats=num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    scheduler = OneCycleLR(
        optimizer,
        max_lr=learning_rate,
        steps_per_epoch=len(train_loader),
        epochs=num_epochs,
        pct_start=0.3,
    )

    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
        val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_hitrates.append(val_hitrate.item())
        val_ndcgs.append(val_ndcg.item())
        val_maps.append(val_map.item())

        current_lr = scheduler.get_last_lr()
        learning_rates.append(current_lr[0])

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"Val Loss={val_loss:.4f}, "
            f"HitRate@3={val_hitrate:.4f}, "
            f"NDCG@3={val_ndcg:.4f}, "
            f"MAP@3={val_map:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )

        # A checkpoint is kept only when both criteria improve together.
        if val_loss < best_val_loss and val_hitrate > best_hitrate:
            best_val_loss = val_loss
            best_hitrate = val_hitrate
            torch.save(model.state_dict(), model_path)
            checkpoint_saved = True
            print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

        # The stopper maximises its metric, hence the negated loss.
        if early_stopper.step(-val_loss):
            print("⛔ Early stopping triggered.")
            break

    if not checkpoint_saved:
        # Without this, the load below would fail or silently pick up a
        # checkpoint left behind by a previous run.
        torch.save(model.state_dict(), model_path)
        print("⚠️ No epoch met the checkpoint criterion; saved final weights instead.")

    # Plot metrics
    epochs = list(range(1, len(train_losses) + 1))

    _save_curve(
        os.path.join(MODEL_DIR, "loss_curve.png"),
        epochs,
        [(train_losses, "Train Loss", None), (val_losses, "Val Loss", None)],
        "Loss",
        "Training & Validation Loss",
    )
    _save_curve(
        os.path.join(MODEL_DIR, "hitrate_curve.png"),
        epochs,
        [(val_hitrates, "Val HitRate@3", "green")],
        "HitRate@3",
        "Validation HitRate@3",
    )
    _save_curve(
        os.path.join(MODEL_DIR, "lr_curve.png"),
        epochs,
        [(learning_rates, "Learning Rate", "orange")],
        "LR",
        "Learning Rate Schedule",
    )
    _save_curve(
        os.path.join(MODEL_DIR, "ndcg_curve.png"),
        epochs,
        [(val_ndcgs, "Val NDCG@3", "purple")],
        "NDCG@3",
        "Validation NDCG@3",
    )
    _save_curve(
        os.path.join(MODEL_DIR, "map_curve.png"),
        epochs,
        [(val_maps, "Val MAP@3", "red")],
        "MAP@3",
        "Validation MAP@3",
    )

    print("📊 Saved metric plots to model/")

    # Reload the best checkpoint before producing any prediction.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    # Validation predictions (saved together with the features).
    val_ids = X_va["Id"]
    val_rankers = groups_va["ranker_id"]
    val_labels = y_va

    all_val_scores = _predict_scores(model, val_loader, device, "Predicting validation set")

    val_df = X_va.with_columns([
        pl.Series("Id", val_ids),
        pl.Series("ranker_id", val_rankers),
        pl.Series("score", all_val_scores),
        pl.Series("label", val_labels)
    ])

    val_df = val_df.with_columns(
        pl.col("score")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
    val_df.write_csv(val_save_path)
    print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

    # Test predictions.
    all_scores = _predict_scores(model, test_loader, device, "Predicting test set")

    if len(all_scores) != len(test_ids):
        raise ValueError(
            f"Mismatch: {len(all_scores)} scores vs {len(test_ids)} test Ids"
        )

    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": all_scores
    })

    submission = (
        submission_df
        .with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )
        .select(["Id", "ranker_id", "score", "selected"])
    )

    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
    submission.write_csv(submission_path)
    print(f"✅ Submission saved to {submission_path}")


# Run the training/validation pipeline only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()

In [None]:
--- Epoch 1/10 ---
Validating:   0%|          | 0/2636 [00:00<?, ?it/s]/home/ionut/test_whisper/test2/deeprec_validate.py:514: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  lengths = torch.tensor(lengths).to(device)
Epoch 1: Train Loss=0.3580, Val Loss=0.2927, HitRate@3=0.4690, NDCG@3=0.3748, MAP@3=0.6188, LR=0.000084 (603.3s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 2/10 ---
Epoch 2: Train Loss=0.2948, Val Loss=0.2915, HitRate@3=0.4916, NDCG@3=0.3909, MAP@3=0.6313, LR=0.000228 (602.0s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 3/10 ---
Epoch 3: Train Loss=0.2844, Val Loss=0.2868, HitRate@3=0.4795, NDCG@3=0.3840, MAP@3=0.6288, LR=0.000300 (602.7s)

--- Epoch 4/10 ---
Epoch 4: Train Loss=0.2730, Val Loss=0.2706, HitRate@3=0.5048, NDCG@3=0.4039, MAP@3=0.6460, LR=0.000285 (602.9s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 5/10 ---
Epoch 5: Train Loss=0.2602, Val Loss=0.2665, HitRate@3=0.5052, NDCG@3=0.4054, MAP@3=0.6491, LR=0.000244 (604.2s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 6/10 ---
Epoch 6: Train Loss=0.2494, Val Loss=0.2492, HitRate@3=0.5137, NDCG@3=0.4133, MAP@3=0.6567, LR=0.000183 (603.5s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 7/10 ---
Epoch 7: Train Loss=0.2389, Val Loss=0.2436, HitRate@3=0.5102, NDCG@3=0.4138, MAP@3=0.6654, LR=0.000117 (603.3s)

--- Epoch 8/10 ---
Epoch 8: Train Loss=0.2268, Val Loss=0.2384, HitRate@3=0.5139, NDCG@3=0.4175, MAP@3=0.6631, LR=0.000056 (603.8s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 9/10 ---
Epoch 9: Train Loss=0.2170, Val Loss=0.2367, HitRate@3=0.5182, NDCG@3=0.4185, MAP@3=0.6599, LR=0.000015 (605.2s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 10/10 ---
Epoch 10: Train Loss=0.2123, Val Loss=0.2378, HitRate@3=0.5173, NDCG@3=0.4195, MAP@3=0.6624, LR=0.000000 (602.7s)

In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter
from torch.optim.lr_scheduler import OneCycleLR

# Seed Python's, NumPy's and PyTorch's random generators (plus every CUDA
# device when one is available) with the same fixed value.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

class RankDataset(Dataset):
    """Ranking dataset that yields one whole ``ranker_id`` group per item.

    Rows are bucketed by the value of ``groups["ranker_id"]``; groups are
    ordered by first appearance and the rows inside a group keep their
    original order.

    Args:
        X: Feature frame; must contain every column named in
            ``cat_features`` and ``num_features``.
        y: Per-row relevance labels, aligned with ``X``.
        groups: Frame with a ``ranker_id`` column, aligned with ``X``.
        cat_features: Names of the integer-encoded categorical columns.
        num_features: Names of the numeric columns.
    """

    def __init__(
        self,
        X: "pl.DataFrame",
        y: "pl.Series",
        groups: "pl.DataFrame",
        cat_features: list[str],
        num_features: list[str],
    ):
        self.cat_features = cat_features
        self.num_features = num_features
        # Keep plain NumPy copies so __getitem__ only does array indexing.
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # ranker_id -> list of row positions belonging to that group.
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of groups (not the number of rows)."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        Returns:
            x_cat: dict mapping each categorical feature to a LongTensor of
                shape ``[length]``.
            x_num: FloatTensor of shape ``[length, len(num_features)]``.
            y: FloatTensor of shape ``[length]``.
            length: number of rows in the group.
        """
        inds = self.group_to_indices[self.group_keys[idx]]

        # Vectorised clamp instead of a per-element Python loop.
        # NOTE: negative codes are mapped to 0, so they share index 0 with
        # any real category encoded as 0.
        x_cat = {
            c: torch.as_tensor(np.maximum(self.X_cat[c][inds], 0), dtype=torch.long)
            for c in self.cat_features
        }

        x_num = torch.as_tensor(self.X_num[inds], dtype=torch.float32)
        y = torch.as_tensor(self.y[inds], dtype=torch.float32).reshape(-1)

        return x_cat, x_num, y, len(inds)

def collate_fn(batch):
    """Zero-pad a list of variable-length groups into dense batch tensors.

    Args:
        batch: List of ``(x_cat, x_num, y, length)`` items; within one item
            every tensor has ``length`` entries along its first axis.

    Returns:
        ``(padded_x_cat, padded_x_num, padded_y, lengths)`` where every tensor
        is padded with zeros up to the longest group in the batch and
        ``lengths`` is an int64 tensor holding the original group sizes.
    """
    cat_parts, num_parts, label_parts, sizes = zip(*batch)
    pad = nn.utils.rnn.pad_sequence

    padded_x_cat = {
        name: pad([part[name] for part in cat_parts], batch_first=True)
        for name in cat_parts[0].keys()
    }
    padded_x_num = pad(list(num_parts), batch_first=True)
    padded_y = pad(list(label_parts), batch_first=True)
    lengths = torch.tensor(list(sizes), dtype=torch.long)

    return padded_x_cat, padded_x_num, padded_y, lengths

class SEModule(nn.Module):
    """Squeeze-and-excitation style channel gate for ``[B, N, C]`` inputs.

    The input is averaged over axis 1, passed through a two-layer bottleneck
    ending in a sigmoid, and the resulting per-channel weights rescale every
    position of the input.
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        bottleneck = channel // reduction
        stages = [
            nn.Linear(channel, bottleneck),
            nn.ReLU(inplace=True),
            nn.Linear(bottleneck, channel),
            nn.Sigmoid(),
        ]
        self.fc = nn.Sequential(*stages)

    def forward(self, x):
        """Return ``x`` scaled by channel weights computed from its mean over axis 1."""
        pooled = x.mean(dim=1)                           # [B, C]
        channel_gate = torch.unsqueeze(self.fc(pooled), 1)  # [B, 1, C]
        return x * channel_gate


class FiLM(nn.Module):
    """Feature-wise linear modulation: ``gamma(cond) * x + beta(cond)``."""

    def __init__(self, feature_dim):
        super().__init__()
        # Created in this order (gamma, then beta) to keep seeded initialisation stable.
        for name in ("gamma", "beta"):
            setattr(self, name, nn.Linear(feature_dim, feature_dim))

    def forward(self, x, cond):
        """Scale and shift ``x`` with linear projections of ``cond``."""
        scale, shift = self.gamma(cond), self.beta(cond)
        return scale * x + shift


class DLRankerAttention(nn.Module):
    """List-wise ranker that scores every item of a group jointly.

    Categorical embeddings and numeric features are encoded by separate
    single-layer transformer encoders, fused (FiLM, then cross-attention from
    the categorical stream onto the numeric one), concatenated, refined by
    ``num_layers`` encoder layers and finally mapped to one score per item.

    Args:
        cat_dims: Mapping ``feature name -> number of embedding rows``.
        num_numeric_feats: Width of the numeric feature vector.
        emb_dropout: Dropout applied to the concatenated embeddings.
        dropout: Dropout used inside the attention/MLP blocks.
        num_heads: Heads of the final encoder layers (the per-stream encoders
            and the cross-attention are fixed at 4 heads).
        num_layers: Number of final encoder layers.
        dim_feedforward: Feed-forward width of every encoder layer.
        proj_dim: Width of each stream after projection.
    """

    def __init__(
        self,
        cat_dims,
        num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64,
    ):
        super().__init__()

        # Embedding width grows with log2 of the cardinality, clipped to [4, 50].
        self.emb_dims = {
            f: min(50, max(4, round(6 * math.log2(cat_dims[f] + 1)))) for f in cat_dims
        }
        # Index 0 is the padding index: its embedding is a frozen zero vector.
        self.embeddings = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], self.emb_dims[f], padding_idx=0)
            for f in cat_dims
        })
        self.emb_dropout = nn.Dropout(emb_dropout)

        total_emb_dim = sum(self.emb_dims.values())
        print(f"Categorial emb dim sum: {total_emb_dim}, Numerical dim: {num_numeric_feats}")

        self.cat_proj = nn.Linear(total_emb_dim, proj_dim)
        self.cat_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=4,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=1,
        )

        self.num_proj_in = nn.Linear(num_numeric_feats, proj_dim)
        self.num_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=4,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=1,
        )
        self.num_se = SEModule(proj_dim)
        self.num_proj_out = nn.Sequential(
            nn.Linear(proj_dim, proj_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
        )

        # Cross-attention: the categorical stream attends to the numeric one.
        self.cross_attn_cat2num = nn.MultiheadAttention(
            embed_dim=proj_dim,
            num_heads=4,
            dropout=dropout,
            batch_first=True,
        )

        self.film = FiLM(proj_dim)

        self.transformer_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=2 * proj_dim,
                nhead=num_heads,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            )
            for _ in range(num_layers)
        ])

        self.mlp = nn.Sequential(
            nn.Linear(2 * proj_dim, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(64, 1),
        )

    def forward(self, x_cat: dict, x_num: torch.Tensor, attn_mask=None):
        """Score every item of every group in the batch.

        Args:
            x_cat: Mapping ``feature name -> [B, N]`` integer code tensor.
            x_num: ``[B, N, F_num]`` numeric features.
            attn_mask: Optional ``[B, N]`` key-padding mask, ``True`` at padded
                positions. When given it is applied to every stage that mixes
                information across items (both stream encoders, the SE
                pooling, the cross-attention and the final encoder layers), so
                scores of real items do not depend on the padding. When
                ``None`` no masking is performed at all.

        Returns:
            ``[B, N]`` tensor of scores (values at padded positions are
            meaningless and must be ignored by the caller).
        """
        # Categorical stream.
        cat_embs = torch.cat([self.embeddings[f](x_cat[f]) for f in x_cat], dim=-1)
        cat_embs = self.emb_dropout(cat_embs)
        cat_feat = self.cat_proj(cat_embs)  # [B, N, D]
        cat_feat = self.cat_transformer(cat_feat, src_key_padding_mask=attn_mask)

        # Numeric stream.
        num_feat = self.num_proj_in(x_num)
        num_feat = self.num_transformer(num_feat, src_key_padding_mask=attn_mask)
        if attn_mask is None:
            num_feat = self.num_se(num_feat)
        else:
            # Same gate as SEModule.forward, but the pooled mean only covers
            # real items so padded rows cannot dilute the channel statistics.
            keep = attn_mask.logical_not().unsqueeze(-1).to(num_feat.dtype)  # [B, N, 1]
            pooled = (num_feat * keep).sum(dim=1) / keep.sum(dim=1).clamp(min=1.0)
            num_feat = num_feat * self.num_se.fc(pooled).unsqueeze(1)
        num_feat = self.num_proj_out(num_feat)

        # FiLM: modulate the categorical stream with the numeric one (per item).
        cat_feat = self.film(cat_feat, num_feat)

        # Cross-attention with a residual connection; padded keys are excluded.
        cat_feat_attn, _ = self.cross_attn_cat2num(
            query=cat_feat,
            key=num_feat,
            value=num_feat,
            key_padding_mask=attn_mask,
        )
        cat_feat = cat_feat + cat_feat_attn

        # Joint refinement over the concatenated streams.
        x = torch.cat([cat_feat, num_feat], dim=-1)
        for layer in self.transformer_layers:
            x = layer(x, src_key_padding_mask=attn_mask)

        scores = self.mlp(x).squeeze(-1)  # [B, N]
        return scores




"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Fraction of eligible groups with a positive label among the top-k scores.

    Args:
        scores: ``[B, N]`` predicted scores (padded).
        labels: ``[B, N]`` labels; an item counts as relevant when its label > 0.
        lengths: Real size of each group; entries past it are ignored.
        k: Cut-off (reduced to the group size for smaller groups).
        min_group_size: Groups with fewer items are excluded from the metric.

    Returns:
        Scalar tensor; ``0.0`` when no group in the batch is eligible.
    """
    hit_total = 0
    eligible = 0
    for row_scores, row_labels, row_len in zip(scores, labels, lengths):
        size = int(row_len)
        if size < min_group_size:
            continue
        best = row_scores[:size].topk(min(size, k)).indices
        hit_total = hit_total + (row_labels[:size][best] > 0).any().float()
        eligible += 1

    if eligible == 0:
        return torch.tensor(0.0, device=scores.device)
    return hit_total / eligible


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@k over the eligible groups of a padded batch.

    Gains are the raw label values and the discount is ``log2(rank + 1)``.
    A group whose ideal DCG is zero (no positive label) contributes 0.

    Args:
        scores: ``[B, N]`` predicted scores (padded).
        labels: ``[B, N]`` relevance labels (padded).
        lengths: Real size of each group; entries past it are ignored.
        k: Cut-off (reduced to the group size for smaller groups).
        min_group_size: Groups with fewer items are excluded from the metric.

    Returns:
        Scalar tensor on ``scores.device``. Always a tensor - previously a
        plain Python float could be returned when no group had a positive
        label, which broke callers that use ``.item()``.
    """
    total_ndcg = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        size = int(lengths[i])
        if size < min_group_size:
            continue

        s = scores[i][:size]
        y = labels[i][:size]

        depth = min(k, size)
        # Both DCG and the ideal DCG use the same rank discounts.
        discounts = torch.log2(torch.arange(2, 2 + depth, device=y.device).float())
        dcg = (y[torch.topk(s, depth).indices] / discounts).sum()
        idcg = (torch.topk(y, depth).values / discounts).sum()

        if idcg > 0:
            total_ndcg = total_ndcg + dcg / idcg
        valid_count += 1

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ndcg / valid_count


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision at k over the eligible groups of a padded batch.

    For each eligible group the average precision is the mean of
    precision@rank taken at the ranks (within the top-k) that hold a relevant
    item (label > 0). A group with no relevant item in its top-k contributes
    an average precision of 0; previously such groups were dropped from the
    average, which inflated the metric (MAP@k could exceed HitRate@k).

    Args:
        scores: ``[B, N]`` predicted scores (padded).
        labels: ``[B, N]`` labels (padded).
        lengths: Real size of each group; entries past it are ignored.
        k: Cut-off (reduced to the group size for smaller groups).
        min_group_size: Groups with fewer items are excluded from the metric.

    Returns:
        Scalar tensor on ``scores.device``; ``0.0`` when no group is eligible.
    """
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        size = int(lengths[i])
        if size < min_group_size:
            continue
        # Every eligible group counts, whether or not it scores a hit.
        valid_count += 1

        s = scores[i][:size]
        y = labels[i][:size]

        top_idx = torch.topk(s, min(k, size)).indices
        hits = (y[top_idx] > 0).float()
        n_hits = hits.sum()
        if n_hits == 0:
            continue

        ranks = torch.arange(1, hits.numel() + 1, device=hits.device)
        precision_at_rank = hits.cumsum(dim=0) / ranks
        total_ap = total_ap + (precision_at_rank * hits).sum() / n_hits

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ap / valid_count



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise RankNet loss summed per group and averaged over the batch.

    For every ordered pair ``(i, j)`` of real items in a group where
    ``label_i > label_j`` the term ``-log(sigmoid(score_i - score_j))`` is
    added. The grand total is divided by the number of groups in the batch.

    Args:
        scores: ``[B, N]`` predicted scores (padded).
        labels: ``[B, N]`` labels (padded).
        lengths: Real size of each group; entries past it are ignored.
    """
    n_groups = scores.size(0)
    running = 0.0

    for row_scores, row_labels, row_len in zip(scores, labels, lengths):
        s = row_scores[:row_len]
        y = row_labels[:row_len]

        # [n, n] matrices: entry (i, j) compares item i against item j.
        score_gap = s[:, None] - s[None, :]
        prefers_i = (y[:, None] > y[None, :]).float()

        per_pair = -prefers_i * torch.nn.functional.logsigmoid(score_gap)
        running += per_pair.sum()

    return running / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """
    Margin-based pairwise ranking loss restricted to pairs that touch the
    label-defined top-K of each group.

    A pair ``(i, j)`` is used when ``label_i > label_j`` and at least one of
    the two items is among the ``focus_topk`` highest labels of its group.
    Each used pair adds ``-logsigmoid((s_i - s_j - margin) / temperature)``
    weighted by ``|label_i - label_j|``; the result is the weighted sum over
    the whole batch divided by the total weight. Groups with fewer than two
    items are skipped. When no pair qualifies a zero tensor with
    ``requires_grad=True`` is returned.
    """
    loss_sum = 0.0
    weight_sum = 0

    for row_scores, row_labels, row_len in zip(scores, labels, lengths):
        if row_len < 2:
            continue

        s = row_scores[:row_len]
        y = row_labels[:row_len]

        # Mark the items holding the top-K ground-truth labels.
        in_topk = torch.zeros_like(y, dtype=torch.bool)
        in_topk[torch.topk(y, min(focus_topk, row_len)).indices] = True

        # Entry (i, j) of each matrix compares item i against item j.
        score_gap = s[:, None] - s[None, :]
        label_gap = y[:, None] - y[None, :]

        pair_mask = (label_gap > 0) & (in_topk[:, None] | in_topk[None, :])
        if pair_mask.sum() == 0:
            continue

        pair_weight = (label_gap.abs() * pair_mask).float()

        shifted = score_gap - margin if margin > 0.0 else score_gap
        shifted = shifted / temperature

        loss_sum += (-F.logsigmoid(shifted) * pair_weight).sum()
        weight_sum += pair_weight.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    The scheduler is stepped after every optimiser step (once per batch).
    NOTE(review): the model is called without a padding mask, so padded rows
    take part in its attention layers.

    Args:
        model: Ranker called as ``model(x_cat, x_num)``.
        loader: Iterable yielding ``(x_cat, x_num, y, lengths)`` batches.
        optimizer: Optimiser updating ``model``.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped per batch.
    """
    model.train()
    running_loss = 0.0

    for cat_batch, num_batch, targets, lengths in tqdm(loader, desc="Training", leave=False):
        num_batch = num_batch.to(device)
        targets = targets.to(device)
        if not isinstance(lengths, torch.Tensor):
            lengths = torch.tensor(lengths)
        lengths = lengths.detach().clone().to(device)
        cat_batch = {name: column.to(device) for name, column in cat_batch.items()}

        optimizer.zero_grad()
        batch_scores = model(cat_batch, num_batch)
        batch_loss = xranknet_loss(
            batch_scores, targets, lengths, temperature=1.0, margin=0.2, focus_topk=3
        )
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    return running_loss / len(loader)


def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader`` and return loss plus ranking metrics.

    The loss is the batch-size-weighted mean of the per-batch xRankNet loss.
    HitRate@3, NDCG@3 and MAP@3 are averaged over the *eligible* groups only
    (size >= ``min_group_size``). Each eligible group is scored on its own, so
    batches containing few or no eligible groups no longer pull the metrics
    towards zero (the per-batch means used to be weighted by the full batch
    size).

    Args:
        model: Ranker called as ``model(x_cat, x_num)``.
        loader: Iterable yielding ``(x_cat, x_num, y, lengths)`` batches.
        device: Device the batch tensors are moved to.
        min_group_size: Smallest group size that takes part in the metrics.

    Returns:
        ``(val_loss, hitrate, ndcg, map)`` where ``val_loss`` is a float and
        the three metrics are scalar tensors (all zero for an empty loader).
    """
    model.eval()
    loss_sum = 0.0
    group_count = 0
    eligible_count = 0
    hit_sum = torch.zeros((), device=device)
    ndcg_sum = torch.zeros((), device=device)
    map_sum = torch.zeros((), device=device)

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):
            x_num, y = x_num.to(device), y.to(device)
            # as_tensor avoids the copy-construct warning torch.tensor() raises
            # when ``lengths`` already is a tensor.
            lengths = torch.as_tensor(lengths).to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)

            batch_size = scores.size(0)
            loss_sum += loss.item() * batch_size
            group_count += batch_size

            for i, size in enumerate(lengths.tolist()):
                if size < min_group_size:
                    continue
                one = slice(i, i + 1)  # keep the batch axis for the metric helpers
                hit_sum += hitrate_at_k(
                    scores[one], y[one], lengths[one], k=3, min_group_size=min_group_size
                )
                ndcg_sum += ndcg_at_k(
                    scores[one], y[one], lengths[one], k=3, min_group_size=min_group_size
                )
                map_sum += map_at_k(
                    scores[one], y[one], lengths[one], k=3, min_group_size=min_group_size
                )
                eligible_count += 1

    # Guard the divisions so an empty loader / no eligible group yields zeros.
    loss_denominator = max(group_count, 1)
    metric_denominator = max(eligible_count, 1)
    return (
        loss_sum / loss_denominator,
        hit_sum / metric_denominator,
        ndcg_sum / metric_denominator,
        map_sum / metric_denominator,
    )


def build_cat_dims(X, cat_features):
    """Return ``{feature: largest code + 1}`` for every categorical feature.

    A negative maximum is treated as 0, so every feature gets at least one
    embedding row.
    """
    return {name: max(X[name].max(), 0) + 1 for name in cat_features}


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardise numeric columns using statistics of the training rows only.

    Mean and standard deviation are computed with NaN-aware reductions, so a
    few NaN entries no longer turn the statistics - and with them the whole
    column - into NaN. NaN entries themselves stay NaN and are left for the
    caller to fill.

    Args:
        X: Frame holding the numeric columns.
        numeric_features: Names of the columns to standardise.
        train_mask: Boolean array selecting the training rows of ``X``.

    Returns:
        ``X`` with every listed column replaced by ``(value - mean) / std``.
        A zero or undefined standard deviation falls back to 1.0 and an
        undefined mean (no finite training value) falls back to 0.0.
    """
    for col in numeric_features:
        values = X[col].to_numpy()
        train_vals = values[train_mask]

        # Skip the reductions entirely when there is nothing finite to average.
        has_data = train_vals.size > 0 and not np.all(np.isnan(train_vals))
        mean = np.nanmean(train_vals) if has_data else 0.0
        std = np.nanstd(train_vals) if has_data else 1.0
        if not std > 0:
            std = 1.0

        norm_vals = (values - mean) / std
        X = X.with_columns(pl.Series(col, norm_vals).alias(col))
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Load the feature-engineered frame, building and caching it when missing.

    When ``cache_file`` exists in ``data_dir`` it is loaded directly and only
    the ``Id`` / ``ranker_id`` columns of the raw test file are read (the full
    test frame is not needed on this path). Otherwise train and test are
    concatenated (test gets a constant ``selected`` = 0 column), passed through
    ``feature_engineering``, nulls are filled (0 for numeric columns,
    ``"missing"`` for string columns) and the result is written to the cache.

    Args:
        data_dir: Directory holding the parquet files and the cache.
        train_file: File name of the raw training data.
        test_file: File name of the raw test data.
        cache_file: File name of the feature-engineered cache.

    Returns:
        ``(df, test_ids, test_rankers)``: the feature-engineered frame and the
        ``Id`` and ``ranker_id`` columns of the raw test file.
    """
    cache_path = os.path.join(data_dir, cache_file)
    test_path = os.path.join(data_dir, test_file)

    if os.path.exists(cache_path):
        print(f"Loading cached feature-engineered data from {cache_path} ...")
        df = pl.read_parquet(cache_path)

        # Only the identifiers are needed here; skip every other test column.
        test = pl.read_parquet(test_path, columns=["Id", "ranker_id"])
        test_ids = test["Id"]
        test_rankers = test["ranker_id"]
    else:
        print("Cached file not found, processing raw data ...")

        train = pl.read_parquet(os.path.join(data_dir, train_file)).drop("__index_level_0__")
        test = (
            pl.read_parquet(test_path)
            .drop("__index_level_0__")
            .with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))
        )

        test_ids = test["Id"]
        test_rankers = test["ranker_id"]

        df = pl.concat((train, test))
        df = feature_engineering(df, full=True)

        df = df.with_columns(
            [pl.col(c).fill_null(0) for c in df.select(pl.selectors.numeric()).columns] +
            [pl.col(c).fill_null("missing") for c in df.select(pl.selectors.string()).columns]
        )

        os.makedirs(data_dir, exist_ok=True)
        df.write_parquet(cache_path)
        print(f"Saved feature-engineered data to {cache_path}")

    return df, test_ids, test_rankers


def prepare_data(data_dir):
    """Load the data, encode categoricals and standardise numeric features.

    Categorical columns are replaced by their dense rank, giving codes
    ``1..K``; nulls become 0. Code 0 is therefore reserved for
    "missing / padding", matching the embedding ``padding_idx=0`` and the zero
    padding added when batching. (Codes used to start at 0, so the first
    category of every feature shared the frozen all-zero padding embedding.)
    NOTE: this enlarges every embedding table by one row, so checkpoints
    trained with the old encoding cannot be loaded.

    Numeric columns are standardised with statistics of the first
    ``train_size`` rows, which are assumed to be the training rows.

    Args:
        data_dir: Directory passed on to ``load_or_prepare_data``.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size, test_ids,
        test_rankers)``.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    # Dense rank starts at 1, so 0 stays free for nulls and padding.
    X = X.with_columns(
        [
            pl.col(c).rank("dense").fill_null(0).cast(pl.Int32)
            for c in cat_features
        ]
    )

    # Hard-coded number of training rows in the concatenated train+test frame.
    train_size = 18145372
    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Patience counter for a metric that should increase.

    ``step`` returns ``True`` once ``patience`` consecutive calls failed to
    beat the best value seen so far by more than ``min_delta``.
    """

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience, self.min_delta = patience, min_delta
        self.best, self.counter = float("-inf"), 0

    def step(self, metric):
        """Record ``metric`` and report whether training should stop."""
        improved = metric > self.best + self.min_delta
        if improved:
            self.best = metric
        self.counter = 0 if improved else self.counter + 1
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scale histogram of the group sizes held by ``dataset``.

    Args:
        dataset: Object exposing ``group_to_indices`` (group -> row indices).
        save_path: Where to save the figure; when falsy the figure is shown
            instead.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Distribution of Group Sizes")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def _save_curve(path, epochs, curves, ylabel, title):
    """Save one per-epoch line chart; ``curves`` is ``[(values, label, color or None), ...]``."""
    plt.figure()
    for values, label, color in curves:
        style = {} if color is None else {"color": color}
        plt.plot(epochs, values, label=label, **style)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.savefig(path)
    plt.close()


def _predict_in_row_order(model, loader, dataset, device, desc):
    """Score every row of ``dataset`` and return the scores in its row order.

    ``loader`` must iterate ``dataset`` without shuffling. The model output is
    produced group by group, so it is scattered back through the dataset's
    group -> row index map; this keeps scores aligned with the source rows
    even when the rows of a group are not stored contiguously.
    """
    chunks = []
    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(loader, desc=desc):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)

            for i in range(scores.size(0)):
                chunks.append(scores[i, :int(lengths[i])].cpu().numpy())

    grouped_scores = np.concatenate(chunks)
    row_order = np.concatenate(
        [np.asarray(dataset.group_to_indices[g]) for g in dataset.group_keys]
    )
    row_scores = np.empty_like(grouped_scores)
    row_scores[row_order] = grouped_scores
    return row_scores


def main():
    """Train the deep ranker, plot its metrics and write predictions.

    Steps: prepare the data, split it by row position into train /
    validation / test, train with per-batch OneCycleLR scheduling, save a
    checkpoint whenever validation loss and HitRate@3 both improve, stop early
    on stagnating validation loss, save the metric curves, then write the
    validation predictions and the test submission as CSV files.
    """
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    best_val_loss = float("inf")
    best_hitrate = 0.0
    checkpoint_saved = False
    model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_losses = []
    val_losses = []
    val_hitrates = []
    val_ndcgs = []
    val_maps = []
    learning_rates = []

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    early_stopper = EarlyStopping(patience=3, min_delta=0.001)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Fill missing values
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    for col in float_cols:
        X = X.with_columns(
            pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
        )

    # Row-position split: [0, n1) train, [n1, n2) validation, [n2, end) test.
    n1 = 16487352
    # n1 = 17487300
    n2 = train_size

    X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
    y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
    groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

    # Inspect group sizes in validation set
    val_rankers = groups_va["ranker_id"].to_numpy()
    val_group_counts = Counter(val_rankers)

    print("\n📋 Validation group statistics:")
    # Save detailed group info to file
    group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
    with open(group_info_path, "w") as f:
        f.write(f"Total groups: {len(val_group_counts)}\n\n")
        f.write("GroupID\tSize\tStartIdx\tEndIdx\n")

        group_to_rows = defaultdict(list)
        for i, g in enumerate(val_rankers):
            group_to_rows[g].append(i)

        for group_id, indices in sorted(group_to_rows.items()):
            start_idx = indices[0]
            end_idx = indices[-1]
            size = len(indices)
            f.write(f"{group_id}\t{size}\t{start_idx}\t{end_idx}\n")

    print(f"📁 Saved validation group info to {group_info_path}")

    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    # Validation / test loaders must stay unshuffled: predictions are mapped
    # back to rows through the dataset's group order.
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    num_epochs = 10
    learning_rate = 3e-4

    model = DLRankerAttention(
        cat_dims=cat_dims,
        num_numeric_feats=num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    scheduler = OneCycleLR(
        optimizer,
        max_lr=learning_rate,
        steps_per_epoch=len(train_loader),
        epochs=num_epochs,
        pct_start=0.3,
    )

    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
        val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_hitrates.append(val_hitrate.item())
        val_ndcgs.append(val_ndcg.item())
        val_maps.append(val_map.item())

        current_lr = scheduler.get_last_lr()
        learning_rates.append(current_lr[0])

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"Val Loss={val_loss:.4f}, "
            f"HitRate@3={val_hitrate:.4f}, "
            f"NDCG@3={val_ndcg:.4f}, "
            f"MAP@3={val_map:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )

        # A checkpoint is written only when BOTH criteria improve.
        if val_loss < best_val_loss and val_hitrate > best_hitrate:
            best_val_loss = val_loss
            best_hitrate = val_hitrate
            torch.save(model.state_dict(), model_path)
            checkpoint_saved = True
            print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

        # EarlyStopping maximises its metric, hence the negated loss.
        if early_stopper.step(-val_loss):
            print("⛔ Early stopping triggered.")
            break

    # Plot metrics
    epochs = list(range(1, len(train_losses) + 1))

    _save_curve(
        os.path.join(MODEL_DIR, "loss_curve.png"),
        epochs,
        [(train_losses, "Train Loss", None), (val_losses, "Val Loss", None)],
        "Loss",
        "Training & Validation Loss",
    )
    _save_curve(
        os.path.join(MODEL_DIR, "hitrate_curve.png"),
        epochs,
        [(val_hitrates, "Val HitRate@3", "green")],
        "HitRate@3",
        "Validation HitRate@3",
    )
    _save_curve(
        os.path.join(MODEL_DIR, "lr_curve.png"),
        epochs,
        [(learning_rates, "Learning Rate", "orange")],
        "LR",
        "Learning Rate Schedule",
    )
    _save_curve(
        os.path.join(MODEL_DIR, "ndcg_curve.png"),
        epochs,
        [(val_ndcgs, "Val NDCG@3", "purple")],
        "NDCG@3",
        "Validation NDCG@3",
    )
    _save_curve(
        os.path.join(MODEL_DIR, "map_curve.png"),
        epochs,
        [(val_maps, "Val MAP@3", "red")],
        "MAP@3",
        "Validation MAP@3",
    )

    print("📊 Saved metric plots to model/")

    # Restore the best checkpoint of this run. A file left over from an earlier
    # run is deliberately not loaded when nothing was saved this time.
    if checkpoint_saved:
        model.load_state_dict(torch.load(model_path, map_location=device))
    else:
        print("⚠️ No checkpoint was saved in this run; predicting with the final weights.")
    model.eval()

    val_ids = X_va["Id"]
    val_rankers = groups_va["ranker_id"]
    val_labels = y_va

    all_val_scores = _predict_in_row_order(
        model, val_loader, val_dataset, device, "Predicting validation set"
    )

    assert len(all_val_scores) == X_va.shape[0], \
        f"Mismatch: {len(all_val_scores)} scores vs {X_va.shape[0]} rows"

    val_df = X_va.with_columns([
        pl.Series("Id", val_ids),
        pl.Series("ranker_id", val_rankers),
        pl.Series("score", all_val_scores),
        pl.Series("label", val_labels)
    ])

    # Rank 1 = highest score inside each ranker_id group.
    val_df = val_df.with_columns(
        pl.col("score")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
    val_df.write_csv(val_save_path)
    print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

    all_scores = _predict_in_row_order(
        model, test_loader, test_dataset, device, "Predicting test set"
    )

    assert len(all_scores) == X_te.shape[0], \
        f"Mismatch: {len(all_scores)} scores vs {X_te.shape[0]} rows in X_te"

    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": all_scores
    })

    submission = (
        submission_df
        .with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )
        .select(["Id", "ranker_id", "score", "selected"])
    )

    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
    submission.write_csv(submission_path)
    print(f"✅ Submission saved to {submission_path}")


if __name__ == "__main__":
    main()

Writing deeprec_validate.py


In [None]:
--- Epoch 1/10 ---
Validating:   0%|          | 0/2636 [00:00<?, ?it/s]/home/ionut/test_whisper/test2/deeprec_validate.py:539: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  lengths = torch.tensor(lengths).to(device)
Epoch 1: Train Loss=0.3532, Val Loss=0.2842, HitRate@3=0.4737, NDCG@3=0.3744, MAP@3=0.6171, LR=0.000084 (606.6s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 2/10 ---
Epoch 2: Train Loss=0.2870, Val Loss=0.2817, HitRate@3=0.4937, NDCG@3=0.3932, MAP@3=0.6340, LR=0.000228 (606.0s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 3/10 ---
Epoch 3: Train Loss=0.2801, Val Loss=0.2765, HitRate@3=0.4908, NDCG@3=0.3937, MAP@3=0.6371, LR=0.000300 (606.7s)

--- Epoch 4/10 ---
Epoch 4: Train Loss=0.2674, Val Loss=0.2707, HitRate@3=0.5011, NDCG@3=0.4046, MAP@3=0.6516, LR=0.000285 (607.3s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 5/10 ---
Epoch 5: Train Loss=0.2552, Val Loss=0.2633, HitRate@3=0.5073, NDCG@3=0.4089, MAP@3=0.6550, LR=0.000244 (605.9s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 6/10 ---
Epoch 6: Train Loss=0.2444, Val Loss=0.2532, HitRate@3=0.5089, NDCG@3=0.4110, MAP@3=0.6536, LR=0.000183 (608.0s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 7/10 ---
Epoch 7: Train Loss=0.2333, Val Loss=0.2459, HitRate@3=0.5113, NDCG@3=0.4127, MAP@3=0.6572, LR=0.000117 (607.7s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 8/10 ---
Epoch 8: Train Loss=0.2222, Val Loss=0.2433, HitRate@3=0.5203, NDCG@3=0.4194, MAP@3=0.6590, LR=0.000056 (607.7s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 9/10 ---
Epoch 9: Train Loss=0.2128, Val Loss=0.2403, HitRate@3=0.5148, NDCG@3=0.4175, MAP@3=0.6627, LR=0.000015 (605.9s)

--- Epoch 10/10 ---
Epoch 10: Train Loss=0.2067, Val Loss=0.2391, HitRate@3=0.5153, NDCG@3=0.4173, MAP@3=0.6602, LR=0.000000 (606.9s)

In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter
from torch.optim.lr_scheduler import OneCycleLR

# Seed the Python, NumPy and PyTorch generators with one shared value so
# repeated runs of this script start from the same random state.
seed = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(seed)
if torch.cuda.is_available():
    # Seed every visible CUDA device as well.
    torch.cuda.manual_seed_all(seed)

class RankDataset(Dataset):
    """Dataset whose items are whole ranking groups (rows sharing a ``ranker_id``).

    Each item is ``(x_cat, x_num, y, length)`` where ``x_cat`` maps every
    categorical feature name to an int64 tensor of codes, ``x_num`` is a
    float32 tensor with one row per group member, ``y`` is a flat float32
    label tensor and ``length`` is the number of rows in the group.

    Args:
        X: Frame holding the categorical and numeric feature columns.
        y: Labels, one per row of ``X``.
        groups: Frame with a ``ranker_id`` column, one entry per row of ``X``.
        cat_features: Names of the categorical columns in ``X``.
        num_features: Names of the numeric columns in ``X``.
    """

    def __init__(
        self,
        X: "pl.DataFrame",
        y: "pl.Series",
        groups: "pl.DataFrame",
        cat_features: list[str],
        num_features: list[str],
    ):
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # Row positions of every group, in order of first appearance.
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of distinct groups."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return the tensors of the ``idx``-th group (see class docstring)."""
        rows = np.asarray(self.group_to_indices[self.group_keys[idx]])

        # Vectorised gather + clamp instead of a Python loop per element:
        # negative codes are mapped to index 0, everything else is kept as is.
        x_cat = {}
        for c in self.cat_features:
            codes = self.X_cat[c][rows]
            clamped = np.where(codes >= 0, codes, 0).astype(np.int64)
            x_cat[c] = torch.from_numpy(clamped)

        x_num = torch.as_tensor(self.X_num[rows], dtype=torch.float32)
        y = torch.as_tensor(self.y[rows], dtype=torch.float32).reshape(-1)

        return x_cat, x_num, y, len(rows)

def collate_fn(batch):
    """Zero-pad a list of variable-length groups into dense batch tensors.

    Args:
        batch: List of ``(x_cat, x_num, y, length)`` items; within one item
            every tensor has ``length`` entries along its first axis.

    Returns:
        ``(padded_x_cat, padded_x_num, padded_y, lengths)`` where every tensor
        is padded with zeros up to the longest group in the batch and
        ``lengths`` is an int64 tensor holding the original group sizes.
    """
    cat_parts, num_parts, label_parts, sizes = zip(*batch)
    pad = nn.utils.rnn.pad_sequence

    padded_x_cat = {
        name: pad([part[name] for part in cat_parts], batch_first=True)
        for name in cat_parts[0].keys()
    }
    padded_x_num = pad(list(num_parts), batch_first=True)
    padded_y = pad(list(label_parts), batch_first=True)
    lengths = torch.tensor(list(sizes), dtype=torch.long)

    return padded_x_cat, padded_x_num, padded_y, lengths

class SEModule(nn.Module):
    """Squeeze-and-excitation style channel gate.

    The input is averaged over dimension 1, passed through a bottleneck MLP
    ending in a sigmoid, and the resulting per-channel weights rescale every
    position of the input.
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        bottleneck = channel // reduction
        squeeze = nn.Linear(channel, bottleneck)
        excite = nn.Linear(bottleneck, channel)
        self.fc = nn.Sequential(squeeze, nn.ReLU(inplace=True), excite, nn.Sigmoid())

    def forward(self, x):
        """Rescale ``x`` (expected ``[B, N, C]``) by gates computed from its mean over N."""
        # NOTE(review): the mean runs over all N positions, including any
        # zero-padded rows added when groups are batched together.
        pooled = x.mean(dim=1)               # [B, C]
        gates = self.fc(pooled)[:, None, :]  # [B, 1, C], broadcast over N
        return x * gates


class FiLM(nn.Module):
    """Feature-wise linear modulation: scale and shift ``x`` using projections of ``cond``."""

    def __init__(self, feature_dim):
        super().__init__()
        # Two independent linear heads: multiplicative (gamma) and additive (beta).
        self.gamma = nn.Linear(feature_dim, feature_dim)
        self.beta = nn.Linear(feature_dim, feature_dim)

    def forward(self, x, cond):
        """Return ``gamma(cond) * x + beta(cond)``."""
        return self.gamma(cond) * x + self.beta(cond)

class GatedFiLM(nn.Module):
    """FiLM modulation blended with the identity through a learned sigmoid gate.

    ``cond`` may carry one vector per batch element (``[B, D]``), which is
    broadcast over all N positions of ``x`` (``[B, N, D]``), or one vector per
    position (``[B, N, D]``).
    """

    def __init__(self, feature_dim):
        super().__init__()
        self.gamma = nn.Linear(feature_dim, feature_dim)
        self.beta = nn.Linear(feature_dim, feature_dim)
        self.gate = nn.Sequential(
            nn.Linear(feature_dim, feature_dim),
            nn.Sigmoid()
        )

    def forward(self, x, cond):
        """Return ``x * (1 - gate) + (gamma * x + beta) * gate``.

        Previously only the gate was unsqueezed, so gamma/beta of shape
        ``[B, D]`` could not broadcast against ``x`` of shape ``[B, N, D]``
        (or silently aligned B with N when the two happened to be equal).
        """
        if cond.dim() == x.dim() - 1:
            # Insert the position axis once so gamma, beta and gate all
            # broadcast consistently as [B, 1, D].
            cond = cond.unsqueeze(1)

        gamma = self.gamma(cond)
        beta = self.beta(cond)
        gate = self.gate(cond)
        modulated = gamma * x + beta
        return x * (1 - gate) + modulated * gate

class FiLMWithGate(nn.Module):
    """Element-wise gated FiLM: blends ``x`` with its FiLM-modulated version.

    Scale, shift and gate are all computed from ``cond`` without reshaping,
    so ``cond`` must already broadcast against ``x``.
    """

    def __init__(self, feature_dim):
        super().__init__()
        self.gamma = nn.Linear(feature_dim, feature_dim)
        self.beta = nn.Linear(feature_dim, feature_dim)
        gate_proj = nn.Linear(feature_dim, feature_dim)
        self.gate = nn.Sequential(gate_proj, nn.Sigmoid())

    def forward(self, x, cond):
        """Return ``x * (1 - g) + (gamma(cond) * x + beta(cond)) * g`` with ``g = gate(cond)``."""
        scale, shift = self.gamma(cond), self.beta(cond)
        film_out = scale * x + shift
        mix = self.gate(cond)
        return x * (1 - mix) + film_out * mix

class DLRankerAttention(nn.Module):
    """Attention-based ranker that scores every item of a group.

    Categorical features are embedded, concatenated and projected to
    ``proj_dim``; numeric features are projected to ``proj_dim``. Each branch
    goes through its own single-layer Transformer encoder. The numeric branch
    is additionally gated by ``SEModule``; the categorical branch is modulated
    by the numeric one (``FiLMWithGate``) and attends over it with
    cross-attention. Both branches are concatenated, passed through
    ``num_layers`` Transformer encoder layers and an MLP head that emits one
    score per item.

    Args:
        cat_dims: Mapping feature name -> number of embedding rows.
        num_numeric_feats: Width of the numeric input.
        emb_dropout: Dropout applied to the concatenated embeddings.
        dropout: Dropout used in attention blocks and the MLP head.
        num_heads: Attention heads used by every attention module.
            ``proj_dim`` must be divisible by it.
        num_layers: Number of encoder layers applied after concatenation.
        dim_feedforward: Hidden width of the Transformer feed-forward blocks.
        proj_dim: Common width of the categorical and numeric branches.

    NOTE(review): ``attn_mask`` is only forwarded to the final encoder layers;
    the per-branch encoders, the SE gate and the cross-attention always see
    every position, including zero-padded ones.
    """

    def __init__(
        self,
        cat_dims,
        num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64,
    ):
        super().__init__()

        # Embedding width grows with log2 of the cardinality, clamped to [4, 50].
        self.emb_dims = {
            f: min(50, max(4, round(6 * math.log2(cat_dims[f] + 1)))) for f in cat_dims
        }
        # Index 0 is the padding row (its embedding stays zero).
        self.embeddings = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], self.emb_dims[f], padding_idx=0)
            for f in cat_dims
        })
        self.emb_dropout = nn.Dropout(emb_dropout)

        total_emb_dim = sum(self.emb_dims.values())
        print(f"Categorial emb dim sum: {total_emb_dim}, Numerical dim: {num_numeric_feats}")

        # Categorical branch.
        self.cat_proj = nn.Linear(total_emb_dim, proj_dim)
        self.cat_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=num_heads,  # was hard-coded to 4, ignoring the argument
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=1,
        )

        # Numeric branch.
        self.num_proj_in = nn.Linear(num_numeric_feats, proj_dim)
        self.num_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=num_heads,  # was hard-coded to 4, ignoring the argument
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=1,
        )
        self.num_se = SEModule(proj_dim)
        self.num_proj_out = nn.Sequential(
            nn.Linear(proj_dim, proj_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
        )

        # Cross-attention: categorical features (query) attend to numeric ones.
        self.cross_attn_cat2num = nn.MultiheadAttention(
            embed_dim=proj_dim,
            num_heads=num_heads,  # was hard-coded to 4, ignoring the argument
            dropout=dropout,
            batch_first=True,
        )

        self.film = FiLMWithGate(proj_dim)

        # Joint encoder over the concatenated [categorical | numeric] features.
        self.transformer_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=2 * proj_dim,
                nhead=num_heads,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            )
            for _ in range(num_layers)
        ])

        self.mlp = nn.Sequential(
            nn.Linear(2 * proj_dim, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(64, 1),
        )

    def forward(self, x_cat: dict, x_num: torch.Tensor, attn_mask=None):
        """Score every item.

        Args:
            x_cat: Mapping feature name -> long tensor of codes, ``[B, N]``.
            x_num: Numeric features, ``[B, N, F_num]``.
            attn_mask: Optional key-padding mask passed as
                ``src_key_padding_mask`` to the final encoder layers.

        Returns:
            Tensor of scores with shape ``[B, N]``.
        """
        # Categorical embeddings, concatenated in x_cat's iteration order.
        cat_embs = torch.cat([self.embeddings[f](x_cat[f]) for f in x_cat], dim=-1)
        cat_embs = self.emb_dropout(cat_embs)
        cat_feat = self.cat_proj(cat_embs)  # [B, N, D]
        cat_feat = self.cat_transformer(cat_feat)  # [B, N, D]

        # Numeric features
        num_feat = self.num_proj_in(x_num)
        num_feat = self.num_transformer(num_feat)
        num_feat = self.num_se(num_feat)
        num_feat = self.num_proj_out(num_feat)

        # Modulate the categorical branch with the numeric one.
        cat_feat = self.film(cat_feat, num_feat)

        # Cross-attention: categorical features attend over numeric ones
        cat_feat_attn, _ = self.cross_attn_cat2num(
            query=cat_feat, key=num_feat, value=num_feat
        )
        cat_feat = cat_feat + cat_feat_attn  # residual connection

        # Final concatenation
        x = torch.cat([cat_feat, num_feat], dim=-1)

        for layer in self.transformer_layers:
            x = layer(x, src_key_padding_mask=attn_mask)

        scores = self.mlp(x).squeeze(-1)  # [B, N]
        return scores





"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Share of eligible groups with at least one positive label among the top-k scores.

    Args:
        scores: Padded score matrix, one row per group.
        labels: Padded label matrix aligned with ``scores``; a label > 0 counts as a hit.
        lengths: True (unpadded) size of each group.
        k: Number of top-scored items inspected per group.
        min_group_size: Groups shorter than this are ignored.

    Returns:
        A 0-d tensor with the hit rate, or ``tensor(0.0)`` on ``scores.device``
        when no group is eligible.
    """
    group_hits = []
    for i, group_scores in enumerate(scores):
        n = lengths[i]
        if n < min_group_size:
            continue
        top_n = min(n, k)
        _, top_idx = torch.topk(group_scores[:n], top_n)
        group_hits.append((labels[i][:n][top_idx] > 0).any().float())

    if not group_hits:
        return torch.tensor(0.0, device=scores.device)
    return torch.stack(group_hits).sum() / len(group_hits)


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@k over eligible groups, using the raw label as gain.

    Args:
        scores: Padded score matrix, one row per group.
        labels: Padded label matrix aligned with ``scores``.
        lengths: True (unpadded) size of each group.
        k: Cut-off rank.
        min_group_size: Groups shorter than this are ignored.

    Returns:
        Always a 0-d tensor on ``scores.device`` (0.0 when no group is
        eligible). A group whose ideal DCG is 0 contributes 0. Previously a
        plain Python float was returned when every eligible group had an ideal
        DCG of 0, which broke callers that use ``.item()``.
    """
    total_ndcg = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        n = int(lengths[i])
        if n < min_group_size:
            continue

        s = scores[i, :n]
        y = labels[i, :n]
        top_n = min(k, n)

        # Positions 1..top_n are discounted by log2(position + 1).
        discounts = torch.log2(torch.arange(2, 2 + top_n, device=y.device).float())
        dcg = (y[torch.topk(s, top_n).indices] / discounts).sum()
        idcg = (torch.topk(y, top_n).values / discounts).sum()

        if idcg > 0:
            total_ndcg = total_ndcg + dcg / idcg
        valid_count += 1

    return total_ndcg / valid_count if valid_count > 0 else total_ndcg


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision@k over eligible groups.

    For each group the precision is evaluated at every top-k rank holding a
    positive label (label > 0) and averaged over those ranks. A group with no
    positive in its top-k contributes an average precision of 0; previously
    such groups were dropped from the denominator, which inflated the metric.

    Args:
        scores: Padded score matrix, one row per group.
        labels: Padded label matrix aligned with ``scores``.
        lengths: True (unpadded) size of each group.
        k: Cut-off rank.
        min_group_size: Groups shorter than this are ignored.

    Returns:
        A 0-d tensor on ``scores.device`` (0.0 when no group is eligible).
    """
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        n = int(lengths[i])
        if n < min_group_size:
            continue

        top_n = min(k, n)
        top_idx = torch.topk(scores[i, :n], top_n).indices
        hits = (labels[i, :n][top_idx] > 0).float()

        # Every eligible group counts, including those without any hit.
        valid_count += 1
        n_hits = hits.sum()
        if n_hits > 0:
            ranks = torch.arange(1, top_n + 1, device=hits.device)
            precision_at_rank = hits.cumsum(0) / ranks
            total_ap = total_ap + (precision_at_rank * hits).sum() / n_hits

    return total_ap / valid_count if valid_count > 0 else total_ap



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise RankNet-style loss, summed over ordered pairs and averaged over the batch.

    For every pair ``(a, b)`` inside a group with ``label[a] > label[b]`` the
    term ``-logsigmoid(score[a] - score[b])`` is added. Only the first
    ``lengths[i]`` entries of row ``i`` are used. The total is divided by the
    number of rows in ``scores``.
    """
    n_groups = scores.size(0)
    running = 0.0

    for i, group_scores in enumerate(scores):
        n = lengths[i]
        s = group_scores[:n]
        y = labels[i][:n]

        # wants_higher[a, b] is 1.0 where item a should outrank item b.
        wants_higher = (y.unsqueeze(1) > y.unsqueeze(0)).float()
        margins = s.unsqueeze(1) - s.unsqueeze(0)
        per_pair = -wants_higher * torch.nn.functional.logsigmoid(margins)

        running += per_pair.sum()

    return running / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """Margin RankNet loss restricted to pairs touching the top-K items by label.

    A pair ``(a, b)`` contributes when ``label[a] > label[b]`` and at least
    one of the two items is among the ``focus_topk`` highest-labelled items of
    its group. Each pair is weighted by ``|label[a] - label[b]|``; the loss is
    ``-logsigmoid((score[a] - score[b] - margin) / temperature)`` and the
    weighted sum is divided by the total weight. Groups with fewer than two
    items are skipped. If no pair qualifies, a fresh zero tensor with
    ``requires_grad=True`` is returned.
    """
    loss_sum = 0.0
    weight_sum = 0

    for i in range(scores.size(0)):
        n = lengths[i]
        if n < 2:
            continue

        s = scores[i][:n]
        y = labels[i][:n]

        # Mark the items with the highest ground-truth labels.
        _, best_idx = torch.topk(y, min(focus_topk, n))
        in_topk = torch.zeros_like(y, dtype=torch.bool)
        in_topk[best_idx] = True

        label_gap = y.unsqueeze(1) - y.unsqueeze(0)
        touches_topk = in_topk.unsqueeze(1) | in_topk.unsqueeze(0)
        selected = (label_gap > 0) & touches_topk
        if selected.sum() == 0:
            continue

        weights = (label_gap.abs() * selected).float()

        score_gap = s.unsqueeze(1) - s.unsqueeze(0)
        if margin > 0.0:
            score_gap = score_gap - margin
        score_gap = score_gap / temperature

        loss_sum += (-F.logsigmoid(score_gap) * weights).sum()
        weight_sum += weights.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Each batch is moved to ``device``, scored by ``model`` and optimised
    against ``xranknet_loss``. The scheduler is stepped once per batch, right
    after the optimizer.

    NOTE(review): the model is called without a padding mask, so zero-padded
    rows take part in its attention layers.
    """
    model.train()
    batch_losses = []

    progress = tqdm(loader, desc="Training", leave=False)
    for x_cat, x_num, y, lengths in progress:
        x_num = x_num.to(device)
        y = y.to(device)
        if not isinstance(lengths, torch.Tensor):
            lengths = torch.tensor(lengths)
        lengths = lengths.detach().clone().to(device)
        x_cat = {name: codes.to(device) for name, codes in x_cat.items()}

        optimizer.zero_grad()
        scores = model(x_cat, x_num)
        loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)
        loss.backward()
        optimizer.step()
        scheduler.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(loader)


def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader``.

    Args:
        model: Ranker called as ``model(x_cat, x_num)``.
        loader: Iterable of ``(x_cat, x_num, y, lengths)`` batches.
        device: Device the batches are moved to.
        min_group_size: Groups shorter than this are excluded from the
            ranking metrics (they still contribute to the loss).

    Returns:
        ``(loss, hitrate@3, ndcg@3, map@3)``. ``loss`` is a Python float
        averaged over all groups; the three metrics are 0-d tensors.

    The metric functions return a mean over the eligible groups of a batch, so
    each batch-level value is weighted by the number of eligible groups in
    that batch. Weighting by the batch size, as before, counted batches with
    no eligible group as zeros and skewed partially eligible batches.
    """
    model.eval()
    total_hitrate = torch.zeros((), device=device)
    total_ndcg = torch.zeros((), device=device)
    total_map = torch.zeros((), device=device)
    total_loss = 0.0
    group_count = 0
    eligible_count = 0

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):
            x_num, y = x_num.to(device), y.to(device)
            # The collate function already yields a tensor; re-wrapping it with
            # torch.tensor() triggers a copy-construct UserWarning.
            if not isinstance(lengths, torch.Tensor):
                lengths = torch.tensor(lengths)
            lengths = lengths.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            ndcg = ndcg_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            map3 = map_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)

            batch_size = scores.size(0)
            eligible = int((lengths >= min_group_size).sum())

            total_hitrate += hitrate * eligible
            total_ndcg += ndcg * eligible
            total_map += map3 * eligible
            total_loss += loss.item() * batch_size

            group_count += batch_size
            eligible_count += eligible

    # Guard the denominators so an empty loader (or no eligible group) yields
    # zeros instead of a ZeroDivisionError.
    metric_denom = max(eligible_count, 1)
    mean_loss = total_loss / group_count if group_count > 0 else 0.0
    return (
        mean_loss,
        total_hitrate / metric_denom,
        total_ndcg / metric_denom,
        total_map / metric_denom,
    )


def build_cat_dims(X, cat_features):
    """Map each categorical column to its embedding-table size.

    The size is the column maximum plus one; a negative maximum is treated as
    zero, so every table has at least one row.
    """
    return {col: max(X[col].max(), 0) + 1 for col in cat_features}


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardise numeric columns using statistics of the training rows only.

    Args:
        X: DataFrame holding the columns listed in ``numeric_features``.
        numeric_features: Names of the columns to standardise.
        train_mask: Boolean array selecting the rows used to compute the mean
            and standard deviation.

    Returns:
        ``X`` with every listed column replaced by ``(value - mean) / std``.

    NaN entries are ignored when computing the statistics. Previously a single
    NaN in the training rows made the mean NaN and turned the whole column
    into NaN. A column with no usable training values, or with zero or
    non-finite spread, falls back to mean 0 / std 1.
    """
    normalized = []
    for col in numeric_features:
        values = X[col].to_numpy()
        train_vals = values[train_mask]

        # Only float arrays can hold NaN; integer arrays are used as they are.
        if np.issubdtype(train_vals.dtype, np.floating):
            observed = train_vals[~np.isnan(train_vals)]
        else:
            observed = train_vals

        if observed.size:
            mean = observed.mean()
            std = observed.std()
        else:
            mean, std = 0.0, 1.0
        if not std > 0:  # also catches NaN
            std = 1.0

        normalized.append(pl.Series(col, (values - mean) / std))

    # Replace all columns in one pass instead of rebuilding the frame per column.
    return X.with_columns(normalized) if normalized else X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Return the feature-engineered frame plus the test ``Id`` / ``ranker_id`` columns.

    If ``cache_file`` exists inside ``data_dir`` it is loaded directly.
    Otherwise train and test are read, the test rows get a constant
    ``selected`` = 0 column, both are concatenated and passed through
    ``feature_engineering``, nulls are filled (0 for numeric columns,
    ``"missing"`` for string columns) and the result is written to the cache.

    Returns:
        ``(df, test_ids, test_rankers)``.
    """
    def read_raw(file_name):
        # Raw parquet files carry an index column that is dropped on load.
        return pl.read_parquet(os.path.join(data_dir, file_name)).drop("__index_level_0__")

    cache_path = os.path.join(data_dir, cache_file)

    if not os.path.exists(cache_path):
        print("Cached file not found, processing raw data ...")

        train = read_raw(train_file)
        test = read_raw(test_file).with_columns(
            pl.lit(0, dtype=pl.Int64).alias("selected")
        )
        test_ids, test_rankers = test["Id"], test["ranker_id"]

        df = feature_engineering(pl.concat((train, test)), full=True)

        numeric_cols = df.select(pl.selectors.numeric()).columns
        string_cols = df.select(pl.selectors.string()).columns
        df = df.with_columns(
            [pl.col(c).fill_null(0) for c in numeric_cols]
            + [pl.col(c).fill_null("missing") for c in string_cols]
        )

        os.makedirs(data_dir, exist_ok=True)
        df.write_parquet(cache_path)
        print(f"Saved feature-engineered data to {cache_path}")
        return df, test_ids, test_rankers

    print(f"Loading cached feature-engineered data from {cache_path} ...")
    df = pl.read_parquet(cache_path)

    # The test file is still read to recover the submission identifiers.
    test = read_raw(test_file)
    return df, test["Id"], test["ranker_id"]


def prepare_data(data_dir, train_size=18145372):
    """Load the features, encode categoricals and standardise numerics.

    Args:
        data_dir: Directory passed to ``load_or_prepare_data``.
        train_size: Number of leading rows treated as training data when
            computing the normalisation statistics. The default is the row
            count that used to be hard-coded here.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size, test_ids,
        test_rankers)``.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    # Keep the row identifier next to the features for later prediction dumps.
    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    # Dense-rank each categorical column into consecutive integer codes
    # starting at 0; nulls become -1.
    # NOTE(review): code 0 (the smallest observed value) is also the padding
    # index used by the embeddings and the collate padding — confirm this
    # collision is intended.
    X = X.with_columns(
        [
            (pl.col(c).rank("dense") - 1).fill_null(-1).cast(pl.Int32)
            for c in cat_features
        ]
    )

    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Patience-based stopper for a metric that should increase.

    ``best`` tracks the highest metric seen so far and ``counter`` the number
    of consecutive calls that failed to beat it by more than ``min_delta``.
    """

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric`` and return True once patience is exhausted."""
        threshold = self.best + self.min_delta
        if not metric > threshold:
            # No sufficient improvement: one more strike.
            self.counter += 1
            return self.counter >= self.patience

        self.best, self.counter = metric, 0
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Plot a log-scaled histogram of the number of rows per group.

    The figure is written to ``save_path`` when one is given, otherwise it is
    shown interactively. It is closed afterwards in both cases.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Distribution of Group Sizes")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def main():
    """Train the ranker, track validation metrics, and write predictions.

    Steps, in order: load and split the data by fixed row offsets, build the
    datasets and loaders, train with AdamW + OneCycleLR while checkpointing
    and early stopping, save metric plots, reload the best checkpoint, dump
    validation predictions and write the test submission.
    """
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    best_val_loss = float("inf")
    best_hitrate = 0.0
    model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Per-epoch histories used for the plots below.
    train_losses = []
    val_losses = []
    val_hitrates = []
    val_ndcgs = []
    val_maps = []
    learning_rates = []

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    early_stopper = EarlyStopping(patience=3, min_delta=0.001)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Replace NaN with 0 in every Float64 column.
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    for col in float_cols:
        X = X.with_columns(
            pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
        )

    # Row offsets of the split: [0, n1) train, [n1, n2) validation, [n2, end) test.
    # NOTE(review): n1 is a hard-coded row index — presumably chosen to fall on
    # a ranker_id boundary; verify against the data.
    n1 = 16487352
    # n1 = 17487300
    n2 = train_size

    X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
    y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
    groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

    # Inspect group sizes in validation set
    val_rankers = groups_va["ranker_id"].to_numpy()
    val_group_counts = Counter(val_rankers)

    print("\n📋 Validation group statistics:")
    # Save detailed group info to file
    group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
    with open(group_info_path, "w") as f:
        f.write(f"Total groups: {len(val_group_counts)}\n\n")
        f.write("GroupID\tSize\tStartIdx\tEndIdx\n")

        # Row positions (relative to the validation slice) of every group.
        group_to_rows = defaultdict(list)
        for i, g in enumerate(val_rankers):
            group_to_rows[g].append(i)

        for group_id, indices in sorted(group_to_rows.items()):
            start_idx = indices[0]
            end_idx = indices[-1]
            size = len(indices)
            f.write(f"{group_id}\t{size}\t{start_idx}\t{end_idx}\n")

    print(f"📁 Saved validation group info to {group_info_path}")


    # Embedding sizes are derived from the full frame (train + val + test).
    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    # Each batch element is one whole group; only the training loader shuffles.
    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    num_epochs = 10
    # These two values are computed but not referenced again in this function.
    num_training_steps = num_epochs * len(train_loader)
    num_warmup_steps = int(0.1 * num_training_steps)

    learning_rate = 3e-4

    model = DLRankerAttention(
        cat_dims=cat_dims,
        num_numeric_feats=num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    # One-cycle schedule stepped once per batch inside train_one_epoch.
    scheduler = OneCycleLR(
        optimizer,
        max_lr=learning_rate,
        steps_per_epoch=len(train_loader),
        epochs=num_epochs,
        pct_start=0.3,
    )

    best_hitrate = 0
    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
        val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_hitrates.append(val_hitrate.item())
        val_ndcgs.append(val_ndcg.item())
        val_maps.append(val_map.item())

        # Learning rate as left by the scheduler at the end of the epoch.
        current_lr = scheduler.get_last_lr()
        learning_rates.append(current_lr[0])

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"Val Loss={val_loss:.4f}, "
            f"HitRate@3={val_hitrate:.4f}, "
            f"NDCG@3={val_ndcg:.4f}, "
            f"MAP@3={val_map:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )


        # A checkpoint is written only when BOTH the loss drops and the hit
        # rate rises relative to the last saved checkpoint.
        if val_loss < best_val_loss and val_hitrate > best_hitrate:
            best_val_loss = val_loss
            best_hitrate = val_hitrate
            torch.save(model.state_dict(), model_path)
            print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

        # Early stopping tracks the negated loss because EarlyStopping expects
        # a metric that increases.
        if early_stopper.step(-val_loss):
            print("⛔ Early stopping triggered.")
            break

    # Plot metrics
    epochs = list(range(1, len(train_losses) + 1))

    # Plot Loss Curve
    plt.figure()
    plt.plot(epochs, train_losses, label="Train Loss")
    plt.plot(epochs, val_losses, label="Val Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training & Validation Loss")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "loss_curve.png"))
    plt.close()

    # Plot HitRate@3
    plt.figure()
    plt.plot(epochs, val_hitrates, label="Val HitRate@3", color="green")
    plt.xlabel("Epoch")
    plt.ylabel("HitRate@3")
    plt.title("Validation HitRate@3")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "hitrate_curve.png"))
    plt.close()

    # Plot Learning Rate
    plt.figure()
    plt.plot(epochs, learning_rates, label="Learning Rate", color="orange")
    plt.xlabel("Epoch")
    plt.ylabel("LR")
    plt.title("Learning Rate Schedule")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "lr_curve.png"))
    plt.close()

    # Plot NDCG@3
    plt.figure()
    plt.plot(epochs, val_ndcgs, label="Val NDCG@3", color="purple")
    plt.xlabel("Epoch")
    plt.ylabel("NDCG@3")
    plt.title("Validation NDCG@3")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "ndcg_curve.png"))
    plt.close()

    # Plot MAP@3
    plt.figure()
    plt.plot(epochs, val_maps, label="Val MAP@3", color="red")
    plt.xlabel("Epoch")
    plt.ylabel("MAP@3")
    plt.title("Validation MAP@3")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "map_curve.png"))
    plt.close()

    print("📊 Saved metric plots to model/")

    # Reload the best checkpoint, then predict the validation set first and
    # the test set afterwards.
    # NOTE(review): this load fails if no epoch ever satisfied the checkpoint
    # condition above, because no file would have been written.
    model.load_state_dict(torch.load(model_path))
    model.eval()

    val_ids = X_va["Id"]
    val_rankers = groups_va["ranker_id"]
    val_labels = y_va

    all_val_scores = []

    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(val_loader, desc="Predicting validation set"):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)

            # Drop the padded tail of every group before collecting its scores.
            for i in range(scores.size(0)):
                l = lengths[i]
                all_val_scores.append(scores[i, :l].cpu().numpy())

    all_val_scores = np.concatenate(all_val_scores)

    # NOTE(review): scores are concatenated in the dataset's group order; they
    # line up with the rows of X_va only if each ranker_id occupies a
    # contiguous run of rows. Only the total length is checked here.
    assert len(all_val_scores) == X_va.shape[0], \
        f"Mismatch: {len(all_val_scores)} scores vs {X_va.shape[0]} rows"

    val_df = X_va.with_columns([
        pl.Series("Id", val_ids),
        pl.Series("ranker_id", val_rankers),
        pl.Series("score", all_val_scores),
        pl.Series("label", val_labels)
    ])

    # Within each group, rank 1 goes to the highest score.
    val_df = val_df.with_columns(
        pl.col("score")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
    val_df.write_csv(val_save_path)
    print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

    all_scores = []

    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(test_loader, desc="Predicting test set"):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)

            for i in range(scores.size(0)):
                l = lengths[i]
                all_scores.append(scores[i, :l].cpu().numpy())

    all_scores = np.concatenate(all_scores)

    # Same ordering assumption as for the validation scores above.
    assert len(all_scores) == X_te.shape[0], \
        f"Mismatch: {len(all_scores)} scores vs {X_te.shape[0]} rows in X_te"

    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": all_scores
    })

    # "selected" holds the per-group rank of each item (1 = best score).
    submission = (
        submission_df
        .with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )
        .select(["Id", "ranker_id", "score", "selected"])
    )

    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
    submission.write_csv(submission_path)
    print(f"✅ Submission saved to {submission_path}")


# Run the full train / validate / predict pipeline only when executed as a script.
if __name__ == "__main__":
    main()

Writing deeprec_validate.py


In [None]:
--- Epoch 1/10 ---
Validating:   0%|          | 0/2636 [00:00<?, ?it/s]/home/ionut/test_whisper/test2/deeprec_validate.py:572: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  lengths = torch.tensor(lengths).to(device)
Epoch 1: Train Loss=0.3535, Val Loss=0.2908, HitRate@3=0.4750, NDCG@3=0.3747, MAP@3=0.6154, LR=0.000084 (608.8s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 2/10 ---
Epoch 2: Train Loss=0.2880, Val Loss=0.2807, HitRate@3=0.4787, NDCG@3=0.3795, MAP@3=0.6216, LR=0.000228 (607.4s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 3/10 ---
Epoch 3: Train Loss=0.2791, Val Loss=0.2792, HitRate@3=0.4949, NDCG@3=0.3958, MAP@3=0.6354, LR=0.000300 (607.1s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 4/10 ---
Epoch 4: Train Loss=0.2690, Val Loss=0.2638, HitRate@3=0.4938, NDCG@3=0.3942, MAP@3=0.6413, LR=0.000285 (608.7s)

--- Epoch 5/10 ---
Epoch 5: Train Loss=0.2589, Val Loss=0.2574, HitRate@3=0.5080, NDCG@3=0.4081, MAP@3=0.6508, LR=0.000244 (608.2s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 6/10 ---
Epoch 6: Train Loss=0.2463, Val Loss=0.2520, HitRate@3=0.5059, NDCG@3=0.4104, MAP@3=0.6566, LR=0.000183 (609.7s)

--- Epoch 7/10 ---
Epoch 7: Train Loss=0.2340, Val Loss=0.2463, HitRate@3=0.5077, NDCG@3=0.4089, MAP@3=0.6525, LR=0.000117 (610.3s)

--- Epoch 8/10 ---
Epoch 8: Train Loss=0.2219, Val Loss=0.2379, HitRate@3=0.5146, NDCG@3=0.4153, MAP@3=0.6556, LR=0.000056 (609.1s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 9/10 ---
Epoch 9: Train Loss=0.2123, Val Loss=0.2381, HitRate@3=0.5138, NDCG@3=0.4137, MAP@3=0.6546, LR=0.000015 (609.5s)

--- Epoch 10/10 ---
Epoch 10: Train Loss=0.2071, Val Loss=0.2386, HitRate@3=0.5153, NDCG@3=0.4158, MAP@3=0.6595, LR=0.000000 (609.3s)

In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter
from torch.optim.lr_scheduler import OneCycleLR

# Seed every random number generator the script relies on so runs are repeatable.
seed = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

class RankDataset(Dataset):
    """Dataset whose items are whole ranking groups (all rows sharing a ``ranker_id``).

    Args:
        X: Feature frame containing the categorical and numeric columns.
        y: Label series aligned with the rows of ``X``.
        groups: Frame with a ``ranker_id`` column aligned with the rows of ``X``.
        cat_features: Names of the categorical (integer-coded) columns.
        num_features: Names of the numeric columns.
    """

    def __init__(
        self,
        X: "pl.DataFrame",
        y: "pl.Series",
        groups: "pl.DataFrame",
        cat_features: list[str],
        num_features: list[str],
    ):
        self.cat_features = cat_features
        self.num_features = num_features
        # Everything is materialised as NumPy arrays once, up front.
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # Row indices per group; groups keep their order of first appearance.
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of distinct groups."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return every row of the ``idx``-th group as tensors.

        Returns:
            ``(x_cat, x_num, y, length)``: ``x_cat`` maps each categorical
            feature to a 1-D int64 tensor of codes (negative codes are
            replaced by 0), ``x_num`` is a float32 tensor with one row per
            item, ``y`` is a flat float32 label tensor and ``length`` is the
            number of rows in the group.
        """
        rows = self.group_to_indices[self.group_keys[idx]]

        # Gather each column with a single fancy-indexing call and clamp in
        # bulk instead of boxing every element in a Python-level loop.
        x_cat = {
            name: torch.from_numpy(
                np.maximum(self.X_cat[name][rows], 0).astype(np.int64)
            )
            for name in self.cat_features
        }

        x_num = torch.as_tensor(self.X_num[rows], dtype=torch.float32)
        y = torch.as_tensor(self.y[rows], dtype=torch.float32).reshape(-1)

        return x_cat, x_num, y, len(rows)

def collate_fn(batch):
    """Pad a list of variable-length groups into dense batch tensors.

    Args:
        batch: Non-empty list of ``(x_cat, x_num, y, length)`` tuples, where
            ``x_cat`` is a dict of 1-D long tensors, ``x_num`` is a 2-D float
            tensor with one row per item, ``y`` is a 1-D float tensor and
            ``length`` is the number of items in the group.

    Returns:
        ``(padded_x_cat, padded_x_num, padded_y, lengths)``: every tensor is
        right-padded with zeros to the longest group in the batch and stacked
        along a new leading batch dimension; ``lengths`` is a long tensor with
        the true size of each group.
    """
    # Function-scope import keeps the block self-contained.
    from torch.nn.utils.rnn import pad_sequence

    cat_groups, num_groups, label_groups, lengths = zip(*batch)

    # pad_sequence zero-pads on the right and stacks, preserving each dtype.
    padded_x_cat = {
        key: pad_sequence([group[key] for group in cat_groups], batch_first=True)
        for key in cat_groups[0]
    }
    padded_x_num = pad_sequence(list(num_groups), batch_first=True)
    padded_y = pad_sequence(list(label_groups), batch_first=True)
    lengths = torch.tensor(lengths, dtype=torch.long)

    return padded_x_cat, padded_x_num, padded_y, lengths

class SEModule(nn.Module):
    """Squeeze-and-excitation gate for ``[B, N, C]`` inputs.

    The input is averaged over the item axis, passed through a bottleneck MLP
    ending in a sigmoid, and the resulting per-channel weights rescale every
    item of the group.
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel),
            nn.Sigmoid(),
        )

    def forward(self, x, padding_mask=None):
        """Gate ``x``; rows flagged in ``padding_mask`` are left out of the mean.

        Args:
            x: Tensor of shape ``[B, N, C]``.
            padding_mask: Optional ``[B, N]`` mask, truthy at padded rows.
                When omitted, all rows are averaged.
        """
        if padding_mask is None:
            pooled = x.mean(dim=1)  # [B, C]
        else:
            keep = (~padding_mask.bool()).unsqueeze(-1).to(x.dtype)  # [B, N, 1]
            # clamp avoids 0/0 for a group made only of padding
            pooled = (x * keep).sum(dim=1) / keep.sum(dim=1).clamp(min=1.0)
        w = self.fc(pooled)        # [B, C]
        return x * w.unsqueeze(1)  # [B, 1, C] => broadcasted


class FiLM(nn.Module):
    """Feature-wise linear modulation: ``gamma(cond) * x + beta(cond)``."""

    def __init__(self, feature_dim):
        super().__init__()
        self.gamma = nn.Linear(feature_dim, feature_dim)
        self.beta = nn.Linear(feature_dim, feature_dim)

    def forward(self, x, cond):
        return self.gamma(cond) * x + self.beta(cond)


class DLRankerAttention(nn.Module):
    """Listwise ranker that fuses categorical and numeric item features.

    Categorical codes are embedded and projected, numeric features are
    projected, each stream goes through its own one-layer transformer, the
    streams exchange information via FiLM and two cross-attention blocks, and
    the concatenated result passes through a joint transformer stack and an
    MLP that emits one score per item.

    Args:
        cat_dims: Mapping from categorical feature name to its number of
            embedding rows.
        num_numeric_feats: Width of the numeric feature vector.
        emb_dropout: Dropout applied to the concatenated embeddings.
        dropout: Dropout used inside transformers, attention and MLPs.
        num_heads: Attention heads for every attention module.
        num_layers: Number of joint transformer layers.
        dim_feedforward: Feed-forward width of every transformer layer.
        proj_dim: Width of each stream after projection.
    """

    def __init__(
        self,
        cat_dims,
        num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64,
    ):
        super().__init__()

        # 1. Categorical embedding layers (width grows with log2 of the
        #    cardinality, bounded to [4, 50])
        self.emb_dims = {
            f: min(50, max(4, round(6 * math.log2(cat_dims[f] + 1)))) for f in cat_dims
        }
        self.embeddings = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], self.emb_dims[f], padding_idx=0)
            for f in cat_dims
        })
        self.emb_dropout = nn.Dropout(emb_dropout)
        total_emb_dim = sum(self.emb_dims.values())

        # 2. Category & numeric projections
        self.cat_proj = nn.Linear(total_emb_dim, proj_dim)
        self.num_proj_in = nn.Linear(num_numeric_feats, proj_dim)

        # 3. Per-type Transformer encoding
        self.cat_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=num_heads,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=1,
        )
        self.num_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=num_heads,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=1,
        )

        # 4. Numeric feature squeeze-excite & MLP
        self.num_se = SEModule(proj_dim)
        self.num_proj_out = nn.Sequential(
            nn.Linear(proj_dim, proj_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
        )

        # 5. Cross-attention: cat → num and num → cat
        self.cross_attn_cat2num = nn.MultiheadAttention(
            embed_dim=proj_dim, num_heads=num_heads, dropout=dropout, batch_first=True
        )
        self.cross_attn_num2cat = nn.MultiheadAttention(
            embed_dim=proj_dim, num_heads=num_heads, dropout=dropout, batch_first=True
        )

        # 6. FiLM layer (condition cat on num)
        self.film = FiLM(proj_dim)

        # 7. Joint Transformer stack (after fusion)
        self.transformer_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=2 * proj_dim,
                nhead=num_heads,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            )
            for _ in range(num_layers)
        ])

        # 8. Final MLP
        self.mlp = nn.Sequential(
            nn.Linear(2 * proj_dim, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(64, 1),
        )

        print(f"✅ DLRankerAttention initialized | Cat emb dim: {total_emb_dim} → {proj_dim}, Num feats: {num_numeric_feats}")

    def forward(self, x_cat: dict, x_num: torch.Tensor, attn_mask=None):
        """Score every item of every group.

        Args:
            x_cat: Mapping from feature name to a ``[B, N]`` integer tensor.
            x_num: ``[B, N, num_feats]`` float tensor.
            attn_mask: Optional ``[B, N]`` key-padding mask, ``True`` at
                padded items. When given it is applied to every attention
                module and to the squeeze-excite pooling, so scores of real
                items do not depend on padding content. ``None`` keeps the
                unmasked behaviour.

        Returns:
            ``[B, N]`` tensor of scores (values at padded items are
            meaningless and should be sliced away by the caller).

        Raises:
            ValueError: If ``x_num`` is not 3-dimensional.
        """
        if x_num.dim() != 3:
            raise ValueError(
                f"x_num must have shape [B, N, num_feats], got {tuple(x_num.shape)}"
            )

        # 1. Embed and encode categorical inputs
        cat_embs = torch.cat([self.embeddings[f](x_cat[f]) for f in x_cat], dim=-1)
        cat_feat = self.cat_proj(self.emb_dropout(cat_embs))  # [B, N, D]
        cat_feat = self.cat_transformer(cat_feat, src_key_padding_mask=attn_mask)

        # 2. Encode numeric features
        num_feat = self.num_proj_in(x_num)
        num_feat = self.num_transformer(num_feat, src_key_padding_mask=attn_mask)
        num_feat = self.num_se(num_feat, attn_mask)
        num_feat = self.num_proj_out(num_feat)

        # 3. FiLM conditioning: cat_feat conditioned on num_feat
        cat_feat = self.film(cat_feat, num_feat)

        # 4. Dual cross-attention
        # cat → num
        cat_attn, _ = self.cross_attn_cat2num(
            query=cat_feat, key=num_feat, value=num_feat, key_padding_mask=attn_mask
        )
        cat_feat = cat_feat + cat_attn  # residual
        # num → cat
        num_attn, _ = self.cross_attn_num2cat(
            query=num_feat, key=cat_feat, value=cat_feat, key_padding_mask=attn_mask
        )
        num_feat = num_feat + num_attn  # residual

        # 5. Fusion & joint transformer
        x = torch.cat([cat_feat, num_feat], dim=-1)
        for layer in self.transformer_layers:
            x = layer(x, src_key_padding_mask=attn_mask)

        # 6. Final MLP score
        return self.mlp(x).squeeze(-1)  # [B, N]




"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Fraction of eligible groups with a positive label among the top-k scores.

    Only the first ``lengths[g]`` entries of each row are considered, and
    groups shorter than ``min_group_size`` are ignored. Returns a zero tensor
    on ``scores.device`` when no group is eligible.
    """
    n_hits = 0
    n_groups = 0
    for g in range(scores.size(0)):
        size = lengths[g]
        if size >= min_group_size:
            group_scores = scores[g][:size]
            group_labels = labels[g][:size]
            _, best = torch.topk(group_scores, min(size, k))
            n_hits += torch.any(group_labels[best] > 0).float()
            n_groups += 1
    if n_groups > 0:
        return n_hits / n_groups
    return torch.tensor(0.0, device=scores.device)


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@k over groups with at least ``min_group_size`` items.

    Labels are used directly as gains and the discount is ``log2(rank + 1)``.
    A group whose ideal DCG is 0 contributes an NDCG of 0.

    Args:
        scores: ``[B, N]`` predicted scores (padded).
        labels: ``[B, N]`` relevance labels (padded).
        lengths: Per-group number of real items.
        k: Cut-off rank.
        min_group_size: Groups shorter than this are skipped.

    Returns:
        A 0-d tensor on ``scores.device``. The result is always a tensor,
        including when no group is eligible or every ideal DCG is 0, so
        callers may rely on ``.item()``.
    """
    total_ndcg = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        size = int(lengths[i])
        if size < min_group_size:
            continue

        s = scores[i, :size]
        y = labels[i, :size]
        top_n = min(k, size)

        # Same discount vector serves both the predicted and ideal ordering.
        discount = torch.log2(torch.arange(2, 2 + top_n, device=y.device).float())
        _, idx_pred = torch.topk(s, top_n)
        ideal_gains, _ = torch.topk(y, top_n)

        idcg = (ideal_gains / discount).sum()
        if idcg > 0:
            total_ndcg = total_ndcg + (y[idx_pred] / discount).sum() / idcg
        valid_count += 1

    return total_ndcg / valid_count if valid_count > 0 else total_ndcg


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision at ``k`` over groups with enough items.

    For each eligible group the average precision is the mean of
    precision@j taken at every rank ``j <= k`` whose item has a positive
    label. A group with no positive item in its top-k counts as AP = 0;
    it is no longer dropped from the average, which used to inflate the
    metric.

    Args:
        scores: ``[B, N]`` predicted scores (padded).
        labels: ``[B, N]`` relevance labels (padded); values > 0 are hits.
        lengths: Per-group number of real items.
        k: Cut-off rank.
        min_group_size: Groups shorter than this are skipped.

    Returns:
        A 0-d tensor on ``scores.device`` (0 when no group is eligible).
    """
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        size = int(lengths[i])
        if size < min_group_size:
            continue

        s = scores[i, :size]
        y = labels[i, :size]

        _, idx_pred = torch.topk(s, min(k, size))
        hits = (y[idx_pred] > 0).float()
        valid_count += 1  # counted even when AP is 0

        n_hits = hits.sum()
        if n_hits > 0:
            ranks = torch.arange(1, hits.numel() + 1, device=hits.device)
            precision_at_rank = hits.cumsum(0) / ranks
            total_ap = total_ap + (precision_at_rank * hits).sum() / n_hits

    return total_ap / valid_count if valid_count > 0 else total_ap



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise RankNet loss, summed per group and averaged over the batch.

    For every ordered pair ``(i, j)`` inside a group with ``y_i > y_j`` the
    term ``-logsigmoid(s_i - s_j)`` is added; the per-group sums are divided
    by the number of groups in the batch.
    """
    n_groups = scores.size(0)
    accumulated = 0.0

    for g in range(n_groups):
        size = lengths[g]
        group_scores = scores[g][:size]
        group_labels = labels[g][:size]

        # margin[i, j] = s_i - s_j ; preferred[i, j] = 1.0 where y_i > y_j
        margin = group_scores.unsqueeze(1) - group_scores.unsqueeze(0)
        preferred = (group_labels.unsqueeze(1) > group_labels.unsqueeze(0)).float()

        per_pair = -preferred * torch.nn.functional.logsigmoid(margin)
        accumulated += per_pair.sum()

    return accumulated / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """
    Weighted pairwise logistic loss restricted to pairs touching the top-K labels.

    A pair ``(i, j)`` is used when ``y_i > y_j`` and at least one of the two
    items is among the ``focus_topk`` highest labels of its group. Each pair
    is weighted by ``|y_i - y_j|``; the score gap is reduced by ``margin``
    (when positive) and divided by ``temperature``. The result is the
    weighted sum over the batch divided by the total weight, or a zero
    tensor with ``requires_grad=True`` when no pair qualifies.
    """
    loss_sum = 0.0
    weight_sum = 0

    for g in range(scores.size(0)):
        size = lengths[g]
        if size < 2:
            continue

        group_scores = scores[g][:size]
        group_labels = labels[g][:size]

        # Flag the items holding the K largest ground-truth labels
        _, top_idx = torch.topk(group_labels, min(focus_topk, size))
        in_top = torch.zeros_like(group_labels, dtype=torch.bool)
        in_top[top_idx] = True

        label_gap = group_labels.unsqueeze(1) - group_labels.unsqueeze(0)
        eligible = (label_gap > 0) & (in_top.unsqueeze(1) | in_top.unsqueeze(0))
        if eligible.sum() == 0:
            continue

        weights = (label_gap.abs() * eligible).float()

        score_gap = group_scores.unsqueeze(1) - group_scores.unsqueeze(0)
        if margin > 0.0:
            score_gap = score_gap - margin
        score_gap = score_gap / temperature

        loss_sum += (-F.logsigmoid(score_gap) * weights).sum()
        weight_sum += weights.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum


def bpr_loss(scores, labels, lengths):
    """Bayesian personalised ranking loss averaged over usable groups.

    Within each group every positive item (label > 0) is paired with every
    non-positive item and ``-logsigmoid(s_pos - s_neg)`` is averaged over
    those pairs. Groups lacking either kind of item are skipped; if no group
    is usable a zero tensor on ``scores.device`` is returned.
    """
    loss_sum = 0.0
    n_used = 0

    for g in range(scores.size(0)):
        size = lengths[g]
        group_scores = scores[g][:size]
        group_labels = labels[g][:size]

        winners = group_scores[group_labels > 0]
        losers = group_scores[group_labels <= 0]
        if winners.numel() == 0 or losers.numel() == 0:
            continue

        gaps = winners.unsqueeze(1) - losers.unsqueeze(0)
        loss_sum += -F.logsigmoid(gaps).mean()
        n_used += 1

    if n_used > 0:
        return loss_sum / n_used
    return torch.tensor(0.0, device=scores.device)


"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Each batch is scored by ``model`` and trained on
    ``0.8 * xranknet_loss + 0.2 * bpr_loss``. The optimizer and the scheduler
    are both stepped once per batch.
    """
    model.train()
    running_loss = 0.0

    progress = tqdm(loader, desc="Training", leave=False)
    for x_cat, x_num, y, lengths in progress:
        # Move the whole batch to the target device
        x_num = x_num.to(device)
        y = y.to(device)
        if not isinstance(lengths, torch.Tensor):
            lengths = torch.tensor(lengths)
        lengths = lengths.detach().clone().to(device)
        x_cat = {name: codes.to(device) for name, codes in x_cat.items()}

        optimizer.zero_grad()
        scores = model(x_cat, x_num)
        pairwise_term = xranknet_loss(
            scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3
        )
        bpr_term = bpr_loss(scores, y, lengths)
        loss = 0.8 * pairwise_term + 0.2 * bpr_term
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()

    return running_loss / len(loader)


def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    The loss is ``0.8 * xranknet_loss + 0.2 * bpr_loss`` per batch, averaged
    over all groups (each batch weighted by its number of groups).

    HitRate@3, NDCG@3 and MAP@3 are computed one group at a time and averaged
    over the groups that have at least ``min_group_size`` items. Previously
    each batch-level mean (taken over eligible groups only) was weighted by
    the full batch size, so ineligible groups dragged the reported metrics
    towards zero.

    Args:
        model: Ranker called as ``model(x_cat, x_num)``.
        loader: Iterable of ``(x_cat, x_num, y, lengths)`` batches.
        device: Device the batch tensors are moved to.
        min_group_size: Minimum group size for a group to enter the metrics.

    Returns:
        ``(loss, hitrate, ndcg, map)`` where ``loss`` is a float and the three
        metrics are 0-d tensors (all zero when no group is eligible).

    Raises:
        ZeroDivisionError: If ``loader`` yields no batches.
    """
    model.eval()
    total_hitrate = torch.zeros((), device=device)
    total_ndcg = torch.zeros((), device=device)
    total_map = torch.zeros((), device=device)
    total_loss = 0.0
    count = 0
    valid_groups = 0

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):
            x_num, y = x_num.to(device), y.to(device)
            # as_tensor passes an existing tensor through untouched, avoiding
            # the copy-construct UserWarning raised by torch.tensor(tensor).
            lengths = torch.as_tensor(lengths).to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)

            loss_xrank = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)
            loss_bpr = bpr_loss(scores, y, lengths)
            loss = 0.8 * loss_xrank + 0.2 * loss_bpr

            batch_size = scores.size(0)
            total_loss += loss.item() * batch_size
            count += batch_size

            # Score each eligible group on its own so every group carries
            # exactly the same weight in the final average.
            for i in range(batch_size):
                if lengths[i] < min_group_size:
                    continue
                group = (scores[i:i + 1], y[i:i + 1], lengths[i:i + 1])
                total_hitrate += hitrate_at_k(*group, k=3, min_group_size=min_group_size)
                total_ndcg += ndcg_at_k(*group, k=3, min_group_size=min_group_size)
                total_map += map_at_k(*group, k=3, min_group_size=min_group_size)
                valid_groups += 1

    denom = max(valid_groups, 1)
    return total_loss / count, total_hitrate / denom, total_ndcg / denom, total_map / denom


def build_cat_dims(X, cat_features):
    """Map each categorical column to ``max(code, 0) + 1``.

    The column maximum is floored at 0, so a column whose codes are all
    negative yields a dimension of 1.
    """
    return {name: max(X[name].max(), 0) + 1 for name in cat_features}


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardise numeric columns using statistics of the training rows.

    The mean and (population) standard deviation are computed with the
    NaN-ignoring numpy reductions, so a few NaN entries no longer turn the
    whole column into NaN; NaN entries themselves stay NaN in the output. A
    zero or undefined standard deviation falls back to 1.0 and an undefined
    mean to 0.0.

    Args:
        X: polars DataFrame containing ``numeric_features``.
        numeric_features: Names of the columns to standardise.
        train_mask: Boolean numpy mask selecting the rows used to fit the
            statistics.

    Returns:
        A DataFrame with the listed columns replaced by their standardised
        values.
    """
    normalized = []
    for col in numeric_features:
        values = X[col].to_numpy()
        train_vals = values[train_mask]

        mean = np.nanmean(train_vals)
        if np.isnan(mean):
            mean = 0.0
        std = np.nanstd(train_vals)
        if not std > 0:  # also true for NaN
            std = 1.0

        normalized.append(pl.Series(col, (values - mean) / std))

    # Replace all columns in a single pass instead of rebuilding the frame
    # once per column.
    return X.with_columns(normalized) if normalized else X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Return the feature-engineered frame plus the test ids and ranker ids.

    If ``cache_file`` exists under ``data_dir`` it is loaded as is. Otherwise
    the raw train and test parquet files are concatenated (test rows get
    ``selected = 0``), run through ``feature_engineering``, have nulls filled
    (0 for numeric columns, ``"missing"`` for string columns) and the result
    is written to the cache path. In both cases the ``Id`` and ``ranker_id``
    columns are read from the raw test file.
    """
    cache_path = os.path.join(data_dir, cache_file)
    test_path = os.path.join(data_dir, test_file)

    def read_raw(path):
        # Raw files carry a leftover index column that is dropped on load.
        return pl.read_parquet(path).drop("__index_level_0__")

    if not os.path.exists(cache_path):
        print("Cached file not found, processing raw data ...")

        train = read_raw(os.path.join(data_dir, train_file))
        test = read_raw(test_path).with_columns(
            pl.lit(0, dtype=pl.Int64).alias("selected")
        )
        test_ids = test["Id"]
        test_rankers = test["ranker_id"]

        df = feature_engineering(pl.concat((train, test)), full=True)

        numeric_cols = df.select(pl.selectors.numeric()).columns
        string_cols = df.select(pl.selectors.string()).columns
        null_fills = [pl.col(c).fill_null(0) for c in numeric_cols]
        null_fills += [pl.col(c).fill_null("missing") for c in string_cols]
        df = df.with_columns(null_fills)

        os.makedirs(data_dir, exist_ok=True)
        df.write_parquet(cache_path)
        print(f"Saved feature-engineered data to {cache_path}")
        return df, test_ids, test_rankers

    print(f"Loading cached feature-engineered data from {cache_path} ...")
    df = pl.read_parquet(cache_path)
    test = read_raw(test_path)
    return df, test["Id"], test["ranker_id"]


def prepare_data(data_dir, train_size=18145372):
    """Load the data and turn it into model-ready features.

    Categorical columns are replaced by zero-based dense ranks (nulls become
    -1) cast to Int32, and numeric columns are standardised with statistics
    taken from the first ``train_size`` rows.

    Args:
        data_dir: Directory passed on to ``load_or_prepare_data``.
        train_size: Number of leading rows treated as training data. The
            default is the value that used to be hard-coded here.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size, test_ids,
        test_rankers)``.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    # Re-attach the row ids, truncated to the number of rows in X.
    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    X = X.with_columns(
        [
            (pl.col(c).rank("dense") - 1).fill_null(-1).cast(pl.Int32)
            for c in cat_features
        ]
    )

    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Signal a stop after ``patience`` consecutive steps without improvement.

    A step counts as an improvement when the metric exceeds the best value
    seen so far by more than ``min_delta`` (higher is better).
    """

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric`` and return ``True`` once patience is exhausted."""
        improved = metric > self.best + self.min_delta
        if improved:
            self.best, self.counter = metric, 0
        else:
            self.counter += 1
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scale histogram of the dataset's group sizes.

    The figure is written to ``save_path`` when one is given and shown
    interactively otherwise; it is closed in either case.
    """
    group_sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(group_sizes, bins=100, log=True)
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Distribution of Group Sizes")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def _predict_row_scores(model, dataset, loader, device, desc):
    """Score every row of ``dataset`` and return the scores in row order.

    ``loader`` must iterate ``dataset`` without shuffling. Each group's scores
    are written back to the row indices recorded in
    ``dataset.group_to_indices``, so the result lines up with the source rows
    even when the rows of one group are not adjacent.
    """
    row_scores = np.empty(len(dataset.groups), dtype=np.float32)
    group_keys = iter(dataset.group_keys)
    n_scored = 0

    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(loader, desc=desc):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num).cpu().numpy()

            for group_scores, n_items in zip(scores, lengths.tolist()):
                rows = dataset.group_to_indices[next(group_keys)]
                row_scores[rows] = group_scores[:n_items]
                n_scored += n_items

    if n_scored != len(row_scores):
        raise RuntimeError(f"Mismatch: {n_scored} scores vs {len(row_scores)} rows")
    return row_scores


def _save_line_plot(epochs, curves, ylabel, title, path):
    """Plot per-epoch curves and save the figure to ``path``.

    ``curves`` is a sequence of ``(values, label, color)`` tuples; a ``None``
    color leaves the choice to matplotlib.
    """
    plt.figure()
    for values, label, color in curves:
        plt.plot(epochs, values, label=label, color=color)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.savefig(path)
    plt.close()


def _rank_within_group(frame):
    """Add ``selected``: the 1-based rank of ``score`` inside each ``ranker_id``."""
    return frame.with_columns(
        pl.col("score")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )


def main():
    """Train the ranker, log validation metrics and write predictions.

    Steps: load and split the data, train for up to 10 epochs with a
    one-cycle learning-rate schedule and early stopping on validation loss,
    checkpoint whenever validation loss and HitRate@3 both improve, plot the
    metric curves, then score the validation and test rows and write both
    prediction files.
    """
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    best_val_loss = float("inf")
    best_hitrate = 0.0
    checkpoint_saved = False
    model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_losses = []
    val_losses = []
    val_hitrates = []
    val_ndcgs = []
    val_maps = []
    learning_rates = []

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    early_stopper = EarlyStopping(patience=3, min_delta=0.001)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Fill missing values: NaN in Float64 columns becomes 0
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    if float_cols:
        X = X.with_columns([
            pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
            for col in float_cols
        ])

    n1 = 16487352
    # n1 = 17487300
    n2 = train_size

    X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
    y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
    groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

    # Inspect group sizes in validation set
    val_group_ids = groups_va["ranker_id"].to_numpy()
    group_to_rows = defaultdict(list)
    for i, g in enumerate(val_group_ids):
        group_to_rows[g].append(i)

    print("\n📋 Validation group statistics:")
    # Save detailed group info to file
    group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
    with open(group_info_path, "w") as f:
        f.write(f"Total groups: {len(group_to_rows)}\n\n")
        f.write("GroupID\tSize\tStartIdx\tEndIdx\n")
        for group_id, indices in sorted(group_to_rows.items()):
            f.write(f"{group_id}\t{len(indices)}\t{indices[0]}\t{indices[-1]}\n")

    print(f"📁 Saved validation group info to {group_info_path}")

    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    num_epochs = 10
    learning_rate = 3e-4

    model = DLRankerAttention(
        cat_dims=cat_dims,
        num_numeric_feats=num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    scheduler = OneCycleLR(
        optimizer,
        max_lr=learning_rate,
        steps_per_epoch=len(train_loader),
        epochs=num_epochs,
        pct_start=0.3,
    )

    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
        val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)
        # float() accepts both 0-d tensors and plain numbers
        val_hitrate, val_ndcg, val_map = float(val_hitrate), float(val_ndcg), float(val_map)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_hitrates.append(val_hitrate)
        val_ndcgs.append(val_ndcg)
        val_maps.append(val_map)

        current_lr = scheduler.get_last_lr()
        learning_rates.append(current_lr[0])

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"Val Loss={val_loss:.4f}, "
            f"HitRate@3={val_hitrate:.4f}, "
            f"NDCG@3={val_ndcg:.4f}, "
            f"MAP@3={val_map:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )

        if val_loss < best_val_loss and val_hitrate > best_hitrate:
            best_val_loss = val_loss
            best_hitrate = val_hitrate
            torch.save(model.state_dict(), model_path)
            checkpoint_saved = True
            print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

        if early_stopper.step(-val_loss):
            print("⛔ Early stopping triggered.")
            break

    # Plot metrics
    epochs = list(range(1, len(train_losses) + 1))
    _save_line_plot(
        epochs,
        [(train_losses, "Train Loss", None), (val_losses, "Val Loss", None)],
        "Loss",
        "Training & Validation Loss",
        os.path.join(MODEL_DIR, "loss_curve.png"),
    )
    _save_line_plot(
        epochs,
        [(val_hitrates, "Val HitRate@3", "green")],
        "HitRate@3",
        "Validation HitRate@3",
        os.path.join(MODEL_DIR, "hitrate_curve.png"),
    )
    _save_line_plot(
        epochs,
        [(learning_rates, "Learning Rate", "orange")],
        "LR",
        "Learning Rate Schedule",
        os.path.join(MODEL_DIR, "lr_curve.png"),
    )
    _save_line_plot(
        epochs,
        [(val_ndcgs, "Val NDCG@3", "purple")],
        "NDCG@3",
        "Validation NDCG@3",
        os.path.join(MODEL_DIR, "ndcg_curve.png"),
    )
    _save_line_plot(
        epochs,
        [(val_maps, "Val MAP@3", "red")],
        "MAP@3",
        "Validation MAP@3",
        os.path.join(MODEL_DIR, "map_curve.png"),
    )

    print("📊 Saved metric plots to model/")

    # Restore the best checkpoint of THIS run. A file left over from an
    # earlier run is deliberately not loaded when nothing was saved now.
    if checkpoint_saved:
        model.load_state_dict(torch.load(model_path, map_location=device))
    else:
        print("⚠️ No checkpoint was saved during training; using last-epoch weights.")
    model.eval()

    # Predict validation set
    all_val_scores = _predict_row_scores(
        model, val_dataset, val_loader, device, "Predicting validation set"
    )
    if len(all_val_scores) != X_va.shape[0]:
        raise RuntimeError(f"Mismatch: {len(all_val_scores)} scores vs {X_va.shape[0]} rows")

    val_df = X_va.with_columns([
        pl.Series("Id", X_va["Id"]),
        pl.Series("ranker_id", groups_va["ranker_id"]),
        pl.Series("score", all_val_scores),
        pl.Series("label", y_va)
    ])
    val_df = _rank_within_group(val_df)

    val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
    val_df.write_csv(val_save_path)
    print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

    # Predict test set
    all_scores = _predict_row_scores(
        model, test_dataset, test_loader, device, "Predicting test set"
    )
    if len(all_scores) != X_te.shape[0]:
        raise RuntimeError(f"Mismatch: {len(all_scores)} scores vs {X_te.shape[0]} rows in X_te")

    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": all_scores
    })
    submission = _rank_within_group(submission_df).select(
        ["Id", "ranker_id", "score", "selected"]
    )

    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
    submission.write_csv(submission_path)
    print(f"✅ Submission saved to {submission_path}")


if __name__ == "__main__":
    main()

Overwriting deeprec_validate.py


In [None]:
--- Epoch 1/10 ---
Validating:   0%|          | 0/5271 [00:00<?, ?it/s]/home/ionut/test_whisper/test2/deeprec_validate.py:548: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  lengths = torch.tensor(lengths).to(device)
Epoch 1: Train Loss=0.3237, Val Loss=0.2817, HitRate@3=0.4741, NDCG@3=0.3777, MAP@3=0.4713, LR=0.000084 (704.4s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 2/10 ---
Epoch 2: Train Loss=0.2787, Val Loss=0.2757, HitRate@3=0.4745, NDCG@3=0.3811, MAP@3=0.4802, LR=0.000228 (704.8s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 3/10 ---
Epoch 3: Train Loss=0.2752, Val Loss=0.2666, HitRate@3=0.4797, NDCG@3=0.3843, MAP@3=0.4793, LR=0.000300 (704.7s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 4/10 ---
Epoch 4: Train Loss=0.2670, Val Loss=0.2687, HitRate@3=0.4779, NDCG@3=0.3859, MAP@3=0.4843, LR=0.000285 (704.3s)

--- Epoch 5/10 ---
Epoch 5: Train Loss=0.2574, Val Loss=0.2619, HitRate@3=0.4943, NDCG@3=0.3993, MAP@3=0.4953, LR=0.000244 (705.1s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 6/10 ---
Epoch 6: Train Loss=0.2471, Val Loss=0.2546, HitRate@3=0.4929, NDCG@3=0.3953, MAP@3=0.4882, LR=0.000183 (705.4s)

--- Epoch 7/10 ---
Epoch 7: Train Loss=0.2375, Val Loss=0.2474, HitRate@3=0.5009, NDCG@3=0.4054, MAP@3=0.5024, LR=0.000117 (704.9s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 8/10 ---
Epoch 8: Train Loss=0.2256, Val Loss=0.2428, HitRate@3=0.5071, NDCG@3=0.4120, MAP@3=0.5097, LR=0.000056 (705.4s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 9/10 ---
Epoch 9: Train Loss=0.2163, Val Loss=0.2407, HitRate@3=0.5057, NDCG@3=0.4102, MAP@3=0.5077, LR=0.000015 (705.6s)

--- Epoch 10/10 ---
Epoch 10: Train Loss=0.2115, Val Loss=0.2413, HitRate@3=0.5083, NDCG@3=0.4108, MAP@3=0.5073, LR=0.000000 (706.1s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter
from torch.optim.lr_scheduler import OneCycleLR

# Fix every random number generator used by the script for reproducibility.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
if torch.cuda.is_available():
    # Seed all visible GPUs as well
    torch.cuda.manual_seed_all(seed)

class RankDataset(Dataset):
    """Dataset yielding one ranking group (all rows of a ``ranker_id``) per item.

    Feature columns are converted to numpy once at construction; the row
    indices of every group are recorded in order of first appearance.
    """

    def __init__(
        self,
        X: "pl.DataFrame",
        y: "pl.Series",
        groups: "pl.DataFrame",
        cat_features: list[str],
        num_features: list[str],
    ):
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # group id -> row positions, keeping the order groups first appear in
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of groups."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return all rows of the ``idx``-th group as tensors.

        Returns:
            A tuple ``(x_cat, x_num, y, length)``: a dict mapping every
            categorical feature name to a 1-D int64 tensor of codes, a float
            tensor holding the group's numeric rows, a flat float tensor of
            labels, and the number of rows in the group.
        """
        rows = self.group_to_indices[self.group_keys[idx]]

        # Gather and clamp each categorical column in one vectorised step
        # rather than looping over rows in Python; negative codes become 0,
        # exactly as before.
        # NOTE(review): 0 is also a valid non-negative code, so clamped
        # negatives are indistinguishable from it - confirm this is intended.
        x_cat = {
            c: torch.as_tensor(self.X_cat[c][rows], dtype=torch.long).clamp(min=0)
            for c in self.cat_features
        }

        x_num = torch.FloatTensor(self.X_num[rows])
        y = torch.FloatTensor(self.y[rows]).reshape(-1)

        return x_cat, x_num, y, len(rows)

def collate_fn(batch):
    """Pad a list of variable-length groups into dense batch tensors.

    Args:
        batch: Non-empty list of ``(x_cat, x_num, y, length)`` tuples, where
            ``x_cat`` maps feature names to 1-D tensors, ``x_num`` is a 2-D
            tensor with one row per item and ``y`` is a 1-D tensor.

    Returns:
        ``(padded_x_cat, padded_x_num, padded_y, lengths)``. Every tensor is
        zero-padded along the item axis up to the longest group in the batch;
        ``lengths`` is an int64 tensor with the unpadded group sizes.

    Raises:
        ValueError: If ``batch`` is empty.
    """
    from torch.nn.utils.rnn import pad_sequence

    if not batch:
        raise ValueError("collate_fn received an empty batch")

    cat_parts, num_parts, label_parts, lengths = zip(*batch)

    # pad_sequence right-pads with zeros, which is what the manual
    # torch.cat(..., torch.zeros(...)) padding did.
    padded_x_cat = {
        key: pad_sequence([part[key] for part in cat_parts], batch_first=True)
        for key in cat_parts[0]
    }
    padded_x_num = pad_sequence(list(num_parts), batch_first=True)
    padded_y = pad_sequence(list(label_parts), batch_first=True)
    lengths = torch.tensor(lengths, dtype=torch.long)

    return padded_x_cat, padded_x_num, padded_y, lengths

class SEModule(nn.Module):
    """Squeeze-and-excitation gate over the channel axis of a ``[B, N, C]`` tensor.

    The input is averaged over the ``N`` axis, passed through a two-layer
    bottleneck ending in a sigmoid, and the resulting per-channel weights
    rescale every position of the input.
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel),
            nn.Sigmoid(),
        )

    def forward(self, x, mask=None):
        """Gate ``x`` channel-wise.

        Args:
            x: tensor of shape ``[B, N, C]``.
            mask: optional bool tensor ``[B, N]``, ``True`` at padded
                positions. When given, padded positions are excluded from the
                pooled mean; when omitted the plain mean over ``N`` is used.
        """
        if mask is None:
            pooled = x.mean(dim=1)
        else:
            pad = mask.unsqueeze(-1)
            # clamp guards against a row that is entirely padding
            n_valid = (~pad).sum(dim=1).clamp(min=1).to(x.dtype)
            pooled = x.masked_fill(pad, 0.0).sum(dim=1) / n_valid
        w = self.fc(pooled).unsqueeze(1)  # [B, 1, C]
        return x * w


class FiLM(nn.Module):
    """Feature-wise linear modulation: ``gamma(cond) * x + beta(cond)``."""

    def __init__(self, feature_dim):
        super().__init__()
        self.gamma = nn.Linear(feature_dim, feature_dim)
        self.beta = nn.Linear(feature_dim, feature_dim)

    def forward(self, x, cond):
        """Scale and shift ``x`` with two linear projections of ``cond``."""
        gamma = self.gamma(cond)
        beta = self.beta(cond)
        return gamma * x + beta


class DLRankerAttention(nn.Module):
    """Two-branch attention ranker producing one score per item of a group.

    Categorical features are embedded, projected and encoded by one
    transformer layer; numeric features are projected, encoded, SE-gated and
    re-projected. The categorical branch is FiLM-conditioned on the numeric
    branch and cross-attends to it, both branches are concatenated, run
    through ``num_layers`` transformer layers and scored by an MLP.

    Args:
        cat_dims: mapping of categorical feature name to its number of
            embedding rows. Index 0 is the embedding padding index.
        num_numeric_feats: width of the numeric feature vector.
        emb_dropout: dropout applied to the concatenated embeddings.
        dropout: dropout used in the transformer layers, cross-attention and MLP.
        num_heads: attention heads of the fused transformer layers. The two
            branch encoders and the cross-attention always use 4 heads.
        num_layers: number of fused transformer layers.
        dim_feedforward: feed-forward width of every transformer layer.
        proj_dim: width of each branch; the fused stack runs at ``2 * proj_dim``.
    """

    def __init__(
        self,
        cat_dims,
        num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64,
    ):
        super().__init__()

        # Embedding width grows with log2 of the vocabulary, clipped to [4, 50].
        self.emb_dims = {
            f: min(50, max(4, round(6 * math.log2(cat_dims[f] + 1)))) for f in cat_dims
        }
        self.embeddings = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], self.emb_dims[f], padding_idx=0)
            for f in cat_dims
        })
        self.emb_dropout = nn.Dropout(emb_dropout)

        total_emb_dim = sum(self.emb_dims.values())
        print(f"Categorial emb dim sum: {total_emb_dim}, Numerical dim: {num_numeric_feats}")

        self.cat_proj = nn.Linear(total_emb_dim, proj_dim)
        self.cat_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=4,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=1,
        )

        self.num_proj_in = nn.Linear(num_numeric_feats, proj_dim)
        self.num_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=4,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=1,
        )
        self.num_se = SEModule(proj_dim)
        self.num_proj_out = nn.Sequential(
            nn.Linear(proj_dim, proj_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
        )

        # Cross-attention: categorical branch (query) attends to numeric branch.
        self.cross_attn_cat2num = nn.MultiheadAttention(
            embed_dim=proj_dim,
            num_heads=4,
            dropout=dropout,
            batch_first=True,
        )

        # FiLM conditions the categorical branch on the numeric branch.
        self.film = FiLM(proj_dim)

        self.transformer_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=2 * proj_dim,
                nhead=num_heads,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            )
            for _ in range(num_layers)
        ])

        self.mlp = nn.Sequential(
            nn.Linear(2 * proj_dim, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(64, 1),
        )

    def forward(self, x_cat: dict, x_num: torch.Tensor, attn_mask=None):
        """Score every item of every group in the batch.

        Args:
            x_cat: mapping of categorical feature name to an integer tensor
                ``[B, N]``; must contain every feature given in ``cat_dims``.
            x_num: numeric features, ``[B, N, F_num]``.
            attn_mask: optional bool tensor ``[B, N]``, ``True`` at padded
                positions. When given it is applied to both branch encoders,
                the SE pooling, the cross-attention keys and the fused layers,
                so padded rows cannot influence real items. When omitted, no
                masking is done anywhere.

        Returns:
            Tensor ``[B, N]`` of scores. Values at padded positions carry no
            meaning and must be ignored by the caller.
        """
        # Iterate the module's own feature order so the concatenation always
        # lines up with ``cat_proj``, whatever the key order of ``x_cat`` is.
        cat_embs = torch.cat(
            [emb(x_cat[f]) for f, emb in self.embeddings.items()], dim=-1
        )
        cat_embs = self.emb_dropout(cat_embs)
        cat_feat = self.cat_proj(cat_embs)  # [B, N, D]
        cat_feat = self.cat_transformer(cat_feat, src_key_padding_mask=attn_mask)

        # Numeric branch
        num_feat = self.num_proj_in(x_num)
        num_feat = self.num_transformer(num_feat, src_key_padding_mask=attn_mask)
        num_feat = self.num_se(num_feat, mask=attn_mask)
        num_feat = self.num_proj_out(num_feat)

        # FiLM: condition the categorical branch on the numeric branch
        cat_feat = self.film(cat_feat, num_feat)

        # Cross-attention: categorical features attend over numeric ones
        cat_feat_attn, _ = self.cross_attn_cat2num(
            query=cat_feat, key=num_feat, value=num_feat, key_padding_mask=attn_mask
        )
        cat_feat = cat_feat + cat_feat_attn  # residual connection

        # Fuse both branches
        x = torch.cat([cat_feat, num_feat], dim=-1)

        for layer in self.transformer_layers:
            x = layer(x, src_key_padding_mask=attn_mask)

        scores = self.mlp(x).squeeze(-1)  # [B, N]
        return scores


"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Share of evaluated groups with a positive label among their top-``k`` scores.

    Only the first ``lengths[i]`` entries of row ``i`` are considered, and
    groups with fewer than ``min_group_size`` items are left out. Returns a
    0-dim tensor; it is 0.0 when no group qualifies.
    """
    hit_flags = []
    for row_scores, row_labels, raw_len in zip(scores, labels, lengths):
        n_items = int(raw_len)
        if n_items < min_group_size:
            continue
        top_idx = torch.topk(row_scores[:n_items], min(n_items, k)).indices
        hit_flags.append((row_labels[:n_items][top_idx] > 0).any().float())

    if not hit_flags:
        return torch.tensor(0.0, device=scores.device)
    return torch.stack(hit_flags).sum() / len(hit_flags)


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@``k`` over groups with at least ``min_group_size`` items.

    Only the first ``lengths[i]`` entries of row ``i`` are used. The label
    value itself is the gain and the discount is ``log2(rank + 1)``. A group
    whose ideal DCG is 0 is counted with NDCG 0.

    Returns:
        A 0-dim tensor on ``scores.device`` in every case, so callers can rely
        on tensor methods such as ``.item()``.
    """
    total_ndcg = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        n_items = int(lengths[i])
        if n_items < min_group_size:
            continue

        s = scores[i][:n_items]
        y = labels[i][:n_items]

        depth = min(k, n_items)
        discounts = torch.log2(torch.arange(2, 2 + depth, device=y.device).float())

        dcg = (y[torch.topk(s, depth).indices] / discounts).sum()
        idcg = (torch.topk(y, depth).values / discounts).sum()

        if idcg > 0:
            total_ndcg = total_ndcg + dcg / idcg
        valid_count += 1

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ndcg / valid_count


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision at ``k`` over groups with at least ``min_group_size`` items.

    For every evaluated group the top-``k`` items by score are taken; the
    average precision is the mean of precision@rank over the ranks that hold a
    positive label. A group with no positive label in its top-``k`` counts as
    AP = 0 rather than being dropped, so ranking misses lower the mean.

    Returns:
        A 0-dim tensor on ``scores.device``; 0.0 when no group qualifies.
    """
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        n_items = int(lengths[i])
        if n_items < min_group_size:
            continue

        s = scores[i][:n_items]
        y = labels[i][:n_items]

        top_idx = torch.topk(s, min(k, n_items)).indices
        hits = (y[top_idx] > 0).float()
        valid_count += 1

        n_hits = hits.sum()
        if n_hits == 0:
            continue  # contributes AP = 0

        ranks = torch.arange(1, hits.numel() + 1, device=hits.device, dtype=hits.dtype)
        precision_at_rank = hits.cumsum(dim=0) / ranks
        total_ap = total_ap + (precision_at_rank * hits).sum() / n_hits

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ap / valid_count



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise RankNet loss, summed over ordered pairs and averaged over groups.

    Within each group (first ``lengths[i]`` entries of row ``i``) every pair
    where the first item carries a strictly larger label than the second
    contributes ``-logsigmoid(score_first - score_second)``. The per-group
    sums are added up and divided by the number of rows in ``scores``.
    """
    n_groups = scores.size(0)
    loss_sum = 0.0

    for row in range(n_groups):
        n_items = lengths[row]
        group_scores = scores[row][:n_items]
        group_labels = labels[row][:n_items]

        # prefer[a, b] is 1.0 where item a is labelled above item b
        prefer = (group_labels.unsqueeze(1) > group_labels.unsqueeze(0)).float()
        margins = group_scores.unsqueeze(1) - group_scores.unsqueeze(0)

        loss_sum += (-prefer * torch.nn.functional.logsigmoid(margins)).sum()

    return loss_sum / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """
    Margin/temperature pairwise loss restricted to pairs touching the label top-K.

    A pair (a, b) counts when ``label[a] > label[b]`` and at least one of the
    two items is among the ``focus_topk`` highest labels of its group. Each
    counted pair is weighted by its absolute label gap; the result is the
    weighted loss sum divided by the weight sum. Groups with fewer than two
    items are ignored. Returns a zero tensor with ``requires_grad=True`` when
    no pair qualifies.
    """
    loss_sum = 0.0
    weight_sum = 0

    for row in range(scores.size(0)):
        n_items = lengths[row]
        if n_items < 2:
            continue

        group_scores = scores[row][:n_items]
        group_labels = labels[row][:n_items]

        # Items holding the largest ground-truth labels of this group
        n_focus = min(focus_topk, n_items)
        focus_idx = torch.topk(group_labels, n_focus).indices
        in_focus = torch.zeros_like(group_labels, dtype=torch.bool)
        in_focus[focus_idx] = True

        label_gap = group_labels.unsqueeze(1) - group_labels.unsqueeze(0)
        touches_focus = in_focus.unsqueeze(1) | in_focus.unsqueeze(0)
        counted = (label_gap > 0) & touches_focus
        if counted.sum() == 0:
            continue

        pair_weight = (label_gap.abs() * counted).float()

        logits = group_scores.unsqueeze(1) - group_scores.unsqueeze(0)
        if margin > 0.0:
            logits = logits - margin
        logits = logits / temperature

        loss_sum += (-F.logsigmoid(logits) * pair_weight).sum()
        weight_sum += pair_weight.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum

def multi_focus_xranknet_loss(scores, labels, lengths, focus_tiers=(3, 5, 10), weights=(1.0, 0.5, 0.2), margin=0.2, temperature=1.0):
    """
    Multi-tier xRankNet loss.

    For every tier ``(focus_k, w)`` a pair (a, b) counts when
    ``label[a] > label[b]`` and at least one of the two items is among the
    ``focus_k`` highest labels of its group. Counted pairs are weighted by
    their absolute label gap times ``w`` and scored with
    ``-logsigmoid((score[a] - score[b] - margin) / temperature)``. The result
    is the weighted loss sum divided by the weight sum over the whole batch.

    Args:
        scores, labels: ``[B, N]`` tensors; only the first ``lengths[i]``
            entries of row ``i`` are used.
        lengths: number of real items per row.
        focus_tiers: top-K sizes, one per tier (any sequence).
        weights: tier weights, same length as ``focus_tiers``.
        margin: value subtracted from every score difference.
        temperature: divisor applied after the margin.

    Returns:
        Scalar tensor; a zero tensor with ``requires_grad=True`` when no pair
        qualifies in the whole batch.

    Raises:
        ValueError: if ``focus_tiers`` and ``weights`` differ in length.
    """
    if len(focus_tiers) != len(weights):
        raise ValueError(
            f"focus_tiers and weights must have the same length, "
            f"got {len(focus_tiers)} and {len(weights)}"
        )

    total_loss = 0.0
    total_pairs = 0.0

    for i in range(scores.size(0)):
        n_items = int(lengths[i])
        if n_items < 2:
            continue
        s = scores[i][:n_items]
        y = labels[i][:n_items]

        # Tier-independent quantities are computed once per group.
        label_diff = y.unsqueeze(1) - y.unsqueeze(0)
        ordered = label_diff > 0
        if not ordered.any():
            continue  # no tier can contribute without an ordered pair
        label_gap = label_diff.abs()
        score_diff = (s.unsqueeze(1) - s.unsqueeze(0) - margin) / temperature
        pairwise_loss = -F.logsigmoid(score_diff)

        loss_i = 0.0
        weight_i = 0.0

        for focus_k, w in zip(focus_tiers, weights):
            topk_idx = torch.topk(y, min(focus_k, n_items)).indices
            topk_mask = torch.zeros_like(y, dtype=torch.bool)
            topk_mask[topk_idx] = True

            valid_pairs = ordered & (topk_mask.unsqueeze(1) | topk_mask.unsqueeze(0))
            importance = label_gap * valid_pairs.float()

            tier_weight = importance.sum()
            if tier_weight == 0:
                continue

            loss_i += w * (pairwise_loss * importance).sum()
            weight_i += w * tier_weight

        if weight_i > 0:
            total_loss += loss_i
            total_pairs += weight_i

    if total_pairs > 0:
        return total_loss / total_pairs
    return torch.tensor(0.0, device=scores.device, requires_grad=True)


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    The scheduler is stepped after every optimizer step, i.e. once per batch.
    """
    model.train()
    batch_losses = []

    progress = tqdm(loader, desc="Training", leave=False)
    for x_cat, x_num, y, lengths in progress:
        # Move the whole batch to the target device.
        x_cat = {name: codes.to(device) for name, codes in x_cat.items()}
        x_num = x_num.to(device)
        y = y.to(device)
        lengths = torch.as_tensor(lengths).detach().clone().to(device)

        optimizer.zero_grad()
        # NOTE(review): no padding mask is passed to the model here, so
        # zero-padded rows take part in the forward pass.
        scores = model(x_cat, x_num)
        loss = multi_focus_xranknet_loss(
            scores,
            y,
            lengths,
            focus_tiers=[3, 5, 10],
            weights=[1.0, 0.5, 0.2],
            margin=0.2,
            temperature=1.0,
        )
        loss.backward()
        optimizer.step()
        scheduler.step()

        batch_losses.append(loss.item())

    return sum(batch_losses, 0.0) / len(loader)


def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Each ranking metric is reported per batch as a mean over the groups that
    have at least ``min_group_size`` items, so the per-batch values are
    combined weighted by that number of groups. Batches without any such
    group therefore no longer drag the averages towards zero. The loss is
    averaged weighted by batch size.

    Returns:
        tuple: ``(loss, hitrate@3, ndcg@3, map@3)``; the loss is a Python
        float, the three metrics are 0-dim tensors.
    """
    model.eval()
    total_hitrate = torch.zeros((), device=device)
    total_ndcg = torch.zeros((), device=device)
    total_map = torch.zeros((), device=device)
    total_loss = 0
    count = 0
    evaluated_groups = 0

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):
            x_num, y = x_num.to(device), y.to(device)
            # as_tensor accepts both lists and tensors without the
            # copy-construct warning that torch.tensor(tensor) emits
            lengths = torch.as_tensor(lengths).to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            ndcg = ndcg_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            map3 = map_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            loss = multi_focus_xranknet_loss(scores, y, lengths, focus_tiers=[3, 5, 10], weights=[1.0, 0.5, 0.2], margin=0.2, temperature=1.0)

            n_eval = int((lengths >= min_group_size).sum())
            total_hitrate = total_hitrate + hitrate * n_eval
            total_ndcg = total_ndcg + ndcg * n_eval
            total_map = total_map + map3 * n_eval
            evaluated_groups += n_eval

            batch_size = scores.size(0)
            total_loss += loss.item() * batch_size
            count += batch_size

    # Avoid 0/0 when no group reaches min_group_size; the metrics are then 0.
    denom = max(evaluated_groups, 1)
    return total_loss / count, total_hitrate / denom, total_ndcg / denom, total_map / denom


def build_cat_dims(X, cat_features):
    """Return the embedding table size needed for every categorical feature.

    The size is ``max(code, 0) + 1`` where ``code`` is the largest value in
    the column. A column whose ``max()`` is ``None`` (nothing to aggregate) is
    treated as holding only code 0. Sizes are returned as plain ``int``.
    """
    cat_dims = {}
    for c in cat_features:
        max_val = X[c].max()
        largest_code = 0 if max_val is None else max(int(max_val), 0)
        cat_dims[c] = largest_code + 1
    return cat_dims


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardise numeric columns with statistics taken from the training rows.

    Mean and standard deviation are computed over ``values[train_mask]`` while
    ignoring NaN, so a single NaN in the training rows no longer turns the
    whole column into NaN. NaN entries themselves stay NaN. A non-positive or
    undefined standard deviation falls back to 1.0 and an undefined mean to 0.0.

    Returns:
        ``X`` with every column in ``numeric_features`` replaced by its
        standardised version.
    """
    normalized = []
    for col in numeric_features:
        values = X[col].to_numpy()
        train_vals = values[train_mask]

        mean = np.nanmean(train_vals)
        if not np.isfinite(mean):
            mean = 0.0
        std = np.nanstd(train_vals)
        if not std > 0:  # also catches NaN
            std = 1.0

        normalized.append(pl.Series(col, (values - mean) / std).alias(col))

    # One with_columns call instead of rebuilding the frame per column.
    return X.with_columns(normalized) if normalized else X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Load the feature-engineered frame, building and caching it when absent.

    When ``cache_file`` exists in ``data_dir`` it is read as is. Otherwise the
    raw train and test files are concatenated (test rows get ``selected = 0``),
    run through ``feature_engineering``, have nulls filled (0 for numeric,
    ``"missing"`` for string columns) and are written to the cache.

    Returns:
        tuple: ``(df, test_ids, test_rankers)`` where the last two are the
        ``Id`` and ``ranker_id`` columns of the raw test file.
    """

    def read_raw(file_name):
        # Raw parquet files carry a leftover index column that is dropped.
        return pl.read_parquet(os.path.join(data_dir, file_name)).drop("__index_level_0__")

    cache_path = os.path.join(data_dir, cache_file)

    if os.path.exists(cache_path):
        print(f"Loading cached feature-engineered data from {cache_path} ...")
        df = pl.read_parquet(cache_path)
        test = read_raw(test_file)
        return df, test["Id"], test["ranker_id"]

    print("Cached file not found, processing raw data ...")

    train = read_raw(train_file)
    test = read_raw(test_file).with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))
    test_ids, test_rankers = test["Id"], test["ranker_id"]

    df = feature_engineering(pl.concat((train, test)), full=True)

    numeric_cols = df.select(pl.selectors.numeric()).columns
    string_cols = df.select(pl.selectors.string()).columns
    fills = [pl.col(c).fill_null(0) for c in numeric_cols]
    fills.extend(pl.col(c).fill_null("missing") for c in string_cols)
    df = df.with_columns(fills)

    os.makedirs(data_dir, exist_ok=True)
    df.write_parquet(cache_path)
    print(f"Saved feature-engineered data to {cache_path}")

    return df, test_ids, test_rankers


def prepare_data(data_dir, train_size=18145372):
    """Load the data and turn it into model-ready features.

    Categorical columns are replaced by dense integer codes starting at 0
    (nulls become -1) and numeric columns are standardised with statistics
    from the first ``train_size`` rows.

    Args:
        data_dir: directory holding the raw / cached parquet files.
        train_size: number of leading rows that form the training part. The
            default is the row count this script was written for.

    Returns:
        tuple: ``(X, y, groups, cat_features, num_features, train_size,
        test_ids, test_rankers)``.

    Raises:
        ValueError: if ``train_size`` is negative or exceeds the number of rows.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    if not 0 <= train_size <= X.height:
        raise ValueError(
            f"train_size={train_size} is outside the valid range 0..{X.height}"
        )

    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    # NOTE(review): the smallest category gets code 0, which the embedding
    # layers in this file also use as padding index - confirm this is intended.
    X = X.with_columns(
        [
            (pl.col(c).rank("dense") - 1).fill_null(-1).cast(pl.Int32)
            for c in cat_features
        ]
    )

    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Patience counter for a metric where larger is better.

    ``step`` reports ``True`` once the metric has failed to beat the best
    value by more than ``min_delta`` for ``patience`` consecutive calls.
    """

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience, self.min_delta = patience, min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric`` and return whether training should stop."""
        threshold = self.best + self.min_delta
        if not metric > threshold:
            # No sufficient improvement: one more strike.
            self.counter += 1
            return self.counter >= self.patience

        self.best, self.counter = metric, 0
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scale histogram of how many rows each group of ``dataset`` has.

    The figure is written to ``save_path`` when one is given and shown
    interactively otherwise; it is closed afterwards in both cases.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Distribution of Group Sizes")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def _save_curve(path, epochs, curves, ylabel, title):
    """Plot one or more per-epoch series on a fresh figure and save it to ``path``.

    ``curves`` is an iterable of ``(values, label, color)``; ``color=None``
    leaves the colour choice to matplotlib.
    """
    plt.figure()
    for values, label, color in curves:
        plt.plot(epochs, values, label=label, color=color)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.savefig(path)
    plt.close()


def _predict_scores(model, loader, dataset, device, desc):
    """Score every row of ``dataset`` and return the scores in row order.

    ``loader`` must iterate ``dataset`` without shuffling. Every group's
    scores are written back to the row positions recorded in
    ``dataset.group_to_indices``, so the result lines up with the frame the
    dataset was built from even when a group's rows are not contiguous.
    """
    row_scores = np.empty(len(dataset.y), dtype=np.float32)
    group_pos = 0

    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(loader, desc=desc):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num).cpu().numpy()

            for i, n_items in enumerate(torch.as_tensor(lengths).tolist()):
                rows = dataset.group_to_indices[dataset.group_keys[group_pos]]
                row_scores[rows] = scores[i, :n_items]
                group_pos += 1

    if group_pos != len(dataset.group_keys):
        raise RuntimeError(
            f"Mismatch: scored {group_pos} groups, dataset has {len(dataset.group_keys)}"
        )
    return row_scores


def _rank_by_score():
    """Polars expression: 1-based rank of ``score`` within each ``ranker_id``, best first."""
    return (
        pl.col("score")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )


def main():
    """Train the ranker, plot the training curves and write predictions.

    Steps: prepare the data, split it by row position into train / validation
    / test, train with early stopping on the validation loss while
    checkpointing the best model, save metric plots, then write validation
    predictions and the test submission.
    """
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    best_val_loss = float("inf")
    best_hitrate = 0.0
    checkpoint_saved = False
    model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_losses = []
    val_losses = []
    val_hitrates = []
    val_ndcgs = []
    val_maps = []
    learning_rates = []

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    early_stopper = EarlyStopping(patience=3, min_delta=0.001)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Replace NaN in Float64 columns with 0
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    if float_cols:
        X = X.with_columns([
            pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
            for col in float_cols
        ])

    # Row-position split: [0, n1) train, [n1, n2) validation, [n2, end) test.
    # An alternative split point that was tried before: 17487300.
    n1 = 16487352
    n2 = train_size

    X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
    y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
    groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

    # Inspect group sizes in validation set
    group_to_rows = defaultdict(list)
    for i, g in enumerate(groups_va["ranker_id"].to_numpy()):
        group_to_rows[g].append(i)

    print("\n📋 Validation group statistics:")
    # Save detailed group info to file
    group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
    with open(group_info_path, "w") as f:
        f.write(f"Total groups: {len(group_to_rows)}\n\n")
        f.write("GroupID\tSize\tStartIdx\tEndIdx\n")
        for group_id, indices in sorted(group_to_rows.items()):
            f.write(f"{group_id}\t{len(indices)}\t{indices[0]}\t{indices[-1]}\n")

    print(f"📁 Saved validation group info to {group_info_path}")

    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    num_epochs = 10
    learning_rate = 3e-4

    model = DLRankerAttention(
        cat_dims=cat_dims,
        num_numeric_feats=num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    scheduler = OneCycleLR(
        optimizer,
        max_lr=learning_rate,
        steps_per_epoch=len(train_loader),
        epochs=num_epochs,
        pct_start=0.3,
    )

    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
        val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_hitrates.append(val_hitrate.item())
        val_ndcgs.append(val_ndcg.item())
        val_maps.append(val_map.item())

        current_lr = scheduler.get_last_lr()
        learning_rates.append(current_lr[0])

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"Val Loss={val_loss:.4f}, "
            f"HitRate@3={val_hitrate:.4f}, "
            f"NDCG@3={val_ndcg:.4f}, "
            f"MAP@3={val_map:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )

        # A checkpoint is written only when loss and hit rate both improve.
        if val_loss < best_val_loss and val_hitrate > best_hitrate:
            best_val_loss = val_loss
            best_hitrate = val_hitrate
            torch.save(model.state_dict(), model_path)
            checkpoint_saved = True
            print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

        if early_stopper.step(-val_loss):
            print("⛔ Early stopping triggered.")
            break

    # Plot metrics
    epochs = list(range(1, len(train_losses) + 1))

    _save_curve(
        os.path.join(MODEL_DIR, "loss_curve.png"),
        epochs,
        [(train_losses, "Train Loss", None), (val_losses, "Val Loss", None)],
        "Loss",
        "Training & Validation Loss",
    )
    _save_curve(
        os.path.join(MODEL_DIR, "hitrate_curve.png"),
        epochs,
        [(val_hitrates, "Val HitRate@3", "green")],
        "HitRate@3",
        "Validation HitRate@3",
    )
    _save_curve(
        os.path.join(MODEL_DIR, "lr_curve.png"),
        epochs,
        [(learning_rates, "Learning Rate", "orange")],
        "LR",
        "Learning Rate Schedule",
    )
    _save_curve(
        os.path.join(MODEL_DIR, "ndcg_curve.png"),
        epochs,
        [(val_ndcgs, "Val NDCG@3", "purple")],
        "NDCG@3",
        "Validation NDCG@3",
    )
    _save_curve(
        os.path.join(MODEL_DIR, "map_curve.png"),
        epochs,
        [(val_maps, "Val MAP@3", "red")],
        "MAP@3",
        "Validation MAP@3",
    )

    print("📊 Saved metric plots to model/")

    # Restore the best checkpoint of this run. A file left over from an
    # earlier run is never loaded, it may belong to a different model.
    if checkpoint_saved:
        model.load_state_dict(torch.load(model_path, map_location=device))
    else:
        print("⚠️ No checkpoint was saved in this run; predicting with the final weights.")
    model.eval()

    # Predict validation set
    all_val_scores = _predict_scores(model, val_loader, val_dataset, device, "Predicting validation set")

    val_df = X_va.with_columns([
        pl.Series("Id", X_va["Id"]),
        pl.Series("ranker_id", groups_va["ranker_id"]),
        pl.Series("score", all_val_scores),
        pl.Series("label", y_va)
    ])
    val_df = val_df.with_columns(_rank_by_score())

    val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
    val_df.write_csv(val_save_path)
    print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

    # Predict test set
    all_scores = _predict_scores(model, test_loader, test_dataset, device, "Predicting test set")

    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": all_scores
    })

    submission = (
        submission_df
        .with_columns(_rank_by_score())
        .select(["Id", "ranker_id", "score", "selected"])
    )

    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
    submission.write_csv(submission_path)
    print(f"✅ Submission saved to {submission_path}")


# Run the training / prediction pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()

Overwriting deeprec_validate.py


In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter
from torch.optim.lr_scheduler import OneCycleLR, CosineAnnealingWarmRestarts

class RankDataset(Dataset):
    """Dataset in which one item is a whole group of rows sharing a ``ranker_id``.

    Args:
        X: feature frame; must contain every column named in
            ``cat_features`` and ``num_features``.
        y: one label per row of ``X``.
        groups: frame with a ``ranker_id`` column, one entry per row of ``X``.
        cat_features: names of the integer-coded categorical columns.
        num_features: names of the numeric columns.
    """

    def __init__(
        self,
        X: "pl.DataFrame",
        y: "pl.Series",
        groups: "pl.DataFrame",
        cat_features: list[str],
        num_features: list[str],
    ):
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # Row positions per group, in order of first appearance of the group.
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of groups."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return every row of the ``idx``-th group as tensors.

        Returns:
            tuple: ``(x_cat, x_num, y, length)`` where ``x_cat`` maps each
            categorical feature name to a 1-D int64 tensor (negative codes are
            clamped to 0), ``x_num`` is a float32 tensor holding the group's
            numeric rows, ``y`` is a flat float32 label tensor and ``length``
            is the number of rows in the group.
        """
        rows = self.group_to_indices[self.group_keys[idx]]
        length = len(rows)

        # One vectorised gather + clamp per feature instead of a Python-level
        # loop over every element; negative codes still end up as 0.
        x_cat = {
            c: torch.as_tensor(np.maximum(self.X_cat[c][rows], 0), dtype=torch.long)
            for c in self.cat_features
        }

        x_num = torch.as_tensor(self.X_num[rows], dtype=torch.float32)
        y = torch.as_tensor(self.y[rows], dtype=torch.float32).reshape(-1)

        return x_cat, x_num, y, length

def collate_fn(batch):
    """Zero-pad a list of variable-length groups into dense batch tensors.

    Each element of ``batch`` is an ``(x_cat, x_num, y, length)`` tuple. Groups
    shorter than the longest one in the batch are right-padded with zeros.

    Returns:
        tuple: ``(padded_x_cat, padded_x_num, padded_y, lengths)`` with
        ``lengths`` an int64 tensor holding the unpadded size of every group.
    """
    lengths = [sample[3] for sample in batch]
    longest = max(lengths)
    n_groups = len(batch)

    first_cat, first_num, first_y, _ = batch[0]
    feat_dim = first_num.shape[1]

    # Allocate the zero-filled outputs once, then copy each group into its slot.
    cat_out = {
        name: ref.new_zeros(n_groups, longest) for name, ref in first_cat.items()
    }
    num_out = first_num.new_zeros(n_groups, longest, feat_dim)
    y_out = first_y.new_zeros(n_groups, longest)

    for row, (x_cat, x_num, y, length) in enumerate(batch):
        for name, dest in cat_out.items():
            dest[row, :length] = x_cat[name]
        num_out[row, :length] = x_num
        y_out[row, :length] = y

    return cat_out, num_out, y_out, torch.tensor(lengths, dtype=torch.long)

class SEModule(nn.Module):
    """Squeeze-and-excitation gate over the channel axis of a ``[B, N, C]`` tensor.

    The input is averaged over the ``N`` axis, passed through a two-layer
    bottleneck ending in a sigmoid, and the resulting per-channel weights
    rescale every position of the input.
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel),
            nn.Sigmoid(),
        )

    def forward(self, x, mask=None):
        """Gate ``x`` channel-wise.

        Args:
            x: tensor of shape ``[B, N, C]``.
            mask: optional bool tensor ``[B, N]``, ``True`` at padded
                positions. When given, padded positions are excluded from the
                pooled mean; when omitted the plain mean over ``N`` is used.
        """
        if mask is None:
            pooled = x.mean(dim=1)
        else:
            pad = mask.unsqueeze(-1)
            # clamp guards against a row that is entirely padding
            n_valid = (~pad).sum(dim=1).clamp(min=1).to(x.dtype)
            pooled = x.masked_fill(pad, 0.0).sum(dim=1) / n_valid
        w = self.fc(pooled).unsqueeze(1)  # [B, 1, C]
        return x * w


class DLRankerAttention(nn.Module):
    """Listwise ranker that scores all items of a group jointly.

    Categorical features are embedded, concatenated and projected to
    ``proj_dim``; numeric features are projected to ``proj_dim`` as well. Each
    branch runs through its own single-layer transformer encoder over the items
    of the group, the numeric branch is additionally gated by an ``SEModule``,
    and the categorical branch attends to the numeric one. Both branches are
    concatenated, refined by ``num_layers`` encoder layers, joined with the
    group-mean context and mapped to one score per item by an MLP.

    Args:
        cat_dims: mapping ``feature name -> number of embedding rows``.
        num_numeric_feats: width of the numeric feature vector.
        emb_dropout: dropout applied to the concatenated embeddings.
        dropout: dropout used in the encoders, cross-attention and MLP.
        num_heads: attention heads of the fused ``transformer_layers`` only;
            the branch encoders and the cross-attention are fixed at 4 heads.
        num_layers: number of fused encoder layers.
        dim_feedforward: feed-forward width of every encoder layer.
        proj_dim: model width of each branch (the fused width is twice that).
    """

    def __init__(
        self,
        cat_dims,
        num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=4,
        dim_feedforward=128,
        proj_dim=64,
    ):
        super().__init__()

        # Embedding width grows logarithmically with cardinality, clipped to [4, 50].
        self.emb_dims = {
            f: min(50, max(4, round(6 * math.log2(cat_dims[f] + 1)))) for f in cat_dims
        }
        # NOTE(review): padding_idx=0 pins row 0 to zeros, yet the dataset also
        # emits 0 for a real category code -- confirm this is intended.
        self.embeddings = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], self.emb_dims[f], padding_idx=0)
            for f in cat_dims
        })
        self.emb_dropout = nn.Dropout(emb_dropout)

        total_emb_dim = sum(self.emb_dims.values())
        print(f"Categorial emb dim sum: {total_emb_dim}, Numerical dim: {num_numeric_feats}")

        self.cat_proj = nn.Linear(total_emb_dim, proj_dim)
        self.cat_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=4,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
                norm_first=True,
            ),
            num_layers=1,
        )

        self.num_proj_in = nn.Linear(num_numeric_feats, proj_dim)
        self.num_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=4,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
                norm_first=True,
            ),
            num_layers=1,
        )
        self.num_se = SEModule(proj_dim)
        self.num_proj_out = nn.Sequential(
            nn.Linear(proj_dim, proj_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
        )

        self.cross_attn_cat2num = nn.MultiheadAttention(
            embed_dim=proj_dim,
            num_heads=4,
            dropout=dropout,
            batch_first=True,
        )

        self.transformer_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=2 * proj_dim,
                nhead=num_heads,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
                norm_first=True,
            )
            for _ in range(num_layers)
        ])

        # Contextual pooling → concat global context to each item
        self.mlp = nn.Sequential(
            nn.Linear(2 * proj_dim + 2 * proj_dim, 128),  # context + item
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(64, 1),
        )

    @staticmethod
    def _masked_mean(x, keep):
        """Mean of ``x`` [B, N, C] over dim 1 using only rows where ``keep`` [B, N, 1] is True."""
        total = x.masked_fill(~keep, 0.0).sum(dim=1)
        # clamp avoids a division by zero for a fully padded sequence
        count = keep.sum(dim=1).clamp(min=1).to(x.dtype)
        return total / count

    def forward(self, x_cat: dict, x_num: torch.Tensor, attn_mask=None):
        """Score every item of every group in the batch.

        Args:
            x_cat: dict ``feature name -> [B, N]`` long tensor of category codes.
            x_num: ``[B, N, num_numeric_feats]`` float tensor.
            attn_mask: optional ``[B, N]`` boolean padding mask, ``True`` marking
                padded positions. When given it is applied to every attention
                block and to both mean-pooling steps, so padded rows cannot
                influence real items. When omitted, all ``N`` positions
                (padding included) take part, exactly as before.

        Returns:
            ``[B, N]`` tensor of scores (values at padded positions are
            meaningless and must be ignored by the caller).
        """
        N = x_num.shape[1]

        keep = None
        if attn_mask is not None:
            keep = ~attn_mask.to(torch.bool).unsqueeze(-1)  # [B, N, 1], True = real item

        cat_embs = torch.cat([self.embeddings[f](x_cat[f]) for f in x_cat], dim=-1)
        cat_embs = self.emb_dropout(cat_embs)
        cat_feat = self.cat_proj(cat_embs)
        cat_feat = self.cat_transformer(cat_feat, src_key_padding_mask=attn_mask)

        num_feat = self.num_proj_in(x_num)
        num_feat = self.num_transformer(num_feat, src_key_padding_mask=attn_mask)
        if keep is None:
            num_feat = self.num_se(num_feat)
        else:
            # Same gate as SEModule.forward, but squeezed over real items only.
            gate = self.num_se.fc(self._masked_mean(num_feat, keep))
            num_feat = num_feat * gate.unsqueeze(1)
        num_feat = self.num_proj_out(num_feat)

        cat_feat_attn, _ = self.cross_attn_cat2num(
            query=cat_feat,
            key=num_feat,
            value=num_feat,
            key_padding_mask=attn_mask,
        )
        cat_feat = cat_feat + cat_feat_attn

        x = torch.cat([cat_feat, num_feat], dim=-1)

        for layer in self.transformer_layers:
            x = layer(x, src_key_padding_mask=attn_mask)

        # Contextual pooling: append the group-level mean to every item.
        if keep is None:
            pooled = x.mean(dim=1, keepdim=True)
        else:
            pooled = self._masked_mean(x, keep).unsqueeze(1)
        global_context = pooled.expand(-1, N, -1)  # [B, N, 2*D]
        x = torch.cat([x, global_context], dim=-1)  # [B, N, 4*D]

        scores = self.mlp(x).squeeze(-1)
        return scores



"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Share of groups whose ``k`` best-scored items contain a positive label.

    Only the first ``lengths[i]`` entries of row ``i`` are considered, and
    groups shorter than ``min_group_size`` are ignored. Returns a tensor; it is
    ``0.0`` when no group qualifies.
    """
    hit_total = 0
    n_eligible = 0
    for i in range(scores.size(0)):
        size = lengths[i]
        if size < min_group_size:
            continue
        top_n = min(size, k)
        candidates = scores[i][:size]
        truth = labels[i][:size]
        best = torch.topk(candidates, top_n).indices
        hit_total += (truth[best] > 0).any().float()
        n_eligible += 1

    if n_eligible > 0:
        return hit_total / n_eligible
    return torch.tensor(0.0, device=scores.device)


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@k over groups with at least ``min_group_size`` items.

    Args:
        scores: ``[B, N]`` predicted scores (padded).
        labels: ``[B, N]`` relevance labels (padded); used directly as gains.
        lengths: per-group number of valid items.
        k: cut-off rank.
        min_group_size: groups shorter than this are ignored.

    Returns:
        A 0-dim tensor. Groups without any relevant item contribute 0. The
        result is always a tensor, so callers can rely on ``.item()`` even
        when no eligible group has a positive label.
    """
    total_ndcg = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        l = int(lengths[i])
        if l < min_group_size:
            continue

        s = scores[i][:l]
        y = labels[i][:l]

        top = min(k, l)
        # log2(rank + 1) for ranks 1..top, shared by DCG and IDCG
        discounts = torch.log2(torch.arange(2, 2 + top, device=y.device).float())

        dcg = (y[torch.topk(s, top).indices] / discounts).sum()
        idcg = (torch.topk(y, top).values / discounts).sum()

        if idcg > 0:
            total_ndcg = total_ndcg + dcg / idcg
        valid_count += 1

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ndcg / valid_count


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision at ``k`` over groups with at least ``min_group_size`` items.

    For each eligible group the average precision is the mean of precision@j
    over the ranks ``j <= k`` that hold a positive label. A group with no
    positive among its top-k contributes an average precision of 0; it is no
    longer dropped from the average, which previously inflated the metric to
    "MAP among groups that were hit".

    Returns:
        A 0-dim tensor (``0.0`` when no group is eligible).
    """
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        l = int(lengths[i])
        if l < min_group_size:
            continue

        s = scores[i][:l]
        y = labels[i][:l]

        top = min(k, l)
        hits = (y[torch.topk(s, top).indices] > 0).float()
        valid_count += 1

        n_hits = hits.sum()
        if n_hits > 0:
            ranks = torch.arange(1, top + 1, device=hits.device, dtype=hits.dtype)
            precision_at_rank = hits.cumsum(dim=0) / ranks
            # average precision@j over the ranks that are actual hits
            total_ap = total_ap + (precision_at_rank * hits).sum() / n_hits

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ap / valid_count



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise RankNet loss, summed over ordered pairs and averaged over groups.

    For every pair ``(a, b)`` inside a group with ``label[a] > label[b]`` the
    term ``-logsigmoid(score[a] - score[b])`` is added; only the first
    ``lengths[i]`` items of group ``i`` are used.
    """
    n_groups = scores.size(0)
    accumulated = 0.0

    for i in range(n_groups):
        size = lengths[i]
        s = scores[i][:size]
        y = labels[i][:size]

        prefers = (y.unsqueeze(1) > y.unsqueeze(0)).float()
        margins = s.unsqueeze(1) - s.unsqueeze(0)
        accumulated += (-prefers * torch.nn.functional.logsigmoid(margins)).sum()

    return accumulated / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """
    Weighted pairwise logistic loss that concentrates on the top-K labelled items.

    A pair ``(a, b)`` contributes when ``label[a] > label[b]`` and at least one
    of the two items is among the ``focus_topk`` highest labels of its group.
    Its weight is ``|label[a] - label[b]|``; the score difference is shifted by
    ``margin`` (when positive) and divided by ``temperature`` before the
    log-sigmoid. The result is the weighted sum divided by the total weight, or
    a zero tensor that requires grad when no pair qualifies.
    """
    loss_sum = 0.0
    weight_sum = 0

    for i in range(scores.size(0)):
        size = lengths[i]
        if size < 2:
            continue

        s = scores[i][:size]
        y = labels[i][:size]

        # Flag the items holding the highest ground-truth labels.
        in_top = torch.zeros_like(y, dtype=torch.bool)
        in_top[torch.topk(y, min(focus_topk, size)).indices] = True

        gaps = y.unsqueeze(1) - y.unsqueeze(0)
        eligible = (gaps > 0) & (in_top.unsqueeze(1) | in_top.unsqueeze(0))
        if eligible.sum() == 0:
            continue

        weights = (gaps.abs() * eligible).float()

        logits = s.unsqueeze(1) - s.unsqueeze(0)
        if margin > 0.0:
            logits = logits - margin
        logits = logits / temperature

        loss_sum += (-F.logsigmoid(logits) * weights).sum()
        weight_sum += weights.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    The learning-rate scheduler is stepped after every optimiser step, i.e.
    once per batch.
    """
    model.train()
    running_loss = 0.0

    for x_cat, x_num, y, lengths in tqdm(loader, desc="Training", leave=False):
        # Move the whole batch to the target device.
        x_cat = {name: codes.to(device) for name, codes in x_cat.items()}
        x_num = x_num.to(device)
        y = y.to(device)
        if not isinstance(lengths, torch.Tensor):
            lengths = torch.tensor(lengths)
        lengths = lengths.detach().clone().to(device)

        optimizer.zero_grad()
        predictions = model(x_cat, x_num)
        loss = xranknet_loss(predictions, y, lengths, temperature=1.0, margin=0.0, focus_topk=3)
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()

    return running_loss / len(loader)


def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    Args:
        model: ranker called as ``model(x_cat, x_num)``.
        loader: iterable yielding ``(x_cat, x_num, y, lengths)`` batches.
        device: device the batch tensors are moved to.
        min_group_size: groups with fewer items are excluded from the ranking
            metrics (they still count towards the loss).

    Returns:
        ``(loss, hitrate, ndcg, map)``. ``loss`` is a float averaged over all
        groups. The three metrics are 0-dim tensors: each batch value is
        weighted by the number of groups in that batch that reach
        ``min_group_size`` rather than by the batch size, so batches with few
        or no eligible groups no longer distort the result. All values are 0
        when nothing could be evaluated.
    """
    model.eval()
    total_hitrate = torch.zeros((), device=device)
    total_ndcg = torch.zeros((), device=device)
    total_map = torch.zeros((), device=device)
    total_loss = 0.0
    n_groups = 0
    n_eligible = 0

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):
            x_num, y = x_num.to(device), y.to(device)
            # collate_fn already returns a tensor; wrapping a tensor in
            # torch.tensor() copies it and emits a UserWarning.
            if not isinstance(lengths, torch.Tensor):
                lengths = torch.tensor(lengths)
            lengths = lengths.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            ndcg = ndcg_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            map3 = map_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.0, focus_topk=3)

            batch_size = scores.size(0)
            eligible = int((lengths >= min_group_size).sum())

            total_hitrate = total_hitrate + hitrate * eligible
            total_ndcg = total_ndcg + ndcg * eligible
            total_map = total_map + map3 * eligible
            total_loss += loss.item() * batch_size

            n_groups += batch_size
            n_eligible += eligible

    mean_loss = total_loss / n_groups if n_groups > 0 else 0.0
    metric_denominator = max(n_eligible, 1)
    return (
        mean_loss,
        total_hitrate / metric_denominator,
        total_ndcg / metric_denominator,
        total_map / metric_denominator,
    )


def build_cat_dims(X, cat_features):
    """Return ``{feature: max code + 1}`` for every categorical feature.

    A column whose maximum is negative is treated as having maximum 0, so the
    resulting size is never smaller than 1.
    """
    def _table_size(column):
        top = column.max()
        return (0 if top < 0 else top) + 1

    return {name: _table_size(X[name]) for name in cat_features}


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardise the given columns using statistics of the rows selected by ``train_mask``.

    Mean and standard deviation are computed on the masked rows only and then
    applied to every row; a non-positive standard deviation is replaced by 1.0.
    Returns the frame with the columns replaced.
    """
    for name in numeric_features:
        raw = X[name].to_numpy()
        fit_part = raw[train_mask]
        center = fit_part.mean()
        spread = fit_part.std()
        scale = spread if spread > 0 else 1.0
        X = X.with_columns(pl.Series(name, (raw - center) / scale).alias(name))
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Load the feature-engineered frame, building and caching it if needed.

    Args:
        data_dir: directory holding the raw parquet files and the cache.
        train_file: file name of the training parquet inside ``data_dir``.
        test_file: file name of the test parquet inside ``data_dir``.
        cache_file: file name of the cached feature-engineered parquet.

    Returns:
        ``(df, test_ids, test_rankers)``: the feature-engineered train+test
        frame and the ``Id`` / ``ranker_id`` columns of the raw test file.
    """
    cache_path = os.path.join(data_dir, cache_file)
    test_path = os.path.join(data_dir, test_file)

    if os.path.exists(cache_path):
        print(f"Loading cached feature-engineered data from {cache_path} ...")
        df = pl.read_parquet(cache_path)

        # Only the two key columns are needed here, so avoid materialising the
        # whole test file.
        test_keys = pl.read_parquet(test_path, columns=["Id", "ranker_id"])
        test_ids = test_keys["Id"]
        test_rankers = test_keys["ranker_id"]
    else:
        print("Cached file not found, processing raw data ...")

        train = pl.read_parquet(os.path.join(data_dir, train_file)).drop("__index_level_0__")
        # The test set gets a dummy "selected" column so it can be stacked
        # below the training rows.
        test = (
            pl.read_parquet(test_path)
            .drop("__index_level_0__")
            .with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))
        )

        test_ids = test["Id"]
        test_rankers = test["ranker_id"]

        df = pl.concat((train, test))
        df = feature_engineering(df, full=True)

        # Replace nulls: 0 for numeric columns, "missing" for string columns.
        df = df.with_columns(
            [pl.col(c).fill_null(0) for c in df.select(pl.selectors.numeric()).columns] +
            [pl.col(c).fill_null("missing") for c in df.select(pl.selectors.string()).columns]
        )

        os.makedirs(data_dir, exist_ok=True)
        df.write_parquet(cache_path)
        print(f"Saved feature-engineered data to {cache_path}")

    return df, test_ids, test_rankers


def prepare_data(data_dir):
    """Load the data and turn it into model-ready features.

    Categorical columns are replaced by dense-rank codes starting at 0 (nulls
    become -1) and numeric columns are standardised with statistics taken from
    the first ``train_size`` rows.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size, test_ids,
        test_rankers)``.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    ids = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)
    X = X.with_columns(pl.Series("Id", ids[:X.shape[0]]))

    dense_codes = [
        (pl.col(name).rank("dense") - 1).fill_null(-1).cast(pl.Int32)
        for name in cat_features
    ]
    X = X.with_columns(dense_codes)

    # NOTE(review): hard-coded; presumably the number of training rows that
    # precede the test rows in the concatenated frame -- confirm.
    train_size = 18145372
    is_train_row = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, is_train_row)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Signal a stop once a maximised metric has stalled for ``patience`` steps.

    A step counts as an improvement only when the metric exceeds the best
    value seen so far by more than ``min_delta``.
    """

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience, self.min_delta = patience, min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric`` and return True when training should stop."""
        improved = metric > self.best + self.min_delta
        if improved:
            self.best, self.counter = metric, 0
        else:
            self.counter += 1
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scale histogram of the group sizes in ``dataset``.

    The figure is written to ``save_path`` when one is given, otherwise it is
    shown interactively. The figure is closed afterwards in both cases.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.title("Distribution of Group Sizes")
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def _predict_in_row_order(model, loader, dataset, device, desc):
    """Score every row of ``dataset`` and return the scores aligned with its rows.

    ``loader`` must iterate ``dataset`` without shuffling, so that the k-th
    group it yields is ``dataset.group_keys[k]``. Scores are written back
    through ``dataset.group_to_indices``; the result therefore matches the row
    order of the frame the dataset was built from even when the rows of a
    group are not contiguous.
    """
    n_rows = sum(len(rows) for rows in dataset.group_to_indices.values())
    row_scores = np.empty(n_rows, dtype=np.float32)
    group_pos = 0

    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(loader, desc=desc):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num).cpu().numpy()

            for group_scores, length in zip(scores, lengths.tolist()):
                rows = dataset.group_to_indices[dataset.group_keys[group_pos]]
                # drop the padded tail and scatter to the original row positions
                row_scores[rows] = group_scores[:length]
                group_pos += 1

    return row_scores


def _save_curve(epochs, series, ylabel, title, path):
    """Plot per-epoch curves and save the figure to ``path``.

    ``series`` is an iterable of ``(values, label, color)``; a ``color`` of
    ``None`` leaves the choice to matplotlib's colour cycle.
    """
    plt.figure()
    for values, label, color in series:
        if color is None:
            plt.plot(epochs, values, label=label)
        else:
            plt.plot(epochs, values, label=label, color=color)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.savefig(path)
    plt.close()


def main():
    """Train the ranker, track validation metrics and write predictions.

    Steps: load and split the data by fixed row offsets, train with early
    stopping on the validation loss, save metric plots, reload the best
    checkpoint, then write full validation predictions and the test
    submission as CSV files.
    """
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    best_val_loss = float("inf")
    best_hitrate = 0.0
    model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_losses = []
    val_losses = []
    val_hitrates = []
    val_ndcgs = []
    val_maps = []
    learning_rates = []

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    early_stopper = EarlyStopping(patience=3, min_delta=0.001)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Fill missing values
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    for col in float_cols:
        X = X.with_columns(
            pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
        )

    # Rows [0, n1) train, [n1, n2) validation, [n2, end) test.
    n1 = 16487352
    # n1 = 17487300
    n2 = train_size

    X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
    y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
    groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

    # Inspect group sizes in validation set
    val_rankers = groups_va["ranker_id"].to_numpy()
    group_to_rows = defaultdict(list)
    for i, g in enumerate(val_rankers):
        group_to_rows[g].append(i)

    print("\n📋 Validation group statistics:")
    # Save detailed group info to file
    group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
    with open(group_info_path, "w", encoding="utf-8") as f:
        f.write(f"Total groups: {len(group_to_rows)}\n\n")
        f.write("GroupID\tSize\tStartIdx\tEndIdx\n")
        for group_id, indices in sorted(group_to_rows.items()):
            f.write(f"{group_id}\t{len(indices)}\t{indices[0]}\t{indices[-1]}\n")

    print(f"📁 Saved validation group info to {group_info_path}")

    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    # The evaluation loaders must stay unshuffled: prediction relies on the
    # loader visiting groups in dataset order.
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    num_epochs = 10
    learning_rate = 3e-4

    model = DLRankerAttention(
        cat_dims=cat_dims,
        num_numeric_feats=num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    scheduler = OneCycleLR(
        optimizer,
        max_lr=learning_rate,
        steps_per_epoch=len(train_loader),
        epochs=num_epochs,
        pct_start=0.3,
    )

    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
        val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_hitrates.append(val_hitrate.item())
        val_ndcgs.append(val_ndcg.item())
        val_maps.append(val_map.item())

        current_lr = scheduler.get_last_lr()
        learning_rates.append(current_lr[0])

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"Val Loss={val_loss:.4f}, "
            f"HitRate@3={val_hitrate:.4f}, "
            f"NDCG@3={val_ndcg:.4f}, "
            f"MAP@3={val_map:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )

        # A checkpoint is written only when BOTH loss and hit rate improve.
        if val_loss < best_val_loss and val_hitrate > best_hitrate:
            best_val_loss = val_loss
            best_hitrate = val_hitrate
            torch.save(model.state_dict(), model_path)
            print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

        # EarlyStopping maximises its metric, hence the negated loss.
        if early_stopper.step(-val_loss):
            print("⛔ Early stopping triggered.")
            break

    # Plot metrics
    epochs = list(range(1, len(train_losses) + 1))

    _save_curve(
        epochs,
        [(train_losses, "Train Loss", None), (val_losses, "Val Loss", None)],
        "Loss",
        "Training & Validation Loss",
        os.path.join(MODEL_DIR, "loss_curve.png"),
    )
    _save_curve(
        epochs,
        [(val_hitrates, "Val HitRate@3", "green")],
        "HitRate@3",
        "Validation HitRate@3",
        os.path.join(MODEL_DIR, "hitrate_curve.png"),
    )
    _save_curve(
        epochs,
        [(learning_rates, "Learning Rate", "orange")],
        "LR",
        "Learning Rate Schedule",
        os.path.join(MODEL_DIR, "lr_curve.png"),
    )
    _save_curve(
        epochs,
        [(val_ndcgs, "Val NDCG@3", "purple")],
        "NDCG@3",
        "Validation NDCG@3",
        os.path.join(MODEL_DIR, "ndcg_curve.png"),
    )
    _save_curve(
        epochs,
        [(val_maps, "Val MAP@3", "red")],
        "MAP@3",
        "Validation MAP@3",
        os.path.join(MODEL_DIR, "map_curve.png"),
    )

    print("📊 Saved metric plots to model/")

    # Reload the best checkpoint; map_location keeps this working when the
    # checkpoint was written on a different device.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    # Predict validation set
    val_ids = X_va["Id"]
    val_rankers = groups_va["ranker_id"]
    val_labels = y_va

    all_val_scores = _predict_in_row_order(
        model, val_loader, val_dataset, device, "Predicting validation set"
    )

    assert len(all_val_scores) == X_va.shape[0], \
        f"Mismatch: {len(all_val_scores)} scores vs {X_va.shape[0]} rows"

    val_df = X_va.with_columns([
        pl.Series("Id", val_ids),
        pl.Series("ranker_id", val_rankers),
        pl.Series("score", all_val_scores),
        pl.Series("label", val_labels)
    ])

    # Rank 1 = highest score within each ranker_id.
    val_df = val_df.with_columns(
        pl.col("score")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
    val_df.write_csv(val_save_path)
    print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

    # Predict test set
    all_scores = _predict_in_row_order(
        model, test_loader, test_dataset, device, "Predicting test set"
    )

    assert len(all_scores) == X_te.shape[0], \
        f"Mismatch: {len(all_scores)} scores vs {X_te.shape[0]} rows in X_te"

    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": all_scores
    })

    submission = (
        submission_df
        .with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )
        .select(["Id", "ranker_id", "score", "selected"])
    )

    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
    submission.write_csv(submission_path)
    print(f"✅ Submission saved to {submission_path}")


if __name__ == "__main__":
    main()

Writing deeprec_validate.py


In [None]:
--- Epoch 1/10 ---
Validating:   0%|          | 0/2636 [00:00<?, ?it/s]/home/ionut/test_whisper/test2/deeprec_validate.py:515: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  lengths = torch.tensor(lengths).to(device)
Epoch 1: Train Loss=0.3199, Val Loss=0.2600, HitRate@3=0.4677, NDCG@3=0.3706, MAP@3=0.6162, LR=0.000084 (607.2s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 2/10 ---
Epoch 2: Train Loss=0.2564, Val Loss=0.2443, HitRate@3=0.4911, NDCG@3=0.3922, MAP@3=0.6326, LR=0.000228 (605.1s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 3/10 ---
Epoch 3: Train Loss=0.2452, Val Loss=0.2439, HitRate@3=0.4942, NDCG@3=0.3957, MAP@3=0.6396, LR=0.000300 (604.9s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 4/10 ---
Epoch 4: Train Loss=0.2351, Val Loss=0.2313, HitRate@3=0.5130, NDCG@3=0.4129, MAP@3=0.6533, LR=0.000285 (607.4s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 5/10 ---
Epoch 5: Train Loss=0.2225, Val Loss=0.2211, HitRate@3=0.5126, NDCG@3=0.4148, MAP@3=0.6601, LR=0.000244 (607.2s)

--- Epoch 6/10 ---
Epoch 6: Train Loss=0.2133, Val Loss=0.2222, HitRate@3=0.5161, NDCG@3=0.4155, MAP@3=0.6558, LR=0.000183 (607.9s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

In [None]:
%%writefile deeprec_full_training.py
import os
import time
import random
import torch
import math
import torch.nn as nn
import torch.nn.functional as F_torch
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter
from torch.optim.lr_scheduler import OneCycleLR

class RankDataset(Dataset):
    """Dataset that yields one whole ranking group per item.

    Rows are grouped by the ``ranker_id`` column of ``groups``; groups are
    ordered by first appearance and each keeps the positions of its rows in
    the input frame.

    Args:
        X: feature frame containing all ``cat_features`` and ``num_features``.
        y: label per row.
        groups: frame with a ``ranker_id`` column, aligned with ``X``.
        cat_features: names of the categorical (integer-coded) columns.
        num_features: names of the numeric columns.
    """

    def __init__(
        self,
        X: pl.DataFrame,
        y: pl.Series,
        groups: pl.DataFrame,
        cat_features: list[str],
        num_features: list[str],
    ):
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # group id -> row positions, in order of first appearance
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of groups."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        ``x_cat`` maps each categorical feature to an int64 tensor of codes,
        ``x_num`` is a float32 tensor with one row per group member, ``y`` is
        a flat float32 label tensor and ``length`` is the group size.
        """
        inds = self.group_to_indices[self.group_keys[idx]]
        length = len(inds)

        # One vectorised gather + clamp per feature instead of a Python loop
        # over every row: negative codes are mapped to 0.
        # NOTE(review): 0 is also the padding value used by collate_fn, so
        # "missing", category 0 and padding share one code -- confirm intended.
        x_cat = {
            c: torch.from_numpy(
                np.maximum(self.X_cat[c][inds], 0).astype(np.int64)
            )
            for c in self.cat_features
        }

        x_num = torch.as_tensor(self.X_num[inds], dtype=torch.float32)
        y = torch.as_tensor(self.y[inds], dtype=torch.float32).reshape(-1)

        return x_cat, x_num, y, length

def collate_fn(batch):
    """Pad variable-length groups to a common length and stack them.

    Args:
        batch: list of ``(x_cat, x_num, y, length)`` tuples, where ``x_cat`` is
            a dict of 1-D long tensors, ``x_num`` a 2-D tensor and ``y`` a 1-D
            tensor, each with ``length`` rows.

    Returns:
        ``(x_cat, x_num, y, lengths)`` where every tensor has a leading batch
        dimension, shorter groups are right-padded with zeros, and ``lengths``
        is a long tensor holding the original group sizes.
    """
    group_lengths = [sample[3] for sample in batch]
    longest = max(group_lengths)
    feature_names = list(batch[0][0].keys())
    width = batch[0][1].shape[1]

    cat_columns = {name: [] for name in feature_names}
    num_rows = []
    label_rows = []

    for cats, nums, labels, size in batch:
        missing = longest - size
        if missing > 0:
            # Right-pad every per-group tensor with zeros up to the longest group.
            cats = {
                name: torch.cat([cats[name], torch.zeros(missing, dtype=torch.long)])
                for name in feature_names
            }
            nums = torch.cat([nums, torch.zeros(missing, width)])
            labels = torch.cat([labels, torch.zeros(missing)])

        for name in feature_names:
            cat_columns[name].append(cats[name])
        num_rows.append(nums)
        label_rows.append(labels)

    return (
        {name: torch.stack(column) for name, column in cat_columns.items()},
        torch.stack(num_rows),
        torch.stack(label_rows),
        torch.tensor(group_lengths, dtype=torch.long),
    )

class SEModule(nn.Module):
    """Channel gate in the squeeze-and-excitation style for ``[B, N, C]`` inputs.

    The input is averaged over dimension 1, passed through a two-layer
    bottleneck ending in a sigmoid, and the resulting per-channel weights are
    multiplied back onto every position.
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        hidden = channel // reduction
        layers = [
            nn.Linear(channel, hidden),
            nn.ReLU(inplace=True),
            nn.Linear(hidden, channel),
            nn.Sigmoid(),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Scale ``x`` ([B, N, C]) by gates computed from its mean over dim 1."""
        squeezed = x.mean(dim=1)                # [B, C]
        gate = self.fc(squeezed).unsqueeze(1)   # [B, 1, C]
        return x * gate


class DLRankerAttention(nn.Module):
    def __init__(
        self,
        cat_dims,
        num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=4,
        dim_feedforward=128,
        proj_dim=64,
    ):
        super().__init__()

        self.emb_dims = {
            f: min(50, max(4, round(6 * math.log2(cat_dims[f] + 1)))) for f in cat_dims
        }
        self.embeddings = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], self.emb_dims[f], padding_idx=0)
            for f in cat_dims
        })
        self.emb_dropout = nn.Dropout(emb_dropout)

        total_emb_dim = sum(self.emb_dims.values())
        print(f"Categorial emb dim sum: {total_emb_dim}, Numerical dim: {num_numeric_feats}")

        self.cat_proj = nn.Linear(total_emb_dim, proj_dim)
        self.cat_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=4,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
                norm_first=True,
            ),
            num_layers=1,
        )

        self.num_proj_in = nn.Linear(num_numeric_feats, proj_dim)
        self.num_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=4,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
                norm_first=True,
            ),
            num_layers=1,
        )
        self.num_se = SEModule(proj_dim)
        self.num_proj_out = nn.Sequential(
            nn.Linear(proj_dim, proj_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
        )

        self.cross_attn_cat2num = nn.MultiheadAttention(
            embed_dim=proj_dim,
            num_heads=4,
            dropout=dropout,
            batch_first=True,
        )

        self.transformer_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=2 * proj_dim,
                nhead=num_heads,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
                norm_first=True,
            )
            for _ in range(num_layers)
        ])

        # Contextual pooling → concat global context to each item
        self.mlp = nn.Sequential(
            nn.Linear(2 * proj_dim + 2 * proj_dim, 128),  # context + item
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(64, 1),
        )

    def forward(self, x_cat: dict, x_num: torch.Tensor, attn_mask=None):
        B, N, _ = x_num.shape

        cat_embs = torch.cat([self.embeddings[f](x_cat[f]) for f in x_cat], dim=-1)
        cat_embs = self.emb_dropout(cat_embs)
        cat_feat = self.cat_proj(cat_embs)
        cat_feat = self.cat_transformer(cat_feat)

        num_feat = self.num_proj_in(x_num)
        num_feat = self.num_transformer(num_feat)
        num_feat = self.num_se(num_feat)
        num_feat = self.num_proj_out(num_feat)

        cat_feat_attn, _ = self.cross_attn_cat2num(query=cat_feat, key=num_feat, value=num_feat)
        cat_feat = cat_feat + cat_feat_attn

        x = torch.cat([cat_feat, num_feat], dim=-1)

        for layer in self.transformer_layers:
            x = layer(x, src_key_padding_mask=attn_mask)

        # 🔥 Contextual pooling
        global_context = x.mean(dim=1, keepdim=True).expand(-1, N, -1)  # [B, N, 2*D]
        x = torch.cat([x, global_context], dim=-1)  # [B, N, 4*D]

        scores = self.mlp(x).squeeze(-1)
        return scores

"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3):
    """Fraction of groups whose top-``k`` scored items contain a positive label.

    Only the first ``lengths[i]`` entries of row ``i`` are considered; for
    groups shorter than ``k`` the whole group counts as the top-k.
    """
    n_groups = scores.size(0)
    hit_total = 0
    for row in range(n_groups):
        n_items = lengths[row]
        depth = min(n_items, k)
        group_scores = scores[row][:n_items]
        group_labels = labels[row][:n_items]
        best = torch.topk(group_scores, depth).indices
        # 1.0 when at least one of the best-scored items is labelled positive
        hit_total += (group_labels[best] > 0).any().float()
    return hit_total / n_groups

"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """RankNet pairwise loss, summed over pairs and averaged over groups.

    For every ordered pair (i, j) of real items with ``label_i > label_j``
    the term ``-log(sigmoid(score_i - score_j))`` is added.
    """
    n_groups = scores.size(0)
    loss_sum = 0.0

    for g in range(n_groups):
        n_items = lengths[g]
        s = scores[g][:n_items]
        y = labels[g][:n_items]

        # prefer[i, j] == 1 exactly when item i should outrank item j
        prefer = (y[:, None] > y[None, :]).float()
        pair_log_prob = torch.nn.functional.logsigmoid(s[:, None] - s[None, :])

        loss_sum += (-prefer * pair_log_prob).sum()

    return loss_sum / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """Weighted pairwise logistic loss focused on the top-labelled items.

    A pair (i, j) contributes when ``label_i > label_j`` and at least one of
    the two items is among the ``focus_topk`` highest labels of its group.
    Each pair is weighted by ``|label_i - label_j|``; the result is the
    weighted sum of ``-logsigmoid((s_i - s_j - margin) / temperature)``
    divided by the total weight over the batch (the margin is only
    subtracted when it is positive). Groups with fewer than two items are
    skipped. Returns a zero tensor that requires grad when no pair qualifies.
    """
    loss_sum = 0.0
    weight_sum = 0

    for g in range(scores.size(0)):
        n_items = lengths[g]
        if n_items < 2:
            continue

        s = scores[g][:n_items]
        y = labels[g][:n_items]

        # Flag the items holding the largest ground-truth labels.
        in_head = torch.zeros_like(y, dtype=torch.bool)
        in_head[torch.topk(y, min(focus_topk, n_items)).indices] = True

        label_gap = y[:, None] - y[None, :]
        touches_head = in_head[:, None] | in_head[None, :]
        eligible = (label_gap > 0) & touches_head
        if not eligible.any():
            continue

        weights = (label_gap.abs() * eligible).float()

        logits = s[:, None] - s[None, :]
        if margin > 0.0:
            logits = logits - margin
        logits = logits / temperature

        loss_sum += (-F_torch.logsigmoid(logits) * weights).sum()
        weight_sum += weights.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    The scheduler is stepped after every batch, not once per epoch.
    NOTE(review): the model is called without a padding mask, so padded
    items take part in attention — confirm this is intended.
    """
    model.train()
    running_loss = 0.0

    progress = tqdm(loader, desc="Training", leave=False)
    for x_cat, x_num, y, lengths in progress:
        x_num = x_num.to(device)
        y = y.to(device)
        if not isinstance(lengths, torch.Tensor):
            lengths = torch.tensor(lengths)
        lengths = lengths.detach().clone().to(device)
        x_cat = {name: codes.to(device) for name, codes in x_cat.items()}

        optimizer.zero_grad()
        batch_scores = model(x_cat, x_num)
        loss = xranknet_loss(
            batch_scores, y, lengths, temperature=1.0, margin=0.0, focus_topk=3
        )
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()

    return running_loss / len(loader)


def validate(model, loader, device):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Returns:
        tuple: ``(avg_loss, avg_hitrate)`` — the xRankNet loss and HitRate@3,
        both averaged over groups (each batch weighted by its size).

    Raises:
        ValueError: if ``loader`` yields no batches.
    """
    model.eval()
    total_hitrate = 0
    total_loss = 0
    count = 0

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):

            x_num, y = x_num.to(device), y.to(device)
            # as_tensor accepts both a list and the tensor produced by
            # collate_fn; torch.tensor() on a tensor warns on every batch.
            lengths = torch.as_tensor(lengths).to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.0, focus_topk=3)

            total_hitrate += hitrate * scores.size(0)
            total_loss += loss.item() * scores.size(0)
            count += scores.size(0)

    if count == 0:
        raise ValueError("validate() received an empty loader")

    avg_hitrate = total_hitrate / count
    avg_loss = total_loss / count
    return avg_loss, avg_hitrate


def build_cat_dims(X, cat_features):
    """Map each categorical feature to ``max(code, 0) + 1``.

    A column whose largest code is negative gets a cardinality of 1.
    """
    return {name: max(X[name].max(), 0) + 1 for name in cat_features}


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardise numeric columns with statistics of the training rows only.

    Args:
        X: polars DataFrame holding the columns to normalise.
        numeric_features: names of the columns to standardise.
        train_mask: boolean array selecting the rows used for mean/std.

    Returns:
        The DataFrame with every listed column replaced by
        ``(value - mean) / std``. NaN entries are ignored when computing the
        statistics (and stay NaN in the output), so a few missing values no
        longer turn the whole column into NaN. A zero or undefined std falls
        back to 1.0.
    """
    for col in numeric_features:
        values = X[col].to_numpy()
        train_vals = values[train_mask]
        observed = train_vals[~np.isnan(train_vals)]

        if observed.size:
            mean = observed.mean()
            std = observed.std()
        else:
            # nothing to estimate from: leave the column unshifted and unscaled
            mean, std = 0.0, 1.0
        if not std > 0:
            std = 1.0

        norm_vals = (values - mean) / std
        X = X.with_columns(pl.Series(col, norm_vals).alias(col))
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Return the feature-engineered frame plus the test ``Id`` / ``ranker_id`` columns.

    When ``cache_file`` exists in ``data_dir`` it is loaded as is. Otherwise
    train and test are concatenated (test gets ``selected = 0``), passed
    through ``feature_engineering``, nulls are filled (0 for numeric columns,
    "missing" for string columns) and the result is written to the cache.
    """

    def _read_raw(file_name):
        # every raw file carries a pandas index column that is not needed
        return pl.read_parquet(os.path.join(data_dir, file_name)).drop("__index_level_0__")

    cache_path = os.path.join(data_dir, cache_file)

    if os.path.exists(cache_path):
        print(f"Loading cached feature-engineered data from {cache_path} ...")
        df = pl.read_parquet(cache_path)
        test = _read_raw(test_file)
        return df, test["Id"], test["ranker_id"]

    print("Cached file not found, processing raw data ...")

    train = _read_raw(train_file)
    test = _read_raw(test_file).with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))

    test_ids = test["Id"]
    test_rankers = test["ranker_id"]

    df = feature_engineering(pl.concat((train, test)), full=True)

    numeric_fill = [pl.col(c).fill_null(0) for c in df.select(pl.selectors.numeric()).columns]
    string_fill = [pl.col(c).fill_null("missing") for c in df.select(pl.selectors.string()).columns]
    df = df.with_columns(numeric_fill + string_fill)

    os.makedirs(data_dir, exist_ok=True)
    df.write_parquet(cache_path)
    print(f"Saved feature-engineered data to {cache_path}")

    return df, test_ids, test_rankers


def prepare_data(data_dir, train_size=18145372):
    """Load features, integer-encode categoricals and standardise numerics.

    Args:
        data_dir: directory handed to ``load_or_prepare_data``.
        train_size: number of leading rows that belong to the training set.
            The default is the row count that used to be hard-coded here;
            pass another value when working on a different or sampled file.

    Returns:
        tuple: ``(X, y, groups, cat_features, num_features, train_size,
        test_ids, test_rankers)``.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    # Dense rank - 1 turns each category into a code 0..n-1; nulls become -1.
    X = X.with_columns(
        [
            (pl.col(c).rank("dense") - 1).fill_null(-1).cast(pl.Int32)
            for c in cat_features
        ]
    )

    # Normalisation statistics come from the training rows only.
    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Signal a stop once a metric (higher is better) has stalled.

    ``step`` returns True after ``patience`` consecutive calls in which the
    metric failed to beat the best value so far by more than ``min_delta``.
    """

    def __init__(self, patience=3, min_delta=0.001):
        self.patience, self.min_delta = patience, min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric`` and report whether training should stop."""
        improved = metric > self.best + self.min_delta
        if improved:
            self.best = metric
        self.counter = 0 if improved else self.counter + 1
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scale histogram of group sizes.

    The figure is written to ``save_path`` when one is given, otherwise it is
    shown interactively; it is closed afterwards in both cases.
    """
    group_sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(group_sizes, bins=100, log=True)
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Distribution of Group Sizes")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def main():
    """Train ``DLRankerAttention`` on the training rows and write a test submission.

    Steps: load features through ``prepare_data``, replace NaNs in Float64
    columns with 0, split at ``train_size``, train for a fixed number of
    epochs with AdamW + OneCycleLR, then score the remaining rows and save
    their per-``ranker_id`` ranks as a parquet file. No validation is run
    here, so the checkpoint is simply overwritten after every epoch.

    Raises:
        RuntimeError: if the number of scored rows differs from the number
            of test rows.
    """
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Fill missing values (all Float64 columns in a single pass)
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    if float_cols:
        X = X.with_columns([
            pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
            for col in float_cols
        ])

    n2 = train_size

    X_tr, X_te = X[:n2], X[n2:]
    y_tr, y_te = y[:n2], y[n2:]
    groups_tr, groups_te = groups[:n2], groups[n2:]

    # Cardinalities come from train + test so test-only codes stay in range.
    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    num_epochs = 10
    learning_rate = 3e-4

    model = DLRankerAttention(
        cat_dims=cat_dims,
        num_numeric_feats=num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.05)

    scheduler = OneCycleLR(
        optimizer,
        max_lr=learning_rate,
        steps_per_epoch=len(train_loader),
        epochs=num_epochs,
        pct_start=0.3,
    )

    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)

        current_lr = scheduler.get_last_lr()

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )

        # Saved unconditionally: there is no validation metric to compare.
        torch.save(model.state_dict(), model_path)
        print(f"💾 Checkpoint saved to {model_path}.")

    # Predict test set
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    # Scores are written back by original row index. The loader yields whole
    # groups in `group_keys` order, which only equals row order when every
    # ranker_id occupies a contiguous block of rows.
    test_scores = np.zeros(X_te.shape[0], dtype=np.float32)
    rows_scored = 0
    next_group = 0

    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(test_loader, desc="Predicting test set"):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num).cpu().numpy()

            for group_scores, n_items in zip(scores, lengths.tolist()):
                group_key = test_dataset.group_keys[next_group]
                rows = test_dataset.group_to_indices[group_key]
                test_scores[rows] = group_scores[:n_items]
                rows_scored += n_items
                next_group += 1

    if rows_scored != X_te.shape[0]:
        raise RuntimeError(
            f"Mismatch: {rows_scored} scores vs {X_te.shape[0]} rows in X_te"
        )

    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": test_scores
    })

    # Rank 1 is the highest score within each ranker_id.
    submission = (
        submission_df
        .with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )
        .select(["Id", "ranker_id", "selected"])
    )

    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.parquet")
    submission.write_parquet(submission_path)
    print(f"✅ Submission saved to {submission_path}")


# Run the training / submission pipeline only when executed as a script.
if __name__ == "__main__":
    main()

Writing deeprec_full_training.py


# **2 Stage Training**

**Top 10**

In [None]:
%%writefile deeprec_validate_top10.py
import os
import time
import random
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter
from torch.optim.lr_scheduler import OneCycleLR

# Seed the Python, NumPy and PyTorch RNGs (plus every CUDA device when one
# is available) with the same fixed value.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

class RankDataset(Dataset):
    """Dataset whose items are whole ranking groups (all rows of one ``ranker_id``).

    Feature columns are converted to NumPy once at construction; rows are
    bucketed by group key in order of first appearance.
    """

    def __init__(
        self,
        X: pl.DataFrame,
        y: pl.Series,
        groups: pl.DataFrame,
        cat_features: list[str],
        num_features: list[str],
    ):
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # group key -> list of row positions, keys kept in first-seen order
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of groups."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        ``x_cat`` maps each categorical feature to a ``LongTensor`` of codes
        (negative codes clamped to 0), ``x_num`` is a ``FloatTensor`` with
        one row per item, ``y`` is a flat ``FloatTensor`` of labels and
        ``length`` is the number of items in the group.
        """
        inds = self.group_to_indices[self.group_keys[idx]]

        # The clamp is applied to the whole slice at once instead of element
        # by element in Python: this method runs for every group, every epoch.
        x_cat = {
            c: torch.from_numpy(np.maximum(self.X_cat[c][inds], 0).astype(np.int64))
            for c in self.cat_features
        }

        x_num = torch.FloatTensor(self.X_num[inds])
        y = torch.FloatTensor(self.y[inds]).reshape(-1)

        return x_cat, x_num, y, len(inds)

def collate_fn(batch):
    """Pad a list of variable-length groups into dense batch tensors.

    Args:
        batch: non-empty list of ``(x_cat, x_num, y, length)`` tuples as
            produced by ``RankDataset.__getitem__``.

    Returns:
        tuple: ``(padded_x_cat, padded_x_num, padded_y, lengths)``. Every
        tensor is zero-padded along the item axis up to the longest group in
        the batch; ``lengths`` is a long tensor with the true group sizes.

    Raises:
        ValueError: if ``batch`` is empty.
    """
    from torch.nn.utils.rnn import pad_sequence

    if not batch:
        raise ValueError("collate_fn received an empty batch")

    cat_parts, num_parts, label_parts, lengths = zip(*batch)

    # pad_sequence pads with 0 and keeps each tensor's dtype, which matches
    # the manual torch.zeros padding this replaces.
    padded_x_cat = {
        key: pad_sequence([cats[key] for cats in cat_parts], batch_first=True)
        for key in cat_parts[0]
    }
    padded_x_num = pad_sequence(list(num_parts), batch_first=True)
    padded_y = pad_sequence(list(label_parts), batch_first=True)
    lengths = torch.tensor(lengths, dtype=torch.long)

    return padded_x_cat, padded_x_num, padded_y, lengths

def _masked_mean(x, pad_mask):
    """Average ``x`` ([B, N, C]) over the item axis, skipping padded items.

    ``pad_mask`` is a boolean ``[B, N]`` tensor that is True on padding.
    """
    summed = x.masked_fill(pad_mask.unsqueeze(-1), 0.0).sum(dim=1)
    # clamp keeps an all-padding row from dividing by zero
    n_real = (~pad_mask).sum(dim=1, keepdim=True).clamp(min=1).to(x.dtype)
    return summed / n_real


class SEModule(nn.Module):
    """Squeeze-and-excitation gate over the channel axis of a ``[B, N, C]`` tensor.

    The input is averaged over the item axis, passed through a two-layer
    bottleneck ending in a sigmoid, and the resulting per-channel weights
    rescale every item of the group.
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel),
            nn.Sigmoid(),
        )

    def forward(self, x, pad_mask=None):
        """Gate ``x``; when ``pad_mask`` is given, padded items are left out of the average."""
        # x: [B, N, C]
        pooled = x.mean(dim=1) if pad_mask is None else _masked_mean(x, pad_mask)
        w = self.fc(pooled).unsqueeze(1)  # [B, 1, C]
        return x * w


class FiLM(nn.Module):
    """Feature-wise linear modulation: ``gamma(cond) * x + beta(cond)``.

    ``gamma`` and ``beta`` are linear maps of the conditioning tensor, which
    must have the same last dimension as ``x``.
    """

    def __init__(self, feature_dim):
        super().__init__()
        self.gamma = nn.Linear(feature_dim, feature_dim)
        self.beta = nn.Linear(feature_dim, feature_dim)

    def forward(self, x, cond):
        gamma = self.gamma(cond)
        beta = self.beta(cond)
        return gamma * x + beta


class DLRankerAttention(nn.Module):
    """Group-wise ranker scoring every item of a padded group.

    Categorical codes are embedded, concatenated and projected to
    ``proj_dim``; numeric features are projected to ``proj_dim``. Each stream
    goes through a one-layer transformer encoder over the items of the group
    and the numeric stream is SE-gated. The categorical stream is then
    FiLM-modulated by the numeric stream and attends over it. Both streams
    are concatenated, refined by ``num_layers`` encoder layers and mapped to
    one score per item by an MLP.

    Args:
        cat_dims: mapping feature name -> number of embedding rows.
        num_numeric_feats: width of the numeric feature vector.
        emb_dropout: dropout applied to the concatenated embeddings.
        dropout: dropout used in the encoders, attention and MLP.
        num_heads: heads of the final encoder layers (the per-stream encoders
            and the cross-attention use a fixed 4 heads).
        num_layers: number of final encoder layers.
        dim_feedforward: feed-forward width of every encoder layer.
        proj_dim: width of each stream after projection.
    """

    def __init__(
        self,
        cat_dims,
        num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64,
    ):
        super().__init__()

        # Embedding width grows with log2 of the cardinality, clipped to [4, 50].
        self.emb_dims = {
            f: min(50, max(4, round(6 * math.log2(cat_dims[f] + 1)))) for f in cat_dims
        }
        # NOTE(review): padding_idx=0 freezes row 0 at zero, while
        # RankDataset also clamps negative codes to 0 — confirm that 0 is
        # never a real category code upstream.
        self.embeddings = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], self.emb_dims[f], padding_idx=0)
            for f in cat_dims
        })
        self.emb_dropout = nn.Dropout(emb_dropout)

        total_emb_dim = sum(self.emb_dims.values())
        print(f"Categorial emb dim sum: {total_emb_dim}, Numerical dim: {num_numeric_feats}")

        self.cat_proj = nn.Linear(total_emb_dim, proj_dim)
        self.cat_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=4,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=1,
        )

        self.num_proj_in = nn.Linear(num_numeric_feats, proj_dim)
        self.num_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=4,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=1,
        )
        self.num_se = SEModule(proj_dim)
        self.num_proj_out = nn.Sequential(
            nn.Linear(proj_dim, proj_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
        )

        # 🔁 Cross-attention: cat attends to num
        self.cross_attn_cat2num = nn.MultiheadAttention(
            embed_dim=proj_dim,
            num_heads=4,
            dropout=dropout,
            batch_first=True,
        )

        self.film = FiLM(proj_dim)  # 🔧 FiLM added here

        self.transformer_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=2 * proj_dim,
                nhead=num_heads,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            )
            for _ in range(num_layers)
        ])

        self.mlp = nn.Sequential(
            nn.Linear(2 * proj_dim, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(64, 1),
        )

    def forward(self, x_cat: dict, x_num: torch.Tensor, attn_mask=None):
        """Score every item of every group.

        Args:
            x_cat: dict feature name -> long tensor ``[B, N]`` of codes.
            x_num: float tensor ``[B, N, F_num]``.
            attn_mask: optional ``[B, N]`` padding mask, True on padded
                items. When given it is applied to every attention block and
                to the SE pooling, so padded items do not influence the
                scores of real items. With ``None`` the whole padded
                sequence is treated as real items.

        Returns:
            Float tensor ``[B, N]`` of scores (entries at padded positions
            carry no meaning).
        """
        pad_mask = None if attn_mask is None else attn_mask.to(torch.bool)

        # Categorical embeddings
        cat_embs = torch.cat([self.embeddings[f](x_cat[f]) for f in x_cat], dim=-1)
        cat_embs = self.emb_dropout(cat_embs)
        cat_feat = self.cat_proj(cat_embs)  # [B, N, D]
        cat_feat = self.cat_transformer(cat_feat, src_key_padding_mask=pad_mask)  # [B, N, D]

        # Numeric features
        num_feat = self.num_proj_in(x_num)
        num_feat = self.num_transformer(num_feat, src_key_padding_mask=pad_mask)
        num_feat = self.num_se(num_feat, pad_mask)
        num_feat = self.num_proj_out(num_feat)

        # 🔧 FiLM: condition cat_feat on num_feat
        cat_feat = self.film(cat_feat, num_feat)

        # Cross-attention: categorical features attend over numeric ones
        cat_feat_attn, _ = self.cross_attn_cat2num(
            query=cat_feat, key=num_feat, value=num_feat, key_padding_mask=pad_mask
        )
        cat_feat = cat_feat + cat_feat_attn  # residual connection

        # Final concatenation
        x = torch.cat([cat_feat, num_feat], dim=-1)

        for layer in self.transformer_layers:
            x = layer(x, src_key_padding_mask=pad_mask)

        scores = self.mlp(x).squeeze(-1)  # [B, N]
        return scores




"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=10, min_group_size=10):
    """HitRate@k averaged over groups with at least ``min_group_size`` items.

    A group is a hit when one of its ``k`` best-scored items has a positive
    label. Smaller groups are ignored; when no group qualifies a zero tensor
    is returned.
    """
    hit_total = 0
    n_eligible = 0
    for row in range(scores.size(0)):
        n_items = lengths[row]
        if n_items < min_group_size:
            continue
        depth = min(n_items, k)
        group_scores = scores[row][:n_items]
        group_labels = labels[row][:n_items]
        best = torch.topk(group_scores, depth).indices
        hit_total += (group_labels[best] > 0).any().float()
        n_eligible += 1

    if n_eligible > 0:
        return hit_total / n_eligible
    return torch.tensor(0.0, device=scores.device)


def ndcg_at_k(scores, labels, lengths, k=10, min_group_size=10):
    """NDCG@k averaged over groups with at least ``min_group_size`` items.

    Gains are the raw labels and the discount at rank ``r`` (1-based) is
    ``log2(r + 1)``. A group whose ideal DCG is zero contributes 0. When no
    group qualifies a zero tensor is returned.
    """
    ndcg_sum = 0.0
    n_eligible = 0

    for row in range(scores.size(0)):
        n_items = lengths[row]
        if n_items < min_group_size:
            continue

        s = scores[row][:n_items]
        y = labels[row][:n_items]

        depth = min(k, n_items)
        predicted_top = torch.topk(s, depth).indices
        ideal_top = torch.topk(y, depth).indices

        # Both rankings have the same length, so they share one discount vector.
        discount = torch.log2(
            torch.arange(2, 2 + len(predicted_top), device=y.device).float()
        )
        dcg = (y[predicted_top] / discount).sum()
        idcg = (y[ideal_top] / discount).sum()

        ndcg_sum += dcg / idcg if idcg > 0 else 0.0
        n_eligible += 1

    if n_eligible > 0:
        return ndcg_sum / n_eligible
    return torch.tensor(0.0, device=scores.device)


def map_at_k(scores, labels, lengths, k=10, min_group_size=10):
    """Mean average precision at ``k`` over groups with at least ``min_group_size`` items.

    For each eligible group the average precision is the mean of
    precision@rank taken at the ranks (within the top ``k`` by score) that
    hold a positive label. An eligible group with no positive item in its
    top ``k`` counts as AP = 0 rather than being dropped, so the result is a
    mean over all eligible groups.

    Returns:
        A 0-dim tensor on ``scores.device`` (0.0 when no group is eligible).
    """
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        l = lengths[i]
        if l < min_group_size:
            continue
        valid_count += 1

        s = scores[i][:l]
        y = labels[i][:l]

        top_idx = torch.topk(s, min(k, l)).indices
        hits = (y[top_idx] > 0).float()
        n_hits = hits.sum()
        if n_hits == 0:
            continue  # AP is 0; the group is already counted above

        # precision@r for r = 1..k, kept only at the ranks that are hits
        ranks = torch.arange(1, hits.numel() + 1, device=hits.device, dtype=hits.dtype)
        precision_at_rank = hits.cumsum(dim=0) / ranks
        total_ap = total_ap + (precision_at_rank * hits).sum() / n_hits

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ap / valid_count



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """RankNet pairwise loss, summed over pairs and averaged over groups.

    For every ordered pair (i, j) of real items with ``label_i > label_j``
    the term ``-log(sigmoid(score_i - score_j))`` is added.
    """
    n_groups = scores.size(0)
    loss_sum = 0.0

    for g in range(n_groups):
        n_items = lengths[g]
        s = scores[g][:n_items]
        y = labels[g][:n_items]

        # prefer[i, j] == 1 exactly when item i should outrank item j
        prefer = (y[:, None] > y[None, :]).float()
        pair_log_prob = torch.nn.functional.logsigmoid(s[:, None] - s[None, :])

        loss_sum += (-prefer * pair_log_prob).sum()

    return loss_sum / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=10):
    """Weighted pairwise logistic loss focused on the top-labelled items.

    A pair (i, j) contributes when ``label_i > label_j`` and at least one of
    the two items is among the ``focus_topk`` highest labels of its group.
    Each pair is weighted by ``|label_i - label_j|``; the result is the
    weighted sum of ``-logsigmoid((s_i - s_j - margin) / temperature)``
    divided by the total weight over the batch (the margin is only
    subtracted when it is positive). Groups with fewer than two items are
    skipped. Returns a zero tensor that requires grad when no pair qualifies.
    """
    loss_sum = 0.0
    weight_sum = 0

    for g in range(scores.size(0)):
        n_items = lengths[g]
        if n_items < 2:
            continue

        s = scores[g][:n_items]
        y = labels[g][:n_items]

        # Flag the items holding the largest ground-truth labels.
        in_head = torch.zeros_like(y, dtype=torch.bool)
        in_head[torch.topk(y, min(focus_topk, n_items)).indices] = True

        label_gap = y[:, None] - y[None, :]
        touches_head = in_head[:, None] | in_head[None, :]
        eligible = (label_gap > 0) & touches_head
        if not eligible.any():
            continue

        weights = (label_gap.abs() * eligible).float()

        logits = s[:, None] - s[None, :]
        if margin > 0.0:
            logits = logits - margin
        logits = logits / temperature

        loss_sum += (-F.logsigmoid(logits) * weights).sum()
        weight_sum += weights.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    The scheduler is stepped after every batch, not once per epoch.
    NOTE(review): the model is called without a padding mask, so padded
    items take part in attention — confirm this is intended.
    """
    model.train()
    running_loss = 0.0

    progress = tqdm(loader, desc="Training", leave=False)
    for x_cat, x_num, y, lengths in progress:
        x_num = x_num.to(device)
        y = y.to(device)
        if not isinstance(lengths, torch.Tensor):
            lengths = torch.tensor(lengths)
        lengths = lengths.detach().clone().to(device)
        x_cat = {name: codes.to(device) for name, codes in x_cat.items()}

        optimizer.zero_grad()
        batch_scores = model(x_cat, x_num)
        loss = xranknet_loss(
            batch_scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=10
        )
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()

    return running_loss / len(loader)


def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    For every batch the loss, HitRate@10, NDCG@10 and MAP@10 are computed and
    accumulated weighted by the batch size (``scores.size(0)``).

    Args:
        model: Ranking model called as ``model(x_cat, x_num)``.
        loader: Iterable yielding ``(x_cat, x_num, y, lengths)`` batches, where
            ``x_cat`` is a dict of tensors.
        device: Device the batch tensors are moved to.
        min_group_size: Forwarded unchanged to the three metric functions.

    Returns:
        Tuple ``(loss, hitrate, ndcg, map)`` of batch-size-weighted means.

    Raises:
        ZeroDivisionError: If ``loader`` yields no batches.
    """
    model.eval()
    total_hitrate = 0
    total_ndcg = 0
    total_map = 0
    total_loss = 0
    count = 0

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):
            x_num, y = x_num.to(device), y.to(device)
            # Same conversion as train_one_epoch: calling torch.tensor() on
            # something that is already a tensor emits a UserWarning, so only
            # wrap non-tensor inputs.
            if not isinstance(lengths, torch.Tensor):
                lengths = torch.tensor(lengths)
            lengths = lengths.detach().clone().to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=10, min_group_size=min_group_size)
            ndcg = ndcg_at_k(scores, y, lengths, k=10, min_group_size=min_group_size)
            map10 = map_at_k(scores, y, lengths, k=10, min_group_size=min_group_size)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=10)

            # Weight each batch by its number of groups so a smaller final
            # batch does not count as much as a full one.
            batch_size = scores.size(0)
            total_hitrate += hitrate * batch_size
            total_ndcg += ndcg * batch_size
            total_map += map10 * batch_size
            total_loss += loss.item() * batch_size
            count += batch_size

    return total_loss / count, total_hitrate / count, total_ndcg / count, total_map / count


def build_cat_dims(X, cat_features):
    """Return ``{feature: max_code + 1}`` for every name in ``cat_features``.

    A column whose maximum is negative is treated as if its maximum were 0,
    so the reported size is never below 1.
    """

    def _size_of(column_name):
        highest = X[column_name].max()
        return (0 if highest < 0 else highest) + 1

    return {column_name: _size_of(column_name) for column_name in cat_features}


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardise numeric columns using statistics of the training rows only.

    For each column, mean and standard deviation are computed over the rows
    selected by ``train_mask`` and then applied to every row, so validation
    and test rows never influence the scaling.

    NaN values are excluded when computing the statistics. Without this, a
    single NaN among the training rows makes the mean NaN and turns the whole
    column into NaN. NaN entries themselves stay NaN in the output.

    Args:
        X: Frame holding the columns (indexed as ``X[col]`` and updated via
            ``with_columns``).
        numeric_features: Names of the columns to standardise.
        train_mask: Boolean mask over rows selecting the training part.

    Returns:
        The frame with each listed column replaced by its standardised values.
    """
    for col in numeric_features:
        values = X[col].to_numpy()
        train_vals = values[train_mask]

        # Use only observed (non-NaN) training values for the statistics.
        observed = train_vals[~np.isnan(train_vals)]
        if observed.size:
            mean = observed.mean()
            std = observed.std()
        else:
            # Nothing to estimate from: leave the column on its original scale.
            mean, std = 0.0, 1.0

        # Constant columns (std == 0) would divide by zero; keep them centred.
        if not std > 0:
            std = 1.0

        X = X.with_columns(pl.Series(col, (values - mean) / std))
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Return the feature-engineered frame plus the raw test ``Id`` / ``ranker_id`` columns.

    If ``cache_file`` already exists in ``data_dir`` it is loaded as-is.
    Otherwise train and test are read, concatenated (test gets a constant
    ``selected`` column of 0), passed through ``feature_engineering``,
    null-filled (0 for numeric columns, ``"missing"`` for string columns) and
    written to the cache path.

    In both cases the test identifiers come from the raw test file.

    Returns:
        ``(df, test_ids, test_rankers)``.
    """
    cache_path = os.path.join(data_dir, cache_file)

    def _read_raw(file_name):
        # Raw parquet files carry a pandas index column that is not a feature.
        return pl.read_parquet(os.path.join(data_dir, file_name)).drop("__index_level_0__")

    if os.path.exists(cache_path):
        print(f"Loading cached feature-engineered data from {cache_path} ...")
        cached = pl.read_parquet(cache_path)
        raw_test = _read_raw(test_file)
        return cached, raw_test["Id"], raw_test["ranker_id"]

    print("Cached file not found, processing raw data ...")

    raw_train = _read_raw(train_file)
    raw_test = _read_raw(test_file).with_columns(
        pl.lit(0, dtype=pl.Int64).alias("selected")
    )
    test_ids = raw_test["Id"]
    test_rankers = raw_test["ranker_id"]

    features = feature_engineering(pl.concat((raw_train, raw_test)), full=True)

    numeric_names = features.select(pl.selectors.numeric()).columns
    string_names = features.select(pl.selectors.string()).columns
    features = features.with_columns(
        [pl.col(name).fill_null(0) for name in numeric_names]
        + [pl.col(name).fill_null("missing") for name in string_names]
    )

    os.makedirs(data_dir, exist_ok=True)
    features.write_parquet(cache_path)
    print(f"Saved feature-engineered data to {cache_path}")

    return features, test_ids, test_rankers


def prepare_data(data_dir, train_size=18145372):
    """Load the engineered data and turn it into model-ready features.

    Steps: load (or build) the engineered frame, run ``feature_selection``,
    re-attach the ``Id`` column, replace every categorical column with
    zero-based dense-rank integer codes (nulls become -1), and standardise the
    numeric columns using only the first ``train_size`` rows.

    Args:
        data_dir: Directory passed to ``load_or_prepare_data``.
        train_size: Number of leading rows that belong to the training split.
            Defaults to the row count previously hard-coded here; rows from
            this index onward are excluded from the normalisation statistics.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size, test_ids,
        test_rankers)``.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    # Keep Id next to the features; it is truncated to X's height in case
    # feature_selection returned fewer rows than df.
    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    # Dense rank gives consecutive codes starting at 1; shift to start at 0.
    X = X.with_columns(
        [
            (pl.col(c).rank("dense") - 1).fill_null(-1).cast(pl.Int32)
            for c in cat_features
        ]
    )

    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Patience counter for a metric where larger values are better.

    ``step`` resets the counter whenever the metric beats the best value seen
    so far by more than ``min_delta``; otherwise the counter grows by one.
    """

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience, self.min_delta = patience, min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric`` and return True once patience is exhausted."""
        improved = metric > self.best + self.min_delta
        if improved:
            self.best = metric
        self.counter = 0 if improved else self.counter + 1
        return self.counter >= self.patience

def min_common_group_size_above(groups_df, threshold=10, name="Validation"):
    """Report the smallest group size among groups with more than ``threshold`` rows.

    Groups are the distinct values of the ``ranker_id`` column. A summary line
    is printed in both outcomes.

    Returns:
        The smallest qualifying group size, or None when no group exceeds
        ``threshold``.
    """
    per_group = groups_df["ranker_id"].value_counts()
    large_groups = per_group.filter(pl.col("count") > threshold).sort("count")

    if not large_groups.is_empty():
        # Sorted ascending, so the first entry is the minimum.
        min_size = large_groups["count"][0]
        print(f"✅ {name}: All groups larger than {threshold} have at least size {min_size}.")
        return min_size

    print(f"⚠️ {name}: No groups larger than {threshold}.")
    return None


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scale histogram of the group sizes in ``dataset``.

    Sizes are the lengths of the values of ``dataset.group_to_indices``. The
    figure is written to ``save_path`` when one is given, otherwise shown
    interactively; it is closed afterwards in both cases.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Distribution of Group Sizes")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def _collect_group_scores(model, loader, device, desc):
    """Score every group in ``loader`` and return the scores as one flat array.

    Each batch's score matrix is cut back to the per-group lengths reported
    by the loader before concatenation.
    """
    per_group = []
    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(loader, desc=desc):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)

            for i in range(scores.size(0)):
                per_group.append(scores[i, :lengths[i]].cpu().numpy())

    return np.concatenate(per_group)


def _save_metric_curve(epochs, curves, ylabel, title, save_path):
    """Plot per-epoch series on one figure and write it to ``save_path``.

    ``curves`` is an iterable of ``(values, label, color)`` tuples; a color of
    None keeps matplotlib's default color.
    """
    plt.figure()
    for values, label, color in curves:
        if color is None:
            plt.plot(epochs, values, label=label)
        else:
            plt.plot(epochs, values, label=label, color=color)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.savefig(save_path)
    plt.close()


def main():
    """Train the attention ranker, then export top-10 predictions.

    Pipeline: prepare data, split rows into train / validation / test by fixed
    row boundaries, train with early stopping on validation loss while
    checkpointing the best model, save metric plots, reload the checkpoint and
    write validation top-10 rows, the submission file and test top-10 rows.
    """
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    best_val_loss = float("inf")
    best_hitrate = 0.0
    model_path = os.path.join(MODEL_DIR, "best_dl_ranker_top10.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_losses = []
    val_losses = []
    val_hitrates = []
    val_ndcgs = []
    val_maps = []
    learning_rates = []

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    early_stopper = EarlyStopping(patience=3, min_delta=0.001)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Fill missing values
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    for col in float_cols:
        X = X.with_columns(
            pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
        )

    # Row boundaries of the splits: [0, n1) train, [n1, n2) validation,
    # [n2, end) test.
    n1 = 16487352
    # n1 = 17487300
    n2 = train_size

    X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
    y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
    groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

    min_common_group_size_above(groups_va, threshold=10, name="Validation")
    min_common_group_size_above(groups_te, threshold=10, name="Test")

    # Inspect group sizes in validation set
    val_rankers = groups_va["ranker_id"].to_numpy()
    val_group_counts = Counter(val_rankers)

    print("\n📋 Validation group statistics:")
    # Save detailed group info to file
    group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
    with open(group_info_path, "w") as f:
        f.write(f"Total groups: {len(val_group_counts)}\n\n")
        f.write("GroupID\tSize\tStartIdx\tEndIdx\n")

        group_to_rows = defaultdict(list)
        for i, g in enumerate(val_rankers):
            group_to_rows[g].append(i)

        for group_id, indices in sorted(group_to_rows.items()):
            start_idx = indices[0]
            end_idx = indices[-1]
            size = len(indices)
            f.write(f"{group_id}\t{size}\t{start_idx}\t{end_idx}\n")

    print(f"📁 Saved validation group info to {group_info_path}")

    # Category sizes are taken over the full frame so validation/test codes
    # are covered as well.
    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    num_epochs = 10
    learning_rate = 3e-4

    model = DLRankerAttention(
        cat_dims=cat_dims,
        num_numeric_feats=num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    # The scheduler is stepped once per batch inside train_one_epoch, hence
    # steps_per_epoch equals the number of training batches.
    scheduler = OneCycleLR(
        optimizer,
        max_lr=learning_rate,
        steps_per_epoch=len(train_loader),
        epochs=num_epochs,
        pct_start=0.3,
    )

    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
        val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_hitrates.append(val_hitrate.item())
        val_ndcgs.append(val_ndcg.item())
        val_maps.append(val_map.item())

        current_lr = scheduler.get_last_lr()
        learning_rates.append(current_lr[0])

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"Val Loss={val_loss:.4f}, "
            f"HitRate@10={val_hitrate:.4f}, "
            f"NDCG@10={val_ndcg:.4f}, "
            f"MAP@10={val_map:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )

        # Checkpoint only when BOTH criteria improve on the last saved model.
        if val_loss < best_val_loss and val_hitrate > best_hitrate:
            best_val_loss = val_loss
            best_hitrate = val_hitrate
            torch.save(model.state_dict(), model_path)
            print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

        # EarlyStopping maximises its metric, so feed it the negated loss.
        if early_stopper.step(-val_loss):
            print("⛔ Early stopping triggered.")
            break

    # Plot metrics
    epochs = list(range(1, len(train_losses) + 1))

    _save_metric_curve(
        epochs,
        [(train_losses, "Train Loss", None), (val_losses, "Val Loss", None)],
        "Loss",
        "Training & Validation Loss",
        os.path.join(MODEL_DIR, "loss_curve.png"),
    )
    _save_metric_curve(
        epochs,
        [(val_hitrates, "Val HitRate@10", "green")],
        "HitRate@10",
        "Validation HitRate@10",
        os.path.join(MODEL_DIR, "hitrate_curve.png"),
    )
    _save_metric_curve(
        epochs,
        [(learning_rates, "Learning Rate", "orange")],
        "LR",
        "Learning Rate Schedule",
        os.path.join(MODEL_DIR, "lr_curve.png"),
    )
    _save_metric_curve(
        epochs,
        [(val_ndcgs, "Val NDCG@10", "purple")],
        "NDCG@10",
        "Validation NDCG@10",
        os.path.join(MODEL_DIR, "ndcg_curve.png"),
    )
    _save_metric_curve(
        epochs,
        [(val_maps, "Val MAP@10", "red")],
        "MAP@10",
        "Validation MAP@10",
        os.path.join(MODEL_DIR, "map_curve.png"),
    )

    print("📊 Saved metric plots to model/")

    # map_location keeps the reload working when the checkpoint was written
    # on a different device than the one available now.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    val_ids = X_va["Id"]
    val_rankers = groups_va["ranker_id"]
    val_labels = y_va

    # NOTE(review): scores are attached to rows by position, which assumes the
    # loader yields groups in the same row order as X_va — confirm against
    # RankDataset / collate_fn.
    all_val_scores = _collect_group_scores(model, val_loader, device, "Predicting validation set")

    assert len(all_val_scores) == X_va.shape[0], \
        f"Mismatch: {len(all_val_scores)} scores vs {X_va.shape[0]} rows"

    val_df = X_va.with_columns([
        pl.Series("Id", val_ids),
        pl.Series("ranker_id", val_rankers),
        pl.Series("score", all_val_scores),
        pl.Series("label", val_labels)
    ])

    val_df = val_df.with_columns(
        pl.col("score")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("rank")
    )

    val_top10_df = (
        val_df
        .filter(pl.col("rank") <= 10)
        .sort(["ranker_id", "rank"])
    )

    val_top10_path = os.path.join(MODEL_DIR, "validation_top10_dl_full.parquet")
    val_top10_df.write_parquet(val_top10_path)
    print(f"✅ Saved top-10 validation predictions (with features) to {val_top10_path}")

    original_val_cols = set(X_va.columns)
    val_top10_cols = set(val_top10_df.columns)
    extra_val_cols = val_top10_cols - original_val_cols

    print(f"📌 Extra columns in validation_top10_dl_full.parquet vs original validation set: {sorted(extra_val_cols)}")

    # NOTE(review): same positional assumption as for validation, and
    # test_ids / test_rankers come from the raw test file — confirm they are
    # in the same row order as X_te.
    all_scores = _collect_group_scores(model, test_loader, device, "Predicting test set")

    assert len(all_scores) == X_te.shape[0], \
        f"Mismatch: {len(all_scores)} scores vs {X_te.shape[0]} rows in X_te"

    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": all_scores
    })

    # "selected" holds the within-group rank (1 = highest score).
    submission = (
        submission_df
        .with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )
        .select(["Id", "ranker_id", "score", "selected"])
    )

    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.parquet")
    submission.write_parquet(submission_path)
    print(f"✅ Submission saved to {submission_path}")

    submission_full = (
        X_te.with_columns([
            pl.Series("Id", test_ids),
            pl.Series("ranker_id", test_rankers),
            pl.Series("score", all_scores)
        ])
        .with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("rank")
        )
    )

    submission_top10_full = (
        submission_full
        .filter(pl.col("rank") <= 10)
        .sort(["ranker_id", "rank"])
    )

    submission_top10_path = os.path.join(SUBMIT_DIR, "submission_top10_dl_full.parquet")
    submission_top10_full.write_parquet(submission_top10_path)
    print(f"✅ Saved top-10 test predictions (with features) to {submission_top10_path}")

    original_test_cols = set(X_te.columns)
    test_top10_cols = set(submission_top10_full.columns)
    extra_test_cols = test_top10_cols - original_test_cols

    print(f"📌 Extra columns in submission_top10_dl_full.parquet vs original test set: {sorted(extra_test_cols)}")

# Script entry point: run the full pipeline only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()

Overwriting deeprec_validate_top10.py


In [None]:
--- Epoch 1/10 ---
Validating:   0%|          | 0/2636 [00:00<?, ?it/s]/home/ionut/test_whisper/test2/deeprec_validate_top10.py:539: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  lengths = torch.tensor(lengths).to(device)
Epoch 1: Train Loss=0.3530, Val Loss=0.2840, HitRate@10=0.7228, NDCG@10=0.4645, MAP@10=0.5119, LR=0.000084 (608.1s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 2/10 ---
Epoch 2: Train Loss=0.2876, Val Loss=0.2888, HitRate@10=0.7306, NDCG@10=0.4760, MAP@10=0.5265, LR=0.000228 (606.2s)

--- Epoch 3/10 ---
Epoch 3: Train Loss=0.2806, Val Loss=0.2776, HitRate@10=0.7306, NDCG@10=0.4809, MAP@10=0.5344, LR=0.000300 (606.5s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 4/10 ---
Epoch 4: Train Loss=0.2670, Val Loss=0.2750, HitRate@10=0.7381, NDCG@10=0.4881, MAP@10=0.5402, LR=0.000285 (607.2s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 5/10 ---
Epoch 5: Train Loss=0.2554, Val Loss=0.2603, HitRate@10=0.7432, NDCG@10=0.4930, MAP@10=0.5432, LR=0.000244 (606.6s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 6/10 ---
Epoch 6: Train Loss=0.2444, Val Loss=0.2537, HitRate@10=0.7515, NDCG@10=0.4997, MAP@10=0.5447, LR=0.000183 (609.6s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 7/10 ---
Epoch 7: Train Loss=0.2324, Val Loss=0.2449, HitRate@10=0.7517, NDCG@10=0.5001, MAP@10=0.5457, LR=0.000117 (607.2s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 8/10 ---
Epoch 8: Train Loss=0.2214, Val Loss=0.2457, HitRate@10=0.7554, NDCG@10=0.5059, MAP@10=0.5525, LR=0.000056 (607.6s)

--- Epoch 9/10 ---
Epoch 9: Train Loss=0.2115, Val Loss=0.2382, HitRate@10=0.7572, NDCG@10=0.5082, MAP@10=0.5544, LR=0.000015 (606.2s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 10/10 ---
Epoch 10: Train Loss=0.2058, Val Loss=0.2383, HitRate@10=0.7563, NDCG@10=0.5056, MAP@10=0.5523, LR=0.000000 (607.5s)

**Top10 -> Top3**

In [None]:
📌 Validation set:
  X_va shape: (1658020, 153)
  X_va columns (153): ['corporateTariffCode', 'nationality', 'isAccess3D', 'isVip', 'legs0_segments0_cabinClass', 'legs0_segments0_arrivalTo_airport_iata', 'legs0_segments0_departureFrom_airport_iata', 'legs0_segments0_baggageAllowance_weightMeasurementType', 'legs0_segments0_marketingCarrier_code', 'legs0_segments0_operatingCarrier_code', 'legs1_segments0_cabinClass', 'legs1_segments0_arrivalTo_airport_iata', 'legs1_segments0_departureFrom_airport_iata', 'legs1_segments0_baggageAllowance_weightMeasurementType', 'legs1_segments0_marketingCarrier_code', 'legs1_segments0_operatingCarrier_code', 'pricingInfo_isAccessTP', 'miniRules0_statusInfos', 'miniRules1_statusInfos', 'sex', 'has_ff_program', 'has_corporate_tariff', 'corporate_policy_compliant', 'corporate_vip_flag', 'free_cancel', 'free_exchange', 'is_popular_route', 'contains_capitials', 'is_codeshare_leg0_seg0', 'is_codeshare_leg1_seg0', 'fee_ratio_rule0_is_missing', 'fee_ratio_rule1_is_missing', 'is_vip_freq', 'is_direct_leg0', 'is_direct_leg1', 'is_min_segments_leg0', 'is_min_segments_leg1', 'is_direct_shortest', 'legs0_departureAt_hour', 'legs0_arrivalAt_hour', 'legs1_departureAt_hour', 'legs1_arrivalAt_hour', 'legs0_departureAt_weekday', 'legs0_arrivalAt_weekday', 'legs1_departureAt_weekday', 'legs1_arrivalAt_weekday', 'legs0_departureAt_time_bin', 'legs0_arrivalAt_time_bin', 'legs1_departureAt_time_bin', 'legs1_arrivalAt_time_bin', 'legs0_dep_arr_bin_combo', 'legs0_is_business_friendly', 'legs0_is_red_eye', 'legs1_dep_arr_bin_combo', 'legs1_is_business_friendly', 'legs1_is_red_eye', 'is_short_trip', 'legs0_is_short_flight', 'legs1_is_short_flight', 'is_top3_cheapest', 'is_cheaper_than_avg', 'is_direct_cheapest', 'has_first_class', 'max_cabin_level', 'all_cabin_level_1', 'legs0_segments0_marketingCarrier_code_in_frequentFlyer', 'legs0_segments0_marketingCarrier_code_is_only_frequentFlyer', 'is_major_carrier_0_0', 'legs0_is_cross_country', 'legs0_is_cross_timezone', 'legs1_is_cross_country', 
'legs1_is_cross_timezone', 'has_any_label', 'label_BestPrice', 'label_BestPriceTravelPolicy', 'label_BestPriceDirect', 'label_Convenience', 'label_MinTime', 'label_BestPriceCorporateTariff', 'outbound_route', 'return_route', 'is_exact_round_trip', 'legs0_segments0_baggageAllowance_quantity', 'legs1_segments0_baggageAllowance_quantity', 'legs0_duration', 'legs0_segments0_duration', 'legs1_duration', 'legs1_segments0_duration', 'tax_rate', 'log_taxes', 'log_price', 'duration_ratio', 'n_ff_programs', 'fee_ratio_rule0', 'fee_ratio_rule1', 'group_size_log', 'price_duration_rat', 'n_segments_leg0', 'n_segments_leg1', 'stay_duration_hours_log', 'hours_to_departure', 'price_quantile_rank', 'duration_quantile_rank', 'rank_interaction_mul', 'rank_interaction_sub', 'price_zscore_from_median', 'price_relative_to_min', 'company_ocurrence', 'company_avg_selected_price', 'company_std_selected_price', 'company_legs0_direct_ratio', 'company_legs1_direct_ratio', 'company_avg_pct', 'company_avg_dct', 'company_policy_rate', 'company_tariff_selected_count', 'company_tariff_selected_ratio', 'legs0_Carrier_code_cabin1_select_ratio', 'legs0_Carrier_code_cabin2_select_ratio', 'legs0_Carrier_code_avg_price_rank', 'legs0_Carrier_code_avg_duration_rank', 'legs0_Carrier_code_company_diversity', 'legs0_Carrier_code_user_diversity', 'legs1_Carrier_code_cabin1_select_ratio', 'legs1_Carrier_code_cabin2_select_ratio', 'legs1_Carrier_code_avg_price_rank', 'legs1_Carrier_code_avg_duration_rank', 'legs1_Carrier_code_company_diversity', 'legs1_Carrier_code_user_diversity', 'legs0_Carrier_code_log_selected_count', 'legs1_Carrier_code_log_selected_count', 'carrier_pop_prod', 'user_selected_count', 'corporateTariffCode_hotness', 'group_price_mean', 'group_price_std', 'group_price_range', 'group_dur_mean', 'group_dur_std', 'group_dur_range', 'group_n_leg_combos', 'group_option_price_rank_ratio', 'outbound_route_hotness', 'return_route_hotness', 'outbound_route_avg_price', 'outbound_route_avg_duration', 
'outbound_route_policy_rate', 'outbound_route_direct_ratio', 'return_route_avg_price', 'return_route_avg_duration', 'return_route_policy_rate', 'return_route_direct_ratio', 'Id']
  y_va shape: (1658020, 1), name: 'selected'
  groups_va shape: (1658020, 1), columns: ['ranker_id']
  First 3 y_va: shape: (3, 1)
┌──────────┐
│ selected │
│ ---      │
│ i64      │
╞══════════╡
│ 0        │
│ 0        │
│ 0        │
└──────────┘
  First 3 groups_va:
shape: (3, 1)
┌─────────────────────────────────┐
│ ranker_id                       │
│ ---                             │
│ str                             │
╞═════════════════════════════════╡
│ 0ab8aa6bd4b344efb94bdf4b814f10… │
│ 0ab8aa6bd4b344efb94bdf4b814f10… │
│ 0ab8aa6bd4b344efb94bdf4b814f10… │
└─────────────────────────────────┘

📌 Test set:
  X_te shape: (6897776, 153)
  X_te columns (153): ['corporateTariffCode', 'nationality', 'isAccess3D', 'isVip', 'legs0_segments0_cabinClass', 'legs0_segments0_arrivalTo_airport_iata', 'legs0_segments0_departureFrom_airport_iata', 'legs0_segments0_baggageAllowance_weightMeasurementType', 'legs0_segments0_marketingCarrier_code', 'legs0_segments0_operatingCarrier_code', 'legs1_segments0_cabinClass', 'legs1_segments0_arrivalTo_airport_iata', 'legs1_segments0_departureFrom_airport_iata', 'legs1_segments0_baggageAllowance_weightMeasurementType', 'legs1_segments0_marketingCarrier_code', 'legs1_segments0_operatingCarrier_code', 'pricingInfo_isAccessTP', 'miniRules0_statusInfos', 'miniRules1_statusInfos', 'sex', 'has_ff_program', 'has_corporate_tariff', 'corporate_policy_compliant', 'corporate_vip_flag', 'free_cancel', 'free_exchange', 'is_popular_route', 'contains_capitials', 'is_codeshare_leg0_seg0', 'is_codeshare_leg1_seg0', 'fee_ratio_rule0_is_missing', 'fee_ratio_rule1_is_missing', 'is_vip_freq', 'is_direct_leg0', 'is_direct_leg1', 'is_min_segments_leg0', 'is_min_segments_leg1', 'is_direct_shortest', 'legs0_departureAt_hour', 'legs0_arrivalAt_hour', 'legs1_departureAt_hour', 'legs1_arrivalAt_hour', 'legs0_departureAt_weekday', 'legs0_arrivalAt_weekday', 'legs1_departureAt_weekday', 'legs1_arrivalAt_weekday', 'legs0_departureAt_time_bin', 'legs0_arrivalAt_time_bin', 'legs1_departureAt_time_bin', 'legs1_arrivalAt_time_bin', 'legs0_dep_arr_bin_combo', 'legs0_is_business_friendly', 'legs0_is_red_eye', 'legs1_dep_arr_bin_combo', 'legs1_is_business_friendly', 'legs1_is_red_eye', 'is_short_trip', 'legs0_is_short_flight', 'legs1_is_short_flight', 'is_top3_cheapest', 'is_cheaper_than_avg', 'is_direct_cheapest', 'has_first_class', 'max_cabin_level', 'all_cabin_level_1', 'legs0_segments0_marketingCarrier_code_in_frequentFlyer', 'legs0_segments0_marketingCarrier_code_is_only_frequentFlyer', 'is_major_carrier_0_0', 'legs0_is_cross_country', 'legs0_is_cross_timezone', 'legs1_is_cross_country', 
'legs1_is_cross_timezone', 'has_any_label', 'label_BestPrice', 'label_BestPriceTravelPolicy', 'label_BestPriceDirect', 'label_Convenience', 'label_MinTime', 'label_BestPriceCorporateTariff', 'outbound_route', 'return_route', 'is_exact_round_trip', 'legs0_segments0_baggageAllowance_quantity', 'legs1_segments0_baggageAllowance_quantity', 'legs0_duration', 'legs0_segments0_duration', 'legs1_duration', 'legs1_segments0_duration', 'tax_rate', 'log_taxes', 'log_price', 'duration_ratio', 'n_ff_programs', 'fee_ratio_rule0', 'fee_ratio_rule1', 'group_size_log', 'price_duration_rat', 'n_segments_leg0', 'n_segments_leg1', 'stay_duration_hours_log', 'hours_to_departure', 'price_quantile_rank', 'duration_quantile_rank', 'rank_interaction_mul', 'rank_interaction_sub', 'price_zscore_from_median', 'price_relative_to_min', 'company_ocurrence', 'company_avg_selected_price', 'company_std_selected_price', 'company_legs0_direct_ratio', 'company_legs1_direct_ratio', 'company_avg_pct', 'company_avg_dct', 'company_policy_rate', 'company_tariff_selected_count', 'company_tariff_selected_ratio', 'legs0_Carrier_code_cabin1_select_ratio', 'legs0_Carrier_code_cabin2_select_ratio', 'legs0_Carrier_code_avg_price_rank', 'legs0_Carrier_code_avg_duration_rank', 'legs0_Carrier_code_company_diversity', 'legs0_Carrier_code_user_diversity', 'legs1_Carrier_code_cabin1_select_ratio', 'legs1_Carrier_code_cabin2_select_ratio', 'legs1_Carrier_code_avg_price_rank', 'legs1_Carrier_code_avg_duration_rank', 'legs1_Carrier_code_company_diversity', 'legs1_Carrier_code_user_diversity', 'legs0_Carrier_code_log_selected_count', 'legs1_Carrier_code_log_selected_count', 'carrier_pop_prod', 'user_selected_count', 'corporateTariffCode_hotness', 'group_price_mean', 'group_price_std', 'group_price_range', 'group_dur_mean', 'group_dur_std', 'group_dur_range', 'group_n_leg_combos', 'group_option_price_rank_ratio', 'outbound_route_hotness', 'return_route_hotness', 'outbound_route_avg_price', 'outbound_route_avg_duration', 
'outbound_route_policy_rate', 'outbound_route_direct_ratio', 'return_route_avg_price', 'return_route_avg_duration', 'return_route_policy_rate', 'return_route_direct_ratio', 'Id']
  y_te shape: (6897776, 1), name: 'selected'
  groups_te shape: (6897776, 1), columns: ['ranker_id']
  First 3 y_te: shape: (3, 1)
┌──────────┐
│ selected │
│ ---      │
│ i64      │
╞══════════╡
│ 0        │
│ 0        │
│ 0        │
└──────────┘
  First 3 groups_te:
shape: (3, 1)
┌─────────────────────────────────┐
│ ranker_id                       │
│ ---                             │
│ str                             │
╞═════════════════════════════════╡
│ c9373e5f772e43d593dd6ad2fa90f6… │
│ c9373e5f772e43d593dd6ad2fa90f6… │
│ c9373e5f772e43d593dd6ad2fa90f6… │
└─────────────────────────────────┘

In [None]:
import polars as pl

# Sanity check: load both exported top-10 files and print their shape and
# column names.
validation_path = "./data/validation_top10_dl_full.parquet"
submission_path = "./data/submission_top10_dl_full.parquet"

# Read both files up front so a missing file fails before anything is printed.
val_df, sub_df = (pl.read_parquet(path) for path in (validation_path, submission_path))

for header, frame in (("📋 Validation set:", val_df), ("\n📋 Submission set:", sub_df)):
    print(header)
    print(f"  shape: {frame.shape}")
    print(f"  columns: {frame.columns}")

Writing check.py


In [None]:
📋 Validation set:
  shape: (97134, 157)
  columns: ['corporateTariffCode', 'nationality', 'isAccess3D', 'isVip', 'legs0_segments0_cabinClass', 'legs0_segments0_arrivalTo_airport_iata', 'legs0_segments0_departureFrom_airport_iata', 'legs0_segments0_baggageAllowance_weightMeasurementType', 'legs0_segments0_marketingCarrier_code', 'legs0_segments0_operatingCarrier_code', 'legs1_segments0_cabinClass', 'legs1_segments0_arrivalTo_airport_iata', 'legs1_segments0_departureFrom_airport_iata', 'legs1_segments0_baggageAllowance_weightMeasurementType', 'legs1_segments0_marketingCarrier_code', 'legs1_segments0_operatingCarrier_code', 'pricingInfo_isAccessTP', 'miniRules0_statusInfos', 'miniRules1_statusInfos', 'sex', 'has_ff_program', 'has_corporate_tariff', 'corporate_policy_compliant', 'corporate_vip_flag', 'free_cancel', 'free_exchange', 'is_popular_route', 'contains_capitials', 'is_codeshare_leg0_seg0', 'is_codeshare_leg1_seg0', 'fee_ratio_rule0_is_missing', 'fee_ratio_rule1_is_missing', 'is_vip_freq', 'is_direct_leg0', 'is_direct_leg1', 'is_min_segments_leg0', 'is_min_segments_leg1', 'is_direct_shortest', 'legs0_departureAt_hour', 'legs0_arrivalAt_hour', 'legs1_departureAt_hour', 'legs1_arrivalAt_hour', 'legs0_departureAt_weekday', 'legs0_arrivalAt_weekday', 'legs1_departureAt_weekday', 'legs1_arrivalAt_weekday', 'legs0_departureAt_time_bin', 'legs0_arrivalAt_time_bin', 'legs1_departureAt_time_bin', 'legs1_arrivalAt_time_bin', 'legs0_dep_arr_bin_combo', 'legs0_is_business_friendly', 'legs0_is_red_eye', 'legs1_dep_arr_bin_combo', 'legs1_is_business_friendly', 'legs1_is_red_eye', 'is_short_trip', 'legs0_is_short_flight', 'legs1_is_short_flight', 'is_top3_cheapest', 'is_cheaper_than_avg', 'is_direct_cheapest', 'has_first_class', 'max_cabin_level', 'all_cabin_level_1', 'legs0_segments0_marketingCarrier_code_in_frequentFlyer', 'legs0_segments0_marketingCarrier_code_is_only_frequentFlyer', 'is_major_carrier_0_0', 'legs0_is_cross_country', 'legs0_is_cross_timezone', 'legs1_is_cross_country', 'legs1_is_cross_timezone', 
'has_any_label', 'label_BestPrice', 'label_BestPriceTravelPolicy', 'label_BestPriceDirect', 'label_Convenience', 'label_MinTime', 'label_BestPriceCorporateTariff', 'outbound_route', 'return_route', 'is_exact_round_trip', 'legs0_segments0_baggageAllowance_quantity', 'legs1_segments0_baggageAllowance_quantity', 'legs0_duration', 'legs0_segments0_duration', 'legs1_duration', 'legs1_segments0_duration', 'tax_rate', 'log_taxes', 'log_price', 'duration_ratio', 'n_ff_programs', 'fee_ratio_rule0', 'fee_ratio_rule1', 'group_size_log', 'price_duration_rat', 'n_segments_leg0', 'n_segments_leg1', 'stay_duration_hours_log', 'hours_to_departure', 'price_quantile_rank', 'duration_quantile_rank', 'rank_interaction_mul', 'rank_interaction_sub', 'price_zscore_from_median', 'price_relative_to_min', 'company_ocurrence', 'company_avg_selected_price', 'company_std_selected_price', 'company_legs0_direct_ratio', 'company_legs1_direct_ratio', 'company_avg_pct', 'company_avg_dct', 'company_policy_rate', 'company_tariff_selected_count', 'company_tariff_selected_ratio', 'legs0_Carrier_code_cabin1_select_ratio', 'legs0_Carrier_code_cabin2_select_ratio', 'legs0_Carrier_code_avg_price_rank', 'legs0_Carrier_code_avg_duration_rank', 'legs0_Carrier_code_company_diversity', 'legs0_Carrier_code_user_diversity', 'legs1_Carrier_code_cabin1_select_ratio', 'legs1_Carrier_code_cabin2_select_ratio', 'legs1_Carrier_code_avg_price_rank', 'legs1_Carrier_code_avg_duration_rank', 'legs1_Carrier_code_company_diversity', 'legs1_Carrier_code_user_diversity', 'legs0_Carrier_code_log_selected_count', 'legs1_Carrier_code_log_selected_count', 'carrier_pop_prod', 'user_selected_count', 'corporateTariffCode_hotness', 'group_price_mean', 'group_price_std', 'group_price_range', 'group_dur_mean', 'group_dur_std', 'group_dur_range', 'group_n_leg_combos', 'group_option_price_rank_ratio', 'outbound_route_hotness', 'return_route_hotness', 'outbound_route_avg_price', 'outbound_route_avg_duration', 'outbound_route_policy_rate', 
'outbound_route_direct_ratio', 'return_route_avg_price', 'return_route_avg_duration', 'return_route_policy_rate', 'return_route_direct_ratio', 'Id', 'ranker_id', 'score', 'label', 'rank']

📋 Submission set:
  shape: (410205, 156)
  columns: ['corporateTariffCode', 'nationality', 'isAccess3D', 'isVip', 'legs0_segments0_cabinClass', 'legs0_segments0_arrivalTo_airport_iata', 'legs0_segments0_departureFrom_airport_iata', 'legs0_segments0_baggageAllowance_weightMeasurementType', 'legs0_segments0_marketingCarrier_code', 'legs0_segments0_operatingCarrier_code', 'legs1_segments0_cabinClass', 'legs1_segments0_arrivalTo_airport_iata', 'legs1_segments0_departureFrom_airport_iata', 'legs1_segments0_baggageAllowance_weightMeasurementType', 'legs1_segments0_marketingCarrier_code', 'legs1_segments0_operatingCarrier_code', 'pricingInfo_isAccessTP', 'miniRules0_statusInfos', 'miniRules1_statusInfos', 'sex', 'has_ff_program', 'has_corporate_tariff', 'corporate_policy_compliant', 'corporate_vip_flag', 'free_cancel', 'free_exchange', 'is_popular_route', 'contains_capitials', 'is_codeshare_leg0_seg0', 'is_codeshare_leg1_seg0', 'fee_ratio_rule0_is_missing', 'fee_ratio_rule1_is_missing', 'is_vip_freq', 'is_direct_leg0', 'is_direct_leg1', 'is_min_segments_leg0', 'is_min_segments_leg1', 'is_direct_shortest', 'legs0_departureAt_hour', 'legs0_arrivalAt_hour', 'legs1_departureAt_hour', 'legs1_arrivalAt_hour', 'legs0_departureAt_weekday', 'legs0_arrivalAt_weekday', 'legs1_departureAt_weekday', 'legs1_arrivalAt_weekday', 'legs0_departureAt_time_bin', 'legs0_arrivalAt_time_bin', 'legs1_departureAt_time_bin', 'legs1_arrivalAt_time_bin', 'legs0_dep_arr_bin_combo', 'legs0_is_business_friendly', 'legs0_is_red_eye', 'legs1_dep_arr_bin_combo', 'legs1_is_business_friendly', 'legs1_is_red_eye', 'is_short_trip', 'legs0_is_short_flight', 'legs1_is_short_flight', 'is_top3_cheapest', 'is_cheaper_than_avg', 'is_direct_cheapest', 'has_first_class', 'max_cabin_level', 'all_cabin_level_1', 'legs0_segments0_marketingCarrier_code_in_frequentFlyer', 'legs0_segments0_marketingCarrier_code_is_only_frequentFlyer', 'is_major_carrier_0_0', 'legs0_is_cross_country', 'legs0_is_cross_timezone', 'legs1_is_cross_country', 'legs1_is_cross_timezone', 
'has_any_label', 'label_BestPrice', 'label_BestPriceTravelPolicy', 'label_BestPriceDirect', 'label_Convenience', 'label_MinTime', 'label_BestPriceCorporateTariff', 'outbound_route', 'return_route', 'is_exact_round_trip', 'legs0_segments0_baggageAllowance_quantity', 'legs1_segments0_baggageAllowance_quantity', 'legs0_duration', 'legs0_segments0_duration', 'legs1_duration', 'legs1_segments0_duration', 'tax_rate', 'log_taxes', 'log_price', 'duration_ratio', 'n_ff_programs', 'fee_ratio_rule0', 'fee_ratio_rule1', 'group_size_log', 'price_duration_rat', 'n_segments_leg0', 'n_segments_leg1', 'stay_duration_hours_log', 'hours_to_departure', 'price_quantile_rank', 'duration_quantile_rank', 'rank_interaction_mul', 'rank_interaction_sub', 'price_zscore_from_median', 'price_relative_to_min', 'company_ocurrence', 'company_avg_selected_price', 'company_std_selected_price', 'company_legs0_direct_ratio', 'company_legs1_direct_ratio', 'company_avg_pct', 'company_avg_dct', 'company_policy_rate', 'company_tariff_selected_count', 'company_tariff_selected_ratio', 'legs0_Carrier_code_cabin1_select_ratio', 'legs0_Carrier_code_cabin2_select_ratio', 'legs0_Carrier_code_avg_price_rank', 'legs0_Carrier_code_avg_duration_rank', 'legs0_Carrier_code_company_diversity', 'legs0_Carrier_code_user_diversity', 'legs1_Carrier_code_cabin1_select_ratio', 'legs1_Carrier_code_cabin2_select_ratio', 'legs1_Carrier_code_avg_price_rank', 'legs1_Carrier_code_avg_duration_rank', 'legs1_Carrier_code_company_diversity', 'legs1_Carrier_code_user_diversity', 'legs0_Carrier_code_log_selected_count', 'legs1_Carrier_code_log_selected_count', 'carrier_pop_prod', 'user_selected_count', 'corporateTariffCode_hotness', 'group_price_mean', 'group_price_std', 'group_price_range', 'group_dur_mean', 'group_dur_std', 'group_dur_range', 'group_n_leg_combos', 'group_option_price_rank_ratio', 'outbound_route_hotness', 'return_route_hotness', 'outbound_route_avg_price', 'outbound_route_avg_duration', 'outbound_route_policy_rate', 
'outbound_route_direct_ratio', 'return_route_avg_price', 'return_route_avg_duration', 'return_route_policy_rate', 'return_route_direct_ratio', 'Id', 'ranker_id', 'score', 'rank']

In [None]:
%%writefile deeprec_validate_top3.py
import os
import time
import random
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter
from torch.optim.lr_scheduler import OneCycleLR

# Seed every random number generator the script relies on (Python, NumPy,
# PyTorch CPU and, when present, all CUDA devices) with the same value.
seed = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

class RankDataset(Dataset):
    """Dataset that yields one ranking group (all rows of one ``ranker_id``) per item.

    The feature frame is converted to NumPy once in ``__init__``; each item is
    then assembled by fancy-indexing those arrays with the row positions that
    belong to the requested group.
    """

    def __init__(
        self,
        X: "pl.DataFrame",
        y: "pl.Series",
        groups: "pl.DataFrame",
        cat_features: list[str],
        num_features: list[str],
    ):
        """Cache the feature arrays and build the group -> row-index lookup.

        Args:
            X: Frame holding every column named in ``cat_features`` and
                ``num_features``.
            y: Labels aligned row-by-row with ``X`` (anything exposing
                ``to_numpy()``; the result is flattened per group later).
            groups: Frame with a ``ranker_id`` column aligned with ``X``.
            cat_features: Names of the integer-coded categorical columns.
            num_features: Names of the numeric columns.
        """
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # Groups are kept in order of first appearance; row positions inside a
        # group keep their original order.
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of distinct groups."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        Returns:
            x_cat: dict mapping each categorical feature to a ``[length]``
                int64 tensor; negative codes are replaced by 0.
            x_num: ``[length, len(num_features)]`` float32 tensor.
            y: ``[length]`` float32 tensor of labels.
            length: number of rows in the group.
        """
        inds = self.group_to_indices[self.group_keys[idx]]

        x_cat = {}
        for c in self.cat_features:
            codes = self.X_cat[c][inds]
            # Vectorized clamp: any negative code collapses to index 0
            # instead of being checked element by element in Python.
            x_cat[c] = torch.as_tensor(np.where(codes >= 0, codes, 0), dtype=torch.long)

        x_num = torch.as_tensor(self.X_num[inds], dtype=torch.float32)
        y = torch.as_tensor(self.y[inds], dtype=torch.float32).reshape(-1)

        return x_cat, x_num, y, len(inds)

def collate_fn(batch):
    """Zero-pad a list of ``(x_cat, x_num, y, length)`` groups to a common length.

    Every group is right-padded with zeros up to the longest group in the
    batch and the results are stacked along a new leading batch axis.

    Returns:
        ``(x_cat, x_num, y, lengths)`` where ``x_cat`` is a dict of stacked
        categorical tensors, ``x_num`` and ``y`` are the stacked numeric
        features and labels, and ``lengths`` is an int64 tensor holding the
        unpadded size of each group.
    """
    lengths = [item[3] for item in batch]
    longest = max(lengths)

    feature_dim = batch[0][1].shape[1]
    cat_columns = {name: [] for name in batch[0][0].keys()}
    num_rows = []
    label_rows = []

    for cats, nums, target, size in batch:
        missing = longest - size
        needs_padding = missing > 0

        for name, bucket in cat_columns.items():
            column = cats[name]
            if needs_padding:
                column = torch.cat([column, torch.zeros(missing, dtype=torch.long)])
            bucket.append(column)

        if needs_padding:
            nums = torch.cat([nums, torch.zeros(missing, feature_dim)])
            target = torch.cat([target, torch.zeros(missing)])

        num_rows.append(nums)
        label_rows.append(target)

    stacked_cat = {name: torch.stack(bucket) for name, bucket in cat_columns.items()}
    return (
        stacked_cat,
        torch.stack(num_rows),
        torch.stack(label_rows),
        torch.tensor(lengths, dtype=torch.long),
    )

class SEModule(nn.Module):
    """Squeeze-and-excitation gate over the channel axis of a ``[B, N, C]`` tensor.

    The input is averaged over the item axis, passed through a two-layer
    bottleneck ending in a sigmoid, and the resulting per-channel gate is
    broadcast back over every item.
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel),
            nn.Sigmoid(),
        )

    def forward(self, x, mask=None):
        """Gate ``x`` channel-wise.

        Args:
            x: ``[B, N, C]`` features.
            mask: optional boolean ``[B, N]`` tensor, ``True`` at padded
                positions. When given, padded positions are excluded from the
                pooled average so the gate depends only on real items.
        """
        if mask is None:
            pooled = x.mean(dim=1)
        else:
            padded = mask.unsqueeze(-1)  # [B, N, 1]
            # clamp guards against a fully padded row dividing by zero
            n_valid = (~padded).sum(dim=1).clamp(min=1)  # [B, 1]
            pooled = x.masked_fill(padded, 0.0).sum(dim=1) / n_valid
        w = self.fc(pooled).unsqueeze(1)  # [B, 1, C]
        return x * w


class FiLM(nn.Module):
    """Feature-wise linear modulation: ``gamma(cond) * x + beta(cond)``."""

    def __init__(self, feature_dim):
        super().__init__()
        self.gamma = nn.Linear(feature_dim, feature_dim)
        self.beta = nn.Linear(feature_dim, feature_dim)

    def forward(self, x, cond):
        """Scale and shift ``x`` with two linear projections of ``cond``."""
        gamma = self.gamma(cond)
        beta = self.beta(cond)
        return gamma * x + beta


class DLRankerAttention(nn.Module):
    """List-wise scorer over a padded batch of item groups.

    Categorical embeddings and numeric features are encoded by two separate
    single-layer transformer branches, the categorical branch is modulated by
    the numeric one (FiLM followed by cross-attention with a residual), both
    branches are concatenated, refined by ``num_layers`` transformer layers
    and mapped to one score per item by an MLP.

    Args:
        cat_dims: mapping ``feature name -> number of embedding rows``.
        num_numeric_feats: width of the numeric feature vector.
        emb_dropout: dropout applied to the concatenated embeddings.
        dropout: dropout used inside the attention blocks and the MLP.
        num_heads: heads of the final fusion layers only; the two branch
            encoders and the cross-attention always use 4 heads.
        num_layers: number of final fusion layers.
        dim_feedforward: feed-forward width of every transformer layer.
        proj_dim: width of each branch; the fused representation is
            ``2 * proj_dim`` wide.
    """

    def __init__(
        self,
        cat_dims,
        num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64,
    ):
        super().__init__()

        # Embedding width grows logarithmically with cardinality, within [4, 50].
        self.emb_dims = {
            f: min(50, max(4, round(6 * math.log2(cat_dims[f] + 1)))) for f in cat_dims
        }
        # padding_idx=0: index 0 maps to a zero vector that receives no gradient.
        self.embeddings = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], self.emb_dims[f], padding_idx=0)
            for f in cat_dims
        })
        self.emb_dropout = nn.Dropout(emb_dropout)

        total_emb_dim = sum(self.emb_dims.values())
        print(f"Categorial emb dim sum: {total_emb_dim}, Numerical dim: {num_numeric_feats}")

        self.cat_proj = nn.Linear(total_emb_dim, proj_dim)
        self.cat_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=4,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=1,
        )

        self.num_proj_in = nn.Linear(num_numeric_feats, proj_dim)
        self.num_transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=proj_dim,
                nhead=4,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=1,
        )
        self.num_se = SEModule(proj_dim)
        self.num_proj_out = nn.Sequential(
            nn.Linear(proj_dim, proj_dim),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
        )

        # Cross-attention: categorical branch (query) attends to numeric branch
        self.cross_attn_cat2num = nn.MultiheadAttention(
            embed_dim=proj_dim,
            num_heads=4,
            dropout=dropout,
            batch_first=True,
        )

        self.film = FiLM(proj_dim)

        self.transformer_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(
                d_model=2 * proj_dim,
                nhead=num_heads,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            )
            for _ in range(num_layers)
        ])

        self.mlp = nn.Sequential(
            nn.Linear(2 * proj_dim, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(64, 1),
        )

    def forward(self, x_cat: dict, x_num: torch.Tensor, attn_mask=None):
        """Score every item of every group.

        Args:
            x_cat: dict ``feature name -> [B, N]`` int64 code tensor; must
                contain every feature the model was built with.
            x_num: ``[B, N, num_numeric_feats]`` float tensor.
            attn_mask: optional boolean ``[B, N]`` key-padding mask, ``True``
                at padded positions. When given it is applied to every
                attention block and to the squeeze-excitation pooling, so the
                scores of real items do not depend on the padding. ``None``
                means every position is attended to.

        Returns:
            ``[B, N]`` tensor of scores (padded positions are scored too and
            must be ignored by the caller).
        """
        # Concatenate in construction order so the layout always matches
        # cat_proj, whatever key order the caller's dict happens to have.
        cat_embs = torch.cat(
            [self.embeddings[f](x_cat[f]) for f in self.embeddings], dim=-1
        )
        cat_embs = self.emb_dropout(cat_embs)
        cat_feat = self.cat_proj(cat_embs)  # [B, N, D]
        cat_feat = self.cat_transformer(cat_feat, src_key_padding_mask=attn_mask)

        # Numeric branch
        num_feat = self.num_proj_in(x_num)
        num_feat = self.num_transformer(num_feat, src_key_padding_mask=attn_mask)
        num_feat = self.num_se(num_feat, attn_mask)
        num_feat = self.num_proj_out(num_feat)

        # FiLM: condition the categorical branch on the numeric branch
        cat_feat = self.film(cat_feat, num_feat)

        # Cross-attention: categorical features attend over numeric ones
        cat_feat_attn, _ = self.cross_attn_cat2num(
            query=cat_feat,
            key=num_feat,
            value=num_feat,
            key_padding_mask=attn_mask,
        )
        cat_feat = cat_feat + cat_feat_attn  # residual connection

        # Fuse both branches: [B, N, 2 * D]
        x = torch.cat([cat_feat, num_feat], dim=-1)

        for layer in self.transformer_layers:
            x = layer(x, src_key_padding_mask=attn_mask)

        scores = self.mlp(x).squeeze(-1)  # [B, N]
        return scores




"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Fraction of groups whose top-``k`` scored items contain a positive label.

    Only the first ``lengths[i]`` entries of row ``i`` are considered, and
    groups with fewer than ``min_group_size`` items are skipped. Returns a
    zero tensor on ``scores.device`` when no group qualifies.
    """
    hit_flags = []
    for row_scores, row_labels, n in zip(scores, labels, lengths):
        if n < min_group_size:
            continue
        top_idx = torch.topk(row_scores[:n], min(n, k)).indices
        hit_flags.append((row_labels[:n][top_idx] > 0).any().float())

    if not hit_flags:
        return torch.tensor(0.0, device=scores.device)
    return sum(hit_flags) / len(hit_flags)


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@``k`` over the groups of a padded batch.

    Gains are the raw label values (linear gain) and the discount at rank
    ``r`` (1-based) is ``log2(r + 1)``. Only the first ``lengths[i]`` entries
    of row ``i`` are used; groups with fewer than ``min_group_size`` items are
    skipped. A counted group whose ideal DCG is zero contributes 0.

    Returns:
        A 0-dim tensor on ``scores.device`` (always a tensor, so callers can
        rely on ``.item()``); zero when no group qualifies.
    """
    total_ndcg = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        n = int(lengths[i])
        if n < min_group_size:
            continue

        s = scores[i][:n]
        y = labels[i][:n]
        top = min(k, n)

        idx_pred = torch.topk(s, top).indices
        ideal_gains = torch.topk(y, top).values

        # Same discount vector for the predicted and the ideal ordering.
        discounts = torch.log2(torch.arange(2, 2 + top, device=y.device).float())
        dcg = (y[idx_pred] / discounts).sum()
        idcg = (ideal_gains / discounts).sum()

        if idcg > 0:
            total_ndcg = total_ndcg + dcg / idcg
        valid_count += 1

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ndcg / valid_count


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision at ``k`` over the groups of a padded batch.

    For each group the top-``k`` scored items are inspected; the average
    precision is the mean of precision@rank taken at every rank that holds a
    positive label. A group with no positive label in its top-``k`` counts as
    AP = 0 (it is part of the denominator), so the result is a mean over all
    groups that pass the size filter. Only the first ``lengths[i]`` entries of
    row ``i`` are used; groups with fewer than ``min_group_size`` items are
    skipped.

    Returns:
        A 0-dim tensor on ``scores.device``; zero when no group qualifies.
    """
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        n = int(lengths[i])
        if n < min_group_size:
            continue

        s = scores[i][:n]
        y = labels[i][:n]

        idx_pred = torch.topk(s, min(k, n)).indices
        hits = (y[idx_pred] > 0).float()
        valid_count += 1

        n_hits = hits.sum()
        if n_hits == 0:
            continue  # AP = 0, but the group still counts

        ranks = torch.arange(1, hits.numel() + 1, device=hits.device, dtype=hits.dtype)
        precision_at_rank = hits.cumsum(dim=0) / ranks
        total_ap = total_ap + (precision_at_rank * hits).sum() / n_hits

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ap / valid_count



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise RankNet loss summed per group and averaged over the batch.

    For every ordered pair ``(i, j)`` inside a group with
    ``label_i > label_j`` the term ``-logsigmoid(score_i - score_j)`` is
    added. Only the first ``lengths[b]`` entries of row ``b`` take part. The
    per-group sums are divided by the number of rows in the batch.
    """
    n_groups = scores.size(0)
    accumulated = 0.0

    for row in range(n_groups):
        n = lengths[row]
        s = scores[row][:n]
        y = labels[row][:n]

        # prefers[i, j] == 1 when item i should rank above item j
        prefers = (y[:, None] > y[None, :]).float()
        margins = s[:, None] - s[None, :]

        accumulated += (-prefers * F.logsigmoid(margins)).sum()

    return accumulated / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """
    Pairwise logistic loss restricted to pairs that involve a top-K item.

    The top-K items of each group are chosen by ground-truth label (not by
    score). A pair ``(i, j)`` contributes when ``label_i > label_j`` and at
    least one of the two is in that top-K set; its weight is
    ``|label_i - label_j|`` and its loss is
    ``-logsigmoid((score_i - score_j - margin) / temperature)`` (the margin is
    subtracted only when it is positive). Groups with fewer than two items are
    skipped. The result is the weighted sum over the whole batch divided by
    the total weight, or a zero tensor with ``requires_grad=True`` when no
    pair qualifies.
    """
    loss_sum = 0.0
    weight_sum = 0

    for row in range(scores.size(0)):
        n = lengths[row]
        if n < 2:
            continue

        s = scores[row][:n]
        y = labels[row][:n]

        # Membership mask of the focus_topk highest-labelled items
        in_topk = torch.zeros_like(y, dtype=torch.bool)
        in_topk[torch.topk(y, min(focus_topk, n)).indices] = True

        # Pairwise differences: entry [i, j] compares item i against item j
        ds = s[:, None] - s[None, :]
        dy = y[:, None] - y[None, :]

        eligible = (dy > 0) & (in_topk[:, None] | in_topk[None, :])
        if not eligible.any():
            continue

        weights = (dy.abs() * eligible).float()

        if margin > 0.0:
            ds = ds - margin

        loss_sum += (-F.logsigmoid(ds / temperature) * weights).sum()
        weight_sum += weights.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Each batch is moved to ``device``, scored by ``model`` and optimised
    against ``xranknet_loss``; the scheduler is stepped once per batch, right
    after the optimizer.

    NOTE(review): the model is called without a padding mask, so padded rows
    take part in its attention layers - confirm this is intended.
    """
    model.train()
    running_loss = 0.0

    for cats, nums, targets, sizes in tqdm(loader, desc="Training", leave=False):
        nums = nums.to(device)
        targets = targets.to(device)
        # Accept plain sequences as well as tensors for the group sizes.
        if not isinstance(sizes, torch.Tensor):
            sizes = torch.tensor(sizes)
        sizes = sizes.detach().clone().to(device)
        cats = {name: column.to(device) for name, column in cats.items()}

        optimizer.zero_grad()
        batch_loss = xranknet_loss(
            model(cats, nums),
            targets,
            sizes,
            temperature=1.0,
            margin=0.2,
            focus_topk=3,
        )
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    return running_loss / len(loader)


def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: scorer called as ``model(x_cat, x_num)``.
        loader: yields ``(x_cat, x_num, y, lengths)`` batches.
        device: device every batch is moved to.
        min_group_size: groups with fewer items are ignored by the ranking
            metrics (they still contribute to the loss).

    Returns:
        ``(loss, hitrate@3, ndcg@3, map@3)``. ``loss`` is a Python float, the
        batch losses averaged with weight equal to the batch's group count.
        The three metrics are 0-dim tensors; each batch value is weighted by
        the number of groups in that batch that pass the size filter, so
        batches with few or no eligible groups no longer dilute the result.
        All values are zero when the loader is empty or no group is eligible.
    """
    model.eval()
    total_hitrate = torch.zeros((), device=device)
    total_ndcg = torch.zeros((), device=device)
    total_map = torch.zeros((), device=device)
    total_loss = 0.0
    group_count = 0
    scored_count = 0

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):
            x_num, y = x_num.to(device), y.to(device)
            # as_tensor avoids the copy-construct warning when lengths is
            # already a tensor, while still accepting plain sequences.
            lengths = torch.as_tensor(lengths).to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            ndcg = ndcg_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            map3 = map_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)

            batch_size = scores.size(0)
            total_loss += loss.item() * batch_size
            group_count += batch_size

            # The metric helpers average over size-eligible groups only, so
            # that count is the weight that turns their means back into sums.
            n_scored = int((lengths >= min_group_size).sum())
            if n_scored:
                total_hitrate += hitrate * n_scored
                total_ndcg += ndcg * n_scored
                total_map += map3 * n_scored
                scored_count += n_scored

    loss_denominator = max(group_count, 1)
    metric_denominator = max(scored_count, 1)
    return (
        total_loss / loss_denominator,
        total_hitrate / metric_denominator,
        total_ndcg / metric_denominator,
        total_map / metric_denominator,
    )


def build_cat_dims(X, cat_features):
    """Return ``{feature: size}`` where size is the column maximum plus one.

    A negative maximum is treated as 0, so every feature gets a size of at
    least 1.
    """
    def vocab_size(column):
        largest = X[column].max()
        return (0 if largest < 0 else largest) + 1

    return {column: vocab_size(column) for column in cat_features}


def normalize_numeric_features(X, numeric_features, train_mask):
    """Z-score every column in ``numeric_features`` using training rows only.

    Mean and standard deviation are computed from the non-NaN values of the
    rows selected by ``train_mask`` and then applied to the whole column. NaN
    entries are ignored by the statistics (a single NaN would otherwise turn
    the mean, and with it the entire column, into NaN) and stay NaN in the
    output. A column whose training values are all NaN, or whose standard
    deviation is not positive, is only centred (divided by 1).

    Args:
        X: frame holding the columns to normalise.
        numeric_features: names of the columns to normalise.
        train_mask: boolean NumPy mask over the rows of ``X`` selecting the
            rows that define the statistics.

    Returns:
        ``X`` with each listed column replaced by its normalised values.
    """
    for col in numeric_features:
        values = X[col].to_numpy()
        train_vals = values[train_mask]
        observed = train_vals[~np.isnan(train_vals)]

        if observed.size:
            mean = observed.mean()
            std = observed.std()
        else:
            mean, std = 0.0, 1.0
        if not std > 0:
            std = 1.0

        # Replace one column at a time so only a single normalised copy is
        # alive in addition to the frame.
        X = X.with_columns(pl.Series(col, (values - mean) / std))
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Return ``(df, test_ids, test_rankers)``, using a parquet cache when present.

    If ``cache_file`` exists inside ``data_dir`` the feature-engineered frame
    is read from it and only the test file is re-read for its ``Id`` and
    ``ranker_id`` columns. Otherwise train and test are read, the test rows
    get a constant ``selected = 0`` column, both are concatenated, passed
    through ``feature_engineering`` and null-filled (0 for numeric columns,
    ``"missing"`` for string columns); the result is written to the cache.
    """
    cache_path = os.path.join(data_dir, cache_file)
    test_path = os.path.join(data_dir, test_file)

    if os.path.exists(cache_path):
        print(f"Loading cached feature-engineered data from {cache_path} ...")
        df = pl.read_parquet(cache_path)
        test = pl.read_parquet(test_path).drop("__index_level_0__")
        return df, test["Id"], test["ranker_id"]

    print("Cached file not found, processing raw data ...")

    train = pl.read_parquet(os.path.join(data_dir, train_file)).drop("__index_level_0__")
    raw_test = pl.read_parquet(test_path).drop("__index_level_0__")
    # Test rows carry no label; give them a placeholder so the frames concat.
    test = raw_test.with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))

    test_ids = test["Id"]
    test_rankers = test["ranker_id"]

    df = feature_engineering(pl.concat((train, test)), full=True)

    numeric_cols = df.select(pl.selectors.numeric()).columns
    string_cols = df.select(pl.selectors.string()).columns
    null_fills = [pl.col(name).fill_null(0) for name in numeric_cols]
    null_fills += [pl.col(name).fill_null("missing") for name in string_cols]
    df = df.with_columns(null_fills)

    os.makedirs(data_dir, exist_ok=True)
    df.write_parquet(cache_path)
    print(f"Saved feature-engineered data to {cache_path}")

    return df, test_ids, test_rankers


def prepare_data(data_dir, train_size=18145372):
    """Load the engineered frame and turn it into model-ready features.

    Steps: load (or build) the feature-engineered data, run
    ``feature_selection``, re-attach the ``Id`` column, replace every
    categorical column by its dense rank minus one (nulls become -1, cast to
    Int32) and z-score the numeric columns with statistics taken from the
    first ``train_size`` rows.

    Args:
        data_dir: directory holding the parquet inputs and the cache.
        train_size: number of leading rows treated as training rows when
            computing the normalisation statistics. Defaults to the value
            that used to be hard-coded here.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size, test_ids,
        test_rankers)``.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    # Only the first X.shape[0] ids are attached.
    # NOTE(review): this assumes feature_selection keeps row order - confirm.
    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    X = X.with_columns(
        [
            (pl.col(c).rank("dense") - 1).fill_null(-1).cast(pl.Int32)
            for c in cat_features
        ]
    )

    train_mask = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, train_mask)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Track a metric that should increase and signal when it has stalled.

    A step counts as an improvement when the metric exceeds the best value
    seen so far by more than ``min_delta``. ``step`` returns ``True`` once
    ``patience`` consecutive steps have passed without improvement.
    """

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience, self.min_delta = patience, min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric`` and report whether training should stop."""
        improved = metric > self.best + self.min_delta
        if improved:
            self.best, self.counter = metric, 0
        else:
            self.counter += 1
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scale histogram of the number of rows per group.

    Group sizes are read from ``dataset.group_to_indices``. The figure is
    written to ``save_path`` when one is given, otherwise it is shown; it is
    closed afterwards in both cases.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Distribution of Group Sizes")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def main():
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    best_val_loss = float("inf")
    best_hitrate = 0.0
    model_path = os.path.join(MODEL_DIR, "best_dl_ranker_top3.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_losses = []
    val_losses = []
    val_hitrates = []
    val_ndcgs = []
    val_maps = []
    learning_rates = []

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    early_stopper = EarlyStopping(patience=3, min_delta=0.00005)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Fill missing values
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    for col in float_cols:
        X = X.with_columns(
            pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
        )

    n1 = 16487352
    # n1 = 17487300

    X_tr = X[:n1]
    y_tr = y[:n1]
    groups_tr = groups[:n1]

    validation_path = os.path.join(DATA_DIR, "validation_top10_dl_full.parquet")
    submission_path = os.path.join(DATA_DIR, "submission_top10_dl_full.parquet")

    val_df = pl.read_parquet(validation_path).rename({"label": "selected"})
    test_df = pl.read_parquet(submission_path)

    X_va = val_df.drop(["selected", "rank", "score", "ranker_id"])
    y_va = val_df.select("selected")
    groups_va = val_df.select("ranker_id")

    X_te = test_df.drop(["rank", "score", "ranker_id"])
    y_te = pl.Series("selected", [0] * test_df.height)  # dummy labels
    groups_te = test_df.select("ranker_id")

    val_rankers = groups_va["ranker_id"].to_numpy()
    val_group_counts = Counter(val_rankers)

    print("\n📋 Validation group statistics:")
    # Save detailed group info to file
    group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
    with open(group_info_path, "w") as f:
        f.write(f"Total groups: {len(val_group_counts)}\n\n")
        f.write("GroupID\tSize\tStartIdx\tEndIdx\n")

        group_to_rows = defaultdict(list)
        for i, g in enumerate(val_rankers):
            group_to_rows[g].append(i)

        for group_id, indices in sorted(group_to_rows.items()):
            start_idx = indices[0]
            end_idx = indices[-1]
            size = len(indices)
            f.write(f"{group_id}\t{size}\t{start_idx}\t{end_idx}\n")

    print(f"📁 Saved validation group info to {group_info_path}")


    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
    val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
    test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    num_epochs = 10
    num_training_steps = num_epochs * len(train_loader)
    num_warmup_steps = int(0.1 * num_training_steps)

    learning_rate = 1e-4

    model = DLRankerAttention(
        cat_dims=cat_dims,
        num_numeric_feats=num_numeric_feats,
        emb_dropout=0.1,
        dropout=0.1,
        num_heads=4,
        num_layers=2,
        dim_feedforward=128,
        proj_dim=64
    ).to(device)

    model.load_state_dict(torch.load("model/best_dl_ranker_top10.pt"))

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    scheduler = OneCycleLR(
        optimizer,
        max_lr=learning_rate,
        steps_per_epoch=len(train_loader),
        epochs=num_epochs,
        pct_start=0.3,
    )

    best_hitrate = 0
    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
        val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_hitrates.append(val_hitrate.item())
        val_ndcgs.append(val_ndcg.item())
        val_maps.append(val_map.item())

        current_lr = scheduler.get_last_lr()
        learning_rates.append(current_lr[0])

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"Val Loss={val_loss:.4f}, "
            f"HitRate@3={val_hitrate:.4f}, "
            f"NDCG@3={val_ndcg:.4f}, "
            f"MAP@3={val_map:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )


        if val_loss < best_val_loss and val_hitrate > best_hitrate:
            best_val_loss = val_loss
            best_hitrate = val_hitrate
            torch.save(model.state_dict(), model_path)
            print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

        if early_stopper.step(-val_loss):
            print("⛔ Early stopping triggered.")
            break

    # Plot metrics
    epochs = list(range(1, len(train_losses) + 1))

    # Plot Loss Curve
    plt.figure()
    plt.plot(epochs, train_losses, label="Train Loss")
    plt.plot(epochs, val_losses, label="Val Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training & Validation Loss")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "loss_curve.png"))
    plt.close()

    # Plot HitRate@3
    plt.figure()
    plt.plot(epochs, val_hitrates, label="Val HitRate@3", color="green")
    plt.xlabel("Epoch")
    plt.ylabel("HitRate@3")
    plt.title("Validation HitRate@3")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "hitrate_curve.png"))
    plt.close()

    # Plot Learning Rate
    plt.figure()
    plt.plot(epochs, learning_rates, label="Learning Rate", color="orange")
    plt.xlabel("Epoch")
    plt.ylabel("LR")
    plt.title("Learning Rate Schedule")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "lr_curve.png"))
    plt.close()

    # Plot NDCG@3
    plt.figure()
    plt.plot(epochs, val_ndcgs, label="Val NDCG@3", color="purple")
    plt.xlabel("Epoch")
    plt.ylabel("NDCG@3")
    plt.title("Validation NDCG@3")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "ndcg_curve.png"))
    plt.close()

    # Plot MAP@3
    plt.figure()
    plt.plot(epochs, val_maps, label="Val MAP@3", color="red")
    plt.xlabel("Epoch")
    plt.ylabel("MAP@3")
    plt.title("Validation MAP@3")
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(MODEL_DIR, "map_curve.png"))
    plt.close()

    print("📊 Saved metric plots to model/")

    # Predict test set
    model.load_state_dict(torch.load(model_path))
    model.eval()

    val_ids = X_va["Id"]
    val_rankers = groups_va["ranker_id"]
    val_labels = y_va

    all_val_scores = []

    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(val_loader, desc="Predicting validation set"):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)

            for i in range(scores.size(0)):
                l = lengths[i]
                all_val_scores.append(scores[i, :l].cpu().numpy())

    all_val_scores = np.concatenate(all_val_scores)

    assert len(all_val_scores) == X_va.shape[0], \
        f"Mismatch: {len(all_val_scores)} scores vs {X_va.shape[0]} rows"

    val_df = X_va.with_columns([
        pl.Series("Id", val_ids),
        pl.Series("ranker_id", val_rankers),
        pl.Series("score", all_val_scores),
        pl.Series("selected", val_labels)
    ])

    val_df = val_df.with_columns(
        pl.col("score")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
    val_df.write_csv(val_save_path)
    print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

    all_scores = []

    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(test_loader, desc="Predicting test set"):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)

            for i in range(scores.size(0)):
                l = lengths[i]
                all_scores.append(scores[i, :l].cpu().numpy())

    all_scores = np.concatenate(all_scores)

    assert len(all_scores) == X_te.shape[0], \
        f"Mismatch: {len(all_scores)} scores vs {X_te.shape[0]} rows in X_te"

    submission_df = pl.DataFrame({
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": all_scores
    })

    submission = (
        submission_df
        .with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )
        .select(["Id", "ranker_id", "score", "selected"])
    )

    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
    submission.write_csv(submission_path)
    print(f"✅ Submission saved to {submission_path}")


if __name__ == "__main__":
    main()

Writing deeprec_validate_top3.py


In [None]:
--- Epoch 1/10 ---
Validating:   0%|          | 0/2636 [00:00<?, ?it/s]/home/ionut/test_whisper/test2/deeprec_validate.py:539: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  lengths = torch.tensor(lengths).to(device)
Epoch 1: Train Loss=0.3531, Val Loss=0.5595, HitRate@3=0.4739, NDCG@3=0.3745, MAP@3=0.6289, LR=0.000084 (582.8s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 2/10 ---
Epoch 2: Train Loss=0.2876, Val Loss=0.5293, HitRate@3=0.4840, NDCG@3=0.3802, MAP@3=0.6295, LR=0.000228 (584.8s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 3/10 ---
Epoch 3: Train Loss=0.2799, Val Loss=0.5292, HitRate@3=0.4895, NDCG@3=0.3912, MAP@3=0.6490, LR=0.000300 (584.0s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 4/10 ---
Epoch 4: Train Loss=0.2667, Val Loss=0.5256, HitRate@3=0.4986, NDCG@3=0.4015, MAP@3=0.6618, LR=0.000285 (583.6s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 5/10 ---
Epoch 5: Train Loss=0.2553, Val Loss=0.5082, HitRate@3=0.5038, NDCG@3=0.4005, MAP@3=0.6500, LR=0.000244 (582.9s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 6/10 ---
Epoch 6: Train Loss=0.2443, Val Loss=0.5007, HitRate@3=0.5086, NDCG@3=0.4131, MAP@3=0.6719, LR=0.000183 (585.2s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 7/10 ---
Epoch 7: Train Loss=0.2330, Val Loss=0.5043, HitRate@3=0.5092, NDCG@3=0.4076, MAP@3=0.6608, LR=0.000117 (584.1s)

--- Epoch 8/10 ---
Epoch 8: Train Loss=0.2229, Val Loss=0.5002, HitRate@3=0.5070, NDCG@3=0.4075, MAP@3=0.6635, LR=0.000056 (586.0s)

--- Epoch 9/10 ---
Epoch 9: Train Loss=0.2135, Val Loss=0.5007, HitRate@3=0.5111, NDCG@3=0.4104, MAP@3=0.6620, LR=0.000015 (582.4s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

# **Get back features**

In [None]:
from feature_specs import get_all_feature_lists

# Fetch the feature lists; only the third returned element (bound to `exclude`) is used here
_, _, exclude = get_all_feature_lists()

# Remove duplicates and sort
exclude_unique_sorted = sorted(set(exclude))

# Print the total count, then every excluded feature on its own line
print(f"Număr total de feature-uri excluse: {len(exclude_unique_sorted)}\n")
print("Lista feature-urilor excluse:")
for feat in exclude_unique_sorted:
    print(feat)


Număr total de feature-uri excluse: 102

Lista feature-urilor excluse:
Id
both_direct
carrier_pop_prod
corporateTariffCode_duration_rank_mean
corporateTariffCode_price_rank_mean
frequentFlyer
geo_distance_km
group_size
is_expensive_outlier
is_last_minute_booking
is_min_segments
is_popular_flight
is_shortest_duration
l0_seg
leg_dur_interaction_ratio
legs0_arrivalAt
legs0_arrivalAt_business_time
legs0_departureAt
legs0_departureAt_business_time
legs0_segments0_marketingCarrier_code_avg_dep_hour
legs0_segments0_marketingCarrier_code_ff_and_economic
legs0_segments0_marketingCarrier_code_selected_rank_bin
legs0_segments0_marketingCarrier_code_selection_rate
legs0_segments0_seatsAvailable
legs0_segments1_duration
legs0_segments1_seatsAvailable
legs0_segments2_aircraft_code
legs0_segments2_arrivalTo_airport_city_iata
legs0_segments2_arrivalTo_airport_iata
legs0_segments2_baggageAllowance_quantity
legs0_segments2_baggageAllowance_weightMeasurementType
legs0_segments2_cabinClass
legs0_segments2

In [None]:
%%writefile combine.py
import polars as pl

# Engineered columns to carry over from features.parquet into the validation frame.
extra_columns = [
    "corporateTariffCode_duration_rank_mean",
    "corporateTariffCode_price_rank_mean",
    "miniRules0_percentage",
    "miniRules1_percentage",
    "miniRules0_monetaryAmount",
    "miniRules1_monetaryAmount",
    "taxes",
    "totalPrice",
    "group_size",
    "price_per_tax",
    "l0_seg",
    "rank_interaction_ratio",
    "rank_interaction_sum",
    "leg_dur_interaction_ratio",
    "selected_direct_ratio",
    "legs0_segments0_marketingCarrier_code_avg_dep_hour",
    "legs0_segments0_marketingCarrier_code_selection_rate",
    "legs1_segments0_marketingCarrier_code_selection_rate",
    "carrier_pop_prod",
]

# Load the base frame in full and only the wanted columns of the feature frame.
base_frame = pl.read_parquet("feature_engineered_val.parquet")
extra_frame = pl.read_parquet("features.parquet").select(extra_columns)

# hstack pairs rows purely by position, so both frames must have the same height.
assert base_frame.height == extra_frame.height, "⚠️ Fișierele nu au același număr de rânduri!"

# Append the columns side by side and persist the widened frame.
base_frame.hstack(extra_frame).write_parquet("feature_engineered_val_new.parquet")
print("✅ Coloane adăugate prin hstack.")


Overwriting combine.py


In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import torch.nn as nn
import torch.nn.functional as F_torch
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter
from sklearn.decomposition import PCA

class RankDataset(Dataset):
    """Group-wise ranking dataset: one item is every row sharing a ``ranker_id``.

    The frame is converted to NumPy once in ``__init__``; ``__getitem__`` then
    gathers the rows of a single group. Groups are enumerated in order of first
    appearance of their ``ranker_id``.
    """

    def __init__(
        self,
        X: pl.DataFrame,
        y: pl.Series,
        groups: pl.DataFrame,
        cat_features: list[str],
        num_features: list[str],
    ):
        """Materialise features/labels as arrays and index rows by group.

        Args:
            X: Feature frame containing every column named in ``cat_features``
                and ``num_features``.
            y: Per-row labels, positionally aligned with ``X``.
            groups: Frame with a ``ranker_id`` column, positionally aligned with ``X``.
            cat_features: Names of the integer-coded categorical columns.
            num_features: Names of the numeric columns.
        """
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # Row positions per group; dict insertion order fixes the group order.
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of distinct groups."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        ``x_cat`` maps each categorical feature to a 1-D int64 tensor,
        ``x_num`` is a float tensor of shape ``(length, len(num_features))``,
        ``y`` is a flat float tensor and ``length`` is the group's row count.
        """
        inds = self.group_to_indices[self.group_keys[idx]]
        length = len(inds)

        x_cat = {}
        for c in self.cat_features:
            codes = self.X_cat[c][inds]
            # Negative codes are mapped to 0. Done with one vectorised
            # np.where per feature instead of a Python-level test per element,
            # since this runs for every group in every epoch.
            clamped = np.where(codes >= 0, codes, 0).astype(np.int64)
            x_cat[c] = torch.from_numpy(clamped)

        x_num = torch.FloatTensor(self.X_num[inds])
        y = torch.FloatTensor(self.y[inds]).reshape(-1)

        return x_cat, x_num, y, length

def collate_fn(batch):
    """Pad a list of per-group samples into dense batch tensors.

    Each sample is ``(x_cat, x_num, y, length)``. Samples whose ``length`` is
    below 2 are dropped; if nothing remains, ``None`` is returned so the caller
    can skip the batch. Everything else is right-padded with zeros up to the
    longest remaining group.

    Returns:
        ``(padded_x_cat, padded_x_num, padded_y, lengths)`` where
        ``padded_x_cat`` maps feature name -> ``[B, max_len]`` tensor,
        ``padded_x_num`` is ``[B, max_len, num_dim]``, ``padded_y`` is
        ``[B, max_len]`` and ``lengths`` is an int64 tensor of true group sizes.
    """
    kept = [sample for sample in batch if sample[3] >= 2]
    if not kept:
        return None

    cat_parts, num_parts, label_parts, sizes = zip(*kept)
    pad = torch.nn.utils.rnn.pad_sequence  # zero padding is its default

    padded_x_cat = {
        field: pad([part[field] for part in cat_parts], batch_first=True)
        for field in cat_parts[0]
    }
    padded_x_num = pad(list(num_parts), batch_first=True)
    padded_y = pad(list(label_parts), batch_first=True)
    lengths = torch.tensor(sizes, dtype=torch.long)

    return padded_x_cat, padded_x_num, padded_y, lengths


class FiBiDLRanker(nn.Module):
    """Per-item scorer: linear term + SENet-gated bilinear interaction + MLP.

    Every item of every group is scored independently; the group axis is
    folded into the batch axis for the interaction and deep parts.

    Args:
        cat_dims: Mapping categorical field name -> number of embedding rows.
        num_numeric_feats: Width of the numeric feature vector.
        emb_dim: Embedding size shared by all categorical fields.
        hidden: Sizes of the MLP layers, in order.
        reduction: Bottleneck divisor of the SENet gate.
        use_senet: Whether to apply the SENet gate before the bilinear step.
    """

    # `hidden` defaults to a tuple (it is only iterated) so the default can
    # never be mutated and shared between instances.
    def __init__(self, cat_dims, num_numeric_feats, emb_dim=64, hidden=(512, 256, 128, 64), reduction=4, use_senet=True):
        super().__init__()
        self.cat_fields = list(cat_dims.keys())
        self.emb_dim = emb_dim
        self.num_fields = len(self.cat_fields)
        self.use_senet = use_senet

        # Embedding layers: a dense vector and a scalar (linear-term) weight per category.
        self.embeddings = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], emb_dim, padding_idx=0) for f in self.cat_fields
        })
        self.linear_cat = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], 1, padding_idx=0) for f in self.cat_fields
        })
        self.linear_num = nn.Linear(num_numeric_feats, 1)

        # NOTE(review): xavier_uniform_ re-initialises the whole weight matrix,
        # including the padding_idx row that nn.Embedding zeroed. That row gets
        # no gradient, so index 0 keeps its random initial vector. Confirm this
        # is intended, since index 0 also appears to be a real category code.
        for emb in list(self.embeddings.values()) + list(self.linear_cat.values()):
            nn.init.xavier_uniform_(emb.weight)
        nn.init.xavier_uniform_(self.linear_num.weight)

        # SENet gate (LayerNorm in the bottleneck)
        if self.use_senet:
            gate_in = self.num_fields * emb_dim
            gate_mid = gate_in // reduction
            self.se_fc1 = nn.Linear(gate_in, gate_mid)
            self.se_ln1 = nn.LayerNorm(gate_mid)
            self.se_dropout = nn.Dropout(0.1)
            self.se_fc2 = nn.Linear(gate_mid, gate_in)

        # Bilinear projection shared by all fields
        self.bilinear = nn.Linear(emb_dim, emb_dim, bias=False)

        # MLP over [interaction vector, numeric features], LayerNorm after each Linear
        input_dim = emb_dim + num_numeric_feats
        layers = []
        for h in hidden:
            layers.append(nn.Linear(input_dim, h))
            layers.append(nn.LayerNorm(h))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.2))
            input_dim = h
        self.mlp = nn.Sequential(*layers)
        self.output = nn.Linear(input_dim, 1)

    def senet(self, embs):
        """Gate ``embs`` of shape ``[N, fields, emb_dim]`` element-wise with sigmoid weights."""
        n_items, n_fields, dim = embs.shape
        z = embs.reshape(n_items, -1)  # flatten to [N, fields * emb_dim]
        a = self.se_fc1(z)
        a = self.se_ln1(a)
        a = F_torch.relu(a)
        a = self.se_dropout(a)
        s = torch.sigmoid(self.se_fc2(a)).view(n_items, n_fields, dim)
        return embs * s

    def bilinear_interaction(self, embs):
        """Return the field-averaged product of each embedding with its projection: ``[N, emb_dim]``."""
        transformed = self.bilinear(embs)
        return (embs * transformed).mean(dim=1)

    def forward(self, x_cat, x_num):
        """Score every item.

        Args:
            x_cat: Mapping field name -> integer tensor of shape ``[B, G]``.
            x_num: Float tensor of shape ``[B, G, num_numeric_feats]``.

        Returns:
            Tensor of shape ``[B, G]`` with one score per item.
        """
        B, G = x_num.shape[:2]
        n_fields, D = self.num_fields, self.emb_dim

        # Embeddings stacked per field: [B, G, fields, D]
        embs = torch.stack([self.embeddings[f](x_cat[f]) for f in self.cat_fields], dim=2)

        # Linear (first-order) term: [B, G]
        linear_cat_sum = torch.stack(
            [self.linear_cat[f](x_cat[f]).squeeze(-1) for f in self.cat_fields], dim=0
        ).sum(dim=0)
        linear_out = linear_cat_sum + self.linear_num(x_num).squeeze(-1)

        # Feature interactions on the flattened item axis.
        # reshape (not view) so non-contiguous inputs are accepted as well.
        embs = embs.reshape(B * G, n_fields, D)
        if self.use_senet:
            embs = self.senet(embs)
        bi = self.bilinear_interaction(embs)  # [B*G, D]

        # Deep part
        deep_input = torch.cat([bi, x_num.reshape(B * G, -1)], dim=1)
        scores = self.output(self.mlp(deep_input)).reshape(B, G)

        return scores + linear_out


"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Fraction of eligible groups with a positive label among their top-``k`` scores.

    Only the first ``lengths[i]`` entries of row ``i`` are considered, and
    groups shorter than ``min_group_size`` are ignored. Returns a zero tensor
    when no group is eligible.
    """
    hit_flags = []
    for row, row_scores in enumerate(scores):
        size = lengths[row]
        if size < min_group_size:
            continue
        top_n = k if k < size else size
        best = torch.topk(row_scores[:size], top_n).indices
        # 1.0 when at least one of the top-ranked items carries a positive label
        hit_flags.append((labels[row][:size][best] > 0).any().float())

    if not hit_flags:
        return torch.tensor(0.0, device=scores.device)
    return torch.stack(hit_flags).sum() / len(hit_flags)


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@``k`` over groups with at least ``min_group_size`` items.

    Labels are used directly as gains with a ``log2(rank + 1)`` discount. A
    group whose ideal DCG is zero contributes 0. Returns a zero tensor when no
    group is eligible.
    """
    ndcg_sum = 0.0
    counted = 0

    for row, row_scores in enumerate(scores):
        size = lengths[row]
        if size < min_group_size:
            continue

        gains = labels[row][:size]
        depth = min(k, size)
        predicted = torch.topk(row_scores[:size], depth).indices
        ideal = torch.topk(gains, depth).indices

        # Both rankings have the same depth, so one discount vector serves both.
        discount = torch.log2(torch.arange(2, 2 + len(predicted), device=gains.device).float())
        dcg = (gains[predicted] / discount).sum()
        idcg = (gains[ideal] / discount).sum()

        ndcg_sum += dcg / idcg if idcg > 0 else 0.0
        counted += 1

    if counted > 0:
        return ndcg_sum / counted
    return torch.tensor(0.0, device=scores.device)


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision at ``k`` over groups with at least ``min_group_size`` items.

    For each eligible group the top-``k`` scored items (within the first
    ``lengths[i]`` entries) are inspected; average precision is the mean of
    precision@rank taken at every rank that holds a positive label.

    Groups with no positive label in their top-``k`` contribute an average
    precision of 0 and still count in the denominator. Leaving them out would
    report MAP conditional on a hit, which can exceed the hit rate itself.

    Returns:
        0-dim tensor with the mean AP, or a zero tensor if no group is eligible.
    """
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        size = int(lengths[i])
        if size < min_group_size:
            continue
        valid_count += 1

        s = scores[i][:size]
        y = labels[i][:size]
        top = torch.topk(s, min(k, size)).indices
        hits = (y[top] > 0).float()

        n_hits = hits.sum()
        if n_hits == 0:
            # A complete miss is AP = 0: nothing to add, but the group is counted.
            continue

        # precision@rank for every rank, kept only where that rank is a hit
        ranks = torch.arange(1, hits.numel() + 1, device=hits.device, dtype=hits.dtype)
        precision_at_rank = hits.cumsum(0) / ranks
        total_ap = total_ap + (precision_at_rank * hits).sum() / n_hits

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ap / valid_count



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise RankNet loss, summed over ordered pairs and averaged over groups.

    For every pair ``(a, b)`` inside a group where ``label[a] > label[b]`` the
    term ``-logsigmoid(score[a] - score[b])`` is added. Only the first
    ``lengths[i]`` entries of row ``i`` take part. The sum over all groups is
    divided by the number of rows in ``scores``.
    """
    n_groups = scores.size(0)
    accumulated = 0.0

    for g in range(n_groups):
        size = lengths[g]
        s = scores[g][:size]
        y = labels[g][:size]

        # [a, b] entry: margin of item a over item b, and whether a should win
        margins = s[:, None] - s[None, :]
        should_win = (y[:, None] > y[None, :]).float()

        accumulated += (-should_win * F_torch.logsigmoid(margins)).sum()

    return accumulated / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """
    Weighted pairwise logistic loss restricted to pairs touching the label top-K.

    A pair ``(a, b)`` counts when ``label[a] > label[b]`` and at least one of
    the two items is among the ``focus_topk`` highest *labels* of its group.
    Each counted pair is weighted by ``|label[a] - label[b]|`` and contributes
    ``-logsigmoid((score[a] - score[b] - margin) / temperature)``. The result
    is the weighted sum divided by the total weight. Groups with fewer than two
    items, or without any counted pair, are skipped; if nothing is left a zero
    tensor with ``requires_grad=True`` is returned.
    """
    loss_sum = 0.0
    weight_sum = 0

    for g in range(scores.size(0)):
        size = lengths[g]
        if size < 2:
            continue

        s = scores[g][:size]
        y = labels[g][:size]

        # Membership in the ground-truth top-K
        in_top = torch.zeros_like(y, dtype=torch.bool)
        in_top[torch.topk(y, min(focus_topk, size)).indices] = True

        # Correctly ordered label pairs that involve at least one top-K item
        label_gap = y[:, None] - y[None, :]
        eligible = (label_gap > 0) & (in_top[:, None] | in_top[None, :])
        if eligible.sum() == 0:
            continue

        weights = (label_gap.abs() * eligible).float()

        score_gap = s[:, None] - s[None, :]
        if margin > 0.0:
            score_gap = score_gap - margin
        score_gap = score_gap / temperature

        loss_sum += (-F_torch.logsigmoid(score_gap) * weights).sum()
        weight_sum += weights.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    ``None`` batches are skipped. The scheduler is stepped after every
    optimizer step, i.e. once per batch. Returns ``float("inf")`` when no batch
    was processed.
    """
    model.train()
    running_loss = 0.0
    steps = 0

    for batch in tqdm(loader, desc="Training", leave=False):
        if batch is None:
            continue

        x_cat, x_num, y, lengths = batch

        # Move the whole batch to the target device
        x_num = x_num.to(device)
        y = y.to(device)
        lengths = lengths.to(device)
        x_cat = {name: codes.to(device) for name, codes in x_cat.items()}

        optimizer.zero_grad()
        scores = model(x_cat, x_num)
        loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()
        steps += 1

    if steps == 0:
        return float("inf")
    return running_loss / steps



def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader``.

    Args:
        model: Scorer called as ``model(x_cat, x_num)``.
        loader: Iterable of ``(x_cat, x_num, y, lengths)`` batches; ``None``
            batches are skipped.
        device: Device the batch tensors are moved to.
        min_group_size: Minimum group length for a group to count towards the
            ranking metrics.

    Returns:
        ``(loss, hitrate@3, ndcg@3, map@3)``. The loss is the batch loss
        averaged with batch-size weights. Each ranking metric is averaged over
        the groups that reach ``min_group_size``: the metric helpers return a
        per-batch mean over those groups, so each batch is weighted by its
        number of eligible groups and batches without any are left out.
    """
    model.eval()
    total_hitrate = 0
    total_ndcg = 0
    total_map = 0
    total_loss = 0.0
    group_count = 0
    eligible_count = 0

    with torch.no_grad():
        for batch in tqdm(loader, desc="Validating", leave=False):
            if batch is None:
                continue

            x_cat, x_num, y, lengths = batch
            x_num, y = x_num.to(device), y.to(device)
            # as_tensor accepts both a list and an existing tensor and avoids
            # the copy-construct UserWarning raised by torch.tensor(tensor).
            lengths = torch.as_tensor(lengths, device=device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.2, focus_topk=3)

            batch_size = scores.size(0)
            total_loss += loss.item() * batch_size
            group_count += batch_size

            eligible = int((lengths >= min_group_size).sum())
            if eligible == 0:
                # No group in this batch takes part in the ranking metrics.
                continue

            hitrate = hitrate_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            ndcg = ndcg_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            map3 = map_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)

            total_hitrate += hitrate * eligible
            total_ndcg += ndcg * eligible
            total_map += map3 * eligible
            eligible_count += eligible

    avg_loss = total_loss / group_count if group_count > 0 else float("inf")
    if eligible_count == 0:
        zero = torch.tensor(0.0, device=device)
        return avg_loss, zero, zero, zero

    return (
        avg_loss,
        total_hitrate / eligible_count,
        total_ndcg / eligible_count,
        total_map / eligible_count,
    )


def build_cat_dims(X, cat_features):
    """Map each categorical column to ``max(code) + 1``, treating a negative maximum as 0."""
    return {name: max(X[name].max(), 0) + 1 for name in cat_features}


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardise each listed column using statistics of the ``train_mask`` rows only.

    Every column is replaced by ``(values - mean) / std`` where mean and std
    come from the masked rows; a std that is not strictly positive falls back
    to 1.0. Returns the updated frame.
    """
    for name in numeric_features:
        column = X[name].to_numpy()
        fit_rows = column[train_mask]

        center = fit_rows.mean()
        spread = fit_rows.std()
        if not spread > 0:
            spread = 1.0

        X = X.with_columns(pl.Series(name, (column - center) / spread).alias(name))
    return X

def apply_pca(X, numeric_features, train_mask, explained_variance=0.975):
    """Replace the numeric columns by PCA components fitted on the ``train_mask`` rows.

    ``explained_variance`` is handed to ``PCA(n_components=...)``. The original
    numeric columns are dropped and the components are appended as
    ``pca_0 .. pca_{m-1}``.

    Returns:
        ``(X, pca_feature_names)``: the updated frame and the new column names.
    """
    full_data = X[numeric_features].to_numpy()

    pca = PCA(n_components=explained_variance, svd_solver='full')
    pca.fit(full_data[train_mask])  # fit on the masked rows only
    transformed = pca.transform(full_data)

    print(f"📐 PCA: {len(numeric_features)} features → {transformed.shape[1]} components (explained variance ≥ {explained_variance})")

    pca_feature_names = [f'pca_{i}' for i in range(transformed.shape[1])]
    components = pl.DataFrame(transformed, schema=pca_feature_names)

    X = pl.concat([X.drop(numeric_features), components], how="horizontal")
    return X, pca_feature_names

def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Return the feature-engineered frame plus the test ``Id`` and ``ranker_id`` columns.

    If ``cache_file`` exists inside ``data_dir`` it is loaded as-is. Otherwise
    train and test are read, concatenated (test gets a constant ``selected`` of
    0), passed through ``feature_engineering``, null-filled (0 for numeric
    columns, "missing" for string columns) and written to the cache path. In
    both cases the test file is read to obtain the identifier columns.
    """
    cache_path = os.path.join(data_dir, cache_file)
    test_path = os.path.join(data_dir, test_file)

    if not os.path.exists(cache_path):
        print("Cached file not found, processing raw data ...")

        train = pl.read_parquet(os.path.join(data_dir, train_file)).drop("__index_level_0__")
        test = pl.read_parquet(test_path).drop("__index_level_0__")
        test = test.with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))

        test_ids = test["Id"]
        test_rankers = test["ranker_id"]

        df = feature_engineering(pl.concat((train, test)), full=True)

        numeric_cols = df.select(pl.selectors.numeric()).columns
        string_cols = df.select(pl.selectors.string()).columns
        df = df.with_columns(
            [pl.col(c).fill_null(0) for c in numeric_cols]
            + [pl.col(c).fill_null("missing") for c in string_cols]
        )

        os.makedirs(data_dir, exist_ok=True)
        df.write_parquet(cache_path)
        print(f"Saved feature-engineered data to {cache_path}")

        return df, test_ids, test_rankers

    print(f"Loading cached feature-engineered data from {cache_path} ...")
    df = pl.read_parquet(cache_path)

    # The cache holds features only; identifiers still come from the raw test file.
    test = pl.read_parquet(test_path).drop("__index_level_0__")
    return df, test["Id"], test["ranker_id"]


def prepare_data(data_dir, train_size=18145372):
    """Load the data and turn it into model-ready features.

    Steps: load (or build) the engineered frame, run ``feature_selection``,
    re-attach ``Id``, dense-rank encode the categorical columns, zero-fill NaNs
    in the numeric columns, standardise them and project them with PCA. Both
    the standardisation and the PCA are fitted on the first ``train_size`` rows.

    Args:
        data_dir: Directory passed to ``load_or_prepare_data``.
        train_size: Number of leading rows treated as training data when
            fitting the scaler and the PCA. Defaults to the value that used to
            be hard-coded here.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size, test_ids,
        test_rankers)`` where ``num_features`` now names the PCA components.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    id_column = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)

    # Put the identifier back on the selected frame (truncated to its height).
    X = X.with_columns(pl.Series("Id", id_column[:X.shape[0]]))

    # Dense-rank encode categoricals to 0-based integer codes; nulls become -1.
    X = X.with_columns(
        [
            (pl.col(c).rank("dense") - 1).fill_null(-1).cast(pl.Int32)
            for c in cat_features
        ]
    )

    # Rows before `train_size` are the ones statistics are fitted on.
    train_mask = np.arange(X.height) < train_size

    for col in num_features:
        nans = X.select(pl.col(col).is_nan().sum()).item()
        if nans > 0:
            print(f"⚠️ Found {nans} NaNs in column '{col}', filling with 0.")
            X = X.with_columns(
                pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
            )

    X = normalize_numeric_features(X, num_features, train_mask)
    X, pca_features = apply_pca(X, num_features, train_mask, explained_variance=0.98)
    num_features = pca_features

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers



class EarlyStopping:
    """Signal a stop after ``patience`` consecutive steps without improvement.

    A metric counts as improved only if it exceeds the best value seen so far
    by more than ``min_delta`` (higher is better).
    """

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Record ``metric`` and return True once the patience is exhausted."""
        improved = metric > self.best + self.min_delta
        if improved:
            self.best = metric
        # An improvement resets the stall counter; anything else extends it.
        self.counter = 0 if improved else self.counter + 1
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Draw a log-scaled histogram of the sizes in ``dataset.group_to_indices``.

    The figure is written to ``save_path`` when one is given, otherwise it is
    shown; it is closed afterwards either way.
    """
    sizes = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(sizes, bins=100, log=True)
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Distribution of Group Sizes")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()


def main():
    """Train, validate and predict with FiBiDLRanker once per learning rate.

    For every learning rate in the sweep this function prepares the data,
    splits it by row position into train / validation / test, trains for up to
    ``num_epochs`` epochs with early stopping on the validation loss, writes
    metric plots and a validation prediction CSV into a per-run model
    directory, and writes a test-set submission CSV into ``SUBMIT_DIR``.
    """
    DATA_DIR = "./data"
    SUBMIT_DIR = "submission"
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    learning_rates_to_try = [1e-3, 7e-4, 5e-4]

    for lr in learning_rates_to_try:
        # Per-run output directory, e.g. "model_lr_1e03" (the "-" is removed).
        run_name = f"lr_{lr:.0e}".replace("-", "")
        MODEL_DIR = f"model_{run_name}"
        os.makedirs(MODEL_DIR, exist_ok=True)

        print(f"\n\n🚀 Starting training with learning rate = {lr}\nSaved to: {MODEL_DIR}")

        best_val_loss = float("inf")
        best_hitrate = 0.0
        model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Per-epoch history, used for the plots written after training.
        train_losses = []
        val_losses = []
        val_hitrates = []
        val_ndcgs = []
        val_maps = []
        learning_rates = []

        os.makedirs(MODEL_DIR, exist_ok=True)
        os.makedirs(SUBMIT_DIR, exist_ok=True)

        early_stopper = EarlyStopping(patience=3, min_delta=0.00005)

        # NOTE(review): the data is re-prepared for every learning rate even
        # though the call does not depend on `lr`.
        X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

        # Fill missing values
        float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
        for col in float_cols:
            X = X.with_columns(
                pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
            )

        # Positional split: rows [0, n1) train, [n1, n2) validation, [n2, end) test.
        n1 = 16487352
        # n1 = 17487300
        n2 = train_size

        X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
        y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
        groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

        # Inspect group sizes in validation set
        val_rankers = groups_va["ranker_id"].to_numpy()
        val_group_counts = Counter(val_rankers)

        print("\n📋 Validation group statistics:")
        # Save detailed group info to file
        group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
        with open(group_info_path, "w") as f:
            f.write(f"Total groups: {len(val_group_counts)}\n\n")
            f.write("GroupID\tSize\tStartIdx\tEndIdx\n")

            # Map each ranker_id to the validation-row positions it occupies.
            group_to_rows = defaultdict(list)
            for i, g in enumerate(val_rankers):
                group_to_rows[g].append(i)

            # StartIdx/EndIdx are the first and last positions of the group.
            for group_id, indices in sorted(group_to_rows.items()):
                start_idx = indices[0]
                end_idx = indices[-1]
                size = len(indices)
                f.write(f"{group_id}\t{size}\t{start_idx}\t{end_idx}\n")

        print(f"📁 Saved validation group info to {group_info_path}")


        # Embedding table sizes are derived from the full frame, not only train.
        cat_dims = build_cat_dims(X, cat_features)
        num_numeric_feats = len(num_features)

        train_dataset = RankDataset(X_tr, y_tr, groups_tr, cat_features, num_features)
        val_dataset = RankDataset(X_va, y_va, groups_va, cat_features, num_features)
        test_dataset = RankDataset(X_te, y_te, groups_te, cat_features, num_features)

        plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
        print("📊 Group size distribution saved.")

        # Only the training loader shuffles; validation/test keep dataset order
        # because their scores are concatenated back onto the frames below.
        train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
        val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
        test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, collate_fn=collate_fn)

        print("✅ Data loaders created.")

        # The scheduler is sized for the full epoch budget; the first 10% of
        # the steps are warm-up.
        num_epochs = 10
        num_training_steps = num_epochs * len(train_loader)
        num_warmup_steps = int(0.1 * num_training_steps)

        model = FiBiDLRanker(
            cat_dims=cat_dims,
            num_numeric_feats=num_numeric_feats,
            emb_dim=32,
            hidden=[512, 256, 128, 64]
        ).to(device)

        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)

        scheduler = get_scheduler(
            "linear",
            optimizer=optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps
        )

        best_hitrate = 0
        for epoch in range(num_epochs):
            print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
            start_time = time.time()

            train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
            val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

            # The three ranking metrics are converted with .item(), so
            # validate() is expected to return them as tensors.
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            val_hitrates.append(val_hitrate.item())
            val_ndcgs.append(val_ndcg.item())
            val_maps.append(val_map.item())

            current_lr = scheduler.get_last_lr()
            learning_rates.append(current_lr[0])

            elapsed = time.time() - start_time
            print(
                f"Epoch {epoch+1}: "
                f"Train Loss={train_loss:.4f}, "
                f"Val Loss={val_loss:.4f}, "
                f"HitRate@3={val_hitrate:.4f}, "
                f"NDCG@3={val_ndcg:.4f}, "
                f"MAP@3={val_map:.4f}, "
                f"LR={current_lr[0]:.6f} "
                f"({elapsed:.1f}s)"
            )


            # A checkpoint is written only when BOTH the loss decreases and the
            # hit rate increases relative to the last saved checkpoint.
            if val_loss < best_val_loss and val_hitrate > best_hitrate:
                best_val_loss = val_loss
                best_hitrate = val_hitrate
                torch.save(model.state_dict(), model_path)
                print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

            # EarlyStopping maximises its metric, hence the negated loss.
            if early_stopper.step(-val_loss):
                print("⛔ Early stopping triggered.")
                break

        # Plot metrics
        epochs = list(range(1, len(train_losses) + 1))

        # Plot Loss Curve
        plt.figure()
        plt.plot(epochs, train_losses, label="Train Loss")
        plt.plot(epochs, val_losses, label="Val Loss")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.title("Training & Validation Loss")
        plt.legend()
        plt.grid(True)
        plt.savefig(os.path.join(MODEL_DIR, "loss_curve.png"))
        plt.close()

        # Plot HitRate@3
        plt.figure()
        plt.plot(epochs, val_hitrates, label="Val HitRate@3", color="green")
        plt.xlabel("Epoch")
        plt.ylabel("HitRate@3")
        plt.title("Validation HitRate@3")
        plt.legend()
        plt.grid(True)
        plt.savefig(os.path.join(MODEL_DIR, "hitrate_curve.png"))
        plt.close()

        # Plot Learning Rate
        plt.figure()
        plt.plot(epochs, learning_rates, label="Learning Rate", color="orange")
        plt.xlabel("Epoch")
        plt.ylabel("LR")
        plt.title("Learning Rate Schedule")
        plt.legend()
        plt.grid(True)
        plt.savefig(os.path.join(MODEL_DIR, "lr_curve.png"))
        plt.close()

        # Plot NDCG@3
        plt.figure()
        plt.plot(epochs, val_ndcgs, label="Val NDCG@3", color="purple")
        plt.xlabel("Epoch")
        plt.ylabel("NDCG@3")
        plt.title("Validation NDCG@3")
        plt.legend()
        plt.grid(True)
        plt.savefig(os.path.join(MODEL_DIR, "ndcg_curve.png"))
        plt.close()

        # Plot MAP@3
        plt.figure()
        plt.plot(epochs, val_maps, label="Val MAP@3", color="red")
        plt.xlabel("Epoch")
        plt.ylabel("MAP@3")
        plt.title("Validation MAP@3")
        plt.legend()
        plt.grid(True)
        plt.savefig(os.path.join(MODEL_DIR, "map_curve.png"))
        plt.close()

        # The plots above are written to MODEL_DIR (the message text is fixed).
        print("📊 Saved metric plots to model/")

        # Predict test set
        # Reload the best checkpoint saved during training before scoring.
        model.load_state_dict(torch.load(model_path))
        model.eval()

        val_ids = X_va["Id"]
        val_rankers = groups_va["ranker_id"]
        val_labels = y_va

        all_val_scores = []

        with torch.no_grad():
            for x_cat, x_num, _, lengths in tqdm(val_loader, desc="Predicting validation set"):
                x_num = x_num.to(device)
                x_cat = {k: v.to(device) for k, v in x_cat.items()}
                scores = model(x_cat, x_num)

                # Drop the padded tail of every group before collecting scores.
                for i in range(scores.size(0)):
                    l = lengths[i]
                    all_val_scores.append(scores[i, :l].cpu().numpy())

        all_val_scores = np.concatenate(all_val_scores)

        # NOTE(review): only the total length is checked. Attaching the scores
        # row-by-row below presumably relies on each ranker_id occupying
        # contiguous rows in X_va, in the order the dataset yields groups —
        # TODO confirm against RankDataset.
        assert len(all_val_scores) == X_va.shape[0], \
            f"Mismatch: {len(all_val_scores)} scores vs {X_va.shape[0]} rows"

        val_df = X_va.with_columns([
            pl.Series("Id", val_ids),
            pl.Series("ranker_id", val_rankers),
            pl.Series("score", all_val_scores),
            pl.Series("label", val_labels)
        ])

        # Rank 1 = highest score within each ranker_id.
        val_df = val_df.with_columns(
            pl.col("score")
            .rank(method="ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )

        val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.csv")
        val_df.write_csv(val_save_path)
        print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

        all_scores = []

        with torch.no_grad():
            for x_cat, x_num, _, lengths in tqdm(test_loader, desc="Predicting test set"):
                x_num = x_num.to(device)
                x_cat = {k: v.to(device) for k, v in x_cat.items()}
                scores = model(x_cat, x_num)

                # Drop the padded tail of every group before collecting scores.
                for i in range(scores.size(0)):
                    l = lengths[i]
                    all_scores.append(scores[i, :l].cpu().numpy())

        all_scores = np.concatenate(all_scores)

        assert len(all_scores) == X_te.shape[0], \
            f"Mismatch: {len(all_scores)} scores vs {X_te.shape[0]} rows in X_te"

        # Ids and ranker ids come from the raw test file returned by
        # prepare_data; scores are attached positionally.
        submission_df = pl.DataFrame({
            "Id": test_ids,
            "ranker_id": test_rankers,
            "score": all_scores
        })

        # Rank 1 = highest score within each ranker_id.
        submission = (
            submission_df
            .with_columns(
                pl.col("score")
                .rank(method="ordinal", descending=True)
                .over("ranker_id")
                .cast(pl.Int32)
                .alias("selected")
            )
            .select(["Id", "ranker_id", "score", "selected"])
        )

        # NOTE(review): the file name does not include run_name, so every
        # learning rate in the sweep overwrites the previous submission.
        submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.csv")
        submission.write_csv(submission_path)
        print(f"✅ Submission saved to {submission_path}")


# Run the learning-rate sweep only when this file is executed as a script.
if __name__ == "__main__":
    main()

Overwriting deeprec_validate.py


# **Legs0/Legs1 Separation**

In [None]:
%%writefile deeprec_validate.py
import os
import time
import random
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import numpy as np
import polars as pl
from tqdm import tqdm
from src.feature import feature_engineering, feature_selection
from transformers import get_scheduler
from collections import Counter

# Seed Python's, NumPy's and PyTorch's (CPU and all CUDA devices) random
# number generators with the same value at import time.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# Request deterministic cuDNN algorithms and disable cuDNN benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

class RankDataset(Dataset):
    """Dataset that yields one whole ranking group (``ranker_id``) per item.

    Args:
        X: feature frame; must contain every name in ``cat_features`` and
            ``num_features``.
        y: label series aligned row-by-row with ``X``.
        groups: frame with a ``ranker_id`` column aligned row-by-row with ``X``.
        cat_features: names of the integer-coded categorical columns.
        num_features: names of the numeric columns.

    Groups are enumerated in order of first appearance of their ``ranker_id``.
    """

    def __init__(
        self,
        X: pl.DataFrame,
        y: pl.Series,
        groups: pl.DataFrame,
        cat_features: list[str],
        num_features: list[str],
    ):
        self.cat_features = cat_features
        self.num_features = num_features
        self.X_cat = {c: X[c].to_numpy() for c in cat_features}
        self.X_num = X[self.num_features].to_numpy()
        self.y = y.to_numpy()
        self.groups = groups["ranker_id"].to_numpy()

        # ranker_id -> list of row positions, in row order.
        self.group_to_indices = defaultdict(list)
        for i, g in enumerate(self.groups):
            self.group_to_indices[g].append(i)
        self.group_keys = list(self.group_to_indices.keys())

    def __len__(self):
        """Return the number of distinct groups."""
        return len(self.group_keys)

    def __getitem__(self, idx):
        """Return ``(x_cat, x_num, y, length)`` for the ``idx``-th group.

        ``x_cat`` maps each categorical feature to a 1-D int64 tensor in which
        negative codes are replaced by 0; ``x_num`` is a float tensor with one
        row per item; ``y`` is a flat float tensor; ``length`` is the number of
        items in the group.
        """
        inds = self.group_to_indices[self.group_keys[idx]]

        # Vectorised gather + clamp instead of a per-element Python loop:
        # same values (negative codes -> 0), far less interpreter overhead.
        x_cat = {
            c: torch.as_tensor(self.X_cat[c][inds], dtype=torch.long).clamp(min=0)
            for c in self.cat_features
        }

        x_num = torch.FloatTensor(self.X_num[inds])
        y = torch.FloatTensor(self.y[inds]).reshape(-1)

        return x_cat, x_num, y, len(inds)

def collate_fn(batch):
    """Zero-pad a list of variable-length groups to a common length.

    Args:
        batch: list of ``(x_cat, x_num, y, length)`` tuples, one per group.

    Returns:
        ``(x_cat, x_num, y, lengths)`` where every tensor has a leading batch
        dimension, shorter groups are right-padded with zeros, and ``lengths``
        is an int64 tensor holding the true group sizes.
    """
    lengths = [sample[3] for sample in batch]
    longest = max(lengths)
    feat_dim = batch[0][1].shape[1]

    cat_columns = {name: [] for name in batch[0][0].keys()}
    num_rows, label_rows = [], []

    for cats, nums, labels, n_items in batch:
        shortfall = longest - n_items
        needs_padding = shortfall > 0

        for name, column in cat_columns.items():
            ids = cats[name]
            if needs_padding:
                ids = torch.cat([ids, torch.zeros(shortfall, dtype=torch.long)])
            column.append(ids)

        if needs_padding:
            nums = torch.cat([nums, torch.zeros(shortfall, feat_dim)])
            labels = torch.cat([labels, torch.zeros(shortfall)])

        num_rows.append(nums)
        label_rows.append(labels)

    return (
        {name: torch.stack(column) for name, column in cat_columns.items()},
        torch.stack(num_rows),
        torch.stack(label_rows),
        torch.tensor(lengths, dtype=torch.long),
    )

class DLRanker(nn.Module):
    """Wide-and-deep scorer for padded groups of items.

    The score of each item is the sum of a first-order ("wide") part — one
    scalar embedding per categorical feature plus a linear layer on the numeric
    features — and a deep part, an MLP over the concatenated categorical
    embeddings and numeric features. The FM second-order term is disabled and
    contributes nothing to the score.

    Args:
        cat_dims: mapping ``feature name -> number of embedding rows``.
        num_numeric_feats: width of the numeric feature vector.
        emb_dim: size of each categorical embedding.
        hidden: sizes of the MLP hidden layers, in order.
    """

    def __init__(self, cat_dims, num_numeric_feats, emb_dim=64, hidden=(512, 256, 128, 64)):
        super().__init__()

        self.embeddings = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], emb_dim, padding_idx=0) for f in cat_dims
        })
        self.linear_cat = nn.ModuleDict({
            f: nn.Embedding(cat_dims[f], 1, padding_idx=0) for f in cat_dims
        })

        # Xavier init also overwrites the row at padding_idx; per the PyTorch
        # docs that row still receives no gradient updates.
        for emb in list(self.embeddings.values()) + list(self.linear_cat.values()):
            nn.init.xavier_uniform_(emb.weight)

        self.linear_num = nn.Linear(num_numeric_feats, 1)
        nn.init.xavier_uniform_(self.linear_num.weight)

        input_dim = emb_dim * len(cat_dims) + num_numeric_feats
        layers = []
        for h in hidden:
            layers.append(nn.Linear(input_dim, h))
            layers.append(nn.BatchNorm1d(h))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.1))
            input_dim = h
        self.mlp = nn.Sequential(*layers)
        self.output = nn.Linear(input_dim, 1)

    def forward(self, x_cat, x_num):
        """Score every item of every group.

        Args:
            x_cat: dict ``feature name -> (batch, group) int64 tensor``.
            x_num: ``(batch, group, num_numeric_feats)`` float tensor.

        Returns:
            ``(batch, group)`` tensor of scores; padded positions are scored
            too and must be masked by the caller.
        """
        batch_size, group_size = x_num.shape[:2]

        # First-order part: sum of per-feature scalar weights + linear numeric.
        linear_cat_sum = torch.stack([
            self.linear_cat[f](x_cat[f]).squeeze(-1)
            for f in self.linear_cat
        ], dim=0).sum(dim=0)
        linear_out = linear_cat_sum + self.linear_num(x_num).squeeze(-1)

        # (batch, group, n_fields, emb_dim)
        embs = torch.stack([
            self.embeddings[f](x_cat[f]) for f in self.embeddings
        ], dim=2)

        # Deep part: BatchNorm1d needs 2-D input, so items are flattened into
        # one long batch and reshaped back afterwards.
        embs_cat = embs.reshape(batch_size * group_size, -1)
        x_num_flat = x_num.reshape(batch_size * group_size, -1)
        deep_input = torch.cat([embs_cat, x_num_flat], dim=1)
        deep_out = self.output(self.mlp(deep_input)).view(batch_size, group_size)

        return linear_out + deep_out


"""
def listmle_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = torch.tensor(0.0, device=scores.device, requires_grad=True)

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        _, idx = torch.sort(y, descending=True)
        s_sorted = s[idx]

        parts = (
            torch.logsumexp(s_sorted.unsqueeze(0).expand(l, -1).triu(diagonal=0), dim=1)
            - s_sorted
        )

        total_loss = total_loss + parts.sum()

    return total_loss / batch_size
"""

def hitrate_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Fraction of eligible groups with a positive label among the top-k scores.

    Args:
        scores: ``(batch, max_len)`` predicted scores (padded).
        labels: ``(batch, max_len)`` relevance labels (padded).
        lengths: true size of each group; padding beyond it is ignored.
        k: cut-off rank.
        min_group_size: groups with fewer items are left out of the average.

    Returns:
        0-dim tensor; 0.0 when no group is eligible.
    """
    per_group_hits = []
    for row in range(scores.size(0)):
        n_items = lengths[row]
        if n_items >= min_group_size:
            group_scores = scores[row][:n_items]
            group_labels = labels[row][:n_items]
            best = torch.topk(group_scores, min(n_items, k)).indices
            per_group_hits.append((group_labels[best] > 0).any().float())

    if not per_group_hits:
        return torch.tensor(0.0, device=scores.device)
    return sum(per_group_hits) / len(per_group_hits)


def ndcg_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean NDCG@k over the groups that have at least ``min_group_size`` items.

    Gains are the raw labels and the discount at rank ``r`` (1-based) is
    ``log2(r + 1)``. A group whose ideal DCG is 0 contributes an NDCG of 0.

    Args:
        scores: ``(batch, max_len)`` predicted scores (padded).
        labels: ``(batch, max_len)`` relevance labels (padded).
        lengths: true size of each group; padding beyond it is ignored.
        k: cut-off rank.
        min_group_size: groups with fewer items are left out of the average.

    Returns:
        0-dim tensor (always a tensor, so callers may use ``.item()``);
        0.0 when no group is eligible.
    """
    # Tensor accumulator keeps the return type stable even when every
    # eligible group has an ideal DCG of zero.
    total_ndcg = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        l = lengths[i]
        if l < min_group_size:
            continue

        s = scores[i][:l]
        y = labels[i][:l]

        top = min(k, l)
        idx_pred = torch.topk(s, top).indices
        ideal_gains = torch.topk(y, top).values

        discounts = torch.log2(
            torch.arange(2, 2 + len(idx_pred), device=y.device).float()
        )
        dcg = (y[idx_pred] / discounts).sum()
        idcg = (ideal_gains / discounts).sum()

        if idcg > 0:
            total_ndcg = total_ndcg + dcg / idcg
        valid_count += 1

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ndcg / valid_count


def map_at_k(scores, labels, lengths, k=3, min_group_size=10):
    """Mean average precision at ``k`` over the eligible groups.

    For one group, AP is the mean of precision@r over the ranks ``r <= k``
    that hold a positive label. A group with no positive label in its top-k
    has AP = 0 and still counts towards the mean; leaving such groups out
    would bias the metric upwards.

    Args:
        scores: ``(batch, max_len)`` predicted scores (padded).
        labels: ``(batch, max_len)`` relevance labels (padded); ``> 0`` means
            relevant.
        lengths: true size of each group; padding beyond it is ignored.
        k: cut-off rank.
        min_group_size: groups with fewer items are left out of the average.

    Returns:
        0-dim tensor; 0.0 when no group is eligible.
    """
    total_ap = torch.zeros((), device=scores.device)
    valid_count = 0

    for i in range(scores.size(0)):
        l = lengths[i]
        if l < min_group_size:
            continue

        s = scores[i][:l]
        y = labels[i][:l]

        idx_pred = torch.topk(s, min(k, l)).indices
        hits = (y[idx_pred] > 0).float()
        valid_count += 1

        n_hits = hits.sum()
        if n_hits > 0:
            ranks = torch.arange(
                1, hits.numel() + 1, device=hits.device, dtype=hits.dtype
            )
            # precision@r, kept only at the ranks that are hits.
            precision_at_hits = hits.cumsum(0) / ranks * hits
            total_ap = total_ap + precision_at_hits.sum() / n_hits

    if valid_count == 0:
        return torch.tensor(0.0, device=scores.device)
    return total_ap / valid_count



"""
def listnet_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        s_softmax = torch.nn.functional.log_softmax(s, dim=0)
        y_softmax = torch.nn.functional.softmax(y, dim=0)

        loss = -torch.sum(y_softmax * s_softmax)
        total_loss += loss

    return total_loss / batch_size
"""

def ranknet_loss(scores, labels, lengths):
    """Pairwise RankNet loss summed over pairs and averaged over groups.

    For every ordered pair ``(i, j)`` inside a group with
    ``labels[i] > labels[j]`` the term ``-logsigmoid(scores[i] - scores[j])``
    is added; the grand total is divided by the number of groups in the batch.
    """
    n_groups = scores.size(0)
    accumulated = 0.0

    for row in range(n_groups):
        n_items = lengths[row]
        group_scores = scores[row][:n_items]
        group_labels = labels[row][:n_items]

        # prefers[i, j] == 1 where item i should outrank item j.
        prefers = (group_labels[:, None] > group_labels[None, :]).float()
        margins = group_scores[:, None] - group_scores[None, :]

        accumulated += (-prefers * torch.nn.functional.logsigmoid(margins)).sum()

    return accumulated / n_groups

def xranknet_loss(scores, labels, lengths, temperature=1.0, margin=0.2, focus_topk=3):
    """Weighted pairwise logistic loss focused on the top-labelled items.

    Within each group a pair ``(i, j)`` is used when ``labels[i] > labels[j]``
    and at least one of the two items is among the ``focus_topk`` items with
    the highest labels. Each pair is weighted by ``|labels[i] - labels[j]|``;
    the score difference is reduced by ``margin`` (when positive) and divided
    by ``temperature`` before ``-logsigmoid``. The result is the weighted sum
    divided by the total weight across the batch.

    Groups with fewer than two items or without any usable pair are skipped;
    if nothing is usable a zero tensor that requires grad is returned.
    """
    loss_sum = 0.0
    weight_sum = 0

    for row in range(scores.size(0)):
        n_items = lengths[row]
        if n_items < 2:
            continue

        group_scores = scores[row][:n_items]
        group_labels = labels[row][:n_items]

        # Mark the items with the highest ground-truth labels.
        in_topk = torch.zeros_like(group_labels, dtype=torch.bool)
        in_topk[torch.topk(group_labels, min(focus_topk, n_items)).indices] = True

        logits = group_scores[:, None] - group_scores[None, :]
        label_gap = group_labels[:, None] - group_labels[None, :]

        usable = (label_gap > 0) & (in_topk[:, None] | in_topk[None, :])
        if usable.sum() == 0:
            continue

        weights = (label_gap.abs() * usable).float()

        if margin > 0.0:
            logits = logits - margin
        logits = logits / temperature

        loss_sum += (-F.logsigmoid(logits) * weights).sum()
        weight_sum += weights.sum()

    if weight_sum == 0:
        return torch.tensor(0.0, device=scores.device, requires_grad=True)

    return loss_sum / weight_sum


"""
def bpr_loss(scores, labels, lengths):
    batch_size = scores.size(0)
    total_loss = 0.0
    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        pos_mask = y > 0
        neg_mask = y <= 0

        pos_scores = s[pos_mask]
        neg_scores = s[neg_mask]

        if len(pos_scores) == 0 or len(neg_scores) == 0:
            continue

        pair_diff = pos_scores.unsqueeze(1) - neg_scores.unsqueeze(0)
        loss = -F.logsigmoid(pair_diff).mean()
        total_loss += loss

    return total_loss / batch_size
"""

"""
def lambdarank_loss(scores, labels, lengths, eps=1e-10):
    batch_size = scores.size(0)
    total_loss = 0.0

    for i in range(batch_size):
        l = lengths[i]
        s = scores[i][:l]
        y = labels[i][:l]

        if l < 2:
            continue

        _, rank_order = torch.sort(s, descending=True)
        ideal_order = torch.sort(y, descending=True).indices

        gain = 2 ** y - 1
        discount = torch.log2(torch.arange(2, l + 2, device=y.device).float())

        ideal_dcg = (gain[ideal_order] / discount).sum()
        if ideal_dcg < eps:
            continue

        delta_ndcg = torch.zeros((l, l), device=y.device)

        for i_idx in range(l):
            for j_idx in range(l):
                if y[i_idx] > y[j_idx]:
                    change = (
                        (1.0 / torch.log2(rank_order[i_idx] + 2.0))
                        - (1.0 / torch.log2(rank_order[j_idx] + 2.0))
                    ).abs()
                    delta = (2 ** y[i_idx] - 2 ** y[j_idx]).abs() * change
                    delta_ndcg[i_idx, j_idx] = delta / ideal_dcg

        score_diffs = s.unsqueeze(1) - s.unsqueeze(0)
        lambdas = -F.logsigmoid(score_diffs) * delta_ndcg
        total_loss += lambdas.sum()

    return total_loss / batch_size
"""

def train_one_epoch(model, loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    The scheduler is stepped once per batch, right after the optimizer.
    """
    model.train()
    running_loss = 0.0

    progress = tqdm(loader, desc="Training", leave=False)
    for x_cat, x_num, y, lengths in progress:
        x_num = x_num.to(device)
        y = y.to(device)
        if not isinstance(lengths, torch.Tensor):
            lengths = torch.tensor(lengths)
        lengths = lengths.detach().clone().to(device)
        x_cat = {name: tensor.to(device) for name, tensor in x_cat.items()}

        optimizer.zero_grad()
        batch_loss = xranknet_loss(
            model(x_cat, x_num), y, lengths, temperature=1.0, margin=0.0, focus_topk=3
        )
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    return running_loss / len(loader)


def validate(model, loader, device, min_group_size=10):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: ranker called as ``model(x_cat, x_num)``.
        loader: iterable of ``(x_cat, x_num, y, lengths)`` batches.
        device: device the batch tensors are moved to.
        min_group_size: groups with fewer items are excluded from the
            ranking metrics (they still take part in the loss).

    Returns:
        ``(loss, hitrate@3, ndcg@3, map@3)``. The loss is a Python float
        averaged over groups; the three metrics are 0-dim tensors averaged
        over the groups that have at least ``min_group_size`` items.
    """
    model.eval()
    total_hitrate = 0
    total_ndcg = 0
    total_map = 0
    total_loss = 0
    count = 0
    eligible_count = 0

    with torch.no_grad():
        for x_cat, x_num, y, lengths in tqdm(loader, desc="Validating", leave=False):
            x_num, y = x_num.to(device), y.to(device)
            # Same handling as in training: do not re-wrap an existing tensor.
            if not isinstance(lengths, torch.Tensor):
                lengths = torch.tensor(lengths)
            lengths = lengths.detach().clone().to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}

            scores = model(x_cat, x_num)

            hitrate = hitrate_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            ndcg = ndcg_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            map3 = map_at_k(scores, y, lengths, k=3, min_group_size=min_group_size)
            loss = xranknet_loss(scores, y, lengths, temperature=1.0, margin=0.0, focus_topk=3)

            # The metric helpers average over the eligible groups of the batch,
            # so each batch is weighted by its number of eligible groups.
            # Weighting by batch size would drag the result towards zero
            # whenever a batch contains small groups.
            n_eligible = int((lengths >= min_group_size).sum())
            total_hitrate += hitrate * n_eligible
            total_ndcg += ndcg * n_eligible
            total_map += map3 * n_eligible
            eligible_count += n_eligible

            batch_size = scores.size(0)
            total_loss += loss.item() * batch_size
            count += batch_size

    mean_loss = total_loss / count
    denom = max(eligible_count, 1)  # no eligible group -> all metrics are 0

    def _as_metric(total):
        # Always hand back a tensor so callers can rely on .item().
        return torch.as_tensor(total / denom, dtype=torch.float32)

    return mean_loss, _as_metric(total_hitrate), _as_metric(total_ndcg), _as_metric(total_map)


def build_cat_dims(X, cat_features):
    """Return ``{feature: embedding rows}`` for each categorical feature.

    The row count is the largest code in the column plus one; a column whose
    largest code is negative gets a single row.
    """
    return {name: max(X[name].max(), 0) + 1 for name in cat_features}


def normalize_numeric_features(X, numeric_features, train_mask):
    """Standardise numeric columns using statistics of the training rows only.

    Args:
        X: polars frame holding the columns listed in ``numeric_features``.
        numeric_features: names of the columns to standardise.
        train_mask: boolean array selecting the rows used to estimate the
            mean and standard deviation.

    Returns:
        The frame with every listed column replaced by ``(x - mean) / std``.
        A zero (or undefined) standard deviation is replaced by 1.0. NaN
        entries are ignored when estimating the statistics and stay NaN in
        the output, so one NaN no longer turns the whole column into NaN.
    """
    for col in numeric_features:
        values = X[col].to_numpy()
        train_vals = values[train_mask]

        # NaN-aware statistics: identical to mean()/std() for NaN-free data.
        mean = np.nanmean(train_vals)
        std = np.nanstd(train_vals)
        if not std > 0:
            std = 1.0

        X = X.with_columns(pl.Series(col, (values - mean) / std).alias(col))
    return X


def load_or_prepare_data(
    data_dir,
    train_file="train.parquet",
    test_file="test.parquet",
    cache_file="feature_engineered_val.parquet",
):
    """Return the feature-engineered frame plus the test ``Id``/``ranker_id``.

    When ``cache_file`` exists in ``data_dir`` it is read directly. Otherwise
    train and test are read, concatenated (test gets a constant ``selected``
    column of 0), passed through ``feature_engineering``, null-filled (0 for
    numeric columns, ``"missing"`` for string columns) and written to the
    cache. In both cases the test ids are taken from the raw test file.

    Returns:
        ``(df, test_ids, test_rankers)``.
    """
    cache_path = os.path.join(data_dir, cache_file)
    test_path = os.path.join(data_dir, test_file)

    if os.path.exists(cache_path):
        print(f"Loading cached feature-engineered data from {cache_path} ...")
        df = pl.read_parquet(cache_path)
        raw_test = pl.read_parquet(test_path).drop("__index_level_0__")
        return df, raw_test["Id"], raw_test["ranker_id"]

    print("Cached file not found, processing raw data ...")

    train = pl.read_parquet(os.path.join(data_dir, train_file)).drop("__index_level_0__")
    test = pl.read_parquet(test_path).drop("__index_level_0__")
    test = test.with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))

    test_ids, test_rankers = test["Id"], test["ranker_id"]

    df = feature_engineering(pl.concat((train, test)), full=True)

    numeric_cols = df.select(pl.selectors.numeric()).columns
    string_cols = df.select(pl.selectors.string()).columns
    null_fills = [pl.col(c).fill_null(0) for c in numeric_cols]
    null_fills += [pl.col(c).fill_null("missing") for c in string_cols]
    df = df.with_columns(null_fills)

    os.makedirs(data_dir, exist_ok=True)
    df.write_parquet(cache_path)
    print(f"Saved feature-engineered data to {cache_path}")

    return df, test_ids, test_rankers


def prepare_data(data_dir):
    """Load the feature table and turn it into model-ready inputs.

    Steps, in order: load (or build) the feature-engineered frame, run
    ``feature_selection``, re-attach the ``Id`` column, dense-rank encode the
    categorical columns (nulls become -1) and standardise the numeric columns
    with statistics of the leading ``train_size`` rows.

    Returns:
        ``(X, y, groups, cat_features, num_features, train_size, test_ids,
        test_rankers)``.
    """
    df, test_ids, test_rankers = load_or_prepare_data(data_dir)

    ids = df["Id"]
    X, y, groups, cat_features, num_features = feature_selection(df)
    X = X.with_columns(pl.Series("Id", ids[: X.shape[0]]))

    # Dense rank minus one yields 0-based integer codes per categorical column.
    encoded_columns = [
        (pl.col(name).rank("dense") - 1).fill_null(-1).cast(pl.Int32)
        for name in cat_features
    ]
    X = X.with_columns(encoded_columns)

    # Rows before this index are treated as training rows.
    train_size = 18145372
    is_train_row = np.arange(X.height) < train_size
    X = normalize_numeric_features(X, num_features, is_train_row)

    return X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers


class EarlyStopping:
    """Track a higher-is-better metric and report when it has stalled."""

    def __init__(self, patience=3, min_delta=0.00005):
        self.patience, self.min_delta = patience, min_delta
        self.best = float("-inf")
        self.counter = 0

    def step(self, metric):
        """Update the tracker with ``metric``.

        Returns True once ``patience`` consecutive calls have failed to beat
        the best value by more than ``min_delta``.
        """
        threshold = self.best + self.min_delta
        if metric > threshold:
            self.best, self.counter = metric, 0
        else:
            self.counter += 1
        return self.counter >= self.patience


def plot_group_size_distribution(dataset, save_path=None):
    """Plot how many items each group contains as a log-scale histogram.

    The figure is saved to ``save_path`` when one is given and shown
    interactively otherwise; it is closed in both cases.
    """
    items_per_group = list(map(len, dataset.group_to_indices.values()))

    plt.figure()
    plt.hist(items_per_group, bins=100, log=True)
    plt.xlabel("Group size (number of items per ranker_id)")
    plt.ylabel("Frequency (log scale)")
    plt.title("Distribution of Group Sizes")
    plt.grid(True)

    if not save_path:
        plt.show()
    else:
        plt.savefig(save_path)
    plt.close()

def split_by_is_one_way(
    X: pl.DataFrame, y: pl.Series, groups: pl.DataFrame, value: int
) -> tuple[pl.DataFrame, pl.Series, pl.DataFrame]:
    """Keep only the rows whose ``is_one_way`` column equals ``value``.

    The same row mask is applied to the features, the labels and the group
    frame, so the three outputs stay aligned.
    """
    keep = X["is_one_way"] == value
    X_sub = X.filter(keep)
    y_sub = y.filter(keep)
    groups_sub = groups.filter(keep)
    return X_sub, y_sub, groups_sub

# Remove every column whose name mentions the second leg ("legs1" / "leg1").
def drop_leg_cols(df: pl.DataFrame) -> pl.DataFrame:
    """Return ``df`` without the columns named after ``legs1`` or ``leg1``."""
    return df.select(
        [name for name in df.columns if not ("legs1" in name or "leg1" in name)]
    )

def _save_curve_plot(epochs, curves, ylabel, title, save_path):
    """Draw one or more per-epoch curves on a fresh figure and save it to ``save_path``.

    ``curves`` is a list of ``(values, label, color)`` tuples; a ``color`` of
    ``None`` leaves the choice to matplotlib's default colour cycle.
    """
    plt.figure()
    for values, label, color in curves:
        if color is None:
            plt.plot(epochs, values, label=label)
        else:
            plt.plot(epochs, values, label=label, color=color)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.savefig(save_path)
    plt.close()


def main():
    """Train the DL ranker on one ``is_one_way`` subset, then score validation and test rows.

    The function:
      1. loads the prepared features and replaces NaN with 0 in Float64 columns;
      2. slices the rows positionally into train / validation / test and keeps
         only the rows of the chosen ``is_one_way`` value;
      3. trains ``DLRanker`` with AdamW and a linear warmup schedule, saving the
         weights whenever both validation loss and HitRate@3 improve, and
         stopping early when validation loss stalls;
      4. saves metric plots, full validation predictions and a submission file
         in which rows outside the chosen subset have a NaN score and
         ``selected == 0``.
    """
    DATA_DIR = "./data"
    MODEL_DIR = "model"
    SUBMIT_DIR = "submission"

    best_val_loss = float("inf")
    best_hitrate = 0.0
    model_path = os.path.join(MODEL_DIR, "best_dl_ranker.pt")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Per-epoch history, used only for the plots written after training.
    train_losses = []
    val_losses = []
    val_hitrates = []
    val_ndcgs = []
    val_maps = []
    learning_rates = []

    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(SUBMIT_DIR, exist_ok=True)

    early_stopper = EarlyStopping(patience=3, min_delta=0.001)

    X, y, groups, cat_features, num_features, train_size, test_ids, test_rankers = prepare_data(DATA_DIR)

    # Replace NaN with 0 in every Float64 column (nulls and other dtypes are left untouched).
    float_cols = [c for c, t in zip(X.columns, X.dtypes) if t == pl.Float64]
    X = X.with_columns(
        [
            pl.when(pl.col(col).is_nan()).then(0).otherwise(pl.col(col)).alias(col)
            for col in float_cols
        ]
    )

    # Positional split: [0, n1) train, [n1, n2) validation, [n2, end) test.
    # NOTE(review): this relies on the prepared rows being ordered train-then-test
    # and on n1 falling on a group boundary — confirm against prepare_data.
    n1 = 16487352
    # n1 = 17487300
    n2 = train_size

    X_tr, X_va, X_te = X[:n1], X[n1:n2], X[n2:]
    y_tr, y_va, y_te = y[:n1], y[n1:n2], y[n2:]
    groups_tr, groups_va, groups_te = groups[:n1], groups[n1:n2], groups[n2:]

    print("\n📊 Distribution of is_one_way in the training set:")
    print(X_tr["is_one_way"].value_counts().sort("is_one_way"))

    print("\n📊 Distribution of is_one_way in the validation set:")
    print(X_va["is_one_way"].value_counts().sort("is_one_way"))

    print("\n📊 Distribution of is_one_way in the test set:")
    print(X_te["is_one_way"].value_counts().sort("is_one_way"))

    X_tr_1, y_tr_1, groups_tr_1 = split_by_is_one_way(X_tr, y_tr, groups_tr, 1)
    X_tr_0, y_tr_0, groups_tr_0 = split_by_is_one_way(X_tr, y_tr, groups_tr, 0)

    X_va_1, y_va_1, groups_va_1 = split_by_is_one_way(X_va, y_va, groups_va, 1)
    X_va_0, y_va_0, groups_va_0 = split_by_is_one_way(X_va, y_va, groups_va, 0)

    X_te_1, y_te_1, groups_te_1 = split_by_is_one_way(X_te, y_te, groups_te, 1)
    X_te_0, y_te_0, groups_te_0 = split_by_is_one_way(X_te, y_te, groups_te, 0)

    # Exactly one of the three "To use" configurations below should be active.

    # To use
    # X_tr_to_use = drop_leg_cols(X_tr_1)
    # y_tr_to_use, groups_tr_to_use = y_tr_1, groups_tr_1

    # X_va_to_use = drop_leg_cols(X_va_1)
    # y_va_to_use, groups_va_to_use = y_va_1, groups_va_1

    # X_te_to_use = drop_leg_cols(X_te_1)
    # y_te_to_use, groups_te_to_use = y_te_1, groups_te_1

    # To use
    # X_tr_to_use, y_tr_to_use, groups_tr_to_use = X_tr_1, y_tr_1, groups_tr_1
    # X_va_to_use, y_va_to_use, groups_va_to_use = X_va_1, y_va_1, groups_va_1
    # X_te_to_use, y_te_to_use, groups_te_to_use = X_te_1, y_te_1, groups_te_1

    # To use
    # is_one_way_value must name the subset selected here: it is reused further
    # down to locate the scored rows inside the full test set.
    is_one_way_value = 0
    X_tr_to_use, y_tr_to_use, groups_tr_to_use = X_tr_0, y_tr_0, groups_tr_0
    X_va_to_use, y_va_to_use, groups_va_to_use = X_va_0, y_va_0, groups_va_0
    X_te_to_use, y_te_to_use, groups_te_to_use = X_te_0, y_te_0, groups_te_0

    # Inspect group sizes in validation set
    val_rankers = groups_va_to_use["ranker_id"].to_numpy()
    val_group_counts = Counter(val_rankers)

    print("\n📋 Validation group statistics:")
    # Save detailed group info to file
    group_info_path = os.path.join(MODEL_DIR, "validation_groups_info.txt")
    with open(group_info_path, "w") as f:
        f.write(f"Total groups: {len(val_group_counts)}\n\n")
        f.write("GroupID\tSize\tStartIdx\tEndIdx\n")

        # Row positions (within the validation subset) of every group.
        group_to_rows = defaultdict(list)
        for i, g in enumerate(val_rankers):
            group_to_rows[g].append(i)

        for group_id, indices in sorted(group_to_rows.items()):
            start_idx = indices[0]
            end_idx = indices[-1]
            size = len(indices)
            f.write(f"{group_id}\t{size}\t{start_idx}\t{end_idx}\n")

    print(f"📁 Saved validation group info to {group_info_path}")

    # Embedding cardinalities are taken from the full frame, not only the subset in use.
    cat_dims = build_cat_dims(X, cat_features)
    num_numeric_feats = len(num_features)

    train_dataset = RankDataset(X_tr_to_use, y_tr_to_use, groups_tr_to_use, cat_features, num_features)
    val_dataset = RankDataset(X_va_to_use, y_va_to_use, groups_va_to_use, cat_features, num_features)
    test_dataset = RankDataset(X_te_to_use, y_te_to_use, groups_te_to_use, cat_features, num_features)

    plot_group_size_distribution(train_dataset, save_path=os.path.join(MODEL_DIR, "group_sizes.png"))
    print("📊 Group size distribution saved.")

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

    print("✅ Data loaders created.")

    num_epochs = 10
    num_training_steps = num_epochs * len(train_loader)
    num_warmup_steps = int(0.1 * num_training_steps)

    learning_rate = 1e-3

    model = DLRanker(cat_dims, num_numeric_feats).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        start_time = time.time()

        train_loss = train_one_epoch(model, train_loader, optimizer, device, scheduler)
        val_loss, val_hitrate, val_ndcg, val_map = validate(model, val_loader, device, min_group_size=10)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        val_hitrates.append(val_hitrate.item())
        val_ndcgs.append(val_ndcg.item())
        val_maps.append(val_map.item())

        current_lr = scheduler.get_last_lr()
        learning_rates.append(current_lr[0])

        elapsed = time.time() - start_time
        print(
            f"Epoch {epoch+1}: "
            f"Train Loss={train_loss:.4f}, "
            f"Val Loss={val_loss:.4f}, "
            f"HitRate@3={val_hitrate:.4f}, "
            f"NDCG@3={val_ndcg:.4f}, "
            f"MAP@3={val_map:.4f}, "
            f"LR={current_lr[0]:.6f} "
            f"({elapsed:.1f}s)"
        )

        # A checkpoint is written only when BOTH criteria improve on their best values.
        if val_loss < best_val_loss and val_hitrate > best_hitrate:
            best_val_loss = val_loss
            best_hitrate = val_hitrate
            torch.save(model.state_dict(), model_path)
            print("💾 Model improved (loss ↓ and hitrate ↑), saved.")

        # EarlyStopping maximises its metric, hence the negated loss.
        if early_stopper.step(-val_loss):
            print("⛔ Early stopping triggered.")
            break

    # Plot metrics
    epochs = list(range(1, len(train_losses) + 1))

    _save_curve_plot(
        epochs,
        [(train_losses, "Train Loss", None), (val_losses, "Val Loss", None)],
        "Loss",
        "Training & Validation Loss",
        os.path.join(MODEL_DIR, "loss_curve.png"),
    )
    _save_curve_plot(
        epochs,
        [(val_hitrates, "Val HitRate@3", "green")],
        "HitRate@3",
        "Validation HitRate@3",
        os.path.join(MODEL_DIR, "hitrate_curve.png"),
    )
    _save_curve_plot(
        epochs,
        [(learning_rates, "Learning Rate", "orange")],
        "LR",
        "Learning Rate Schedule",
        os.path.join(MODEL_DIR, "lr_curve.png"),
    )
    _save_curve_plot(
        epochs,
        [(val_ndcgs, "Val NDCG@3", "purple")],
        "NDCG@3",
        "Validation NDCG@3",
        os.path.join(MODEL_DIR, "ndcg_curve.png"),
    )
    _save_curve_plot(
        epochs,
        [(val_maps, "Val MAP@3", "red")],
        "MAP@3",
        "Validation MAP@3",
        os.path.join(MODEL_DIR, "map_curve.png"),
    )

    print("📊 Saved metric plots to model/")

    # Reload the best checkpoint before scoring validation and test rows.
    model.load_state_dict(torch.load(model_path))
    model.eval()

    val_ids = X_va_to_use["Id"]
    val_rankers = groups_va_to_use["ranker_id"]
    val_labels = y_va_to_use

    all_val_scores = []

    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(val_loader, desc="Predicting validation set"):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)

            # Keep only the first `lengths[i]` scores of each row of the batch.
            for i in range(scores.size(0)):
                l = lengths[i]
                all_val_scores.append(scores[i, :l].cpu().numpy())

    all_val_scores = np.concatenate(all_val_scores)

    # NOTE(review): the concatenated scores are attached to rows by position, which
    # presumes the loader yields items in the same order as the frame's rows.
    assert len(all_val_scores) == X_va_to_use.shape[0], \
        f"Mismatch: {len(all_val_scores)} scores vs {X_va_to_use.shape[0]} rows"

    val_df = X_va_to_use.with_columns([
        pl.Series("Id", val_ids),
        pl.Series("ranker_id", val_rankers),
        pl.Series("score", all_val_scores),
        pl.Series("label", val_labels)
    ])

    val_df = val_df.with_columns(
        pl.col("score")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    val_save_path = os.path.join(MODEL_DIR, "validation_preds_dl_full.parquet")
    val_df.write_parquet(val_save_path)
    print(f"✅ Saved full validation predictions (with features) to {val_save_path}")

    all_scores = []

    with torch.no_grad():
        for x_cat, x_num, _, lengths in tqdm(test_loader, desc="Predicting test set"):
            x_num = x_num.to(device)
            x_cat = {k: v.to(device) for k, v in x_cat.items()}
            scores = model(x_cat, x_num)

            for i in range(scores.size(0)):
                l = lengths[i]
                all_scores.append(scores[i, :l].cpu().numpy())

    all_scores = np.concatenate(all_scores)

    # Find the rows in the full test set where we actually made predictions,
    # i.e. the rows of the is_one_way subset the test loader was built from.
    mask_scored = (X_te["is_one_way"].to_numpy() == is_one_way_value)
    idx_scored = np.where(mask_scored)[0]

    assert len(all_scores) == len(idx_scored), \
        f"Mismatch: {len(all_scores)} predictions vs {len(idx_scored)} rows with is_one_way == {is_one_way_value}"

    # Initialize scores for the entire test set as NaN and fill only predicted rows
    all_scores_full = np.full(len(test_ids), np.nan, dtype=np.float32)
    all_scores_full[idx_scored] = all_scores

    # Build the full submission dataframe; add a stable row index to preserve original order
    row_idx = pl.Series("row_idx", np.arange(len(test_ids), dtype=np.int64))

    submission_df = pl.DataFrame({
        "row_idx": row_idx,
        "Id": test_ids,
        "ranker_id": test_rankers,
        "score": all_scores_full,
    })

    # Compute 'selected' only where a score exists; keep others at 0.
    # Unscored rows hold NaN, which Polars does not treat as null, so the test
    # must be is_not_nan(): is_not_null() is true for NaN and would rank them too.
    submission_df = submission_df.with_columns(
        pl.when(pl.col("score").is_not_nan())
          .then(
              pl.col("score")
              .rank(method="ordinal", descending=True)
              .over("ranker_id")
              .cast(pl.Int32)
          )
          .otherwise(0)
          .alias("selected")
    )

    # Ensure original order by row_idx, then select final columns
    submission = (
        submission_df
        .sort("row_idx")       # keeps original Id order
        .select(["Id", "ranker_id", "score", "selected"])
    )

    submission_path = os.path.join(SUBMIT_DIR, "submission_dl_ranker.parquet")
    submission.write_parquet(submission_path)
    print(f"✅ Submission saved to {submission_path}")


if __name__ == "__main__":
    main()

Overwriting deeprec_validate.py


Run log for the `is_one_way = 1` subset, with the `legs1` columns kept:

In [None]:
--- Epoch 1/10 ---
Validating:   0%|          | 0/332 [00:00<?, ?it/s]/home/ionut/test_whisper/test2/deeprec_validate.py:446: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.detach().clone() or sourceTensor.detach().clone().requires_grad_(True), rather than torch.tensor(sourceTensor).
  lengths = torch.tensor(lengths).to(device)
Epoch 1: Train Loss=0.3710, Val Loss=0.3602, HitRate@3=0.4649, NDCG@3=0.3531, MAP@3=0.6646, LR=0.001000 (292.6s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 2/10 ---
Epoch 2: Train Loss=0.3145, Val Loss=0.3105, HitRate@3=0.5272, NDCG@3=0.4234, MAP@3=0.7303, LR=0.000889 (292.2s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 3/10 ---
Epoch 3: Train Loss=0.2890, Val Loss=0.2891, HitRate@3=0.5440, NDCG@3=0.4394, MAP@3=0.7309, LR=0.000778 (292.0s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 4/10 ---
Epoch 4: Train Loss=0.2730, Val Loss=0.2990, HitRate@3=0.5272, NDCG@3=0.4176, MAP@3=0.7124, LR=0.000667 (293.3s)

--- Epoch 5/10 ---
Epoch 5: Train Loss=0.2596, Val Loss=0.2897, HitRate@3=0.5543, NDCG@3=0.4514, MAP@3=0.7424, LR=0.000556 (292.6s)

--- Epoch 6/10 ---
Epoch 6: Train Loss=0.2486, Val Loss=0.2816, HitRate@3=0.5537, NDCG@3=0.4523, MAP@3=0.7458, LR=0.000444 (292.4s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 7/10 ---
Epoch 7: Train Loss=0.2357, Val Loss=0.2850, HitRate@3=0.5543, NDCG@3=0.4512, MAP@3=0.7415, LR=0.000333 (291.8s)

--- Epoch 8/10 ---
Epoch 8: Train Loss=0.2241, Val Loss=0.2806, HitRate@3=0.5533, NDCG@3=0.4501, MAP@3=0.7414, LR=0.000222 (292.3s)

--- Epoch 9/10 ---
Epoch 9: Train Loss=0.2142, Val Loss=0.2806, HitRate@3=0.5545, NDCG@3=0.4516, MAP@3=0.7430, LR=0.000111 (292.7s)
💾 Model improved (loss ↓ and hitrate ↑), saved.

--- Epoch 10/10 ---
Epoch 10: Train Loss=0.2030, Val Loss=0.2857, HitRate@3=0.5501, NDCG@3=0.4476, MAP@3=0.7420, LR=0.000000 (292.1s)

In [None]:
%%writefile check.py
import pyarrow.parquet as pq

def get_columns(filename):
    """Return the column names from a parquet file's schema (no row data is read)."""
    return pq.read_schema(filename).names

# Column inventories of the two feature files
cols_full = get_columns("feateng_full.parquet")
cols_not_full = get_columns("feateng_not_full.parquet")


def _mentioning(columns, token):
    """Return the entries of ``columns`` whose name contains ``token``."""
    return [name for name in columns if token in name]


# Leg-specific subsets for each file
legs0_full = _mentioning(cols_full, "legs0")
legs1_full = _mentioning(cols_full, "legs1")
legs0_not_full = _mentioning(cols_not_full, "legs0")
legs1_not_full = _mentioning(cols_not_full, "legs1")

# (heading, columns, trailing separator) for each report section, in output order
report_sections = [
    ("Columns in feateng_full.parquet:", cols_full, "\n\n"),
    ("Columns in feateng_not_full.parquet:", cols_not_full, "\n\n"),
    ("Columns in feateng_full containing 'legs0':", legs0_full, "\n"),
    ("Columns in feateng_full containing 'legs1':", legs1_full, "\n\n"),
    ("Columns in feateng_not_full containing 'legs0':", legs0_not_full, "\n"),
    ("Columns in feateng_not_full containing 'legs1':", legs1_not_full, "\n"),
]

# Write every section as "<heading>\n<comma-separated columns><separator>"
with open("columns_info.txt", "w", encoding="utf-8") as f:
    for heading, columns, separator in report_sections:
        f.write(f"{heading}\n{', '.join(columns)}{separator}")

print("File 'columns_info.txt' has been saved.")

Overwriting check.py


In [None]:
%%writefile create_legs0_only.py
import pyarrow.parquet as pq
import pandas as pd

# Step 1: Inspect the schema only, so the column names are known before any data is loaded
all_columns = pq.read_schema("feateng_not_full.parquet").names

# Step 2: Collect every column whose name does not mention 'legs1'
columns_to_keep = []
for col in all_columns:
    if "legs1" in col:
        continue
    columns_to_keep.append(col)

# Step 3: Read just those columns from the file
df_not_full_legs0_only = pd.read_parquet(
    "feateng_not_full.parquet",
    columns=columns_to_keep,
)

# Step 4: Persist the reduced dataset
df_not_full_legs0_only.to_parquet(
    "feateng_not_full_legs0_only.parquet",
    index=False,
)

print(f"Saved 'feateng_not_full_legs0_only.parquet' with {len(columns_to_keep)} columns.")

Overwriting create_legs0_only.py


# **Ensemble**

In [None]:
import polars as pl
import os

def compute_confidence(df: pl.DataFrame) -> pl.DataFrame:
    """Add a ``confidence`` column derived from each row's rank within its group.

    For every ``ranker_id`` group the rank in ``selected`` is mapped linearly so
    that rank 1 gives 1.0 and the last rank gives (approximately) 0.0. The
    ``1e-8`` term keeps the division defined for single-row groups.
    """
    rank_offset = pl.col("selected") - 1
    span = pl.col("group_size") - 1 + 1e-8
    confidence = 1.0 - (rank_offset / span)

    return (
        df.with_columns([pl.len().over("ranker_id").alias("group_size")])
        .with_columns([confidence.alias("confidence")])
        .drop("group_size")  # temporary helper column
    )

def process_file(input_path: str):
    """Read a submission (.csv or .parquet), add ``confidence`` and save it as CSV.

    The output name is the input name without its extension plus
    ``_with_confidence.csv``; the suffix is not repeated when the input name
    already ends with it. Files with any other extension are skipped.
    """
    print(f"🔄 Processing: {input_path}")

    is_csv = input_path.endswith(".csv")
    if not is_csv and not input_path.endswith(".parquet"):
        print(f"⚠️ Skipping unsupported file: {input_path}")
        return

    df = pl.read_csv(input_path) if is_csv else pl.read_parquet(input_path)
    df_conf = compute_confidence(df)

    stem, _ = os.path.splitext(input_path)
    suffix = "" if stem.endswith("_with_confidence") else "_with_confidence"
    output_path = f"{stem}{suffix}.csv"

    df_conf.write_csv(output_path)
    print(f"✅ Saved: {output_path}\n")



def main():
    """Add a confidence column to each listed submission file that exists on disk."""
    files = [
        "submission_20250721083807_0.52244.parquet",
        "submission_20250724032338_0.51345.parquet",
        "submission_20250725083055_0.52391.parquet",
        "submission_20250727084025_0.52795.parquet",
        "submission_20250802074816_0.52603.parquet",
        "submission_20250804001151_0.51244.parquet",
        "submission_20250807032439_0.52538.parquet",
        "submission_dl_ranker_0.48755.parquet"
    ]

    for file in files:
        # Report missing inputs and carry on with the remaining files.
        if not os.path.exists(file):
            print(f"❌ File not found: {file}")
            continue
        process_file(file)


if __name__ == "__main__":
    main()

🔄 Processing: submission_20250721083807_0.52244.parquet
✅ Saved: submission_20250721083807_0.52244_with_confidence.csv

🔄 Processing: submission_20250724032338_0.51345.parquet
✅ Saved: submission_20250724032338_0.51345_with_confidence.csv

🔄 Processing: submission_20250725083055_0.52391.parquet
✅ Saved: submission_20250725083055_0.52391_with_confidence.csv

🔄 Processing: submission_20250727084025_0.52795.parquet
✅ Saved: submission_20250727084025_0.52795_with_confidence.csv

🔄 Processing: submission_20250802074816_0.52603.parquet
✅ Saved: submission_20250802074816_0.52603_with_confidence.csv

🔄 Processing: submission_20250804001151_0.51244.parquet
✅ Saved: submission_20250804001151_0.51244_with_confidence.csv

🔄 Processing: submission_20250807032439_0.52538.parquet
✅ Saved: submission_20250807032439_0.52538_with_confidence.csv

🔄 Processing: submission_dl_ranker_0.48755.parquet
✅ Saved: submission_dl_ranker_0.48755_with_confidence.csv



In [None]:
import polars as pl
import os

def compute_confidence(df: pl.DataFrame, alpha: float = 0.7, k: int = 5) -> pl.DataFrame:
    """Add a ``confidence`` column blending a positional score with a scaled RRF score.

    ``alpha`` is the weight of the positional score and ``1 - alpha`` that of
    the reciprocal-rank-fusion score with constant ``k``. Both components
    equal 1.0 for ``selected == 1``. The ``selected`` column is kept.
    """
    # Rank position mapped linearly within the ranker_id group (1.0 for selected = 1)
    positional = 1.0 - ((pl.col("selected") - 1) / (pl.col("group_size") - 1 + 1e-8))

    # RRF value divided by its value at selected = 1, so the top rank scores 1.0
    rrf_peak = 1.0 / (k + 1)
    rrf_scaled = (1.0 / (k + pl.col("selected"))) / rrf_peak

    # Convex combination of the two components
    blended = alpha * pl.col("confidence") + (1 - alpha) * pl.col("rrf_score")

    return (
        df.with_columns([pl.len().over("ranker_id").alias("group_size")])
        .with_columns(positional.alias("confidence"))
        .with_columns(rrf_scaled.alias("rrf_score"))
        .with_columns(blended.alias("confidence"))
        .drop(["group_size", "rrf_score"])  # temporary helper columns
    )

def process_file(input_path: str, alpha: float = 0.7, k: int = 5):
    """Read a submission (.csv or .parquet), add ``confidence`` and save it as CSV.

    ``alpha`` and ``k`` are forwarded to ``compute_confidence``. The output name
    is the input name without its extension plus ``_with_confidence.csv``; the
    suffix is not repeated when the input name already ends with it. Files with
    any other extension are skipped.
    """
    print(f"🔄 Processing: {input_path}")

    is_csv = input_path.endswith(".csv")
    if not is_csv and not input_path.endswith(".parquet"):
        print(f"⚠️ Skipping unsupported file: {input_path}")
        return

    df = pl.read_csv(input_path) if is_csv else pl.read_parquet(input_path)
    df_conf = compute_confidence(df, alpha=alpha, k=k)

    stem, _ = os.path.splitext(input_path)
    suffix = "" if stem.endswith("_with_confidence") else "_with_confidence"
    output_path = f"{stem}{suffix}.csv"

    df_conf.write_csv(output_path)
    print(f"✅ Saved: {output_path}\n")


def main():
    """Add a blended confidence column to each listed submission file that exists."""
    files = [
        "submission_20250721083807_0.52244.parquet",
        "submission_20250724032338_0.51345.parquet",
        "submission_20250725083055_0.52391.parquet",
        "submission_20250727084025_0.52795.parquet",
        "submission_20250802074816_0.52603.parquet",
        "submission_20250804001151_0.51244.parquet",
        "submission_20250807032439_0.52538.parquet",
        "submission_dl_ranker_0.48755.parquet"
    ]

    # Blend settings shared by every file
    alpha = 0.7
    k = 5

    for file in files:
        # Report missing inputs and carry on with the remaining files.
        if not os.path.exists(file):
            print(f"❌ File not found: {file}")
            continue
        process_file(file, alpha=alpha, k=k)


if __name__ == "__main__":
    main()


🔄 Processing: submission_20250721083807_0.52244.parquet
✅ Saved: submission_20250721083807_0.52244_with_confidence.csv

🔄 Processing: submission_20250724032338_0.51345.parquet
✅ Saved: submission_20250724032338_0.51345_with_confidence.csv

🔄 Processing: submission_20250725083055_0.52391.parquet
✅ Saved: submission_20250725083055_0.52391_with_confidence.csv

🔄 Processing: submission_20250727084025_0.52795.parquet
✅ Saved: submission_20250727084025_0.52795_with_confidence.csv

🔄 Processing: submission_20250802074816_0.52603.parquet
✅ Saved: submission_20250802074816_0.52603_with_confidence.csv

🔄 Processing: submission_20250804001151_0.51244.parquet
✅ Saved: submission_20250804001151_0.51244_with_confidence.csv

🔄 Processing: submission_20250807032439_0.52538.parquet
✅ Saved: submission_20250807032439_0.52538_with_confidence.csv

🔄 Processing: submission_dl_ranker_0.48755.parquet
✅ Saved: submission_dl_ranker_0.48755_with_confidence.csv



In [None]:
import polars as pl
import glob
import re
import numpy as np
import pandas as pd

def load_submission_with_confidence(filepath):
    """Read a submission CSV and return it without its ``selected`` column, if it has one."""
    df = pl.read_csv(filepath)
    return df.drop("selected") if "selected" in df.columns else df

def compute_entropy(probabilities):
    """Return the Shannon entropy (natural log) of ``probabilities``.

    Values are clipped to [1e-9, 1.0] first so that the logarithm is finite.
    """
    clipped = np.clip(probabilities, 1e-9, 1.0)
    weighted_logs = clipped * np.log(clipped)
    return -np.sum(weighted_logs)

def pca_svd(X, n_components):
    """
    Project X onto its leading right-singular vectors.

    X: (n_samples, n_models), expected to be centred and scaled by the caller.
    Returns:
      scores  : (n_samples, k)  coordinates of each sample on the kept components
      loadings: (n_models,  k)  the kept right-singular vectors, one per column
      recon   : (n_samples, n_models) X rebuilt from the kept components only
    where k is n_components, capped by the number of singular vectors available.
    """
    # Reduced SVD: Vt has one row per singular vector, min(n_samples, n_models) rows in total
    _, _, Vt = np.linalg.svd(X, full_matrices=False)
    loadings = Vt[:n_components, :].T
    scores = X @ loadings
    recon = scores @ loadings.T
    return scores, loadings, recon

def main():
    """Blend all ``*_with_confidence.csv`` submissions into one weighted ensemble.

    Each model's weight combines two normalised terms:
      * uniqueness: 1 minus the model's mean Spearman correlation with the
        other models, computed after a PCA step (see ``spearman_mode``);
      * entropy: the entropy of the model's confidence column normalised to
        sum to 1.
    Rows are then re-ranked within each ``ranker_id`` by the weighted sum of
    confidences and written to ``submission_ensemble.parquet``.
    """
    # glob returns matches in arbitrary order; sorting makes the column order, the
    # left-hand frame of the joins and the ordinal tie-breaking reproducible.
    filepaths = sorted(glob.glob("submission_*_*_with_confidence.csv"))

    dfs = []
    confidence_col_names = []

    for filepath in filepaths:
        # The number just before "_with_confidence" names the model's confidence column.
        match = re.search(r"submission_.*_([\d.]+)_with_confidence\.csv", filepath)
        if match:
            score = match.group(1)
            df = load_submission_with_confidence(filepath)
            confidence_col = f"confidence_{score}"
            df = df.with_columns(pl.col("confidence").alias(confidence_col)).drop("confidence")
            confidence_col_names.append(confidence_col)
            dfs.append(df)

    if not dfs:
        print("❌ No valid submission files found.")
        return

    # Join on Id + ranker_id
    df_combined = dfs[0]
    for df in dfs[1:]:
        df_combined = df_combined.join(df, on=["Id", "ranker_id"])

    # Confidence matrix (n_samples x n_models)
    X_df = df_combined.select(confidence_col_names).to_pandas()
    # Replace NaN with column mean
    X_df = X_df.apply(lambda col: col.fillna(col.mean()), axis=0)
    X = X_df.values.astype(float)

    # Standardize columns (center and scale) for PCA
    col_means = X.mean(axis=0, keepdims=True)
    col_stds  = X.std(axis=0, ddof=1, keepdims=True)
    col_stds[col_stds == 0.0] = 1.0  # protection against zero std
    X_std = (X - col_means) / col_stds

    # PCA parameters + Spearman calculation mode
    k_pca = 5
    spearman_mode = "residual"  # "residual" (recommended) | "loadings" | "scores"

    # PCA via SVD
    scores, loadings, recon = pca_svd(X_std, n_components=k_pca)

    if spearman_mode == "residual":
        # Idiosyncratic signal (after removing the first k components)
        X_resid = X_std - recon
        corr_df = pd.DataFrame(X_resid, columns=confidence_col_names).corr(method="spearman")
        print(f"📊 Spearman between models on RESIDUALS after PCA (k={k_pca}):")
    elif spearman_mode == "loadings":
        # Spearman correlation between each model's loadings on the first k components
        load_df = pd.DataFrame(loadings, index=confidence_col_names,
                               columns=[f"PC{i+1}" for i in range(k_pca)])
        corr_df = load_df.T.corr(method="spearman")
        print(f"📊 Spearman between models on PCA LOADINGS (first {k_pca}):")
    elif spearman_mode == "scores":
        # Spearman correlation between models viewed through the PCA scores
        # (correlate reconstructed model columns from the first k components)
        X_k = recon  # common signal from the first components
        corr_df = pd.DataFrame(X_k, columns=confidence_col_names).corr(method="spearman")
        print(f"📊 Spearman between models on PCA SCORES (first {k_pca}):")
    else:
        raise ValueError("Invalid spearman_mode. Choose: 'residual' | 'loadings' | 'scores'.")

    print(corr_df)

    # Uniqueness based on 1 - average correlation (exclude diagonal = 1)
    n_models = len(confidence_col_names)
    if n_models > 1:
        mean_corr = corr_df.apply(lambda row: (row.sum() - 1) / (len(row) - 1), axis=1)
        model_uniqueness = 1 - mean_corr
    else:
        # A single model has no peers; the average above would be 0/0.
        model_uniqueness = pd.Series(1.0, index=confidence_col_names)

    uniqueness_total = model_uniqueness.sum()
    if uniqueness_total > 0:
        uniqueness_weights = model_uniqueness / uniqueness_total
    else:
        # Zero or NaN total (e.g. perfectly correlated models): fall back to equal weights.
        uniqueness_weights = pd.Series(1.0 / n_models, index=confidence_col_names)

    # Entropy for each confidence column (on raw scores, not standardized)
    entropies = []
    for col in confidence_col_names:
        probs = df_combined[col].to_numpy()
        probs = np.nan_to_num(probs, nan=np.nanmean(probs))
        total = np.sum(probs)
        if total <= 0:
            # stable fallback
            probs = np.full_like(probs, 1.0 / len(probs), dtype=float)
        else:
            probs = probs / total
        entropies.append(compute_entropy(probs))

    entropy_weights = np.array(entropies, dtype=float)
    entropy_weights /= entropy_weights.sum()

    # Combine weights: uniqueness (post-PCA) + entropy (informativeness)
    alpha = 0.65  # more emphasis on uniqueness
    combined_weights = alpha * uniqueness_weights.values + (1 - alpha) * entropy_weights
    combined_weights = combined_weights / combined_weights.sum()

    print("\n⚖️ Final ensemble weights (uniqueness + entropy):")
    for col, weight in zip(confidence_col_names, combined_weights):
        print(f"{col}: {weight:.4f}")

    # Weighted ensemble on confidence
    weighted_conf = sum(
        df_combined[col].fill_null(0) * weight
        for col, weight in zip(confidence_col_names, combined_weights)
    )
    df_combined = df_combined.with_columns(weighted_conf.alias("ensemble_confidence"))

    # Ranking per group (ranker_id) by ensemble_confidence
    df_ranked = df_combined.with_columns(
        pl.col("ensemble_confidence")
        .rank("ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    final_submission = df_ranked.select(["Id", "ranker_id", "selected"])

    print("\n✅ Sample of final submission:")
    print(final_submission.head())

    final_submission.write_parquet("submission_ensemble.parquet")
    print("\n💾 Saved as: submission_ensemble.parquet")

if __name__ == "__main__":
    main()

📊 Spearman between models on RESIDUALS after PCA (k=5):
                    confidence_0.51244  confidence_0.52795  \
confidence_0.51244            1.000000           -0.871089   
confidence_0.52795           -0.871089            1.000000   
confidence_0.52603            0.049608           -0.366223   
confidence_0.52538            0.678231           -0.507004   
confidence_0.48755           -0.036545            0.287423   
confidence_0.51345           -0.463710            0.119839   
confidence_0.52244           -0.239497           -0.023314   
confidence_0.52391            0.297432           -0.020860   

                    confidence_0.52603  confidence_0.52538  \
confidence_0.51244            0.049608            0.678231   
confidence_0.52795           -0.366223           -0.507004   
confidence_0.52603            1.000000           -0.487018   
confidence_0.52538           -0.487018            1.000000   
confidence_0.48755           -0.023417           -0.371223   
confidence_0.

In [None]:
import polars as pl
import os

def compute_confidence(df: pl.DataFrame, alpha: float = 0.7, k: int = 5) -> pl.DataFrame:
    """Add a ``confidence`` column blending a positional score with a scaled RRF score.

    ``alpha`` is the weight of the positional score and ``1 - alpha`` that of
    the reciprocal-rank-fusion score with constant ``k``. Both components
    equal 1.0 for ``selected == 1``. The ``selected`` column is kept.
    """
    # Base score from the position inside the ranker_id group (maximum for selected=1)
    positional = 1.0 - ((pl.col("selected") - 1) / (pl.col("group_size") - 1 + 1e-8))

    # RRF score divided by its value at selected=1, so the top rank scores 1.0
    rrf_peak = 1.0 / (k + 1)
    rrf_scaled = (1.0 / (k + pl.col("selected"))) / rrf_peak

    # Convex combination of the base score and the RRF score
    blended = alpha * pl.col("confidence") + (1 - alpha) * pl.col("rrf_score")

    return (
        # group size for each ranker_id
        df.with_columns([pl.len().over("ranker_id").alias("group_size")])
        .with_columns(positional.alias("confidence"))
        .with_columns(rrf_scaled.alias("rrf_score"))
        .with_columns(blended.alias("confidence"))
        # keep 'selected'; remove only the temporary columns
        .drop(["group_size", "rrf_score"])
    )

def process_file(input_path: str, alpha: float = 0.7, k: int = 5):
    """Read a submission (.csv or .parquet), add ``confidence`` and save it as CSV.

    ``alpha`` and ``k`` are forwarded to ``compute_confidence``. The output name
    is the input name without its extension plus ``_with_confidence.csv``; the
    suffix is not repeated when the input name already ends with it. Files with
    any other extension are skipped.
    """
    print(f"🔄 Processing: {input_path}")

    is_csv = input_path.endswith(".csv")
    if not is_csv and not input_path.endswith(".parquet"):
        print(f"⚠️ Skipping unsupported file: {input_path}")
        return

    df = pl.read_csv(input_path) if is_csv else pl.read_parquet(input_path)
    df_conf = compute_confidence(df, alpha=alpha, k=k)

    stem, _ = os.path.splitext(input_path)
    suffix = "" if stem.endswith("_with_confidence") else "_with_confidence"
    output_path = f"{stem}{suffix}.csv"

    df_conf.write_csv(output_path)
    print(f"✅ Saved: {output_path}\n")


def main():
    """Add a blended confidence column to the XGB and LGB ensemble submissions that exist."""
    files = [
        "submission_ensemble_xgb.parquet",
        "submission_ensemble_lgb.parquet"
    ]

    # Blend settings shared by every file
    alpha = 0.7
    k = 5

    for file in files:
        # Report missing inputs and carry on with the remaining files.
        if not os.path.exists(file):
            print(f"❌ File not found: {file}")
            continue
        process_file(file, alpha=alpha, k=k)


if __name__ == "__main__":
    main()


🔄 Processing: submission_ensemble_xgb.parquet
✅ Saved: submission_ensemble_xgb_with_confidence.csv

🔄 Processing: submission_ensemble_lgb.parquet
✅ Saved: submission_ensemble_lgb_with_confidence.csv



In [None]:
import polars as pl
from pathlib import Path

# ---------------------
# FILES TO ENSEMBLE
# ---------------------
INPUT_FILES = [
    "submission_0.51822_with_confidence.csv",
    "submission_dl_ranker_with_confidence.csv",
    "submission_0.52244_with_confidence.csv"
]

OUTPUT_FILE = "submission_ensemble.parquet"

# ---------------------
# LOAD & CONCAT
# ---------------------
dfs = []

for path in INPUT_FILES:
    if not Path(path).exists():
        raise FileNotFoundError(f"❌ File not found: {path}")

    # Any incoming rank column is discarded; 'selected' is recomputed below.
    df = pl.read_csv(path).drop("selected", strict=False)
    if "confidence" not in df.columns:
        raise ValueError(f"⚠️ Missing 'confidence' column in {path}")

    dfs.append(df)

# ---------------------
# ENSEMBLE LOGIC
# ---------------------
# Sum the confidences each (Id, ranker_id) pair received across all files.
# maintain_order=True keeps the groups in first-seen order: the ordinal rank
# below breaks ties by row order, so without it equal sums would be ranked
# differently from run to run.
df_combined = (
    pl.concat(dfs)
    .group_by(["Id", "ranker_id"], maintain_order=True)
    .agg(pl.sum("confidence").alias("confidence_sum"))
)

# Rank within each ranker_id (1 = highest summed confidence)
df_ranked = (
    df_combined.with_columns([
        pl.col("confidence_sum")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    ])
)

# Row order of the first input file; reuse the frame loaded above instead of
# reading the CSV a second time.
df_original = dfs[0].select(["Id", "ranker_id"])

# Left join so the output has exactly the rows (and order) of the first file
final_submission = (
    df_original.join(
        df_ranked.select(["Id", "ranker_id", "selected"]),
        on=["Id", "ranker_id"],
        how="left"
    )
)

# ---------------------
# SAVE
# ---------------------
final_submission.write_parquet(OUTPUT_FILE)
print(f"✅ Ensemble saved as {OUTPUT_FILE}")


✅ Ensemble saved as submission_ensemble.parquet


**Boosting**

In [None]:
import polars as pl
from pathlib import Path

# ---------------------
# CONFIG
# ---------------------
# The two confidence files that are blended, and where the blended scores go.
V1 = "submission_dl_ranker_0.48755_with_confidence.csv"
V2 = "submission_dl_ranker_0.49242_with_confidence.csv"
OUTPUT_FILE = "submission_dl_ranker_weighted_boosted_normalized.csv"

WEIGHT_V1 = 0.65  # weight of V1's confidence in the blend
WEIGHT_V2 = 0.35  # weight of V2's confidence in the blend
BOOST_IF_EQUAL = 1.2  # multiplier for rows where both confidences agree within EPSILON
EPSILON = 1e-8  # tolerance for the equality comparison; also added to the normalization denominator

# ---------------------
# LOAD FILES
# ---------------------
def load_conf_df(path: str) -> pl.DataFrame:
    """Load a confidence file and return it without its 'selected' column.

    Paths ending in ``.csv`` are read as CSV; anything else is read as parquet.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the file has no 'confidence' column.
    """
    if not Path(path).exists():
        raise FileNotFoundError(f"❌ File not found: {path}")

    reader = pl.read_csv if path.endswith(".csv") else pl.read_parquet
    frame = reader(path)

    if "confidence" in frame.columns:
        # strict=False: a missing 'selected' column is not an error
        return frame.drop("selected", strict=False)
    raise ValueError(f"⚠️ Missing 'confidence' column in {path}")

# Load both files, giving each confidence column a model-specific name.
df_v1, df_v2 = (
    load_conf_df(src).rename({"confidence": new_name})
    for src, new_name in ((V1, "confidence_v1"), (V2, "confidence_v2"))
)

# ---------------------
# JOIN FILES
# ---------------------
# Inner join: only rows present in both files survive.
df_joined = df_v1.join(df_v2, how="inner", on=["Id", "ranker_id"])

# ---------------------
# DETERMINE BOOST FLAG (before weighting)
# ---------------------
# A row is flagged when the two confidences differ by less than EPSILON.
conf_gap = (pl.col("confidence_v1") - pl.col("confidence_v2")).abs()
df_with_flag = df_joined.with_columns((conf_gap < EPSILON).alias("apply_boost"))

# ---------------------
# COMPUTE WEIGHTED CONFIDENCE + APPLY BOOST
# ---------------------
blended = WEIGHT_V1 * pl.col("confidence_v1") + WEIGHT_V2 * pl.col("confidence_v2")
df_weighted = df_with_flag.with_columns(blended.alias("base_confidence"))

boosted = (
    pl.when(pl.col("apply_boost"))
    .then(pl.col("base_confidence") * BOOST_IF_EQUAL)
    .otherwise(pl.col("base_confidence"))
)
df_boosted = df_weighted.with_columns(boosted.alias("confidence"))

# ---------------------
# NORMALIZE CONFIDENCE TO [0, 1]
# ---------------------
# Min-max scaling over the whole frame; EPSILON keeps the denominator non-zero.
conf_column = df_boosted["confidence"]
min_conf, max_conf = conf_column.min(), conf_column.max()

scaled = (pl.col("confidence") - min_conf) / (max_conf - min_conf + EPSILON)
df_normalized = df_boosted.with_columns(scaled.alias("confidence_normalized"))

# ---------------------
# SELECT & SAVE
# ---------------------
df_output = df_normalized.select(
    "Id",
    "ranker_id",
    pl.col("confidence_normalized").alias("confidence"),
)
df_output.write_csv(OUTPUT_FILE)

print(f"✅ Weighted, boosted, and normalized confidence saved to {OUTPUT_FILE}")


✅ Weighted, boosted, and normalized confidence saved to submission_dl_ranker_weighted_boosted_normalized.csv


In [None]:
import polars as pl
from pathlib import Path

# ---------------------
# FILES
# ---------------------
# The two confidence files that are combined, and the output path.
V1 = "submission_ensemble_0.53108_with_confidence.csv"
V2 = "submission_ensemble_0.53135_with_confidence.csv"
OUTPUT_FILE = "submission_ensemble_boosted.parquet"

# ---------------------
# LOAD FILES
# ---------------------
def load_conf_df(path: str) -> pl.DataFrame:
    """Load a confidence file and return it without its 'selected' column.

    Paths ending in ``.csv`` are read as CSV; anything else is read as parquet.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the file has no 'confidence' column.
    """
    if not Path(path).exists():
        raise FileNotFoundError(f"❌ File not found: {path}")

    reader = pl.read_csv if path.endswith(".csv") else pl.read_parquet
    frame = reader(path)

    if "confidence" in frame.columns:
        # strict=False: a missing 'selected' column is not an error
        return frame.drop("selected", strict=False)
    raise ValueError(f"⚠️ Missing 'confidence' column in {path}")

df_v1, df_v2 = load_conf_df(V1), load_conf_df(V2)

# ---------------------
# BUILD top6 per ranker_id
# ---------------------
def build_top6_map(df: pl.DataFrame) -> dict[str, set[str]]:
    """Map each ranker_id to the set of Ids holding its six highest confidences.

    Ranking is ordinal within each ranker_id group, so tied confidences still
    receive distinct ranks and at most six Ids are kept per group.
    """
    rank_in_group = (
        pl.col("confidence").rank("ordinal", descending=True).over("ranker_id")
    )
    top_rows = df.with_columns(rank_in_group.alias("rank")).filter(pl.col("rank") <= 6)
    per_group = top_rows.group_by("ranker_id").agg(pl.col("Id").alias("top6"))

    top6_by_ranker = {}
    for entry in per_group.iter_rows(named=True):
        top6_by_ranker[entry["ranker_id"]] = set(entry["top6"])
    return top6_by_ranker

top6_v1 = build_top6_map(df_v1)
top6_v2 = build_top6_map(df_v2)

# Ids that BOTH files place in their top 6, computed once per ranker_id rather
# than once per row. A ranker_id missing from either map has nothing shared.
shared_top6_by_ranker = {
    ranker_id: ids_v1 & top6_v2[ranker_id]
    for ranker_id, ids_v1 in top6_v1.items()
    if ranker_id in top6_v2
}
NO_SHARED = frozenset()

# ---------------------
# COMBINE v1 + v2 WITH BOOST ONLY FOR SHARED TOP6 IDs
# ---------------------
# maintain_order=True keeps the groups in first-seen order: the ordinal rank
# further down breaks ties by row order, so without it equal confidences would
# be ranked differently from run to run.
df_v1v2 = pl.concat([df_v1, df_v2])
df_grouped = (
    df_v1v2
    .group_by(["Id", "ranker_id"], maintain_order=True)
    .agg(pl.sum("confidence").alias("conf_sum"))
)

boosted_rows = []
for row in df_grouped.iter_rows(named=True):
    id_, ranker_id, conf = row["Id"], row["ranker_id"], row["conf_sum"]
    shared_top6 = shared_top6_by_ranker.get(ranker_id, NO_SHARED)

    if id_ in shared_top6:
        # The more of the top 6 the two files agree on, the larger the boost.
        n_shared = len(shared_top6)
        boost = 1.5 if n_shared >= 6 else 1.4 if n_shared >= 4 else 1.3
    else:
        boost = 1.0

    boosted_rows.append({
        "Id": id_,
        "ranker_id": ranker_id,
        "confidence": conf * boost
    })

df_boosted = pl.DataFrame(boosted_rows)

# ---------------------
# RANK
# ---------------------
# 1 = highest boosted confidence within the ranker_id group
df_ranked = df_boosted.with_columns([
    pl.col("confidence")
    .rank(method="ordinal", descending=True)
    .over("ranker_id")
    .cast(pl.Int32)
    .alias("selected")
])

# ---------------------
# RESTORE ORIGINAL ORDER
# ---------------------
# Left join onto V1's rows so the output keeps V1's row order.
df_original = df_v1.select(["Id", "ranker_id"])
final_submission = df_original.join(
    df_ranked.select(["Id", "ranker_id", "selected"]),
    on=["Id", "ranker_id"],
    how="left"
)

# ---------------------
# SAVE
# ---------------------
final_submission.write_parquet(OUTPUT_FILE)
print(f"✅ Boosted ensemble (Top6) saved to {OUTPUT_FILE}")


✅ Boosted ensemble (Top6) saved to submission_ensemble_boosted.parquet


In [None]:
import polars as pl
from pathlib import Path

# ---------------------
# FILES
# ---------------------
# The two confidence files that are combined, and the output path.
V1 = "submission_ensemble_0.53007_with_confidence.csv"
V2 = "submission_ensemble_0.53108_with_confidence.csv"
OUTPUT_FILE = "submission_ensemble_boosted.parquet"

# ---------------------
# LOAD FILES
# ---------------------
def load_conf_df(path: str) -> pl.DataFrame:
    """Load a confidence file and return it without its 'selected' column.

    Paths ending in ``.csv`` are read as CSV; anything else is read as parquet.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the file has no 'confidence' column.
    """
    if not Path(path).exists():
        raise FileNotFoundError(f"❌ File not found: {path}")

    reader = pl.read_csv if path.endswith(".csv") else pl.read_parquet
    frame = reader(path)

    if "confidence" in frame.columns:
        # strict=False: a missing 'selected' column is not an error
        return frame.drop("selected", strict=False)
    raise ValueError(f"⚠️ Missing 'confidence' column in {path}")

df_v1, df_v2 = load_conf_df(V1), load_conf_df(V2)

# ---------------------
# BUILD top5 per ranker_id
# ---------------------
def build_top5_map(df: pl.DataFrame) -> dict[str, set[str]]:
    """Map each ranker_id to the set of Ids holding its five highest confidences.

    Ranking is ordinal within each ranker_id group, so tied confidences still
    receive distinct ranks and at most five Ids are kept per group.
    """
    rank_in_group = (
        pl.col("confidence").rank("ordinal", descending=True).over("ranker_id")
    )
    top_rows = df.with_columns(rank_in_group.alias("rank")).filter(pl.col("rank") <= 5)
    per_group = top_rows.group_by("ranker_id").agg(pl.col("Id").alias("top5"))

    top5_by_ranker = {}
    for entry in per_group.iter_rows(named=True):
        top5_by_ranker[entry["ranker_id"]] = set(entry["top5"])
    return top5_by_ranker

top5_v1 = build_top5_map(df_v1)
top5_v2 = build_top5_map(df_v2)

# Multiplier for Ids that both files rank in their top 5. Kept in one place so
# the log message at the end always reports the value that was really applied.
SHARED_TOP5_BOOST = 1.4

# Ids that BOTH files place in their top 5, computed once per ranker_id rather
# than once per row. A ranker_id missing from either map has nothing shared.
shared_top5_by_ranker = {
    ranker_id: ids_v1 & top5_v2[ranker_id]
    for ranker_id, ids_v1 in top5_v1.items()
    if ranker_id in top5_v2
}
NO_SHARED = frozenset()

# ---------------------
# COMBINE v1 + v2 WITH BOOST ONLY FOR SHARED TOP5 IDs
# ---------------------
# maintain_order=True keeps the groups in first-seen order: the ordinal rank
# further down breaks ties by row order, so without it equal confidences would
# be ranked differently from run to run.
df_v1v2 = pl.concat([df_v1, df_v2])
df_grouped = (
    df_v1v2
    .group_by(["Id", "ranker_id"], maintain_order=True)
    .agg(pl.sum("confidence").alias("conf_sum"))
)

boosted_rows = []
for row in df_grouped.iter_rows(named=True):
    id_, ranker_id, conf = row["Id"], row["ranker_id"], row["conf_sum"]
    shared_top5 = shared_top5_by_ranker.get(ranker_id, NO_SHARED)

    # Apply the fixed boost only if the current Id is in the shared top 5
    boost = SHARED_TOP5_BOOST if id_ in shared_top5 else 1.0

    boosted_rows.append({
        "Id": id_,
        "ranker_id": ranker_id,
        "confidence": conf * boost
    })

df_boosted = pl.DataFrame(boosted_rows)

# ---------------------
# RANK
# ---------------------
# 1 = highest boosted confidence within the ranker_id group
df_ranked = df_boosted.with_columns([
    pl.col("confidence")
    .rank(method="ordinal", descending=True)
    .over("ranker_id")
    .cast(pl.Int32)
    .alias("selected")
])

# ---------------------
# RESTORE ORIGINAL ORDER
# ---------------------
# Left join onto V1's rows so the output keeps V1's row order.
df_original = df_v1.select(["Id", "ranker_id"])
final_submission = df_original.join(
    df_ranked.select(["Id", "ranker_id", "selected"]),
    on=["Id", "ranker_id"],
    how="left"
)

# ---------------------
# SAVE
# ---------------------
final_submission.write_parquet(OUTPUT_FILE)
print(f"✅ Boosted ensemble (Top5 shared, boost={SHARED_TOP5_BOOST}) saved to {OUTPUT_FILE}")


✅ Boosted ensemble (Top5 shared, boost=1.5) saved to submission_ensemble_boosted.parquet


In [None]:
%%writefile boosting.py
import polars as pl
from pathlib import Path
from collections import defaultdict

# ---------------------
# FILES
# ---------------------
# V1 and V2 are combined with a shared-top-3 boost; V3 is added afterwards
# without a boost. OUTPUT_FILE receives the final ranking.
V1 = "submission_0.51822_with_confidence.csv"
V2 = "submission_0.52244_with_confidence.csv"
V3 = "submission_dl_ranker_with_confidence.csv"
OUTPUT_FILE = "submission_ensemble_boosted.parquet"

# ---------------------
# LOAD FILES
# ---------------------
def load_conf_df(path: str) -> pl.DataFrame:
    """Read a confidence CSV and return it without its 'selected' column.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the file has no 'confidence' column.
    """
    if not Path(path).exists():
        raise FileNotFoundError(f"❌ File not found: {path}")

    # strict=False: a missing 'selected' column is not an error
    frame = pl.read_csv(path)
    frame = frame.drop("selected", strict=False)

    if "confidence" in frame.columns:
        return frame
    raise ValueError(f"⚠️ Missing 'confidence' column in {path}")

df_v1, df_v2, df_v3 = load_conf_df(V1), load_conf_df(V2), load_conf_df(V3)

# ---------------------
# BUILD top3 per ranker_id
# ---------------------
def build_top3_map(df: pl.DataFrame) -> dict[str, set[str]]:
    """Map each ranker_id to the set of Ids holding its three highest confidences.

    Ranking is ordinal within each ranker_id group, so tied confidences still
    receive distinct ranks and at most three Ids are kept per group.
    """
    rank_in_group = (
        pl.col("confidence").rank("ordinal", descending=True).over("ranker_id")
    )
    top_rows = df.with_columns(rank_in_group.alias("rank")).filter(pl.col("rank") <= 3)
    per_group = top_rows.group_by("ranker_id").agg(pl.col("Id").alias("top3"))

    top3_by_ranker = {}
    for entry in per_group.iter_rows(named=True):
        top3_by_ranker[entry["ranker_id"]] = set(entry["top3"])
    return top3_by_ranker

top3_v1 = build_top3_map(df_v1)
top3_v2 = build_top3_map(df_v2)

# Boost for a shared Id, keyed by how many Ids the two top-3 sets have in
# common. Any other count falls back to 1.0 (no boost).
BOOST_BY_SHARED_COUNT = {3: 1.5, 2: 1.4, 1: 1.3}

# Ids that BOTH files place in their top 3, computed once per ranker_id rather
# than once per row. A ranker_id missing from either map has nothing shared.
shared_top3_by_ranker = {
    ranker_id: ids_v1 & top3_v2[ranker_id]
    for ranker_id, ids_v1 in top3_v1.items()
    if ranker_id in top3_v2
}
NO_SHARED = frozenset()

# ---------------------
# COMBINE v1 + v2 WITH BOOST ONLY FOR SHARED TOP3 IDs
# ---------------------
# maintain_order=True (here and in the second group_by below) keeps the groups
# in first-seen order: the final ordinal rank breaks ties by row order, so
# without it equal confidences would be ranked differently from run to run.
df_v1v2 = pl.concat([df_v1, df_v2])
df_grouped = (
    df_v1v2
    .group_by(["Id", "ranker_id"], maintain_order=True)
    .agg(pl.sum("confidence").alias("conf_v1v2"))
)

boosted_rows = []

for row in df_grouped.iter_rows(named=True):
    id_, ranker_id, conf = row["Id"], row["ranker_id"], row["conf_v1v2"]
    shared_top3 = shared_top3_by_ranker.get(ranker_id, NO_SHARED)

    if id_ in shared_top3:
        boost = BOOST_BY_SHARED_COUNT.get(len(shared_top3), 1.0)
    else:
        boost = 1.0

    boosted_rows.append({
        "Id": id_,
        "ranker_id": ranker_id,
        "confidence": conf * boost
    })

df_boosted_v1v2 = pl.DataFrame(boosted_rows)

# ---------------------
# ADD dl_ranker (v3)
# ---------------------
# V3's confidence is added on top of the boosted v1+v2 score, unboosted.
df_final_conf = (
    pl.concat([df_boosted_v1v2, df_v3])
    .group_by(["Id", "ranker_id"], maintain_order=True)
    .agg(pl.sum("confidence").alias("confidence_sum"))
)

# ---------------------
# RANK
# ---------------------
# 1 = highest total confidence within the ranker_id group
df_ranked = (
    df_final_conf.with_columns([
        pl.col("confidence_sum")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    ])
)

# ---------------------
# RESTORE ORIGINAL ORDER
# ---------------------
# Left join onto V1's rows so the output keeps V1's row order.
df_original = df_v1.select(["Id", "ranker_id"])
final_submission = (
    df_original.join(
        df_ranked.select(["Id", "ranker_id", "selected"]),
        on=["Id", "ranker_id"],
        how="left"
    )
)

# ---------------------
# SAVE
# ---------------------
final_submission.write_parquet(OUTPUT_FILE)
print(f"✅ Boosted ensemble saved to {OUTPUT_FILE}")


Overwriting boosting.py


In [None]:
%%writefile boosting.py
import polars as pl
from pathlib import Path

# ---------------------
# FILES
# ---------------------
# The three confidence files that are summed and boosted, and the output path.
V1 = "submission_0.51822_with_confidence.csv"
V2 = "submission_0.52244_with_confidence.csv"
V3 = "submission_dl_ranker_with_confidence.csv"
OUTPUT_FILE = "submission_ensemble_boosted.parquet"

# ---------------------
# LOAD FILES
# ---------------------
def load_conf_df(path: str) -> pl.DataFrame:
    """Read a confidence CSV and return it without its 'selected' column.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the file has no 'confidence' column.
    """
    if not Path(path).exists():
        raise FileNotFoundError(f"❌ File not found: {path}")

    # strict=False: a missing 'selected' column is not an error
    frame = pl.read_csv(path)
    frame = frame.drop("selected", strict=False)

    if "confidence" in frame.columns:
        return frame
    raise ValueError(f"⚠️ Missing 'confidence' column in {path}")

df_v1, df_v2, df_v3 = load_conf_df(V1), load_conf_df(V2), load_conf_df(V3)

# ---------------------
# BUILD top3 per ranker_id
# ---------------------
def build_top3_map(df: pl.DataFrame) -> dict[str, set[str]]:
    """Map each ranker_id to the set of Ids holding its three highest confidences.

    Ranking is ordinal within each ranker_id group, so tied confidences still
    receive distinct ranks and at most three Ids are kept per group.
    """
    rank_in_group = (
        pl.col("confidence").rank("ordinal", descending=True).over("ranker_id")
    )
    top_rows = df.with_columns(rank_in_group.alias("rank")).filter(pl.col("rank") <= 3)
    per_group = top_rows.group_by("ranker_id").agg(pl.col("Id").alias("top3"))

    top3_by_ranker = {}
    for entry in per_group.iter_rows(named=True):
        top3_by_ranker[entry["ranker_id"]] = set(entry["top3"])
    return top3_by_ranker

top3_v1 = build_top3_map(df_v1)
top3_v2 = build_top3_map(df_v2)
top3_v3 = build_top3_map(df_v3)

# Fallback boost when only v1 and v2 agree, keyed by how many Ids their top-3
# sets share. Any other count falls back to 1.0 (no boost).
V1V2_BOOST_BY_SHARED_COUNT = {3: 1.6, 2: 1.5, 1: 1.4}

# ---------------------
# COMBINE ALL FILES
# ---------------------
# maintain_order=True keeps the groups in first-seen order: the ordinal rank
# further down breaks ties by row order, so without it equal confidences would
# be ranked differently from run to run.
df_all = pl.concat([df_v1, df_v2, df_v3])
df_grouped = (
    df_all
    .group_by(["Id", "ranker_id"], maintain_order=True)
    .agg(pl.sum("confidence").alias("confidence_sum"))
)

# ---------------------
# APPLY BOOSTING STRATEGY
# ---------------------
boosted_rows = []

for row in df_grouped.iter_rows(named=True):
    id_, ranker_id, conf_sum = row["Id"], row["ranker_id"], row["confidence_sum"]

    top1 = top3_v1.get(ranker_id, set())
    top2 = top3_v2.get(ranker_id, set())
    top3 = top3_v3.get(ranker_id, set())
    shared_all = top1 & top2 & top3

    if top1 == top2 == top3 and id_ in top1:
        # All three top-3 sets are identical
        boost = 2.0
    elif id_ in shared_all and len(shared_all) == 2:
        boost = 1.8
    elif id_ in shared_all and len(shared_all) == 1:
        boost = 1.7
    else:
        # fallback: agreement between v1 and v2 only
        shared_v1v2 = top1 & top2
        if id_ in shared_v1v2:
            boost = V1V2_BOOST_BY_SHARED_COUNT.get(len(shared_v1v2), 1.0)
        else:
            boost = 1.0

    boosted_rows.append({
        "Id": id_,
        "ranker_id": ranker_id,
        "confidence": conf_sum * boost
    })

df_boosted = pl.DataFrame(boosted_rows)

# ---------------------
# RANK
# ---------------------
# 1 = highest boosted confidence within the ranker_id group
df_ranked = (
    df_boosted.with_columns([
        pl.col("confidence")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    ])
)

# ---------------------
# RESTORE ORIGINAL ORDER
# ---------------------
# Left join onto V1's rows so the output keeps V1's row order.
df_original = df_v1.select(["Id", "ranker_id"])
final_submission = (
    df_original.join(
        df_ranked.select(["Id", "ranker_id", "selected"]),
        on=["Id", "ranker_id"],
        how="left"
    )
)

# ---------------------
# SAVE
# ---------------------
final_submission.write_parquet(OUTPUT_FILE)
print(f"✅ Boosted ensemble saved to {OUTPUT_FILE}")

Writing boosting.py


**Spearman**

In [None]:
import polars as pl
import glob
import re
import numpy as np

def load_submission_with_confidence(filepath):
    """Read a submission CSV and return it without its 'selected' column, if any."""
    frame = pl.read_csv(filepath)
    return frame.drop("selected") if "selected" in frame.columns else frame

def _uniqueness_weights(corr_df):
    """Turn a square correlation matrix into normalized "uniqueness" weights.

    A model's uniqueness is 1 minus its mean correlation with the other
    models. Equal weights are returned when there is only one model or when
    the uniqueness total is zero or non-finite, because both cases would
    otherwise divide by zero and produce NaN weights.
    """
    corr = corr_df.to_numpy(dtype=float)
    n_models = corr.shape[0]
    if n_models < 2:
        return np.ones(n_models)

    # nansum skips NaN like pandas' sum does; "- 1" drops the self-correlation.
    mean_corr = (np.nansum(corr, axis=1) - 1.0) / (n_models - 1)
    uniqueness = 1.0 - mean_corr
    total = uniqueness.sum()
    if not np.isfinite(total) or total <= 0:
        return np.full(n_models, 1.0 / n_models)
    return uniqueness / total


def main():
    """Blend all scored confidence files, weighting less-correlated models higher.

    Every ``submission_*_*_with_confidence.csv`` whose name carries a numeric
    score is joined on (Id, ranker_id). Weights come from the Spearman
    correlation between the confidence columns. The weighted confidence is
    ranked within each ranker_id and written to ``submission_ensemble.csv``.
    """
    # Sorted so the join order, and therefore row order and tie-breaks, do not
    # depend on the order in which the filesystem lists the files.
    filepaths = sorted(glob.glob("submission_*_*_with_confidence.csv"))

    dfs = []
    confidence_col_names = []

    for filepath in filepaths:
        # The score embedded in the file name labels that model's column
        match = re.search(r"submission_.*_([\d.]+)_with_confidence\.csv", filepath)
        if not match:
            continue
        confidence_col = f"confidence_{match.group(1)}"
        df = load_submission_with_confidence(filepath)
        df = df.with_columns(pl.col("confidence").alias(confidence_col)).drop("confidence")
        confidence_col_names.append(confidence_col)
        dfs.append(df)

    if not dfs:
        print("❌ Nu s-au găsit fișiere valide.")
        return

    # Combine all files on Id + ranker_id
    df_combined = dfs[0]
    for df in dfs[1:]:
        df_combined = df_combined.join(df, on=["Id", "ranker_id"])

    # Spearman correlation between the models' confidence columns
    corr_df = df_combined.select(confidence_col_names).to_pandas().corr(method="spearman")
    print("📊 Corelație Spearman între modele:")
    print(corr_df)

    # Model uniqueness -> ensemble weights (safe for one model / zero total)
    weights = _uniqueness_weights(corr_df)

    print("\n⚖️ Ponderi calculate:")
    for col, weight in zip(confidence_col_names, weights):
        print(f"{col}: {weight:.4f}")

    # Weighted ensemble
    weighted_conf = sum(
        df_combined[col].fill_null(0) * weight
        for col, weight in zip(confidence_col_names, weights)
    )

    df_combined = df_combined.with_columns(weighted_conf.alias("ensemble_confidence"))

    # Rank by ensemble_confidence within each ranker_id (1 = highest)
    df_ranked = df_combined.with_columns(
        pl.col("ensemble_confidence")
        .rank("ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    # Final result
    final_submission = df_ranked.select(["Id", "ranker_id", "selected"])

    print("\n✅ Exemplu din rezultatul final:")
    print(final_submission.head())

    final_submission.write_csv("submission_ensemble.csv")
    print("\n💾 Salvare completă: submission_ensemble.csv")

if __name__ == "__main__":
    main()


📊 Corelație Spearman între modele:
                    confidence_0.52474  confidence_0.49380  \
confidence_0.52474            1.000000            0.894615   
confidence_0.49380            0.894615            1.000000   
confidence_0.52795            0.979722            0.895570   
confidence_0.51345            0.935174            0.897351   
confidence_0.52391            0.958787            0.897411   
confidence_0.51244            0.949075            0.898176   
confidence_0.52603            0.982708            0.896322   
confidence_0.52244            0.958957            0.899752   

                    confidence_0.52795  confidence_0.51345  \
confidence_0.52474            0.979722            0.935174   
confidence_0.49380            0.895570            0.897351   
confidence_0.52795            1.000000            0.937831   
confidence_0.51345            0.937831            1.000000   
confidence_0.52391            0.961404            0.949847   
confidence_0.51244            0.95

In [None]:
import polars as pl
import glob
import re
import numpy as np

def load_submission_with_confidence(filepath):
    """Read a submission CSV and return it without its 'selected' column, if any."""
    frame = pl.read_csv(filepath)
    return frame.drop("selected") if "selected" in frame.columns else frame

def compute_entropy(probabilities):
    """Return -sum(p * log(p)) after clipping each value into [1e-9, 1.0].

    The clip keeps log() away from zero; the input is not renormalized here.
    """
    clipped = np.clip(probabilities, 1e-9, 1.0)
    weighted_logs = clipped * np.log(clipped)
    return -np.sum(weighted_logs)

def _uniqueness_weights(corr_df):
    """Turn a square correlation matrix into normalized "uniqueness" weights.

    A model's uniqueness is 1 minus its mean correlation with the other
    models. Equal weights are returned when there is only one model or when
    the uniqueness total is zero or non-finite, because both cases would
    otherwise divide by zero and produce NaN weights.
    """
    corr = corr_df.to_numpy(dtype=float)
    n_models = corr.shape[0]
    if n_models < 2:
        return np.ones(n_models)

    # nansum skips NaN like pandas' sum does; "- 1" drops the self-correlation.
    mean_corr = (np.nansum(corr, axis=1) - 1.0) / (n_models - 1)
    uniqueness = 1.0 - mean_corr
    total = uniqueness.sum()
    if not np.isfinite(total) or total <= 0:
        return np.full(n_models, 1.0 / n_models)
    return uniqueness / total


def main():
    """Blend all scored confidence files using uniqueness and entropy weights.

    Every ``submission_*_*_with_confidence.csv`` whose name carries a numeric
    score is joined on (Id, ranker_id). Each model's weight mixes its Spearman
    uniqueness (75%) with the entropy of its normalized confidence column
    (25%). The weighted confidence is ranked within each ranker_id and written
    to ``submission_ensemble.parquet``.
    """
    # Sorted so the join order, and therefore row order and tie-breaks, do not
    # depend on the order in which the filesystem lists the files.
    filepaths = sorted(glob.glob("submission_*_*_with_confidence.csv"))

    dfs = []
    confidence_col_names = []

    for filepath in filepaths:
        # The score embedded in the file name labels that model's column
        match = re.search(r"submission_.*_([\d.]+)_with_confidence\.csv", filepath)
        if not match:
            continue
        confidence_col = f"confidence_{match.group(1)}"
        df = load_submission_with_confidence(filepath)
        df = df.with_columns(pl.col("confidence").alias(confidence_col)).drop("confidence")
        confidence_col_names.append(confidence_col)
        dfs.append(df)

    if not dfs:
        print("❌ No valid submission files found.")
        return

    # Join all dataframes on Id + ranker_id
    df_combined = dfs[0]
    for df in dfs[1:]:
        df_combined = df_combined.join(df, on=["Id", "ranker_id"])

    # Spearman correlation between confidence columns
    corr_df = df_combined.select(confidence_col_names).to_pandas().corr(method="spearman")
    print("📊 Spearman correlation between models:")
    print(corr_df)

    # Uniqueness based on 1 - average Spearman correlation
    # (safe for one model / zero total)
    uniqueness_weights = _uniqueness_weights(corr_df)

    # Entropy of each confidence column, normalized to sum to 1 first
    entropies = []
    for col in confidence_col_names:
        probs = df_combined[col].to_numpy()
        probs = probs / np.sum(probs)
        entropies.append(compute_entropy(probs))

    entropy_weights = np.array(entropies)
    entropy_weights /= entropy_weights.sum()

    # Combine uniqueness and entropy into final weights
    alpha = 0.75  # higher = more weight on uniqueness
    combined_weights = alpha * uniqueness_weights + (1 - alpha) * entropy_weights
    combined_weights /= combined_weights.sum()

    print("\n⚖️ Final ensemble weights (uniqueness + entropy):")
    for col, weight in zip(confidence_col_names, combined_weights):
        print(f"{col}: {weight:.4f}")

    # Weighted ensemble of confidence scores
    weighted_conf = sum(
        df_combined[col].fill_null(0) * weight
        for col, weight in zip(confidence_col_names, combined_weights)
    )

    df_combined = df_combined.with_columns(weighted_conf.alias("ensemble_confidence"))

    # Ranking based on ensemble confidence (1 = highest within ranker_id)
    df_ranked = df_combined.with_columns(
        pl.col("ensemble_confidence")
        .rank("ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    final_submission = df_ranked.select(["Id", "ranker_id", "selected"])

    print("\n✅ Sample of final submission:")
    print(final_submission.head())

    final_submission.write_parquet("submission_ensemble.parquet")
    print("\n💾 Saved as: submission_ensemble.parquet")

if __name__ == "__main__":
    main()


📊 Spearman correlation between models:
                    confidence_0.52795  confidence_0.51345  \
confidence_0.52795            1.000000            0.939309   
confidence_0.51345            0.939309            1.000000   
confidence_0.51244            0.955373            0.945280   
confidence_0.52391            0.962294            0.950659   
confidence_0.48755            0.891861            0.902773   
confidence_0.52244            0.957935            0.949373   
confidence_0.49755            0.875167            0.883281   
confidence_0.52538            0.979736            0.935733   

                    confidence_0.51244  confidence_0.52391  \
confidence_0.52795            0.955373            0.962294   
confidence_0.51345            0.945280            0.950659   
confidence_0.51244            1.000000            0.937243   
confidence_0.52391            0.937243            1.000000   
confidence_0.48755            0.886153            0.899695   
confidence_0.52244            

In [None]:
%%writefile ensemble_cos_sim.py
import polars as pl
import glob
import re
import numpy as np
from numpy.linalg import norm

def load_submission_with_confidence(filepath: str) -> pl.DataFrame:
    """Read a submission CSV and return it without its 'selected' column, if any."""
    frame = pl.read_csv(filepath)
    return frame.drop("selected") if "selected" in frame.columns else frame

def cos_sim(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of ``a`` and ``b`` as a Python float.

    NaN and +/-Inf entries are treated as 0, and 1e-12 is added to each norm
    so an all-zero vector gives 0.0 instead of a division by zero.
    """
    clean_a, clean_b = (
        np.nan_to_num(vec, nan=0.0, posinf=0.0, neginf=0.0) for vec in (a, b)
    )
    denominator = (norm(clean_a) + 1e-12) * (norm(clean_b) + 1e-12)
    return float(np.dot(clean_a, clean_b) / denominator)

def main():
    """Blend all scored confidence files with cosine-uniqueness weights.

    Every ``submission_*_*_with_confidence.csv`` whose name carries a numeric
    score and which has a 'confidence' column is joined on (Id, ranker_id).
    A model's weight is proportional to 1 minus its mean absolute cosine
    similarity to the other models. The weighted confidence is ranked within
    each ranker_id and written to ``submission_ensemble.parquet``.
    """
    name_pattern = r"submission_.*_([\d.]+)_with_confidence\.csv"

    dfs = []
    confidence_col_names = []
    for filepath in glob.glob("submission_*_*_with_confidence.csv"):
        found = re.search(name_pattern, filepath)
        if found is None:
            # no numeric score in the file name: not one of ours
            continue

        frame = load_submission_with_confidence(filepath)
        if "confidence" not in frame.columns:
            continue

        # The score from the file name labels this model's confidence column
        column = f"confidence_{found.group(1)}"
        frame = frame.with_columns(pl.col("confidence").alias(column)).drop("confidence")
        confidence_col_names.append(column)
        dfs.append(frame)

    if len(dfs) == 0:
        print("❌ No valid submission files found.")
        return

    # Join every model onto the first one by Id + ranker_id
    first, *others = dfs
    df_combined = first
    for other in others:
        df_combined = df_combined.join(other, on=["Id", "ranker_id"])

    # === Cosine-uniqueness weights (label-free) ===
    cols_np = [
        np.nan_to_num(df_combined[name].to_numpy(), nan=0.0, posinf=0.0, neginf=0.0)
        for name in confidence_col_names
    ]

    m = len(cols_np)
    if m != 1:
        uniqueness = []
        for i, vec in enumerate(cols_np):
            sims = [
                abs(cos_sim(vec, other_vec))
                for j, other_vec in enumerate(cols_np)
                if j != i
            ]
            avg_sim = np.mean(sims) if sims else 0.0
            uniqueness.append(max(0.0, 1.0 - avg_sim))  # uniqueness ∈ [0,1]
        uniq = np.array(uniqueness, dtype=float)
        total = uniq.sum()
        if total > 0 and np.isfinite(total):
            combined_weights = uniq / total
        else:
            # degenerate case: fall back to equal weights
            combined_weights = np.ones(m) / m
    else:
        combined_weights = np.array([1.0])
        print("\n📌 Only one model detected → weight = 1.0")

    print("\n⚖️ Final ensemble weights (cosine-uniqueness):")
    for name, w in zip(confidence_col_names, combined_weights):
        print(f"{name}: {w:.4f}")

    # Weighted sum of the models' confidence columns
    weighted_conf = sum(
        df_combined[name].fill_null(0) * w
        for name, w in zip(confidence_col_names, combined_weights)
    )
    df_combined = df_combined.with_columns(weighted_conf.alias("ensemble_confidence"))

    # Rank within each ranker_id (1 = highest ensemble confidence)
    rank_expr = (
        pl.col("ensemble_confidence")
        .rank("ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
    )
    df_ranked = df_combined.with_columns(rank_expr.alias("selected"))

    final_submission = df_ranked.select(["Id", "ranker_id", "selected"])

    print("\n✅ Sample of final submission:")
    print(final_submission.head())

    final_submission.write_parquet("submission_ensemble.parquet")
    print("\n💾 Saved as: submission_ensemble.parquet")

if __name__ == "__main__":
    main()

Writing ensemble_cos_sim.py


In [None]:
import polars as pl
import glob
import re
import numpy as np
from numpy.linalg import norm, svd

# === Helpers ===

def load_submission_with_confidence(filepath: str) -> pl.DataFrame:
    """Read a submission CSV and return it without its 'selected' column, if any."""
    frame = pl.read_csv(filepath)
    return frame.drop("selected") if "selected" in frame.columns else frame

def cos_sim(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of ``a`` and ``b`` as a Python float.

    NaN and +/-Inf entries are treated as 0, and 1e-12 is added to each norm
    so an all-zero vector gives 0.0 instead of a division by zero.
    """
    clean_a, clean_b = (
        np.nan_to_num(vec, nan=0.0, posinf=0.0, neginf=0.0) for vec in (a, b)
    )
    denominator = (norm(clean_a) + 1e-12) * (norm(clean_b) + 1e-12)
    return float(np.dot(clean_a, clean_b) / denominator)

def compute_entropy(probabilities: np.ndarray) -> float:
    """Return the Shannon entropy (natural log) of the scores as a distribution.

    NaN and +/-Inf entries are treated as 0 and the scores are rescaled to sum
    to 1. If their sum is not a positive finite number, a uniform distribution
    is used instead. Values are clipped to [1e-12, 1.0] before taking the log.
    """
    cleaned = np.nan_to_num(probabilities, nan=0.0, posinf=0.0, neginf=0.0)
    total = cleaned.sum()
    if total > 0 and np.isfinite(total):
        dist = cleaned / total
    else:
        dist = np.ones_like(cleaned) / len(cleaned)
    dist = np.clip(dist, 1e-12, 1.0)
    return float(-np.sum(dist * np.log(dist)))

# === Main ===
def main():
    """Blend all ``*_with_confidence.csv`` submissions into one ranking.

    The per-model confidence columns are joined on ``(Id, ranker_id)``. Two
    meta-columns are derived from them (a cosine-uniqueness blend and a PC1
    consensus score). All columns are then weighted by Spearman uniqueness and
    entropy, the weighted score is ranked within each ``ranker_id`` and the
    result is written to ``submission_ensemble.parquet``.

    Returns early (after printing a message) when no usable file is found or
    the join leaves no rows.
    """
    # 1) Load all *_with_confidence.csv files with a numeric score in the name.
    #    glob returns files in filesystem order; sorting keeps the column order
    #    and the row order inherited from the first frame reproducible.
    filepaths = sorted(glob.glob("submission_*_*_with_confidence.csv"))

    dfs = []
    confidence_col_names = []
    for filepath in filepaths:
        match = re.search(r"submission_.*_([\d.]+)_with_confidence\.csv", filepath)
        if not match:
            continue
        score = match.group(1)
        df = load_submission_with_confidence(filepath)
        if "confidence" not in df.columns:
            continue
        colname = f"confidence_{score}"
        df = df.with_columns(pl.col("confidence").alias(colname)).drop("confidence")
        confidence_col_names.append(colname)
        dfs.append(df)

    if not dfs:
        print("❌ No valid files found.")
        return

    # 2) Join on Id and ranker_id
    df_combined = dfs[0]
    for df in dfs[1:]:
        df_combined = df_combined.join(df, on=["Id", "ranker_id"])

    # 3) Matrix X (n x m) from original columns
    X = df_combined.select(confidence_col_names).to_numpy()
    X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
    n, m = X.shape
    if n == 0:
        # The statistics below (SVD, min/max) are undefined on an empty frame.
        print("❌ No rows left after joining the submissions.")
        return

    # 4) Cosine-uniqueness blend → new column 'blend_cos'
    if m == 1:
        w_cos = np.array([1.0])
    else:
        uniq = []
        for i in range(m):
            sims = [abs(cos_sim(X[:, i], X[:, j])) for j in range(m) if j != i]
            avg_sim = np.mean(sims) if sims else 0.0
            uniq.append(max(0.0, 1.0 - avg_sim))
        uniq = np.array(uniq, dtype=float)
        s = uniq.sum()
        w_cos = (uniq / s) if s > 0 and np.isfinite(s) else np.ones(m) / m

    blend_cos = X @ w_cos  # true blend across all models
    df_combined = df_combined.with_columns(pl.Series("blend_cos", blend_cos))

    # 5) PCA consensus (PC1) → new column 'pc1_consensus'
    # standardize columns before SVD
    X_std = (X - X.mean(axis=0)) / (X.std(axis=0) + 1e-12)
    _, _, Vt = svd(X_std, full_matrices=False)
    v1 = Vt[0]                 # PC1 loadings (m,)
    pc1_scores = X_std @ v1    # PC1 score per row
    # Align the sign with the mean score direction. A non-finite correlation
    # (e.g. constant inputs) leaves the scores untouched instead of turning
    # every value into NaN.
    corr_with_mean = np.corrcoef(pc1_scores, X_std.mean(axis=1))[0, 1]
    if np.isfinite(corr_with_mean) and corr_with_mean < 0:
        pc1_scores = -pc1_scores
    # PC1 scores are zero-mean and unbounded. Rescale them to [0, 1] so that
    # (a) the entropy step does not normalize by a sum that is ~0 and
    # (b) the column is blended on the same scale as the confidences.
    # Min-max scaling is monotone, so the Spearman step is unaffected.
    pc1_min = pc1_scores.min()
    pc1_span = pc1_scores.max() - pc1_min
    if np.isfinite(pc1_span) and pc1_span > 0:
        pc1_scores = (pc1_scores - pc1_min) / pc1_span
    else:
        pc1_scores = np.zeros_like(pc1_scores)
    df_combined = df_combined.with_columns(pl.Series("pc1_consensus", pc1_scores))

    # 6) Columns to use for weighting
    ensemble_cols = confidence_col_names + ["blend_cos", "pc1_consensus"]

    # 7) Spearman correlation across all columns (original + new)
    corr_df = df_combined.select(ensemble_cols).to_pandas().corr(method="spearman").fillna(0.0)
    print("📊 Spearman correlation (original + blend_cos + pc1_consensus):")
    print(corr_df)

    # 8) Uniqueness weights from Spearman: 1 - average off-diagonal correlation
    ncols = corr_df.shape[0]
    if ncols == 1:
        w_uni = np.array([1.0])
    else:
        corr = corr_df.to_numpy()
        # Subtract the actual diagonal: it is 1 normally, but 0 for a constant
        # column whose NaN correlations were filled above.
        mean_corr = (corr.sum(axis=1) - np.diag(corr)) / max(ncols - 1, 1)
        uniqueness = 1.0 - mean_corr
        su = uniqueness.sum()
        w_uni = (uniqueness / su) if su > 0 and np.isfinite(su) else np.ones(ncols) / ncols

    # 9) Entropy-based weights (compute_entropy rescales each column to sum to 1)
    entropies = np.array(
        [compute_entropy(df_combined[c].to_numpy()) for c in ensemble_cols],
        dtype=float,
    )
    se = entropies.sum()
    w_ent = (entropies / se) if se > 0 and np.isfinite(se) else np.ones_like(entropies) / len(entropies)

    # 10) Final weights (alpha toward uniqueness)
    alpha = 0.75
    w_final = alpha * w_uni + (1 - alpha) * w_ent
    sf = w_final.sum()
    w_final = (w_final / sf) if sf > 0 and np.isfinite(sf) else np.ones_like(w_final) / len(w_final)

    print("\n⚖️ Final weights (including the new columns):")
    for col, w in zip(ensemble_cols, w_final):
        print(f"{col}: {w:.4f}")

    # 11) Final ensemble score
    ensemble_conf = sum(
        df_combined[c].fill_null(0) * float(w)
        for c, w in zip(ensemble_cols, w_final)
    )
    df_combined = df_combined.with_columns(ensemble_conf.alias("ensemble_confidence"))

    # 12) Rank within each ranker_id (1 = highest ensemble confidence)
    df_ranked = df_combined.with_columns(
        pl.col("ensemble_confidence")
        .rank("ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    final_submission = df_ranked.select(["Id", "ranker_id", "selected"])

    print("\n✅ Sample of final submission:")
    print(final_submission.head())

    final_submission.write_parquet("submission_ensemble.parquet")
    print("\n💾 Saved as: submission_ensemble.parquet")

if __name__ == "__main__":
    main()


📊 Spearman correlation (original + blend_cos + pc1_consensus):
                    confidence_0.48755  confidence_0.52244  \
confidence_0.48755            1.000000            0.904576   
confidence_0.52244            0.904576            1.000000   
confidence_0.52603            0.892817            0.958322   
confidence_0.51244            0.886153            0.934095   
confidence_0.51345            0.902773            0.949373   
confidence_0.52391            0.899695            0.973914   
confidence_0.52538            0.891378            0.957988   
confidence_0.52795            0.891861            0.957935   
blend_cos                     0.946820            0.977914   
pc1_consensus                 0.932055            0.980414   

                    confidence_0.52603  confidence_0.51244  \
confidence_0.48755            0.892817            0.886153   
confidence_0.52244            0.958322            0.934095   
confidence_0.52603            1.000000            0.952257   
confid

In [None]:
import polars as pl
import glob
import re
import numpy as np

def load_submission_with_confidence(filepath: str) -> pl.DataFrame:
    """Load one submission CSV without its ``selected`` rank column.

    Files that have no ``selected`` column are returned exactly as read.
    """
    table = pl.read_csv(filepath)
    if "selected" not in table.columns:
        return table
    # Only Id, ranker_id and the confidence-like column are needed downstream.
    return table.drop("selected")

def compute_entropy(probabilities: np.ndarray) -> float:
    """Return ``-sum(p * ln p)`` after clipping every entry to ``[1e-9, 1]``.

    The input is not renormalized here; callers pass values that already sum
    to 1.
    """
    clipped = np.clip(probabilities, 1e-9, 1.0)
    weighted_logs = clipped * np.log(clipped)
    return float(-np.sum(weighted_logs))

def main():
    """Blend per-model confidences and apply a Top-K agreement boost.

    Every ``submission_*_<score>_with_confidence.csv`` file contributes one
    confidence column. Models are weighted by Spearman uniqueness and entropy,
    the weighted confidence is multiplied by a boost that grows with the
    fraction of models placing the item in their Top-K, and the boosted score
    is ranked within each ``ranker_id``. The ranking is written to
    ``submission_ensemble.parquet``.

    Returns early (after printing a message) when no usable file is found.
    """
    # -------- 1) Load all submissions that contain per-item confidence --------
    # glob returns files in filesystem order; sorting keeps the column order
    # and the row order inherited from the first frame reproducible.
    filepaths = sorted(glob.glob("submission_*_*_with_confidence.csv"))

    dfs = []
    confidence_col_names = []

    for filepath in filepaths:
        # Extract a score token from filename, e.g. ..._0.52795_with_confidence.csv
        match = re.search(r"submission_.*_([\d.]+)_with_confidence\.csv", filepath)
        if not match:
            continue
        df = load_submission_with_confidence(filepath)
        if "confidence" not in df.columns:
            # Nothing to blend from this file; skip it instead of crashing.
            print(f"⚠️ Skipping {filepath}: no 'confidence' column.")
            continue
        # Rename 'confidence' to a unique column per model
        conf_col = f"confidence_{match.group(1)}"
        df = df.with_columns(pl.col("confidence").alias(conf_col)).drop("confidence")
        confidence_col_names.append(conf_col)
        dfs.append(df)

    if not dfs:
        print("❌ No valid submission files found.")
        return

    # -------- 2) Join on (Id, ranker_id) to align items across models --------
    df_combined = dfs[0]
    for df in dfs[1:]:
        df_combined = df_combined.join(df, on=["Id", "ranker_id"])

    # -------- 3) Compute model-to-model Spearman (global, on confidence) --------
    corr_df = df_combined.select(confidence_col_names).to_pandas().corr(method="spearman")
    print("📊 Spearman correlation between models:")
    print(corr_df)

    # Uniqueness weight: 1 - average Spearman correlation (row-wise, excluding diagonal)
    n_models = len(confidence_col_names)
    uniform = np.ones(n_models) / n_models
    if n_models == 1:
        # A single model has no "other" models; avoid the 0/0 below.
        uniqueness_weights = uniform
    else:
        # NaN correlations (constant columns) are treated as "uncorrelated".
        corr = corr_df.fillna(0.0).to_numpy()
        mean_corr = (corr.sum(axis=1) - np.diag(corr)) / (n_models - 1)
        model_uniqueness = 1.0 - mean_corr
        total_uniqueness = model_uniqueness.sum()
        if total_uniqueness > 0 and np.isfinite(total_uniqueness):
            uniqueness_weights = model_uniqueness / total_uniqueness
        else:
            uniqueness_weights = uniform

    # -------- 4) Entropy per model (global) as informativeness proxy --------
    entropies = []
    for col in confidence_col_names:
        # Nulls come back as NaN from to_numpy(); zero them so one missing
        # value cannot turn every weight into NaN.
        probs = np.nan_to_num(
            df_combined[col].to_numpy().astype(float), nan=0.0, posinf=0.0, neginf=0.0
        )
        total = probs.sum()
        if total > 0:
            probs = probs / total
        else:
            probs = np.full(len(probs), 1.0 / max(len(probs), 1))
        entropies.append(compute_entropy(probs))

    entropy_weights = np.array(entropies, dtype=float)
    entropy_total = entropy_weights.sum()
    if entropy_total > 0 and np.isfinite(entropy_total):
        entropy_weights = entropy_weights / entropy_total
    else:
        entropy_weights = uniform

    # -------- 5) Combine weights (same strategy as before) --------
    alpha = 0.7  # higher -> more weight on uniqueness
    combined_weights = alpha * uniqueness_weights + (1 - alpha) * entropy_weights
    combined_weights = combined_weights / combined_weights.sum()

    print("\n⚖️ Final ensemble weights (uniqueness + entropy):")
    for col, w in zip(confidence_col_names, combined_weights):
        print(f"{col}: {w:.4f}")

    # -------- 6) Weighted confidence ensemble (same as before) --------
    weighted_conf = sum(
        df_combined[col].fill_null(0) * float(w)
        for col, w in zip(confidence_col_names, combined_weights)
    )
    df_combined = df_combined.with_columns(weighted_conf.alias("ensemble_confidence"))

    # -------- 7) Top-K agreement boosting (K=3) --------
    # Agreement = fraction of models that place the item in Top-K of its group
    K = 3           # Top-K window
    lam = 0.2      # boost strength (try 0.1–0.3)
    cap = 0.25     # maximum additional boost (25%)

    # Build per-model ranks and binary Top-K votes per group
    vote_cols = []
    df_boost = df_combined.clone()
    for c in confidence_col_names:
        rank_col = f"rank_{c}"
        vote_col = f"vote_{c}"
        df_boost = df_boost.with_columns(
            pl.col(c).rank("ordinal", descending=True).over("ranker_id").alias(rank_col)
        ).with_columns(
            (pl.col(rank_col) <= K).cast(pl.Float64).alias(vote_col)
        )
        vote_cols.append(vote_col)

    # Agreement (vote_rate): fraction of models with rank <= K
    M = float(len(confidence_col_names))
    df_boost = df_boost.with_columns(
        (sum(pl.col(v) for v in vote_cols) / M).alias("agreement")
    )

    # Boost factor: 1 + lam * agreement, clamped to [1, 1 + cap]
    df_boost = df_boost.with_columns(
        (1.0 + lam * pl.col("agreement")).clip(1.0, 1.0 + cap).alias("boost")
    )

    # Apply boost to ensemble confidence
    df_boost = df_boost.with_columns(
        (pl.col("ensemble_confidence") * pl.col("boost")).alias("ensemble_confidence_boosted")
    )

    # -------- 8) Final ranking per group (using boosted score) --------
    df_ranked = df_boost.with_columns(
        pl.col("ensemble_confidence_boosted")
        .rank("ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    final_submission = df_ranked.select(["Id", "ranker_id", "selected"])

    print("\n✅ Sample of final submission:")
    print(final_submission.head())

    final_submission.write_parquet("submission_ensemble.parquet")
    print("\n💾 Saved as: submission_ensemble.parquet")

if __name__ == "__main__":
    main()

📊 Spearman correlation between models:
                    confidence_0.52795  confidence_0.51244  \
confidence_0.52795            1.000000            0.955373   
confidence_0.51244            0.955373            1.000000   
confidence_0.52244            0.957935            0.934095   
confidence_0.48755            0.891861            0.886153   

                    confidence_0.52244  confidence_0.48755  
confidence_0.52795            0.957935            0.891861  
confidence_0.51244            0.934095            0.886153  
confidence_0.52244            1.000000            0.904576  
confidence_0.48755            0.904576            1.000000  

⚖️ Final ensemble weights (uniqueness + entropy):
confidence_0.52795: 0.2201
confidence_0.51244: 0.2421
confidence_0.52244: 0.2265
confidence_0.48755: 0.3114

✅ Sample of final submission:
shape: (5, 3)
┌──────────┬─────────────────────────────────┬──────────┐
│ Id       ┆ ranker_id                       ┆ selected │
│ ---      ┆ ---         

In [None]:
%%writefile ensemble.py
import polars as pl
import glob
import re
import numpy as np

def load_submission_with_confidence(filepath: str, alpha: float = 0.7, k: int = 5) -> pl.DataFrame:
    """Load a submission CSV and blend its confidence with a rank-based score.

    When the file has a ``selected`` rank column, ``confidence`` is replaced by
    ``alpha * confidence + (1 - alpha) * rrf``, where
    ``rrf = (1 / (k + selected)) / (1 / (k + 1))`` so that rank 1 maps to 1.0.
    The ``selected`` column is dropped afterwards. Files without ``selected``
    are returned exactly as read.
    """
    frame = pl.read_csv(filepath)
    if "selected" not in frame.columns:
        return frame

    # Reciprocal-rank score of the top position, used as the normalizer.
    top_rrf = 1.0 / (k + 1)
    return (
        frame
        .with_columns(
            ((1.0 / (k + pl.col("selected"))) / top_rrf).alias("rrf_score")
        )
        .with_columns(
            (alpha * pl.col("confidence") + (1 - alpha) * pl.col("rrf_score")).alias("confidence")
        )
        .drop(["selected", "rrf_score"])
    )

def compute_entropy(probabilities: np.ndarray) -> float:
    """Entropy-style sum ``-Σ p·ln(p)`` over values clipped to ``[1e-9, 1]``.

    No renormalization is done here; the caller divides by the column sum
    before calling.
    """
    bounded = np.clip(probabilities, 1e-9, 1.0)
    log_terms = np.log(bounded)
    return float(-np.sum(bounded * log_terms))

def main():
    """Blend RRF-adjusted confidences and apply a Top-K agreement boost.

    Every ``submission_*_<score>_with_confidence.csv`` file contributes one
    confidence column (already mixed with its rank by the loader). Models are
    weighted by Spearman uniqueness and entropy, the weighted confidence is
    multiplied by a boost that grows with the fraction of models placing the
    item in their Top-K, and the boosted score is ranked within each
    ``ranker_id``. The ranking is written to ``submission_ensemble.parquet``.

    Returns early (after printing a message) when no usable file is found.
    """
    # -------- 1) Load all submissions that contain per-item confidence --------
    # glob returns files in filesystem order; sorting keeps the column order
    # and the row order inherited from the first frame reproducible.
    filepaths = sorted(glob.glob("submission_*_*_with_confidence.csv"))

    dfs = []
    confidence_col_names = []

    for filepath in filepaths:
        # Extract a score token from filename, e.g. ..._0.52795_with_confidence.csv
        match = re.search(r"submission_.*_([\d.]+)_with_confidence\.csv", filepath)
        if not match:
            continue
        df = load_submission_with_confidence(filepath, alpha=0.7, k=5)
        if "confidence" not in df.columns:
            # Nothing to blend from this file; skip it instead of crashing.
            print(f"⚠️ Skipping {filepath}: no 'confidence' column.")
            continue
        conf_col = f"confidence_{match.group(1)}"
        df = df.with_columns(pl.col("confidence").alias(conf_col)).drop("confidence")
        confidence_col_names.append(conf_col)
        dfs.append(df)

    if not dfs:
        print("❌ No valid submission files found.")
        return

    # -------- 2) Join on (Id, ranker_id) to align items across models --------
    df_combined = dfs[0]
    for df in dfs[1:]:
        df_combined = df_combined.join(df, on=["Id", "ranker_id"])

    # -------- 3) Compute model-to-model Spearman (global, on confidence) --------
    corr_df = df_combined.select(confidence_col_names).to_pandas().corr(method="spearman")
    print("📊 Spearman correlation between models:")
    print(corr_df)

    # Uniqueness weight: 1 - average Spearman correlation (row-wise, excluding diagonal)
    n_models = len(confidence_col_names)
    uniform = np.ones(n_models) / n_models
    if n_models == 1:
        # A single model has no "other" models; avoid the 0/0 below.
        uniqueness_weights = uniform
    else:
        # NaN correlations (constant columns) are treated as "uncorrelated".
        corr = corr_df.fillna(0.0).to_numpy()
        mean_corr = (corr.sum(axis=1) - np.diag(corr)) / (n_models - 1)
        model_uniqueness = 1.0 - mean_corr
        total_uniqueness = model_uniqueness.sum()
        if total_uniqueness > 0 and np.isfinite(total_uniqueness):
            uniqueness_weights = model_uniqueness / total_uniqueness
        else:
            uniqueness_weights = uniform

    # -------- 4) Entropy per model (global) as informativeness proxy --------
    entropies = []
    for col in confidence_col_names:
        # Nulls come back as NaN from to_numpy(); zero them so one missing
        # value cannot turn every weight into NaN.
        probs = np.nan_to_num(
            df_combined[col].to_numpy().astype(float), nan=0.0, posinf=0.0, neginf=0.0
        )
        total = probs.sum()
        if total > 0:
            probs = probs / total
        else:
            probs = np.full(len(probs), 1.0 / max(len(probs), 1))
        entropies.append(compute_entropy(probs))

    entropy_weights = np.array(entropies, dtype=float)
    entropy_total = entropy_weights.sum()
    if entropy_total > 0 and np.isfinite(entropy_total):
        entropy_weights = entropy_weights / entropy_total
    else:
        entropy_weights = uniform

    # -------- 5) Combine weights (same strategy as before) --------
    alpha = 0.75  # higher -> more weight on uniqueness
    combined_weights = alpha * uniqueness_weights + (1 - alpha) * entropy_weights
    combined_weights = combined_weights / combined_weights.sum()

    print("\n⚖️ Final ensemble weights (uniqueness + entropy):")
    for col, w in zip(confidence_col_names, combined_weights):
        print(f"{col}: {w:.4f}")

    # -------- 6) Weighted confidence ensemble --------
    weighted_conf = sum(
        df_combined[col].fill_null(0) * float(w)
        for col, w in zip(confidence_col_names, combined_weights)
    )
    df_combined = df_combined.with_columns(weighted_conf.alias("ensemble_confidence"))

    # -------- 7) Top-K agreement boosting (K=3) --------
    # Agreement = fraction of models that place the item in Top-K of its group
    K = 3
    lam = 0.35
    cap = 0.45

    vote_cols = []
    df_boost = df_combined.clone()
    for c in confidence_col_names:
        rank_col = f"rank_{c}"
        vote_col = f"vote_{c}"
        df_boost = df_boost.with_columns(
            pl.col(c).rank("ordinal", descending=True).over("ranker_id").alias(rank_col)
        ).with_columns(
            (pl.col(rank_col) <= K).cast(pl.Float64).alias(vote_col)
        )
        vote_cols.append(vote_col)

    M = float(len(confidence_col_names))
    df_boost = df_boost.with_columns(
        (sum(pl.col(v) for v in vote_cols) / M).alias("agreement")
    )

    # Boost factor: 1 + lam * agreement, clamped to [1, 1 + cap]
    df_boost = df_boost.with_columns(
        (1.0 + lam * pl.col("agreement")).clip(1.0, 1.0 + cap).alias("boost")
    )

    df_boost = df_boost.with_columns(
        (pl.col("ensemble_confidence") * pl.col("boost")).alias("ensemble_confidence_boosted")
    )

    # -------- 8) Final ranking --------
    df_ranked = df_boost.with_columns(
        pl.col("ensemble_confidence_boosted")
        .rank("ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    final_submission = df_ranked.select(["Id", "ranker_id", "selected"])

    print("\n✅ Sample of final submission:")
    print(final_submission.head())

    final_submission.write_parquet("submission_ensemble.parquet")
    print("\n💾 Saved as: submission_ensemble.parquet")

if __name__ == "__main__":
    main()


Writing ensemble.py


In [None]:
# -*- coding: utf-8 -*-
import os
import re
import glob
import numpy as np
import polars as pl
import pandas as pd

# =========================
# ======== Config =========
# =========================

# Raw submissions to aggregate (Id, ranker_id, selected).
# Files that do not exist are reported and skipped by main().
RAW_SUBMISSIONS = [
    "submission_20250721083807_0.52244.parquet",
    "submission_20250724032338_0.51345.parquet",
    "submission_20250725083055_0.52391.parquet",
    "submission_20250727084025_0.52795.parquet",
    "submission_20250802074816_0.52603.parquet",
    "submission_20250804001151_0.51244.parquet",
    "submission_20250807032439_0.52538.parquet",
    "submission_dl_ranker_0.48755.parquet",
]

# Which aggregators to compute and blend (each flag adds one agg_* column)
USE_RRF      = True
USE_BORDA    = True
USE_SOFTMAX  = True

# RRF and Softmax parameters:
#   RRF_K is the k in 1 / (k + rank); TAU is the divisor in exp(-rank / tau).
RRF_K = 5
TAU   = 0.7

# Blend the aggregators with min-variance weights? (else pick one by name)
# Blending only happens when more than one aggregator is enabled.
BLEND_AGGREGATORS = True
# If not blending, choose one: "rrf" | "borda" | "softmax"
SINGLE_AGGREGATOR = "rrf"

# Correlation shrinkage for min-variance meta-weights
# (fraction of the identity matrix mixed into the correlation matrix)
CORR_SHRINKAGE_LAMBDA = 0.2

# Normalize final score per group before ranking?
NORMALIZE_FINAL_PER_GROUP = True

# Deterministic tie-break by Id?
DETERMINISTIC_TIE_BREAK = True

# Output
OUTPUT_PARQUET = "submission_ensemble_rank_agg.parquet"
SAVE_AGG_WEIGHTS = True   # save aggregator weights if blending

# =========================
# ===== Utilities =========
# =========================

def mv_weights_from_corr(corr: np.ndarray, lam: float = 0.2) -> np.ndarray:
    """
    Minimum-variance weights from a correlation matrix (with shrinkage).

    The matrix is shrunk toward the identity, ``(1 - lam) * corr + lam * I``,
    and the weights are ``w ∝ Σ^{-1} 1``, clipped to be non-negative and
    normalized to sum to 1.

    Non-finite entries of ``corr`` (pandas' ``.corr()`` returns NaN for
    constant columns) are replaced by the matching identity entry, i.e. 1 on
    the diagonal and 0 elsewhere, so they cannot reach the pseudo-inverse.

    Args:
        corr: square (p, p) correlation matrix.
        lam: shrinkage intensity toward the identity.

    Returns:
        Array of p weights summing to 1. Uniform weights are returned when the
        min-variance solution is degenerate.
    """
    corr = np.asarray(corr, dtype=float)
    p = corr.shape[0]
    if p == 0:
        return np.ones(0)
    uniform = np.full(p, 1.0 / p)

    identity = np.eye(p)
    bad = ~np.isfinite(corr)
    if bad.any():
        corr = np.where(bad, identity, corr)

    corr_shr = (1 - lam) * corr + lam * identity
    inv = np.linalg.pinv(corr_shr)
    raw = inv @ np.ones(p)          # Σ^{-1} 1
    denom = float(raw.sum())        # 1ᵀ Σ^{-1} 1
    if not np.isfinite(denom) or denom <= 0:
        return uniform

    # Negative weights are dropped, then the rest is renormalized.
    w = np.clip(raw / denom, 0, None)
    s = w.sum()
    return w / s if s > 0 else uniform

def safe_minmax_norm(expr: pl.Expr, over_col: str) -> pl.Expr:
    """Min-max scale ``expr`` within each ``over_col`` group.

    A tiny constant is added to the range so that a group whose values are all
    equal yields 0 rather than dividing by zero.
    """
    lowest = expr.min().over(over_col)
    span = expr.max().over(over_col) - lowest + 1e-12
    return (expr - lowest) / span

def read_submission(path: str) -> pl.DataFrame:
    """Load a submission file, choosing the reader from the file suffix.

    Supports ``.parquet`` and ``.csv``; any other suffix raises ``ValueError``.
    """
    readers = ((".parquet", pl.read_parquet), (".csv", pl.read_csv))
    for suffix, reader in readers:
        if path.endswith(suffix):
            return reader(path)
    raise ValueError(f"Unsupported file: {path}")

# =========================
# === Rank Aggregators ====
# =========================

def agg_rrf(df: pl.DataFrame, rank_cols: list[str], rrf_k: int) -> pl.Expr:
    """Reciprocal-rank fusion: the sum over models of ``1 / (rrf_k + rank)``.

    Returns an expression aliased ``agg_rrf``; ``df`` itself is not used.
    """
    terms = (1.0 / (rrf_k + pl.col(name)) for name in rank_cols)
    total = next(terms, None)
    for term in terms:
        total = total + term
    return total.alias("agg_rrf")

def agg_borda(df: pl.DataFrame, rank_cols: list[str], group_col: str = "ranker_id") -> pl.Expr:
    """
    Borda count: for each model, points = (max_rank + 1 - rank), summed over
    all models. Normalization per group, if wanted, is done later.

    ``max_rank`` is the largest rank found anywhere in the item's group,
    across all rank columns, so every item in a group is scored against the
    same baseline. Taking the maximum per row instead would score the spread
    between models: an item ranked first by every model and one ranked last by
    every model would receive the same points.

    Args:
        df: unused; kept for a signature parallel to the other aggregators.
        rank_cols: names of the per-model rank columns (1 = best).
        group_col: column identifying the ranking group.

    Returns:
        Expression aliased ``agg_borda`` (higher is better).

    Raises:
        ValueError: if ``rank_cols`` is empty.
    """
    if not rank_cols:
        raise ValueError("agg_borda needs at least one rank column")

    # Largest rank over all models and all rows of the group.
    max_rank = (
        pl.max_horizontal([pl.col(c) for c in rank_cols]).max().over(group_col)
    )

    # sum points
    expr = None
    for c in rank_cols:
        points_c = max_rank + 1 - pl.col(c)
        expr = points_c if expr is None else (expr + points_c)
    return expr.alias("agg_borda")

def agg_softmax(df: pl.DataFrame, rank_cols: list[str], tau: float) -> pl.Expr:
    """Exponential rank score: the sum over models of ``exp(-rank / tau)``.

    Returns an expression aliased ``agg_softmax``; ``df`` itself is not used.
    """
    decays = [(-pl.col(name) / tau).exp() for name in rank_cols]
    total = decays[0] if decays else None
    for decay in decays[1:]:
        total = total + decay
    return total.alias("agg_softmax")

# =========================
# ========= Main ==========
# =========================

def main():
    """Aggregate the raw rank submissions into one ranking and save it.

    Reads every existing file in RAW_SUBMISSIONS, joins them on
    (Id, ranker_id), computes the enabled rank aggregators, blends them (or
    picks one) into ``ensemble_score``, ranks that score within each
    ``ranker_id`` and writes the result to OUTPUT_PARQUET.

    Returns early (after printing a message) when no input file exists or no
    aggregator is enabled. Raises ValueError when a single aggregator is
    requested that is unknown or was not computed.
    """
    # 1) Read all raw submissions and rename their 'selected' to a per-model column
    dfs = []
    model_cols = []
    for path in RAW_SUBMISSIONS:
        if not os.path.exists(path):
            print(f"❌ File not found: {path}")
            continue
        df = read_submission(path)
        # derive a tag from filename to name the column
        # (the trailing decimal score if present, else the file stem)
        m = re.search(r"submission_.*?_(\d+\.\d+)\.(?:parquet|csv)$", os.path.basename(path))
        tag = m.group(1) if m else os.path.splitext(os.path.basename(path))[0]
        colname = f"rank_{tag}"
        # keep only necessary cols; rename 'selected' -> colname
        df = df.select(["Id", "ranker_id", "selected"]).rename({"selected": colname})
        dfs.append(df)
        model_cols.append(colname)

    if not dfs:
        print("❌ No valid raw submissions found.")
        return

    # 2) Join all on (Id, ranker_id); the inner join keeps only rows present in every file
    df = dfs[0]
    for d in dfs[1:]:
        df = df.join(d, on=["Id", "ranker_id"], how="inner")

    # 3) Compute aggregators (as expressions), then materialize
    agg_exprs = []
    agg_names = []

    if USE_RRF:
        agg_exprs.append(agg_rrf(df, model_cols, RRF_K))
        agg_names.append("agg_rrf")

    if USE_BORDA:
        agg_exprs.append(agg_borda(df, model_cols))
        agg_names.append("agg_borda")

    if USE_SOFTMAX:
        agg_exprs.append(agg_softmax(df, model_cols, TAU))
        agg_names.append("agg_softmax")

    if not agg_exprs:
        print("❌ No aggregators selected.")
        return

    df = df.with_columns(agg_exprs)

    # 4) Optionally blend aggregators with min-variance weights (computed from Spearman corr)
    if BLEND_AGGREGATORS and len(agg_names) > 1:
        # Normalize each aggregator per group so scales are compatible
        norm_cols = []
        for name in agg_names:
            norm_col = f"{name}_norm"
            df = df.with_columns(safe_minmax_norm(pl.col(name), "ranker_id").alias(norm_col))
            norm_cols.append(norm_col)

        # Correlation across all rows (Spearman) between normalized aggregators
        pdf = df.select(norm_cols).to_pandas()
        corr_df = pdf.corr(method="spearman")
        print("📊 Spearman between aggregators:\n", corr_df, "\n")

        corr = corr_df.values
        w = mv_weights_from_corr(corr, lam=CORR_SHRINKAGE_LAMBDA)

        if SAVE_AGG_WEIGHTS:
            pd.Series(w, index=agg_names, name="weight").to_csv("agg_blend_weights.csv", header=True)
            print("📝 Saved aggregator blend weights to agg_blend_weights.csv")

        # Blend: sum_j w_j * norm_agg_j
        blended_expr = None
        for name, norm_col, ww in zip(agg_names, norm_cols, w):
            term = pl.col(norm_col) * float(ww)
            blended_expr = term if blended_expr is None else (blended_expr + term)

        df = df.with_columns(blended_expr.alias("ensemble_score"))

    else:
        # Use a single aggregator
        # (also reached when blending is on but only one aggregator is enabled)
        pick = SINGLE_AGGREGATOR.lower()
        if pick not in {"rrf", "borda", "softmax"}:
            raise ValueError("SINGLE_AGGREGATOR must be one of: 'rrf', 'borda', 'softmax'")
        pick_col = f"agg_{pick}"
        if pick_col not in df.columns:
            raise ValueError(f"Aggregator {pick_col} not computed; enable it or change SINGLE_AGGREGATOR.")
        df = df.with_columns(pl.col(pick_col).alias("ensemble_score"))

    # 5) Optional final groupwise normalization
    if NORMALIZE_FINAL_PER_GROUP:
        df = df.with_columns(
            safe_minmax_norm(pl.col("ensemble_score"), "ranker_id").alias("ensemble_score")
        )

    # 6) Final ranking within each ranker_id
    if DETERMINISTIC_TIE_BREAK:
        # Sort first so rows with equal scores appear in ascending Id order
        # before the ordinal rank is taken (the intended tie-break).
        # Note: the sort also changes the row order of the saved output.
        df = (
            df.sort(["ranker_id", "ensemble_score", "Id"], descending=[False, True, False])
              .with_columns(
                  pl.col("ensemble_score")
                  .rank("ordinal", descending=True)
                  .over("ranker_id")
                  .cast(pl.Int32)
                  .alias("selected")
              )
        )
    else:
        df = df.with_columns(
            pl.col("ensemble_score")
            .rank("ordinal", descending=True)
            .over("ranker_id")
            .cast(pl.Int32)
            .alias("selected")
        )

    final_submission = df.select(["Id", "ranker_id", "selected"])
    print("✅ Sample final submission:")
    print(final_submission.head())

    final_submission.write_parquet(OUTPUT_PARQUET)
    print(f"💾 Saved as: {OUTPUT_PARQUET}")

if __name__ == "__main__":
    main()


📊 Spearman between aggregators:
                   agg_rrf_norm  agg_borda_norm  agg_softmax_norm
agg_rrf_norm          1.000000        0.253933          0.917329
agg_borda_norm        0.253933        1.000000          0.161572
agg_softmax_norm      0.917329        0.161572          1.000000 

📝 Saved aggregator blend weights to agg_blend_weights.csv
✅ Sample final submission:
shape: (5, 3)
┌──────────┬─────────────────────────────────┬──────────┐
│ Id       ┆ ranker_id                       ┆ selected │
│ ---      ┆ ---                             ┆ ---      │
│ i64      ┆ str                             ┆ i32      │
╞══════════╪═════════════════════════════════╪══════════╡
│ 18952395 ┆ 0001b08669de43deb3606f6764f1b4… ┆ 1        │
│ 18952394 ┆ 0001b08669de43deb3606f6764f1b4… ┆ 2        │
│ 18952396 ┆ 0001b08669de43deb3606f6764f1b4… ┆ 3        │
│ 24082569 ┆ 0002979a2bf046d99d0ddc79e924cf… ┆ 1        │
│ 24082567 ┆ 0002979a2bf046d99d0ddc79e924cf… ┆ 2        │
└──────────┴──────────────

In [None]:
import polars as pl

# Read the CSV file
df = pl.read_csv("submission_ensemble.csv")

# Write the same data in Parquet format
df.write_parquet("submission_ensemble.parquet")

print("💾 Conversie completă: submission_ensemble.parquet")


💾 Conversie completă: submission_ensemble.parquet


In [None]:
import polars as pl

def load_submission_with_confidence(tag):
    """Read ``./ensemble/submission_<tag>_with_confidence.parquet`` without ``selected``.

    Unlike a conditional drop, this fails if the file has no ``selected`` column.
    """
    source = f"./ensemble/submission_{tag}_with_confidence.parquet"
    return pl.read_parquet(source).drop("selected")

# One frame per model, in the order of the mapping below.
dfs = []

# Maps a submission tag (the part of the file name between "submission_" and
# "_with_confidence") to [flag, score]. The score only labels the column.
# NOTE(review): the first element is unpacked as `weight` below but never used,
# so entries flagged 0 are still loaded and blended — confirm this is intended.
timetag_score = {
    # "20250716132706": [0, 0.51381],
    # "20250718083308": [0, 0.51822],
    # "20250719002505": [0, 0.51859],
    # "20250720003111": [0, "0.50693/lgb"],
    "20250721025740": [0, 0.51960],
    "20250722050939": [1, 0.52070],
    "20250721083807": [1, 0.52244],
    "20250724032338": [1, "0.51345/lgb"],
    # "20250725040223": [1, 0.52309],
    "20250725083055": [1, 0.52391],
    "20250727084025": [1, 0.52795],
    "20250728094305": [1, 0.52492],
    "20250729084249": [1, 0.51822],
    # "dl_ranker": [1, 0.48755],
    "0.49242": [1, 0.49242],
}

# Load each submission and rename its "confidence" column to "confidence_<score>".
for timetag, score_list in timetag_score.items():
    df = load_submission_with_confidence(timetag)
    weight, score = score_list
    df = df.with_columns(
        (pl.col("confidence")).alias(f"confidence_{score}")
    ).drop("confidence")
    dfs.append(df)

# Join all frames on (Id, ranker_id) so each row carries every model's confidence.
df_combined = dfs[0]
for i in range(1, len(dfs)):
    df_combined = df_combined.join(dfs[i], on=["Id", "ranker_id"])

print(df_combined)

In [None]:
# Select the per-model columns by their "confidence" name prefix.
confidence_cols = [col for col in df_combined.columns if col.startswith("confidence")]
# Pairwise Spearman (rank) correlation between the models' confidence columns.
corr_df = df_combined.select(confidence_cols).to_pandas().corr(method="spearman")
print(corr_df)

In [None]:
# Mean correlation of each model with the others: row sum minus the diagonal 1,
# divided by the number of other models.
# NOTE(review): with a single model, len(row) - 1 is 0 and this divides by zero.
mean_corr = corr_df.apply(lambda row: (row.sum() - 1) / (len(row) - 1), axis=1)

# Models that correlate less with the rest get a higher uniqueness score.
model_uniqueness = 1 - mean_corr

# Normalize the uniqueness scores so the weights sum to 1.
weights = model_uniqueness / model_uniqueness.sum()
print(weights)

In [None]:
# Weighted sum of the model confidences; nulls count as 0.
# confidence_cols and weights are paired by position.
weighted_conf = sum(
    df_combined[f].fill_null(0) * w
    for f, w in zip(confidence_cols, weights.values)
)

# Store the blended score next to the per-model columns.
df_combined = df_combined.with_columns(weighted_conf.alias("ensemble_confidence"))

In [None]:
# Rank items within each ranker_id by descending ensemble confidence
# (1 = highest); "ordinal" gives every row a distinct rank.
df_ranked = df_combined.with_columns(
    [
        pl.col("ensemble_confidence")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    ]
)

# Load original order from one of the submissions
df_original = dfs[0].select(["Id", "ranker_id"])

# Join and keep only required columns in original order
# (left join: every row of the first submission is kept)
final_submission = df_original.join(
    df_ranked.select(["Id", "ranker_id", "selected"]),
    on=["Id", "ranker_id"],
    how="left",
)
print(final_submission.head())

In [None]:
import polars as pl
import os

# 🎯 Weights for each model (keyed by the confidence column name)
weights = {
    "confidence_0.52244":       0.108858,
    "confidence_0.51345/lgb":   0.131961,
    "confidence_0.52391":       0.106060,
    "confidence_0.52795":       0.099680,
    "confidence_0.52603":       0.099481,
    "confidence_0.52474":       0.102036,
    "confidence_0.51244/lgb":   0.132210,
    "confidence_0.48755/0.49242":       0.219714,
}


# 📁 Input files mapped to confidence column names
file_to_confcol = {
    "submission_20250721083807_0.52244_with_confidence.csv": "confidence_0.52244",
    "submission_20250724032338_0.51345_with_confidence.csv": "confidence_0.51345/lgb",
    "submission_20250725083055_0.52391_with_confidence.csv": "confidence_0.52391",
    "submission_20250727084025_0.52795_with_confidence.csv": "confidence_0.52795",
    "submission_20250802074816_0.52603_with_confidence.csv": "confidence_0.52603",
    "submission_20250803063026_0.52474_with_confidence.csv": "confidence_0.52474",
    "submission_20250804001151_0.51244_with_confidence.csv": "confidence_0.51244/lgb",
    "submission_dl_ranker_weighted_boosted_normalized.csv": "confidence_0.48755/0.49242"
}

# 📥 Load and rename confidence columns
dfs = []
loaded_cols = []  # columns that actually made it into the join
for file, colname in file_to_confcol.items():
    if not os.path.exists(file):
        print(f"❌ File not found: {file}")
        continue
    df = pl.read_csv(file)
    if "selected" in df.columns:
        df = df.drop("selected")
    df = df.with_columns(pl.col("confidence").alias(colname)).drop("confidence")
    dfs.append(df)
    loaded_cols.append(colname)

if not dfs:
    # Fail with a clear message instead of an IndexError on dfs[0].
    raise FileNotFoundError("None of the configured submission files were found.")

# 🔗 Join all DataFrames
df_combined = dfs[0]
for df in dfs[1:]:
    df_combined = df_combined.join(df, on=["Id", "ranker_id"])

# 🧮 Compute ensemble confidence
# Only blend models whose file was loaded; a skipped file must not lead to a
# missing-column error here. Ranking is unaffected by the weights no longer
# summing to 1, since only the order within a group matters.
confidence_cols = [col for col in weights if col in loaded_cols]
skipped_cols = [col for col in weights if col not in loaded_cols]
if skipped_cols:
    print(f"⚠️ Blending without missing models: {skipped_cols}")
if not confidence_cols:
    raise ValueError("No loaded confidence column has an entry in `weights`.")
df_combined = df_combined.with_columns(
    sum(df_combined[col] * weights[col] for col in confidence_cols).alias("ensemble_confidence")
)

# 🏁 Rank and save result (1 = highest ensemble confidence within a ranker_id)
df_output = df_combined.with_columns(
    pl.col("ensemble_confidence")
    .rank("ordinal", descending=True)
    .over("ranker_id")
    .cast(pl.Int32)
    .alias("selected")
).select(["Id", "ranker_id", "selected"])

df_output.write_parquet("submission_weighted_ensemble.parquet")
print("📦 Saved unboosted ensemble → submission_weighted_ensemble.parquet")


📦 Saved unboosted ensemble → submission_weighted_ensemble.parquet


In [None]:
import polars as pl
import os
from collections import Counter, defaultdict

# 🎯 Weights for each model
weights = {
    "confidence_0.51960": 0.089680,
    "confidence_0.52070": 0.087749,
    "confidence_0.52244": 0.089308,
    "confidence_0.51345/lgb": 0.129152,
    "confidence_0.52391": 0.093028,
    "confidence_0.52795": 0.097278,
    "confidence_0.52492": 0.094761,
    "confidence_0.51822": 0.101181,
    "confidence_0.48755": 0.217863,
}

# 📁 Input files mapped to confidence column names
file_to_confcol = {
    "submission_20250721025740_0.51960_with_confidence.csv": "confidence_0.51960",
    "submission_20250722050939_0.52070_with_confidence.csv": "confidence_0.52070",
    "submission_20250721083807_0.52244_with_confidence.csv": "confidence_0.52244",
    "submission_20250724032338_0.51345_with_confidence.csv": "confidence_0.51345/lgb",
    "submission_20250725083055_0.52391_with_confidence.csv": "confidence_0.52391",
    "submission_20250727084025_0.52795_with_confidence.csv": "confidence_0.52795",
    "submission_20250728094305_0.52492_with_confidence.csv": "confidence_0.52492",
    "submission_20250729084249_0.51822_with_confidence.csv": "confidence_0.51822",
    "submission_dl_ranker_0.48755_with_confidence.csv": "confidence_0.48755",
}

# 🚀 Boost factors keyed by how many models place an Id in their top 3
boost_map = {
    2: 1.30,
    3: 1.40,
    4: 1.45,
    5: 1.50,
    6: 1.55,
    7: 1.60,
    8: 1.65,
    9: 1.70,
}

# 📥 Load CSVs and rename confidence column
dfs = []
for file, colname in file_to_confcol.items():
    if not os.path.exists(file):
        print(f"❌ File not found: {file}")
        continue
    df = pl.read_csv(file)
    # Each file carries its own 'selected' ranking; drop it so joins do not collide.
    if "selected" in df.columns:
        df = df.drop("selected")
    df = df.with_columns(pl.col("confidence").alias(colname)).drop("confidence")
    dfs.append(df)

# Fail with a clear message instead of an IndexError on dfs[0] below.
if not dfs:
    raise FileNotFoundError("No *_with_confidence.csv input files were found; nothing to ensemble.")

# 🔗 Join all dataframes on Id and ranker_id
df_combined = dfs[0]
for df in dfs[1:]:
    df_combined = df_combined.join(df, on=["Id", "ranker_id"])

# Missing files are skipped above, so only weights whose column was actually
# loaded can be used; referencing the others would raise a column-not-found error.
confidence_cols = [col for col in weights if col in df_combined.columns]
missing_cols = [col for col in weights if col not in df_combined.columns]
if missing_cols:
    print(f"⚠️ Ignoring weights for models that were not loaded: {missing_cols}")
if not confidence_cols:
    raise ValueError("None of the weighted confidence columns are present in the loaded files.")

# 🧮 Compute ensemble confidence (before boosting)
df_combined = df_combined.with_columns(
    sum(df_combined[col] * weights[col] for col in confidence_cols).alias("ensemble_confidence")
)

# 💾 Save unboosted version
df_combined.with_columns(
    pl.col("ensemble_confidence")
    .rank("ordinal", descending=True)
    .over("ranker_id")
    .cast(pl.Int32)
    .alias("selected")
).select(["Id", "ranker_id", "selected"]).write_parquet("submission_weighted_ensemble_unboosted.parquet")

print("📦 Saved unboosted ensemble → submission_weighted_ensemble_unboosted.parquet")

# 🚀 Compute dynamic boost factors: (ranker_id, Id) → boost_factor
boosted_records = []
boost_counts = defaultdict(int)

# 🔄 Group data by ranker_id just once
for group in df_combined.partition_by("ranker_id"):
    ranker_id_val = group["ranker_id"][0]
    top_ids_counter = Counter()

    # Count, per Id, how many models rank it among their three highest scores.
    for col in confidence_cols:
        top3_ids = (
            group.select(["Id", col])
            .sort(by=col, descending=True)
            .head(3)
            .get_column("Id")
            .to_list()
        )
        top_ids_counter.update(top3_ids)

    for id_, freq in top_ids_counter.items():
        if freq >= 2:
            factor = boost_map.get(freq, 1.0)
            boosted_records.append((ranker_id_val, id_, factor))
            boost_counts[factor] += 1

# 📌 Create boost DataFrame
# orient="row" states that each tuple is one row (polars otherwise infers this
# and warns). Explicit dtypes taken from df_combined keep the join keys
# compatible even when no row was boosted and the list is empty.
df_boost = pl.DataFrame(
    boosted_records,
    schema={
        "ranker_id": df_combined.schema["ranker_id"],
        "Id": df_combined.schema["Id"],
        "boost_factor": pl.Float64,
    },
    orient="row",
)

# 🧪 Apply boost to ensemble_confidence (rows without a boost keep factor 1.0)
df_boosted = df_combined.join(df_boost, on=["ranker_id", "Id"], how="left").with_columns(
    (pl.col("ensemble_confidence") * pl.col("boost_factor").fill_null(1.0)).alias("ensemble_confidence")
)

# 🏁 Compute final ranking after boosting
df_boosted = df_boosted.with_columns(
    pl.col("ensemble_confidence")
    .rank("ordinal", descending=True)
    .over("ranker_id")
    .cast(pl.Int32)
    .alias("selected")
)

df_boosted.select(["Id", "ranker_id", "selected"]).write_parquet("submission_weighted_ensemble_boosted.parquet")
print("✅ Saved boosted ensemble → submission_weighted_ensemble_boosted.parquet")

# 📊 Boost summary
print("\n📊 Boosting summary (number of boosted Ids):")
for factor in sorted(boost_counts):
    print(f"×{factor:.2f} → {boost_counts[factor]} rows")

📦 Saved unboosted ensemble → submission_weighted_ensemble_unboosted.parquet


  df_boost = pl.DataFrame(boosted_records, schema=["ranker_id", "Id", "boost_factor"])


✅ Saved boosted ensemble → submission_weighted_ensemble_boosted.parquet

📊 Boosting summary (number of boosted Ids):
×1.30 → 25938 rows
×1.40 → 17318 rows
×1.45 → 14032 rows
×1.50 → 12435 rows
×1.55 → 11609 rows
×1.60 → 13422 rows
×1.65 → 19900 rows
×1.70 → 64719 rows


In [None]:
import polars as pl
import os
from collections import Counter, defaultdict

# 📥 Load the *_with_confidence.csv files (sorted so the column order is reproducible)
files = sorted(f for f in os.listdir() if f.endswith("_with_confidence.csv"))
if not files:
    raise FileNotFoundError("No *_with_confidence.csv files found in the working directory.")

dfs = []
for file in files:
    score_tag = os.path.splitext(file)[0]  # e.g. submission_20250721025740_0.51960_with_confidence
    # Keep only the join keys and the renamed confidence column. Carrying each
    # file's own 'selected' column through repeated joins is what raised
    # "DuplicateError: column with name 'selected_right' already exists".
    df = pl.read_csv(file).select(
        ["Id", "ranker_id", pl.col("confidence").alias(f"confidence_{score_tag}")]
    )
    dfs.append(df)

# 🔗 Join everything on Id and ranker_id
df_combined = dfs[0]
for df in dfs[1:]:
    df_combined = df_combined.join(df, on=["Id", "ranker_id"])

# 🧠 Confidence columns
confidence_cols = [col for col in df_combined.columns if col.startswith("confidence_")]

# 📊 Spearman correlation between models
corr_df = df_combined.select(confidence_cols).to_pandas().corr(method="spearman")

# 🎯 Derive weights from uniqueness (1 - mean correlation with the other models)
mean_corr = corr_df.apply(lambda row: (row.sum() - 1) / (len(row) - 1), axis=1)
model_uniqueness = 1 - mean_corr
if len(confidence_cols) < 2 or not model_uniqueness.sum() > 0:
    # A single model (0/0) or perfectly correlated models (zero total
    # uniqueness) would produce NaN/inf weights; use equal weights instead.
    weights = model_uniqueness.map(lambda _: 1.0 / len(confidence_cols))
else:
    weights = model_uniqueness / model_uniqueness.sum()

# 🧮 Weighted aggregation (weights follow the order of confidence_cols)
weighted_conf = sum(
    df_combined[col].fill_null(0) * weight
    for col, weight in zip(confidence_cols, weights.values)
)

df_combined = df_combined.with_columns(weighted_conf.alias("ensemble_confidence"))

# 🏁 Final ranking without boost
df_ranked = df_combined.with_columns(
    pl.col("ensemble_confidence")
    .rank("ordinal", descending=True)
    .over("ranker_id")
    .cast(pl.Int32)
    .alias("selected")
)

# 💾 Save the unboosted result
df_ranked.select(["Id", "ranker_id", "selected"]).write_parquet("submission_spearman_unboosted.parquet")
print("✅ Salvat fără boosting: submission_spearman_unboosted.parquet")

# 🚀 Boosting based on frequency in the top 3
# Frequencies without an entry here (above 11) fall back to a factor of 1.0.
boost_map = {
    2: 1.20, 3: 1.25, 4: 1.35, 5: 1.40,
    6: 1.45, 7: 1.50, 8: 1.55, 9: 1.60,
    10: 1.65, 11: 1.70
}
boosted_records = []
boost_counts = defaultdict(int)

for group in df_combined.partition_by("ranker_id"):
    ranker_id_val = group["ranker_id"][0]
    top_ids_counter = Counter()

    # Count, per Id, how many models rank it among their three highest scores.
    for col in confidence_cols:
        top3_ids = (
            group.select(["Id", col])
            .sort(col, descending=True)
            .head(3)
            .get_column("Id")
            .to_list()
        )
        top_ids_counter.update(top3_ids)

    for id_, freq in top_ids_counter.items():
        if freq >= 2:
            factor = boost_map.get(freq, 1.0)
            boosted_records.append((ranker_id_val, id_, factor))
            boost_counts[factor] += 1

# orient="row" states that each tuple is one row (polars otherwise infers this
# and warns). Explicit dtypes taken from df_combined keep the join keys
# compatible even when no row was boosted and the list is empty.
df_boost = pl.DataFrame(
    boosted_records,
    schema={
        "ranker_id": df_combined.schema["ranker_id"],
        "Id": df_combined.schema["Id"],
        "boost_factor": pl.Float64,
    },
    orient="row",
)

# 📈 Apply boosting (rows without a boost keep factor 1.0)
df_boosted = df_combined.join(df_boost, on=["ranker_id", "Id"], how="left").with_columns(
    (pl.col("ensemble_confidence") * pl.col("boost_factor").fill_null(1.0)).alias("ensemble_confidence")
)

df_boosted = df_boosted.with_columns(
    pl.col("ensemble_confidence")
    .rank("ordinal", descending=True)
    .over("ranker_id")
    .cast(pl.Int32)
    .alias("selected")
)

df_boosted.select(["Id", "ranker_id", "selected"]).write_parquet("submission_spearman_boosted.parquet")
print("✅ Salvat cu boosting: submission_spearman_boosted.parquet")

# 📊 Boosting summary
print("\n📊 Boosting summary:")
for factor in sorted(boost_counts):
    print(f"×{factor:.2f} → {boost_counts[factor]} rows")


DuplicateError: column with name 'selected_right' already exists

You may want to try:
- renaming the column prior to joining
- using the `suffix` parameter to specify a suffix different to the default one ('_right')

Resolved plan until failure:

	---> FAILED HERE RESOLVING 'sink' <---
DF ["Id", "ranker_id", "selected", "confidence_submission_20250727084025_0.52795_with_confidence"]; PROJECT */4 COLUMNS

**PCA**

In [None]:
import polars as pl
import numpy as np
from sklearn.decomposition import PCA


def load_submission_with_confidence(tag):
    """Read ``./ensemble/submission_{tag}_with_confidence.parquet`` without its ``selected`` column."""
    path = f"./ensemble/submission_{tag}_with_confidence.parquet"
    frame = pl.read_parquet(path)
    return frame.drop("selected")


# Timetag and corresponding public scores (used for column names and optional interpretation).
# Only the second element of each value is used below (as the column-name suffix).
timetag_score = {
    "20250721025740": [0, 0.51960],
    "20250722050939": [1, 0.52070],
    "20250721083807": [1, 0.52244],
    "20250724032338": [1, "0.51345/lgb"],
    "20250725083055": [1, 0.52391],
    "20250727084025": [1, 0.52795],
    "20250728094305": [1, 0.52492],
    "20250729084249": [1, 0.51822],
    "0.49242": [1, 0.49242],
}

# Load and rename confidence columns from each model
dfs = []
for timetag, (_, score) in timetag_score.items():
    df = load_submission_with_confidence(timetag)
    df = df.with_columns(pl.col("confidence").alias(f"confidence_{score}")).drop(
        "confidence"
    )
    dfs.append(df)

# Join all submissions on Id and ranker_id
df_combined = dfs[0]
for df in dfs[1:]:
    df_combined = df_combined.join(df, on=["Id", "ranker_id"])

# Extract confidence columns and project them onto the first principal component
confidence_cols = [col for col in df_combined.columns if col.startswith("confidence")]
X = df_combined.select(confidence_cols).to_numpy()
pca = PCA(n_components=1)
pc1_scores = pca.fit_transform(X).flatten()

# The sign of a principal component is arbitrary. Orient PC1 so that higher
# model confidences map to higher ensemble scores; otherwise the descending
# rank below could silently invert the whole ensemble.
pc1_loadings = pca.components_[0]
if pc1_loadings.sum() < 0:
    pc1_scores = -pc1_scores
    pc1_loadings = -pc1_loadings

# Add ensemble confidence based on PCA projection
df_combined = df_combined.with_columns(pl.Series("ensemble_confidence", pc1_scores))

# Rank each item within its group (ranker_id) based on ensemble confidence
df_ranked = df_combined.with_columns(
    pl.col("ensemble_confidence")
    .rank(method="ordinal", descending=True)
    .over("ranker_id")
    .cast(pl.Int32)
    .alias("selected")
)
# Restore the row order of the first loaded submission and format final submission
df_original = dfs[0].select(["Id", "ranker_id"])
final_submission = df_original.join(
    df_ranked.select(["Id", "ranker_id", "selected"]),
    on=["Id", "ranker_id"],
    how="left",
)

# Display the top rows of the final submission
print(final_submission.head())

# Print each model's contribution to the (oriented) PCA component
print("\nPCA component weights per model:")
for col_name, weight in zip(confidence_cols, pc1_loadings):
    print(f"{col_name}: {weight:.4f}")

In [None]:
final_submission.write_parquet("submission_ensemble_pca.parquet")

In [None]:
import polars as pl
import numpy as np
from sklearn.decomposition import PCA
import glob

def load_confidence_csv(path):
    """Read a confidence CSV and keep only the Id, ranker_id and confidence columns."""
    wanted = ["Id", "ranker_id", "confidence"]
    return pl.read_csv(path).select(wanted)

def main():
    """Build a PCA-based ensemble from every ``*_with_confidence.csv`` in the working directory.

    Each file contributes one confidence column. The ensemble score is the
    projection onto the first principal component of those columns, and the
    final ``selected`` rank is computed per ``ranker_id``. The result is
    written to ``submission_ensemble_pca.parquet``.
    """
    # Local import: at module level `os` is only imported inside the __main__
    # guard, so calling main() from elsewhere would otherwise raise NameError.
    import os

    # Find all *_with_confidence.csv files
    confidence_files = sorted(glob.glob("*_with_confidence.csv"))
    print(f"📁 Found {len(confidence_files)} confidence files.")

    if not confidence_files:
        print("❌ No confidence files found.")
        return

    # Load the files and rename each confidence column after its file tag
    dfs = []
    for f in confidence_files:
        tag = os.path.splitext(os.path.basename(f))[0].replace("_with_confidence", "")
        df = load_confidence_csv(f)
        df = df.rename({"confidence": f"confidence_{tag}"})
        dfs.append(df)

    # Join everything on Id and ranker_id
    df_combined = dfs[0]
    for df in dfs[1:]:
        df_combined = df_combined.join(df, on=["Id", "ranker_id"])

    # Apply PCA to the confidence scores
    confidence_cols = [col for col in df_combined.columns if col.startswith("confidence")]
    X = df_combined.select(confidence_cols).to_numpy()
    pca = PCA(n_components=1)
    pc1_scores = pca.fit_transform(X).flatten()

    # The sign of a principal component is arbitrary. Orient PC1 so that higher
    # model confidences map to higher ensemble scores; otherwise the descending
    # rank below could silently invert the whole ensemble.
    pc1_loadings = pca.components_[0]
    if pc1_loadings.sum() < 0:
        pc1_scores = -pc1_scores
        pc1_loadings = -pc1_loadings

    # Add the ensemble confidence and rank within each ranker_id
    df_combined = df_combined.with_columns(pl.Series("ensemble_confidence", pc1_scores))
    df_ranked = df_combined.with_columns(
        pl.col("ensemble_confidence")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    # Final format
    final_submission = df_ranked.select(["Id", "ranker_id", "selected"])
    final_submission.write_parquet("submission_ensemble_pca.parquet")
    print("✅ Saved PCA ensemble to submission_ensemble_pca.parquet")

    # Show each model's contribution to the (oriented) component
    print("\n📊 PCA weights per model:")
    for name, weight in zip(confidence_cols, pc1_loadings):
        print(f"{name}: {weight:.4f}")

if __name__ == "__main__":
    import os
    main()

📁 Found 8 confidence files.
✅ Saved PCA ensemble to submission_ensemble_pca.csv

📊 PCA weights per model:
confidence_submission_20250724032338_0.51345: 0.3525
confidence_submission_20250725040223_0.52309: 0.3580
confidence_submission_20250725083055_0.52391: 0.3580
confidence_submission_20250727084025_0.52795: 0.3586
confidence_submission_20250728094305_0.52492: 0.3589
confidence_submission_20250729084249_0.51822: 0.3581
confidence_submission_dl_ranker_0.48755: 0.3404
confidence_submission_dl_ranker_0.49242: 0.3433


In [None]:
import polars as pl
import numpy as np
from sklearn.decomposition import PCA
import glob
import os

def load_confidence_csv(path):
    """Read a confidence CSV and keep only the Id, ranker_id and confidence columns."""
    wanted = ["Id", "ranker_id", "confidence"]
    return pl.read_csv(path).select(wanted)

def main():
    """Build a Spearman-weighted PCA ensemble from ``*_with_confidence.csv`` files.

    Models are first weighted by their uniqueness (1 - mean Spearman
    correlation with the other models). The weighted confidence matrix is then
    projected onto its first principal component, and the projection is ranked
    per ``ranker_id``. The result is written to
    ``submission_ensemble_spearman_pca.parquet``.
    """
    # Find all *_with_confidence.csv files
    confidence_files = sorted(glob.glob("*_with_confidence.csv"))
    print(f"📁 Found {len(confidence_files)} confidence files.")

    if not confidence_files:
        print("❌ No confidence files found.")
        return

    # Load the files and rename each confidence column after its file tag
    dfs = []
    for f in confidence_files:
        tag = os.path.splitext(os.path.basename(f))[0].replace("_with_confidence", "")
        df = load_confidence_csv(f)
        df = df.rename({"confidence": f"confidence_{tag}"})
        dfs.append(df)

    # Join everything on Id and ranker_id
    df_combined = dfs[0]
    for df in dfs[1:]:
        df_combined = df_combined.join(df, on=["Id", "ranker_id"])

    # Extract the confidence matrix
    confidence_cols = [col for col in df_combined.columns if col.startswith("confidence")]
    X = df_combined.select(confidence_cols).to_numpy()

    # --- STEP 1: Spearman correlation -> uniqueness weights ---
    corr_df = df_combined.select(confidence_cols).to_pandas().corr(method="spearman")
    mean_corr = corr_df.apply(lambda row: (row.sum() - 1) / (len(row) - 1), axis=1)
    model_uniqueness = 1 - mean_corr
    weights_spearman = (model_uniqueness / model_uniqueness.sum()).to_numpy()
    if not np.all(np.isfinite(weights_spearman)):
        # A single model (0/0) or zero total uniqueness gives NaN/inf weights,
        # which PCA cannot handle; fall back to equal weights.
        weights_spearman = np.full(len(confidence_cols), 1.0 / len(confidence_cols))

    print("\n🔍 Spearman-based weights:")
    for col, w in zip(confidence_cols, weights_spearman):
        print(f"{col}: {w:.4f}")

    # --- STEP 2: PCA on the weighted confidence scores ---
    X_weighted = X * weights_spearman  # broadcast across columns
    pca = PCA(n_components=1)
    pc1_scores = pca.fit_transform(X_weighted).flatten()

    # The sign of a principal component is arbitrary. Orient PC1 so that higher
    # model confidences map to higher ensemble scores; otherwise the descending
    # rank below could silently invert the whole ensemble.
    pc1_loadings = pca.components_[0]
    if pc1_loadings.sum() < 0:
        pc1_scores = -pc1_scores
        pc1_loadings = -pc1_loadings

    # Add the ensemble confidence and rank within each ranker_id
    df_combined = df_combined.with_columns(pl.Series("ensemble_confidence", pc1_scores))
    df_ranked = df_combined.with_columns(
        pl.col("ensemble_confidence")
        .rank(method="ordinal", descending=True)
        .over("ranker_id")
        .cast(pl.Int32)
        .alias("selected")
    )

    # Final format
    final_submission = df_ranked.select(["Id", "ranker_id", "selected"])
    final_submission.write_parquet("submission_ensemble_spearman_pca.parquet")
    print("✅ Saved Spearman+PCA ensemble to submission_ensemble_spearman_pca.parquet")

    # Show each model's contribution to the (oriented) PC1
    print("\n📊 PCA component weights after Spearman weighting:")
    for name, weight in zip(confidence_cols, pc1_loadings):
        print(f"{name}: {weight:.4f}")

if __name__ == "__main__":
    main()

📁 Found 8 confidence files.

🔍 Spearman-based weights:
confidence_submission_20250724032338_0.51345: 0.1303
confidence_submission_20250725040223_0.52309: 0.0999
confidence_submission_20250725083055_0.52391: 0.1001
confidence_submission_20250727084025_0.52795: 0.0968
confidence_submission_20250728094305_0.52492: 0.0947
confidence_submission_20250729084249_0.51822: 0.0995
confidence_submission_dl_ranker_0.48755: 0.1974
confidence_submission_dl_ranker_0.49242: 0.1813
✅ Saved Spearman+PCA ensemble to submission_ensemble_spearman_pca.parquet

📊 PCA component weights after Spearman weighting:
confidence_submission_20250724032338_0.51345: 0.3517
confidence_submission_20250725040223_0.52309: 0.2712
confidence_submission_20250725083055_0.52391: 0.2718
confidence_submission_20250727084025_0.52795: 0.2629
confidence_submission_20250728094305_0.52492: 0.2577
confidence_submission_20250729084249_0.51822: 0.2701
confidence_submission_dl_ranker_0.48755: 0.5317
confidence_submission_dl_ranker_0.49242:

# **Advanced Ensemble Methods**

In [None]:
import polars as pl
import os

def compute_confidence(df: pl.DataFrame, alpha: float = 0.7, k: int = 5) -> pl.DataFrame:
    """Attach a ``confidence`` column derived from the integer rank in ``selected``.

    The score is a convex combination, weighted by ``alpha``, of:
      * a linear position score within the ``ranker_id`` group (1.0 for
        ``selected == 1``), and
      * a reciprocal-rank-fusion term ``1 / (k + selected)`` divided by its
        value at rank 1.

    The ``selected`` column is kept; the temporary helper columns are dropped.
    """
    rank = pl.col("selected")
    rrf_top = 1.0 / (k + 1)  # RRF value for selected=1, used as the normaliser

    # Group size per ranker_id
    sized = df.with_columns([
        pl.len().over("ranker_id").alias("group_size")
    ])

    # Position-based score and normalised RRF score; both depend only on
    # columns that already exist at this point.
    scored = sized.with_columns([
        (1.0 - ((rank - 1) / (pl.col("group_size") - 1 + 1e-8))).alias("confidence"),
        ((1.0 / (k + rank)) / rrf_top).alias("rrf_score"),
    ])

    # Convex combination of the two scores, stored back into 'confidence'
    blended = scored.with_columns(
        (alpha * pl.col("confidence") + (1 - alpha) * pl.col("rrf_score")).alias("confidence")
    )

    return blended.drop(["group_size", "rrf_score"])

def process_file(input_path: str, alpha: float = 0.7, k: int = 5):
    """Read one submission (.csv or .parquet), add confidence scores, and save it as CSV.

    The output sits next to the input and ends in ``_with_confidence.csv``.
    Files with any other extension are reported and skipped.
    """
    print(f"🔄 Processing: {input_path}")

    # Choose the reader from the file suffix; anything else is unsupported.
    readers = ((".csv", pl.read_csv), (".parquet", pl.read_parquet))
    reader = next((fn for suffix, fn in readers if input_path.endswith(suffix)), None)
    if reader is None:
        print(f"⚠️ Skipping unsupported file: {input_path}")
        return

    df_conf = compute_confidence(reader(input_path), alpha=alpha, k=k)

    # Avoid doubling the suffix when the input name already carries it.
    base = os.path.splitext(input_path)[0]
    output_path = (
        f"{base}.csv" if base.endswith("_with_confidence") else f"{base}_with_confidence.csv"
    )

    df_conf.write_csv(output_path)
    print(f"✅ Saved: {output_path}\n")


def main():
    """Generate ``*_with_confidence.csv`` files for the fixed list of submissions."""
    alpha = 0.7
    k = 5
    files = [
        "submission_20250721083807_0.52244.parquet",
        "submission_20250724032338_0.51345.parquet",
        "submission_20250725083055_0.52391.parquet",
        "submission_20250727084025_0.52795.parquet",
        "submission_20250802074816_0.52603.parquet",
        "submission_20250804001151_0.51244.parquet",
        "submission_20250807032439_0.52538.parquet",
        "submission_dl_ranker_0.48755.parquet"
    ]

    for file in files:
        # Report missing inputs and move on to the next file.
        if not os.path.exists(file):
            print(f"❌ File not found: {file}")
            continue
        process_file(file, alpha=alpha, k=k)


if __name__ == "__main__":
    main()


In [None]:
%%writefile ensemble.py
# %%
# Rank-based ensemble (no "confidence" scores):
# 1) rank -> percentile within each ranker_id group (1 for rank=1)
# 2) van der Waerden normal-scores: z = Phi^{-1}(percentile)
# 3) SML (leading eigenvector on correlations), zero diagonal
# 4) Shrink weights within clusters of highly correlated models
# 5) Aggregate and re-rank per ranker_id

import polars as pl
import numpy as np
import re
from numpy.linalg import eigh

EPS = 1e-12

FILES = [
    "submission_20250721083807_0.52244.parquet",
    "submission_20250724032338_0.51345.parquet",
    "submission_20250725083055_0.52391.parquet",
    "submission_20250727084025_0.52795.parquet",
    "submission_20250802074816_0.52603.parquet",
    "submission_20250804001151_0.51244.parquet",
    "submission_20250807032439_0.52538.parquet",
    "submission_dl_ranker_0.48755.parquet",
]

# ---------- helpers ----------
def load_rank_file(fp: str) -> pl.DataFrame:
    """Read a parquet rank file, keeping only Id, ranker_id and selected."""
    required = ["Id", "ranker_id", "selected"]
    frame = pl.read_parquet(fp)
    return frame.select(required)

def pct_from_rank(df: pl.DataFrame, col: str = "selected") -> pl.DataFrame:
    """
    Add a ``percentile`` column computed from the ranks in ``col``.

    Within each ranker_id group, rank 1 maps to 1 and the worst rank maps to
    (almost exactly) 0. The small 1e-9 term keeps single-row groups from
    dividing by zero.
    """
    with_sizes = df.with_columns(pl.len().over("ranker_id").alias("n_in_group"))
    zero_based_rank = pl.col(col) - 1
    span = pl.col("n_in_group") - 1 + 1e-9
    scored = with_sizes.with_columns((1.0 - zero_based_rank / span).alias("percentile"))
    return scored.drop("n_in_group")

# ---- robust inverse normal CDF (Acklam approximation) ----
def _phi_inv(p: np.ndarray) -> np.ndarray:
    """
    Numerically stable approximation of the inverse standard normal CDF Φ⁻¹(p).
    (Peter John Acklam's rational approximation)
    """
    p = np.asarray(p, dtype=float)
    a = [-3.969683028665376e+01, 2.209460984245205e+02, -2.759285104469687e+02,
         1.383577518672690e+02, -3.066479806614716e+01, 2.506628277459239e+00]
    b = [-5.447609879822406e+01, 1.615858368580409e+02, -1.556989798598866e+02,
         6.680131188771972e+01, -1.328068155288572e+01]
    c = [-7.784894002430293e-03, -3.223964580411365e-01, -2.400758277161838e+00,
         -2.549732539343734e+00, 4.374664141464968e+00, 2.938163982698783e+00]
    d = [7.784695709041462e-03, 3.224671290700398e-01, 2.445134137142996e+00, 3.754408661907416e+00]
    plow, phigh = 0.02425, 1 - 0.02425
    x = np.empty_like(p, dtype=float)

    mask = p < plow
    if np.any(mask):
        q = np.sqrt(-2 * np.log(p[mask]))
        x[mask] = (((((c[0]*q + c[1])*q + c[2])*q + c[3])*q + c[4])*q + c[5]) / \
                   ((((d[0]*q + d[1])*q + d[2])*q + d[3])*q + 1)

    mask = (p >= plow) & (p <= phigh)
    if np.any(mask):
        q = p[mask] - 0.5
        r = q * q
        x[mask] = (((((a[0]*r + a[1])*r + a[2])*r + a[3])*r + a[4])*r + a[5]) * q / \
                   (((((b[0]*r + b[1])*r + b[2])*r + b[3])*r + b[4])*r + 1)

    mask = p > phigh
    if np.any(mask):
        q = np.sqrt(-2 * np.log(1 - p[mask]))
        x[mask] = -(((((c[0]*q + c[1])*q + c[2])*q + c[3])*q + c[4])*q + c[5]) / \
                    ((((d[0]*q + d[1])*q + d[2])*q + d[3])*q + 1)
    return x

def normal_score(u: np.ndarray) -> np.ndarray:
    """Map percentiles to van der Waerden normal scores, Φ⁻¹(u), after clipping away from 0 and 1."""
    lower, upper = 1e-6, 1 - 1e-6
    bounded = np.clip(u, lower, upper)
    return _phi_inv(bounded)

def zscore_cols(M: np.ndarray) -> np.ndarray:
    """Standardise each column to zero mean and unit sample standard deviation.

    Columns whose standard deviation is below 1e-8 are only centred (their
    scale is treated as 1) so constant columns do not divide by ~0.
    """
    centered = M - M.mean(axis=0, keepdims=True)
    scale = M.std(axis=0, ddof=1, keepdims=True)
    scale[scale < 1e-8] = 1.0
    return centered / scale

def leading_eigvec_sym(C: np.ndarray) -> np.ndarray:
    """Return the eigenvector of the largest eigenvalue of symmetric ``C``.

    ``eigh`` returns eigenvalues in ascending order, so the last column is the
    leading one. The sign is chosen so the vector's mean is non-negative.
    """
    _eigenvalues, eigenvectors = eigh(C)
    top = eigenvectors[:, -1]
    if top.mean() >= 0:
        return top
    return -top

def cluster_by_corr(R: np.ndarray, thr: float = 0.92):
    """Greedily group models whose absolute correlation with an anchor is >= ``thr``.

    Models are visited in index order. Each unassigned model becomes the anchor
    of a new cluster and absorbs every later unassigned model that correlates
    with it (not with other cluster members) at or above the threshold.

    Returns a list of index lists, one per cluster.
    """
    size = R.shape[0]
    assigned = [False] * size
    groups = []
    for anchor in range(size):
        if assigned[anchor]:
            continue
        assigned[anchor] = True
        partners = [
            other
            for other in range(anchor + 1, size)
            if not assigned[other] and abs(R[anchor, other]) >= thr
        ]
        for other in partners:
            assigned[other] = True
        groups.append([anchor, *partners])
    return groups

def shrink_within_clusters(w: np.ndarray, clusters, alpha=0.5):
    """Pull each weight toward its cluster's mean weight, then renormalise to sum to 1.

    ``alpha`` is the share given to the cluster mean (0 leaves weights as they
    are, 1 replaces them by the mean). Singleton clusters are untouched. If the
    blended weights do not have a positive sum, uniform weights are returned.
    """
    blended = w.copy()
    for members in clusters:
        if len(members) > 1:
            cluster_mean = w[members].mean()
            blended[members] = alpha * cluster_mean + (1 - alpha) * w[members]
    total = blended.sum()
    if total > 0:
        return blended / total
    return np.ones_like(blended) / len(blended)

# ---------- main ----------
def main():
    """Build the rank-only ensemble and write it to parquet.

    Every readable file in FILES is converted to per-group percentiles and
    joined on (Id, ranker_id). For each ranker_id group, model weights are
    taken from the leading eigenvector of the zero-diagonal correlation matrix
    of the normal scores and then shrunk within clusters of highly correlated
    models; uniform weights are used whenever that computation is not possible.
    The weighted normal scores are re-ranked within the group and the result
    is saved as ``submission_ensemble_from_ranks.parquet``.
    """
    # 1) Load files and create percentile columns per model (drop 'selected' to avoid join collisions)
    dfs, colnames = [], []
    for fp in FILES:
        try:
            df = load_rank_file(fp)
        except Exception as e:
            # Best effort: a file that cannot be read is reported and left out.
            print(f"⚠️ Skipping (cannot read): {fp} ({e})")
            continue

        # Column tag: the trailing digits/dots before ".parquet" (e.g. "0.52244"),
        # otherwise the file name stripped of non-word characters.
        m = re.search(r"_([\d.]+)\.parquet$", fp)
        tag = m.group(1) if m else re.sub(r"\W+", "", fp.split("/")[-1])

        df_pct = (
            pct_from_rank(df, col="selected")
            .select(["Id", "ranker_id", pl.col("percentile").alias(f"pct_{tag}")])
        )
        dfs.append(df_pct)
        colnames.append(f"pct_{tag}")

    if not dfs:
        print("❌ No valid rank files found.")
        return

    # 2) Join on Id + ranker_id
    base = dfs[0]
    for d in dfs[1:]:
        base = base.join(d, on=["Id", "ranker_id"])

    # 3) Process each ranker_id separately
    out = []
    for g in base.partition_by("ranker_id", as_dict=False, maintain_order=True):
        P = np.column_stack([g[c].to_numpy() for c in colnames])  # (n, m) percentiles
        n, m = P.shape

        # 3a) normal-scores + standardization
        Z = np.column_stack([normal_score(P[:, j]) for j in range(m)])
        # Uniform weights when there is a single model, fewer than 3 rows,
        # or non-finite normal scores.
        if (m == 1) or (n < 3) or (not np.all(np.isfinite(Z))):
            w = np.ones(m) / m
        else:
            L = zscore_cols(Z)
            if not np.all(np.isfinite(L)):
                w = np.ones(m) / m
            else:
                # Correlation on standardized variables; zero diagonal for classic SML
                C = (L.T @ L) / max(1, n - 1)
                np.fill_diagonal(C, 0.0)
                if not np.all(np.isfinite(C)):
                    w = np.ones(m) / m
                else:
                    # Clamp negative eigenvector entries to zero; EPS keeps the sum non-zero.
                    v = np.maximum(leading_eigvec_sym(C), 0) + EPS
                    w = v / v.sum()
                    # Shrink in clusters of highly correlated models
                    R = np.corrcoef(L.T)
                    R[~np.isfinite(R)] = 0.0
                    clusters = cluster_by_corr(R, thr=0.92)
                    w = shrink_within_clusters(w, clusters, alpha=0.5)

        # 4) Composite score and re-ranking within the group
        # (no .over() needed: g already holds a single ranker_id)
        comp = Z.dot(w)
        gout = (
            g.with_columns(pl.Series("ensemble_score", comp))
             .with_columns(
                 pl.col("ensemble_score")
                   .rank("ordinal", descending=True)
                   .cast(pl.Int32)
                   .alias("selected")
             )
             .select(["Id", "ranker_id", "selected"])
        )
        out.append(gout)

    # Stack the per-group results back into one frame.
    final = pl.concat(out, how="vertical")
    print("\n✅ Sample:")
    print(final.head())

    final.write_parquet("submission_ensemble_from_ranks.parquet")
    print("\n💾 Saved as: submission_ensemble_from_ranks.parquet")

if __name__ == "__main__":
    main()

Overwriting ensemble.py


In [None]:
%%writefile ensemble_em.py
# %%
# Label-free EM with Beta distributions on "rank-only" data:
# - For each model: transform 'selected' -> percentile within each ranker_id group
# - EM: P(u_k | y=c) ~ Beta(alpha_{k,c}, beta_{k,c}), u_k ∈ (0,1]
# - Compute posterior P(y=1 | u_1..u_m) and rank within each ranker_id
#
# Input: .parquet files with columns [Id, ranker_id, selected]
# Output: submission_ensemble_em_beta_from_ranks.parquet (Id, ranker_id, selected)

import polars as pl
import numpy as np
import re
from math import lgamma
import sys

EPS = 1e-12

FILES = [
    "submission_20250721083807_0.52244.parquet",
    "submission_20250724032338_0.51345.parquet",
    "submission_20250725083055_0.52391.parquet",
    "submission_20250727084025_0.52795.parquet",
    "submission_20250802074816_0.52603.parquet",
    "submission_20250804001151_0.51244.parquet",
    "submission_20250807032439_0.52538.parquet",
    "submission_dl_ranker_0.48755.parquet",
]

# -------------------- Utilities --------------------
def load_rank_file(fp: str) -> pl.DataFrame:
    """Read a parquet rank file, keeping only Id, ranker_id and selected."""
    required = ["Id", "ranker_id", "selected"]
    frame = pl.read_parquet(fp)
    return frame.select(required)

def to_percentile_per_ranker(df: pl.DataFrame, rank_col: str = "selected") -> pl.DataFrame:
    """
    Add a ``percentile`` column computed from the integer ranks in ``rank_col``.

    Within each ranker_id group, rank 1 (best) maps to 1 and the worst rank
    maps to (almost exactly) 0. The small 1e-9 term keeps single-row groups
    from dividing by zero.
    """
    with_sizes = df.with_columns(pl.len().over("ranker_id").alias("n_in_group"))
    zero_based_rank = pl.col(rank_col) - 1
    span = pl.col("n_in_group") - 1 + 1e-9
    scored = with_sizes.with_columns((1.0 - zero_based_rank / span).alias("percentile"))
    return scored.drop("n_in_group")

def clip01(x: np.ndarray) -> np.ndarray:
    """Force values into [1e-6, 1 - 1e-6] so later log/Beta computations stay finite.

    NaN becomes 0.5, +inf becomes 1 and -inf becomes 0 before clipping.
    """
    lower, upper = 1e-6, 1.0 - 1e-6
    finite = np.nan_to_num(x, nan=0.5, posinf=1.0, neginf=0.0)
    return np.clip(finite, lower, upper)

# -------------------- Beta EM --------------------
def log_beta_pdf(x: np.ndarray, a: float, b: float) -> np.ndarray:
    """Return the element-wise log-density of Beta(a, b) at ``x``.

    EPS is added inside both logarithms so x = 0 or x = 1 does not give -inf.
    """
    log_norm = lgamma(a) + lgamma(b) - lgamma(a + b)  # log B(a, b)
    log_kernel = (a - 1.0) * np.log(x + EPS) + (b - 1.0) * np.log(1.0 - x + EPS)
    return log_kernel - log_norm

def mom_alpha_beta(mean: float, var: float, floor: float = 1e-3) -> tuple[float, float]:
    """
    Method-of-moments estimate of Beta shape parameters (alpha, beta).

    The mean is clipped to [1e-4, 1 - 1e-4] and the variance is floored at
    1e-6. When the variance is too large for any Beta distribution with that
    mean (t <= 0), a near-uniform Beta(1 + floor, 1 + floor) is returned.
    Otherwise both parameters are floored at ``floor``.
    """
    mean_c = float(np.clip(mean, 1e-4, 1 - 1e-4))
    var_c = float(max(var, 1e-6))
    concentration = mean_c * (1 - mean_c) / var_c - 1.0  # alpha + beta
    if concentration <= 0:
        near_uniform = 1.0 + floor
        return near_uniform, near_uniform
    shape_a = max(mean_c * concentration, floor)
    shape_b = max((1 - mean_c) * concentration, floor)
    return float(shape_a), float(shape_b)

class EMBeta:
    """
    Two-class Beta mixture fitted with EM.

    Model: P(u_k | y=c) ~ Beta(alpha_{k,c}, beta_{k,c}), c ∈ {0,1}, with the
    models treated as independent given the class, and prior P(y=1) = pi.
    Parameters are estimated with weighted method-of-moments updates.

    Attributes (set in ``__init__``):
        m: number of models (columns of the score matrix).
        pi: current prior P(y=1).
        alpha, beta: arrays of shape (m, 2); column 0 holds the class-0
            parameters, column 1 the class-1 parameters.
        rng: NumPy random generator used by the "rand" initialisation.
        init_mode: "rand" for a random start; any other value uses the row mean.
    """

    def __init__(self, n_models: int, init: str = "mean", seed: int = 0):
        self.m = n_models
        self.pi = 0.5
        self.alpha = np.ones((self.m, 2), dtype=float)
        self.beta = np.ones((self.m, 2), dtype=float)
        self.rng = np.random.default_rng(seed)
        self.init_mode = init

    @staticmethod
    def _weighted_beta_params(x: np.ndarray, w: np.ndarray) -> tuple[float, float]:
        """Fit Beta (alpha, beta) to ``x`` from its ``w``-weighted mean and variance."""
        total = w.sum() + EPS  # EPS guards against an all-zero weight vector
        mu = (w * x).sum() / total
        var = (w * (x - mu) ** 2).sum() / total
        return mom_alpha_beta(mu, var)

    def _update_params(self, S: np.ndarray, gamma: np.ndarray, n_cols: int) -> None:
        """Set pi and the per-model Beta parameters of both classes from ``gamma``."""
        self.pi = float(np.mean(gamma))
        w1 = gamma
        w0 = 1.0 - gamma
        for k in range(n_cols):
            x = S[:, k]
            self.alpha[k, 1], self.beta[k, 1] = self._weighted_beta_params(x, w1)
            self.alpha[k, 0], self.beta[k, 0] = self._weighted_beta_params(x, w0)

    def initialize(self, S: np.ndarray) -> np.ndarray:
        """Initialize responsibilities gamma_i (mean or random start) and the parameters.

        Returns the initial responsibilities, one per row of ``S``.
        """
        n, m = S.shape
        if self.init_mode == "rand":
            gamma = np.clip(self.rng.uniform(0.25, 0.75, size=n), 1e-3, 1 - 1e-3)
        else:  # "mean"
            gamma = np.clip(S.mean(axis=1), 1e-3, 1 - 1e-3)

        self._update_params(S, gamma, m)
        return gamma

    def e_step(self, S: np.ndarray) -> np.ndarray:
        """E-step: compute responsibilities γ_i = P(y=1 | row i), clipped to [1e-6, 1-1e-6]."""
        n, m = S.shape
        ll0 = np.zeros(n, dtype=float)
        ll1 = np.zeros(n, dtype=float)
        # Sum per-model log-likelihoods under each class.
        for k in range(m):
            x = S[:, k]
            ll0 += log_beta_pdf(x, self.alpha[k, 0], self.beta[k, 0])
            ll1 += log_beta_pdf(x, self.alpha[k, 1], self.beta[k, 1])
        ll0 += np.log(1 - self.pi + EPS)
        ll1 += np.log(self.pi + EPS)
        # Subtract the row-wise max before exponentiating to avoid underflow.
        mx = np.maximum(ll0, ll1)
        p1 = np.exp(ll1 - mx)
        p0 = np.exp(ll0 - mx)
        gamma = p1 / (p1 + p0 + EPS)
        return np.clip(gamma, 1e-6, 1 - 1e-6)

    def m_step(self, S: np.ndarray, gamma: np.ndarray):
        """M-step: update pi, alpha, beta parameters from the responsibilities."""
        self._update_params(S, gamma, self.m)

    def fit(self, S: np.ndarray, max_iters: int = 60, tol: float = 1e-5, verbose: bool = True):
        """Run EM until convergence or max_iters reached.

        Convergence is declared when the mean absolute change of the
        responsibilities between iterations drops below ``tol``.

        Returns the responsibilities from the last E-step.
        """
        S = clip01(S)
        gamma = self.initialize(S)
        prev = gamma.copy()
        for it in range(1, max_iters + 1):
            gamma = self.e_step(S)
            self.m_step(S, gamma)
            delta = float(np.mean(np.abs(gamma - prev)))
            if verbose:
                print(f"EM iter {it:02d} | pi={self.pi:.4f} | Δγ={delta:.6f}")
            if delta < tol:
                break
            prev = gamma
        return gamma

# -------------------- Pipeline --------------------
def main():
    """Blend the rank files in ``FILES`` into one submission via Beta-mixture EM.

    Pipeline:
      1. Load every file in ``FILES`` and turn its ranks into a per-ranker
         percentile column named ``u_<tag>``; unreadable files are skipped.
      2. Join all percentile tables on ``Id`` + ``ranker_id``.
      3. Stack the percentile columns into a matrix with one column per model.
      4. Fit ``EMBeta`` on the whole matrix (or, with a single model, use its
         percentile directly) and rank the rows *within each ranker_id* by
         the resulting posterior, highest first.

    Optional CLI arguments: ``--iters=<int>``, ``--tol=<float>``,
    ``--init=mean|rand``.

    Writes ``submission_ensemble_em_beta_from_ranks.parquet`` with the columns
    ``Id``, ``ranker_id`` and ``selected``. Returns early, writing nothing,
    when no rank file could be read.
    """
    # Optional CLI arguments: --iters=, --tol=, --init=mean|rand
    max_iters = 60
    tol = 1e-5
    init = "mean"
    for arg in sys.argv[1:]:
        # Split only on the first "=" so the value itself may contain one.
        if arg.startswith("--iters="):
            max_iters = int(arg.split("=", 1)[1])
        elif arg.startswith("--tol="):
            tol = float(arg.split("=", 1)[1])
        elif arg.startswith("--init="):
            init = arg.split("=", 1)[1]

    # 1) Build percentile table per model from rank files
    dfs, colnames = [], []
    for fp in FILES:
        try:
            df = load_rank_file(fp)
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the blend.
            print(f"⚠️ Skipping (cannot read): {fp} ({e})")
            continue

        # Extract a label for the column from the file name: the trailing
        # "_<number>.parquet" part if present, else the sanitised base name.
        m = re.search(r"_([\d.]+)\.parquet$", fp)
        tag = m.group(1) if m else re.sub(r"\W+", "", fp.split("/")[-1])

        # Compute percentile, then KEEP ONLY Id, ranker_id, and the renamed percentile column
        df_pct = (
            to_percentile_per_ranker(df)
            .select(["Id", "ranker_id", pl.col("percentile").alias(f"u_{tag}")])
        )
        dfs.append(df_pct)
        colnames.append(f"u_{tag}")

    if not dfs:
        print("❌ No valid rank files found.")
        return

    # 2) Join on Id + ranker_id (no 'selected' columns to collide)
    base = dfs[0]
    for d in dfs[1:]:
        base = base.join(d, on=["Id", "ranker_id"])

    # 3) Prepare matrix S (n, m) with percentiles in (0,1]
    S = np.column_stack([clip01(base[c].to_numpy()) for c in colnames])
    n, m = S.shape
    print(f"Detected {m} rankers over {n} items.")

    # 4) Run Beta EM globally, then re-rank per ranker_id using posterior
    if m == 1:
        post = clip01(S[:, 0])
        print("Only one ranker → using its percentile as posterior.")
    else:
        em = EMBeta(n_models=m, init=init, seed=0)
        post = em.fit(S, max_iters=max_iters, tol=tol, verbose=True)

    out = (
        base.with_columns(pl.Series(name="posterior", values=post))
            .with_columns(
                pl.col("posterior")
                  .rank("ordinal", descending=True)
                  # Rank inside each ranker_id group; without the window the
                  # ranks would run 1..n over the whole frame.
                  .over("ranker_id")
                  .cast(pl.Int32)
                  .alias("selected")
            )
            .select(["Id", "ranker_id", "selected"])
    )

    print("\n✅ Sample of final submission:")
    print(out.head())

    out.write_parquet("submission_ensemble_em_beta_from_ranks.parquet")
    print("\n💾 Saved as: submission_ensemble_em_beta_from_ranks.parquet")

if __name__ == "__main__":
    main()

Writing ensemble_em.py


# **Stacking**

In [None]:
%%writefile create_stacked_features.py
import polars as pl

# Load the per-row predictions of the two base models.
df_dl = pl.read_csv("validation_preds_dl.csv")
df_xgb = pl.read_csv("validation_preds_xgboost.csv")

# The columns are combined positionally below, so both files must list the
# same rows in the same order.
for key_col, failure_message in (
    ("Id", "⚠️ Ids are not in the same order"),
    ("ranker_id", "⚠️ ranker_ids are not identical"),
):
    assert df_dl[key_col].to_list() == df_xgb[key_col].to_list(), failure_message

# Base table: keys, both models' rank and confidence, and the label.
df_combined = pl.DataFrame([
    df_dl["Id"],
    df_dl["ranker_id"],
    df_dl["selected"].alias("selected_dl"),
    df_xgb["selected"].alias("selected_xgboost"),
    df_dl["confidence"].alias("confidence_dl"),
    df_xgb["confidence"].alias("confidence_xgboost"),
    df_dl["label"],
])

# Number of rows sharing each ranker_id.
df_combined = df_combined.with_columns(
    group_size=pl.len().over("ranker_id"),
)

# Short handles for the four base columns used by the derived features.
rank_dl = pl.col("selected_dl")
rank_xgb = pl.col("selected_xgboost")
conf_dl = pl.col("confidence_dl")
conf_xgb = pl.col("confidence_xgboost")

df_combined = df_combined.with_columns(
    # Differences
    selected_diff=rank_dl - rank_xgb,
    confidence_diff=conf_dl - conf_xgb,
    # Mean and product
    confidence_mean=(conf_dl + conf_xgb) / 2,
    confidence_product=conf_dl * conf_xgb,
    # Ratios (the small constant keeps the denominator away from zero)
    confidence_ratio=conf_dl / (conf_xgb + 1e-6),
    selected_ratio=rank_dl / (rank_xgb + 1e-6),
    # Inverse ranks
    inv_rank_dl=1.0 / rank_dl,
    inv_rank_xgb=1.0 / rank_xgb,
    # 1 when both models assign the same rank, else 0
    rank_agreement=(rank_dl == rank_xgb).cast(pl.Int8),
)

# The group size is not part of the saved feature set.
df_combined = df_combined.drop("group_size")

# Persist the extended feature table.
df_combined.write_csv("validation_stacking_features_extended.csv")
print("✅ 'validation_stacking_features_extended.csv' created with extended features.")

Writing create_stacked_features.py


In [None]:
%%writefile train_stacking.py
import pandas as pd
import polars as pl
import numpy as np
import torch
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split

# -----------------------
# Ranking Metrics
# -----------------------

def calc_group_metrics(df, k=3):
    """Average HitRate@k, NDCG@k and MAP@k over the ranker_id groups of *df*.

    Groups with 10 or fewer rows are skipped and do not count towards the
    averages.

    Args:
        df: pandas DataFrame with the columns ``ranker_id``,
            ``stacked_score`` (higher means ranked earlier) and ``label``
            (values > 0 mark relevant rows).
        k: cutoff applied to the score-sorted rows of every group.

    Returns:
        Dict with the keys ``"HitRate@{k}"``, ``"NDCG@{k}"`` and
        ``"MAP@{k}"``; every value is 0.0 when no group qualifies.
    """
    hit_total, ndcg_total, map_total = 0.0, 0.0, 0.0
    n_groups = 0

    for _, group in df.groupby("ranker_id"):
        if len(group) <= 10:
            continue

        scores = torch.tensor(group["stacked_score"].values, dtype=torch.float32)
        labels = torch.tensor(group["label"].values, dtype=torch.float32)
        cutoff = min(k, len(group))

        # One top-k over the scores serves all three metrics.
        _, idx_pred = torch.topk(scores, cutoff)
        _, idx_ideal = torch.topk(labels, cutoff)
        discounts = torch.log2(torch.arange(2, 2 + cutoff).float())
        hits = labels[idx_pred] > 0

        # HitRate@k: at least one relevant row among the top k.
        if hits.any():
            hit_total += 1

        # NDCG@k: DCG of the predicted order over DCG of the ideal order.
        dcg = (labels[idx_pred] / discounts).sum()
        idcg = (labels[idx_ideal] / discounts).sum()
        ndcg_total += (dcg / idcg).item() if idcg > 0 else 0.0

        # MAP@k: mean of precision@j taken at every relevant position j
        # inside the top k.
        if hits.sum() > 0:
            precisions = [
                hits[: j + 1].float().sum() / (j + 1)
                for j in range(len(hits))
                if hits[j]
            ]
            map_total += torch.stack(precisions).mean().item()

        n_groups += 1

    # Label the results with the cutoff actually used instead of a fixed "@3".
    return {
        f"HitRate@{k}": hit_total / n_groups if n_groups > 0 else 0.0,
        f"NDCG@{k}": ndcg_total / n_groups if n_groups > 0 else 0.0,
        f"MAP@{k}": map_total / n_groups if n_groups > 0 else 0.0,
    }

# -----------------------
# Training
# -----------------------

# 1. Load data
df = pl.read_csv("validation_stacking_features_extended.csv")
pdf = df.to_pandas()

# 2. Features
feature_cols = [
    "confidence_dl", "confidence_xgboost",
    "selected_dl", "selected_xgboost",
    "selected_diff", "confidence_diff",
    "confidence_mean", "confidence_product",
    "confidence_ratio", "selected_ratio",
    "inv_rank_dl", "inv_rank_xgb",
    "rank_agreement"
]
X = pdf[feature_cols]
y = pdf["label"]

# 3. Grouped split: every ranker_id lands entirely in train or in validation
unique_groups = pdf["ranker_id"].unique()
train_groups, val_groups = train_test_split(unique_groups, test_size=0.2, random_state=42)

train_mask = pdf["ranker_id"].isin(train_groups)
val_mask = pdf["ranker_id"].isin(val_groups)

# XGBRanker reads `group` as the sizes of consecutive row blocks, so the rows
# of each ranker_id must be contiguous and in the same order as the sizes.
# The CSV gives no such guarantee, so each split is sorted by ranker_id
# (stable, keeping the in-group row order) and the sizes are then taken in
# order of appearance.
train_sorted = pdf[train_mask].sort_values("ranker_id", kind="stable")
val_sorted = pdf[val_mask].sort_values("ranker_id", kind="stable")

X_train = train_sorted[feature_cols]
y_train = train_sorted["label"]
X_val = val_sorted[feature_cols]
y_val = val_sorted["label"]

group_train = train_sorted.groupby("ranker_id", sort=False).size().values
group_val = val_sorted.groupby("ranker_id", sort=False).size().values

# 4. Train ranker
model = xgb.XGBRanker(
    objective="rank:ndcg",
    eval_metric="ndcg@3",
    tree_method="hist",
    learning_rate=0.01,
    max_depth=8,
    min_child_weight=5,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_estimators=1500
)

model.fit(
    X_train, y_train,
    group=group_train,
    eval_set=[(X_val, y_val)],
    eval_group=[group_val],
    verbose=True
)

# 5. Predict & re-rank (scoring needs no group information, so the original
#    row order of `pdf` is kept for the saved file)
pdf["stacked_score"] = model.predict(X)
pdf["selected"] = (
    pdf.groupby("ranker_id")["stacked_score"]
    .rank(method="first", ascending=False)
    .astype(int)
)

# 6. Save predictions
pl.from_pandas(pdf[["Id", "ranker_id", "stacked_score", "selected", "label"]])\
    .write_csv("validation_preds_stacked.csv")
print("✅ 'validation_preds_stacked.csv' saved.")

# 7. Evaluate metrics on the held-out groups only
val_df = pdf[val_mask]
val_out = val_df[["Id", "ranker_id", "stacked_score", "selected", "label"]]
metrics = calc_group_metrics(val_out, k=3)
print("\n📊 Ranking metrics on validation set:")
for m, v in metrics.items():
    print(f"• {m}: {v:.4f}")

# 8. Save model
joblib.dump(model, "stacked_ranker.joblib")
print("✅ Model saved to 'stacked_ranker.joblib'")

Overwriting train_stacking.py


In [None]:
%%writefile train_stacking.py
import pandas as pd
import polars as pl
import numpy as np
from sklearn.linear_model import LogisticRegression
import torch
from collections import defaultdict
import joblib
import xgboost as xgb

# -----------------------
# Ranking Metrics
# -----------------------

def calc_group_metrics(df, k=3):
    """Average HitRate@k, NDCG@k and MAP@k over the ranker_id groups of *df*.

    Groups with 10 or fewer rows are skipped and do not count towards the
    averages.

    Args:
        df: pandas DataFrame with the columns ``ranker_id``,
            ``stacked_score`` (higher means ranked earlier) and ``label``
            (values > 0 mark relevant rows).
        k: cutoff applied to the score-sorted rows of every group.

    Returns:
        Dict with the keys ``"HitRate@{k}"``, ``"NDCG@{k}"`` and
        ``"MAP@{k}"``; every value is 0.0 when no group qualifies.
    """
    hit_total, ndcg_total, map_total = 0.0, 0.0, 0.0
    n_groups = 0

    for _, group in df.groupby("ranker_id"):
        if len(group) <= 10:
            continue

        scores = torch.tensor(group["stacked_score"].values, dtype=torch.float32)
        labels = torch.tensor(group["label"].values, dtype=torch.float32)
        cutoff = min(k, len(group))

        # One top-k over the scores serves all three metrics.
        _, idx_pred = torch.topk(scores, cutoff)
        _, idx_ideal = torch.topk(labels, cutoff)
        discounts = torch.log2(torch.arange(2, 2 + cutoff).float())
        hits = labels[idx_pred] > 0

        # HitRate@k: at least one relevant row among the top k.
        if hits.any():
            hit_total += 1

        # NDCG@k: DCG of the predicted order over DCG of the ideal order.
        dcg = (labels[idx_pred] / discounts).sum()
        idcg = (labels[idx_ideal] / discounts).sum()
        ndcg_total += (dcg / idcg).item() if idcg > 0 else 0.0

        # MAP@k: mean of precision@j taken at every relevant position j
        # inside the top k.
        if hits.sum() > 0:
            precisions = [
                hits[: j + 1].float().sum() / (j + 1)
                for j in range(len(hits))
                if hits[j]
            ]
            map_total += torch.stack(precisions).mean().item()

        n_groups += 1

    # Label the results with the cutoff actually used instead of a fixed "@3".
    return {
        f"HitRate@{k}": hit_total / n_groups if n_groups > 0 else 0.0,
        f"NDCG@{k}": ndcg_total / n_groups if n_groups > 0 else 0.0,
        f"MAP@{k}": map_total / n_groups if n_groups > 0 else 0.0,
    }

# -----------------------
# Training
# -----------------------

# 1. Load data
df = pl.read_csv("validation_stacking_features_extended.csv")
pdf = df.to_pandas()

# 2. Define input features (all engineered features)
feature_cols = [
    "confidence_dl", "confidence_xgboost",
    "selected_dl", "selected_xgboost",
    "selected_diff", "confidence_diff",
    "confidence_mean", "confidence_product",
    "confidence_ratio", "selected_ratio",
    "inv_rank_dl", "inv_rank_xgb",
    "rank_agreement"
]
X = pdf[feature_cols]
y = pdf["label"]

# 3. Define and train the model
model = xgb.XGBClassifier(
    n_estimators=1500,
    learning_rate=0.01,
    max_depth=8,
    min_child_weight=5,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    # Weight positives by the negative/positive count ratio.
    scale_pos_weight=(y == 0).sum() / (y == 1).sum(),
    n_jobs=-1,
    random_state=42,
    use_label_encoder=False,
    eval_metric="logloss"
)
model.fit(X, y)

# 4. Predict and re-rank
pdf["stacked_score"] = model.predict_proba(X)[:, 1]

pdf["selected"] = (
    pdf.groupby("ranker_id")["stacked_score"]
    .rank(method="first", ascending=False)
    .astype(int)
)

# 5. Save results
out = pdf[["Id", "ranker_id", "stacked_score", "selected", "label"]]
pl.from_pandas(out).write_csv("validation_preds_stacked.csv")
print("✅ 'validation_preds_stacked.csv' saved.")

# 6. Evaluate
# NOTE: the model is scored on the same rows it was fitted on, so these
# numbers are in-sample.
metrics = calc_group_metrics(out, k=3)
print("\n📊 Ranking metrics on validation set:")
for m, v in metrics.items():
    print(f"• {m}: {v:.4f}")

# Save model to disk. The file name has to be the one predict_stacked.py
# loads and the one reported below; dumping to a different name left the
# prediction script without its model.
model_path = "stacked_logreg_model.joblib"
joblib.dump(model, model_path)
print(f"✅ Modelul a fost salvat ca '{model_path}'")

Overwriting train_stacking.py


In [None]:
%%writefile predict_stacked.py
import pandas as pd
import polars as pl
import numpy as np
import joblib

# Restore the stacked model from disk.
model = joblib.load("stacked_logreg_model.joblib")
print("✅ Model loaded from 'stacked_logreg_model.joblib'")

# Read the engineered stacking features and switch to pandas for scoring.
# Only the feature columns listed below and the keys are used here.
df = pl.read_csv("validation_stacking_features_extended.csv")
pdf = df.to_pandas()

# Model inputs: same columns, in the same order, as at training time.
feature_cols = [
    "confidence_dl",
    "confidence_xgboost",
    "selected_dl",
    "selected_xgboost",
    "selected_diff",
    "confidence_diff",
    "confidence_mean",
    "confidence_product",
    "confidence_ratio",
    "selected_ratio",
    "inv_rank_dl",
    "inv_rank_xgb",
    "rank_agreement",
]
X = pdf[feature_cols]

# Probability of the positive class is the stacked score.
positive_proba = model.predict_proba(X)[:, 1]
pdf["stacked_score"] = positive_proba

# Rank rows inside every ranker_id, best score first; ties keep file order.
ranks_in_group = pdf.groupby("ranker_id")["stacked_score"].rank(
    method="first", ascending=False
)
pdf["selected"] = ranks_in_group.astype(int)

# The submission keeps only the keys and the final rank.
submission = pdf[["Id", "ranker_id", "selected"]]
pl.from_pandas(submission).write_parquet("submission_stacked.parquet")
print("✅ submission_stacked.parquet saved.")


Writing predict_stacked.py


# **Check Val Differences**

In [None]:
%%writefile check.py
import polars as pl
import numpy as np
from tqdm import tqdm
from collections import Counter

def hitrate_at_k(group_df, k=3):
    """Tell whether any of the k highest-scored rows has a positive label.

    Args:
        group_df: polars DataFrame holding the rows of one group, with the
            columns ``score`` and ``label``.
        k: number of top-scored rows to inspect.

    Returns:
        True when at least one inspected row has ``label > 0``.
    """
    ranked = group_df.sort("score", descending=True)
    best_rows = ranked.head(k)
    is_hit = (best_rows["label"] > 0).any()
    return is_hit

def analyze_misses(df: pl.DataFrame, k=3, min_group_size=10):
    """Collect, for every missed group, how its top-k rows differ from the truth.

    A group is a miss when it has at least ``min_group_size`` rows, contains
    a row with ``label > 0``, and none of its ``k`` highest-scored rows has
    ``label > 0``. For such a group each top-k row is compared column by
    column with the highest-scored positive row.

    Args:
        df: frame with at least the columns ``ranker_id``, ``score`` and
            ``label``.
        k: number of top-scored rows inspected per group.
        min_group_size: groups with fewer rows than this are ignored.

    Returns:
        ``(misses, diff_counter)`` where ``misses`` is a list of dicts with
        the keys ``"ranker_id"`` and ``"topk_differences"`` (a list of
        ``(rank, {column: (true_value, predicted_value)})`` pairs), and
        ``diff_counter`` counts how often each column differed.
    """
    misses = []
    diff_counter = Counter()

    for group_key, group_df in tqdm(df.group_by("ranker_id"), desc="Evaluating HitRate@3"):
        if group_df.height < min_group_size:
            continue

        ranked = group_df.sort("score", descending=True)
        topk = ranked.head(k)
        positives = ranked.filter(pl.col("label") > 0)

        # Nothing to compare against when the group has no positive row.
        if positives.is_empty():
            continue

        # A hit inside the top k is not a miss.
        if (topk["label"] > 0).any():
            continue

        # Iterating a polars group_by can yield the key as a 1-tuple; unwrap
        # it so the report shows the plain id instead of "('id',)".
        ranker_id = group_key[0] if isinstance(group_key, tuple) else group_key

        # Highest-scored positive row, as {column: value}.
        label_row = positives.row(0, named=True)
        differences = []

        for position, row in enumerate(topk.iter_rows(named=True), start=1):
            diff = {}
            for col in ranked.columns:
                if row[col] != label_row[col]:
                    diff[col] = (label_row[col], row[col])
                    diff_counter[col] += 1
            differences.append((position, diff))

        misses.append({
            "ranker_id": ranker_id,
            "topk_differences": differences
        })

    return misses, diff_counter


def main():
    """Report HitRate@3 for a prediction CSV and dump an analysis of the misses.

    Reads ``validation_preds_dl_full.csv``, prints HitRate@3 over the groups
    that are large enough, and writes two text reports:
    ``hitrate3_miss_analysis.txt`` (per-group differences between the top-3
    rows and the ground-truth row) and ``hitrate3_diff_column_stats.txt``
    (how often each column differed).

    Raises:
        ValueError: if the CSV lacks ``ranker_id``, ``score`` or ``label``.
    """
    csv_path = "validation_preds_dl_full.csv"
    df = pl.read_csv(csv_path)

    required_cols = {"ranker_id", "score", "label"}
    if not required_cols.issubset(set(df.columns)):
        raise ValueError(f"The CSV file must contain the following columns: {required_cols}")

    # Keep only groups with more than 10 rows, the same rule
    # calc_group_metrics in train_stacking.py applies (it skips len <= 10).
    # The same threshold is used for the hit rate and the miss analysis so
    # both describe the same set of groups.
    min_group_size = 11

    total_groups = 0
    hit_groups = 0
    for _, group_df in df.group_by("ranker_id"):
        if group_df.height < min_group_size:
            continue
        total_groups += 1
        if hitrate_at_k(group_df, k=3):
            hit_groups += 1

    hitrate = hit_groups / total_groups if total_groups else 0.0
    print(f"\n🎯 HitRate@3: {hitrate:.4f} over {total_groups} valid groups")

    misses, diff_counter = analyze_misses(df, k=3, min_group_size=min_group_size)

    # The reports contain emoji and arrows; write them as UTF-8 explicitly so
    # the script does not depend on the platform's default encoding.
    with open("hitrate3_miss_analysis.txt", "w", encoding="utf-8") as f:
        for miss in misses:
            f.write(f"Group: {miss['ranker_id']}\n")
            f.write("❌ Differences from top 3 predictions compared to ground truth:\n")
            for rank, diffs in miss["topk_differences"]:
                f.write(f"  ↳ Top {rank} differences:\n")
                if not diffs:
                    f.write("    (Identical to ground truth)\n")
                else:
                    for key, (true_val, pred_val) in diffs.items():
                        f.write(f"    {key}: ground_truth={true_val} | predicted={pred_val}\n")
            f.write("\n" + "-" * 40 + "\n")

    print("📄 Group-wise differences saved to 'hitrate3_miss_analysis.txt'.")

    with open("hitrate3_diff_column_stats.txt", "w", encoding="utf-8") as f:
        f.write("📊 Top Differing Columns (sorted by frequency):\n\n")
        for col, count in diff_counter.most_common():
            f.write(f"{col}: {count} differences\n")

    print("📊 Column difference summary saved to 'hitrate3_diff_column_stats.txt'.")

if __name__ == "__main__":
    main()

Overwriting check.py
