In [1]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder
import gc  # Garbage Collector
import warnings
import sys
sys.path.append('/kaggle/input/newalbert')

from utils import (
    clean_features,
    create_features,
    create_initial_datetime_features,
    create_remaining_features,
    reduce_mem_usage,
    unify_nan_strategy,
    calculate_hit_rate_at_3,
)
from pipeline import (
    load_data,
    preprocess_dataframe,
    prepare_matrices,
    encode_categoricals,
    train_model,
)

# Notebook-wide settings: suppress every warning and remove the pandas limits on
# how many rows/columns are shown when a frame is printed.
warnings.simplefilter('ignore')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Number of cross-validation folds handed to train_model.
N_FOLDS = 5

# Training parameters - modify as needed
# NOTE(review): in LightGBM, `subsample` (bagging_fraction) only takes effect when
# `subsample_freq` (bagging_freq) is > 0 — confirm train_model sets it, otherwise
# no row subsampling happens.
# NOTE(review): `random_state` and `seed` are aliases of the same LightGBM
# parameter; both are kept here with the same value.
model_params = dict(
    # Objective / booster
    objective='lambdarank',
    metric='None',  # the string 'None', not the Python None object
    boosting_type='gbdt',
    # Ensemble size and step
    n_estimators=8000,
    learning_rate=0.03,
    # Tree shape
    num_leaves=63,
    max_depth=8,
    min_child_samples=25,
    # Row / column sampling and histogram resolution
    subsample=0.8,
    colsample_bytree=0.8,
    max_bin=31,
    # Regularisation
    lambda_l1=1.0,
    lambda_l2=1.0,
    min_gain_to_split=0.1,
    # Runtime / reproducibility
    random_state=42,
    n_jobs=-1,
    importance_type='gain',
    verbose=-1,
    seed=42,
)

# Smart fill

# In[2]:



# In[9]:


# Cell 3: Load data, preprocess, and build the model matrices
# load_data returns four values; the third is not used in this notebook.
train_df, test_df, _, test_ids_df = load_data()
train_df_processed = preprocess_dataframe(train_df, is_train=True)
test_df_processed = preprocess_dataframe(test_df, is_train=False)
X, y, X_test, train_ranker_ids = prepare_matrices(train_df_processed, test_df_processed)

# None of the raw/processed frames is referenced again below. Drop the names and
# collect so these large intermediates can be reclaimed before training starts.
del train_df, test_df, train_df_processed, test_df_processed
gc.collect()

X, X_test, cat_features = encode_categoricals(X, X_test)
# The third value returned by clean_features is not needed here.
X, X_test, _ = clean_features(X, X_test, low_var_thresh=1)

# cat_features was produced before clean_features ran, so it can still name
# columns that are no longer in X; LightGBM rejects unknown categorical names.
# Keep only the surviving names and report anything that was filtered out so the
# change is never silent.
# NOTE(review): assumes X is a DataFrame and cat_features holds column names — confirm.
remaining_columns = set(X.columns)
removed_categoricals = [col for col in cat_features if col not in remaining_columns]
if removed_categoricals:
    print(f"Categorical features no longer present after clean_features: {removed_categoricals}")
categorical_features_for_encoding = [col for col in cat_features if col in remaining_columns]
print(f"Final shapes -> X_train: {X.shape}, X_test: {X_test.shape}")




# Cell 6: Model Training
# Fit the ranker with the configured parameters and fold count; train_model hands
# back the test-set scores together with a feature-importance table.
test_preds_scores, feature_importances = train_model(
    X, y, X_test,
    train_ranker_ids,
    categorical_features_for_encoding,
    params=model_params, n_folds=N_FOLDS,
)

# Display feature importance after training: only the feature name and its
# 'average' column are shown.
feature_importances_display = feature_importances.loc[:, ['feature', 'average']]
print("\nTop Feature Importances:", feature_importances_display, sep="\n")


# Cell 7: Build the submission, validate it, and only then write it to disk.

# Use the test_ids_df we saved earlier which has original Id and ranker_id
submission_df = test_ids_df.copy()
assert len(test_preds_scores) == len(submission_df), "Number of predictions doesn't match test set"
submission_df['score'] = test_preds_scores
# If the scores arrive as a Series with a different index, pandas aligns on the
# index and leaves NaN behind; fail here with a clear message rather than inside
# the astype(int) call below.
assert submission_df['score'].notna().all(), "Some rows have no score after assignment"

# Rank within each ranker_id group: highest score gets rank 1, and ties are
# broken by row order (method='first') so every rank in a group is distinct.
submission_df['selected'] = (
    submission_df.groupby('ranker_id')['score']
    .rank(method='first', ascending=False)
    .astype(int)
)

# Select only required columns and ensure correct order
submission_df = submission_df[['Id', 'ranker_id', 'selected']]

# Basic validation of submission (run BEFORE saving so an invalid file is never written)
# 1. All Ids from test set are present, each exactly once
assert len(submission_df) == len(test_ids_df), "Number of rows doesn't match test set"
assert submission_df['Id'].nunique() == test_ids_df['Id'].nunique(), "Mismatch in unique Ids"
assert submission_df['Id'].is_unique, "Duplicate Ids in submission"

# 2. Ranks are integers and start from 1
assert submission_df['selected'].min() >= 1, "Ranks should be >= 1"
# is_integer_dtype accepts any integer width, unlike a comparison against 'int',
# whose meaning depends on the platform's default integer size.
assert pd.api.types.is_integer_dtype(submission_df['selected']), "Ranks should be integers"

# 3. Ranks are a valid permutation within each group: with min >= 1 already
#    checked, N distinct integer ranks whose maximum is N must be exactly 1..N.
rank_stats = submission_df.groupby('ranker_id')['selected'].agg(['count', 'nunique', 'max'])
assert (rank_stats['nunique'] == rank_stats['count']).all(), "Duplicate ranks within a ranker_id group"
assert (rank_stats['max'] == rank_stats['count']).all(), "Ranks are not 1..N within every ranker_id group"

print("Basic submission validation checks passed (row count, Id uniqueness, rank min value, rank dtype, per-group rank permutation).")

# Save submission
submission_df.to_parquet('submission.parquet', index=False)
submission_df.to_csv('submission.csv', index=False)
print("\nSubmission files 'submission.parquet' and 'submission.csv' created successfully.")
print(f"Submission shape: {submission_df.shape}")

# In[ ]:




# In[ ]:




# In[ ]:




# In[ ]:


Loading a subset of columns for train_df...
Train reducido (grupos completos): (9123530, 36)
Grupos con <11 filas: 7880
Loading a subset of columns for test_df...
Converting column legs0_departureAt (current dtype: object) to datetime.
Converting column legs0_arrivalAt (current dtype: object) to datetime.
Converting column legs1_departureAt (current dtype: object) to datetime.
Converting column legs1_arrivalAt (current dtype: object) to datetime.
Mem. usage decreased to 1116.44 Mb (39.6% reduction)
Processing group-wise features for train on columns: ['totalPrice']
  Calculating rank for totalPrice...
Mem. usage decreased to 2090.86 Mb (0.8% reduction)
Converting column legs0_departureAt (current dtype: object) to datetime.
Converting column legs0_arrivalAt (current dtype: object) to datetime.
Converting column legs1_departureAt (current dtype: object) to datetime.
Converting column legs1_arrivalAt (current dtype: object) to datetime.
Mem. usage decreased to 883.14 Mb (36.4% reduction)

KeyboardInterrupt: 