In [1]:
import pandas as pd 

# Keep only the rows of an existing submission whose user_id also appears in
# test_new_version.csv, and write the filtered rows to a new CSV.
# The path uses forward slashes (as the rest of the notebook does) so it
# resolves on both Windows and POSIX; a backslash path only works on Windows.
df = pd.read_csv('yassir-ai-market-challenge/yassir_marekt_data_09_2025 2/test_new_version.csv')
unique = df['user_id'].nunique()  # number of distinct user ids in the test file
df2 = pd.read_csv('submission_new_3_fixed_22222223_nine_wiw_yousefff.csv')
merged = df2[df2['user_id'].isin(df['user_id'])]
merged.to_csv('submission_new_3_fixed_22222223_nine_wiw_yousefff2232.csv', index=False)

In [2]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   - -------------------------------------- 0.0/1.5 MB 219.4 kB/s eta 0:00:07
   -- ------------------------------------- 0.1/1.5 MB 476.3 kB/s eta 0:00:03
   ----- ---------------------------------- 0.2/1.5 MB 794.9 kB/s eta 0:00:02
   ------- -------------------------------- 0.3/1.5 MB 1.1 MB/s eta 0:00:02
   ---------- ----------------------------- 0.4/1.5 MB 1.2 MB/s eta 0:00:01
   ---------- ----------------------------- 0.4/1.5 MB 1.0 MB/s eta 0:00:02
   ------------- -------------------------- 0.5/1.5 MB 1.3 MB/s eta 0:00:01
   ------------- -------------------------- 0.5/1.5 MB 1.3 MB/s eta 0:00:01
   ----------------- ----------

DEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063


In [7]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Run banner: rule, title, rule (one per line).
print(
    "=" * 80,
    "FAST OPTIMIZED REORDER PREDICTION - TARGET MAP 0.5+ in <10 MIN",
    "=" * 80,
    sep="\n",
)

# Settings shared by the pipeline functions below.
CONFIG = dict(
    # Directory prefix that the CSV file names are appended to.
    data_path='yassir-ai-market-challenge/yassir_marekt_data_09_2025 2/',
    # Minimum predicted probability for a product to be picked first.
    threshold=0.15,
    # Fraction of users kept when building the training set (for speed).
    sample_frac=0.7,
)

def load_data():
    """Read the challenge CSV files and downcast the order columns.

    All four files are read from the directory given by
    ``CONFIG['data_path']``.

    Returns:
        tuple: ``(orders, order_products, products, test_user_ids)`` where the
        first three are DataFrames and ``test_user_ids`` is the ``user_id``
        column of ``test_new_version.csv`` as a NumPy array.
    """
    print("\n[1/6] Loading data...")
    base_path = CONFIG['data_path']

    # orders is read without explicit dtypes because order_day / order_hour
    # may arrive as strings; they are converted below.
    orders = pd.read_csv(base_path + 'orders_df.csv')
    order_products = pd.read_csv(base_path + 'orders_products_df.csv',
                                 dtype={'order_id': np.int32, 'product_id': np.int32,
                                        'add_to_cart_order': np.int16, 'reordered': np.int8})
    products = pd.read_csv(base_path + 'products_df.csv',
                          dtype={'product_id': np.int32, 'category_id': np.int16})
    # Read the test file through the configured data directory as well, so the
    # location is controlled by CONFIG and the path works on any OS (the
    # previous hard-coded backslash path was Windows-only).
    user_test = pd.read_csv(base_path + 'test_new_version.csv',
                           dtype={'user_id': np.int32})

    # Shrink the identifier columns to compact integer types.
    orders['user_id'] = orders['user_id'].astype(np.int32)
    orders['order_id'] = orders['order_id'].astype(np.int32)
    orders['order_number'] = orders['order_number'].astype(np.int16)

    # Day names become small integers; names missing from the map fall back to 0.
    if orders['order_day'].dtype == 'object':
        day_map = {'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4,
                   'Friday': 5, 'Saturday': 6, 'Sunday': 0}
        orders['order_day'] = orders['order_day'].map(day_map).fillna(0).astype(np.int8)
    else:
        orders['order_day'] = orders['order_day'].astype(np.int8)

    # String hours are parsed; unparseable values become 0.
    if orders['order_hour'].dtype == 'object':
        orders['order_hour'] = pd.to_numeric(orders['order_hour'], errors='coerce').fillna(0).astype(np.int8)
    else:
        orders['order_hour'] = orders['order_hour'].astype(np.int8)

    orders['days_since_last_order'] = orders['days_since_last_order'].astype(np.float32)

    print(f"  Orders: {len(orders):,}, Order-Products: {len(order_products):,}")

    return orders, order_products, products, user_test['user_id'].values

def build_fast_features(order_products, orders, products):
    """Derive user-product, product-level and user-level feature tables.

    Args:
        order_products: order line items; the columns read are ``order_id``,
            ``product_id``, ``add_to_cart_order`` and ``reordered``.
        orders: order headers; the columns read are ``order_id``,
            ``user_id``, ``order_number`` and ``days_since_last_order``.
        products: accepted but not read by this function.

    Returns:
        tuple: ``(up, prod, user)``. ``up`` has one row per
        ``(user_id, product_id)`` pair with those two keys as ordinary
        columns; ``prod`` is indexed by ``product_id``; ``user`` is indexed
        by ``user_id``.
    """
    print("\n[2/6] Building features (vectorized)...")

    pair_keys = ['user_id', 'product_id']

    # Attach the owning user and order sequence number to every line item.
    order_keys = orders[['order_id', 'user_id', 'order_number']]
    op = order_products.merge(order_keys, how='left', on='order_id')

    # orders_ago is 0 for a user's highest order_number present here,
    # 1 for the one before it, and so on.
    user_max_order = op.groupby('user_id')['order_number'].transform('max')
    op['orders_ago'] = user_max_order - op['order_number']

    print("  User-Product features...")
    up = (
        op.groupby(pair_keys, sort=False)
        .agg(
            up_orders=('order_id', 'count'),
            up_reorders=('reordered', 'sum'),
            up_first_order=('order_number', 'min'),
            up_last_order=('order_number', 'max'),
            up_orders_since_last=('orders_ago', 'min'),
            up_cart_pos=('add_to_cart_order', 'mean'),
        )
        .reset_index()
    )

    # Per-user highest order number, looked up for each pair row.
    max_order_by_user = user_max_order.groupby(op['user_id']).first()
    up['user_total_orders'] = up['user_id'].map(max_order_by_user)
    up['up_order_rate'] = up['up_orders'] / up['user_total_orders']

    # Reorders divided by the purchases after the first one; 0 when the
    # pair was bought only once.
    later_purchases = up['up_orders'] - 1
    up['up_reorder_ratio'] = np.where(later_purchases > 0,
                                       up['up_reorders'] / later_purchases, 0)

    print("  Recency features...")
    # NOTE: the filter is orders_ago <= window, so each count spans
    # window + 1 orders (orders_ago 0..window inclusive).
    for window in (3, 5):
        col = f'up_last_{window}'
        recent_counts = (
            op.loc[op['orders_ago'] <= window]
            .groupby(pair_keys, sort=False)
            .size()
            .rename(col)
        )
        up = up.merge(recent_counts, left_on=pair_keys, right_index=True, how='left')
        up[col] = up[col].fillna(0)

    # The tighter window is weighted double.
    up['up_recency_score'] = up['up_last_3'] * 2 + up['up_last_5']

    print("  Product features...")
    prod = op.groupby('product_id', sort=False).agg(
        prod_orders=('order_id', 'count'),
        prod_reorder_rate=('reordered', 'mean'),
    )

    print("  User features...")
    user = orders.groupby('user_id', sort=False).agg(
        user_orders=('order_id', 'count'),
        user_avg_days=('days_since_last_order', 'mean'),
    )
    # Share of the user's line items flagged as reorders (aligned on user_id).
    user['user_reorder_rate'] = op.groupby('user_id', sort=False)['reordered'].mean()

    return up, prod, user

def create_training_data(order_products, orders, products, sample_frac=0.7):
    """Build the labelled training frame from each user's order history.

    For every sampled user the order with the highest ``order_number`` is
    held out as the label source, and features are built from the user's
    earlier orders only. A ``(user_id, product_id)`` candidate gets
    ``label`` 1 when the product appears in the held-out order, else 0.

    Args:
        order_products: order line items (``order_id``, ``product_id``, ...).
        orders: order headers (``order_id``, ``user_id``, ``order_number``, ...).
        products: forwarded unchanged to ``build_fast_features``.
        sample_frac: fraction of users to keep; values >= 1.0 keep everyone.

    Returns:
        DataFrame with the key columns, all feature columns and ``label``.
    """
    print("\n[3/6] Creating training data...")

    # Tag every order with its user's highest order number.
    user_max = orders.groupby('user_id')['order_number'].max()
    orders_with_max = orders.merge(user_max.rename('max_order'),
                                   left_on='user_id', right_index=True)
    is_last_order = orders_with_max['order_number'] == orders_with_max['max_order']
    is_prior_order = orders_with_max['order_number'] < orders_with_max['max_order']

    # Sample users for faster training.
    train_users = orders_with_max.loc[is_last_order, 'user_id'].unique()
    if sample_frac < 1.0:
        # A private RandomState(42) draws the same sample as seeding the
        # global generator with 42, without resetting global NumPy RNG
        # state for the rest of the notebook.
        rng = np.random.RandomState(42)
        train_users = rng.choice(train_users,
                                 size=int(len(train_users) * sample_frac),
                                 replace=False)
        print(f"  Sampled {len(train_users):,} users for training")

    # Membership mask computed once and reused for both splits.
    is_sampled = orders_with_max['user_id'].isin(train_users)
    train_orders = orders_with_max.loc[is_sampled & is_last_order, 'order_id'].values
    prior_orders = orders_with_max[is_sampled & is_prior_order]

    # Products in the held-out orders are the positives.
    train_labels = order_products[order_products['order_id'].isin(train_orders)].copy()
    train_labels = train_labels.merge(orders[['order_id', 'user_id']], on='order_id')
    train_labels['label'] = 1

    # Features come from prior orders only, so the label order never leaks in.
    up_feat, prod_feat, user_feat = build_fast_features(
        order_products[order_products['order_id'].isin(prior_orders['order_id'])],
        prior_orders,
        products
    )

    # Every (user, product) pair seen in the prior orders is a candidate.
    # up_feat already holds exactly one row per candidate with the keys as
    # its leading columns, so it is used directly instead of being merged
    # back onto its own key columns.
    print(f"  Candidates: {len(up_feat):,}")

    train_df = up_feat.merge(prod_feat, left_on='product_id', right_index=True, how='left')
    train_df = train_df.merge(user_feat, left_on='user_id', right_index=True, how='left')

    # Candidates absent from the held-out order get label 0.
    train_df = train_df.merge(
        train_labels[['user_id', 'product_id', 'label']],
        on=['user_id', 'product_id'],
        how='left'
    )
    train_df['label'] = train_df['label'].fillna(0).astype(np.int8)

    # Fill remaining numeric gaps with 0, touching only columns that have any.
    for col in train_df.select_dtypes(include=[np.number]).columns:
        if train_df[col].isna().any():
            train_df[col] = train_df[col].fillna(0)

    print(f"  Samples: {len(train_df):,}, Positive: {train_df['label'].mean():.4f}")

    return train_df

def train_model(train_df):
    """Fit a binary LightGBM classifier on the training frame.

    Args:
        train_df: frame holding ``user_id``, ``product_id``, ``label`` and
            the feature columns; every other column is used as a feature.

    Returns:
        tuple: ``(model, features)`` where ``model`` is the object returned
        by ``lgb.train`` and ``features`` is the ordered list of feature
        column names.
    """
    print("\n[4/6] Training model...")

    non_feature_cols = ('user_id', 'product_id', 'label')
    features = [name for name in train_df.columns if name not in non_feature_cols]

    X = train_df[features].values.astype(np.float32)
    y = train_df['label'].values

    print(f"  Features: {len(features)}, Samples: {len(X):,}")

    # Hold out 10% of the rows to drive early stopping.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, random_state=42, test_size=0.1
    )

    params = dict(
        objective='binary',
        metric='auc',
        learning_rate=0.1,
        num_leaves=127,
        max_depth=8,
        min_child_samples=100,
        subsample=0.8,
        subsample_freq=1,
        colsample_bytree=0.8,
        reg_alpha=0.05,
        reg_lambda=0.05,
        verbose=-1,
        force_col_wise=True,
        num_threads=4,
        max_bin=255,
    )

    train_set = lgb.Dataset(X_train, y_train, free_raw_data=True)
    valid_set = lgb.Dataset(X_val, y_val, reference=train_set, free_raw_data=True)

    # Up to 300 rounds; stop after 30 rounds without improvement on the
    # validation metric and log every 50 rounds.
    stop_early = lgb.early_stopping(30)
    log_every_50 = lgb.log_evaluation(50)
    model = lgb.train(
        params,
        train_set,
        num_boost_round=300,
        valid_sets=[valid_set],
        valid_names=['valid'],
        callbacks=[stop_early, log_every_50],
    )

    best_auc = model.best_score['valid']['auc']
    print(f"\n  Best: iter={model.best_iteration}, AUC={best_auc:.4f}")

    return model, features

def predict_test(model, features, orders, order_products, products, test_users):
    """Score every (user, product) candidate belonging to the test users.

    Args:
        model: trained model exposing ``predict``.
        features: ordered feature column names the model was trained on.
        orders, order_products, products: passed to ``build_fast_features``.
        test_users: user ids to keep.

    Returns:
        DataFrame of candidates with all feature columns plus a ``prob``
        column holding the model output.
    """
    print("\n[5/6] Predicting...")

    # Features are rebuilt from all supplied orders.
    up_feat, prod_feat, user_feat = build_fast_features(order_products, orders, products)

    # Keep only test-user candidates, then attach product and user stats.
    is_test_user = up_feat['user_id'].isin(test_users)
    test_df = (
        up_feat[is_test_user]
        .copy()
        .merge(prod_feat, left_on='product_id', right_index=True, how='left')
        .merge(user_feat, left_on='user_id', right_index=True, how='left')
    )

    # Zero-fill numeric columns that contain missing values.
    numeric_cols = test_df.select_dtypes(include=[np.number]).columns
    for name in numeric_cols:
        if test_df[name].isna().any():
            test_df[name] = test_df[name].fillna(0)

    print(f"  Candidates: {len(test_df):,}")

    # All candidates are scored in a single predict call.
    X_test = test_df[features].values.astype(np.float32)
    test_df['prob'] = model.predict(X_test, num_threads=4)

    return test_df

def generate_submission(test_df, test_users, threshold, order_products):
    """Pick up to 10 products per test user and write ``submission.csv``.

    Selection per user, over that user's 20 highest-``prob`` candidates:
      1. take candidates with ``prob >= threshold`` (at most 10);
      2. if fewer than 5 were taken, append up to 10 of the remaining
         candidates in descending ``prob`` order;
      3. pad with the globally most frequently reordered products until 10
         are chosen (or the 50-item fallback list is exhausted);
      4. keep the first 10.

    Args:
        test_df: scored candidates with ``user_id``, ``product_id``, ``prob``.
        test_users: iterable of user ids; one output row is written per entry.
        threshold: minimum probability for step 1.
        order_products: line items with ``product_id`` and ``reordered``,
            used to build the fallback list.

    Side effects:
        Writes ``submission.csv`` (columns ``user_id``, ``products``; products
        are space-separated) sorted by ``user_id``.
    """
    print("\n[6/6] Generating submission...")

    # Fallback list: the 50 products with the most reordered line items.
    fallback = (
        order_products.loc[order_products['reordered'] == 1, 'product_id']
        .value_counts()
        .head(50)
        .index.tolist()
    )

    # Rank candidates per user once, and keep each user's top 20 in a single
    # pass. Filtering test_df with a boolean mask inside the per-user loop
    # would rescan every candidate row for every user.
    ranked = test_df.sort_values(['user_id', 'prob'], ascending=[True, False])
    top = ranked.groupby('user_id', sort=False).head(20)

    # user_id -> [(product_id, prob), ...] in descending prob order.
    per_user = {}
    for uid, pid, prob in zip(top['user_id'].tolist(),
                              top['product_id'].tolist(),
                              top['prob'].tolist()):
        per_user.setdefault(uid, []).append((pid, prob))

    results = []
    for user_id in test_users:
        candidates = per_user.get(user_id, ())

        # Step 1: products at or above the threshold.
        selected = [pid for pid, prob in candidates if prob >= threshold][:10]

        # Step 2: too few confident picks -> add the next best candidates.
        if len(selected) < 5:
            already = set(selected)
            selected.extend([pid for pid, _ in candidates if pid not in already][:10])

        # Step 3: pad with popular reordered products, skipping duplicates.
        chosen = set(selected)
        for p in fallback:
            if len(selected) >= 10:
                break
            if p not in chosen:
                selected.append(p)
                chosen.add(p)

        results.append(' '.join(map(str, selected[:10])))

    submission = pd.DataFrame({
        'user_id': test_users,
        'products': results
    })

    submission = submission.sort_values('user_id')
    submission.to_csv('submission.csv', index=False)

    print(f"  ✓ Saved {len(submission):,} users")
    print("="*80)

def main():
    """Run the pipeline end to end: load, build training data, train, predict, write."""
    orders, order_products, products, test_users = load_data()

    # Training frame is built from a sampled subset of users.
    training_frame = create_training_data(
        order_products,
        orders,
        products,
        sample_frac=CONFIG['sample_frac'],
    )

    model, features = train_model(training_frame)

    # Score the test users' candidates, then write the submission file.
    scored = predict_test(model, features, orders, order_products, products, test_users)
    generate_submission(scored, test_users, CONFIG['threshold'], order_products)


if __name__ == "__main__":
    main()

FAST OPTIMIZED REORDER PREDICTION - TARGET MAP 0.5+ in <10 MIN

[1/6] Loading data...
  Orders: 3,193,704, Order-Products: 31,536,952

[3/6] Creating training data...
  Sampled 144,346 users for training

[2/6] Building features (vectorized)...
  User-Product features...
  Recency features...
  Product features...
  User features...
  Candidates: 8,695,008
  Samples: 8,695,008, Positive: 0.0643

[4/6] Training model...
  Features: 17, Samples: 8,695,008
Training until validation scores don't improve for 30 rounds
[50]	valid's auc: 0.819464
[100]	valid's auc: 0.820656
[150]	valid's auc: 0.821436
[200]	valid's auc: 0.822108
[250]	valid's auc: 0.822596
[300]	valid's auc: 0.823039
Did not meet early stopping. Best iteration is:
[300]	valid's auc: 0.823039

  Best: iter=300, AUC=0.8230

[5/6] Predicting...

[2/6] Building features (vectorized)...
  User-Product features...
  Recency features...
  Product features...
  User features...
  Candidates: 4,164,843

[6/6] Generating submission...


In [8]:
import pandas as pd

# Read the generated submission and expose the 'products' column as
# 'product_id' (the rename is a no-op if that column is absent).
df = pd.read_csv('submission.csv').rename(columns={'products': 'product_id'})

# Show the resulting header as a quick check.
print(df.columns)

# Write the renamed frame to a separate CSV.
df.to_csv('submission_yousef_new_sayi.csv', index=False)


Index(['user_id', 'product_id'], dtype='object')
