In [42]:
import numpy as np
import polars as pl
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import xgboost as xgb

In [43]:
# Load Data
# Read the training and test splits from CSV files in the working directory.
train_data, test_data = (pl.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [44]:
# Remove the identifier column from the training frame. Selecting "everything
# except PassengerId" (instead of drop) keeps this cell idempotent: re-running
# it after the column is already gone is a no-op rather than an error.
train_data = train_data.select(pl.exclude('PassengerId'))

In [45]:
def process_data(df, ticket_freq_map=None, age_medians=None, family_surv_map=None, is_train=True):
    """Turn a raw passenger frame into the numeric feature matrix used for modelling.

    Lookup tables (age medians, ticket frequencies, family survival rates) are
    computed from ``df`` when they are not supplied, and returned so that a
    second call can reuse them for another frame.

    Parameters
    ----------
    df : pl.DataFrame
        Frame with the columns Name, Sex, Pclass, Age, SibSp, Parch, Fare,
        Embarked and Ticket; Survived is optional.
    ticket_freq_map : pl.DataFrame, optional
        Columns ``Ticket`` and ``Ticket_Freq``. Built from ``df`` when None.
    age_medians : pl.DataFrame, optional
        Columns ``Sex``, ``Pclass`` and ``Age_median``. Built from ``df`` when None.
    family_surv_map : pl.DataFrame, optional
        Columns ``Surname`` and ``Family_Surv_Rate``. Only read when the
        lookup branch is taken (``is_train`` false or no Survived column).
    is_train : bool
        When true and ``df`` has a Survived column, Family_Surv_Rate is computed
        from ``df`` itself and a fresh ``family_surv_map`` is built.

    Returns
    -------
    tuple
        ``(X, Y, ticket_freq_map, age_medians, family_surv_map)`` where ``X``
        holds the feature columns and ``Y`` is the Survived series, or None
        when ``df`` has no Survived column. Rows of ``X`` and ``Y`` are in the
        same order as the rows of the input ``df``.
    """
    # Tag every row with its input position. Callers pair the returned rows
    # with other data by position, and join output order should not be relied
    # on implicitly, so the original order is restored explicitly at the end.
    row_idx_col = '__row_idx__'
    df = df.with_row_index(row_idx_col)

    # Age: fill by Sex + Pclass median
    if age_medians is None:
        age_medians = df.group_by(['Sex', 'Pclass']).agg(pl.col('Age').median().alias('Age_median'))

    df = df.join(age_medians, on=['Sex', 'Pclass'], how='left')
    df = df.with_columns(pl.col('Age').fill_null(pl.col('Age_median'))).drop('Age_median')

    # Fill other nulls (Fare uses the median of the frame being processed)
    df = df.with_columns([
        pl.col('Embarked').fill_null('S'),
        pl.col('Fare').fill_null(pl.col('Fare').median())
    ])

    # Extract surname for family grouping: text before the first comma
    df = df.with_columns(
        pl.col('Name').str.split(',').list.get(0).alias('Surname')
    )

    # Extract title from name: text between ", " and the following "."
    df = df.with_columns(
        pl.col('Name').str.split(', ').list.get(1).str.split('.').list.get(0).alias('Title')
    )

    # Is married (must be derived before Title is replaced by its numeric code)
    df = df.with_columns(pl.col('Title').is_in(['Mrs', 'Mme']).cast(pl.Int64).alias('Is_Married'))

    # Group titles into five integer codes; anything unlisted becomes 4
    df = df.with_columns(
        pl.when(pl.col('Title') == 'Mr').then(0)
          .when(pl.col('Title').is_in(['Miss', 'Ms', 'Mlle'])).then(1)
          .when(pl.col('Title').is_in(['Mrs', 'Mme'])).then(2)
          .when(pl.col('Title') == 'Master').then(3)
          .otherwise(4).alias('Title')
    )

    # Family size: siblings/spouses + parents/children + the passenger
    df = df.with_columns((pl.col('SibSp') + pl.col('Parch') + 1).alias('Family_Size'))

    # Ticket frequency
    if ticket_freq_map is None:
        ticket_freq_map = df.group_by('Ticket').agg(pl.len().alias('Ticket_Freq'))

    df = df.join(ticket_freq_map, on='Ticket', how='left')
    # Tickets missing from the map are treated as held by a single passenger
    df = df.with_columns(pl.col('Ticket_Freq').fill_null(1))

    # Family survival rate (leave-one-out for train, direct lookup for test)
    if is_train and 'Survived' in df.columns:
        in_family = pl.col('Family_Size') > 1
        has_stats = pl.col('Fam_Count').is_not_null()

        # Per-surname survival stats, built only from passengers travelling with family
        family_stats = (df.filter(in_family)
                        .group_by('Surname')
                        .agg([pl.col('Survived').sum().alias('Fam_Surv_Sum'),
                              pl.len().alias('Fam_Count')]))

        # The join is on Surname alone, so solo travellers sharing a surname
        # with a family also receive that family's stats.
        df = df.join(family_stats, on='Surname', how='left')

        df = df.with_columns(
            # Rows counted in the stats: leave-one-out, (sum - self) / (count - 1).
            pl.when(in_family & has_stats & (pl.col('Fam_Count') > 1))
              .then((pl.col('Fam_Surv_Sum') - pl.col('Survived')) / (pl.col('Fam_Count') - 1))
            # Solo rows were never part of the sum, so subtracting their own
            # label would yield rates outside [0, 1]. Use the plain family
            # mean instead, which is what the lookup branch below assigns.
              .when(~in_family & has_stats)
              .then(pl.col('Fam_Surv_Sum') / pl.col('Fam_Count'))
            # No usable family information: neutral prior
              .otherwise(0.5)
              .alias('Family_Surv_Rate')
        )
        df = df.drop(['Fam_Surv_Sum', 'Fam_Count'])

        # Build map for test set
        family_surv_map = (df.filter(pl.col('Family_Size') > 1)
                           .group_by('Surname')
                           .agg(pl.col('Survived').mean().alias('Family_Surv_Rate')))
    else:
        # Test set: use map from training
        if family_surv_map is not None:
            df = df.join(family_surv_map, on='Surname', how='left')
            df = df.with_columns(pl.col('Family_Surv_Rate').fill_null(0.5))
        else:
            df = df.with_columns(pl.lit(0.5).alias('Family_Surv_Rate'))

    # Encode categoricals
    df = df.with_columns([
        pl.when(pl.col('Sex') == 'male').then(1).otherwise(0).alias('Sex'),
        pl.col('Embarked').replace({'S': 0, 'C': 1, 'Q': 2}).cast(pl.Int64).alias('Embarked')
    ])

    # Restore the input row order before extracting features and target
    df = df.sort(row_idx_col)

    feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
                    'Embarked', 'Family_Size', 'Ticket_Freq', 'Title', 'Is_Married',
                    'Family_Surv_Rate']

    X = df.select(feature_cols)
    Y = df['Survived'] if 'Survived' in df.columns else None

    return X, Y, ticket_freq_map, age_medians, family_surv_map

In [46]:
# Build the training features; this call also produces the lookup tables.
X_train, Y_train, ticket_freq_map, age_medians, family_surv_map = process_data(
    train_data,
    is_train=True,
)
# Build the test features with the lookup tables derived from the training data.
X_test, _, _, _, _ = process_data(
    test_data,
    ticket_freq_map=ticket_freq_map,
    age_medians=age_medians,
    family_surv_map=family_surv_map,
    is_train=False,
)

print(f"Features: {X_train.columns}")

Features: ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Family_Size', 'Ticket_Freq', 'Title', 'Is_Married', 'Family_Surv_Rate']


In [None]:
# Train ensemble
# scikit-learn / xgboost are fed NumPy arrays converted from the polars objects.
X_np, Y_np = X_train.to_numpy(), Y_train.to_numpy()

# Logistic regression runs behind a StandardScaler; the tree models take raw features.
lr = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(max_iter=1000, random_state=42)),
])
rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
xgb_clf = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    random_state=42,
    eval_metric='logloss',
)

# Soft voting combines the three models through their predicted probabilities.
ensemble = VotingClassifier(
    estimators=[('lr', lr), ('rf', rf), ('xgb', xgb_clf)],
    voting='soft',
).fit(X_np, Y_np)

predictions = ensemble.predict(X_test.to_numpy())

In [48]:
# Reuse the already-loaded test frame for the IDs instead of reading test.csv a
# second time: the IDs then come from the same in-memory data the features
# were built from.
passenger_ids = test_data['PassengerId']

# IDs and predictions are paired purely by position, so a length mismatch must
# not be written out silently.
# NOTE(review): this also assumes process_data returns rows in the same order
# as test_data - confirm.
assert len(passenger_ids) == len(predictions), (
    f"{len(passenger_ids)} passenger ids but {len(predictions)} predictions"
)

submission = pl.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': predictions.astype(int)
})
submission.write_csv('submission.csv')