In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
import warnings
warnings.filterwarnings("ignore")

import optuna
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score



In [16]:
df = pd.read_csv(r'C:\Users\vaibhav1.shinde\Documents\practise\Pipelines-main\Pipelines-main\train.csv')
df = df.drop(columns=['PassengerId','Name','Ticket','Cabin'])
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


## Custom outlier transformer using IQR method

In [17]:

class OutlierHandler(BaseEstimator, TransformerMixin):
    def __init__(self, columns=[2]):  # Age column by default (assuming it's the 3rd column, index 2)
        self.columns = columns
        self.lower_bounds = {}
        self.upper_bounds = {}
    
    def fit(self, X, y=None):
        X_arr = np.array(X)
        for col in self.columns:
            q1 = np.nanpercentile(X_arr[:, col], 25)
            q3 = np.nanpercentile(X_arr[:, col], 75)
            iqr = q3 - q1
            
            # Define bounds (1.5 is standard for IQR method)
            self.lower_bounds[col] = q1 - 1.5 * iqr
            self.upper_bounds[col] = q3 + 1.5 * iqr
        
        return self
    
    def transform(self, X):
        X_transformed = np.array(X).copy()
        
        for col in self.columns:
            # Cap outliers at the boundaries
            mask_upper = X_transformed[:, col] > self.upper_bounds[col]
            mask_lower = X_transformed[:, col] < self.lower_bounds[col]
            
            X_transformed[mask_upper, col] = self.upper_bounds[col]
            X_transformed[mask_lower, col] = self.lower_bounds[col]
        
        return X_transformed
        

## Step 1: Split data into train+validation and test sets

In [18]:

X_temp, X_test, y_temp, y_test = train_test_split(
    df.drop(columns=['Survived']),
    df['Survived'],
    test_size=0.2,
    random_state=42
)


## Step 2: Split the temp data into training and validation sets

In [19]:

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, 
    y_temp,
    test_size=0.25,  # 0.25 x 0.8 = 0.2 of original data
    random_state=42
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")


Training set: 534 samples
Validation set: 178 samples
Test set: 179 samples


## Step 3: Define the pipeline components

In [20]:
def create_pipeline(trial=None):
    # Outlier handling transformer (applied only to numerical columns, especially Age)
    trf_outlier = ColumnTransformer([
        ('outlier_handler', OutlierHandler(columns=[2]), slice(0, X_train.shape[1]))
    ], remainder='passthrough')
    
    # Imputation transformer
    trf_impute = ColumnTransformer([
        ('impute_age', SimpleImputer(), [2]),
        ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6])
    ], remainder='passthrough')
    
    # One-hot encoding transformer
    trf_encode = ColumnTransformer([
        ('ohe_sex_embarked', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [1, 6])
    ], remainder='passthrough')
    
    # Scaling transformer
    trf_scale = ColumnTransformer([
        ('scale', MinMaxScaler(), slice(0, 10))
    ])
    
    # Feature selection
    k = trial.suggest_int('k', 5, 10) if trial else 8
    trf_select = SelectKBest(score_func=chi2, k=k)
    
    # Model with hyperparameters
    if trial:
        max_depth = trial.suggest_categorical('max_depth', [1, 2, 3, 4, 5, None])
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
        criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
        trf_model = DecisionTreeClassifier(
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            criterion=criterion
        )
    else:
        trf_model = DecisionTreeClassifier()
    
    # Complete pipeline
    pipeline = Pipeline([
        ('trf_outlier', trf_outlier),
        ('trf_impute', trf_impute),
        ('trf_encode', trf_encode),
        ('trf_scale', trf_scale),
        ('trf_select', trf_select),
        ('trf_model', trf_model)
    ])
    
    return pipeline


## Step 4: 
* Train initial model on training set  
* Evaluate on validation set 
* Hyperparameter tuning using Optuna with validation set

In [21]:
def objective(trial):
    # Create pipeline with trial-suggested hyperparameters
    pipeline = create_pipeline(trial)
    
    # Train pipeline
    pipeline.fit(X_train, y_train)
    
    # Evaluate on validation set
    y_val_pred = pipeline.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    
    return val_accuracy

# Create Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # You can adjust the number of trials

# Print optimization results
print(f"Best trial: {study.best_trial.number}")
print(f"Best value (validation accuracy): {study.best_trial.value:.4f}")
print(f"Best hyperparameters: {study.best_trial.params}")


[I 2025-03-18 17:38:07,434] A new study created in memory with name: no-name-67078e0d-1655-4d68-905d-cee14d22dde7
[I 2025-03-18 17:38:07,471] Trial 0 finished with value: 0.651685393258427 and parameters: {'k': 6, 'max_depth': None, 'min_samples_split': 8, 'min_samples_leaf': 3, 'criterion': 'entropy'}. Best is trial 0 with value: 0.651685393258427.
[I 2025-03-18 17:38:07,487] Trial 1 finished with value: 0.651685393258427 and parameters: {'k': 10, 'max_depth': 3, 'min_samples_split': 9, 'min_samples_leaf': 5, 'criterion': 'gini'}. Best is trial 0 with value: 0.651685393258427.
[I 2025-03-18 17:38:07,515] Trial 2 finished with value: 0.651685393258427 and parameters: {'k': 7, 'max_depth': 1, 'min_samples_split': 3, 'min_samples_leaf': 2, 'criterion': 'entropy'}. Best is trial 0 with value: 0.651685393258427.
[I 2025-03-18 17:38:07,551] Trial 3 finished with value: 0.651685393258427 and parameters: {'k': 5, 'max_depth': 1, 'min_samples_split': 10, 'min_samples_leaf': 5, 'criterion': 'gi

[I 2025-03-18 17:38:08,704] Trial 35 finished with value: 0.651685393258427 and parameters: {'k': 5, 'max_depth': 1, 'min_samples_split': 8, 'min_samples_leaf': 4, 'criterion': 'gini'}. Best is trial 0 with value: 0.651685393258427.
[I 2025-03-18 17:38:08,783] Trial 36 finished with value: 0.651685393258427 and parameters: {'k': 6, 'max_depth': 3, 'min_samples_split': 9, 'min_samples_leaf': 5, 'criterion': 'gini'}. Best is trial 0 with value: 0.651685393258427.
[I 2025-03-18 17:38:08,819] Trial 37 finished with value: 0.651685393258427 and parameters: {'k': 5, 'max_depth': None, 'min_samples_split': 10, 'min_samples_leaf': 1, 'criterion': 'gini'}. Best is trial 0 with value: 0.651685393258427.
[I 2025-03-18 17:38:08,862] Trial 38 finished with value: 0.651685393258427 and parameters: {'k': 10, 'max_depth': 1, 'min_samples_split': 6, 'min_samples_leaf': 5, 'criterion': 'gini'}. Best is trial 0 with value: 0.651685393258427.
[I 2025-03-18 17:38:08,910] Trial 39 finished with value: 0.651

Best trial: 0
Best value (validation accuracy): 0.6517
Best hyperparameters: {'k': 6, 'max_depth': None, 'min_samples_split': 8, 'min_samples_leaf': 3, 'criterion': 'entropy'}


## Step 5: Train final model on combined train+validation set with best parameters

In [12]:
# Create final pipeline with best hyperparameters
final_pipeline = create_pipeline()
# Set the best hyperparameters
for param_name, param_value in study.best_trial.params.items():
    if param_name == 'k':
        final_pipeline.named_steps['trf_select'].k = param_value
    elif param_name == 'max_depth':
        final_pipeline.named_steps['trf_model'].max_depth = param_value
    elif param_name == 'min_samples_split':
        final_pipeline.named_steps['trf_model'].min_samples_split = param_value
    elif param_name == 'min_samples_leaf':
        final_pipeline.named_steps['trf_model'].min_samples_leaf = param_value
    elif param_name == 'criterion':
        final_pipeline.named_steps['trf_model'].criterion = param_value

# Train on combined data
final_pipeline.fit(X_temp, y_temp)

## Step 6: Evaluate final model on test set (previously unseen data)

In [14]:

y_test_pred = final_pipeline.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Final model test accuracy: {test_accuracy:.4f}")


Final model test accuracy: 0.6257


## For inference on new data

In [22]:

def predict_survival(new_data, model=final_pipeline):
    """
    Make predictions on new passenger data
    
    Parameters:
    -----------
    new_data : pd.DataFrame
        New passenger data in the same format as the training data
    model : trained pipeline
        The trained ML pipeline
        
    Returns:
    --------
    np.array : Survival predictions (0 or 1)
    """
    return model.predict(new_data)
    

In [23]:
new_input_record = X_test.iloc[:1,:]
new_input_record

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
709,3,male,,1,1,15.2458,C


In [24]:
predict_survival(new_input_record)

array([1], dtype=int64)