In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
import warnings
warnings.filterwarnings("ignore")


In [27]:
df = pd.read_csv(r'C:\Users\ASUS\Documents\Scripts\train.csv')
df = df.drop(columns=['PassengerId','Name','Ticket','Cabin'])
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


## Custom outlier transformer using IQR method

In [28]:

class OutlierHandler(BaseEstimator, TransformerMixin):
    def __init__(self, columns=[2]):  # Age column by default (assuming it's the 3rd column, index 2)
        self.columns = columns
        self.lower_bounds = {}
        self.upper_bounds = {}
    
    def fit(self, X, y=None):
        X_arr = np.array(X)
        for col in self.columns:
            q1 = np.nanpercentile(X_arr[:, col], 25)
            q3 = np.nanpercentile(X_arr[:, col], 75)
            iqr = q3 - q1
            
            # Define bounds (1.5 is standard for IQR method)
            self.lower_bounds[col] = q1 - 1.5 * iqr
            self.upper_bounds[col] = q3 + 1.5 * iqr
        
        return self
    
    def transform(self, X):
        X_transformed = np.array(X).copy()
        
        for col in self.columns:
            # Cap outliers at the boundaries
            mask_upper = X_transformed[:, col] > self.upper_bounds[col]
            mask_lower = X_transformed[:, col] < self.lower_bounds[col]
            
            X_transformed[mask_upper, col] = self.upper_bounds[col]
            X_transformed[mask_lower, col] = self.lower_bounds[col]
        
        return X_transformed
        

## Step 1: Split data into train+validation and test sets

In [29]:

X_temp, X_test, y_temp, y_test = train_test_split(
    df.drop(columns=['Survived']),
    df['Survived'],
    test_size=0.2,
    random_state=42
)


## Step 2: Split the temp data into training and validation sets

In [30]:

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, 
    y_temp,
    test_size=0.25,  # 0.25 x 0.8 = 0.2 of original data
    random_state=42
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")


Training set: 534 samples
Validation set: 178 samples
Test set: 179 samples


## Step 3: Define the pipeline components

In [31]:

def create_pipeline():
    # Outlier handling transformer (applied only to numerical columns, especially Age)
    trf_outlier = ColumnTransformer([
        ('outlier_handler', OutlierHandler(columns=[2]), slice(0, X_train.shape[1]))
    ], remainder='passthrough')
    
    # Imputation transformer
    trf_impute = ColumnTransformer([
        ('impute_age', SimpleImputer(), [2]),
        ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6])
    ], remainder='passthrough')
    
    # One-hot encoding transformer
    trf_encode = ColumnTransformer([
        ('ohe_sex_embarked', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [1, 6])
    ], remainder='passthrough')
    
    # Scaling transformer
    trf_scale = ColumnTransformer([
        ('scale', MinMaxScaler(), slice(0, 10))
    ])
    
    # Feature selection
    trf_select = SelectKBest(score_func=chi2, k=8)
    
    # Model
    trf_model = DecisionTreeClassifier()
    
    # Complete pipeline
    pipeline = Pipeline([
        ('trf_outlier', trf_outlier),
        ('trf_impute', trf_impute),
        ('trf_encode', trf_encode),
        ('trf_scale', trf_scale),
        ('trf_select', trf_select),
        ('trf_model', trf_model)
    ])
    
    return pipeline


## Step 4: Train initial model on training set

In [32]:

pipe = create_pipeline()
pipe.fit(X_train, y_train)


## Step 5: Evaluate on validation set

In [33]:

y_val_pred = pipe.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Initial model validation accuracy: {val_accuracy:.4f}")


Initial model validation accuracy: 0.6517


## Step 6: Hyperparameter tuning using GridSearchCV with validation set

In [34]:

params = {
    'trf_model__max_depth': [1, 2, 3, 4, 5, None]
}

# Note: GridSearchCV will use the validation set indirectly through CV folds
grid = GridSearchCV(create_pipeline(), params, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

print(f"Best parameters: {grid.best_params_}")
print(f"Best cross-validation accuracy: {grid.best_score_:.4f}")


Best parameters: {'trf_model__max_depth': 1}
Best cross-validation accuracy: 0.6349


In [35]:

# Verify best model on validation set
best_model_from_cv = grid.best_estimator_
y_val_pred_best = best_model_from_cv.predict(X_val)
best_val_accuracy = accuracy_score(y_val, y_val_pred_best)
print(f"Best model validation accuracy: {best_val_accuracy:.4f}")


Best model validation accuracy: 0.6517


## Step 7: Train final model on combined train+validation set with best parameters

In [36]:

# Create a pipeline with the best parameters
final_pipeline = create_pipeline()

# Set the best hyperparameters
final_pipeline.set_params(**grid.best_params_)

# Train on combined data
final_pipeline.fit(X_temp, y_temp)


## Step 8: Evaluate final model on test set (previously unseen data)

In [37]:

y_test_pred = final_pipeline.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Final model test accuracy: {test_accuracy:.4f}")


Final model test accuracy: 0.6257


## For inference on new data

In [38]:

def predict_survival(new_data, model=final_pipeline):
    """
    Make predictions on new passenger data
    
    Parameters:
    -----------
    new_data : pd.DataFrame
        New passenger data in the same format as the training data
    model : trained pipeline
        The trained ML pipeline
        
    Returns:
    --------
    np.array : Survival predictions (0 or 1)
    """
    return model.predict(new_data)
    

In [39]:
new_input_record = X_test.iloc[:1,:]
new_input_record

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
709,3,male,,1,1,15.2458,C


In [40]:
predict_survival(new_input_record)

array([1], dtype=int64)