In [2]:
import pandas as pd

# Load the Titanic passenger data from a local CSV file (adjust the path as needed).
# NOTE: seaborn's sns.load_dataset("titanic") is not a drop-in alternative here:
# that copy uses lowercase column names ('sibsp', 'parch'), whereas the code
# below expects 'SibSp' and 'Parch'.
df = pd.read_csv('titanic.csv')

# Preview the two relative-count columns before the new feature exists
print("Before adding FamilySize:")
print(df.loc[:, ['SibSp', 'Parch']].head())

# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger themselves)
df = df.assign(FamilySize=df['SibSp'] + df['Parch'] + 1)

# Preview the same rows again, now including the derived column
print("\nAfter adding FamilySize:")
print(df.loc[:, ['SibSp', 'Parch', 'FamilySize']].head())


Before adding FamilySize:
   SibSp  Parch
0      1      0
1      1      0
2      0      0
3      1      0
4      0      0

After adding FamilySize:
   SibSp  Parch  FamilySize
0      1      0           2
1      1      0           2
2      0      0           1
3      1      0           2
4      0      0           1


In [3]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Read the Titanic CSV (adjust the path as needed) and immediately discard the
# columns this example treats as too sparse or not useful for modelling
df = pd.read_csv('titanic.csv').drop(
    columns=['Cabin', 'Ticket', 'Name', 'PassengerId']
)

# Target vector is the 'Survived' column; every remaining column is a feature
y = df['Survived']
X = df.drop(columns=['Survived'])

# Step 1: Create a FunctionTransformer for FamilySize
def add_family_size(X):
    """Return a copy of ``X`` with a ``FamilySize`` column added.

    ``FamilySize`` is computed row-wise as ``SibSp + Parch + 1``. The input
    frame is not modified; an existing ``FamilySize`` column is overwritten
    in the returned copy.
    """
    family_size = X['SibSp'] + X['Parch'] + 1
    return X.assign(FamilySize=family_size)

# Expose add_family_size as a scikit-learn transformer so it can be a pipeline step
family_size_transformer = FunctionTransformer(func=add_family_size, validate=False)

# Step 2: Column groups and the preprocessing applied to each
# ('FamilySize' is produced by the pipeline's first step, not read from the CSV)
numeric_features = ['Age', 'Fare', 'SibSp', 'Parch', 'FamilySize']
categorical_features = ['Sex', 'Embarked', 'Pclass']

# Numeric columns: fill gaps with the median, then standardise
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode (handle_unknown='ignore' so unseen categories do not raise at transform)
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Step 3: Route each column group to its transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Step 4: Full pipeline -- feature engineering first, preprocessing second
pipeline = Pipeline([
    ('family_size', family_size_transformer),
    ('preprocessor', preprocessor),
])

# Step 5: Hold out 20% of the rows, using a fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training split only, then apply the fitted pipeline to the test split
X_train_transformed = pipeline.fit_transform(X_train)
X_test_transformed = pipeline.transform(X_test)

print("Shape of transformed training data:", X_train_transformed.shape)


Shape of transformed training data: (712, 13)


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer

# Read the Titanic CSV (titanic.csv must be in the working directory) and drop
# the columns regarded as irrelevant or too sparse, without mutating in place
df = pd.read_csv("titanic.csv").drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin']
)

# 'Survived' is the prediction target; all other remaining columns are features
y = df['Survived']
X = df.drop(columns=['Survived'])

# Step 1: Create a FunctionTransformer to add 'FamilySize'
def add_family_size(X_df):
    """Return a new frame equal to ``X_df`` plus a ``FamilySize`` column.

    The new column holds ``SibSp + Parch + 1`` for each row. ``X_df`` itself
    is left unchanged.
    """
    return X_df.assign(
        FamilySize=lambda frame: frame['SibSp'] + frame['Parch'] + 1
    )

# Make add_family_size usable as a pipeline step
family_size_transformer = FunctionTransformer(func=add_family_size, validate=False)

# Step 2: Preprocessing per column type
# Categorical columns: most-frequent imputation, then one-hot encoding
categorical_features = ['Sex', 'Embarked', 'Pclass']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric columns: median imputation, then standard scaling
# ('FamilySize' is created by the first pipeline step below)
numeric_features = ['Age', 'Fare', 'SibSp', 'Parch', 'FamilySize']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Step 3: Numeric block first, categorical block second in the output
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Step 4: Feature engineering followed by preprocessing
pipeline = Pipeline([
    ('add_family_size', family_size_transformer),
    ('preprocessing', preprocessor),
])

# Step 5: 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 6: Fit on the training rows, then reuse the fitted pipeline on the test rows
X_train_transformed = pipeline.fit_transform(X_train)
X_test_transformed = pipeline.transform(X_test)

# Report the resulting shape and show the first transformed row
print("Transformed training data shape:", X_train_transformed.shape)
print("First row of transformed data:\n", X_train_transformed[0])


Transformed training data shape: (712, 13)
First row of transformed data:
 [ 1.25364106 -0.07868358 -0.47072241 -0.47934164 -0.55466613  0.
  1.          0.          0.          1.          1.          0.
  0.        ]


In [5]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier

# Pipeline does not copy the estimators it is given. Passing
# family_size_transformer and preprocessor directly would make this pipeline
# share -- and refit -- the very same objects already held by `pipeline`.
# clone() builds unfitted copies with identical parameters, so the two
# pipelines stay independent of each other.
full_pipeline = Pipeline(steps=[
    ('add_family_size', clone(family_size_transformer)),
    ('preprocessing', clone(preprocessor)),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Fit on the training split and report mean accuracy on the held-out split
full_pipeline.fit(X_train, y_train)
print("Accuracy on test set:", full_pipeline.score(X_test, y_test))

Accuracy on test set: 0.8268156424581006
