In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from feature_engine.selection import DropFeatures
from feature_engine.imputation import CategoricalImputer, MeanMedianImputer

from category_encoders.ordinal import OrdinalEncoder

import preprocessors as pp

import joblib

pd.set_option('display.max_columns', None)

SEED = 42

In [2]:
df = pd.read_csv('../data/data.csv')
X_df, y_df = df.drop('Survived', axis=1), df['Survived']

selected_features = pd.read_csv('../data/selected_features.csv')
selected_features = selected_features['0'].tolist()

print(X_df.shape, y_df.shape)
print(selected_features)
X_df.head()

(891, 11) (891,)
['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_S']


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Split data into train and test

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=SEED)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((712, 11), (712,), (179, 11), (179,))

# Config

In [4]:
DROP_VAR = ['PassengerId', 'Name', 'Cabin', 'Ticket']

CAT_NA_WITH_MODE = ['Embarked']

NUM_NA_WITH_MEDIAN = ['Age']

BINS_FARE = [0, 7.9104, 14.4542, 31.0, 512.3292]
LABELS_FARE = ['Low', 'Medium', 'High', 'Very High']

MAPPING_VAR = [
    {'col': 'Sex', 'mapping': {'male': 1, 'female': 0}},
    {'col': 'Fare', 'mapping': {'Low': 0, 'Medium': 1, 'High': 2, 'Very High': 3}}
]

ONE_HOT_VAR = ['Embarked']
ONE_HOT_DROP = ['Embarked_C']

# Pipeline Feature Engineering

In [5]:
preprocessing_pipeline = Pipeline([
    
    # ====== DROP ======
    # Drop unnecessary features
    ('drop_unnecessary', DropFeatures(features_to_drop=DROP_VAR)),

    # ====== IMPUTER ======
    # Impute categprocal data
    ('frequent_imputer', CategoricalImputer(imputation_method='frequent', variables=CAT_NA_WITH_MODE)),

    # Impute numerical data
    ('median_imputer', MeanMedianImputer(imputation_method='median', variables=NUM_NA_WITH_MEDIAN)),

    # ====== CUT ======
    # Cut skewness data
    ('cut_skewness', pp.FareDiscretizer(bins=BINS_FARE, labels=LABELS_FARE)),

    # ======  MAPPING ======
    # Apply mapping
    ('Map_categorical', OrdinalEncoder(mapping=MAPPING_VAR)),

    # ====== DUMMIES ======
    # Apply dummies
    ('Encode_categorical', pp.CustomOneHotEncoder(drop_cols=ONE_HOT_DROP, columns=ONE_HOT_VAR)),

    # ====== SCALING ======
    # Feature Scaling
    ('Scale_features', pp.CustomScaler(columns=['Age'])),

    # ====== SELECTION ======
    ('Select_features', pp.FeatureSelector(columns=selected_features))
])

model_pipeline = Pipeline([
    # ====== MODEL ======
    ('train_model', LogisticRegression(random_state=SEED))
])

main_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('modeling', model_pipeline)
])

In [6]:
main_pipeline.fit(X_train, y_train)

# Evaluate the model

In [7]:
train_prediction = main_pipeline.predict(X_train)
test_prediction = main_pipeline.predict(X_test)

print('Training Accuracy is :', accuracy_score(y_train, train_prediction))
print('Testing Accuracy is :', accuracy_score(y_test, test_prediction))

# Training Accuracy is : 0.7949438202247191
# Testing Accuracy is : 0.7988826815642458

Training Accuracy is : 0.7949438202247191
Testing Accuracy is : 0.7988826815642458


# Test this notebook match the other

In [8]:
X_train = preprocessing_pipeline.transform(X_train)
X_test = preprocessing_pipeline.transform(X_test)

X_train_to_test = pd.read_csv('../data/X_train_selected_feature.csv')
y_train_to_test = pd.read_csv('../data/y_train.csv')

X_test_to_test = pd.read_csv('../data/X_test_selected_feature.csv')
y_test_to_test = pd.read_csv('../data/y_test.csv')

train_prediction_to_test = pd.read_csv('../data/train_prediction.csv')
test_prediction_to_test = pd.read_csv('../data/test_prediction.csv')

In [9]:
def compare_dataframes(df1, df2, tolerance=1e-5):
    # Check for the same shape
    assert df1.shape == df2.shape, "DataFrames do not have the same shape."
    
    # Check for columns in both DataFrames
    assert df1.columns.equals(df2.columns), "DataFrames do not have the same columns."
    
    # For each column, check if it is close within the given tolerance
    for column in df1.columns:
        if df1[column].dtype.kind in 'fc' and df2[column].dtype.kind in 'fc':
            # If columns are float, compare using numpy.isclose
            if not np.isclose(df1[column], df2[column], atol=tolerance).all():
                diff = np.isclose(df1[column], df2[column], atol=tolerance) == False
                error_message = (f"Column '{column}' values differ. \n"
                                 f"First differing elements:\n{df1[column][diff]}\n{df2[column][diff]}")
                raise AssertionError(error_message)
        else:
            # If columns are not float, use equals
            if not df1[column].equals(df2[column]):
                diff_index = df1[column] != df2[column]
                error_message = (f"Column '{column}' values differ. \n"
                                 f"First differing elements:\n{df1[column][diff_index]}\n{df2[column][diff_index]}")
                raise AssertionError(error_message)

    print("DataFrames are equal within the given tolerance.")

In [10]:
compare_dataframes(X_train.reset_index(drop=True), X_train_to_test, tolerance=1e-5)
compare_dataframes(X_test.reset_index(drop=True), X_test_to_test, tolerance=1e-5)

compare_dataframes(y_train.reset_index(drop=True).to_frame(name='Survived'),
                   y_train_to_test, tolerance=1e-5)

compare_dataframes(y_test.reset_index(drop=True).to_frame(name='Survived'),
                   y_test_to_test, tolerance=1e-5)

compare_dataframes(pd.Series(train_prediction).to_frame(name='0'),
                   train_prediction_to_test, tolerance=1e-5)

compare_dataframes(pd.Series(test_prediction).to_frame(name='0'),
                   test_prediction_to_test, tolerance=1e-5)

DataFrames are equal within the given tolerance.
DataFrames are equal within the given tolerance.
DataFrames are equal within the given tolerance.
DataFrames are equal within the given tolerance.
DataFrames are equal within the given tolerance.
DataFrames are equal within the given tolerance.


# Save the Pipeline

In [11]:
joblib.dump(main_pipeline, '../model/main_pipeline.joblib')

['../model/main_pipeline.joblib']

# We can score on new unseen data

In [12]:
sample = df.sample(100, random_state=SEED)

X_sample, y_sample = sample.drop('Survived', axis=1), sample['Survived']

In [13]:
main_pipeline.fit(X_sample, y_sample)

In [14]:
main_pipeline.predict(X_sample)

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0])