In [1]:
# Importing necessary libraries
import pandas as pd  # to read the dataset 
import seaborn as sns  # to use titanic dataset
from sklearn.model_selection import train_test_split  # to use the ML model with train, test data
from sklearn.preprocessing import StandardScaler    #encoding
from feature_engine.imputation import MeanMedianImputer
from feature_engine.encoding import OneHotEncoder
from feature_engine.discretisation import EqualFrequencyDiscretiser
from feature_engine.outliers import Winsorizer

In [2]:
# Load the Titanic dataset
data = sns.load_dataset('titanic')
print("Dataset loaded:")
print(data.head())

Dataset loaded:
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


In [3]:
# Get an overview of the DataFrame, including non-null values
print("\nOverview of the DataFrame:")
print(data.info())


Overview of the DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
None


In [4]:
#Number of rows and columns in titanic dataset
print(f"Rows:",data.shape[0])
print(f"Cols:",data.shape[1])

Rows: 891
Cols: 15


In [5]:
# Check for missing values in each column
print("\nMissing values in each column:")
print(data.isnull().sum())


Missing values in each column:
survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64


In [6]:
# Drop columns that won't be used for this demonstration
data = data.drop(columns=['embarked', 'deck', 'embark_town', 'alive', 'alone', 'adult_male', 'who', 'class'],axis=1)
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['survived']), data['survived'], test_size=0.2, random_state=42)

In [8]:
# Use MeanMedianImputer to fill missing numerical values
num_imputer = MeanMedianImputer(imputation_method='median', variables=['age'])
X_train = num_imputer.fit_transform(X_train)
X_test = num_imputer.transform(X_test)
print("\nMeanMedianImputer applied:")
print(X_train.isnull().sum())
print(X_test.isnull().sum())


MeanMedianImputer applied:
pclass    0
sex       0
age       0
sibsp     0
parch     0
fare      0
dtype: int64
pclass    0
sex       0
age       0
sibsp     0
parch     0
fare      0
dtype: int64


In [9]:
# Apply one-hot encoding to categorical variables
encoder = OneHotEncoder(variables=['sex'], drop_last=True)
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)
print("\nOne-hot encoding applied:")
print(X_train.head(20))


One-hot encoding applied:
     pclass   age  sibsp  parch      fare  sex_male
331       1  45.5      0      0   28.5000         1
733       2  23.0      0      0   13.0000         1
382       3  32.0      0      0    7.9250         1
704       3  26.0      1      0    7.8542         1
813       3   6.0      4      2   31.2750         0
118       1  24.0      0      1  247.5208         1
536       1  45.0      0      0   26.5500         1
361       2  29.0      1      0   27.7208         1
29        3  28.0      0      0    7.8958         1
55        1  28.0      0      0   35.5000         1
865       2  42.0      0      0   13.0000         0
595       3  36.0      1      1   24.1500         1
239       2  33.0      0      0   12.2750         1
721       3  17.0      1      0    7.0542         1
81        3  29.0      0      0    9.5000         1
259       2  50.0      0      1   26.0000         0
486       1  35.0      1      0   90.0000         0
716       1  38.0      0      0  227.

In [10]:
# Apply Winsorizer to handle outliers in 'fare'
winsorizer = Winsorizer(capping_method='iqr', tail='both', fold=1.5, variables=['fare'])
X_train = winsorizer.fit_transform(X_train)
X_test = winsorizer.transform(X_test)
print("\nOutliers handled:")
print(X_train.head(20))


Outliers handled:
     pclass   age  sibsp  parch     fare  sex_male
331       1  45.5      0      0  28.5000         1
733       2  23.0      0      0  13.0000         1
382       3  32.0      0      0   7.9250         1
704       3  26.0      1      0   7.8542         1
813       3   6.0      4      2  31.2750         0
118       1  24.0      0      1  64.3625         1
536       1  45.0      0      0  26.5500         1
361       2  29.0      1      0  27.7208         1
29        3  28.0      0      0   7.8958         1
55        1  28.0      0      0  35.5000         1
865       2  42.0      0      0  13.0000         0
595       3  36.0      1      1  24.1500         1
239       2  33.0      0      0  12.2750         1
721       3  17.0      1      0   7.0542         1
81        3  29.0      0      0   9.5000         1
259       2  50.0      0      1  26.0000         0
486       1  35.0      1      0  64.3625         0
716       1  38.0      0      0  64.3625         0
800       2 

In [11]:
# Apply StandardScaler
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
print("\nFeatures scaled:")
print(X_train.head())


Features scaled:
     pclass       age     sibsp     parch      fare  sex_male
0 -1.614136  1.253641 -0.470722 -0.479342  0.236751  0.724310
1 -0.400551 -0.477284 -0.470722 -0.479342 -0.532339  0.724310
2  0.813034  0.215086 -0.470722 -0.479342 -0.784153  0.724310
3  0.813034 -0.246494  0.379923 -0.479342 -0.787666  0.724310
4  0.813034 -1.785093  2.931860  2.048742  0.374443 -1.380624


In [12]:
from sklearn.linear_model import LogisticRegression

# Assuming X_train and X_test are already scaled features
# Assuming y_train and y_test are the target labels

# Initialize and fit the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

In [13]:
from sklearn.metrics import confusion_matrix, classification_report
# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
[[90 15]
 [21 53]]

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.86      0.83       105
           1       0.78      0.72      0.75        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179

