Importing **Dependencies**

# **Import Libraries**

In [None]:
# Import necessary libraries for data handling, preprocessing, modeling, and evaluation
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# **Load and Explore Data**

In [None]:
# Load the dataset and perform initial exploration
def load_data(file_path):
    """Read a CSV file into a DataFrame and print a quick overview of it.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The loaded dataset.
    """
    # Honor the caller's path so the function is usable outside one fixed location.
    data = pd.read_csv(file_path)
    print("First 5 rows of the dataset:\n", data.head())
    # DataFrame.info() writes its report itself and returns None, so it is
    # called on its own line instead of being embedded in a print().
    print("\nDataset information:")
    data.info()
    print("\nMissing values:\n", data.isnull().sum())
    return data

# Usage: load the CSV into `data`, the frame the preprocessing cell consumes
data = load_data("/content/tested.csv")  # Replace with your file path

First 5 rows of the dataset:
    PassengerId  Survived  Pclass  \
0          892         0       3   
1          893         1       3   
2          894         0       2   
3          895         0       3   
4          896         1       3   

                                           Name     Sex   Age  SibSp  Parch  \
0                              Kelly, Mr. James    male  34.5      0      0   
1              Wilkes, Mrs. James (Ellen Needs)  female  47.0      1      0   
2                     Myles, Mr. Thomas Francis    male  62.0      0      0   
3                              Wirz, Mr. Albert    male  27.0      0      0   
4  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0      1      1   

    Ticket     Fare Cabin Embarked  
0   330911   7.8292   NaN        Q  
1   363272   7.0000   NaN        S  
2   240276   9.6875   NaN        Q  
3   315154   8.6625   NaN        S  
4  3101298  12.2875   NaN        S  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 en

# **Preprocessing**

In [None]:
# Handle missing values, encode categorical variables, and normalize numerical data
def preprocess_data(df):
    """Impute, encode and scale the raw passenger table for modelling.

    The work is done on a copy, so the frame passed in is left untouched and
    the function can be called repeatedly on the same input.

    Steps, in order:
      1. Fill missing ``Age`` with the median and missing ``Embarked`` with the mode.
      2. Turn ``Cabin`` into a 0/1 "cabin known" flag.
      3. Drop ``PassengerId``, ``Name`` and ``Ticket``.
      4. Encode ``Sex`` (male -> 0, female -> 1) and one-hot encode ``Embarked``
         (first level dropped).
      5. Standardize ``Age`` and ``Fare``.
      6. Replace infinities with NaN, then fill remaining NaNs with the column maximum.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing the columns named above.

    Returns
    -------
    pandas.DataFrame
        A new, fully numeric/boolean frame ready for feature engineering.
    """
    df = df.copy()

    # Handle missing values. Results are assigned back to the column instead of
    # using `df[col].fillna(..., inplace=True)`, which acts on a temporary copy
    # under pandas copy-on-write and triggers a FutureWarning.
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
    df['Cabin'] = df['Cabin'].notna().astype(int)  # Cabin known: 1, unknown: 0

    # Drop irrelevant columns
    df = df.drop(columns=['PassengerId', 'Name', 'Ticket'])

    # Encode categorical variables
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})  # Male: 0, Female: 1
    df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)

    # Normalize numerical features
    # NOTE(review): if `Fare` contains missing values they are not imputed
    # before scaling and end up filled with the column maximum below —
    # confirm this is intended (median imputation may be preferable).
    scaler = StandardScaler()
    df[['Age', 'Fare']] = scaler.fit_transform(df[['Age', 'Fare']])

    # Fix infinite values, then fill whatever is still missing with the column max
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(df.max())

    return df

# Apply preprocessing to a copy of the raw frame so `data` stays intact and
# this cell can be re-run without tripping over already-dropped columns
processed_data = preprocess_data(data.copy())
print("Preprocessed data:\n", processed_data.head())

Preprocessed data:
    Survived  Pclass  Sex       Age  SibSp  Parch      Fare  Cabin  Embarked_Q  \
0         0       3    0  0.386231      0      0 -0.497811      0        True   
1         1       3    1  1.371370      1      0 -0.512660      0       False   
2         0       2    0  2.553537      0      0 -0.464532      0        True   
3         0       3    0 -0.204852      0      0 -0.482888      0       False   
4         1       3    1 -0.598908      1      1 -0.417971      0       False   

   Embarked_S  
0       False  
1        True  
2       False  
3        True  
4        True  


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)  # Fill missing age with median
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)  # Fill missing embarked with mode


# **Feature Engineering**

In [None]:
# Create new features to improve model performance
def engineer_features(df):
    """Return a new frame with family-related features appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``SibSp`` and ``Parch`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with two extra columns:
        ``FamilySize`` (``SibSp + Parch + 1``, i.e. including the passenger)
        and ``IsAlone`` (1 when ``FamilySize`` is 1, else 0).
        The input frame itself is not modified.
    """
    family_size = df['SibSp'] + df['Parch'] + 1  # Family size = siblings + parents + 1
    # `assign` builds a new frame, so the caller's frame does not silently gain columns.
    return df.assign(
        FamilySize=family_size,
        IsAlone=(family_size == 1).astype(int),  # Alone: 1, Not alone: 0
    )

# Apply feature engineering; `final_data` is the frame handed to the train/test split
final_data = engineer_features(processed_data)
print("Data with engineered features:\n", final_data.head())

Data with engineered features:
    Survived  Pclass  Sex       Age  SibSp  Parch      Fare  Cabin  Embarked_Q  \
0         0       3    0  0.386231      0      0 -0.497811      0        True   
1         1       3    1  1.371370      1      0 -0.512660      0       False   
2         0       2    0  2.553537      0      0 -0.464532      0        True   
3         0       3    0 -0.204852      0      0 -0.482888      0       False   
4         1       3    1 -0.598908      1      1 -0.417971      0       False   

   Embarked_S  FamilySize  IsAlone  
0       False           1        1  
1        True           2        0  
2       False           1        1  
3        True           1        1  
4        True           3        0  


# **Train-Test Split**

In [None]:
# Split the data into training and testing sets
def split_data(df, target='Survived', test_size=0.2, random_state=42):
    """Separate features from the target and split both into train and test parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, including the target column.
    target : str
        Name of the column to predict.
    test_size : float
        Passed through to ``train_test_split``.
    random_state : int
        Passed through to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    labels = df[target]                   # Target (Survived)
    features = df.drop(target, axis=1)    # Everything else is a feature
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
    )
    print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")
    return X_train, X_test, y_train, y_test

# Apply split using split_data's defaults (target 'Survived', test_size 0.2, random_state 42)
X_train, X_test, y_train, y_test = split_data(final_data)

Train size: 334, Test size: 84


# **Model Training and Evaluation**

In [None]:
# Train multiple models and evaluate their performance
def train_and_evaluate(X_train, X_test, y_train, y_test):
    """Fit three baseline classifiers and print their test-set metrics.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels used for scoring.

    Returns
    -------
    dict
        Mapping of model name to the fitted estimator. The metrics are only
        printed, not returned.
    """
    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        # `use_label_encoder` is intentionally not passed: XGBoost reports it
        # as an unused parameter and emits a warning when it is supplied.
        'XGBoost': XGBClassifier(eval_metric='logloss')
    }

    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)  # Train the model
        y_pred = model.predict(X_test)  # Predict on test data

        # zero_division=0 keeps a score at 0 without a warning when a model
        # predicts (or the test set contains) no positive samples.
        results[name] = {
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred, zero_division=0),
            'Recall': recall_score(y_test, y_pred, zero_division=0),
            'F1-Score': f1_score(y_test, y_pred, zero_division=0)
        }

    # Print results
    for name, metrics in results.items():
        print(f"\n{name}:")
        for metric, value in metrics.items():
            print(f"{metric}: {value:.4f}")

    return models

# Train models
# NOTE(review): every model scores a perfect 1.0 on the test set. In the rows
# displayed above, `Survived` coincides with the encoded `Sex` column, which
# suggests the target is fully determined by one feature — verify the dataset
# before trusting these metrics.
trained_models = train_and_evaluate(X_train, X_test, y_train, y_test)


Logistic Regression:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000

Random Forest:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000

XGBoost:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1-Score: 1.0000


Parameters: { "use_label_encoder" } are not used.



# **Hyperparameter Tuning**

In [None]:
# Tune the Random Forest model: grid-search for the best hyperparameters
def tune_model(X_train, y_train, model=None, param_grid=None, cv=5, scoring='accuracy'):
    """Grid-search hyperparameters with cross-validation and return the best estimator.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used for the cross-validated search.
    model : estimator, optional
        Estimator to tune. Defaults to ``RandomForestClassifier(random_state=42)``,
        created per call rather than once at definition time.
    param_grid : dict, optional
        Grid passed to ``GridSearchCV``. Defaults to a Random Forest grid over
        ``n_estimators``, ``max_depth`` and ``min_samples_split``. Supply a
        matching grid when passing a different kind of estimator.
    cv : int
        Number of cross-validation folds.
    scoring : str
        Scoring metric name passed to ``GridSearchCV``.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted grid search.
    """
    if model is None:
        model = RandomForestClassifier(random_state=42)
    if param_grid is None:
        param_grid = {
            'n_estimators': [100, 200],  # Number of trees
            'max_depth': [10, 20, None],  # Tree depth
            'min_samples_split': [2, 5]   # Minimum samples for split
        }

    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring)
    grid_search.fit(X_train, y_train)

    print("Best parameters:", grid_search.best_params_)
    print("Best CV score:", grid_search.best_score_)

    return grid_search.best_estimator_

# Tune Random Forest on the training split, then score the selected estimator
# once on the held-out test split
best_model = tune_model(X_train, y_train)
y_pred = best_model.predict(X_test)
print("Final test accuracy:", accuracy_score(y_test, y_pred))

Best parameters: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
Best CV score: 1.0
Final test accuracy: 1.0
