# Data science pipeline for the Titanic dataset

ChatGPT:

The data science pipeline for the Titanic dataset, structured into different classes, yielded the following results:

Data Loading: The dataset was successfully loaded from the provided file path.

Data Preprocessing: Missing values were handled (median for 'Age', mode for 'Embarked', and dropping the 'Cabin' column). Categorical variables ('Sex' and 'Embarked') were encoded.

Feature Selection: Relevant features were selected, including 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', and encoded 'Sex' and 'Embarked' columns.

Model Training: A Random Forest Classifier was trained with the selected features. The dataset was split into training and testing sets, with 80% of the data used for training.

Model Evaluation: The trained model was evaluated on the test set. The model achieved an accuracy of approximately 82.12%. The classification report provides further details:

Precision for non-survivors (0) was 85%, and for survivors (1) was 78%.
Recall for non-survivors was 86%, and for survivors was 75%.
The F1-score, which balances precision and recall, was 86% for non-survivors and 76% for survivors.
The pipeline demonstrates a systematic approach to handling a data science task, with each step encapsulated in a separate class, enhancing modularity and clarity. The model's performance indicates a good level of predictive ability, especially in distinguishing between survivors and non-survivors in the Titanic dataset.

In [2]:
import pandas as pd

# Creating a generic data science pipeline for the Titanic dataset
# Each step in the pipeline will be a separate class

class DataLoader:
    """Read a CSV source into a pandas DataFrame.

    The source given at construction time is handed unchanged to
    ``pd.read_csv`` each time ``load_data`` is called.
    """

    def __init__(self, file_path):
        # Only remembered here; nothing is read until load_data() runs.
        self.file_path = file_path

    def load_data(self):
        """Return the parsed contents of ``self.file_path`` as a DataFrame."""
        frame = pd.read_csv(self.file_path)
        return frame


class DataPreprocessor:
    """Fill missing values, drop 'Cabin', and one-hot encode 'Sex'/'Embarked'.

    The DataFrame passed to the constructor is not modified; ``preprocess``
    builds a new frame, stores it on ``self.data`` and returns it.
    """

    def __init__(self, data):
        self.data = data

    def preprocess(self):
        """Return the cleaned and encoded DataFrame.

        Steps:
          * missing 'Age' values are replaced by the column median;
          * missing 'Embarked' values are replaced by the first mode;
          * the 'Cabin' column is dropped;
          * 'Sex' and 'Embarked' are replaced by dummy columns, with the
            first level of each dropped (``drop_first=True``).

        Raises:
            KeyError: if any of 'Age', 'Embarked', 'Cabin' or 'Sex' is absent,
                or if 'Embarked' has no non-missing value to take a mode from.
        """
        source = self.data

        # Compute the fill values up front so they come from the raw columns.
        age_fill = source['Age'].median()
        embarked_fill = source['Embarked'].mode()[0]

        # Assign filled columns back instead of calling
        # `frame[col].fillna(..., inplace=True)`: the latter operates on a
        # temporary selection, is deprecated in pandas 2.x, and leaves the
        # frame untouched when copy-on-write is enabled.
        cleaned = source.assign(
            Age=source['Age'].fillna(age_fill),
            Embarked=source['Embarked'].fillna(embarked_fill),
        ).drop(columns='Cabin')  # Cabin is dropped due to high missing values

        # Encoding categorical variables
        self.data = pd.get_dummies(cleaned, columns=['Sex', 'Embarked'], drop_first=True)

        return self.data


class FeatureSelector:
    """Pick the model's input columns out of a preprocessed DataFrame."""

    def __init__(self, data):
        self.data = data

    def select_features(self):
        """Return a DataFrame holding only the feature columns, in fixed order.

        A KeyError is raised by pandas if any listed column is missing.
        """
        wanted = [
            'Pclass',
            'Age',
            'SibSp',
            'Parch',
            'Fare',
            'Sex_male',
            'Embarked_Q',
            'Embarked_S',
        ]
        return self.data[wanted]


class ModelTrainer:
    """Split the data and fit a Random Forest classifier on the training part.

    Args:
        features: feature matrix passed to ``train_test_split``.
        labels: target values aligned with ``features``.
        test_size: fraction (or count) of samples held out for testing.
        random_state: seed used for both the split and the forest.
        n_estimators: number of trees in the forest.

    The three optional arguments default to the values that were previously
    hard-coded, so ``ModelTrainer(features, labels)`` behaves as before.
    """

    def __init__(self, features, labels, test_size=0.2, random_state=0, n_estimators=100):
        self.features = features
        self.labels = labels
        self.test_size = test_size
        self.random_state = random_state
        self.n_estimators = n_estimators

    def train_model(self):
        """Fit the classifier and return ``(classifier, X_test, y_test)``.

        The held-out ``X_test``/``y_test`` are returned so the caller can
        evaluate the model on data it was not fitted on.
        """
        # Imported lazily so the class can be defined without scikit-learn.
        from sklearn.model_selection import train_test_split
        from sklearn.ensemble import RandomForestClassifier

        # Splitting the dataset into the Training set and Test set
        X_train, X_test, y_train, y_test = train_test_split(
            self.features,
            self.labels,
            test_size=self.test_size,
            random_state=self.random_state,
        )

        # Fitting Random Forest Classification to the Training set
        classifier = RandomForestClassifier(
            n_estimators=self.n_estimators,
            random_state=self.random_state,
        )
        classifier.fit(X_train, y_train)

        return classifier, X_test, y_test


class ModelEvaluator:
    """Score a fitted model against a held-out test set."""

    def __init__(self, model, X_test, y_test):
        self.model, self.X_test, self.y_test = model, X_test, y_test

    def evaluate(self):
        """Return ``(accuracy, report)`` for the model's test-set predictions.

        ``accuracy`` comes from ``accuracy_score`` and ``report`` from
        ``classification_report``, both comparing ``self.y_test`` with the
        output of ``self.model.predict(self.X_test)``.
        """
        from sklearn.metrics import accuracy_score, classification_report

        # Predict once and reuse the result for both metrics.
        predictions = self.model.predict(self.X_test)
        truth = self.y_test

        score = accuracy_score(truth, predictions)
        summary = classification_report(truth, predictions)
        return score, summary


# Using the classes to create a pipeline.
# Each stage consumes the previous stage's output:
# load -> preprocess -> select features/labels -> train -> evaluate.
# Relative path: resolved against the current working directory.
file_path = '../titanic.csv'

# Load data
data_loader = DataLoader(file_path)
data = data_loader.load_data()

# Preprocess data (fills missing values, drops 'Cabin', encodes categoricals)
preprocessor = DataPreprocessor(data)
processed_data = preprocessor.preprocess()

# Select features and labels
selector = FeatureSelector(processed_data)
features = selector.select_features()
# The target column is taken from the same preprocessed frame as the features.
labels = processed_data['Survived']

# Train model; the held-out test split is returned alongside the classifier
trainer = ModelTrainer(features, labels)
model, X_test, y_test = trainer.train_model()

# Evaluate model on the held-out split
evaluator = ModelEvaluator(model, X_test, y_test)
accuracy, report = evaluator.evaluate()

# Bare expression: in a notebook cell this displays the (accuracy, report)
# tuple as the cell output; run as a plain script it prints nothing.
accuracy, report



(0.8212290502793296,
 '              precision    recall  f1-score   support\n\n           0       0.85      0.86      0.86       110\n           1       0.78      0.75      0.76        69\n\n    accuracy                           0.82       179\n   macro avg       0.81      0.81      0.81       179\nweighted avg       0.82      0.82      0.82       179\n')