# Run the Titanic Pipeline

In [1]:
import sys
sys.path.append('..')

In [3]:
# import all the classes and functions we need
from mlads_ds.data_loader import DataLoader
from mlads_ds.data_preprocessor import DataPreprocessor
from mlads_ds.feature_engineer import FeatureEngineer
from mlads_ds.model_trainer import ModelTrainer
from mlads_ds.model_evaluator import ModelEvaluator

In [4]:
def main(data_path='../titanic.csv', target_column='Survived'):
    """Run the Titanic pipeline end to end and return the tuning results.

    The stages run in a fixed order: load the CSV, preprocess it, engineer
    features, separate the target column from the features, then train and
    tune the models.

    Args:
        data_path: Location of the input CSV handed to ``DataLoader``.
            Defaults to ``'../titanic.csv'``.
        target_column: Name of the label column that is removed from the
            feature set and used as the training target. Defaults to
            ``'Survived'``.

    Returns:
        Whatever ``ModelTrainer.train_and_tune()`` returns. In this
        notebook's recorded output that is a dict keyed by model name whose
        values hold ``'best_params'``, ``'best_score'``, ``'test_score'``,
        ``'confusion_matrix'`` and ``'classification_report'`` -- confirm
        against ``ModelTrainer`` before relying on that shape.
    """
    # Load data
    raw_data = DataLoader(data_path).load_data()

    # Preprocess data
    preprocessed_data = DataPreprocessor(raw_data).preprocess()

    # Feature engineering
    engineered_data = FeatureEngineer(preprocessed_data).engineer_features()

    # Split the engineered table into model inputs and the label to predict.
    features = engineered_data.drop(target_column, axis=1)
    target = engineered_data[target_column]

    # Train and tune; the return value is the trainer's results object,
    # not a single fitted model.
    results = ModelTrainer(features, target).train_and_tune()
    return results

In [None]:
# Run the whole pipeline. Despite its name, `accuracy` holds everything
# main() returns (the per-model results displayed below), not a single score.
accuracy = main()

In [6]:
# Display the full per-model results returned by main().
accuracy

{'logistic_regression': {'best_params': {'model__C': 0.1},
  'best_score': 0.783669851275485,
  'test_score': 0.776536312849162,
  'confusion_matrix': array([[87, 18],
         [22, 52]], dtype=int64),
  'classification_report': '              precision    recall  f1-score   support\n\n           0       0.80      0.83      0.81       105\n           1       0.74      0.70      0.72        74\n\n    accuracy                           0.78       179\n   macro avg       0.77      0.77      0.77       179\nweighted avg       0.78      0.78      0.78       179\n'},
 'random_forest': {'best_params': {'model__n_estimators': 100},
  'best_score': 0.7753373387176203,
  'test_score': 0.7374301675977654,
  'confusion_matrix': array([[82, 23],
         [24, 50]], dtype=int64),
  'classification_report': '              precision    recall  f1-score   support\n\n           0       0.77      0.78      0.78       105\n           1       0.68      0.68      0.68        74\n\n    accuracy              

In [11]:
# Print each model's 'best_score', one per line, in dict order.
for model_name, metrics in accuracy.items():
    print(metrics['best_score'])

# Select the model with the highest 'best_score' (the separate 'test_score'
# entry is not consulted here). max() replaces a running comparison seeded
# with 0, so a winner is still chosen when every score is zero or negative;
# ties go to the first model in dict order, as before.
if accuracy:
    best_model = max(accuracy, key=lambda name: accuracy[name]['best_score'])
    max_accuracy = accuracy[best_model]['best_score']
else:
    # Nothing to compare: keep the original fallback values.
    best_model, max_accuracy = None, 0

0.783669851275485
0.7753373387176203
0.7977248104008667
0.787914901999409
0.776696542893726
0.8160248202501723
0.7878755047769133
0.8033684625233921
0.7991726583275879
0.7836501526642371
0.783669851275485
0.796385304836009


In [13]:
# Display the selected model's name together with its 'best_score'.
best_model, max_accuracy

('gradient_boosting', 0.8160248202501723)