In [1]:
import sys
sys.path.append("..")

In [2]:
# Import modules

from mlads_ds.data_loader import DataLoader
from mlads_ds.data_preprocessor import DataPreprocessor
from mlads_ds.feature_selector import FeatureSelector
from mlads_ds.model_trainer import ModelTrainer
from mlads_ds.model_evaluator import ModelEvaluator

In [3]:
# Using the classes to create a pipeline
file_path = '../titanic.csv'

data_loader = DataLoader(file_path)
data = data_loader.load_data()
preprocessor = DataPreprocessor(data)
processed_data = preprocessor.preprocess()
selector = FeatureSelector(processed_data)
features = selector.select_features()
labels = processed_data['Survived']


In [None]:
# Train and evaluate models
trainer = ModelTrainer(features, labels)
best_models, evaluation_results = trainer.train_and_evaluate_models()

# Outputting the results
best_models, evaluation_results

# Advance run

In [13]:
# init the experiment

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

models = {
            'RandomForest': RandomForestClassifier(),
            'SVM': SVC(),
            'LogisticRegression': LogisticRegression(),
            'NaiveBayes': GaussianNB(),
            'DecisionTree': DecisionTreeClassifier(),
            'KNeighbors': KNeighborsClassifier(),
            'AdaBoost': AdaBoostClassifier(),
            'GradientBoosting': GradientBoostingClassifier(),
            'RidgeClassifier': RidgeClassifier(),
            'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
            'LightGBM': LGBMClassifier()
        }
hyperparameters = {
            'RandomForest': {'n_estimators': [50, 100], 'max_depth': [10, 20]},
            'SVM': {'C': [1, 10], 'gamma': [0.1, 0.01]},
            'LogisticRegression': {'C': [1, 10]},
            'NaiveBayes': {},
            'DecisionTree': {'max_depth': [5, 10]},
            'KNeighbors': {'n_neighbors': [3, 5]},
            'AdaBoost': {'n_estimators': [50, 100]},
            'GradientBoosting': {'n_estimators': [50, 100], 'learning_rate': [0.1, 0.01]},
            'RidgeClassifier': {'alpha': [0.1, 1]},
            'XGBoost': {'n_estimators': [50, 100], 'learning_rate': [0.1, 0.01]},
            'LightGBM': {'n_estimators': [50, 100], 'learning_rate': [0.1, 0.01]}
        }

In [None]:
# Multi Train and evaluate models
trainer = ModelTrainer(features=features, labels=labels, models=models, hyperparameters=hyperparameters)
best_models, evaluation_results = trainer.train_and_evaluate_models()

In [10]:
evaluation_results

{'RandomForest': {'Best Parameters': {'max_depth': 10, 'n_estimators': 50},
  'Accuracy': 0.8324022346368715,
  'Classification Report': '              precision    recall  f1-score   support\n\n           0       0.83      0.91      0.87       110\n           1       0.83      0.71      0.77        69\n\n    accuracy                           0.83       179\n   macro avg       0.83      0.81      0.82       179\nweighted avg       0.83      0.83      0.83       179\n'},
 'SVM': {'Best Parameters': {'C': 10, 'gamma': 0.01},
  'Accuracy': 0.7597765363128491,
  'Classification Report': '              precision    recall  f1-score   support\n\n           0       0.79      0.84      0.81       110\n           1       0.71      0.64      0.67        69\n\n    accuracy                           0.76       179\n   macro avg       0.75      0.74      0.74       179\nweighted avg       0.76      0.76      0.76       179\n'},
 'LogisticRegression': {'Best Parameters': {'C': 10},
  'Accuracy': 0.

In [11]:
# Extracting the top 3 models based on their accuracy from the evaluation results

# Sorting the models based on accuracy
sorted_models = sorted(evaluation_results.items(), key=lambda x: x[1]['Accuracy'], reverse=True)

# Selecting the top 3 models
top_3_models = sorted_models[:3]

top_3_models_info = {model_name: {'Accuracy': details['Accuracy'], 'Best Parameters': details['Best Parameters']}
                     for model_name, details in top_3_models}

top_3_models_info

{'GradientBoosting': {'Accuracy': 0.8491620111731844,
  'Best Parameters': {'learning_rate': 0.1, 'n_estimators': 100}},
 'XGBoost': {'Accuracy': 0.8435754189944135,
  'Best Parameters': {'learning_rate': 0.01, 'n_estimators': 100}},
 'LightGBM': {'Accuracy': 0.8435754189944135,
  'Best Parameters': {'learning_rate': 0.1, 'n_estimators': 50}}}