In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [3]:
# Load datasets
train_dataset = pd.read_csv('train.csv')
test_dataset = pd.read_csv('test.csv')

In [4]:
print(train_dataset.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [5]:
# Drop non-numeric columns
train_dataset = train_dataset.drop(columns=['Name', 'Ticket', 'Cabin'])


In [6]:
# Convert categorical columns to numeric using one-hot encoding
train_dataset = pd.get_dummies(train_dataset, columns=['Sex', 'Embarked'], drop_first=True)


In [7]:
# Handle missing values by filling them with the mean for numerical columns and mode for categorical columns
train_dataset['Age'].fillna(train_dataset['Age'].mean(), inplace=True)
train_dataset['Fare'].fillna(train_dataset['Fare'].mean(), inplace=True)


In [8]:
# Split data into features (X) and target (y)
X = train_dataset.drop(columns=['Survived'])  # All columns except 'Survived'
y = train_dataset['Survived']  # The target column

In [9]:
# Train-test split for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
# Train the model
logreg = LogisticRegression(max_iter=200)
logreg.fit(X_train, y_train)

# Predict on validation set
y_pred_logreg = logreg.predict(X_val)

# Evaluate the model
accuracy_logreg = accuracy_score(y_val, y_pred_logreg)
precision_logreg = precision_score(y_val, y_pred_logreg)
recall_logreg = recall_score(y_val, y_pred_logreg)
f1_logreg = f1_score(y_val, y_pred_logreg)
roc_auc_logreg = roc_auc_score(y_val, logreg.predict_proba(X_val)[:, 1])

print("Logistic Regression Performance:")
print(f"Accuracy: {accuracy_logreg:.4f}")
print(f"Precision: {precision_logreg:.4f}")
print(f"Recall: {recall_logreg:.4f}")
print(f"F1-Score: {f1_logreg:.4f}")
print(f"ROC-AUC: {roc_auc_logreg:.4f}\n")


Logistic Regression Performance:
Accuracy: 0.8045
Precision: 0.7746
Recall: 0.7432
F1-Score: 0.7586
ROC-AUC: 0.8761



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
from sklearn.tree import DecisionTreeClassifier

# Initialize Decision Tree model
dt = DecisionTreeClassifier(random_state=42)

# Train the model
dt.fit(X_train, y_train)

# Predict on validation set
y_pred_dt = dt.predict(X_val)

# Evaluate the model
accuracy_dt = accuracy_score(y_val, y_pred_dt)
precision_dt = precision_score(y_val, y_pred_dt)
recall_dt = recall_score(y_val, y_pred_dt)
f1_dt = f1_score(y_val, y_pred_dt)
roc_auc_dt = roc_auc_score(y_val, dt.predict_proba(X_val)[:,1])

print("Decision Tree Performance:")
print(f"Accuracy: {accuracy_dt:.4f}")
print(f"Precision: {precision_dt:.4f}")
print(f"Recall: {recall_dt:.4f}")
print(f"F1-Score: {f1_dt:.4f}")
print(f"ROC-AUC: {roc_auc_dt:.4f}\n")

Decision Tree Performance:
Accuracy: 0.7542
Precision: 0.7083
Recall: 0.6892
F1-Score: 0.6986
ROC-AUC: 0.7446



In [12]:
from sklearn.ensemble import RandomForestClassifier

# Initialize Random Forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Predict on validation set
y_pred_rf = rf.predict(X_val)

# Evaluate the model
accuracy_rf = accuracy_score(y_val, y_pred_rf)
precision_rf = precision_score(y_val, y_pred_rf)
recall_rf = recall_score(y_val, y_pred_rf)
f1_rf = f1_score(y_val, y_pred_rf)
roc_auc_rf = roc_auc_score(y_val, rf.predict_proba(X_val)[:,1])

print("Random Forest Performance:")
print(f"Accuracy: {accuracy_rf:.4f}")
print(f"Precision: {precision_rf:.4f}")
print(f"Recall: {recall_rf:.4f}")
print(f"F1-Score: {f1_rf:.4f}")
print(f"ROC-AUC: {roc_auc_rf:.4f}\n")


Random Forest Performance:
Accuracy: 0.8324
Precision: 0.8333
Recall: 0.7432
F1-Score: 0.7857
ROC-AUC: 0.8894



In [13]:
from sklearn.svm import SVC

# Initialize SVM model
svm = SVC(probability=True, random_state=42)

# Train the model
svm.fit(X_train, y_train)

# Predict on validation set
y_pred_svm = svm.predict(X_val)

# Evaluate the model
accuracy_svm = accuracy_score(y_val, y_pred_svm)
precision_svm = precision_score(y_val, y_pred_svm)
recall_svm = recall_score(y_val, y_pred_svm)
f1_svm = f1_score(y_val, y_pred_svm)
roc_auc_svm = roc_auc_score(y_val, svm.predict_proba(X_val)[:,1])

print("Support Vector Machine Performance:")
print(f"Accuracy: {accuracy_svm:.4f}")
print(f"Precision: {precision_svm:.4f}")
print(f"Recall: {recall_svm:.4f}")
print(f"F1-Score: {f1_svm:.4f}")
print(f"ROC-AUC: {roc_auc_svm:.4f}\n")


Support Vector Machine Performance:
Accuracy: 0.5978
Precision: 0.6667
Recall: 0.0541
F1-Score: 0.1000
ROC-AUC: 0.7416



In [14]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize GBM model
gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

# Train the model
gbm.fit(X_train, y_train)

# Predict on validation set
y_pred_gbm = gbm.predict(X_val)

# Evaluate the model
accuracy_gbm = accuracy_score(y_val, y_pred_gbm)
precision_gbm = precision_score(y_val, y_pred_gbm)
recall_gbm = recall_score(y_val, y_pred_gbm)
f1_gbm = f1_score(y_val, y_pred_gbm)
roc_auc_gbm = roc_auc_score(y_val, gbm.predict_proba(X_val)[:,1])

print("Gradient Boosting Machine Performance:")
print(f"Accuracy: {accuracy_gbm:.4f}")
print(f"Precision: {precision_gbm:.4f}")
print(f"Recall: {recall_gbm:.4f}")
print(f"F1-Score: {f1_gbm:.4f}")
print(f"ROC-AUC: {roc_auc_gbm:.4f}\n")


Gradient Boosting Machine Performance:
Accuracy: 0.8156
Precision: 0.8060
Recall: 0.7297
F1-Score: 0.7660
ROC-AUC: 0.8712



In [15]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize GridSearchCV
grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

# Fit GridSearchCV
grid_rf.fit(X_train, y_train)

# Best parameters
print("Best Parameters for Random Forest:")
print(grid_rf.best_params_)

# Best estimator
best_rf = grid_rf.best_estimator_

# Predict on validation set
y_pred_best_rf = best_rf.predict(X_val)

# Evaluate the tuned model
accuracy_best_rf = accuracy_score(y_val, y_pred_best_rf)
precision_best_rf = precision_score(y_val, y_pred_best_rf)
recall_best_rf = recall_score(y_val, y_pred_best_rf)
f1_best_rf = f1_score(y_val, y_pred_best_rf)
roc_auc_best_rf = roc_auc_score(y_val, best_rf.predict_proba(X_val)[:,1])

print("\nTuned Random Forest Performance:")
print(f"Accuracy: {accuracy_best_rf:.4f}")
print(f"Precision: {precision_best_rf:.4f}")
print(f"Recall: {recall_best_rf:.4f}")
print(f"F1-Score: {f1_best_rf:.4f}")
print(f"ROC-AUC: {roc_auc_best_rf:.4f}\n")


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters for Random Forest:
{'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}

Tuned Random Forest Performance:
Accuracy: 0.8156
Precision: 0.8254
Recall: 0.7027
F1-Score: 0.7591
ROC-AUC: 0.8915



In [16]:
# Define parameter grid
param_grid_gbm = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5]
}

# Initialize GridSearchCV
grid_gbm = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=param_grid_gbm,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2
)

# Fit GridSearchCV
grid_gbm.fit(X_train, y_train)

# Best parameters
print("Best Parameters for GBM:")
print(grid_gbm.best_params_)

# Best estimator
best_gbm = grid_gbm.best_estimator_

# Predict on validation set
y_pred_best_gbm = best_gbm.predict(X_val)

# Evaluate the tuned model
accuracy_best_gbm = accuracy_score(y_val, y_pred_best_gbm)
precision_best_gbm = precision_score(y_val, y_pred_best_gbm)
recall_best_gbm = recall_score(y_val, y_pred_best_gbm)
f1_best_gbm = f1_score(y_val, y_pred_best_gbm)
roc_auc_best_gbm = roc_auc_score(y_val, best_gbm.predict_proba(X_val)[:,1])

print("\nTuned GBM Performance:")
print(f"Accuracy: {accuracy_best_gbm:.4f}")
print(f"Precision: {precision_best_gbm:.4f}")
print(f"Recall: {recall_best_gbm:.4f}")
print(f"F1-Score: {f1_best_gbm:.4f}")
print(f"ROC-AUC: {roc_auc_best_gbm:.4f}\n")


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Parameters for GBM:
{'learning_rate': 0.1, 'max_depth': 3, 'min_samples_split': 2, 'n_estimators': 100}

Tuned GBM Performance:
Accuracy: 0.8156
Precision: 0.8060
Recall: 0.7297
F1-Score: 0.7660
ROC-AUC: 0.8712



In [17]:
# Compile model performances
models = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'SVM', 'GBM', 'Tuned Random Forest', 'Tuned GBM']
accuracies = [accuracy_logreg, accuracy_dt, accuracy_rf, accuracy_svm, accuracy_gbm, accuracy_best_rf, accuracy_best_gbm]
precisions = [precision_logreg, precision_dt, precision_rf, precision_svm, precision_gbm, precision_best_rf, precision_best_gbm]
recalls = [recall_logreg, recall_dt, recall_rf, recall_svm, recall_gbm, recall_best_rf, recall_best_gbm]
f1_scores = [f1_logreg, f1_dt, f1_rf, f1_svm, f1_gbm, f1_best_rf, f1_best_gbm]
roc_aucs = [roc_auc_logreg, roc_auc_dt, roc_auc_rf, roc_auc_svm, roc_auc_gbm, roc_auc_best_rf, roc_auc_best_gbm]

performance_df = pd.DataFrame({
    'Model': models,
    'Accuracy': accuracies,
    'Precision': precisions,
    'Recall': recalls,
    'F1-Score': f1_scores,
    'ROC-AUC': roc_aucs
})

print("\nModel Performance Comparison:")
print(performance_df)



Model Performance Comparison:
                 Model  Accuracy  Precision    Recall  F1-Score   ROC-AUC
0  Logistic Regression  0.804469   0.774648  0.743243  0.758621  0.876062
1        Decision Tree  0.754190   0.708333  0.689189  0.698630  0.744595
2        Random Forest  0.832402   0.833333  0.743243  0.785714  0.889447
3                  SVM  0.597765   0.666667  0.054054  0.100000  0.741570
4                  GBM  0.815642   0.805970  0.729730  0.765957  0.871236
5  Tuned Random Forest  0.815642   0.825397  0.702703  0.759124  0.891506
6            Tuned GBM  0.815642   0.805970  0.729730  0.765957  0.871236


In [18]:
# Identify the model with the highest ROC-AUC
best_model_index = performance_df['ROC-AUC'].idxmax()
best_model = performance_df.loc[best_model_index, 'Model']

print(f"\nBest Performing Model: {best_model}")



Best Performing Model: Tuned Random Forest


In [19]:
import os

# Create the 'models' directory if it doesn't exist
os.makedirs('models', exist_ok=True)


In [20]:
import joblib

# Save the best model
joblib.dump(best_rf, 'models/best_model.pkl')

print("Best model saved as 'models/best_model.pkl'")


Best model saved as 'models/best_model.pkl'


In [21]:
print(os.listdir('models'))


['best_model.pkl']


In [23]:
from google.colab import drive
drive.mount('/content/drive')

# Define the path in Google Drive
model_drive_path = '/content/drive/MyDrive/models/best_model.pkl'

# Save the model to Google Drive
joblib.dump(best_rf, model_drive_path)

print(f"Best model saved to Google Drive at '{model_drive_path}'")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Best model saved to Google Drive at '/content/drive/MyDrive/models/best_model.pkl'
