In [1]:
import pandas as pd
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Define your models
models = {
    'Logistic Regression': LogisticRegression(max_iter=10000),
    'Support Vector Machine': SVC(max_iter=1000),
    'Random Forest': RandomForestClassifier()
}

In [2]:
df_train = pd.read_csv("data/train.csv", delimiter=",")
df_test = pd.read_csv("data/test.csv", delimiter=",")

# Define your data (X and y)

df_train['Embarked'] = df_train['Embarked'].fillna(df_train['Embarked'].mode()[0],)
df_train['Age'] = df_train['Age'].fillna(df_train['Age'].median())
df_test['Embarked'] = df_test['Embarked'].fillna(df_train['Embarked'].mode()[0],)
df_test['Age'] = df_test['Age'].fillna(df_train['Age'].median())
df_test['Fare'] = df_test['Fare'].fillna(df_train['Fare'].median())

X = pd.get_dummies(df_train[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']], columns=['Pclass','Sex', 'Embarked'], drop_first=False).astype(int)

y = df_train['Survived']

x_score = df_test[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]
x_score_encoded = pd.get_dummies(x_score, columns=['Pclass','Sex', 'Embarked'], drop_first=True).astype(int)
x_score_encoded = x_score_encoded.reindex(columns=X.columns, fill_value=0)

In [3]:
# Use cross_validate to get multiple metrics (e.g., accuracy, precision, recall)
results = {}
for name, model in models.items():
    cv_results = cross_validate(model, X, y, cv=5, scoring=['accuracy', 'precision', 'recall'])  # 5-fold cross-validation
    results[name] = cv_results

# Print the results
for name, result in results.items():
    print(f"Model: {name}")
    for metric in ['test_accuracy', 'test_precision', 'test_recall']:
        print(f"{metric}: {result[metric].mean()}")


# Alternatively, use cross_val_score for a single metric (e.g., accuracy)
for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5)  # 5-fold cross-validation
    print(f"Model: {name}, Accuracy: {scores.mean()}")

Model: Logistic Regression
test_accuracy: 0.793496955621116
test_precision: 0.7498759369490852
test_recall: 0.6957800511508953
Model: Support Vector Machine
test_accuracy: 0.6757328479065972
test_precision: 0.6804159331745538
test_recall: 0.2926257459505541
Model: Random Forest
test_accuracy: 0.802510827945515
test_precision: 0.7546092312763305
test_recall: 0.7220375106564365
Model: Logistic Regression, Accuracy: 0.793496955621116
Model: Support Vector Machine, Accuracy: 0.6757328479065972
Model: Random Forest, Accuracy: 0.8036281463812692


In [7]:
results = {}
feature_importances = {}  

for name, model in models.items():
    cv_results = cross_validate(model, X, y, cv=5, scoring=['accuracy']) 
    results[name] = cv_results

    # Fit the model on the whole dataset to get stable feature importances (outside cross-validation loop)
    model.fit(X, y)  # Careful, potentially overfitting, good for demo only, ideally incorporate in the CV loop
    y_pred = model.predict(x_score_encoded)
    pd.Series(y_pred, index=df_test['PassengerId'], name='Survived').reset_index().to_csv(f"pred_result/submission_{name}.csv", index=False)

    if hasattr(model, 'feature_importances_'): # check for feature importance attribute before accessing it!
        feature_importances[name] = model.feature_importances_

In [8]:
for name, result in results.items():
    print(f"Model: {name}")
    print(f"Accuracy: {result['test_accuracy'].mean()}")

    if name in feature_importances:
        importances = feature_importances[name]
        # Assuming X is a pandas DataFrame for easy feature name extraction
        if isinstance(X, pd.DataFrame): # use feature names if available
           feature_names = X.columns
        else:
            feature_names = [f'Feature {i}' for i in range(X.shape[1])]


        for i in range(len(feature_names)):
           print(f"  {feature_names[i]}: {importances[i]}")

Model: Logistic Regression
Accuracy: 0.793496955621116
Model: Support Vector Machine
Accuracy: 0.6757328479065972
Model: Random Forest
Accuracy: 0.8047454648170234
  Age: 0.2817556044538948
  SibSp: 0.04654906440299198
  Parch: 0.04012398584601047
  Fare: 0.19723755024180697
  Pclass_1: 0.027081772509029406
  Pclass_2: 0.01895786686779366
  Pclass_3: 0.05571206113256901
  Sex_female: 0.12501303585783685
  Sex_male: 0.17343811455655891
  Embarked_C: 0.011764138579695306
  Embarked_Q: 0.00798406723552456
  Embarked_S: 0.014382738316287952
