In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Read the Titanic training CSV and build the modelling frame in a single
# chain: drop unused columns, impute, encode, and derive IsAlone.
df = (
    pd.read_csv('titanicData/train.csv')
    # Columns that are not used as features
    .drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
    .assign(
        # Missing ages are replaced by the column median
        Age=lambda d: d['Age'].fillna(d['Age'].median()),
        # Sex encoded as male -> 0, female -> 1
        Sex=lambda d: d['Sex'].map({'male': 0, 'female': 1}).astype(int),
        # Missing ports are replaced by the most frequent port, then encoded
        Embarked=lambda d: (
            d['Embarked']
            .fillna(d['Embarked'].mode()[0])
            .map({'S': 0, 'C': 1, 'Q': 2})
            .astype(int)
        ),
        # 1 when the family size (SibSp + Parch + the passenger) equals 1, else 0
        IsAlone=lambda d: (d['SibSp'] + d['Parch'] + 1).eq(1).astype('int64'),
    )
    # SibSp and Parch were only needed to derive IsAlone
    .drop(columns=['SibSp', 'Parch'])
)

# Separate the target column from the predictors
y = df['Survived']
X = df.drop(columns='Survived')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Define the models to use.
# The tree-based estimators are seeded (same seed as the train/test split) so
# that re-running the notebook reproduces the same metrics.
models = [
    # The default limit of 100 iterations can stop the solver before it
    # converges on unscaled features, so allow more iterations.
    LogisticRegression(max_iter=1000),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    SVC(),
    GaussianNB()
]

# Fit each candidate on the training split and report its metrics on the test split
for model in models:
    # fit() returns the fitted estimator, so training and prediction can be chained
    y_pred = model.fit(X_train, y_train).predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    cr = classification_report(y_test, y_pred)

    # One print call; sep="\n" yields the same line layout as four separate prints
    print(
        f"Model: {type(model).__name__}",
        f"Accuracy: {acc:.4f}",
        f"Confusion matrix:\n{cm}",
        f"Classification report:\n{cr}\n",
        sep="\n",
    )


Model: LogisticRegression
Accuracy: 0.7989
Confusion matrix:
[[89 16]
 [20 54]]
Classification report:
              precision    recall  f1-score   support

           0       0.82      0.85      0.83       105
           1       0.77      0.73      0.75        74

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179


Model: DecisionTreeClassifier
Accuracy: 0.7877
Confusion matrix:
[[85 20]
 [18 56]]
Classification report:
              precision    recall  f1-score   support

           0       0.83      0.81      0.82       105
           1       0.74      0.76      0.75        74

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179


Model: RandomForestClassifier
Accuracy: 0.8156
Confusion matrix:
[[88 17]
 [16 58]]
Classification report:
              precision    

In [None]:
#Discussion/Notes for Lab 6

# I was initially having issues with ChatGPT accessing the Kaggle link. This was the message I received when I ran the recommended code:
#
# { I'm sorry, but as an AI language model, I cannot run Python code or access external websites like Kaggle. However, I can provide you with an overview of commonly used supervised ML models that can be used to predict survival on the Titanic dataset.
#
# Logistic Regression
# Decision Trees
# Random Forest
# Naive Bayes
# Support Vector Machines (SVM)
# K-Nearest Neighbors (KNN)
# Gradient Boosting
# Neural Networks }
#
#
# Additional Notes
#
# Read CSV file from Kaggle using Python:
#
# Download the Kaggle API key by following the instructions in the "API" section of your Kaggle account page.
# Install the Kaggle API client using the command !pip install kaggle.
# Upload the Kaggle API key to your working directory.
# Use the Kaggle API client to download the CSV file to your working directory.
# Use the pandas library to read the CSV file into a DataFrame.
#
# # Import the required libraries
# import pandas as pd
# from kaggle.api.kaggle_api_extended import KaggleApi
#
# # Instantiate the Kaggle API client
# api = KaggleApi()
#
# # Authenticate the Kaggle API client using the API key
# api.authenticate()
#
# # Download the Titanic dataset from Kaggle
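# api.dataset_download_files('competitions/titanic', path='./', unzip=True)
# NOTE: Titanic is hosted on Kaggle as a competition rather than a dataset, so the call above may not find it.
# api.competition_download_files('titanic', path='./'), followed by unzipping the downloaded archive, is likely what is needed here - verify against the Kaggle API documentation.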
#
# # Read the train.csv file into a pandas DataFrame
# df = pd.read_csv('./train.csv')
#
# This code downloads the Titanic dataset from Kaggle, extracts the train.csv file, and reads it into a pandas DataFrame called df. Note that you need to provide your own Kaggle API key and specify the correct path to your working directory.
