In [2]:
# Libary imports can go here. As always, you can import more as needed!
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# Loads the file
file = "corrupted_titanic_data.csv"
df = pd.read_csv(file)

In [4]:
# <------------ TODO ----------------------->
# Drop all the rows with null values
# (TIP: make sure to reset the index after dropping)
df1 = df.dropna().reset_index(drop=True)
print("\n=== After Dropping Nulls ===")
print(f"Shape: {df1.shape}")
print(df1.head())


=== After Dropping Nulls ===
Shape: (117, 17)
   survived  pclass     sex    age  sibsp  parch        fare embarked   class  \
0         1       1  female  420.0      1      0   53.284868        S   First   
1         0       1    male   54.0      0      0   51.314334        S   First   
2         1       2    male   34.0      0      0   13.553364        s  Second   
3         0       1    male   19.0      3      2  263.176986        s   First   
4         1       1  FEMALE   23.0      3      2  262.789833        C   First   

     who  adult_male deck  embark_town alive  alone  pclass_redundant  \
0  woman       False    C  Southampton   yes  False                 1   
1    man        True    E  Southampton    no   True                 1   
2    man        True    D  Southampton   yes   True                 2   
3    man        True    C  Southampton    no  False                 1   
4  woman       False    C  Southampton   yes  False                 1   

   fare_age_combination  
0

In [5]:
# <------------ TODO ----------------------->
# Drop all the feature columns that you may think are not neccessary for the models evaluation
# (HINT: around 6-8 columns!)
cols_to_drop = ['class', 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone', 'pclass_redundant']
df2 = df1.drop(columns=cols_to_drop)
print("\n=== After Dropping Unnecessary Columns ===")
print(f"Shape: {df2.shape}")
print("Remaining Columns:", df2.columns.tolist())


=== After Dropping Unnecessary Columns ===
Shape: (117, 9)
Remaining Columns: ['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'fare_age_combination']


In [6]:
# <------------ TODO ----------------------->
# Clean any relevant columns here
# (TIP: Pay attention to the values in the column)
df3 = df2.copy()
df3['embarked'] = df3['embarked'].str.strip()
print("\n=== After Cleaning Columns ===")
print("Unique values in 'embarked':", df3['embarked'].unique())


=== After Cleaning Columns ===
Unique values in 'embarked': ['S' 's' 'C' 'Q' 'q' 'c']


In [7]:
# <------------ TODO ----------------------->
# Encode all the text values to numeric ones
# (TIP: Double check your final dataframe at this point before moving on)
le_sex = LabelEncoder()
df3['sex'] = le_sex.fit_transform(df3['sex'])
le_emb = LabelEncoder()
df3['embarked'] = le_emb.fit_transform(df3['embarked'])
print("\n=== After Encoding Text to Numeric ===")
print(df3.head())


=== After Encoding Text to Numeric ===
   survived  pclass  sex    age  sibsp  parch        fare  embarked  \
0         1       1    2  420.0      1      0   53.284868         2   
1         0       1    3   54.0      0      0   51.314334         2   
2         1       2    3   34.0      0      0   13.553364         5   
3         0       1    3   19.0      3      2  263.176986         5   
4         1       1    0   23.0      3      2  262.789833         0   

   fare_age_combination  
0          22379.644511  
1           2770.974017  
2            460.814361  
3           5000.362740  
4           6044.166161  


In [8]:
# <------------ TODO ----------------------->
# Create and train a classification model of your choosing
# Print the accuracy, classification report, and the confusion matrix
X = df3.drop('survived', axis=1)
y = df3['survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("\n=== Model Evaluation ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


=== Model Evaluation ===
Accuracy: 0.8333333333333334

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.20      0.33         5
           1       0.83      1.00      0.90        19

    accuracy                           0.83        24
   macro avg       0.91      0.60      0.62        24
weighted avg       0.86      0.83      0.79        24


Confusion Matrix:
 [[ 1  4]
 [ 0 19]]
