# Data Visualization for Titanic open dataset

This notebook works with the open-source Titanic passenger dataset. It is an attempt to clean the data and extract information from it using Python, pandas, and scikit-learn.

In [None]:
%pip install pandas 



In [3]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Load the raw Titanic CSV. The path is relative, so it is resolved against the
# kernel's current working directory; read_csv raises FileNotFoundError when
# the file is not there.
print("Working on Titanic dataset")
data = pd.read_csv("titanic.csv")

# Column dtypes / non-null counts first, then the missing-value count per column.
data.info()
print(data.isna().sum())

Working on Titanic dataset


FileNotFoundError: [Errno 2] No such file or directory: 'titanic.csv'

In [None]:
def fill_missing_ages(df: pd.DataFrame) -> pd.DataFrame:
    """
    Fill missing ``Age`` values with the median age of the row's ``Pclass``.

    The frame is modified in place and is also returned.

    Args:
        df: Frame with a ``Pclass`` column and an ``Age`` column.

    Returns:
        The same ``df`` object, with null ages replaced by the median age of
        their passenger class. An age stays null when every age in its class
        is missing, because that class median is itself NaN.
    """
    # Median age per passenger class, keyed in order of first appearance.
    age_fill_map = {
        pclass: df.loc[df["Pclass"] == pclass, "Age"].median()
        for pclass in df["Pclass"].unique()
    }

    # Vectorized, index-aligned fill: only null ages are replaced and existing
    # ages are kept. A row-wise apply is much slower and breaks on an empty frame.
    df["Age"] = df["Age"].fillna(df["Pclass"].map(age_fill_map))
    print(f"Age fill map: {age_fill_map}")

    return df

In [None]:
def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the numeric feature table from the raw passenger frame.

    The input frame is not modified: all work happens on a new frame, so the
    calling cell can be re-run without failing on already-dropped columns.

    Steps:
        * drop the columns that are not used as features
          (``PassengerId``, ``Name``, ``Ticket``, ``Cabin``, ``Embarked``);
        * fill missing ages via ``fill_missing_ages``;
        * encode ``Sex`` as 1 (male) / 0 (female);
        * add ``FamilySize``, ``IsAlone``, ``FareBin`` and ``AgeBin``.

    Side effects:
        Prints the resulting frame and writes it to ``data_preprocessed.csv``
        in the current working directory.

    Args:
        df: Raw frame containing the dropped columns listed above plus
            ``Pclass``, ``Age``, ``Sex``, ``SibSp``, ``Parch`` and ``Fare``.

    Returns:
        A new DataFrame holding the preprocessed features (all other columns
        of ``df`` are carried over unchanged).
    """
    # drop() returns a new frame, leaving the caller's data intact.
    # "Embarked" is discarded as well, so imputing it beforehand would be wasted work.
    df = df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin", "Embarked"])

    df = fill_missing_ages(df)

    # Convert gender to a number for the model
    df["Sex"] = df["Sex"].map({'male': 1, 'female': 0})

    # Feature engineering
    # Relatives travelling with the passenger (SibSp + Parch), passenger excluded
    df["FamilySize"] = df["SibSp"] + df["Parch"]
    # 1 when the passenger has no relatives aboard, else 0
    df["IsAlone"] = np.where(df["FamilySize"] == 0, 1, 0)
    # Fare quartile index (0-3)
    df["FareBin"] = pd.qcut(df["Fare"], 4, labels=False)
    # Age band index; intervals are right-closed: (0,12], (12,20], (20,40], (40,60], (60,inf)
    df["AgeBin"] = pd.cut(df["Age"], bins=[0, 12, 20, 40, 60, np.inf], labels=False)
    print(df)

    # Pass the path so pandas opens the file itself with the correct newline
    # handling; a handle opened without newline="" yields blank rows on Windows.
    df.to_csv("data_preprocessed.csv", index=False)

    return df

In [None]:
# Run the preprocessing pipeline on the loaded frame
print("Preprocessing data...")
preprocessed_data = preprocess_data(data)

# Separate the target column ("Survived") from the feature columns
y = preprocessed_data["Survived"]
X = preprocessed_data.drop(columns=["Survived"])

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)