In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Log in to Kaggle first so the data-source download in the next cell can
# access private sources.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# titanic_path keeps whatever competition_download returns
# (presumably the local directory holding the competition files - confirm
# against the kagglehub documentation).
titanic_path = kagglehub.competition_download('titanic')

print('Data source import complete.')


# 1. Data Preparation
## 1.1 Get the full path of the input data

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

"""
    Import libraries for data analytics
"""
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory

"""
    Get the fullpath of the input data
"""
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 1.2 Read data from files

In [None]:
# Read the competition CSVs from the directory the kagglehub download cell
# stored in `titanic_path`. A hardcoded '/kaggle/input/...' path only exists
# where that directory is mounted, so it breaks in other environments.
training_set = pd.read_csv(f"{titanic_path}/train.csv")
test_set = pd.read_csv(f"{titanic_path}/test.csv")

# display() renders frames as rich tables and lets one cell show several of them
from IPython.display import display

# Preview the first 10 rows of each frame to inspect their structure
display(training_set.head(10))
display(test_set.head(10))

There's no 'Survived' column in the test dataset.

In [None]:
"""
    Combine the training dataset & test dataset
"""

# Add an identifier column for later separation
training_set['train_test'] = 1
test_set['train_test'] = 0

# Add Survived column to test set to match columns for concatenation
test_set['Survived'] = np.NaN

# Combine datasets for joint preprocessing
all_data = pd.concat([training_set, test_set])

# Enable inline plotting (useful for Jupyter/Notebook environments)
%matplotlib inline

# View column names of the entire dataset
all_data.columns

In [None]:
# Preview the first 30 rows of the combined train + test frame
all_data.head(30)

In [None]:
"""
    3. Explore details of training dataset
"""
training_set.info()

In [None]:
# Use .describe() to summarise the numeric columns (count, mean, std, min, quartiles, max)
training_set.describe()

In [None]:
# Split the training columns into a numeric group and a categorical group for exploration
numeric_columns = ['Age', 'SibSp', 'Parch', 'Fare']
categorical_columns = ['Survived', 'Pclass', 'Sex', 'Ticket', 'Cabin', 'Embarked']

df_num = training_set[numeric_columns]
df_cat = training_set[categorical_columns]

In [None]:
# One histogram per numeric variable; missing values are dropped before plotting
for column_name in df_num.columns:
    observed = df_num[column_name].dropna()
    plt.hist(observed)
    plt.title(column_name)
    plt.show()

In [None]:
# Pairwise correlations of the numeric features: computed once, printed as a
# table and then drawn as an annotated heatmap
numeric_corr = df_num.corr()
print(numeric_corr)

sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix for All Numeric Features')
plt.show()

In [None]:
# Mean of Age, SibSp, Parch and Fare for each survival outcome
training_set.pivot_table(
    index='Survived',
    values=['Age', 'SibSp', 'Parch', 'Fare'],
    aggfunc='mean',
)

In [None]:
"""
    side-by-side barplot for categorical features
"""


for i in df_cat.columns:
    sns.barplot(x=df_cat[i].value_counts().index, y=df_cat[i].value_counts().values).set_title(i)
    plt.show()

In [None]:
# Survival vs. each categorical variable, as counts of non-null 'Ticket' entries.
# A blank line is printed between consecutive tables.
for position, feature in enumerate(['Pclass', 'Sex', 'Embarked']):
    if position > 0:
        print()
    print(pd.pivot_table(training_set, index='Survived', columns=feature,
                         values='Ticket', aggfunc='count'))

In [None]:
# Survival vs. each categorical variable, as percentages: every column of the
# count table is divided by its own total, so each column sums to 100.

pivot_Pclass = training_set.pivot_table(index='Survived', columns='Pclass', values='Ticket', aggfunc='count')
pivot_pct_Pclass = pivot_Pclass.div(pivot_Pclass.sum(axis=0), axis='columns') * 100
print(pivot_pct_Pclass, '\n')

pivot_Sex = training_set.pivot_table(index='Survived', columns='Sex', values='Ticket', aggfunc='count')
pivot_pct_Sex = pivot_Sex.div(pivot_Sex.sum(axis=0), axis='columns') * 100
print(pivot_pct_Sex, '\n')

pivot_Embarked = training_set.pivot_table(index='Survived', columns='Embarked', values='Ticket', aggfunc='count')
pivot_pct_Embarked = pivot_Embarked.div(pivot_Embarked.sum(axis=0), axis='columns') * 100
print(pivot_pct_Embarked, '\n')

In [None]:
# Preview the first 30 raw 'Cabin' values
df_cat.Cabin.head(30)

In [None]:
# New feature 'cabin_multi': number of space-separated cabin entries in 'Cabin',
# or 0 when 'Cabin' is missing
training_set['cabin_multi'] = training_set.Cabin.apply(
    lambda cabin: 0 if pd.isna(cabin) else len(cabin.split(' '))
)
training_set['cabin_multi'].value_counts()

In [None]:
# Show each 'cabin_multi' category and its corresponding proportion of rows
training_set['cabin_multi'].value_counts(normalize = True)

In [None]:
# Check the training set again now that 'cabin_multi' has been added
training_set.info()

In [None]:
"""
    Create a pivot table for 'cabin_multi' with index 'Survived', aggregated by count
"""
pivot_cabin_multi = pd.pivot_table(training_set, index = 'Survived', columns = 'cabin_multi', values = 'Ticket', aggfunc = 'count')
pivot_cabin_multi.fillna(0)

In [None]:
"""
    Create a pivot table for 'cabin_multi' with index 'Survived', aggregated by percent
"""

pivot_pct_cabin_multi = pivot_cabin_multi.apply(
    lambda col: col / col.sum(), axis=0) * 100
pivot_pct_cabin_multi.fillna(0)

In [None]:
"""
    Create a new feature 'cabin_adv', categorized by first letter
    - Extract the first letter of each cell of Series 'Cabin'
        (training_set.Cabin is a Pandas.Series type, each item x of it is a cell)
"""
training_set['cabin_type'] = training_set.Cabin.apply(
    lambda x: x[0] if pd.notna(x) else 'Unknown'
)

print(training_set.cabin_type.value_counts())

In [None]:
"""
    Create a pivot table for the feature 'cabin_adv', aggregated by count
"""
pivot_cabin_type = pd.pivot_table(training_set, index='Survived', columns='cabin_type', values='Name', aggfunc='count')
pivot_cabin_type.fillna(0)

In [None]:
"""
    Create a pivot table for the feature 'cabin_adv', aggregated by percent
"""
pivot_pct_cabin_type = pivot_cabin_type.apply(
    lambda col: col / col.sum(), axis=0) * 100

pivot_pct_cabin_type = pivot_pct_cabin_type.fillna(0)
pivot_pct_cabin_type

In [None]:
"""

"""
plt.figure(figsize=(8, 4))
sns.heatmap(pivot_pct_cabin_multi.fillna(0), annot=True, fmt=".1f", cmap="Blues")

plt.title("Survival Rate by Cabin Count")
plt.ylabel("Survived")
plt.xlabel("Number of Cabins (cabin_multi)")
plt.show()

In [None]:
"""

"""
plt.figure(figsize=(8, 4))
sns.heatmap(pivot_pct_cabin_type.fillna(0), annot=True, fmt=".1f", cmap="Blues")

plt.title("Survival Rate by Cabin Type")
plt.ylabel("Survived")
plt.xlabel("Type of Cabins (cabin_type)")
plt.show()

In [None]:
"""
    Ticket
"""

training_set.Ticket.head(30)

In [None]:
"""
    Create a new feature 'numeric_ticket'
        - if ticket is pure number, then 1
        - else 0
"""

training_set['numeric_ticket'] = training_set.Ticket.apply(
    lambda x: 1 if x.isnumeric() else 0
)
training_set.numeric_ticket.head(10)

In [None]:
"""
    Create a new feature 'ticket_letters'
        - if ticket is pure number, then 1
        - else 0
"""

training_set['ticket_letters'] = training_set.Ticket.apply(
    lambda x: ''.join(x.split(' ')[:-1]).replace('.','').replace('/','').upper() if len(x.split(' ')[:-1]) > 0 else 0
)
training_set.ticket_letters.head(10)

In [None]:
# Number of tickets that are purely numeric (1) vs. not (0)
training_set['numeric_ticket'].value_counts(normalize=False) # normalize=False -> count

In [None]:
# Number of tickets per prefix
training_set['ticket_letters'].value_counts(normalize=False) # normalize=False -> count (normalize=True would give proportions)

In [None]:
"""
    Pivot table for 'numeric_ticket' with index 'Survived'
"""
pivot_num_ticket = pd.pivot_table(training_set, index = 'Survived', columns = 'numeric_ticket', values = 'PassengerId', aggfunc = 'count')
pivot_num_ticket

In [None]:
"""
    列归一化
"""
pivot_pct_num_ticket = pivot_num_ticket.apply(
    lambda x: x / x.sum(), axis = 0) * 100
pivot_pct_num_ticket

In [None]:
"""
    Pivot table for 'ticket_letters' with index 'Survived'
"""

pivot_ticket_let = pd.pivot_table(training_set, index = 'Survived', columns = 'ticket_letters', values = 'PassengerId', aggfunc = 'count')
pivot_ticket_let = pivot_ticket_let.fillna(0)
pivot_ticket_let

In [None]:
"""
    列归一化
"""
pivot_pct_ticket_let = pivot_ticket_let.apply(
    lambda x: x / x.sum(), axis = 0) * 100
pivot_pct_ticket_let

分的太过于细致，每个种类总数太少，得到的百分比不太能说明问题。

In [None]:
"""
    Feature engineering for 'Name' to get identity info

"""

training_set.Name.head(30)

In [None]:
"""

"""
training_set['name_title'] = training_set.Name.apply(
    lambda x: x.split(',')[1].split('.')[0] if ',' in x and '.' in x else 0
#    lambda x: x.split(',')[1].split('.')[0] if len(x.split(',')[1].split('.')) > 0 else 0
)
training_set.name_title.head(10)

In [None]:
# Frequency of each extracted title (normalize = False returns raw counts)
training_set.name_title.value_counts(normalize = False)

In [None]:
# Pivot table for 'name_title' with index 'Survived', aggregated by count;
# combinations that never occur are stored as 0
pivot_name_title = training_set.pivot_table(
    index='Survived', columns='name_title', values='PassengerId', aggfunc='count'
).fillna(0)
pivot_name_title

In [None]:
# Column-wise percentages: each title column is divided by its own total and
# multiplied by 100
name_title_totals = pivot_name_title.sum(axis=0)
pivot_pct_name_title = pivot_name_title.div(name_title_totals, axis='columns') * 100
pivot_pct_name_title

## Data Preprocessing for Model

1) Drop null values from Embarked (only 2) 处理极少量的缺失值，删除对模型影响极小，用删去方式最简单
2) Include only relevant variables (exclude data like name and passengerId so that we could have a reasonable number of features for our models to deal with) 降低特征维度，去掉无预测价值的字段。如名字是字符串且极不规律，不适合建模；ID 与目标变量无关。
3) Do categorical transforms on all data. Usually we would use a transformer, but with this approach we can ensure that our training and test data have the same columns. We also may be able to infer something about the shape of the test data through this method. I will stress, this is generally not recommended outside of a competition (use a one-hot encoder instead). 将非数值型（categorical）特征转成数值型。
4) Impute data with mean for fare and age (Should also experiment with median)
5) Normalized fare using logarithm to give more semblance of a normal distribution.
6) Scaled data with standard scaler (standardizes each feature to zero mean and unit variance, rather than to a 0-1 range)

In [None]:
# Re-create the engineered features on the combined train + test frame

## 1) Number of Cabins Per Person: 'cabin_multi' (0 when 'Cabin' is missing)
all_data['cabin_multi'] = all_data.Cabin.apply(
    lambda cabin: 0 if pd.isna(cabin) else len(cabin.split(' '))
)

## 2) Type of Cabins: 'cabin_type' (first character of 'Cabin', or 'Unknown')
all_data['cabin_type'] = all_data.Cabin.apply(
    lambda cabin: 'Unknown' if pd.isna(cabin) else cabin[0]
)

## 3) Value Type of Ticket: 'numeric_ticket' (1 if purely numeric, else 0)
all_data['numeric_ticket'] = all_data.Ticket.apply(
    lambda ticket: int(ticket.isnumeric())
)

## 4) Prefix of Ticket: 'ticket_letters'
##    Everything before the last space-separated token, with '.' and '/'
##    removed and upper-cased; 0 when the ticket has no prefix.
ticket_prefixes = []
for ticket in all_data.Ticket:
    leading_tokens = ticket.split(' ')[:-1]
    if len(leading_tokens) > 0:
        ticket_prefixes.append(''.join(leading_tokens).replace('.', '').replace('/', '').upper())
    else:
        ticket_prefixes.append(0)
all_data['ticket_letters'] = ticket_prefixes

## 5) Title of Name: 'name_title'
##    Text between the first ',' and the following '.', stripped of spaces;
##    0 when either delimiter is missing.
all_data['name_title'] = all_data.Name.apply(
    lambda name: name.split(',')[1].split('.')[0].strip() if (',' in name and '.' in name) else 0
)

In [None]:
# Preview the combined frame with the engineered feature columns
display(all_data.head(10))
# info() prints its own report and returns None, so it is called directly;
# wrapping it in display() would also render a stray "None".
all_data.info()

In [None]:
# Fill in NaNs of the numeric features 'Age' and 'Fare'. The fill values are
# the means of the training set only, applied to both train and test rows.
all_data.Age = all_data.Age.fillna(training_set.Age.mean())
all_data.Fare = all_data.Fare.fillna(training_set.Fare.mean())

# info() prints its own report and returns None, so it is called directly;
# wrapping it in display() would also render a stray "None".
training_set.info()
all_data.info()

In [None]:
# Drop the rows whose 'Embarked' is null (only 2, both in training_set)
all_data.dropna(subset=['Embarked'],inplace=True)
# Sanity check: should print 0 now
print(all_data['Embarked'].isnull().sum())
# info() prints its own report and returns None, so it is called directly;
# wrapping it in display() would also render a stray "None".
all_data.info()

In [None]:
# Log-transform 'SibSp' and 'Fare'; the +1 keeps zero values finite, since log(0) is undefined
all_data['norm_sibsp'] = np.log(all_data.SibSp + 1)
all_data['norm_fare'] = np.log(all_data.Fare + 1)

# Draw each transformed feature on its own axes. Two bare .hist() calls in one
# cell would draw both histograms on top of each other on the same axes.
fig, (ax_sibsp, ax_fare) = plt.subplots(1, 2, figsize=(12, 4))

all_data['norm_sibsp'].hist(ax=ax_sibsp)
ax_sibsp.set_title('norm_sibsp')

all_data['norm_fare'].hist(ax=ax_fare)
ax_fare.set_title('norm_fare')

plt.show()

In [None]:
# Side-by-side comparison of 'SibSp' before and after the log transform.
# (matplotlib.pyplot is already imported as plt in the notebook's import cell,
# so it is not re-imported here.)
fig, ax = plt.subplots(1, 2, figsize=(12, 4))

# Original distribution
all_data['SibSp'].hist(ax=ax[0])
ax[0].set_title('Original SibSp')

# Distribution after the log transform
all_data['norm_sibsp'].hist(ax=ax[1])
ax[1].set_title('Log-Transformed SibSp')

plt.show()


In [None]:
# Side-by-side comparison of 'Fare' before and after the log transform
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
raw_axes, log_axes = ax

# Original distribution
all_data['Fare'].hist(ax=raw_axes)
raw_axes.set_title('Original Fare')

# Distribution after the log transform
all_data['norm_fare'].hist(ax=log_axes)
log_axes.set_title('Log-Transformed Fare')

plt.show()

In [None]:
# Convert 'Pclass' to strings so that pd.get_dummies() treats it as a
# categorical feature
all_data.Pclass = all_data.Pclass.astype(str)

# Columns passed to get_dummies; 'train_test' is kept only so the rows can be
# split again below
model_columns = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'norm_fare', 'Embarked',
    'cabin_type', 'cabin_multi', 'numeric_ticket', 'name_title', 'train_test',
]

# Create dummy variables from the categorical columns (OneHotEncoder is an alternative)
all_dummies = pd.get_dummies(all_data[model_columns])

# Split back into train and test rows and drop the helper flag
train_rows = all_dummies[all_dummies.train_test == 1]
test_rows = all_dummies[all_dummies.train_test == 0]
X_train = train_rows.drop(columns=['train_test'])
X_test = test_rows.drop(columns=['train_test'])

# Target values for the training rows only
y_train = all_data[all_data.train_test == 1].Survived
y_train.shape