In [None]:
# reference: https://www.kaggle.com/code/shivamja/titanic-survival-prediction

#------Data Handling & Visualization-------
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns


import os
# Walk the Kaggle input directory and print the full path of every file found,
# so the available datasets are visible before anything is loaded.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)



In [None]:
# Read the training and test CSVs in one pass; the tuple order (train, test)
# matches the unpacking order on the left-hand side.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
    )
)

# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Print a concise summary of the training frame (columns, non-null counts, dtypes).
train_data.info()

In [None]:
# Preview the first rows of the test frame.
test_data.head()

In [None]:
# Print a concise summary of the test frame (columns, non-null counts, dtypes).
test_data.info()

In [None]:
# For each dataset, show the number of missing values per column,
# with the most incomplete columns listed first.
for heading, frame in (
    ("Missing data on train ", train_data),
    ("Missing data on test ", test_data),
):
    print(heading)
    missing_per_column = frame.isnull().sum()
    display(missing_per_column.sort_values(ascending=False))

In [None]:
# Descriptive statistics of the training frame, rounded to two decimals
# for readability.
print("Summary Statistics of train")
train_summary = train_data.describe().round(2)
display(train_summary)

In [None]:
# Share of passengers in the training set whose Survived flag equals 1,
# expressed as a percentage rounded to two decimals.
survivor_count = train_data[train_data['Survived'] == 1].shape[0]
passenger_count = train_data['Survived'].shape[0]
survival_pct = np.round((survivor_count / passenger_count) * 100, 2)
print("Survival Rate :", survival_pct, "%")

# Count plots of the categorical features and histograms of Age and Fare

In [None]:
# One 3x2 figure: four count plots on top, two histograms at the bottom.
fig, axes = plt.subplots(3, 2, figsize=(14, 10))
panels = axes.flatten()  # row-major order, i.e. subplot positions 1..6

# Count plots: overall survival first, then survival split by
# passenger class, sex and port of embarkation.
count_specs = [
    {"x": "Survived"},
    {"x": "Pclass", "hue": "Survived"},
    {"x": "Sex", "hue": "Survived"},
    {"x": "Embarked", "hue": "Survived"},
]
for ax, spec in zip(panels[:4], count_specs):
    sns.countplot(data=train_data, ax=ax, **spec)

# Histograms with a KDE overlay for the two continuous columns.
for ax, column in zip(panels[4:], ("Age", "Fare")):
    sns.histplot(
        data=train_data,
        x=column,
        kde=True,
        edgecolor="blue",
        bins=60,
        ax=ax,
    )

fig.tight_layout()
plt.show()

# Correlation matrix

In [None]:
# Pairwise correlations between the numeric columns only.
corr_matrix = train_data.corr(numeric_only=True)

# Greyscale heatmap with every coefficient annotated and wide white
# separators between cells.
heatmap_style = {
    "annot": True,
    "linewidths": 5,
    "linecolor": "white",
    "cmap": "Greys",
}
sns.heatmap(corr_matrix, **heatmap_style)
plt.show()

In [None]:
# Pairwise correlations between the numeric columns only.
corr_matrix = train_data.corr(numeric_only=True)


def _corr_label(value):
    """Return the coefficient with two decimals, or '' when |value| is below 0.1."""
    if abs(value) >= 0.1:
        return f'{value:.2f}'
    return ''


# Text shown in each cell: weak correlations are left blank to reduce clutter.
annot_labels = corr_matrix.map(_corr_label)

# Draw the heatmap on an explicitly sized figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=annot_labels,  # use the custom label matrix built above
    fmt='',              # labels are already formatted strings, so apply no extra format
    linewidths=5,
    linecolor='white',
    cmap='coolwarm',
    vmin=-1,             # fix the colour scale to the full correlation range
    vmax=1,
    ax=ax,
)
plt.show()

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Target: the survival label from the training set.
y = train_data["Survived"]

# Predictors used by the model; get_dummies one-hot encodes the string
# column ("Sex") and leaves the numeric columns as they are.
features = ["Pclass", "Sex",  "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])

# get_dummies is applied to each frame independently, so a category that
# appears in only one of them would give train and test different columns.
# Re-align the test matrix to the training columns (missing dummies are
# filled with 0, unseen ones are dropped) so predict() always receives the
# same feature layout the model was fitted on.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)

# random_state is fixed so the forest is reproducible across runs.
model = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

# Write the submission file: one row per test passenger, without the index.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Saved Successfully")


In [None]:
# Preview the first rows of the submission frame (PassengerId, Survived).
output.head()