# Titanic tutorial
- EDA
- sklearn
- ann

# EDA

In [1]:
# plot 을 그릴 때, FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version 제거
# import warnings
# warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# heatmap 같은 경우, 데이터가 커지면 수치가 잘 안 보인다
# plot 사이즈를 키운다
# sns.set(rc={'figure.figsize':(10,8)})

In [2]:
df_train = pd.read_csv('titanic/train.csv')
df_test = pd.read_csv('titanic/test.csv')

In [3]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
women = df_train.loc[df_train.Sex == 'female']["Survived"]
rate_women = sum(women)/len(women)

print("% of women who survived:", rate_women)

% of women who survived: 0.7420382165605095


In [6]:
men = df_train.loc[df_train.Sex == 'male']["Survived"]
rate_men = sum(men)/len(men)

print("% of men who survived:", rate_men)

% of men who survived: 0.18890814558058924


# Tutorial Results
- 0.77511

In [7]:
from sklearn.ensemble import RandomForestClassifier

y = df_train["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(df_train[features])
X_test = pd.get_dummies(df_test[features])

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': df_test.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


# EDA Start
- 데이터들을 카테고리화 한다
- one hot encoding 을 한다

In [8]:
# 데이터 별 가족수를 카테고리화 한다.

df_train["family"] = df_train.SibSp + df_train.Parch
df_test["family"] = df_test.SibSp + df_test.Parch

def change_family_category(x):
  if x < 2: return "Single"
  if x == 2: return "Couple"
  if x < 5: return "Small"
  if x >= 5: return "Large"

df_train.family = df_train.family.apply(change_family_category)
df_test.family = df_test.family.apply(change_family_category)
df_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,family
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,Single
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Single
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,Single
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,Single
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,Single
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,Single
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,Single
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,Small
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,Single


In [9]:
# 5세 미만의 아동들의 생존률
df_train_baby_survive = df_train.loc[:, ['Survived', 'Age']]
df_train_baby_survive.fillna(df_train.Age.median())
df_train_baby_survive['is_baby'] = df_train_baby_survive.Age.map(lambda age: 1 if age < 5 else 0)

df_train_baby_survive_test = df_test.loc[:, ['Age']]
df_train_baby_survive_test.fillna(df_train.Age.median())
df_train_baby_survive_test['is_baby'] = df_train_baby_survive_test.Age.map(lambda age: 1 if age < 5 else 0)

df_train_baby_survive.corr()

Unnamed: 0,Survived,Age,is_baby
Survived,1.0,-0.077221,0.129801
Age,-0.077221,1.0,-0.46058
is_baby,0.129801,-0.46058,1.0


In [10]:
#is_baby one hot encoding
df_train["is_baby"] = df_train_baby_survive.is_baby.copy()
df_test["is_baby"] = df_train_baby_survive_test.is_baby.copy()

In [11]:
# get initial from names

def get_initial(name:str):
  return name.split(',')[1].split('.')[0].strip()
df_train.Name.apply(get_initial).unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer'], dtype=object)

In [12]:
df_train_initial_survive = df_train.loc[:, ['Survived', 'Name']]
df_train_initial_survive['initial'] = df_train_initial_survive.Name.apply(get_initial)

for initial in df_train_initial_survive['initial'].unique():
  df_train_initial_survive[initial] = df_train_initial_survive['initial'].map(lambda x: 1 if x == initial else 0)

df_train_initial_survive.drop(columns=["initial", "Name"], inplace=True)  
df_train_initial_survive.corr()

Unnamed: 0,Survived,Mr,Mrs,Miss,Master,Don,Rev,Dr,Mme,Ms,Major,Lady,Sir,Mlle,Col,Capt,the Countess,Jonkheer
Survived,1.0,-0.549199,0.33904,0.327093,0.085221,-0.026456,-0.064988,0.008185,0.04247,0.04247,0.011329,0.04247,0.04247,0.060095,0.011329,-0.026456,0.04247,-0.026456
Mr,-0.549199,1.0,-0.474952,-0.595692,-0.254903,-0.039411,-0.096808,-0.104624,-0.039411,-0.039411,-0.055767,-0.039411,-0.039411,-0.055767,-0.055767,-0.039411,-0.039411,-0.039411
Mrs,0.33904,-0.474952,1.0,-0.20467,-0.08758,-0.013541,-0.033262,-0.035947,-0.013541,-0.013541,-0.01916,-0.013541,-0.013541,-0.01916,-0.01916,-0.013541,-0.013541,-0.013541
Miss,0.327093,-0.595692,-0.20467,1.0,-0.109844,-0.016983,-0.041717,-0.045085,-0.016983,-0.016983,-0.024031,-0.016983,-0.016983,-0.024031,-0.024031,-0.016983,-0.016983,-0.016983
Master,0.085221,-0.254903,-0.08758,-0.109844,1.0,-0.007267,-0.017851,-0.019292,-0.007267,-0.007267,-0.010283,-0.007267,-0.007267,-0.010283,-0.010283,-0.007267,-0.007267,-0.007267
Don,-0.026456,-0.039411,-0.013541,-0.016983,-0.007267,1.0,-0.00276,-0.002983,-0.001124,-0.001124,-0.00159,-0.001124,-0.001124,-0.00159,-0.00159,-0.001124,-0.001124,-0.001124
Rev,-0.064988,-0.096808,-0.033262,-0.041717,-0.017851,-0.00276,1.0,-0.007327,-0.00276,-0.00276,-0.003905,-0.00276,-0.00276,-0.003905,-0.003905,-0.00276,-0.00276,-0.00276
Dr,0.008185,-0.104624,-0.035947,-0.045085,-0.019292,-0.002983,-0.007327,1.0,-0.002983,-0.002983,-0.004221,-0.002983,-0.002983,-0.004221,-0.004221,-0.002983,-0.002983,-0.002983
Mme,0.04247,-0.039411,-0.013541,-0.016983,-0.007267,-0.001124,-0.00276,-0.002983,1.0,-0.001124,-0.00159,-0.001124,-0.001124,-0.00159,-0.00159,-0.001124,-0.001124,-0.001124
Ms,0.04247,-0.039411,-0.013541,-0.016983,-0.007267,-0.001124,-0.00276,-0.002983,-0.001124,1.0,-0.00159,-0.001124,-0.001124,-0.00159,-0.00159,-0.001124,-0.001124,-0.001124


In [13]:
df_train_initial_survive = df_train.loc[:, ['Survived', 'Name']]
df_train_initial_survive['initial'] = df_train_initial_survive.Name.apply(get_initial)

special_initials = ['Mr', 'Mrs', 'Miss']
df_train_initial_survive['initial'] = df_train_initial_survive['initial'].map(lambda x: x if x in special_initials else "Others")
df_train['initial'] = df_train_initial_survive['initial'].copy()


# get dummies 에서 해주고 있기 때문에 지웁니다.
# label incoder랑 같은 역할을 해준다.

# special_initials = ['Mr', 'Mrs', 'Miss']
# for initial in special_initials:
#   df_train_initial_survive[initial] = df_train_initial_survive['initial'].map(lambda x: 1 if x == initial else 0)


# df_train_initial_survive['Others'] = df_train_initial_survive.initial.map(lambda x: 0 if x in special_initials else 1)
# df_train_initial_survive.drop(columns=["initial", "Name"], inplace=True)
# df_train_initial_survive.corr()

In [14]:
# pd_dummies 에서 해주기 때문에 지웁니다.

# df_train['Mr'] = df_train_initial_survive.Mr.copy()
# df_train['Mrs'] = df_train_initial_survive.Mrs.copy()
# df_train['Miss'] = df_train_initial_survive.Miss.copy()
# df_train['Others'] = df_train_initial_survive.Others.copy()

In [15]:
df_test_initial_survive = df_test.loc[:, ['Name']]
df_test_initial_survive['initial'] = df_test_initial_survive.Name.apply(get_initial)

special_initials = ['Mr', 'Mrs', 'Miss']
df_test_initial_survive['initial'] = df_test_initial_survive['initial'].map(lambda x: x if x in special_initials else "Others")
df_test['initial'] = df_test_initial_survive['initial'].copy()

# 밑에서 pd.dummies 가 다 해주기 때문에 지웁니다.

# special_initials = ['Mr', 'Mrs', 'Miss']
# for initial in special_initials:
#   df_test_initial_survive[initial] = df_test_initial_survive['initial'].map(lambda x: 1 if x == initial else 0)


# df_test_initial_survive['Others'] = df_test_initial_survive.initial.map(lambda x: 0 if x in special_initials else 1)
# df_test_initial_survive.drop(columns=["initial", "Name"], inplace=True)

# df_test['Mr'] = df_test_initial_survive.Mr.copy()
# df_test['Mrs'] = df_test_initial_survive.Mrs.copy()
# df_test['Miss'] = df_test_initial_survive.Miss.copy()
# df_test['Others'] = df_test_initial_survive.Others.copy()

In [16]:
df_train.Pclass = df_train.Pclass.map(lambda x : f"Pclass_{x}")
df_test.Pclass = df_test.Pclass.map(lambda x : f"Pclass_{x}")

In [17]:
# The mode is the value that appears most often
df_train.Embarked.mode()[0]

'S'

In [18]:
# 일단 추가
df_train.Embarked.fillna(df_train.Embarked.mode()[0], inplace=True)
df_test.Embarked.fillna(df_test.Embarked.mode()[0], inplace=True)

In [19]:
# 일단 추가
df_train.Fare.fillna(df_train.Fare.median(), inplace=True)
df_test.Fare.fillna(df_test.Fare.median(), inplace=True)

# RainForest Tries

In [20]:
df_test.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked', 'family', 'is_baby', 'initial'],
      dtype='object')

In [21]:
df_test_passenger_id = df_test.PassengerId.copy()

In [22]:
from sklearn.ensemble import RandomForestClassifier

y = df_train["Survived"]

features = ["Pclass", "Sex", "family", "is_baby", "initial", "Embarked"]
X = pd.get_dummies(df_train[features])
X_test = pd.get_dummies(df_test[features])

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': df_test.PassengerId, 'Survived': predictions})
output.to_csv('submission2.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


- 테스트 submission 결과: 0.77751

- Embarked 를 추가하고 나서 오히려 미세하게 증가함.
- 데이터 컬럼의 양에 따라서 정확도가 올라갈 수 있구나 알 수 있음
- 데이터 분석이 미흡했구나 알 수 있었다.


# KFold, confusion_matrix
# accuracy_score, precision_score, recall_score, f1_score
- use the same data as above

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
#from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
import pandas as pd

y = df_train["Survived"]

features = ["Pclass", "Sex", "family", "is_baby", "initial", "Embarked"]
X = pd.get_dummies(df_train[features])
X_test = pd.get_dummies(df_test[features])

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': df_test.PassengerId, 'Survived': predictions})
output.to_csv('submission2.csv', index=False)
print("Your submission was successfully saved!")

#features = ["Pclass", "Sex", "family", "is_baby", "initial", "Embarked"]

#df_train = pd.read_csv('titanic/train.csv')

X = pd.get_dummies(df_train[features]).iloc[:]
y = df_train["Survived"].iloc[:]  # Ensure y has the same number of samples as X

rf_model = RandomForestClassifier()

kf = KFold(n_splits=5)
accuracies = []
precisions = []
recalls = []
f1_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    rf_model.fit(X_train, y_train)
    predictions = rf_model.predict(X_test)
    
    accuracies.append(accuracy_score(y_test, predictions))
    precisions.append(precision_score(y_test, predictions))
    recalls.append(recall_score(y_test, predictions))
    f1_scores.append(f1_score(y_test, predictions))

    print(f"Confusion Matrix (Fold {len(accuracies)}):\n{confusion_matrix(y_test, predictions)}\n")

average_accuracy = sum(accuracies) / len(accuracies)
average_precision = sum(precisions) / len(precisions)
average_recall = sum(recalls) / len(recalls)
average_f1 = sum(f1_scores) / len(f1_scores)

print(f"Average Accuracy: {average_accuracy}")
print(f"Average Precision: {average_precision}")
print(f"Average Recall: {average_recall}")
print(f"Average F1 Score: {average_f1}")

Your submission was successfully saved!
Confusion Matrix (Fold 1):
[[110  10]
 [ 21  38]]

Confusion Matrix (Fold 2):
[[93  6]
 [26 53]]

Confusion Matrix (Fold 3):
[[102   7]
 [ 21  48]]

Confusion Matrix (Fold 4):
[[94 12]
 [32 40]]

Confusion Matrix (Fold 5):
[[108   7]
 [ 22  41]]

Average Accuracy: 0.8159249262444291
Average Precision: 0.8372192920074276
Average Recall: 0.6433910505643573
Average F1 Score: 0.7272979786618354


In [24]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import numpy as np

# 가정: X는 특성 데이터, y는 타겟 데이터
features = ["Pclass", "Sex", "SibSp", "Parch"]

X = pd.get_dummies(df_train[features]).iloc[:20]
y = df_train["Survived"]

# 랜덤 포레스트 모델 초기화
rf_model = RandomForestClassifier()

# K-Fold 교차 검증 설정
kf = KFold(n_splits=5)  # 5겹 교차 검증

# 각 분할에 대한 정확도를 저장할 리스트
accuracies = []

# K-Fold 교차 검증 수행
for train_index, test_index in kf.split(X):
    # 훈련 데이터와 테스트 데이터 분할
    # print(train_index, test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # 모델 훈련
    rf_model.fit(X_train, y_train)

    # 예측 및 정확도 계산
    predictions = rf_model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    accuracies.append(accuracy)

# 평균 정확도 출력
average_accuracy = np.mean(accuracies)
print(f"Average Accuracy: {average_accuracy}")

Average Accuracy: 0.7


# ANN Tries
- use the same data as above

In [25]:
X.shape

(20, 7)

In [26]:
X_test.shape

(4, 7)

In [27]:
pd.get_dummies(df_train[features])

Unnamed: 0,SibSp,Parch,Pclass_Pclass_1,Pclass_Pclass_2,Pclass_Pclass_3,Sex_female,Sex_male
0,1,0,False,False,True,False,True
1,1,0,True,False,False,True,False
2,0,0,False,False,True,True,False
3,1,0,True,False,False,True,False
4,0,0,False,False,True,False,True
...,...,...,...,...,...,...,...
886,0,0,False,True,False,False,True
887,0,0,True,False,False,True,False
888,1,2,False,False,True,True,False
889,0,0,True,False,False,False,True


In [28]:
from torch.utils import data

In [29]:
training_dataloader = data.DataLoader(pd.get_dummies(df_train[features]), shuffle=True, batch_size=100)

In [30]:
from utils import train, test
from torch import nn

loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(nn_model.parameters(), lr=learning_rate)

NameError: name 'optim' is not defined

In [None]:
train(training_dataloader, nn_model, loss, optimizer)

KeyError: 125

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

one_hot_encoder = OneHotEncoder()
X_train = one_hot_encoder.fit_transform(X).toarray()
X_test = one_hot_encoder.fit_transform(X_test).toarray()

In [None]:
X_train.shape

(891, 34)

In [None]:
X_test.shape

(418, 34)

In [None]:
X_test

array([[1., 0., 1., ..., 1., 1., 0.],
       [1., 0., 1., ..., 0., 0., 1.],
       [1., 0., 1., ..., 1., 1., 0.],
       ...,
       [1., 0., 1., ..., 0., 0., 1.],
       [1., 0., 1., ..., 0., 0., 1.],
       [1., 0., 1., ..., 0., 1., 0.]])

In [None]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(X_train, y.array, test_size = 0.1)

In [None]:
from sklearn.utils import shuffle
from torch.autograd import Variable
from torch import FloatTensor, LongTensor, nn, optim
import torch

import numpy as np

from model import nn_model

In [None]:
batch_size = 50
num_epochs = 50
learning_rate = 0.01
batch_no = len(x_train) // batch_size



for epoch in range(num_epochs):
    if epoch % 5 == 0:
        print('Epoch {}'.format(epoch+1))
    x_train, y_train = shuffle(x_train, y_train)
    # Mini batch learning
    for i in range(batch_no):
        start = i * batch_size
        end = start + batch_size
        x_var = Variable(FloatTensor(x_train[start:end]))
        y_var = Variable(LongTensor(y_train[start:end]))
        # Forward + Backward + Optimize
        optimizer.zero_grad()
        ypred_var = nn_model(x_var)
        loss =criterion(ypred_var, y_var)
        loss.backward()
        optimizer.step()

Epoch 1


  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch 6
Epoch 11
Epoch 16
Epoch 21
Epoch 26
Epoch 31
Epoch 36
Epoch 41
Epoch 46


In [None]:
test_var = Variable(FloatTensor(x_val), requires_grad=True)
with torch.no_grad():
    result = nn_model(test_var)
values, labels = torch.max(result, 1)
num_right = np.sum(labels.data.numpy() == y_val)
print('Accuracy {:.2f}'.format(num_right / len(y_val)))

Accuracy 0.79


In [None]:
X_test_var = Variable(FloatTensor(X_test), requires_grad=True)
with torch.no_grad():
    result = nn_model(X_test_var)
values, labels = torch.max(result, 1)
survived = labels.data.numpy()

In [None]:
X_test_passenger_id = df_test['PassengerId']

In [None]:
import csv

submission = [['PassengerId', 'Survived']]
for i in range(len(survived)):
    submission.append([X_test_passenger_id[i], survived[i]])

with open('submission.csv', 'w') as submissionFile:
    writer = csv.writer(submissionFile)
    writer.writerows(submission)
    
print('Writing Complete!')

Writing Complete!


In [None]:
77033