In [12]:
import pandas as pd
import tensorflow as tf

In [13]:
train_df = pd.read_csv("./data/train.csv")
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [14]:
train_df.isnull().sum(axis=0)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
def stackedBarChart(feature):
    survived = train_df.loc[train_df["Survived"] == 1][feature].value_counts()
    # train_df["Survived"] == 1 부분이 마스크가 된다!
    survived.name = "Survived"  # name이 원 df와 똑같이 Sex로 되어 있기 때문에 바꿔줌
    dead = train_df.loc[train_df["Survived"] == 0][feature].value_counts()
    dead.name = "Dead"
    chart_df = pd.DataFrame([survived,dead])
    chart_df.plot(kind="bar",stacked=True)


In [15]:
# 제일 먼저 해결해야 하는 feature는 Name
# name은 그 자체로 크게 의미가 없어 보이지만,
# 이름에는 특정 키워드가 들어 있고 그것이 생존에 영향을 미칠 수 있다!
train_df["Title"] = train_df["Name"].str.extract("([A-Za-z]+)\.")
# [A-Za-z]+ : 대소문자를 구별하지 않고 영문자가 여러개 나옴
# ([A-Za-z]+)\. : 영문자 여러개 나오고 끝에 점 찍히는 패턴 하나를 뽑아내라

train_df["Title"].value_counts()

# title안에 Mr, Miss, Mrs, other를 각각 0,1,2,3으로 변환
title_mapping_dict = {"Mr":0,"Miss":1,"Mrs":2,"Master":3,"Dr" :3,"Rev":3,"Col":3,
                      "Mlle":3,"Major":3,"Sir":3,"Countess":3,"Don":3,
                      "Lady":3,"Capt":3,"Mme":3,"Ms":3,"Jonkheer":3}
train_df["Title"] = train_df["Title"].map(title_mapping_dict)

train_df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [16]:
train_df.drop("PassengerId", axis = 1, inplace =True)
train_df.drop("Ticket", axis = 1, inplace =True)
train_df.drop("Cabin", axis = 1, inplace =True)
train_df.drop("Name", axis = 1, inplace =True)

train_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,male,22.0,1,0,7.2500,S,0
1,1,1,female,38.0,1,0,71.2833,C,2
2,1,3,female,26.0,0,0,7.9250,S,1
3,1,1,female,35.0,1,0,53.1000,S,2
4,0,3,male,35.0,0,0,8.0500,S,0
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,3
887,1,1,female,19.0,0,0,30.0000,S,1
888,0,3,female,,1,2,23.4500,S,1
889,1,1,male,26.0,0,0,30.0000,C,0


In [17]:
# 성별 컬럼에 대해 male => 0 , female => 1로 변환
sex_mapping_dict = {"male" : 0, "female" : 1}
train_df["Sex"] = train_df["Sex"].map(sex_mapping_dict)
# train_df

In [18]:
# 탑승지역의 결측치를 "S"로 대체 (S에서 가장 많은 사람들이 탔기 때문에!)
train_df["Embarked"].fillna("S",inplace=True)


# 탑승지역 컬럼에 대해 S => 0, Q =>1, C => 2로 변환
embarked_mapping_dict = {"S" : 0, "Q" : 1,"C" : 2}
train_df["Embarked"] = train_df["Embarked"].map(embarked_mapping_dict)
train_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22.0,1,0,7.2500,0,0
1,1,1,1,38.0,1,0,71.2833,2,2
2,1,3,1,26.0,0,0,7.9250,0,1
3,1,1,1,35.0,1,0,53.1000,0,2
4,0,3,0,35.0,0,0,8.0500,0,0
...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.0,0,0,13.0000,0,3
887,1,1,1,19.0,0,0,30.0000,0,1
888,0,3,1,,1,2,23.4500,0,1
889,1,1,0,26.0,0,0,30.0000,2,0


In [19]:
age_mean = train_df.groupby("Title")["Age"].mean()
a = train_df[train_df["Title"]==0]["Age"].fillna(age_mean[0])
b = train_df[train_df["Title"]==1]["Age"].fillna(age_mean[1])
c = train_df[train_df["Title"]==2]["Age"].fillna(age_mean[2])
d = train_df[train_df["Title"]==3]["Age"].fillna(age_mean[3])
result_series = pd.concat([a,b,c,d])
result_series.sort_index()
train_df["Age"] = result_series.sort_index()
train_df


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22.000000,1,0,7.2500,0,0
1,1,1,1,38.000000,1,0,71.2833,2,2
2,1,3,1,26.000000,0,0,7.9250,0,1
3,1,1,1,35.000000,1,0,53.1000,0,2
4,0,3,0,35.000000,0,0,8.0500,0,0
...,...,...,...,...,...,...,...,...,...
886,0,2,0,27.000000,0,0,13.0000,0,3
887,1,1,1,19.000000,0,0,30.0000,0,1
888,0,3,1,21.773973,1,2,23.4500,0,1
889,1,1,0,26.000000,0,0,30.0000,2,0


In [20]:
train_df.loc[train_df["Age"] <= 20,"Age"] = 0
train_df.loc[(train_df["Age"] > 20) & (train_df["Age"]<=40),"Age"] = 1
train_df.loc[(train_df["Age"] > 40) & (train_df["Age"]<=60),"Age"] = 2
train_df.loc[train_df["Age"] > 60,"Age"] = 3
train_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,1.0,1,0,7.2500,0,0
1,1,1,1,1.0,1,0,71.2833,2,2
2,1,3,1,1.0,0,0,7.9250,0,1
3,1,1,1,1.0,1,0,53.1000,0,2
4,0,3,0,1.0,0,0,8.0500,0,0
...,...,...,...,...,...,...,...,...,...
886,0,2,0,1.0,0,0,13.0000,0,3
887,1,1,1,0.0,0,0,30.0000,0,1
888,0,3,1,1.0,1,2,23.4500,0,1
889,1,1,0,1.0,0,0,30.0000,2,0


In [21]:
train_df.loc[train_df["Fare"] <= 10,"Fare"] = 0
train_df.loc[(train_df["Fare"] > 10) & (train_df["Fare"]<=30),"Fare"] = 1
train_df.loc[(train_df["Fare"] > 30) & (train_df["Fare"]<=100),"Fare"] = 2
train_df.loc[train_df["Fare"] > 100,"Fare"] = 3
train_df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,1.0,1,0,0.0,0,0
1,1,1,1,1.0,1,0,2.0,2,2
2,1,3,1,1.0,0,0,0.0,0,1
3,1,1,1,1.0,1,0,2.0,0,2
4,0,3,0,1.0,0,0,0.0,0,0
...,...,...,...,...,...,...,...,...,...
886,0,2,0,1.0,0,0,1.0,0,3
887,1,1,1,0.0,0,0,1.0,0,1
888,0,3,1,1.0,1,2,1.0,0,1
889,1,1,0,1.0,0,0,1.0,2,0


In [22]:
# data set을 준비해야 해요!
# 정확도를 측정하기 위해서 학습용 데이터와 평가용 데이터를 따로 분리해야 해요!
train_df

# train_df를 살짝 분리해서 학습용 데이터와 평가용 데이터를 생성
# 상위 80%를 학습용, 20%를 평가용 데이터로 사용
train_df.shape  # 튜플! 리스트처럼 인덱스 사용 가능
train_num = int(train_df.shape[0] * 0.8)      # 712

# Train data set & Test data set 만들어주기!!!
train_x_data = train_df.drop("Survived",axis=1,inplace=False)[:train_num].values
# inplace = F : 원본은 변하지 않게 처리  # values붙여서 값 형태로 도출
test_x_data = train_df.drop("Survived",axis=1,inplace=False)[train_num:].values

train_y_data = train_df["Survived"][:train_num].values.reshape([-1,1])
# reshape([-1,1]) => 컬럼 하나짜리 2차원 형태로 만들어주기 위해 사용
test_y_data = train_df["Survived"][train_num:].values.reshape([-1,1])


In [13]:
tf.__version__

'1.5.0'

In [43]:
# tensorflow를 이용한 logistic regression code가 나오면 돼요!

# 초기화
tf.reset_default_graph()

# placeholder
# train_x_data.shape
X = tf.placeholder(shape=[None,8],dtype=tf.float32)
Y = tf.placeholder(shape=[None,1],dtype=tf.float32)
keep_rate = tf.placeholder(dtype=tf.float32) # scaler이기 때문에 shape안줘도 된다!!


# Weight , bias
# W1 = tf.Variable(tf.random_normal([8,16]),name='weight1')
# b1 = tf.Variable(tf.random_normal([16]),name='bias1')
# layer1 = tf.nn.relu(tf.matmul(X,W1)+b1)

# W2 = tf.Variable(tf.random_normal([16,1]),name='weight2')
# b2 = tf.Variable(tf.random_normal([1]),name='bias2')

W1 = tf.get_variable('weight1',shape=[8,16],
                    initializer=tf.contrib.layers.xavier_initializer())
b1 = tf.Variable(tf.random_normal([16]),name='bias1')

_layer1 = tf.nn.relu(tf.matmul(X,W1)+b1)
layer1 = tf.nn.dropout(_layer1,keep_prob=keep_rate)

W2 = tf.get_variable('weight2',shape=[16,2],
                    initializer=tf.contrib.layers.xavier_initializer())
b2 = tf.Variable(tf.random_normal([2]),name='bias2')
_layer2 = tf.nn.relu(tf.matmul(layer1,W2)+b2)
layer2 = tf.nn.dropout(_layer2,keep_prob=keep_rate)

W3 = tf.get_variable('weight3',shape=[2,1],
                    initializer=tf.contrib.layers.xavier_initializer())
b3 = tf.Variable(tf.random_normal([1]),name='bias3')

# Hypothesis
logit = tf.matmul(layer2,W3)+b3
H = tf.nn.relu(logit)

# cost
cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits = logit,
                                                                labels = Y))

# train
train = tf.train.AdamOptimizer(learning_rate = 0.01).minimize(cost)

# session, 초기화작업 필요
sess = tf.Session()
sess.run(tf.global_variables_initializer())



In [44]:
# 학습
for step in range(3000):
    _, cost_val = sess.run([train, cost], feed_dict = {X : train_x_data,
                                                      Y : train_y_data,
                                                      keep_rate:0.7})
    # => cost값을 3000번 줄여라 -> 이 과정 겪으면서 최적의 W,b 값이 도출된다
    if step % 300 == 0:
        print("cost 값은 : {}".format(cost_val))
    
# 우리가 원하는 W와 b를 구했어요! => model을 구성했어요!
# 정확도를 측정
# 테스트용 x입력데이터를(test_x_data) 넣어서 예측을 해요!
# 이렇게 구한 예측값과 y 입력데이터를(test_y_data) 비교해요!

cost 값은 : 0.8421611785888672
cost 값은 : 0.4978284537792206
cost 값은 : 0.48863375186920166
cost 값은 : 0.4734633266925812
cost 값은 : 0.4802887439727783
cost 값은 : 0.4568052887916565
cost 값은 : 0.4707481265068054
cost 값은 : 0.4873955547809601
cost 값은 : 0.47488051652908325
cost 값은 : 0.474821537733078


In [45]:
# 예측값과 실제데이터의 차이를 "비율"로 계산해보아요

# predict = tf.argmax(H,1)
# correct = tf.equal(predict,tf.argmax(Y,1))
# accuracy = tf.reduce_mean(tf.cast(correct,dtype=tf.float32))

# print("정확도: {}".format(sess.run(accuracy,feed_dict = {X: test_x_data,
#                                                       Y: test_y_data})))

# 예측값과 실제데이터의 차이를 "비율"로 계산해보아요
predict = tf.cast(H > 0.5, dtype = tf.float32)
# boolean으로 나온 값을 0과 1과 같은 숫자형태로 나오게 해줌
correct = tf.equal(predict, Y)
accuracy = tf.reduce_mean(tf.cast(correct, dtype = tf.float32))

print("정확도: {}".format(sess.run(accuracy,feed_dict = {X: test_x_data,
                                                      Y: test_y_data,
                                                     keep_rate: 1.0})))


정확도: 0.8603351712226868


In [46]:
test_df = pd.read_csv("./data/test.csv")
test_df.isnull().sum(axis=0)
id_df = test_df["PassengerId"]

test_df["Title"] = test_df["Name"].str.extract("([A-Za-z]+)\.")
# [A-Za-z]+ : 대소문자를 구별하지 않고 영문자가 여러개 나옴
# ([A-Za-z]+)\. : 영문자 여러개 나오고 끝에 점 찍히는 패턴 하나를 뽑아내라

test_df["Title"].value_counts()

# title안에 Mr, Miss, Mrs, other를 각각 0,1,2,3으로 변환
title_mapping_dict = {"Mr":0,"Miss":1,"Mrs":2,"Master":3,"Dr" :3,"Rev":3,"Col":3,
                      "Mlle":3,"Major":3,"Sir":3,"Countess":3,"Don":3,"Dona":3,
                      "Lady":3,"Capt":3,"Mme":3,"Ms":3,"Jonkheer":3}
test_df["Title"] = test_df["Title"].map(title_mapping_dict)

test_df.head()

test_df.drop("PassengerId", axis = 1, inplace =True)
test_df.drop("Ticket", axis = 1, inplace =True)
test_df.drop("Cabin", axis = 1, inplace =True)
test_df.drop("Name", axis = 1, inplace =True)

# 성별 컬럼에 대해 male => 0 , female => 1로 변환
sex_mapping_dict = {"male" : 0, "female" : 1}
test_df["Sex"] = test_df["Sex"].map(sex_mapping_dict)


# Embarked 컬럼에 숫자형태로
embark_mapping_dict = {"S":0, "Q":1,"C":2}
test_df["Embarked"] = test_df["Embarked"].map(embark_mapping_dict)
test_df

age_t_mean = test_df.groupby("Title")["Age"].mean()
a1 = test_df[test_df["Title"]==0]["Age"].fillna(age_t_mean[0])
b1 = test_df[test_df["Title"]==1]["Age"].fillna(age_t_mean[1])
c1 = test_df[test_df["Title"]==2]["Age"].fillna(age_t_mean[2])
d1 = test_df[test_df["Title"]==3]["Age"].fillna(age_t_mean[3])
result_t1_series = pd.concat([a1,b1,c1,d1])
result_t1_series.sort_index()
test_df["Age"] = result_t1_series.sort_index()

# Age 0 ~ 20 => 0
# Age 20초과 ~ 40 => 1
# Age 40초과 ~ 60 => 2
# Age 60초과 ~    => 3
test_df.loc[test_df["Age"] <= 20,"Age"] = 0
test_df.loc[(test_df["Age"] > 20) & (test_df["Age"]<=40),"Age"] = 1
test_df.loc[(test_df["Age"] > 40) & (test_df["Age"]<=60),"Age"] = 2
test_df.loc[test_df["Age"] > 60,"Age"] = 3
test_df

# NaN 없애주기
test_df = test_df.fillna(0)
test_df.isnull().sum()
# Fare 0 ~ 10 => 0
# Fare 10초과 ~ 20 => 1
# Fare 20초과 ~ 30 => 2
# Fare 30초과 ~ 40 => 3
# Fare 40초과 ~ 100 => 4
# Fare 100초과 ~ => 5

test_df.loc[test_df["Fare"] <= 10,"Fare"] = 0
test_df.loc[(test_df["Fare"] > 10) & (test_df["Fare"]<=30),"Fare"] = 1
test_df.loc[(test_df["Fare"] > 30) & (test_df["Fare"]<=100),"Fare"] = 2
test_df.loc[test_df["Fare"] > 100,"Fare"] = 3

test_df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,3,0,1.0,0,0,0.0,1,0
1,3,1,2.0,1,0,0.0,0,2
2,2,0,3.0,0,0,0.0,1,0
3,3,0,1.0,0,0,0.0,0,0
4,3,1,1.0,1,1,1.0,0,2
...,...,...,...,...,...,...,...,...
413,3,0,1.0,0,0,0.0,0,0
414,1,1,1.0,0,0,3.0,2,3
415,3,0,1.0,0,0,0.0,0,0
416,3,0,1.0,0,0,0.0,0,0


In [47]:
test_df.shape  # 튜플! 리스트처럼 인덱스 사용 가능

test_xt_data = test_df.values
test_xt_data
# test_yt_data = test_df["Survived"].values.reshape([-1,1])

# print("정확도: {}".format(sess.run(accuracy,feed_dict = {X: test_xt_data,
#                                                       Y: test_yt_data})))

result = tf.cast(sess.run(H > 0.5, feed_dict = {X : test_xt_data,
                                               keep_rate: 1.0}), dtype=tf.int32)
res_df = pd.DataFrame(sess.run(result))
# res_df
# test_df["Fare"].isnull().sum()


In [48]:
df = pd.DataFrame({"PassengerId":id_df,"Survived":res_df[0]})
print(df)

df.to_csv("./data/answer_new_titanic2.csv",
          sep = ",",
          index = False)

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         0
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         1

[418 rows x 2 columns]
