In [38]:
import easydict
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
args = easydict.EasyDict()


args.default_path = 'C:/Users/Playdata/code/data/titanic/'
args.train_csv = args.default_path+'train.csv'
args.test_csv = args.default_path+'test.csv'
args.submission_csv = args.default_path+'submission.csv'
args.submission_csv_0220_1 = args.default_path+'submission_0220_1.csv'

args.random_state = 21

In [39]:
import seaborn as sns

In [40]:
train = pd.read_csv(args.train_csv)
print(f'{train.shape}')
train.head()

(916, 12)


Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,0,2,"Wheeler, Mr. Edwin Frederick""""",male,,0,0,SC/PARIS 2159,12.875,,S
1,1,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q
2,2,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gr...",female,52.0,1,1,12749,93.5,B69,S
3,3,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27.0,0,0,350043,7.7958,,S
4,4,0,2,"Hold, Mr. Stephen",male,44.0,1,0,26707,26.0,,S


In [41]:
test = pd.read_csv(args.test_csv)
print(f'{test.shape}')
test.head()

(393, 11)


Unnamed: 0,passengerid,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,916,3,"McGowan, Miss. Anna ""Annie""",female,15.0,0,0,330923,8.0292,,Q
1,917,2,"Pinsky, Mrs. (Rosa)",female,32.0,0,0,234604,13.0,,S
2,918,3,"McCarthy, Miss. Catherine Katie""""",female,,0,0,383123,7.75,,Q
3,919,3,"Franklin, Mr. Charles (Charles Fardon)",male,,0,0,SOTON/O.Q. 3101314,7.25,,S
4,920,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,45.0,1,1,36928,164.8667,,S


In [42]:
train["age"] = train["age"].fillna(-0.5)   # age의 결측값 대체
test["age"] = test["age"].fillna(-0.5)

bins = [-1,0,5,10,18,22,35,60, np.inf]
labels = ['unknown', 'baby','child', 'teenage', 'student', 'adult','middle age','senior']
train['age_group'] = pd.cut(train['age'],bins,labels= labels)
test['age_group'] = pd.cut(test['age'],bins,labels=labels)

In [43]:
combine = [train, test]  # train, test 가 같이 있는 combine 이라는 그룹을 생성 

for dataset in combine:
    dataset['Title'] = dataset.name.str.extract(' ([A-Za-z]+)\.', expand =False)
    # 데이터에서 각 이름에 앞 글자들만 떼서 가져옴
pd.crosstab(train['Title'], train['gender'])

gender,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,3
Countess,1,0
Don,0,1
Dr,1,4
Jonkheer,0,1
Major,0,2
Master,0,42
Miss,185,0
Mlle,1,0


In [44]:
for dataset in combine:      # 위에 가져온 이름들을 통상적인 단어로 바꾸기
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Capt', 'Col',
    'Don', 'Dr', 'Major', 'Rev', 'Jonkheer', 'Dona'], 'Rare')
    
    dataset['Title'] = dataset['Title'].replace(['Countess', 'Lady', 'Sir'], 'Royal')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

train[['Title', 'survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,survived
0,Master,0.285714
1,Miss,0.807487
2,Mr,0.109848
3,Mrs,0.869565
4,Rare,0.157895
5,Royal,1.0


In [45]:
# map 함수를 사용하기 위해 번호 할당하기
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Royal": 5, "Rare": 6}
for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)

train.head()

Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked,age_group,Title
0,0,0,2,"Wheeler, Mr. Edwin Frederick""""",male,-0.5,0,0,SC/PARIS 2159,12.875,,S,unknown,1
1,1,0,3,"Henry, Miss. Delia",female,-0.5,0,0,382649,7.75,,Q,unknown,2
2,2,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gr...",female,52.0,1,1,12749,93.5,B69,S,middle age,3
3,3,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27.0,0,0,350043,7.7958,,S,adult,1
4,4,0,2,"Hold, Mr. Stephen",male,44.0,1,0,26707,26.0,,S,middle age,1


In [46]:
mr_age = train [train["Title"] == 1]["age_group"].mode() # adult
miss_age = train[train["Title"] ==2]["age_group"].mode() # student
mrs_age = train[train["Title"] ==3]["age_group"].mode() #middle age
master_age = train[train["Title"]==4]["age_group"].mode() #baby
royal_age = train[train["Title"]==5]["age_group"].mode() # middle age
rare_age = train[train["Title"]==6]["age_group"].mode() # middle age

age_title_mapping = {1:"adult", 2:"student", 3: "middle age", 4:"baby",
                      5: "middle age", 6: "senior"}

# train과 test 그룹의 unknown을 맵핑해서 없애기
for i in range(len(train["age_group"])):
    if train["age_group"][i] == "unknown":
        train["age_group"][i] = age_title_mapping[train["Title"][i]]
        
for i in range(len(test["age_group"])):
    if test["age_group"][i] == "unknown":
        test["age_group"][i] = age_title_mapping[test["Title"][i]]

In [47]:
age_mapping = {'baby':1, 'child':2, 'teenage':3, 'student':4,'adult':5,
               'middle age':6,'senior':7}
train["age_group"] = train['age_group'].map(age_mapping)
test["age_group"] = test['age_group'].map(age_mapping)

train = train.drop(['age'], axis=1)
test = test.drop(['age'], axis=1)

In [48]:
sex_mapping = {"male":0, "female":1}    # 성별 매핑

train['gender'] = train["gender"].map(sex_mapping)
test['gender'] = test["gender"].map(sex_mapping)

In [49]:
embarked_mapping = {"S":1, "C":2, "Q":3 }  # 탑승지 매핑

train["embarked"] = train["embarked"].map(embarked_mapping)
test["embarked"] =test["embarked"].map(embarked_mapping)

train.head()

Unnamed: 0,passengerid,survived,pclass,name,gender,sibsp,parch,ticket,fare,cabin,embarked,age_group,Title
0,0,0,2,"Wheeler, Mr. Edwin Frederick""""",0,0,0,SC/PARIS 2159,12.875,,1.0,5.0,1
1,1,0,3,"Henry, Miss. Delia",1,0,0,382649,7.75,,3.0,4.0,2
2,2,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gr...",1,1,1,12749,93.5,B69,1.0,6.0,3
3,3,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",0,0,0,350043,7.7958,,1.0,5.0,1
4,4,0,2,"Hold, Mr. Stephen",0,1,0,26707,26.0,,1.0,6.0,1


In [50]:
train = train.drop(['name','ticket','cabin'],axis =1) # name, ticket, cabin 제거
test = test.drop(['name','ticket','cabin'],axis =1)
train.head()

Unnamed: 0,passengerid,survived,pclass,gender,sibsp,parch,fare,embarked,age_group,Title
0,0,0,2,0,0,0,12.875,1.0,5.0,1
1,1,0,3,1,0,0,7.75,3.0,4.0,2
2,2,1,1,1,1,1,93.5,1.0,6.0,3
3,3,1,3,0,0,0,7.7958,1.0,5.0,1
4,4,0,2,0,1,0,26.0,1.0,6.0,1


In [51]:
#test.groupby('pclass').mean()['fare'] # fare에 결측치가 하나잇어서 확인

In [52]:

from sklearn.preprocessing import OneHotEncoder
train = pd.get_dummies(data = train, columns=['pclass','embarked','gender','age_group','sibsp'])
test = pd.get_dummies(data = test, columns=['pclass','embarked','gender','age_group','sibsp'])

# dummie 쓰지말고 인코딩으로 다시 바꾸기

In [53]:
from sklearn.model_selection import train_test_split

predictors = train.drop(['survived', 'passengerid'], axis =1) #feature
target = train["survived"]
x_train, x_val, y_train, y_val = train_test_split(predictors, target, test_size = 0.2, random_state =42)

In [54]:
from sklearn.tree import DecisionTreeClassifier                # DT
from sklearn.metrics import accuracy_score

decisiontree = DecisionTreeClassifier(random_state=42)
decisiontree.fit(x_train, y_train)
y_pred = decisiontree.predict(x_val)
acc_decisiontree = round(accuracy_score(y_pred, y_val)*100,2)
acc_decisiontree

79.35

In [55]:
from sklearn.ensemble import RandomForestClassifier            # RF

rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)
predrf = rf.predict(x_val)

rf.score(x_val,y_val)

0.8206521739130435

In [56]:
#submission = pd.read_csv('./submission.csv')
#submission['survived'] = preds
#submission.to_csv('submission.csv', header = True, index = False)

In [None]:
# !pip install lightgbm

In [58]:
from lightgbm import LGBMRegressor, LGBMClassifier   # lgbm
lgbm = LGBMClassifier(random_state=42)
lgbm.fit(x_train, y_train)
preds = lgbm.predict(x_val)

lgbm.score(x_train, y_train), lgbm.score(x_val, y_val)

(0.9412568306010929, 0.8369565217391305)

In [59]:
x_train.head()

Unnamed: 0,parch,fare,Title,pclass_1,pclass_2,pclass_3,embarked_1.0,embarked_2.0,embarked_3.0,gender_0,...,age_group_5.0,age_group_6.0,age_group_7.0,sibsp_0,sibsp_1,sibsp_2,sibsp_3,sibsp_4,sibsp_5,sibsp_8
25,0,7.7958,1,0,0,1,1,0,0,1,...,1,0,0,1,0,0,0,0,0,0
84,0,77.9583,2,1,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
10,1,81.8583,3,1,0,0,1,0,0,0,...,0,1,0,0,1,0,0,0,0,0
797,0,13.0,1,0,1,0,1,0,0,1,...,0,0,0,1,0,0,0,0,0,0
451,0,227.525,1,1,0,0,0,1,0,1,...,1,0,0,1,0,0,0,0,0,0


In [61]:
test.set_index(['passengerid'], inplace=True)  # 컬럼수를 맞추기 위해서 
print(f'{test.shape}')
test.head()

(393, 25)


Unnamed: 0_level_0,parch,fare,Title,pclass_1,pclass_2,pclass_3,embarked_1.0,embarked_2.0,embarked_3.0,gender_0,...,age_group_5.0,age_group_6.0,age_group_7.0,sibsp_0,sibsp_1,sibsp_2,sibsp_3,sibsp_4,sibsp_5,sibsp_8
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
916,0,8.0292,2,0,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
917,0,13.0,3,0,1,0,1,0,0,0,...,1,0,0,1,0,0,0,0,0,0
918,0,7.75,2,0,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
919,0,7.25,1,0,0,1,1,0,0,1,...,1,0,0,1,0,0,0,0,0,0
920,1,164.8667,3,1,0,0,1,0,0,0,...,0,1,0,0,1,0,0,0,0,0


In [62]:
test.isnull().sum()    # 실행했을때 결측치가 있다는 오류가 나와서 확인하기 위함

parch            0
fare             1
Title            0
pclass_1         0
pclass_2         0
pclass_3         0
embarked_1.0     0
embarked_2.0     0
embarked_3.0     0
gender_0         0
gender_1         0
age_group_1.0    0
age_group_2.0    0
age_group_3.0    0
age_group_4.0    0
age_group_5.0    0
age_group_6.0    0
age_group_7.0    0
sibsp_0          0
sibsp_1          0
sibsp_2          0
sibsp_3          0
sibsp_4          0
sibsp_5          0
sibsp_8          0
dtype: int64

In [63]:
test[test['fare'].isna()]    # fare에 결측치가 하나잇어서 확인

Unnamed: 0_level_0,parch,fare,Title,pclass_1,pclass_2,pclass_3,embarked_1.0,embarked_2.0,embarked_3.0,gender_0,...,age_group_5.0,age_group_6.0,age_group_7.0,sibsp_0,sibsp_1,sibsp_2,sibsp_3,sibsp_4,sibsp_5,sibsp_8
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1121,0,,1,0,0,1,1,0,0,1,...,0,0,1,1,0,0,0,0,0,0


In [65]:
test['fare'].fillna({1121:12.690590},inplace=True)  # fare 결측값을 평균으로 대체

In [66]:
from sklearn.linear_model import LogisticRegression      # 로지스틱 회귀분석
from sklearn.datasets import load_breast_cancer

In [67]:
logreg = LogisticRegression().fit(x_train, y_train)
print(f'{logreg.score(x_train, y_train)} / {logreg.score(x_val, y_val)}')

0.8729508196721312 / 0.8478260869565217


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [68]:
submission = pd.read_csv('./submission.csv')
test_pred = logreg.predict_proba(test)[:,1]
submission['survived'] = test_pred
submission.to_csv('submission.csv', header=True, index=False)