In [1]:
from google.colab import drive
# Mount Google Drive at /content/data; the data paths configured later in
# this notebook all live under this mount point.
drive.mount('/content/data')

Mounted at /content/data


In [2]:
# 코렙 한글깨짐 방지
!apt -qq -y install fonts-nanum > /dev/null

# 데이터 시각화에 사용할 라이브러리
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns

fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font_name = fm.FontProperties(fname=fontpath).get_name() 
fm._rebuild()  


%config InlineBackend.figure_format = 'retina'

plt.rc('font', family=font_name)  
plt.rcParams['axes.unicode_minus'] = False





In [50]:
import easydict
import json
# Single container holding every notebook-wide setting
args = easydict.EasyDict()

# Path information (everything lives under one base folder on the mounted drive)
args.default_path = '/content/data/MyDrive/lecture/data/titanic/kaggle_competition/'
args.train_csv = f'{args.default_path}train.csv'
args.test_csv = f'{args.default_path}test.csv'
args.submission_csv = f'{args.default_path}submission.csv'
args.submission_csv_0220_1 = f'{args.default_path}submission_0220_1.csv'

# Variables used by the data analysis
args.random_state = 21

In [4]:
import pandas as pd 

In [5]:
# Read the training CSV, report its (rows, columns) and preview the first rows
train = pd.read_csv(args.train_csv)
print(train.shape)
train.head(5)

(916, 12)


Unnamed: 0,passengerid,survived,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,0,2,"Wheeler, Mr. Edwin Frederick""""",male,,0,0,SC/PARIS 2159,12.875,,S
1,1,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q
2,2,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gr...",female,52.0,1,1,12749,93.5,B69,S
3,3,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27.0,0,0,350043,7.7958,,S
4,4,0,2,"Hold, Mr. Stephen",male,44.0,1,0,26707,26.0,,S


In [6]:
# Read the test CSV, report its (rows, columns) and preview the first rows
test = pd.read_csv(args.test_csv)
print(test.shape)
test.head(5)

(393, 11)


Unnamed: 0,passengerid,pclass,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked
0,916,3,"McGowan, Miss. Anna ""Annie""",female,15.0,0,0,330923,8.0292,,Q
1,917,2,"Pinsky, Mrs. (Rosa)",female,32.0,0,0,234604,13.0,,S
2,918,3,"McCarthy, Miss. Catherine Katie""""",female,,0,0,383123,7.75,,Q
3,919,3,"Franklin, Mr. Charles (Charles Fardon)",male,,0,0,SOTON/O.Q. 3101314,7.25,,S
4,920,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,45.0,1,1,36928,164.8667,,S


In [7]:
# Read the submission template, report its (rows, columns) and preview it
submission = pd.read_csv(args.submission_csv)
print(submission.shape)
submission.head(5)

(393, 2)


Unnamed: 0,passengerid,survived
0,916,0.5
1,917,0.5
2,918,0.5
3,919,0.5
4,920,0.5


# 연습

In [9]:
# Fresh copies of both CSVs for the practice run below
ori_train = pd.read_csv(args.train_csv)
ori_test = pd.read_csv(args.test_csv)

print(ori_train.shape, ori_test.shape, sep=' / ')

(916, 12) / (393, 11)


In [10]:
# Number of missing values in each training column
ori_train.isna().sum()

passengerid      0
survived         0
pclass           0
name             0
gender           0
age            180
sibsp            0
parch            0
ticket           0
fare             0
cabin          718
embarked         1
dtype: int64

In [11]:
# Number of missing values in each test column
ori_test.isna().sum()

passengerid      0
pclass           0
name             0
gender           0
age             83
sibsp            0
parch            0
ticket           0
fare             1
cabin          296
embarked         1
dtype: int64

In [19]:
# Imputation statistics, all taken from the training frame:
# means for the numeric columns, most frequent value for the text columns.
age_mean, fare_mean = ori_train['age'].mean(), ori_train['fare'].mean()
cabin_mode = ori_train['cabin'].mode().iloc[0]
embarked_mode = ori_train['embarked'].mode().iloc[0]

age_mean, fare_mean, cabin_mode, embarked_mode

(29.69836956521739, 32.40271048034934, 'B57 B59 B63 B66', 'S')

In [20]:
# Replacement value for each column that can contain missing data.
fill_values = {
    'age': age_mean,
    'fare': fare_mean,
    'cabin': cabin_mode,
    'embarked': embarked_mode,
}

# Fill through DataFrame.fillna and rebind the result. The previous
# `df[col].fillna(value, inplace=True)` form is chained in-place assignment:
# pandas 2.x warns about it, and under Copy-on-Write it updates a temporary
# Series and leaves the frame unchanged. Only the listed columns are touched.
ori_train = ori_train.fillna(value=fill_values)
ori_test = ori_test.fillna(value=fill_values)

# Total remaining missing values in each frame (both should be 0)
ori_train.isnull().sum().sum(), ori_test.isnull().sum().sum()

(0, 0)

In [21]:
# Print the column dtypes and non-null counts of the training frame
ori_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 916 entries, 0 to 915
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  916 non-null    int64  
 1   survived     916 non-null    int64  
 2   pclass       916 non-null    int64  
 3   name         916 non-null    object 
 4   gender       916 non-null    object 
 5   age          916 non-null    float64
 6   sibsp        916 non-null    int64  
 7   parch        916 non-null    int64  
 8   ticket       916 non-null    object 
 9   fare         916 non-null    float64
 10  cabin        916 non-null    object 
 11  embarked     916 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 86.0+ KB


In [30]:
# Numeric columns used as model features
cols = ['pclass', 'age', 'sibsp', 'parch', 'fare']

# Label and feature matrix from the training frame
y = ori_train['survived']
X = ori_train[cols]

# Test features, keeping passengerid alongside them for now
test = ori_test[[*cols, 'passengerid']]

X.shape, y.shape, test.shape

((916, 5), (916,), (393, 6))

In [31]:
from sklearn.model_selection import train_test_split 

In [32]:
# Hold out 20% of the rows for validation. `y` is the `survived` column of
# ori_train, so stratifying on it keeps the label ratio equal in both parts.
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=args.random_state,
)

In [33]:
# Total missing values in each of the three feature frames
tuple(frame.isna().sum().sum() for frame in (X_tr, X_te, test))

(0, 0, 0)

In [34]:
# (rows, columns) of the train part, validation part and test frame
tuple(frame.shape for frame in (X_tr, X_te, test))

((732, 5), (184, 5), (393, 6))

In [35]:
# Preview the first rows of the test feature frame
test.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,passengerid
0,3,15.0,0,0,8.0292,916
1,2,32.0,0,0,13.0,917
2,3,29.69837,0,0,7.75,918
3,3,29.69837,0,0,7.25,919
4,1,45.0,1,1,164.8667,920


In [36]:
# Rebuild `test` from ori_test with passengerid as the index and only the
# feature columns left. The earlier in-place set_index mutated a frame sliced
# out of ori_test, which risks SettingWithCopyWarning, and failed with
# KeyError when the cell was run a second time. This form can be re-run.
test = ori_test.set_index('passengerid')[cols]

print(f'{test.shape}')
test.head()

(393, 5)


Unnamed: 0_level_0,pclass,age,sibsp,parch,fare
passengerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
916,3,15.0,0,0,8.0292
917,2,32.0,0,0,13.0
918,3,29.69837,0,0,7.75
919,3,29.69837,0,0,7.25
920,1,45.0,1,1,164.8667


In [37]:
# (rows, columns) of the train part, validation part and test frame
tuple(frame.shape for frame in (X_tr, X_te, test))

((732, 5), (184, 5), (393, 5))

In [38]:
# Total missing values in each of the three feature frames
tuple(frame.isna().sum().sum() for frame in (X_tr, X_te, test))

(0, 0, 0)

In [39]:
from sklearn.tree import DecisionTreeClassifier 

In [40]:
# Baseline decision tree with library defaults; only the seed is fixed, taken
# from the shared config.
modelV0 = DecisionTreeClassifier(random_state=args.random_state)

In [41]:
# Fit the tree on the training part of the split
modelV0.fit(X_tr, y_tr)

DecisionTreeClassifier(random_state=21)

In [42]:
from sklearn.metrics import roc_curve, auc 

In [43]:
# Keep column 1 of the predicted probabilities for the validation part.
# NOTE(review): presumably this is P(survived == 1), i.e. modelV0.classes_
# is [0, 1] -- confirm.
pred = modelV0.predict_proba(X_te)[:, 1]

In [44]:
# ROC curve points for the validation predictions; the third return value
# is not used.
fpr, tpr, _ = roc_curve(y_te, pred) 

In [45]:
# Area under the ROC curve computed above for the validation part
auc(fpr, tpr) 

0.5963032581453634

In [46]:
# Column 1 of the predicted probabilities for every row of the test frame.
# NOTE(review): presumably P(survived == 1) -- confirm against modelV0.classes_.
test_pred = modelV0.predict_proba(test)[:, 1]
# One score per test row is expected
test_pred.shape

(393,)

In [47]:
# Peek at the first three predicted scores
test_pred[:3]

array([0.        , 0.        , 0.46666667])

In [48]:
# Reload the submission template so it starts from its on-disk state
submission = pd.read_csv(args.submission_csv)
print(submission.shape)
submission.head(5)

(393, 2)


Unnamed: 0,passengerid,survived
0,916,0.5
1,917,0.5
2,918,0.5
3,919,0.5
4,920,0.5


In [49]:
# The scores are written by position, so first confirm that the submission
# rows are in the same passenger order as `test` (indexed by passengerid).
# Misaligned rows would otherwise get the wrong scores without any error.
assert list(submission['passengerid']) == list(test.index), \
    'submission rows are not in the same passengerid order as test'

submission['survived'] = test_pred
submission.head()

Unnamed: 0,passengerid,survived
0,916,0.0
1,917,0.0
2,918,0.466667
3,919,0.0
4,920,0.0


In [51]:
# Write the filled-in submission (header row kept, index column omitted)
submission.to_csv(args.submission_csv_0220_1, index=False)