타이타닉 분석

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import LabelEncoder, StandardScaler 
#범주형 데이터(예: 문자열 레이블)를 숫자로 변환하는 데 사용
#데이터의 특성(feature)을 표준화(Standardization)하는 데 사용

from sklearn.neighbors import KNeighborsClassifier
#K-Nearest Neighbors (KNN) 알고리즘을 사용하는 분류 모델

from sklearn.metrics import accuracy_score, confusion_matrix
#분류 모델의 성능을 평가하기 위해 혼동 행렬(Confusion Matrix)을 계산

In [7]:
file_path = "D:/files/titanic.csv"

df = pd.read_csv(file_path)

df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [35]:
#결측치 확인
df.isnull().sum() 

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
dtype: int64

In [33]:
# 'age' 열의 결측치를 평균값으로 채우기
age_mean = df['Age'].mean()  # 'age' 열의 평균값 계산
df['Age'] = df['Age'].fillna(age_mean)  # 결측치를 평균값으로 채움

In [None]:
#age의 결측치가 사라진 걸 확인 - 평균갑으로 대체되었기 때문
df.isnull().sum() 

In [22]:
# 레이블 갯수 확인 (숫자)
df['label'].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

In [24]:
# 불필요한 컬럼 제거
df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'], inplace=True)

df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.000000,1,0,7.2500
1,1,1,female,38.000000,1,0,71.2833
2,1,3,female,26.000000,0,0,7.9250
3,1,1,female,35.000000,1,0,53.1000
4,0,3,male,35.000000,0,0,8.0500
...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,13.0000
887,1,1,female,19.000000,0,0,30.0000
888,0,3,female,29.699118,1,2,23.4500
889,1,1,male,26.000000,0,0,30.0000


In [39]:
#label encoder로 문자를 숫자로 변경

print(df['Sex'].value_counts()) 

# LabelEncoder 객체 생성
label_encoder = LabelEncoder()

# 'Sex' 컬럼을 숫자로 변환
df['Sex'] = label_encoder.fit_transform(df['Sex'])

print(df['Sex'].value_counts())

Sex
1    577
0    314
Name: count, dtype: int64
Sex
1    577
0    314
Name: count, dtype: int64


In [41]:
#데이터 분할
X = df.drop('Survived', axis=1)
y = df['Survived']

# 훈련용 데이터와 테스트용 데이터로 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [70]:
# DT 모델 생성
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# 혼동 행렬 확인
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Accuracy: 73.88%
Confusion Matrix:
[[126  31]
 [ 39  72]]


In [72]:
# RF모델 생성 
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy: {accuracy_rf * 100:.2f}%")

# 혼동 행렬 계산
cm_rf = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix:")
print(cm_rf)

Accuracy: 80.22%
Confusion Matrix:
[[137  20]
 [ 33  78]]


In [64]:
# LR모델 생성
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f"Accuracy: {accuracy_lr * 100:.2f}%")

# 혼동 행렬 계산 및 출력
cm_lr = confusion_matrix(y_test, y_pred_lr)
print("Confusion Matrix:")
print(cm_lr)

Accuracy: 81.34%
Confusion Matrix:
[[139  18]
 [ 32  79]]


In [52]:
# SVM모델 생성 (커널은 RBF 사용)
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"Accuracy: {accuracy_svm * 100:.2f}%")

# 혼동 행렬 계산 및 출력
cm_svm = confusion_matrix(y_test, y_pred_svm)
print("Confusion Matrix:")
print(cm_svm)

Accuracy: 66.04%
Confusion Matrix:
[[148   9]
 [ 82  29]]


In [74]:
# KNN모델 생성
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"Accuracy: {accuracy_knn * 100:.2f}%")

# 혼동 행렬 계산 및 출력
cm_knn = confusion_matrix(y_test, y_pred_knn)
print("Confusion Matrix:")
print(cm_knn)

Accuracy: 69.78%
Confusion Matrix:
[[131  26]
 [ 55  56]]
