## 데이터 및 전처리

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, train_test_split

In [2]:
train = pd.read_csv("data/titanic_train.csv")
test = pd.read_csv("data/titanic_test.csv")

In [3]:
print(train.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [4]:
#전처리를 해야할 문자열이 있는지 확인
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
train["Age"] = train["Age"].fillna(train["Age"].median())
test["Age"] = test["Age"].fillna(test["Age"].median())
test["Fare"] = test["Fare"].fillna(test["Fare"].median())
train["Embarked"] = train["Embarked"].fillna("S")

In [6]:
train["Sex"][train["Sex"] == "male"] = 0
train["Sex"][train["Sex"] == "female"] = 1
test["Sex"][test["Sex"] == "male"] = 0
test["Sex"][test["Sex"] == "female"] = 1

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  train["Sex"][train["Sex"] == "male"] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["Sex"][train["Se

In [7]:
train["Embarked"][train["Embarked"] == "S"] = 0
train["Embarked"][train["Embarked"] == "C"] = 1
train["Embarked"][train["Embarked"] == "Q"] = 2
test["Embarked"][test["Embarked"] == "S"] = 0
test["Embarked"][test["Embarked"] == "C"] = 1
test["Embarked"][test["Embarked"] == "Q"] = 2

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  train["Embarked"][train["Embarked"] == "S"] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["Embarked

In [8]:
y = train.loc[:, ["Survived"]] #y는 답안지
x = train.loc[:, ["Sex", "Age", "Pclass", "SibSp", "Fare", "Embarked"]]
print(x, y)

    Sex   Age  Pclass  SibSp     Fare Embarked
0     0  22.0       3      1   7.2500        0
1     1  38.0       1      1  71.2833        1
2     1  26.0       3      0   7.9250        0
3     1  35.0       1      1  53.1000        0
4     0  35.0       3      0   8.0500        0
..   ..   ...     ...    ...      ...      ...
886   0  27.0       2      0  13.0000        0
887   1  19.0       1      0  30.0000        0
888   1  28.0       3      1  23.4500        0
889   0  26.0       1      0  30.0000        1
890   0  32.0       3      0   7.7500        2

[891 rows x 6 columns]      Survived
0           0
1           1
2           1
3           1
4           0
..        ...
886         0
887         1
888         0
889         1
890         0

[891 rows x 1 columns]


## 데이터 분리 및 전처리

In [9]:
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, shuffle=True, random_state=0
)
print(x_train, y_train)
print(x_test, y_test)

    Sex   Age  Pclass  SibSp     Fare Embarked
857   0  51.0       1      0  26.5500        0
52    1  49.0       1      1  76.7292        1
386   0   1.0       3      5  46.9000        0
124   0  54.0       1      0  77.2875        0
578   1  28.0       3      1  14.4583        1
..   ..   ...     ...    ...      ...      ...
835   1  39.0       1      1  83.1583        1
192   1  19.0       3      1   7.8542        0
629   0  28.0       3      0   7.7333        2
559   1  36.0       3      1  17.4000        0
684   0  60.0       2      1  39.0000        0

[623 rows x 6 columns]      Survived
857         1
52          1
386         0
124         0
578         0
..        ...
835         1
192         1
629         0
559         1
684         0

[623 rows x 1 columns]
    Sex   Age  Pclass  SibSp      Fare Embarked
495   0  28.0       3      0   14.4583        1
648   0  28.0       3      0    7.5500        0
278   0   7.0       3      4   29.1250        2
31    1  28.0       1      1

In [10]:
x_train = x_train.astype("float32")
y_train = y_train.astype("float32")
x_test = x_test.astype("float32")
y_test = y_test.astype("float32")

In [11]:
rf = RandomForestClassifier()

In [12]:
rf.fit(x_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [13]:
rf_pred = rf.predict(x_test)

In [14]:
rf_pred

array([0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 0., 0.,
       0., 0., 1., 0., 1., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0.,
       1., 1., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0.,
       1., 1., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 1., 0., 0., 1., 0., 1., 1., 0., 0., 1., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 1., 1., 0., 1., 1.,
       0., 1., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0.,
       1., 1., 0., 0., 0.

In [15]:
accuracy_score(y_test, rf_pred)

0.8246268656716418

In [16]:
#분류는 정확도 보다 이게 더 중요함
confusion_matrix(y_test, rf_pred)

array([[148,  20],
       [ 27,  73]], dtype=int64)

[[148,  20],
[ 27,  73]]
-산다고 했는데 산 사람: 148
-죽는다고 했는데 산 사람: 27
-산다고 했는데 죽은 사람: "20"
-죽는다고 했는데 죽은 사람: 73

In [18]:
svc = SVC()
svc.fit(x_train, y_train)
svc_pred = svc.predict(x_test)

  y = column_or_1d(y, warn=True)


In [19]:
accuracy_score(y_test, svc_pred)

0.7164179104477612

In [20]:
confusion_matrix(y_test, svc_pred)

array([[160,   8],
       [ 68,  32]], dtype=int64)

[[160,  8],
[ 68,  32]]
-산다고 했는데 산 사람: 160
-죽는다고 했는데 산 사람: 68
-산다고 했는데 죽은 사람: "8"
-죽는다고 했는데 죽은 사람: 32