In [None]:
import seaborn as sns
import pandas as pd

# 타이타닉 데이터셋 로드
titanic_df = sns.load_dataset('titanic')
print(titanic_df.head())
print(titanic_df.shape)
print(titanic_df.columns)


   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  
(891, 15)
Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')


In [3]:

# Age 컬럼의 중앙값으로 결측치 채우기
titanic_df['age'].fillna(titanic_df['age'].median(), inplace=True)

# 결과 확인
print(titanic_df['age'].isnull().sum())
print(titanic_df.head())

0
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic_df['age'].fillna(titanic_df['age'].median(), inplace=True)


In [5]:
# 타이타닉 데이터셋 요약 통계량 조회
print(titanic_df.describe())


         survived      pclass         age       sibsp       parch        fare
count  891.000000  891.000000  891.000000  891.000000  891.000000  891.000000
mean     0.383838    2.308642   29.361582    0.523008    0.381594   32.204208
std      0.486592    0.836071   13.019697    1.102743    0.806057   49.693429
min      0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%      0.000000    2.000000   22.000000    0.000000    0.000000    7.910400
50%      0.000000    3.000000   28.000000    0.000000    0.000000   14.454200
75%      1.000000    3.000000   35.000000    1.000000    0.000000   31.000000
max      1.000000    3.000000   80.000000    8.000000    6.000000  512.329200


In [6]:

# Age 컬럼의 최빈값 계산 및 출력
age_mode = titanic_df['age'].mode()[0]
print(f"Mode of Age: {age_mode}")

Mode of Age: 28.0


In [None]:
# 결측치 확인
print(titanic_df.isnull().sum())

In [14]:
# Age 컬럼의 결측치를 머신러닝으로 채우기
# Age 컬럼을 제외한 나머지 컬럼을 사용하여 Age 예측 모델 학습
from sklearn.ensemble import RandomForestRegressor

titanic_df = sns.load_dataset('titanic')
age_df = titanic_df[['age', 'pclass', 'sex', 'sibsp', 'parch', 'fare', 'embarked']]
age_df = pd.get_dummies(age_df)
train_df = age_df[age_df['age'].notnull()]
test_df = age_df[age_df['age'].isnull()]

X_train = train_df.drop('age', axis=1)
y_train = train_df['age']
X_test = test_df.drop('age', axis=1)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

if not X_test.empty:
    predicted_ages = model.predict(X_test)
    titanic_df.loc[titanic_df['age'].isnull(), 'age'] = predicted_ages
else:
    print('No missing values in Age column to predict.')

# 결과 확인
print(titanic_df['age'].isnull().sum())
print(titanic_df.head())

0
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


In [16]:
print(titanic_df.to_markdown())

|     |   survived |   pclass | sex    |      age |   sibsp |   parch |     fare | embarked   | class   | who   | adult_male   | deck   | embark_town   | alive   | alone   |
|----:|-----------:|---------:|:-------|---------:|--------:|--------:|---------:|:-----------|:--------|:------|:-------------|:-------|:--------------|:--------|:--------|
|   0 |          0 |        3 | male   | 22       |       1 |       0 |   7.25   | S          | Third   | man   | True         | nan    | Southampton   | no      | False   |
|   1 |          1 |        1 | female | 38       |       1 |       0 |  71.2833 | C          | First   | woman | False        | C      | Cherbourg     | yes     | False   |
|   2 |          1 |        3 | female | 26       |       0 |       0 |   7.925  | S          | Third   | woman | False        | nan    | Southampton   | yes     | True    |
|   3 |          1 |        1 | female | 35       |       1 |       0 |  53.1    | S          | First   | woman | False        | C