In [9]:
import pandas as pd

# 1. 데이터 불러오기
train_df = pd.read_csv("train.csv")  # 파일 경로는 필요에 맞게 수정

# 1. age 열에 각 나이에 10을 더하는 add_10() 함수 적용
def add_10(x):
    return x + 10

train_df['age_add_10'] = train_df['Age'].apply(add_10)

# 2. age 열에 모든 나이에 5를 곱하는 람다 함수 적용
train_df['age_times_5'] = train_df['Age'].apply(lambda x: x * 5)

# 3. age와 fare 열에 대해 최댓값과 최솟값의 차이를 계산하는 함수 작성 및 적용
def range_diff(column):
    return column.max() - column.min()

age_range = range_diff(train_df['Age'])
fare_range = range_diff(train_df['Fare'])

# 결과 출력
print("=== age 열 변환 결과 (일부) ===")
print(train_df[['Age', 'age_add_10', 'age_times_5']].head(10))

print("\n=== age, fare의 최댓값-최솟값 차이 ===")
print(f"Age 차이: {age_range}")
print(f"Fare 차이: {fare_range}")

=== age 열 변환 결과 (일부) ===
    Age  age_add_10  age_times_5
0  22.0        32.0        110.0
1  38.0        48.0        190.0
2  26.0        36.0        130.0
3  35.0        45.0        175.0
4  35.0        45.0        175.0
5   NaN         NaN          NaN
6  54.0        64.0        270.0
7   2.0        12.0         10.0
8  27.0        37.0        135.0
9  14.0        24.0         70.0

=== age, fare의 최댓값-최솟값 차이 ===
Age 차이: 79.58
Fare 차이: 512.3292


In [10]:
import pandas as pd

# 데이터 불러오기
train_df = pd.read_csv("train.csv")
numeric_df = train_df.select_dtypes(include='number')

# 1. 평균이 30을 초과하는 열만 추출
mean_over_30_cols = numeric_df.columns[numeric_df.mean() > 30]
df_mean_over_30 = numeric_df[mean_over_30_cols]

print("=== 평균값이 30을 초과하는 수치형 컬럼만 추출한 데이터프레임 ===")
print(df_mean_over_30.head())

# 2. 각 행의 평균값으로 'Low'/'High' 분류하는 열 추가
row_means = numeric_df.mean(axis=1)
train_df['row_type'] = row_means.apply(lambda x: 'High' if x > 50 else 'Low')

print("\n=== 'row_type' 열 추가 결과 (일부) ===")
print(train_df[['row_type']].head(10))

=== 평균값이 30을 초과하는 수치형 컬럼만 추출한 데이터프레임 ===
   PassengerId     Fare
0            1   7.2500
1            2  71.2833
2            3   7.9250
3            4  53.1000
4            5   8.0500

=== 'row_type' 열 추가 결과 (일부) ===
  row_type
0      Low
1      Low
2      Low
3      Low
4      Low
5      Low
6      Low
7      Low
8      Low
9      Low


In [12]:
import pandas as pd

train_df = pd.read_csv("train.csv")

def fill_median(df):
    num_cols = df.select_dtypes(include='number').columns
    return df.copy().fillna(df[num_cols].median())

def count_missing(df):
    return df.isnull().sum().sum()

def extract_first_char(df):
    str_cols = df.select_dtypes(include='object').columns
    for col in str_cols:
        df[col + '_first'] = df[col].astype(str).str[0]
    return df

def preview_first_chars(df):
    first_cols = [col for col in df.columns if col.endswith('_first')]
    print("첫 글자 추출 샘플:\n", df[first_cols].head())
    return df

# 체이닝 예시
missing_count = train_df.pipe(fill_median)\
                        .pipe(count_missing)
print("총 결측치 개수:", missing_count)

train_df.pipe(fill_median)\
        .pipe(extract_first_char)\
        .pipe(preview_first_chars)

총 결측치 개수: 689
첫 글자 추출 샘플:
   Name_first Sex_first Ticket_first Cabin_first Embarked_first
0          B         m            A           n              S
1          C         f            P           C              C
2          H         f            S           n              S
3          F         f            1           C              S
4          A         m            3           n              S


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name_first,Sex_first,Ticket_first,Cabin_first,Embarked_first
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,B,m,A,n,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,C,f,P,C,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,H,f,S,n,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,F,f,1,C,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,A,m,3,n,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,M,m,2,n,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,G,f,1,B,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,,S,J,f,W,n,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,B,m,1,C,C


In [13]:
import pandas as pd

train_df = pd.read_csv("train.csv")

# 1. 알파벳순 정렬
cols_sorted = sorted(train_df.columns)
df_sorted = train_df[cols_sorted]
print("알파벳순 정렬 컬럼:", df_sorted.columns.tolist())

# 2-1. 역순 정렬
df_reversed = train_df[train_df.columns[::-1]]
print("역순 정렬 컬럼:", df_reversed.columns.tolist())

# 2-2. 사용자 정의 순서로 정렬
custom_order = ['Survived', 'Sex', 'Pclass', 'Age']  # 실제 컬럼명과 일치시켜야 함
existing_order = [col for col in custom_order if col in train_df.columns]
df_custom = train_df[existing_order]
print("사용자 정의 순서 컬럼:", df_custom.columns.tolist())


알파벳순 정렬 컬럼: ['Age', 'Cabin', 'Embarked', 'Fare', 'Name', 'Parch', 'PassengerId', 'Pclass', 'Sex', 'SibSp', 'Survived', 'Ticket']
역순 정렬 컬럼: ['Embarked', 'Cabin', 'Fare', 'Ticket', 'Parch', 'SibSp', 'Age', 'Sex', 'Name', 'Pclass', 'Survived', 'PassengerId']
사용자 정의 순서 컬럼: ['Survived', 'Sex', 'Pclass', 'Age']


In [14]:
import pandas as pd

train_df = pd.read_csv("train.csv")

# 1. Pclass별 데이터 개수, 평균 Age, 평균 Fare
grouped = train_df.groupby('Pclass').agg(
    count=('PassengerId', 'count'),
    age_mean=('Age', 'mean'),
    fare_mean=('Fare', 'mean')
)
print("=== Pclass별 데이터 개수, 평균 Age, 평균 Fare ===")
print(grouped)

# 2. Pclass와 Sex별 생존율
survival_rate = train_df.groupby(['Pclass', 'Sex'])['Survived'].mean()
print("\n=== Pclass, Sex별 생존율 ===")
print(survival_rate)

# 3. agg()로 Age(평균, 표준편차), Fare(최대, 최소)
agg_stats = train_df.groupby('Pclass').agg(
    age_mean=('Age', 'mean'),
    age_std=('Age', 'std'),
    fare_max=('Fare', 'max'),
    fare_min=('Fare', 'min')
)
print("\n=== Pclass별 Age와 Fare 집계 ===")
print(agg_stats)

=== Pclass별 데이터 개수, 평균 Age, 평균 Fare ===
        count   age_mean  fare_mean
Pclass                             
1         216  38.233441  84.154687
2         184  29.877630  20.662183
3         491  25.140620  13.675550

=== Pclass, Sex별 생존율 ===
Pclass  Sex   
1       female    0.968085
        male      0.368852
2       female    0.921053
        male      0.157407
3       female    0.500000
        male      0.135447
Name: Survived, dtype: float64

=== Pclass별 Age와 Fare 집계 ===
         age_mean    age_std  fare_max  fare_min
Pclass                                          
1       38.233441  14.802856  512.3292       0.0
2       29.877630  14.001077   73.5000       0.0
3       25.140620  12.495398   69.5500       0.0


In [15]:
import pandas as pd

train_df = pd.read_csv("train.csv")

# 1. Fare 누적합
train_df['Fare_cumsum'] = train_df.groupby('Pclass')['Fare'].cumsum()

# 2. Age z-score 표준화
train_df['Age_zscore'] = train_df.groupby('Pclass')['Age'].transform(
    lambda x: (x - x.mean()) / x.std()
)

# 3. 데이터 개수 200개 이상인 그룹만 필터링
group_counts = train_df.groupby('Pclass')['PassengerId'].transform('count')
filtered_df = train_df[group_counts >= 200]

print("=== Pclass별 Fare 누적합 ===\n", train_df[['Pclass', 'Fare_cumsum']].head())
print("=== Pclass별 Age z-score ===\n", train_df[['Pclass', 'Age', 'Age_zscore']].head())
print("=== 데이터 개수 200개 이상인 Pclass 그룹만 남긴 결과 ===\n", filtered_df['Pclass'].value_counts())

=== Pclass별 Fare 누적합 ===
    Pclass  Fare_cumsum
0       3       7.2500
1       1      71.2833
2       3      15.1750
3       1     124.3833
4       3      23.2250
=== Pclass별 Age z-score ===
    Pclass   Age  Age_zscore
0       3  22.0   -0.251342
1       1  38.0   -0.015770
2       3  26.0    0.068776
3       1  35.0   -0.218434
4       3  35.0    0.789041
=== 데이터 개수 200개 이상인 Pclass 그룹만 남긴 결과 ===
 Pclass
3    491
1    216
Name: count, dtype: int64


In [16]:
import pandas as pd

train_df = pd.read_csv("train.csv")

# 1. 멀티 인덱스 생성
df_multi = train_df.set_index(['Pclass', 'Sex'])
print(df_multi.head())

# 2. (Pclass=1, Sex=female) age 평균
age_mean = df_multi.loc[(1, 'female'), 'Age'].mean()
print("\n(Pclass=1, Sex=female)의 Age 평균:", age_mean)

# 3. xs()로 male 승객만 추출
male_df = df_multi.xs('male', level='Sex')
print("\nMale 승객 데이터 샘플:")
print(male_df.head())

               PassengerId  Survived  \
Pclass Sex                             
3      male              1         0   
1      female            2         1   
3      female            3         1   
1      female            4         1   
3      male              5         0   

                                                            Name   Age  SibSp  \
Pclass Sex                                                                      
3      male                              Braund, Mr. Owen Harris  22.0      1   
1      female  Cumings, Mrs. John Bradley (Florence Briggs Th...  38.0      1   
3      female                             Heikkinen, Miss. Laina  26.0      0   
1      female       Futrelle, Mrs. Jacques Heath (Lily May Peel)  35.0      1   
3      male                             Allen, Mr. William Henry  35.0      0   

               Parch            Ticket     Fare Cabin Embarked  
Pclass Sex                                                      
3      male        0 

  age_mean = df_multi.loc[(1, 'female'), 'Age'].mean()


In [18]:
import pandas as pd

train_df = pd.read_csv("train.csv")

# 1. 피벗테이블 생성
pivot_age = pd.pivot_table(train_df, index='Pclass', columns='Sex', values='Age', aggfunc='mean')
print(pivot_age)

# 2. stack/unstack 변환
pivot_stacked = pivot_age.stack()
print(pivot_stacked.head())
pivot_unstacked = pivot_stacked.unstack()
print(pivot_unstacked.head())

# 3. melt로 Wide → Long 변환
pivot_reset = pivot_age.reset_index()
pivot_long = pd.melt(pivot_reset, id_vars=['Pclass'], value_vars=['female', 'male'],
                     var_name='Sex', value_name='Age_mean')
print(pivot_long.head())

Sex        female       male
Pclass                      
1       34.611765  41.281386
2       28.722973  30.740707
3       21.750000  26.507589
Pclass  Sex   
1       female    34.611765
        male      41.281386
2       female    28.722973
        male      30.740707
3       female    21.750000
dtype: float64
Sex        female       male
Pclass                      
1       34.611765  41.281386
2       28.722973  30.740707
3       21.750000  26.507589
   Pclass     Sex   Age_mean
0       1  female  34.611765
1       2  female  28.722973
2       3  female  21.750000
3       1    male  41.281386
4       2    male  30.740707
