<a href="https://colab.research.google.com/github/zzhining/python_ml_dl2/blob/main/answer/2_8_exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[데이터 준비]

In [1]:
from sklearn.datasets import make_classification, make_regression, make_blobs
import pandas as pd
import numpy as np

def create_classification_data(random_state=42):
    """Build a synthetic classification dataset with missing values in feature_1.

    Generates 100 samples with 5 features via ``make_classification``, stores
    them in a DataFrame together with the class label, and then blanks out 10%
    of the ``feature_1`` values so the dataset can be used to practise
    missing-value handling.

    Args:
        random_state: Integer seed (or None for a non-deterministic run) used
            both for generating the data and for choosing which rows lose
            their ``feature_1`` value. Defaults to 42, the seed the data
            generation already used.

    Returns:
        pandas.DataFrame with columns ``feature_1`` .. ``feature_5`` and
        ``target``; 10 of the 100 ``feature_1`` entries are NaN.
    """
    X, y = make_classification(n_samples=100, n_features=5, random_state=random_state)
    df = pd.DataFrame(X, columns=[f"feature_{i+1}" for i in range(X.shape[1])])
    df['target'] = y

    # Inject NaN into 10% of the feature_1 values. A seeded Generator replaces
    # the global np.random state so the same rows are blanked on every run.
    rng = np.random.default_rng(random_state)
    missing_indices = rng.choice(df.index.to_numpy(), size=int(len(df) * 0.1), replace=False)
    df.loc[missing_indices, 'feature_1'] = np.nan
    return df

def create_regression_data(random_state=42):
    """Build a synthetic regression dataset with injected high-side outliers.

    Generates 100 samples with 5 features via ``make_regression`` and, for
    every feature column, overwrites 5 randomly chosen rows with values lying
    between 1 and 10 units above that column's upper IQR fence
    (``Q3 + 1.5 * IQR``), so the dataset can be used to practise outlier
    handling.

    Args:
        random_state: Integer seed (or None for a non-deterministic run) used
            both for generating the data and for choosing the outlier rows and
            values. Defaults to 42, the seed the data generation already used.

    Returns:
        pandas.DataFrame with columns ``feature_1`` .. ``feature_5`` and
        ``target``.
    """
    X, y = make_regression(n_samples=100, n_features=5, noise=0.1, random_state=random_state)
    df = pd.DataFrame(X, columns=[f"feature_{i+1}" for i in range(X.shape[1])])
    df['target'] = y

    # A seeded Generator replaces the global np.random state so the injected
    # outliers are identical on every run.
    rng = np.random.default_rng(random_state)
    n_outliers = 5

    # Every column except the trailing 'target' receives outliers.
    for column in df.columns[:-1]:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        upper_bound = Q3 + 1.5 * IQR

        # Pick distinct rows and overwrite them in one vectorised assignment
        # with values strictly above the upper fence.
        outlier_indices = rng.choice(df.index.to_numpy(), size=n_outliers, replace=False)
        df.loc[outlier_indices, column] = rng.uniform(upper_bound + 1, upper_bound + 10, size=n_outliers)

    return df

def create_blobs_data():
    """Build a synthetic clustering dataset of 100 samples around 3 centers.

    Returns:
        pandas.DataFrame with columns ``feature_1`` .. ``feature_5`` holding
        the ``make_blobs`` coordinates and a ``target`` column holding the
        label of the center each sample was generated from.
    """
    features, labels = make_blobs(n_samples=100, centers=3, n_features=5, random_state=42)
    column_names = [f"feature_{n}" for n in range(1, features.shape[1] + 1)]
    return pd.DataFrame(features, columns=column_names).assign(target=labels)

# Build the three practice datasets once; the exercises below work on them.
classification_df = create_classification_data()
regression_df = create_regression_data()
blobs_df = create_blobs_data()

# Preview the first five rows of each dataset, in creation order.
for preview_df in (classification_df, regression_df, blobs_df):
    display(preview_df.head())

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,target
0,-0.430668,0.672873,-0.72428,-0.53963,-0.6516,0
1,0.211646,-0.843897,0.534794,0.825848,0.681953,1
2,1.092675,0.409106,1.100096,-0.942751,-0.981509,0
3,1.519901,-0.773361,1.998053,0.155132,-0.385314,0
4,-0.453901,-2.183473,0.244724,2.591239,-0.484234,1


Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,target
0,0.97512,-0.677162,-0.012247,-0.897254,0.075805,-57.19576
1,0.081874,-0.485364,0.758969,-0.772825,-0.236819,-46.546477
2,-1.412304,-0.908024,-0.562288,-1.012831,0.314247,-258.13344
3,-0.64512,0.361636,1.35624,-0.07201,1.003533,115.850751
4,-0.6227,0.280992,-1.952088,-0.151785,0.588317,-123.767712


Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,target
0,-3.00263,9.937449,6.346484,2.846759,-6.870483,0
1,-5.970371,-9.040785,7.38442,2.669463,4.287546,1
2,-2.290559,9.896047,3.630793,0.389875,-6.105927,0
3,-3.463695,8.263107,3.509451,2.743147,-5.611238,0
4,-1.888649,8.853349,4.251614,1.087657,-7.236372,0


# 문제1
주어진 `classification_df` 데이터셋에서 `feature_1` 열에 일부 결측치가 포함되어 있다. 결측치를 평균값으로 대체하시오.

In [2]:
# Solution: impute the missing feature_1 values with the column mean.

missing_before = classification_df.isna().sum().sum()
print('[전처리 전]결측치 수:', missing_before)

# Series.mean() skips NaN by default, so the mean is taken over observed values only.
feature_1_mean = classification_df['feature_1'].mean()
classification_df['feature_1'] = classification_df['feature_1'].fillna(feature_1_mean)

missing_after = classification_df.isna().sum().sum()
print('[전처리 후]결측치 수:', missing_after)

[전처리 전]결측치 수: 10
[전처리 후]결측치 수: 0


# 문제 2
주어진 `regression_df` 데이터셋에서 `feature_3`의 값에 대해 IQR 기반 이상치를 삭제하시오.
- Q1: 낮은 순에서 높은 순으로 정렬한 후 4등분했을 때 25%에 해당하는 값(1사분위수)
- Q3: 낮은 순에서 높은 순으로 정렬한 후 4등분했을 때 75%에 해당하는 값(3사분위수)
- IQR: 1사분위수(Q1)와 3사분위수(Q3) 사이의 거리(IQR = Q3 - Q1)
- 이상치: Q1 - 1.5 × IQR보다 작거나 Q3 + 1.5 × IQR보다 큰 값, 즉 내부 울타리(inner fence) 범위를 벗어난 데이터



In [3]:
# Solution: mark feature_3 values outside the IQR fences as missing, then drop those rows.
print('이상치 처리 전 결측치 수:', regression_df['feature_3'].isna().sum())

# Both quartiles in a single call; unpacking yields the 25% and 75% values in order.
Q1, Q3 = regression_df['feature_3'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# A value is an outlier when it falls below the lower fence or above the upper fence.
below_fence = regression_df['feature_3'].lt(lower_bound)
above_fence = regression_df['feature_3'].gt(upper_bound)
regression_df.loc[below_fence | above_fence, 'feature_3'] = None

print('이상치 처리 후 결측치 수:', regression_df['feature_3'].isna().sum())

# Remove the flagged rows in place; the remaining rows keep their original index labels.
regression_df.dropna(subset=['feature_3'], inplace=True)
print('결측치 삭제:', regression_df['feature_3'].isna().sum())

이상치 처리 전 결측치 수: 0
이상치 처리 후 결측치 수: 5
결측치 삭제: 0


# 문제 3
두 개의 데이터프레임 `classification_df`와 `regression_df`를 수평으로 결합하여 하나의 데이터프레임(`concat_df`)을 생성하시오.


In [4]:
# Solution: place the two frames side by side. Along the column axis pd.concat
# aligns rows on their index labels, and identically named columns are kept as-is.
frames_to_join = [classification_df, regression_df]
concat_df = pd.concat(frames_to_join, axis='columns')
concat_df.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,target,feature_1.1,feature_2.1,feature_3.1,feature_4.1,feature_5.1,target.1
0,-0.430668,0.672873,-0.72428,-0.53963,-0.6516,0,0.97512,-0.677162,-0.012247,-0.897254,0.075805,-57.19576
1,0.211646,-0.843897,0.534794,0.825848,0.681953,1,0.081874,-0.485364,0.758969,-0.772825,-0.236819,-46.546477
2,1.092675,0.409106,1.100096,-0.942751,-0.981509,0,-1.412304,-0.908024,-0.562288,-1.012831,0.314247,-258.13344
3,1.519901,-0.773361,1.998053,0.155132,-0.385314,0,-0.64512,0.361636,1.35624,-0.07201,1.003533,115.850751
4,-0.453901,-2.183473,0.244724,2.591239,-0.484234,1,-0.6227,0.280992,-1.952088,-0.151785,0.588317,-123.767712


# 문제 4

`classification_df`와 `blobs_df`를 `target` 열을 기준으로 내부 조인(inner join) 하여 하나의 데이터프레임(`merged_df`)을 생성하시오.

In [5]:
# Solution: inner join on 'target'. Every classification row is paired with every
# blobs row that has the same target value; overlapping feature names receive the
# _x (left) and _y (right) suffixes.
merged_df = classification_df.merge(blobs_df, on='target', how='inner')
merged_df.head()

Unnamed: 0,feature_1_x,feature_2_x,feature_3_x,feature_4_x,feature_5_x,target,feature_1_y,feature_2_y,feature_3_y,feature_4_y,feature_5_y
0,-0.430668,0.672873,-0.72428,-0.53963,-0.6516,0,-3.00263,9.937449,6.346484,2.846759,-6.870483
1,-0.430668,0.672873,-0.72428,-0.53963,-0.6516,0,-2.290559,9.896047,3.630793,0.389875,-6.105927
2,-0.430668,0.672873,-0.72428,-0.53963,-0.6516,0,-3.463695,8.263107,3.509451,2.743147,-5.611238
3,-0.430668,0.672873,-0.72428,-0.53963,-0.6516,0,-1.888649,8.853349,4.251614,1.087657,-7.236372
4,-0.430668,0.672873,-0.72428,-0.53963,-0.6516,0,-2.768226,8.439577,4.218381,2.312991,-6.887007


# 문제 5
주어진 `classification_df` 데이터셋에서 `target` 변수를 기준으로 각 그룹의 feature_1, feature_2, feature_3의 평균값을 구하시오.


In [6]:
# Solution: per-target mean of the first three features.
mean_columns = ['feature_1', 'feature_2', 'feature_3']
rows_by_target = classification_df.groupby('target')
grouped_df = rows_by_target[mean_columns].mean()

grouped_df.head()

Unnamed: 0_level_0,feature_1,feature_2,feature_3
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.095528,1.006369,-0.181439
1,-0.189894,-0.929705,0.083742


# 문제 6
주어진 `blobs_df` 데이터셋에서 `target` 변수를 기준으로 One-Hot Encoding을 적용하여 새로운 컬럼을 생성하시오.

In [7]:
# Solution: replace the 'target' column with one indicator column per class
# (target_0, target_1, target_2).
encoded_columns = ['target']
blobs_df = pd.get_dummies(data=blobs_df, prefix='target', columns=encoded_columns)
blobs_df.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,target_0,target_1,target_2
0,-3.00263,9.937449,6.346484,2.846759,-6.870483,True,False,False
1,-5.970371,-9.040785,7.38442,2.669463,4.287546,False,True,False
2,-2.290559,9.896047,3.630793,0.389875,-6.105927,True,False,False
3,-3.463695,8.263107,3.509451,2.743147,-5.611238,True,False,False
4,-1.888649,8.853349,4.251614,1.087657,-7.236372,True,False,False


# 문제 7

주어진 `blobs_df` 데이터셋에서 신규 feature(`feature_6`, `feature_7`)를 생성하시오.
- `feature_6` = `feature_1`과 `feature_2`의 합
- `feature_7` = `feature_3` ~ `feature_5`의 평균값

In [8]:
# Solution: derive two extra features from the existing ones.

# feature_6: element-wise sum of feature_1 and feature_2
blobs_df['feature_6'] = blobs_df['feature_1'].add(blobs_df['feature_2'])

# feature_7: row-wise mean of feature_3, feature_4 and feature_5
averaged_columns = ['feature_3', 'feature_4', 'feature_5']
blobs_df['feature_7'] = blobs_df[averaged_columns].mean(axis='columns')

blobs_df.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,target_0,target_1,target_2,feature_6,feature_7
0,-3.00263,9.937449,6.346484,2.846759,-6.870483,True,False,False,6.934819,0.774253
1,-5.970371,-9.040785,7.38442,2.669463,4.287546,False,True,False,-15.011157,4.780476
2,-2.290559,9.896047,3.630793,0.389875,-6.105927,True,False,False,7.605488,-0.695086
3,-3.463695,8.263107,3.509451,2.743147,-5.611238,True,False,False,4.799412,0.213787
4,-1.888649,8.853349,4.251614,1.087657,-7.236372,True,False,False,6.964699,-0.632367


# 문제 8
주어진 `regression_df` 데이터셋에서 모든 `feature`들을 0과 1 사이로 스케일링 하시오.

In [9]:
# Solution: rescale every feature column of regression_df into the [0, 1] range.
from sklearn.preprocessing import MinMaxScaler

# feature_1 .. feature_5; 'target' is left unscaled.
minmax_columns = [f'feature_{n}' for n in range(1, 6)]

print('변환 전')
display(regression_df.describe())

scaler = MinMaxScaler()
minmax_values = scaler.fit_transform(regression_df[minmax_columns])
regression_df[minmax_columns] = minmax_values

print('변환 후')
display(regression_df.describe())

변환 전


Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,target
count,95.0,95.0,95.0,95.0,95.0,95.0
mean,0.498322,0.496836,-0.048719,0.442204,0.253043,10.150228
std,2.353308,1.914733,0.908462,2.176527,1.878038,139.526772
min,-1.987569,-1.91328,-1.952088,-2.301921,-3.241267,-341.184598
25%,-0.63391,-0.527758,-0.739427,-0.622983,-0.691039,-82.691107
50%,-0.020902,0.232254,-0.062679,0.068563,0.045572,3.484676
75%,0.79358,0.813513,0.545119,0.907501,0.716777,103.947827
max,12.504689,11.319537,2.314659,12.377706,10.54852,399.561141


변환 후


Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,target
count,95.0,95.0,95.0,95.0,95.0,95.0
mean,0.171532,0.182132,0.446094,0.186934,0.253398,10.150228
std,0.162384,0.144696,0.212917,0.148269,0.13619,139.526772
min,0.0,0.0,0.0,0.0,0.0,-341.184598
25%,0.093406,0.104704,0.284212,0.114372,0.184936,-82.691107
50%,0.135705,0.162137,0.442822,0.161481,0.238353,3.484676
75%,0.191906,0.206063,0.585272,0.218631,0.287027,103.947827
max,1.0,1.0,1.0,1.0,1.0,399.561141


# 문제 9
주어진 `classification_df` 데이터셋에서 모든 `feature`들을 평균 0, 표준편차 1로 스케일링 하시오.


In [10]:
# Solution: standardise every feature column of classification_df to mean 0 and unit variance.
from sklearn.preprocessing import StandardScaler

# feature_1 .. feature_5; 'target' is left unscaled.
standardized_columns = [f'feature_{n}' for n in range(1, 6)]

print('변환 전')
display(classification_df.describe())

# StandardScaler divides by the population standard deviation (ddof=0), whereas
# describe() reports the sample standard deviation (ddof=1). The std shown after
# scaling is therefore sqrt(n / (n - 1)), slightly above 1, not exactly 1.
scaler = StandardScaler()
standardized_values = scaler.fit_transform(classification_df[standardized_columns])
classification_df[standardized_columns] = standardized_values

print('변환 후')
display(classification_df.describe())

변환 전


Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,target
count,100.0,100.0,100.0,100.0,100.0,100.0
mean,-0.047183,0.038332,-0.048848,-0.027714,0.029213,0.5
std,0.851906,1.274375,1.230126,1.323957,1.007244,0.502519
min,-1.692005,-2.68318,-2.523434,-2.585909,-3.241267,0.0
25%,-0.771096,-0.989945,-1.097573,-1.185113,-0.629925,0.0
50%,-0.047183,-0.098046,-0.035241,0.069846,0.054926,0.5
75%,0.657686,1.190847,1.035708,1.089659,0.701519,1.0
max,1.724002,2.489048,2.388694,2.591239,2.314659,1.0


변환 후


Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,target
count,100.0,100.0,100.0,100.0,100.0,100.0
mean,-4.440892e-18,1.3322680000000001e-17,2.164935e-17,-4.2188470000000006e-17,8.881784e-18,0.5
std,1.005038,1.005038,1.005038,1.005038,1.005038,0.502519
min,-1.940482,-2.146324,-2.021787,-1.941969,-3.263315,0.0
25%,-0.8540384,-0.8109525,-0.8568288,-0.8786008,-0.6576936,0.0
50%,8.186175e-18,-0.1075546,0.01111779,0.07405972,0.02565669,0.5
75%,0.8315709,0.9089325,0.886105,0.8482166,0.6708328,1.0
max,2.089559,1.93276,1.991521,1.988092,2.280439,1.0


# 문제 10
주어진 `regression_df` 데이터셋에서 주성분 분석(PCA)을 사용하여 데이터를 3차원으로 축소하시오.

In [11]:
# Solution: project the five regression features onto their first three principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
principal_components = pca.fit_transform(regression_df[['feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5']])

# fit_transform returns a bare array, so the row labels must be supplied explicitly.
# regression_df no longer has a contiguous 0..n-1 index after the outlier rows were
# dropped; reusing its index keeps each component row aligned with its source row
# (and its target) when the two frames are joined or compared later.
principal_df = pd.DataFrame(
    data=principal_components,
    columns=['PC1', 'PC2', 'PC3'],
    index=regression_df.index,
)

principal_df.head()

Unnamed: 0,PC1,PC2,PC3
0,0.020064,0.018603,-0.000498
1,0.199009,-0.026346,0.039716
2,-0.099829,-0.124379,0.005764
3,0.331647,-0.021313,0.01557
4,-0.428498,-0.105174,-0.073253


# 문제 11
주어진 `classification_df` 데이터셋을 학습 데이터 80%, 테스트 데이터 20%로 분할하시오.

In [12]:
# Solution: hold out 20% of classification_df as the test set.
from sklearn.model_selection import train_test_split

# Separate the label column from the feature columns.
y = classification_df['target']
X = classification_df.drop(columns=['target'])

# A fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)