In [4]:
import pandas as pd
# Load 'First_Health_Camp_Attended.csv' (path relative to the working directory) into a DataFrame
data = pd.read_csv('First_Health_Camp_Attended.csv')
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6218 entries, 0 to 6217
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Patient_ID      6218 non-null   int64  
 1   Health_Camp_ID  6218 non-null   int64  
 2   Donation        6218 non-null   int64  
 3   Health_Score    6218 non-null   float64
 4   Unnamed: 4      0 non-null      float64
dtypes: float64(2), int64(3)
memory usage: 243.0 KB


In [5]:
# Preview the first five rows (five is the default for head())
data.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Donation,Health_Score,Unnamed: 4
0,506181,6560,40,0.439024,
1,494977,6560,20,0.097561,
2,518680,6560,10,0.04878,
3,509916,6560,30,0.634146,
4,488006,6560,20,0.02439,


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column
data.describe()

Unnamed: 0,Patient_ID,Health_Camp_ID,Donation,Health_Score,Unnamed: 4
count,6218.0,6218.0,6218.0,6218.0,0.0
mean,506929.053715,6552.904149,32.587649,0.517668,
std,12437.711002,19.095027,24.501676,0.289211,
min,485702.0,6524.0,10.0,0.001667,
25%,496069.0,6538.0,20.0,0.263094,
50%,506704.0,6543.0,30.0,0.529706,
75%,517747.75,6570.0,40.0,0.771429,
max,528657.0,6587.0,330.0,1.0,


In [8]:
# Count the NaN entries in each column (isna is the same method as isnull)
missing_values = data.isna().sum()
print("Missing values in each column:\n", missing_values)

Missing values in each column:
 Patient_ID           0
Health_Camp_ID       0
Donation             0
Health_Score         0
Unnamed: 4        6218
dtype: int64


In [11]:
# Drop the unneeded column: 'Unnamed: 4' has no non-null values, so keep only the four real columns.
# .copy() makes the result an independent frame, so the in-place edits performed in
# later cells do not act on a selection of the originally loaded frame
# (this is what triggers pandas' SettingWithCopyWarning).
data = data[['Patient_ID', 'Health_Camp_ID', 'Donation', 'Health_Score']].copy()
data.head(5)

Unnamed: 0,Patient_ID,Health_Camp_ID,Donation,Health_Score
0,506181,6560,40,0.439024
1,494977,6560,20,0.097561
2,518680,6560,10,0.04878
3,509916,6560,30,0.634146
4,488006,6560,20,0.02439


In [12]:
import numpy as np
# Helper function that injects missing values (NaN) into a column
def add_missing_values(df, col_name, missing_frac, seed=42):
    """
    Set a given fraction of the rows of one column to NaN, modifying ``df`` in place.

    The rows are drawn at random without replacement. The draw uses a private
    random generator seeded with ``seed``, so calling this function does not
    reset or advance NumPy's global random state.

    :param df: DataFrame to modify in place
    :param col_name: name of the column that receives the missing values
    :param missing_frac: fraction of rows to set to NaN (0.0 ~ 1.0)
    :param seed: seed for the row selection; the default (42) selects the same
        rows as the previous hard-coded ``np.random.seed(42)``. Pass different
        seeds to get different rows for different columns.
    :raises ValueError: if ``missing_frac`` is outside the range 0.0 ~ 1.0
    :raises KeyError: if ``col_name`` is not a column of ``df``
    """
    if not 0.0 <= missing_frac <= 1.0:
        raise ValueError(
            "missing_frac must be between 0.0 and 1.0, got {!r}".format(missing_frac)
        )
    if col_name not in df.columns:
        # Assigning through .loc to an unknown column would silently create a new one.
        raise KeyError(col_name)

    n_rows = df.shape[0]  # number of rows in the frame
    n_missing = int(n_rows * missing_frac)  # number of rows to blank out (rounded down)
    if n_missing == 0:
        return  # nothing to do (empty frame or fraction too small)

    # Local generator: reproducible without touching the global np.random state.
    rng = np.random.RandomState(seed)
    missing_positions = rng.choice(n_rows, n_missing, replace=False)

    # rng.choice returns row *positions*; translate them to index labels before
    # using label-based .loc so frames without a default 0..n-1 index also work.
    df.loc[df.index[missing_positions], col_name] = np.nan

In [13]:
# Blank out 10% of the rows in the Donation column and in the Health_Score column.
# Each call selects rows using seed 42, so both columns lose their values in the same rows.
for target_col in ('Donation', 'Health_Score'):
    add_missing_values(data, target_col, 0.1)

In [14]:
# 결측값이 잘 추가되었는지 확인
print("Data with missing values:\n", data.tail(10))
print("Missing values count:\n", data.isnull().sum())

Data with missing values:
       Patient_ID  Health_Camp_ID  Donation  Health_Score
6208      525361            6575      20.0      0.431373
6209      490682            6575      20.0      0.156863
6210      514665            6575       NaN           NaN
6211      526888            6575      10.0      0.470588
6212      515749            6575      10.0      0.568627
6213      502728            6575      10.0      0.509804
6214      511088            6575      20.0      0.078431
6215      507608            6575      40.0      0.627451
6216      488046            6575      50.0      0.686275
6217      508766            6575      50.0      0.921569
Missing values count:
 Patient_ID          0
Health_Camp_ID      0
Donation          621
Health_Score      621
dtype: int64


In [16]:
# Numeric preprocessing - handle missing values by replacing them with the column mean.
# The dict keys must match the column names exactly (case-sensitive): fillna skips
# keys that are not columns without raising, which would leave that column unfilled.
# The result is reassigned rather than filled with inplace=True so the cell can be re-run safely.
data = data.fillna({
    'Donation': data['Donation'].mean(),
    'Health_Score': data['Health_Score'].mean(),
})

In [17]:
# Re-count the NaN entries per column after imputation
print("Missing values count:\n", data.isna().sum())

Missing values count:
 Patient_ID        0
Health_Camp_ID    0
Donation          0
Health_Score      0
dtype: int64


In [19]:
# Automate the preprocessing steps by building a pipeline
from sklearn.pipeline import Pipeline  # class used to chain the steps
from sklearn.impute import SimpleImputer  # missing-value handling
from sklearn.preprocessing import StandardScaler  # data scaling

# Build the pipeline: mean imputation followed by standardization
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # replace missing values with the column mean
    ('scaler', StandardScaler())  # scale the data
])

# Transform the data.
# NOTE(review): every column of `data` goes through the pipeline, including the
# Patient_ID and Health_Camp_ID columns - confirm that standardizing them is intended.
processed_data = pipeline.fit_transform(data)

# Convert the result (a plain array) back to a DataFrame. Column names and row
# index are taken from `data` itself rather than a hard-coded list, so they
# stay in sync if the input columns change.
processed_data_df = pd.DataFrame(processed_data, columns=data.columns, index=data.index)

print("Processed Data:\n", processed_data_df.head())

Processed Data:
    Patient_ID  Health_Camp_ID  Donation  Health_Score
0   -0.060149        0.371637  0.317621     -0.286158
1   -0.961030        0.371637 -0.540355     -1.532330
2    0.944860        0.371637 -0.969343     -1.710354
3    0.240172        0.371637 -0.111367      0.425939
4   -1.521548        0.371637 -0.540355     -1.799366
