# 1. 주제 - 피마 인디언 당뇨병 데이터 분석


# 2. 데이터 수집

데이터 수집 사이트 : https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database

# 3. feature 설명

1. Pregnancies: 임신 횟수

2. Glucose: 포도당 부하 검사 수치

3. BloodPressure: 혈압(mm Hg)

4. SkinThickness: 팔 삼두근 뒤쪽의 피하지방 측정값(mm)

5. Insulin: 혈청 인슐린(mu U/ml)

6. BMI: 체질량지수 (체중(kg) / 키(m)^2)

7. DiabetesPedigreeFunction: 당뇨 내력 가중치 값

8. Age: 나이

9. Outcome: 클래스 결정 값 (0 또는 1)

# 4. 과정

1. 데이터 탐색(차원, 기초 통계값, 결측치, 이상치)
2. 이상치 존재 -> 각 feature의 평균값을 넣는 이상치 처리 진행
3. 당뇨 양, 음성을 기준으로 다양한 통계값 확인
4. 나이를 기준으로 당뇨 양성자 분류
5. 데이터 정규화 후 거리, 코사인 유사도 계산
6. 공분산, 상관계수 계산

In [1]:
# 데이터 전처리에 필요한 라이브러리 임포트

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# 데이터 불러오기

df = pd.read_csv('/Users/yrlee/dgu_mac/1-2/데사개/ds_intro_hw1/diabetes.csv')

In [3]:
# 상위 5행 출력

df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# 데이터 차원 확인

df.shape

(768, 9)

In [5]:
# 데이터 통계값 확인

df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
# 결측치 확인

df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [7]:
# 이상치 개수 확인

zero_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

print("전체 데이터 {0}개 중" .format(df.shape[0]))

for i in zero_features:
    
    zero_count = df[df[i] == 0][i].count()
    ratio = (zero_count/df.shape[0]) * 100
    print("{0} feature에서 값이 0인 이상치의 개수는 {1}개, 비율은 {2:.2f}%입니다" .format(i, zero_count, ratio))

전체 데이터 768개 중
Glucose feature에서 값이 0인 이상치의 개수는 5개, 비율은 0.65%입니다
BloodPressure feature에서 값이 0인 이상치의 개수는 35개, 비율은 4.56%입니다
SkinThickness feature에서 값이 0인 이상치의 개수는 227개, 비율은 29.56%입니다
Insulin feature에서 값이 0인 이상치의 개수는 374개, 비율은 48.70%입니다
BMI feature에서 값이 0인 이상치의 개수는 11개, 비율은 1.43%입니다


In [8]:
# 각 feature에 평균값을 넣어 이상치 처리

for i in zero_features:
    df[i] = df[i].replace(0, df[i].mean())

In [9]:
# 인덱스 1부터 시작하게 맞추기

df.index = df.index + 1

In [10]:
# 열 이름 바꾸기

df = df.rename(columns = {'Outcome' : 'Diabetes'})

In [11]:
# 값 치환

df2 = df.copy()
df2['Diabetes'] = df2['Diabetes'].replace([0, 1], ["negative", "positive"])

In [12]:
# 데이터 확인

df2.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Diabetes
1,6,148.0,72.0,35.0,79.799479,33.6,0.627,50,positive
2,1,85.0,66.0,29.0,79.799479,26.6,0.351,31,negative
3,8,183.0,64.0,20.536458,79.799479,23.3,0.672,32,positive
4,1,89.0,66.0,23.0,94.0,28.1,0.167,21,negative
5,0,137.0,40.0,35.0,168.0,43.1,2.288,33,positive


In [13]:
# 고유값 확인

df2['Diabetes'].unique()

array(['positive', 'negative'], dtype=object)

In [14]:
# 음성, 양성 개수 확인

df2['Diabetes'].value_counts()

negative    500
positive    268
Name: Diabetes, dtype: int64

In [15]:
# 당뇨 양, 음성을 기준으로 최대값 계산

df2.groupby('Diabetes').max()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Diabetes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
negative,13,197.0,122.0,60.0,744.0,57.3,2.329,81
positive,17,199.0,114.0,99.0,846.0,67.1,2.42,70


In [16]:
# 당뇨 양, 음성을 기준으로 평균값 계산

df2.groupby('Diabetes').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Diabetes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
negative,3.298,110.705367,70.810008,25.373135,106.457354,30.880066,0.429734,31.19
positive,4.865672,142.159661,74.950326,28.907494,141.426597,35.381288,0.5505,37.067164


In [17]:
# 당뇨 양, 음성 칼럼으로 one-hot encoding 진행

dummy = pd.get_dummies(df2[['Diabetes']])
df3 = pd.concat([df2, dummy], axis = 1)

In [18]:
# 데이터 확인

df3.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Diabetes,Diabetes_negative,Diabetes_positive
1,6,148.0,72.0,35.0,79.799479,33.6,0.627,50,positive,0,1
2,1,85.0,66.0,29.0,79.799479,26.6,0.351,31,negative,1,0
3,8,183.0,64.0,20.536458,79.799479,23.3,0.672,32,positive,0,1
4,1,89.0,66.0,23.0,94.0,28.1,0.167,21,negative,1,0
5,0,137.0,40.0,35.0,168.0,43.1,2.288,33,positive,0,1


In [19]:
# 이산화 진행

label = ['Children', 'Youth', 'Adults', 'Seniors']
df3['Age'] = pd.cut(x = df3['Age'], bins = [0, 14, 24, 64, 100], labels = label)
df3.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Diabetes,Diabetes_negative,Diabetes_positive
1,6,148.0,72.0,35.0,79.799479,33.6,0.627,Adults,positive,0,1
2,1,85.0,66.0,29.0,79.799479,26.6,0.351,Adults,negative,1,0
3,8,183.0,64.0,20.536458,79.799479,23.3,0.672,Adults,positive,0,1
4,1,89.0,66.0,23.0,94.0,28.1,0.167,Youth,negative,1,0
5,0,137.0,40.0,35.0,168.0,43.1,2.288,Adults,positive,0,1


In [20]:
# 나이를 기준으로 데이터 개수 카운팅

df3.groupby('Age')['Diabetes_positive'].count()

Age
Children      0
Youth       219
Adults      533
Seniors      16
Name: Diabetes_positive, dtype: int64

In [21]:
# 나이를 기준으로 데이터 개수 카운팅

df3[df3['Diabetes_positive'] == 1].groupby('Age')['Diabetes_positive'].count()

Age
Children      0
Youth        31
Adults      233
Seniors       4
Name: Diabetes_positive, dtype: int64

In [22]:
# 나이를 기준으로 당뇨 양성 비율 카운팅

for i in label:
    
    total = df3[(df3['Age'] == i)]['Age'].count()
    cnt = df3[(df3['Age'] == i) & (df3['Diabetes_positive'] == 1)]['Age'].count()
    ratio = (cnt / total) * 100
    
    if total == 0:
        ratio = 0
    
    print("{0} 카테고리의 전체 데이터 수는 {1}개, 양성 데이터 수는 {2}로 비율은 {3:3.2f}%입니다" .format(i, total, cnt, ratio))

Children 카테고리의 전체 데이터 수는 0개, 양성 데이터 수는 0로 비율은 0.00%입니다
Youth 카테고리의 전체 데이터 수는 219개, 양성 데이터 수는 31로 비율은 14.16%입니다
Adults 카테고리의 전체 데이터 수는 533개, 양성 데이터 수는 233로 비율은 43.71%입니다
Seniors 카테고리의 전체 데이터 수는 16개, 양성 데이터 수는 4로 비율은 25.00%입니다


  ratio = (cnt / total) * 100


In [23]:
# 데이터 정규화

df_normalization = (df - df.min()) / (df.max() - df.min())
df_normalization.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Diabetes
1,0.352941,0.670968,0.489796,0.304348,0.079086,0.314928,0.234415,0.483333,1.0
2,0.058824,0.264516,0.428571,0.23913,0.079086,0.171779,0.116567,0.166667,0.0
3,0.470588,0.896774,0.408163,0.147135,0.079086,0.104294,0.253629,0.183333,1.0
4,0.058824,0.290323,0.428571,0.173913,0.096154,0.202454,0.038002,0.0,0.0
5,0.0,0.6,0.163265,0.304348,0.185096,0.509202,0.943638,0.2,1.0


In [24]:
# 전체 데이터의 1차 거리 계산

total = 0
total_nor = 0
i = 0

while (i < 767):

    total += np.linalg.norm(df.iloc[i] - df.iloc[i+1], ord = 1)
    total_nor += np.linalg.norm(df_normalization.iloc[i] - df_normalization.iloc[i+1], ord = 1)
    i += 1
    
avg = total / 768
avg_nor = total_nor / 768
   
print("전체 데이터의 1차 거리는 {0:3.2f}이며, 정규화한 데이터의 1차 거리는 {1:3.2f}입니다." .format(avg, avg_nor))

전체 데이터의 1차 거리는 156.59이며, 정규화한 데이터의 1차 거리는 1.72입니다.


In [25]:
# 전체 데이터의 2차 거리 계산

total = 0
total_nor = 0
i = 0

while (i < 767):

    total += np.linalg.norm(df.iloc[i] - df.iloc[i+1], ord = 2)
    total_nor += np.linalg.norm(df_normalization.iloc[i] - df_normalization.iloc[i+1], ord = 2)
    i += 1
    
avg = total / 768
avg_nor = total_nor / 768
   
print("전체 데이터의 2차 거리는 {0:3.2f}이며, 정규화한 데이터의 2차 거리는 {1:3.2f}입니다." .format(avg, avg_nor))

전체 데이터의 2차 거리는 100.17이며, 정규화한 데이터의 2차 거리는 0.83입니다.


In [26]:
# 전체 데이터의 inf 거리 계산

total = 0
total_nor = 0
i = 0

while (i < 767):
     
    total += np.linalg.norm(df.iloc[i] - df.iloc[i+1], ord = np.inf)
    total_nor += np.linalg.norm(df_normalization.iloc[i] - df_normalization.iloc[i+1], ord = np.inf)
    i += 1
    
avg = total / 768
avg_nor = total_nor / 768
   
print("전체 데이터의 inf 거리는 {0:3.2f}이며, 정규화한 데이터의 inf 거리는 {1:3.2f}입니다." .format(avg, avg_nor))

전체 데이터의 inf 거리는 89.96이며, 정규화한 데이터의 inf 거리는 0.65입니다.


In [27]:
# 기존 코사인 유사도 계산

cosine_similarity(np.array(df.iloc[0]).reshape(1, -1), np.array(df.iloc[767]).reshape(1, -1))

array([[0.97165855]])

In [28]:
# 정규화한 데이터의 코사인 유사도 계산

cosine_similarity(np.array(df_normalization.iloc[0]).reshape(1, -1), np.array(df_normalization.iloc[767]).reshape(1, -1))

array([[0.64007587]])

In [29]:
# 기존 데이터의 공분산 계산

df.cov()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Diabetes
Pregnancies,11.354056,13.123525,8.531879,0.434104,-5.671349,0.499157,-0.037426,21.57062,0.356618
Glucose,13.123525,926.351048,81.004072,47.126333,1123.556474,48.438745,1.382625,95.425324,7.155305
BloodPressure,8.531879,81.004072,146.795798,15.654757,12.322063,23.426997,0.00149,46.555963,0.941851
SkinThickness,0.434104,47.126333,15.654757,92.760798,215.478568,35.473414,0.494498,2.992808,0.804005
Insulin,-5.671349,1123.556474,12.322063,215.478568,8663.952981,121.500481,4.86676,42.310463,7.954895
BMI,0.499157,48.438745,23.426997,35.473414,121.500481,47.270761,0.349692,2.081889,1.02395
DiabetesPedigreeFunction,-0.037426,1.382625,0.00149,0.494498,4.86676,0.349692,0.109779,0.130772,0.027472
Age,21.57062,95.425324,46.555963,2.992808,42.310463,2.081889,0.130772,138.303046,1.336953
Diabetes,0.356618,7.155305,0.941851,0.804005,7.954895,1.02395,0.027472,1.336953,0.227483


In [30]:
# 정규화한 데이터의 공분산 계산

df_normalization.cov()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Diabetes
Pregnancies,0.039287,0.00498,0.005121,0.000278,-0.000401,0.0006,-0.00094,0.021148,0.020978
Glucose,0.00498,0.038558,0.005333,0.003305,0.008712,0.006391,0.003809,0.010261,0.046163
BloodPressure,0.005121,0.005333,0.015285,0.001736,0.000151,0.004889,6e-06,0.007918,0.009611
SkinThickness,0.000278,0.003305,0.001736,0.010959,0.002815,0.007885,0.002295,0.000542,0.008739
Insulin,-0.000401,0.008712,0.000151,0.002815,0.012516,0.002986,0.002498,0.000848,0.009561
BMI,0.0006,0.006391,0.004889,0.007885,0.002986,0.019769,0.003053,0.00071,0.02094
DiabetesPedigreeFunction,-0.00094,0.003809,6e-06,0.002295,0.002498,0.003053,0.020014,0.000931,0.01173
Age,0.021148,0.010261,0.007918,0.000542,0.000848,0.00071,0.000931,0.038418,0.022283
Diabetes,0.020978,0.046163,0.009611,0.008739,0.009561,0.02094,0.01173,0.022283,0.227483


In [31]:
# 기존 데이터의 상관계수 계산

co = df.corr()
co.style.background_gradient(cmap = 'coolwarm')

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Diabetes
Pregnancies,1.0,0.127964,0.208984,0.013376,-0.018082,0.021546,-0.033523,0.544341,0.221898
Glucose,0.127964,1.0,0.219666,0.160766,0.396597,0.231478,0.137106,0.2666,0.492908
BloodPressure,0.208984,0.219666,1.0,0.134155,0.010926,0.281231,0.000371,0.32674,0.162986
SkinThickness,0.013376,0.160766,0.134155,1.0,0.240361,0.535703,0.154961,0.026423,0.175026
Insulin,-0.018082,0.396597,0.010926,0.240361,1.0,0.189856,0.157806,0.038652,0.179185
BMI,0.021546,0.231478,0.281231,0.535703,0.189856,1.0,0.153508,0.025748,0.312254
DiabetesPedigreeFunction,-0.033523,0.137106,0.000371,0.154961,0.157806,0.153508,1.0,0.033561,0.173844
Age,0.544341,0.2666,0.32674,0.026423,0.038652,0.025748,0.033561,1.0,0.238356
Diabetes,0.221898,0.492908,0.162986,0.175026,0.179185,0.312254,0.173844,0.238356,1.0


In [32]:
# 정규화한 데이터의 상관계수 계산

co2 = df_normalization.corr()
co2.style.background_gradient(cmap = 'coolwarm')

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Diabetes
Pregnancies,1.0,0.127964,0.208984,0.013376,-0.018082,0.021546,-0.033523,0.544341,0.221898
Glucose,0.127964,1.0,0.219666,0.160766,0.396597,0.231478,0.137106,0.2666,0.492908
BloodPressure,0.208984,0.219666,1.0,0.134155,0.010926,0.281231,0.000371,0.32674,0.162986
SkinThickness,0.013376,0.160766,0.134155,1.0,0.240361,0.535703,0.154961,0.026423,0.175026
Insulin,-0.018082,0.396597,0.010926,0.240361,1.0,0.189856,0.157806,0.038652,0.179185
BMI,0.021546,0.231478,0.281231,0.535703,0.189856,1.0,0.153508,0.025748,0.312254
DiabetesPedigreeFunction,-0.033523,0.137106,0.000371,0.154961,0.157806,0.153508,1.0,0.033561,0.173844
Age,0.544341,0.2666,0.32674,0.026423,0.038652,0.025748,0.033561,1.0,0.238356
Diabetes,0.221898,0.492908,0.162986,0.175026,0.179185,0.312254,0.173844,0.238356,1.0


# 5. 결론

1. 당뇨병 양성은 268개, 음성은 500개로 이루어진 데이터셋
2. 당뇨병 양성자의 평균 수치는 모든 feature에서 음성자의 평균 수치보다 높다.
3. Children 카테고리의 양성 비율은 0.00%, Youth 카테고리의 양성 비율은 14.16%, Adults 카테고리의 양성 비율은 43.71%, Seniors 카테고리의 양성 비율은 25.00%로 Adults 카테고리의 양성 비율이 제일 높다.
4. Minkowski 거리의 경우 차원이 커질수록 점점 0에 가까워지며 이는 데이터가 유사하다는 의미다.
5. 코사인 유사도는 정규화 전 데이터가 더욱 1에 가까운 수치가 나온다. 
6. 기존 데이터와 정규화한 데이터의 공분산은 다르고 상관계수는 같다.