### 클래스 불균형 처리하기

In [1]:
import numpy as np
from sklearn.utils import resample
from collections import Counter

In [2]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from collections import Counter

# 예시로 사용할 데이터를 불러오는 코드
# 실제 데이터에 맞게 파일 경로나 데이터 로딩 방법을 수정해야 합니다.
df = pd.read_csv('winequality-red.csv')

# 'quality' 컬럼이 품질 클래스를 나타내는 컬럼이라고 가정
X = df.drop('quality', axis=1).values
y = df['quality'].values

# 클래스 불균형 확인
print('Before Resampling:', Counter(y))

# 클래스별 샘플 수 계산
quality_counts = df['quality'].value_counts().to_dict()
minority_class_count = min(quality_counts.values())

# 오버샘플링
X_over = []
y_over = []
for label, count in quality_counts.items():
    X_label = X[y == label]
    X_label_over = resample(X_label, replace=True, n_samples=count, random_state=42)
    X_over.append(X_label_over)
    y_over.extend([label] * count)

X_over = np.concatenate(X_over)
y_over = np.array(y_over)

# 클래스 불균형 확인
print('After OverSampling:', Counter(y_over))

# 언더샘플링
X_under = []
y_under = []
for label, count in quality_counts.items():
    X_label = X_over[y_over == label]
    X_label_under = resample(X_label, replace=False, n_samples=minority_class_count, random_state=42)
    X_under.append(X_label_under)
    y_under.extend([label] * minority_class_count)

X_under = np.concatenate(X_under)
y_under = np.array(y_under)

# 클래스 불균형 확인
print('After UnderSampling:', Counter(y_under))


Before Resampling: Counter({5: 681, 6: 638, 7: 199, 4: 53, 8: 18, 3: 10})
After OverSampling: Counter({5: 681, 6: 638, 7: 199, 4: 53, 8: 18, 3: 10})
After UnderSampling: Counter({5: 10, 6: 10, 7: 10, 4: 10, 8: 10, 3: 10})


In [8]:
import pandas as pd

# 언더샘플링한 데이터를 데이터프레임으로 변환
under_df = pd.DataFrame(data=X_under, columns=df.columns[:-1])  # 가정: 마지막 열이 타겟 변수
under_df['quality'] = y_under  # 타겟 변수 추가

# 오버샘플링한 데이터를 데이터프레임으로 변환
over_df = pd.DataFrame(data=X_over, columns=df.columns[:-1])  # 가정: 마지막 열이 타겟 변수
over_df['quality'] = y_over  # 타겟 변수 추가

### 언더샘플링

In [11]:
# 데이터프레임 조회
print("언더샘플링 데이터:")
under_df

언더샘플링 데이터:


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.2,0.415,0.36,2.0,0.081,13.0,45.0,0.9972,3.48,0.64,9.2,5
1,6.5,0.615,0.0,1.9,0.065,9.0,18.0,0.9972,3.46,0.65,9.2,5
2,9.0,0.82,0.05,2.4,0.081,26.0,96.0,0.99814,3.36,0.53,10.0,5
3,7.5,0.58,0.03,4.1,0.08,27.0,46.0,0.99592,3.02,0.47,9.2,5
4,6.8,0.69,0.0,5.6,0.124,21.0,58.0,0.9997,3.46,0.72,10.2,5
5,7.5,0.58,0.2,2.0,0.073,34.0,44.0,0.99494,3.1,0.43,9.3,5
6,7.3,0.49,0.1,2.6,0.068,4.0,14.0,0.99562,3.3,0.47,10.5,5
7,8.5,0.46,0.31,2.25,0.078,32.0,58.0,0.998,3.33,0.54,9.8,5
8,9.7,0.32,0.54,2.5,0.094,28.0,83.0,0.9984,3.28,0.82,9.6,5
9,6.6,0.66,0.0,3.0,0.115,21.0,31.0,0.99629,3.45,0.63,10.3,5


#### 각 클래스별로 언더샘플링된 데이터를 출력

In [14]:
# 각 클래스별로 언더샘플링된 데이터를 출력
for label in quality_counts.keys():
    under_samples_label = under_df[under_df['quality'] == label]
    print(f'Class {label} (After UnderSampling):')
    print(under_samples_label)
    print('\n')

Class 5 (After UnderSampling):
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.2             0.415         0.36            2.00      0.081   
1            6.5             0.615         0.00            1.90      0.065   
2            9.0             0.820         0.05            2.40      0.081   
3            7.5             0.580         0.03            4.10      0.080   
4            6.8             0.690         0.00            5.60      0.124   
5            7.5             0.580         0.20            2.00      0.073   
6            7.3             0.490         0.10            2.60      0.068   
7            8.5             0.460         0.31            2.25      0.078   
8            9.7             0.320         0.54            2.50      0.094   
9            6.6             0.660         0.00            3.00      0.115   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 13.0              

### 오버샘플링

결과가 샘플링 전과 동일한 이유는 오버샘플링을 할 때 각 클래스마다 샘플을 똑같은 수만큼 늘리기 때문입니다. 주어진 예시에서는 클래스별로 주어진 샘플 수가 이미 충분하기 때문에 더 이상 늘릴 필요가 없었기 때문입니다. 실제로 오버샘플링은 적은 클래스의 샘플 수를 늘리는 과정이며, 이를 위해 데이터를 반복해서 사용하거나 생성된 합성 데이터를 추가하는 방식 등이 사용됩니다. 주어진 예시에서는 이미 클래스별로 충분한 샘플 수가 있어서 오버샘플링을 적용해도 클래스의 분포가 변하지 않았습니다. 오버샘플링이 실제로 효과를 보려면 적은 클래스의 샘플이 부족한 경우에 해당하며, 그런 경우에는 적은 클래스의 샘플 수를 늘리는 과정이 필요합니다.

In [12]:
print("\n오버샘플링 데이터:")
over_df


오버샘플링 데이터:


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.1,0.430,0.42,5.50,0.070,29.0,129.0,0.99730,3.42,0.72,10.50,5
1,7.2,0.660,0.03,2.30,0.078,16.0,86.0,0.99743,3.53,0.57,9.70,5
2,7.3,0.730,0.24,1.90,0.108,18.0,102.0,0.99670,3.26,0.59,9.30,5
3,7.4,0.600,0.26,7.30,0.070,36.0,121.0,0.99820,3.37,0.49,9.40,5
4,8.1,0.785,0.52,2.00,0.122,37.0,153.0,0.99690,3.21,0.69,9.30,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.7,0.760,0.02,1.80,0.078,6.0,12.0,0.99600,3.55,0.63,9.95,3
1595,7.4,1.185,0.00,4.25,0.097,5.0,14.0,0.99660,3.63,0.54,10.70,3
1596,6.8,0.815,0.00,1.20,0.267,16.0,29.0,0.99471,3.32,0.51,9.80,3
1597,7.3,0.980,0.05,2.10,0.061,20.0,49.0,0.99705,3.31,0.55,9.70,3
