In [26]:
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# 데이터셋 불러오기
boston = fetch_openml(data_id=531, as_frame=True)

# DataFrame으로 변환
df = boston.frame

df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


## 주요 열 (Features)
-	CRIM: 도시 1인당 범죄율
-	ZN: 25,000 평방피트를 초과하는 거주 지역의 비율
-	INDUS: 비소매 상업 지역이 점유하고 있는 면적 비율
-	CHAS: 찰스강 인접 여부 (1: 인접, 0: 인접하지 않음)
-	NOX: 일산화질소 농도
-	RM: 주택당 평균 방 수
-	AGE: 1940년 이전에 건축된 소유 주택의 비율
-	DIS: 5개의 보스턴 직업 고용 센터까지의 가중 거리
-	RAD: 방사형 고속도로까지의 접근성 지수
-	TAX: 10,000달러 당 재산세율
-	PTRATIO: 도시별 학생/교사 비율
-	B: 1000(Bk - 0.63)^2, 여기서 Bk는 도시의 아프리카계 미국인 비율
-	LSTAT: 인구의 하위 계층 비율
-	MEDV: 주택의 중앙값 (종속 변수)

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   CRIM     506 non-null    float64 
 1   ZN       506 non-null    float64 
 2   INDUS    506 non-null    float64 
 3   CHAS     506 non-null    category
 4   NOX      506 non-null    float64 
 5   RM       506 non-null    float64 
 6   AGE      506 non-null    float64 
 7   DIS      506 non-null    float64 
 8   RAD      506 non-null    category
 9   TAX      506 non-null    float64 
 10  PTRATIO  506 non-null    float64 
 11  B        506 non-null    float64 
 12  LSTAT    506 non-null    float64 
 13  MEDV     506 non-null    float64 
dtypes: category(2), float64(12)
memory usage: 49.0 KB


In [28]:
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.554695,6.284634,68.574901,3.795043,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.115878,0.702617,28.148861,2.10571,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.385,3.561,2.9,1.1296,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.449,5.8855,45.025,2.100175,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.538,6.2085,77.5,3.20745,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.624,6.6235,94.075,5.188425,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,0.871,8.78,100.0,12.1265,711.0,22.0,396.9,37.97,50.0


In [56]:
def crime_level(x):
    if x < 0.082:
        return 0
    elif x >= 0.082 and x < 0.256:
        return 1
    elif x >= 0.256 and x < 3.677:
        return 2
    else:
        return 3

In [57]:
df['CRIM'] = df['CRIM'].apply(crime_level)

In [59]:
test = pd.get_dummies(df['CRIM'], prefix='CRIM')

In [63]:
test

Unnamed: 0,CRIM_0,CRIM_1,CRIM_2,CRIM_3
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0
...,...,...,...,...
501,1,0,0,0
502,1,0,0,0
503,1,0,0,0
504,0,1,0,0


In [67]:
df = pd.concat([df, test], axis=1)

In [69]:
df.drop(['CRIM'], axis=1, inplace=True)

In [74]:
df.drop(['TAX', 'PTRATIO'], axis=1, inplace=True)

In [75]:
df

Unnamed: 0,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,B,LSTAT,...,CRIM_1,CRIM_2,CRIM_3,TAX_75,TAX_50,TAX_25,TAX_1,STUDY_LOW,STUDY_MID,STUDY_HIGH
0,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,396.90,4.98,...,0,0,0,0,1,0,0,1,0,0
1,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,396.90,9.14,...,0,0,0,1,0,0,0,0,1,0
2,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,392.83,4.03,...,0,0,0,1,0,0,0,0,1,0
3,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,394.63,2.94,...,0,0,0,1,0,0,0,0,1,0
4,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,396.90,5.33,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,391.99,9.67,...,0,0,0,1,0,0,0,0,0,1
502,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,396.90,9.08,...,0,0,0,1,0,0,0,0,0,1
503,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,396.90,5.64,...,0,0,0,1,0,0,0,0,0,1
504,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,393.45,6.48,...,1,0,0,1,0,0,0,0,0,1


In [71]:
df['TAX_75'] = df['TAX'].apply(lambda x: 1 if x < 279.0 else 0)
df['TAX_50'] = df['TAX'].apply(lambda x: 1 if x >= 279.0 and x < 330.0 else 0)
df['TAX_25'] = df['TAX'].apply(lambda x: 1 if x >= 330.0 and x < 666.0 else 0)
df['TAX_1'] = df['TAX'].apply(lambda x: 1 if x > 666 else 0)

In [72]:
df['STUDY_LOW'] = df['PTRATIO'].apply(lambda x: 1 if x < 17.4 else 0)
df['STUDY_MID'] = df['PTRATIO'].apply(lambda x: 1 if x >= 17.4 and x < 19.05 else 0)
df['STUDY_HIGH'] = df['PTRATIO'].apply(lambda x: 1 if x >= 19.05 else 0)

In [76]:
df['NOX'] = df['NOX'].apply(lambda x: 1 if x < 0.538 else 0)

In [80]:
df.describe()

Unnamed: 0,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,B,LSTAT,...,CRIM_1,CRIM_2,CRIM_3,TAX_75,TAX_50,TAX_25,TAX_1,STUDY_LOW,STUDY_MID,STUDY_HIGH
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,...,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,11.363636,11.136779,0.06917,0.492095,6.284634,68.574901,3.795043,9.549407,356.674032,12.653063,...,0.249012,0.249012,0.250988,0.245059,0.241107,0.243083,0.009881,0.249012,0.250988,0.5
std,23.322453,6.860353,0.253994,0.500432,0.702617,28.148861,2.10571,8.707259,91.294864,7.141062,...,0.432869,0.432869,0.434011,0.430548,0.428178,0.429369,0.099011,0.432869,0.434011,0.500495
min,0.0,0.46,0.0,0.0,3.561,2.9,1.1296,1.0,0.32,1.73,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,5.19,0.0,0.0,5.8855,45.025,2.100175,4.0,375.3775,6.95,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,9.69,0.0,0.0,6.2085,77.5,3.20745,5.0,391.44,11.36,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
75%,12.5,18.1,0.0,1.0,6.6235,94.075,5.188425,24.0,396.225,16.955,...,0.0,0.0,0.75,0.0,0.0,0.0,0.0,0.0,0.75,1.0
max,100.0,27.74,1.0,1.0,8.78,100.0,12.1265,24.0,396.9,37.97,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [90]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 22 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ZN          506 non-null    float64
 1   INDUS       506 non-null    float64
 2   CHAS        506 non-null    int64  
 3   NOX         506 non-null    int64  
 4   RM          506 non-null    float64
 5   AGE         506 non-null    float64
 6   DIS         506 non-null    float64
 7   RAD         506 non-null    int64  
 8   B           506 non-null    float64
 9   LSTAT       506 non-null    float64
 10  MEDV        506 non-null    float64
 11  CRIM_0      506 non-null    uint8  
 12  CRIM_1      506 non-null    uint8  
 13  CRIM_2      506 non-null    uint8  
 14  CRIM_3      506 non-null    uint8  
 15  TAX_75      506 non-null    int64  
 16  TAX_50      506 non-null    int64  
 17  TAX_25      506 non-null    int64  
 18  TAX_1       506 non-null    int64  
 19  STUDY_LOW   506 non-null    i

In [93]:
def zn_level(x):
    if x < 12.5:
        return 0
    elif x >= 12.5 and x < 17.5:
        return 1
    elif x >= 17.5 and x < 22.0:
        return 2
    elif x >= 22.0 and x < 34.0:
        return 3
    elif x >= 34.0 and x < 75.0:
        return 4
    else:
        return 5

In [94]:
df['ZN'] = df['ZN'].apply(zn_level)

In [96]:
test2 = pd.get_dummies(df['ZN'], prefix="ZN")

In [97]:
df = pd.concat([df, test2], axis=1)

In [98]:
df.drop('ZN', axis=1, inplace=True)

In [99]:
df

Unnamed: 0,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,B,LSTAT,MEDV,...,TAX_1,STUDY_LOW,STUDY_MID,STUDY_HIGH,ZN_0,ZN_1,ZN_2,ZN_3,ZN_4,ZN_5
0,2.31,0,0,6.575,65.2,4.0900,1,396.90,4.98,24.0,...,0,1,0,0,0,0,1,0,0,0
1,7.07,0,1,6.421,78.9,4.9671,2,396.90,9.14,21.6,...,0,0,1,0,1,0,0,0,0,0
2,7.07,0,1,7.185,61.1,4.9671,2,392.83,4.03,34.7,...,0,0,1,0,1,0,0,0,0,0
3,2.18,0,1,6.998,45.8,6.0622,3,394.63,2.94,33.4,...,0,0,1,0,1,0,0,0,0,0
4,2.18,0,1,7.147,54.2,6.0622,3,396.90,5.33,36.2,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,11.93,0,0,6.593,69.1,2.4786,1,391.99,9.67,22.4,...,0,0,0,1,1,0,0,0,0,0
502,11.93,0,0,6.120,76.7,2.2875,1,396.90,9.08,20.6,...,0,0,0,1,1,0,0,0,0,0
503,11.93,0,0,6.976,91.0,2.1675,1,396.90,5.64,23.9,...,0,0,0,1,1,0,0,0,0,0
504,11.93,0,0,6.794,89.3,2.3889,1,393.45,6.48,22.0,...,0,0,0,1,1,0,0,0,0,0


In [84]:
df[df['ZN'] > 12.5]['ZN'].describe()

count    124.000000
mean      45.362903
std       26.023780
min       17.500000
25%       22.000000
50%       34.000000
75%       75.000000
max      100.000000
Name: ZN, dtype: float64

In [48]:
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48,22.0


In [78]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# 독립 변수(X)와 종속 변수(y) 분리
X = df.drop(columns=['MEDV'])  # 'MEDV' 열을 제거하여 독립 변수만 남김
y = df['MEDV']  # 종속 변수

# 데이터 분리 (7:3 비율)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 모델 학습
model = LinearRegression()
model.fit(X_train, y_train)

# 예측
y_pred = model.predict(X_test)

# 평가
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 22.91694650811752


In [79]:
from sklearn.metrics import r2_score

# 결정 계수 (R²) 계산
r2 = r2_score(y_test, y_pred)
print(f'R²: {r2}')

R²: 0.6924440417692135
