# 집값 예측 (Linear Regression)

In [110]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import myutils as my

In [111]:
### 데이터 준비

In [112]:
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()
type(housing)

sklearn.utils._bunch.Bunch

In [113]:
housing.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [114]:
df = pd.DataFrame(housing.data,
                 columns = housing.feature_names)
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [115]:
df['target'] = housing.target
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [116]:
df.shape

(20640, 9)

In [117]:
# housing.DESCR

In [118]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
 8   target      20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB


In [119]:
df.isna().sum()

MedInc        0
HouseAge      0
AveRooms      0
AveBedrms     0
Population    0
AveOccup      0
Latitude      0
Longitude     0
target        0
dtype: int64

In [120]:
df.duplicated().sum()

0

In [121]:
df.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude', 'target'],
      dtype='object')

In [122]:
# X, y
X = df[['MedInc', 'HouseAge', 'AveRooms']]
y = df['target']

In [123]:
# 테스트 데이터 분리
# 스케일링 
# 학습
# RMSE (X_test)

In [124]:
from sklearn.model_selection import train_test_split

X_train, X_tes, y_train, y_test = train_test_split(X, y, 
                                                   test_size=0.2, 
                                                   random_state=2022)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(16512, 3) (4128, 3) (16512,) (4128,)


In [125]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
y_train = y_train.values

In [126]:
X_test = scaler.fit_transform(X_test)
y_test = y_test.values

In [127]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)

In [128]:
y_pred = lr.predict(X_test)
y_pred

array([4.52725301, 1.67345752, 1.90095685, ..., 3.08363905, 1.80470024,
       1.21192612])

In [129]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

0.8045075088355537

In [133]:
from sklearn.model_selection import cross_val_score

mse = cross_val_score(lr, X_test, y_test, 
                scoring='neg_mean_squared_error',
                cv=3)
mse

array([-0.63879692, -0.6664322 , -0.66737045])

In [134]:
np.sqrt(-mse)

array([0.79924772, 0.81635299, 0.81692745])

In [136]:
np.mean(np.sqrt(-mse))

0.8108427192491202

In [None]:
# X_train, y_train
# X_test, y_test
# 스케일링 완료된 상태

In [138]:
### 결정트리
from sklearn.tree import DecisionTreeRegressor

In [139]:
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)

In [140]:
y_pred = dt.predict(X_test)
y_pred

array([5.00001, 1.382  , 1.96   , ..., 3.183  , 1.326  , 0.744  ])

In [141]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

1.0415799225821956

In [161]:
mse = cross_val_score(dt, X_test, y_test, 
                scoring='neg_mean_squared_error',
                cv=3)
mse

array([-1.06220141, -1.12100811, -1.07341629])

In [162]:
np.sqrt(-mse)

array([1.03063156, 1.05877671, 1.03605805])

In [163]:
np.mean(np.sqrt(-mse))

1.0418221057171975

In [145]:
### RandomForest
from sklearn.ensemble import RandomForestRegressor

In [164]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

In [166]:
y_pred = rf.predict(X_test)
y_pred

array([4.8213668, 1.63611  , 1.9068   , ..., 3.2238107, 1.4005   ,
       1.06808  ])

In [149]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

1.0415799225821956

In [167]:
mse = cross_val_score(rf, X_test, y_test, 
                scoring='neg_mean_squared_error',
                cv=3)
mse

array([-0.58263328, -0.67315019, -0.61411204])

In [168]:
np.sqrt(-mse)

array([0.76330418, 0.8204573 , 0.78365301])

In [169]:
np.mean(np.sqrt(-mse))

0.7891381660427849

In [170]:
### Support Vector Machine
from sklearn.svm import SVR

In [171]:
sv = SVR()
sv.fit(X_train, y_train)

In [172]:
y_pred = sv.predict(X_test)
y_pred

array([4.90815504, 1.3479415 , 1.75299741, ..., 3.02077115, 1.38902956,
       1.09032679])

In [173]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

0.7468673186302347

In [174]:
mse = cross_val_score(sv, X_test, y_test, 
                scoring='neg_mean_squared_error',
                cv=3)
mse

array([-0.53069734, -0.58883067, -0.54999605])

In [175]:
np.sqrt(-mse)

array([0.72848976, 0.76735303, 0.74161718])

In [176]:
np.mean(np.sqrt(-mse))

0.7458199916008036

In [None]:
# sklearn.linear_model.Lasso
# sklearn.linear_model.Ridge

In [178]:
from sklearn.linear_model import Lasso

ls = Lasso()
ls.fit(X_train, y_train)

In [179]:
y_pred = ls.predict(X_test)
y_pred

array([2.07672098, 2.07672098, 2.07672098, ..., 2.07672098, 2.07672098,
       2.07672098])

In [180]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

1.1383122510326285

In [181]:
mse = cross_val_score(ls, X_test, y_test, 
                scoring='neg_mean_squared_error',
                cv=3)
mse

array([-1.30101807, -1.28618021, -1.29712899])

In [182]:
np.sqrt(-mse)

array([1.14062179, 1.13409886, 1.13891571])

In [183]:
np.mean(np.sqrt(-mse))

1.1378787852974963

In [184]:
from sklearn.linear_model import Ridge

rg = Ridge()
rg.fit(X_train, y_train)

In [185]:
y_pred = rg.predict(X_test)
y_pred

array([4.52709852, 1.67347762, 1.90096586, ..., 3.08357558, 1.80471426,
       1.21199468])

In [186]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

0.8045040109347619

In [187]:
mse = cross_val_score(rg, X_test, y_test, 
                scoring='neg_mean_squared_error',
                cv=3)
mse

array([-0.63879481, -0.66641783, -0.66723666])

In [188]:
np.sqrt(-mse)

array([0.7992464 , 0.81634418, 0.81684555])

In [189]:
np.mean(np.sqrt(-mse))

0.8108120466794109

In [191]:
lr.score(X_train, y_train)    # R^2  0~1

0.5147637855350439

In [192]:
sv.score(X_train, y_train)    # 정확도

0.59266503393894