In [1276]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler

In [1277]:
#data = pd.read_csv('./boston_contest.csv')
data = pd.read_csv('./boston_contest_fillna.csv')

In [1278]:
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MDEV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


### Preprocessing

### NA 수 Count

In [1279]:
data.isna().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MDEV       0
dtype: int64

In [1280]:
data['ZN'] = data['ZN'].fillna(0)
data['NOX'] = data['NOX'].fillna(data.NOX.mean())

#### 상관계수

In [1281]:
for col in data.columns:
    print(col)
    print(data.corrwith(data[col]))
#print(data.iloc[:, [0, 1, 2]])

CRIM
CRIM       1.000000
ZN        -0.216086
INDUS      0.424929
CHAS      -0.055244
NOX        0.454208
RM        -0.282908
AGE        0.371433
DIS       -0.393464
RAD        0.648849
TAX        0.604748
PTRATIO    0.298684
B         -0.437457
LSTAT      0.481684
MDEV      -0.400791
dtype: float64
ZN
CRIM      -0.216086
ZN         1.000000
INDUS     -0.538711
CHAS      -0.046577
NOX       -0.526979
RM         0.308723
AGE       -0.577162
DIS        0.673172
RAD       -0.322392
TAX       -0.322104
PTRATIO   -0.412658
B          0.183948
LSTAT     -0.416251
MDEV       0.353672
dtype: float64
INDUS
CRIM       0.424929
ZN        -0.538711
INDUS      1.000000
CHAS       0.059189
NOX        0.765611
RM        -0.400302
AGE        0.647328
DIS       -0.707403
RAD        0.598038
TAX        0.722439
PTRATIO    0.414374
B         -0.362375
LSTAT      0.602716
MDEV      -0.482587
dtype: float64
CHAS
CRIM      -0.055244
ZN        -0.046577
INDUS      0.059189
CHAS       1.000000
NOX        0.081

#### RAD, TAX : 0.909147, delete RAD col
#### CHAS : Charles River dummy variable (= 1 if tract bounds river; 0 otherwise), delete CHAS col

In [1282]:
data.drop(['RAD'], axis='columns', inplace=True)
data.drop(['CHAS'], axis='columns', inplace=True)
data

Unnamed: 0,CRIM,ZN,INDUS,NOX,RM,AGE,DIS,TAX,PTRATIO,B,LSTAT,MDEV
0,0.00632,18.0,2.31,0.538,6.575,65.2,4.0900,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.469,6.421,78.9,4.9671,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.469,7.185,61.1,4.9671,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.458,6.998,45.8,6.0622,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.458,7.147,54.2,6.0622,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...
471,0.06263,0.0,11.93,0.573,6.593,69.1,2.4786,273,21.0,391.99,9.67,22.4
472,0.04527,0.0,11.93,0.573,6.120,76.7,2.2875,273,21.0,396.90,9.08,20.6
473,0.06076,0.0,11.93,0.573,6.976,91.0,2.1675,273,21.0,396.90,5.64,23.9
474,0.10959,0.0,11.93,0.573,6.794,89.3,2.3889,273,21.0,393.45,6.48,22.0


### Column Def
- ZN : proportion of residential land zoned for lots over 25,000 sq.ft.
- NOX : nitrogen oxides concentration (parts per 10 million).

Column Definition을 보고 na를 뭘로 fill 할 것인지 결정

### fit 하기 전 reshape

In [1283]:
total_cols = len(data.columns)
x = data.iloc[:,:-1].values.reshape(-1,total_cols - 1)
y = data.iloc[:,-1]

In [1284]:
x

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

### Train set / Test set 분리

In [1285]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((333, 11), (143, 11), (333,), (143,))

### Scaler

- StandardScaler : 기본 스케일. 평균과 표준편차 사용  
- MinMaxScaler : 최대/최소값이 각각 1, 0이 되도록 스케일링  
- MaxAbsScaler : 최대절대값과 0이 각각 1, 0이 되도록 스케일링  
- RobustScaler : 중앙값(median)과 IQR(interquartile range) 사용. 아웃라이어의 영향을 최소화  


In [1286]:
scaler = MinMaxScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [1287]:
x_train_scaled

array([[4.84137105e-02, 0.00000000e+00, 6.46627566e-01, ...,
        8.08510638e-01, 9.91098896e-01, 3.48509934e-01],
       [1.57345579e-01, 0.00000000e+00, 6.46627566e-01, ...,
        8.08510638e-01, 1.00000000e+00, 6.60871965e-01],
       [2.15627220e-03, 0.00000000e+00, 2.53665689e-01, ...,
        7.44680851e-01, 1.00000000e+00, 1.50938190e-01],
       ...,
       [2.04420154e-03, 0.00000000e+00, 2.36436950e-01, ...,
        5.63829787e-01, 9.93847395e-01, 1.57560706e-01],
       [8.13600373e-04, 0.00000000e+00, 7.33137830e-02, ...,
        5.53191489e-01, 1.00000000e+00, 2.13024283e-01],
       [3.15801256e-02, 0.00000000e+00, 7.00879765e-01, ...,
        2.23404255e-01, 8.99364567e-01, 7.33719647e-01]])

### Linear Regression Fit

In [1288]:
lr = LinearRegression()
lr.fit(x_train_scaled, y_train)
pred = lr.predict(x_test_scaled)

In [1289]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, pred)

In [1290]:
print(mse)

21.640857630498978


In [1291]:
np.sqrt(mse)

4.651973519969667

In [1294]:
x_train_fix = []
y_train_fix = []
x_test_fix = []
y_test_fix = []
lr_fix = LinearRegression()
min_mse = 987654321.0
for i in range(100000):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
    x_train.shape, x_test.shape, y_train.shape, y_test.shape
    scaler = MinMaxScaler()
    scaler.fit(x_train)
    x_train_scaled = scaler.transform(x_train)
    x_test_scaled = scaler.transform(x_test)
    lr = LinearRegression()
    lr.fit(x_train_scaled, y_train)
    pred = lr.predict(x_test_scaled)
    from sklearn.metrics import mean_squared_error
    mse = mean_squared_error(y_test, pred)
    res = np.sqrt(mse)
    if res < min_mse:
        print(f"minval = {res}")
        min_mse = res
        x_train_fix = x_train
        x_test_fix = x_test
        y_train_fix = y_train
        y_test_fix = y_test
        lr_fix = lr

minval = 5.98157814286653
minval = 4.994826216287001
minval = 4.406633092214341
minval = 4.3256014228964705
minval = 3.649812970620455
minval = 3.6025435393505667
minval = 3.564242716936688
minval = 3.481265862816387
minval = 3.469461253634509


In [None]:
ans = pd.read_csv('./answer.csv')
ans.drop(['RAD'], axis='columns', inplace=True)
ans.drop(['CHAS'], axis='columns', inplace=True)
x_ans = ans.iloc[:,:-1].values.reshape(-1,total_cols - 1)
y_ans = ans.iloc[:,-1]
scaler_ans = MinMaxScaler()
scaler_ans.fit(x_ans)
x_ans_scaled = scaler_ans.transform(x_ans)
pred_ans = lr_fix.predict(x_ans_scaled)
mse_ans = mean_squared_error(y_ans, pred_ans)
print(f"ans result = {np.sqrt(mse_ans)}")