In [1]:
from pandas import read_csv, DataFrame
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split as split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

In [2]:
# Load the real-estate valuation dataset straight from its hosted CSV.
DATA_URL = "https://raw.githubusercontent.com/wooihaw/datasets/main/real_estate_valuation_dataset.csv"
df = read_csv(DATA_URL)

# Preview the first five rows as a quick sanity check of the columns.
df.head(n=5)

Unnamed: 0,Transaction date,House age,Distance to the nearest MRT station,Number of convenience stores,Latitude,Longitude,House price of unit area
0,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2
2,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3
3,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1


In [3]:
# The target is the unit-area house price; every other column is kept as a feature.
TARGET_COLUMN = "House price of unit area"
y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])

# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [4]:
# Labels for the preprocessing variants being compared; they are used as the
# dictionary keys (and later DataFrame column names) of the score tables.
keys = [
    'No scaling',
    'MinMaxScaler',
    'StandardScaler',
    'RobustScaler',
]

In [5]:
# One (initially empty) score list per preprocessing variant.
ridge_scores = {name: [] for name in keys}

# Baseline: Ridge on the raw, unscaled features for alpha = 0, 10, ..., 90.
# Each entry is the score returned by `Ridge.score` on the held-out test split.
for a in range(10):
    ridge = Ridge(alpha=a * 10)
    ridge.fit(X_train, y_train)
    ridge_scores['No scaling'].append(ridge.score(X_test, y_test))

In [6]:
# One (initially empty) score list per preprocessing variant.
knn_scores = {name: [] for name in keys}

# Baseline: k-nearest-neighbours regression on the raw, unscaled features
# for k = 1, ..., 10, scored on the held-out test split.
for k in range(1, 11):
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train, y_train)
    knn_scores['No scaling'].append(knn.score(X_test, y_test))

In [7]:
# Dump both score tables, one per line (Ridge first, then KNN).
print(ridge_scores, knn_scores, sep="\n")

{'No scaling': [0.5796624182861202, 0.556516464928763, 0.5558674264689027, 0.5549527235406654, 0.5540792727088005, 0.5533072217351883, 0.5526363643870258, 0.5520528826176566, 0.55154173693648, 0.5510897870391724], 'MinMaxScaler': [], 'StandardScaler': [], 'RobustScaler': []}
{'No scaling': [0.5946317867520786, 0.6551453782089411, 0.6551375481870814, 0.6439275458428156, 0.6095640001350416, 0.6069253588794992, 0.5902132914988957, 0.595895552280236, 0.6036374906037147, 0.6145513805490412], 'MinMaxScaler': [], 'StandardScaler': [], 'RobustScaler': []}


In [8]:
# Scalers listed in the same order as `keys`; the leading None stands in for
# the 'No scaling' entry, which has no transformer.
scalers = (None, MinMaxScaler(), StandardScaler(), RobustScaler())

for key, scl in zip(keys, scalers):
    if scl is None:
        # Nothing to fit for the unscaled variant; its scores are filled in
        # by the cells that train on the raw features.
        continue

    # Fit the scaler on the training split only and reuse it for the test
    # split, so the transformation never sees test-set statistics.
    Xs_train = scl.fit_transform(X_train)
    Xs_test = scl.transform(X_test)

    # Collect the scores in fresh lists and ASSIGN them (instead of appending
    # to the existing ones). This keeps the cell idempotent: re-running it
    # replaces the previous results rather than growing each list to 20, 30,
    # ... entries, which would make the columns unequal in length and break
    # the DataFrame construction further down.
    scaled_ridge_scores = []
    for a in range(10):  # alpha = 0, 10, ..., 90
        ridge = Ridge(alpha=a*10).fit(Xs_train, y_train)
        scaled_ridge_scores.append(ridge.score(Xs_test, y_test))
    ridge_scores[key] = scaled_ridge_scores

    scaled_knn_scores = []
    for k in range(1, 11):  # k = 1, ..., 10
        knn = KNeighborsRegressor(n_neighbors=k).fit(Xs_train, y_train)
        scaled_knn_scores.append(knn.score(Xs_test, y_test))
    knn_scores[key] = scaled_knn_scores


In [9]:
# Dump both score tables, one per line (Ridge first, then KNN).
print(ridge_scores, knn_scores, sep="\n")

{'No scaling': [0.5796624182861202, 0.556516464928763, 0.5558674264689027, 0.5549527235406654, 0.5540792727088005, 0.5533072217351883, 0.5526363643870258, 0.5520528826176566, 0.55154173693648, 0.5510897870391724], 'MinMaxScaler': [0.5796624182861772, 0.5571567546453898, 0.502203424641247, 0.4502586550162119, 0.4047576724116998, 0.3656105160775531, 0.3319736114583852, 0.3029415855959484, 0.27772101524633397, 0.25565754076047387], 'StandardScaler': [0.5796624182861826, 0.5845874777204771, 0.5874572333349921, 0.5891019025478204, 0.5899377136567463, 0.5901979048170207, 0.5900251406436804, 0.58951304861583, 0.5887266869711805, 0.5877134187936146], 'RobustScaler': [0.5796624182861827, 0.5835715405253634, 0.5847857459399439, 0.5843601601486879, 0.5829101466687434, 0.5808087497812973, 0.5782890861905312, 0.5755003683710598, 0.5725399272954069, 0.569472191415292]}
{'No scaling': [0.5946317867520786, 0.6551453782089411, 0.6551375481870814, 0.6439275458428156, 0.6095640001350416, 0.60692535887949

In [10]:
# Tabulate the Ridge scores (one column per preprocessing variant) and attach
# the alpha value that produced each row: 0, 10, ..., 90.
ridge_df = DataFrame.from_dict(ridge_scores).assign(
    alpha=list(range(0, 100, 10)),
)
ridge_df

Unnamed: 0,No scaling,MinMaxScaler,StandardScaler,RobustScaler,alpha
0,0.579662,0.579662,0.579662,0.579662,0
1,0.556516,0.557157,0.584587,0.583572,10
2,0.555867,0.502203,0.587457,0.584786,20
3,0.554953,0.450259,0.589102,0.58436,30
4,0.554079,0.404758,0.589938,0.58291,40
5,0.553307,0.365611,0.590198,0.580809,50
6,0.552636,0.331974,0.590025,0.578289,60
7,0.552053,0.302942,0.589513,0.5755,70
8,0.551542,0.277721,0.588727,0.57254,80
9,0.55109,0.255658,0.587713,0.569472,90


In [11]:
# Best score reached by each preprocessing variant (column-wise maximum).
# The `alpha` entry of the result is just the largest alpha in the table,
# not the alpha at which the best score occurred.
ridge_df.max(axis=0)

No scaling         0.579662
MinMaxScaler       0.579662
StandardScaler     0.590198
RobustScaler       0.584786
alpha             90.000000
dtype: float64

In [12]:
# Tabulate the KNN scores (one column per preprocessing variant) and attach
# the neighbour count that produced each row: 1, 2, ..., 10.
knn_df = DataFrame.from_dict(knn_scores).assign(
    k=range(1, 11),
)
knn_df

Unnamed: 0,No scaling,MinMaxScaler,StandardScaler,RobustScaler,k
0,0.594632,0.430464,0.436084,0.442736,1
1,0.655145,0.574485,0.643574,0.639329,2
2,0.655138,0.627923,0.636407,0.67452,3
3,0.643928,0.655488,0.629162,0.66541,4
4,0.609564,0.640904,0.637958,0.684393,5
5,0.606925,0.633756,0.645706,0.686602,6
6,0.590213,0.628055,0.645879,0.690344,7
7,0.595896,0.633602,0.652464,0.691054,8
8,0.603637,0.644371,0.658866,0.686092,9
9,0.614551,0.635001,0.655669,0.674779,10


In [13]:
# Best score reached by each preprocessing variant (column-wise maximum).
# The `k` entry of the result is just the largest k in the table,
# not the k at which the best score occurred.
knn_df.max(axis=0)

No scaling         0.655145
MinMaxScaler       0.655488
StandardScaler     0.658866
RobustScaler       0.691054
k                 10.000000
dtype: float64