In [26]:
import copy

import pandas as pd
import numpy as np

In [27]:
data = pd.read_csv('data/housing.csv',delimiter=',',nrows=1000)
print(data.head(5))

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  


In [42]:
X = data[['longitude','latitude','housing_median_age','total_rooms','total_bedrooms','population','households','median_income']].to_numpy()
y = data['median_house_value'].to_numpy()

In [64]:
print(X[550])
print(y)

[-122.26     37.77     52.     1565.      315.      637.      297.
    4.7778]
[452600. 358500. 352100. 341300. 342200. 269700. 299200. 241400. 226700.
 261100. 281500. 241800. 213500. 191300. 159200. 140000. 152500. 155500.
 158700. 162900. 147500. 159800. 113900.  99700. 132600. 107500.  93800.
 105500. 108900. 132000. 122300. 115200. 110400. 104900. 109700.  97200.
 104500. 103900. 191400. 176000. 155400. 150000. 118800. 188800. 184400.
 182300. 142500. 137500. 187500. 112500. 171900.  93800.  97500. 104200.
  87500.  83100.  87500.  85300.  80300.  60000.  75700.  75000.  86100.
  76100.  73500.  78400.  84400.  81300.  85000. 129200.  82500.  95200.
  75000.  67500. 137500. 177500. 102100. 108300. 112500. 131300. 162500.
 112500. 112500. 137500. 118800.  98200. 118800. 162500. 137500. 500001.
 162500. 137500. 162500. 187500. 179200. 130000. 183800. 125000. 170000.
 193100. 257800. 273400. 237500. 350000. 335700. 313400. 268500. 259400.
 275700. 225000. 262500. 218500. 255000. 2241

In [50]:
print(np.isnan(X).sum(), np.isnan(y).sum())  # Kiểm tra số lượng NaN
print(np.isinf(X).sum(), np.isinf(y).sum())  # Kiểm tra số lượng Inf


6 0
0 0


In [55]:
w_init = np.random.randn(8) * 0.1  # Tăng hệ số khởi tạo
b_init = 0.0

print(w_init,b_init)

[ 0.05683226 -0.07698664 -0.19250184 -0.04630793 -0.03689946 -0.01468299
  0.26445092  0.06220638] 0.0


In [56]:
def compute_cost(X,y,w,b):
    m = X.shape[0]
    cost = 0.0
    for i in range(m):
        f_wb_i = np.dot(X[i], w) + b
        cost = cost + (f_wb_i - y[i])**2
    cost = cost / (2 * m)
    return cost

In [57]:
mask = ~np.isnan(X).any(axis=1)  # Chỉ giữ lại dòng không có NaN
X_clean = X[mask]
y_clean = y[mask]
cost = compute_cost(X_clean,y_clean,w_init,b_init)
print(cost)

25979133468.62565


In [58]:
def compute_gradient(X,y,w,b):
    m,n = X.shape
    dj_dw = np.zeros((n,))
    dj_db = 0.
    
    for i in range(m):
        err = (np.dot(X[i],w) + b) - y[i]
        for j in range(n):
            dj_dw[j] = dj_dw[j] + err * X[i,j]
        dj_db = dj_db + err
    dj_dw = dj_dw / m
    dj_db = dj_db / m
    
    return dj_db,dj_dw

In [59]:
def gradient_descent(X,y,w_in,b_in,cost_function,gradient_function,alpha,num_iters):
    w = copy.deepcopy(w_in)
    b = b_in
    
    for i in range(num_iters):
        dj_db, dj_dw = gradient_function(X,y,w,b)
        
        w = w - alpha * dj_dw
        b = b - alpha * dj_db
        
    return w,b

In [61]:
print("NaN trong X:", np.isnan(X_clean).sum())
print("NaN trong y:", np.isnan(y_clean).sum())
print("NaN trong w:", np.isnan(w_init).sum())

print("Inf trong X:", np.isinf(X_clean).sum())
print("Inf trong y:", np.isinf(y_clean).sum())
print("Inf trong w:", np.isinf(w_init).sum())


NaN trong X: 0
NaN trong y: 0
NaN trong w: 0
Inf trong X: 0
Inf trong y: 0
Inf trong w: 0


In [63]:
X_clean = (X_clean - np.mean(X_clean, axis=0)) / np.std(X_clean, axis=0)
y_clean = (y_clean - np.mean(y_clean)) / np.std(y_clean)

iterations = 1000
alpha = 1e-3
w_final, b_final = gradient_descent(X_clean,y_clean,w_init,b_init,compute_cost,compute_gradient,alpha,iterations)
print(w_final,b_final)

[ 0.02328311  0.09330958 -0.07574369  0.00634056 -0.0737007  -0.06828302
  0.23758269  0.48078991] -2.3476408526770018e-14


In [65]:
test = X_clean[505]
prediction = np.dot(w_final,test) + b_final
print("Real value : ",y_clean[505])
print("Prediction value : ",prediction)

Real value :  -0.9293990720709878
Prediction value :  -0.21603880156876254
