In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('/Users/mac/Desktop/Project/Sample/1553768847-housing.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  int64  
 3   total_rooms         20640 non-null  int64  
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  int64  
 6   households          20640 non-null  int64  
 7   median_income       20640 non-null  float64
 8   ocean_proximity     20640 non-null  object 
 9   median_house_value  20640 non-null  int64  
dtypes: float64(4), int64(5), object(1)
memory usage: 1.6+ MB


In [3]:
df.replace(np.nan, df['total_bedrooms'].mean(), inplace = True)

In [4]:
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25,1665,374.0,845,330,1.5603,INLAND,78100
20636,-121.21,39.49,18,697,150.0,356,114,2.5568,INLAND,77100
20637,-121.22,39.43,17,2254,485.0,1007,433,1.7000,INLAND,92300
20638,-121.32,39.43,18,1860,409.0,741,349,1.8672,INLAND,84700


In [5]:
df['ocean_proximity'].value_counts()

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

In [6]:
def ocean_proximity (x):
    if x == '<1H OCEAN':
        return 1
    elif x == 'NEAR OCEAN':
        return 1
    elif x == 'NEAR BAY':
        return 1
    elif x == 'ISLAND':
        return 1
    elif x == 'INLAND':
        return 0
df['ocean_proximity'] = df.apply(lambda row: ocean_proximity(row['ocean_proximity']), axis = 1)

In [7]:
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,1,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,1,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,1,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,1,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,1,342200
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25,1665,374.0,845,330,1.5603,0,78100
20636,-121.21,39.49,18,697,150.0,356,114,2.5568,0,77100
20637,-121.22,39.43,17,2254,485.0,1007,433,1.7000,0,92300
20638,-121.32,39.43,18,1860,409.0,741,349,1.8672,0,84700


In [8]:
df['housing_median_age'] = df['housing_median_age'] / df['housing_median_age'].max()
df['total_rooms'] = df['total_rooms'] / df['total_rooms'].max()
df['total_bedrooms'] = df['total_bedrooms'] / df['total_bedrooms'].max()
df['population'] = df['population'] / df['population'].max()
df['households'] = df['households'] / df['households'].max()
df['median_income'] = df['median_income'] / df['median_income'].max()
df['median_house_value'] = df['median_house_value'] / df['median_house_value'].max()

In [9]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df[['housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']], df[['median_house_value']], test_size = 0.2)

In [10]:
x_train = np.array(x_train)
y_train = np.array(y_train)

In [11]:
print('x_train.shape {}, y_train.shape {}'.format(x_train.shape, y_train.shape))


x_train.shape (16512, 6), y_train.shape (16512, 1)


In [12]:
def compute_cost (w, b, x, y, lambda_):
    m = x.shape[0]
    n = len(w)
    cost = 0
    reg_cost = 0
    
    for i in range(m):
        y_predict = np.dot(w, x[i]) + b
        cost += (y_predict - y[i]) ** 2
    cost = cost / (2 * m)
    
    for j in range(n):
        reg_cost += (w[j] ** 2)
    reg_cost = (lambda_ / (2 * m)) * reg_cost
    
    total_cost = cost + reg_cost
    print('Cost at optimal w is: ', total_cost)

In [13]:
w = np.random.rand(x_train.shape[1])
b = 0.5
lambda_ = 0.7
compute_cost(w, b, x_train, y_train, lambda_)

Cost at optimal w is:  [0.38739348]


In [14]:
def compute_gardient_descent(w, b, x, y, lambda_, learning_rate):
    m = x.shape[0]
    n = x.shape[1]
    
    dw = np.zeros((n, ))
    db = 0
    
    for i in range(m):
        y_predict = np.dot(w, x[i]) + b
        for j in range(n):
            dw[j] = dw[j] + (y_predict * x[i][j])
        db += y_predict - y[i]
    dw = dw / m
    db = db / m
    
    reg_dw = np.zeros((n,))
    for j in range(n):
        reg_dw[j] = (lambda_ / m) * w[j]
    dw = dw + reg_dw
    
    w = w - learning_rate * dw
    b = b - learning_rate * db 
   
    print('w is ', w)
    print('b is ', b)    

In [15]:
w = np.random.rand(x_train.shape[1])
b = 0.5
lambda_ = 0.7
learning_rate = 0.001
compute_gardient_descent(w, b, x_train, y_train, lambda_, learning_rate)

w is  [0.94278559 0.81939804 0.1759495  0.02642696 0.79706498 0.32308846]
b is  [0.49917473]


In [16]:
w = np.array([0.56662641, 0.34140304, 0.13303095, 0.38600769, 0.10100071, 0.04401588])
b = np.array([0.499532])

m = x_train.shape[0]
for i in range(m):
    print('prediction value', np.dot(w, x_train[i]) + b)
    print('target value', y_train[i])

prediction value [0.88196027]
target value [0.5477989]
prediction value [0.7570394]
target value [0.30519939]
prediction value [0.695543]
target value [0.31159938]
prediction value [0.90187035]
target value [0.74539851]
prediction value [0.92572133]
target value [0.5519989]
prediction value [0.68221515]
target value [0.8501983]
prediction value [0.69508957]
target value [0.35499929]
prediction value [0.93579966]
target value [0.42339915]
prediction value [0.89860671]
target value [0.18099964]
prediction value [0.78851374]
target value [0.57039886]
prediction value [0.87733197]
target value [0.50899898]
prediction value [1.16253446]
target value [1.]
prediction value [0.89705968]
target value [1.]
prediction value [0.96298455]
target value [0.44339911]
prediction value [0.7405656]
target value [0.26199948]
prediction value [0.96732735]
target value [0.67639865]
prediction value [1.1052832]
target value [0.66059868]
prediction value [0.81173323]
target value [0.28139944]
prediction value