# Classifying Houses by Features

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [21]:
data = pd.read_csv('housing.csv')
    
    
#Giving you column names to make it easy for you
x_columns = ["longitude","latitude","housing_median_age","total_rooms","total_bedrooms","population","households","median_income"]
y_column = ["median_house_value"]

#DROP NAN ROWS HERE
data = data.drop(columns = ['ocean_proximity'])
data = data.dropna()

In [22]:
data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0
...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0


# Data Cleaning and Feature Creation

In [3]:
# Making Features from categorical variables
# If you can see the dataframe, it's obvious that our algorithm can't take in the column ocean_proximity, 
# since it isn't in the form of numbers, which is the input to our linear equation. 
# Let's do a simple transformation and turn this into a simple binary classification

In [28]:
#Normalizing Vectors 

#Task: Implement min/max data normalization on continous data 
#Inputs: Dataframe and list of column names which you want to normalize
#Outputs: Dataframe with normalized columns

# Pseudocode data_normalized = data / max of data


def normalize_data(df,columns):
    #Empty dataframe
    normalized_df = pd.DataFrame()
    for column in columns:
        normalized_df[column] = df[column] / df[column].max()
        
    #ENTER CODE HERE
    
    
    return normalized_df

In [29]:
norm_data = normalize_data(data,["longitude","latitude","housing_median_age","total_rooms","total_bedrooms","population","households","median_income"])

In [31]:
norm_data

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,1.069285,0.902980,0.788462,0.022380,0.020016,0.009024,0.020717,0.555010
1,1.069198,0.902503,0.403846,0.180544,0.171606,0.067289,0.187110,0.553423
2,1.069373,0.902265,1.000000,0.037309,0.029480,0.013901,0.029102,0.483823
3,1.069460,0.902265,1.000000,0.032401,0.036462,0.015638,0.036008,0.376204
4,1.069460,0.902265,1.000000,0.041378,0.043445,0.015834,0.042585,0.256412
...,...,...,...,...,...,...,...,...
20635,1.059312,0.941120,0.480769,0.042345,0.058029,0.023681,0.054258,0.104019
20636,1.060362,0.941359,0.346154,0.017726,0.023274,0.009977,0.018744,0.170452
20637,1.060450,0.939928,0.326923,0.057325,0.075252,0.028222,0.071194,0.113333
20638,1.061324,0.939928,0.346154,0.047304,0.063460,0.020767,0.057382,0.124479


In [33]:
#turn normalized data into a numpy array that we can do things with these values
x = norm_data.values
y = data['median_house_value'].values

x

In [35]:
y

array([452600., 358500., 352100., ...,  92300.,  84700.,  89400.])

# Training our Model

Using classial linear regression model

In [55]:
#Setting up hyperparameters

#Alpha is our learning rate, I'll explain this later
alpha = 0.01

iterations = 10000
#Iterations is how many times we will run our algorithm

In [56]:
x.shape

(20433, 8)

In [57]:
x.shape[0]

20433

In [58]:
np.random.random(8)

array([0.09067642, 0.16419097, 0.02668727, 0.88867259, 0.02644456,
       0.25535033, 0.86590954, 0.04257898])

In [77]:
# Note: np.dot() is simply matrix and vector multiplication, while np.inner() is the standard definition of dot product

def train(x,y,iterations,alpha):
    #Setting up variables
    
    #Storing our cost function, Mean Squared Error
    history = []
    
    #Finding the number of weights we need and also the number of samples
    num_weights = x.shape[1] 
    n = x.shape[0]
    
    #Initializing our weights to random numbers
    np.random.seed(69420)
    weights = np.random.random(num_weights)
    
    #iterating through each training step
    for i in range(iterations):
        
        #testing the model and finding the error
        predictions = np.inner(x,weights)
        error = predictions - y
        
        #finding the mean squared error of the current weights and then add it to our mse list
        rmse = np.sqrt((1/2*n)*np.sum(error**2))
        history.append(rmse)
        
        #finding the gradient and then defining the new weights using this gradient
        gradient = (1/n) * np.dot(x.T,error)
        weights = weights - (alpha*gradient)
    
    return weights,history    

In [78]:
weights,history = train(x,y,iterations, alpha)

In [79]:
#Plot line graph of RMSE over each iterat
weights, history = train(x,y)
plt.xlabel('MRSE')
plt.ylabel()

In [80]:
#Finding Final RMSE
history[-1]

# Adding a bias term

In [89]:
#Add bias term to linear model for better performance!
ones = np.ones((x.shape[0]),1)
x = np.hstack((ones,x))

TypeError: data type not understood

In [82]:
weights,history = train(x,y,iterations, alpha)

In [83]:
#Plot line graph of RMSE over each iteration


In [84]:
#Finding Average Error

predictions = np.inner(x,weights)
error = predictions - y
average_error =  np.sum(abs(error)) / n
average_error

NameError: name 'n' is not defined

In [None]:
y

# Binary Classification

In [93]:
#Greater than amount, gets label 1, lesser than gets -1
def apply_binary(y,num):
    y_binary = np.zeros_like(y)
    for idx,sample in enumerate(y):
            if sample>num:
                y_binary[idx] = 1
            else:
                y_binary[idx] = -1
    #create new ndarray with binary labels instead of 
        
    return y_binary     

In [94]:
y

array([452600., 358500., 352100., ...,  92300.,  84700.,  89400.])

In [95]:
#Apply binary labels to your data
y_binary = apply_binary(y,20000)
#Using the same train function you created earlier
weights,history = train(x,y_binary,iterations, alpha)

In [96]:
y_binary

array([1., 1., 1., ..., 1., 1., 1.])

In [97]:
weights

array([ 0.58005057,  0.33273329,  0.06957184,  0.1155244 ,  0.22592917,
       -0.08303475,  0.23116269,  0.10912078])

In [102]:
classifications = np.sign(np.inner(x,weights))

In [103]:
#Calculate accuracy of model
np.mean(classifications == y_binary)

0.9997552978025742

In [105]:
#To test how your classifier does in comparison to off the shelf algorithms
from sklearn import linear_model

In [106]:
clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
clf.fit(x, y_binary)
predicted = clf.predict(x)
np.mean(predicted == y_binary)

0.9997552978025742