In [53]:
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.cluster import KMeans
california_housing = fetch_california_housing(as_frame=True)

In [7]:
def distance_point_to_line(point, line):
    a, b = line
    pa = np.array(a)
    pb = np.array(b)
    p = np.array(point)
    return np.linalg.norm(np.cross(pb-p, pa-p))/np.linalg.norm(pb-pa)

In [67]:
# mean square error 
def mean_square_error(houses, line):
    mse = 0
    for house in houses:
        mse += distance_point_to_line(house, line) ** 2
    return mse / len(houses)

In [8]:
def total_distance_to_line(houses, line):
    total_distance = 0
    for house in houses:
        total_distance += distance_point_to_line(house, line)
    return total_distance

In [9]:
def efficient_line(houses, iterations=1000, learning_rate=0.01):
    line = (np.random.rand(2), np.random.rand(2))
    for _ in range(iterations):
        gradient = np.zeros((2,2))
        for house in houses:
            pa = np.array(line[0])
            pb = np.array(line[1])
            p = np.array(house)
            gradient[0] += np.cross(pb-p, pa-p)
            gradient[1] += np.cross(pa-p, pb-p)
        line = (line[0] - learning_rate * gradient[0], line[1] - learning_rate * gradient[1])
    return line

In [34]:
def fair_line(houses, tolerance=0.01):
    left = min(houses, key=lambda x: x[0])
    right = max(houses, key=lambda x: x[0])
    line = ((right[1] - left[1]) / 2, (left[0] + right[0]) / 2)  
    while True:
        max_distance = max(distance_point_to_line(house, line) for house in houses)
        left_line = ((right[1] - left[1]) / 2 - tolerance, (left[0] + right[0]) / 2)
        right_line = ((right[1] - left[1]) / 2 + tolerance, (left[0] + right[0]) / 2)
        if distance_point_to_line(left, left_line) < distance_point_to_line(left, right_line):
            right = line
        else:
            left = line
        new_line = ((right[1] - left[1]) / 2, (left[0] + right[0]) / 2)
        if abs(max_distance - max(distance_point_to_line(house, new_line) for house in houses)) < tolerance:
            return new_line

In [56]:
def multiple_efficient_lines(houses, k):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(houses)
    lines = []
    for center in kmeans.cluster_centers_:      
        closest_house_idx = np.argmin(np.linalg.norm(houses - center, axis=1))
        closest_house = houses[closest_house_idx]
        lines.append((center - closest_house, closest_house))
    return lines

In [48]:
california_housing = fetch_california_housing()
houses = california_housing.data[:, [6, 7]]

In [59]:
print(efficient_line(houses[:1000]))

(array([2.07590872e+20, 2.07590872e+20]), array([-2.07590872e+20, -2.07590872e+20]))


In [None]:
print(houses)

In [52]:
print(fair_line(houses))

(-79.98750000000001, 18.857499999999998)


In [57]:
print("multiple Efficient lines" , multiple_efficient_lines(california_housing.data.to_numpy(), 3))

multiple Efficient lines [(array([ 0.16532823, -0.18724832,  0.90271494,  0.05134236, -0.86234899,
        0.86299025,  1.6052953 , -1.30391946]), array([   3.7312    ,   31.        ,    4.6350211 ,    1.05696203,
        944.        ,    1.99156118,   34.16      , -118.38      ])), (array([ 1.24587075e+00,  6.78260870e+00,  7.60454359e-02,  2.61181613e-04,
       -5.92885375e-01,  4.73776642e+00,  6.28794466e-01, -1.83120553e+00]), array([ 2.90720000e+00,  8.00000000e+00,  5.38984509e+00,  1.07358003e+00,
        6.13300000e+03,  2.63898451e+00,  3.45400000e+01, -1.17220000e+02])), (array([-1.63933978,  1.79231945, -1.70329644,  0.00760077, -3.45949561,
       -0.42069289,  1.03677302, -0.55468666]), array([ 5.40910000e+00,  2.20000000e+01,  6.81918239e+00,  1.05817610e+00,
        2.34700000e+03,  3.69025157e+00,  3.42600000e+01, -1.18740000e+02]))]
