In [3]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# Load the housing CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/california_housing/housing.csv")

In [18]:
# Display the "index" column that reset_index() materialises from the frame's existing index.
df.reset_index()["index"]

0            0
1            1
2            2
3            3
4            4
         ...  
20635    20635
20636    20636
20637    20637
20638    20638
20639    20639
Name: index, Length: 20640, dtype: int64

In [4]:
# Regression target: the median house value, modelled on a log10 scale.
label_col = "median_house_value"
df_label = np.log10(df[label_col])

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation. Fixing random_state makes the shuffle,
# and therefore every downstream metric, reproducible across kernel restarts.
df_train, df_test, y_train, y_test = train_test_split(
    df.drop(columns=[label_col]),
    df_label,
    test_size=0.1,
    random_state=42,
)

In [6]:
# Numeric feature view: every training column except "ocean_proximity",
# which is handled separately as a categorical attribute.
housing_num = df_train.drop(columns=["ocean_proximity"])

In [7]:


# Positional indices of the raw columns that transform() reads from its input array.
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6


class CombineAttributesAdder(BaseEstimator, TransformerMixin):
    """Append per-household (and optionally per-room) ratio columns to a 2-D array.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default=True
        When true, a bedrooms/rooms ratio column is appended after the two
        per-household ratio columns.
    """

    def __init__(self, add_bedrooms_per_room=True):
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with the ratio columns stacked on its right-hand side.

        ``X`` is indexed as ``X[:, i]``, so it must support 2-D positional
        indexing (e.g. a NumPy array). Columns are located through the
        module-level ``*_ix`` indices.
        """
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        # Order matters: rooms/household, population/household, then the optional ratio.
        derived = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            derived.append(X[:, bedrooms_ix] / rooms)
        return np.c_[(X, *derived)]

# Exercise the transformer directly on the raw training array, without the
# optional bedrooms-per-room column. The result is not referenced again in the
# cells shown here.
attr_adder = CombineAttributesAdder(add_bedrooms_per_room=False)
housing_extra_atrribs = attr_adder.transform(df_train.values)

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing: fill missing values with each column's median, then
# standardise every column.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        # ("attribs_adder", CombineAttributesAdder()),  # derived-ratio step, currently disabled
        ("std_scaler", StandardScaler()),
    ]
)

housing_num_tr = num_pipeline.fit_transform(housing_num)

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Route columns by type: numeric columns go through num_pipeline, the single
# categorical column is one-hot encoded.
cat_attribs = ["ocean_proximity"]
num_attribs = housing_num.columns.tolist()
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ]
)

# Fit the preprocessing on the training features and transform them in one pass.
housing_prepared = full_pipeline.fit_transform(df_train)

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Baseline model: a linear regression fitted on the prepared training features
# against the log10-transformed target.
lin_reg = LinearRegression()
# lin_reg = RandomForestRegressor()  # alternative model, currently disabled
lin_reg.fit(housing_prepared, y_train)

LinearRegression()

In [11]:

# Apply the already-fitted preprocessing to the held-out features (transform only, no refit).
df_test_prepared = full_pipeline.transform(df_test)

In [12]:
# Predictions are on the log10 scale, matching the target the model was fitted on.
y_pred = lin_reg.predict(df_test_prepared)

In [13]:
from sklearn.metrics import mean_squared_error
    
# def compute_metric(y_label, y_pred):
#     return np.sqrt(mean_squared_error(y_label, y_pred))

def compute_metric(y_label, y_pred):
    """Symmetric mean absolute percentage error (SMAPE), as a percentage.

    Computes ``100 * mean(|y - y_hat| / (|y| + |y_hat|))``. Each term lies in
    [0, 1], so the result lies in [0, 100].

    Parameters
    ----------
    y_label, y_pred : array-like of numbers
        True and predicted values. Both are converted to float NumPy arrays
        and compared by position (any pandas index is ignored).

    Returns
    -------
    float
        The SMAPE in percent. A pair where both values are exactly zero is a
        perfect prediction and contributes zero error instead of ``0/0 = NaN``.
    """
    y_label = np.asarray(y_label, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_label - y_pred)
    scale = np.abs(y_label) + np.abs(y_pred)

    # Divide only where the denominator is non-zero; the remaining entries keep
    # the 0.0 they were initialised with (scale == 0 implies abs_error == 0).
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return 100 * np.mean(ratio)
    
# Targets and predictions are log10 values, so raise 10 to them to score in the original units.
print(compute_metric(np.power(10, y_test), np.power(10, y_pred)))
# print(compute_metric(y_test, y_pred))  # same metric computed directly on the log10 values


12.040756104752782


In [17]:
# Summary statistics for the numeric columns of the full (unsplit) dataset.
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [19]:
# Scratch cell: draws 10 random numbers without a seed; the result is not assigned to anything.
np.random.rand(10)

array([0.45826928, 0.19444591, 0.88121914, 0.07338111, 0.10842913,
       0.08688114, 0.53317678, 0.76647431, 0.8607615 , 0.4673663 ])

In [20]:
# Row count of the full dataset.
len(df)

20640

In [None]:
# Scratch cell: only the last expression is displayed. dropna() returns a new
# frame that is not assigned here, so df itself is left unchanged.
df.housing_median_age
df.dropna()