# Exercises

In [111]:
import tarfile
import urllib.request
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.stats import randint
from sklearn.base import BaseEstimator , TransformerMixin
from sklearn.cluster import KMeans
from sklearn.compose import make_column_selector , make_column_transformer , ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

def load_housing_data():
    """Load the housing dataset into a DataFrame.

    When ``datasets/housing.tgz`` is not present, the ``datasets`` directory is
    created, the archive is downloaded and then unpacked into ``datasets``.
    In every case the CSV at ``datasets/housing/housing.csv`` is read and returned.
    """
    archive = Path("datasets/housing.tgz")
    archive_missing = not archive.is_file()
    if archive_missing:
        Path("datasets").mkdir(parents=True , exist_ok=True)
        source_url = "https://github.com/ageron/data/raw/main/housing.tgz"
        urllib.request.urlretrieve(source_url , archive)
        # Extraction happens only right after a download, never for an archive
        # that was already on disk.
        with tarfile.open(archive) as tgz:
            tgz.extractall(path="datasets" , filter="data")
    csv_path = Path("datasets/housing/housing.csv")
    return pd.read_csv(csv_path)


housing_full = load_housing_data()

In [112]:
# Bucket median_income into five ordered categories labelled 1 to 5 and store
# them in a new "income_cat" column.
housing_full["income_cat"] = pd.cut(
    housing_full["median_income"],
    labels=[1 , 2 , 3 , 4 , 5],
    bins=[0 , 1.5 , 3.0 , 4.5 , 6 , np.inf],
)


In [101]:
# Stratified train/test split: 20% of the rows go to the test set and both
# parts keep the income-category proportions of the full dataset.
strat_train_set , strat_test_set = train_test_split(
    housing_full,
    stratify=housing_full["income_cat"],
    test_size=0.2,
    random_state=42,
)

In [102]:
# "income_cat" was only needed for stratification, so remove it from both splits.
# Re-binding the names (instead of inplace=True) avoids mutating frames that
# were sliced out of housing_full, and errors="ignore" keeps this cell safe to
# re-run after the column is already gone.
strat_train_set = strat_train_set.drop(columns="income_cat" , errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat" , errors="ignore")

In [103]:
# Split the training set into the target column (labels) and the predictors.
housing_labels = strat_train_set["median_house_value"].copy()
housing = strat_train_set.drop(columns=["median_house_value"])

In [104]:
class ClusterSimilarity(BaseEstimator , TransformerMixin):
    """Transformer that maps samples to RBF similarities with k-means cluster centres.

    ``fit`` runs KMeans on the input; ``transform`` returns, for every sample,
    its RBF-kernel similarity to each learned cluster centre.
    """

    def __init__(self , n_clusters=10 , gamma=1.0 , random_state=None):
        # Only store the hyper-parameters; all learning happens in fit().
        self.random_state = random_state
        self.gamma = gamma
        self.n_clusters = n_clusters

    def fit(self , X , y=None , sample_weight=None):
        """Fit the internal KMeans model on X (y is ignored) and return self."""
        clusterer = KMeans(n_clusters=self.n_clusters , random_state=self.random_state)
        clusterer.fit(X , sample_weight=sample_weight)
        self.kmeans_ = clusterer
        return self

    def transform(self , X):
        """Return the RBF similarity of each row of X to every cluster centre."""
        centres = self.kmeans_.cluster_centers_
        return rbf_kernel(X , centres , gamma=self.gamma)

    def get_feature_names_out(self , names=None):
        """Return one output name per cluster; the ``names`` argument is not used."""
        labels = []
        for idx in range(self.n_clusters):
            labels.append(f"Cluster {idx} similarity")
        return labels
        

def column_ratio(X):
    """Divide the first column of X by its second column, element-wise.

    X is indexed like a 2-D array; list-style column indexing keeps the
    result as a single column instead of flattening it.
    """
    numerator = X[: , [0]]
    denominator = X[: , [1]]
    return numerator / denominator

def ratio_name(function_transformer , feature_names_in):
    """Return the output feature names for the ratio transformer.

    Both arguments are accepted but not used: the output is always the
    single name "ratio".
    """
    names_out = ["ratio"]
    return names_out

def ratio_pipeline():
    """Build a new pipeline: median imputation, column ratio, then standard scaling.

    A fresh pipeline object is created on every call.
    """
    stages = (
        SimpleImputer(strategy="median"),
        FunctionTransformer(column_ratio , feature_names_out=ratio_name),
        StandardScaler(),
    )
    return make_pipeline(*stages)

# Log-transform pipeline for the columns listed under "log" below.
log_pipeline = make_pipeline(SimpleImputer(strategy="median"),
                            FunctionTransformer(np.log , feature_names_out="one-to-one"),
                            StandardScaler())

cluster_simil = ClusterSimilarity(n_clusters=10 , gamma=1 , random_state=42)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. This pipeline was referenced below without ever being defined in the
# notebook, which raises a NameError on a fresh kernel.
cat_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"),
                            OneHotEncoder(handle_unknown="ignore"))

# Fallback for every numeric column not named explicitly in the transformer.
default_num_pipeline = make_pipeline(SimpleImputer(strategy="median"),
                                    StandardScaler())

preprocessing = ColumnTransformer([
    ("bedrooms" , ratio_pipeline() , ["total_bedrooms" , "total_rooms"]),
    ("rooms_per_house" , ratio_pipeline() , ["total_rooms" , "households"]),
    ("people_per_house" , ratio_pipeline() , ["population" , "households"]),
    ("log" , log_pipeline , ["total_bedrooms" , "total_rooms" , "population" , "households" , "median_income"]),
    ("geo" , cluster_simil , ["latitude" , "longitude"]),
    ("cat" , cat_pipeline , make_column_selector(dtype_include=object))
],
remainder=default_num_pipeline) #remaining column: housing_median_age


housing_prepared = preprocessing.fit_transform(housing)

In [105]:
# Random forest regression on top of the shared preprocessing, with a
# randomized search over the hyper-parameters.
full_pipeline_rfr = Pipeline(steps=[
    ("preprocessing" , preprocessing),
    ("random_forest" , RandomForestRegressor(random_state=42)),
])

# Distributions sampled by the search: number of geo clusters in the
# preprocessing step and max_features of the forest.
param_distr = {
    'preprocessing__geo__n_clusters' : randint(low=3 , high=50),
    'random_forest__max_features' : randint(low=2 , high=20),
}

# 10 sampled candidates, each scored with 3-fold cross-validation.
rnd_search = RandomizedSearchCV(
    estimator=full_pipeline_rfr,
    param_distributions=param_distr,
    n_iter=10,
    cv=3,
    scoring='neg_root_mean_squared_error',
    random_state=42,
)

In [124]:
#learning with RandomForestRegressor using randomized search for best hyperparams and SelectFromModel
from sklearn.feature_selection import SelectFromModel

# SelectFromModel is only a transformer, so the pipeline needs a real regressor
# as its last step before a search can score it with an RMSE-based metric.
# Step names now match what each step contains.
full_pipeline_rfr_sel = Pipeline([
    ("preprocessing" , preprocessing),
    ("selector" , SelectFromModel(estimator=RandomForestRegressor(random_state=42))),
    ("random_forest" , RandomForestRegressor(random_state=42)),
])

# NOTE(review): 'random_forest__max_features' now addresses the final regressor,
# which only sees the columns kept by the selector - confirm the sampled range
# is still sensible for the reduced feature count.
param_distr = {'preprocessing__geo__n_clusters' : randint(low=3 , high=50),
              'random_forest__max_features' : randint(low=2 , high=20)}

# Search over the selector pipeline built in this cell, not full_pipeline_rfr.
# This re-binds param_distr and rnd_search from the plain random-forest cell.
rnd_search = RandomizedSearchCV(full_pipeline_rfr_sel , param_distributions=param_distr ,
                                n_iter=10 , cv=3 , scoring='neg_root_mean_squared_error' ,
                               random_state=42)

In [129]:
#learning with support vector machine regressor (SVR)
#first using grid search

from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

full_pipeline_svr = Pipeline(steps=[
    ("preprocessing" , preprocessing),
    ("svm_regressor" , SVR()),
])

# Two sub-grids in one list: the linear kernel only varies C, the RBF kernel
# varies C and gamma. Keeping both in one search makes the kernels comparable.
param_grid = [
    {
        "svm_regressor__kernel" : ["linear"],
        "svm_regressor__C" : [10 , 50 , 100 , 500 , 1000 , 5000],
    },
    {
        "svm_regressor__kernel" : ["rbf"],
        "svm_regressor__C" : [10 , 50 , 100 , 500 , 1000 , 5000],
        "svm_regressor__gamma" : [0.01 , 0.05 , 0.1 , 0.5 , 1 , 5],
    },
]

grid_search_svm = GridSearchCV(
    estimator=full_pipeline_svr,
    param_grid=param_grid,
    cv=3,
    scoring='neg_root_mean_squared_error',
)

In [154]:
#using RandomizedSearchCV
from scipy.stats import expon, loguniform

# Kernel is drawn from a list; C and gamma are drawn from continuous
# distributions instead of a fixed grid.
param_dist = {
    'svm_regressor__kernel': ['linear', 'rbf'],
    'svm_regressor__C': loguniform(20, 200_000),
    'svm_regressor__gamma': expon(scale=1.0),
}

# 10 sampled candidates, each evaluated with 3-fold cross-validation.
rnd_search_svm = RandomizedSearchCV(
    estimator=full_pipeline_svr,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring="neg_root_mean_squared_error",
    random_state=42,
)

In [None]:
# Run the SVR grid search on the training predictors and labels.
grid_search_svm.fit(housing , housing_labels)
# The search is scored with 'neg_root_mean_squared_error', so negate the best
# score to get a positive RMSE.
grid_search_svm_rmse = -grid_search_svm.best_score_

In [155]:
# Run the SVR randomized search on the training predictors and labels.
rnd_search_svm.fit(housing , housing_labels)
# The search is scored with "neg_root_mean_squared_error", so negate the best
# score to get a positive RMSE.
rnd_search_svm_rmse = -rnd_search_svm.best_score_

In [156]:
# Best cross-validated RMSE of the grid search, then of the randomized search,
# followed by the winning parameters of the randomized search.
print(grid_search_svm_rmse)
print(rnd_search_svm_rmse)
rnd_search_svm.best_params_

# TODO: evaluate on the test set once a final model has been chosen
# (final_model is not defined anywhere in this part of the notebook).
#testing model
#X_test = strat_test_set.drop("median_house_value" , axis=1)
#y_test = strat_test_set["median_house_value"].copy()

#final_predictions = final_model.predict(X_test)

#final_rmse = root_mean_squared_error(y_test , final_predictions)
#print(final_rmse)

61951.67609348383
53878.52706541228


{'svm_regressor__C': 157055.10989448498,
 'svm_regressor__gamma': 0.26497040005002437,
 'svm_regressor__kernel': 'rbf'}

In [157]:
# Re-use the best SVR settings found by the randomized search, but put a
# SelectFromModel step (random-forest importances, threshold 0.005) in front of
# the SVR so that only the most important attributes reach it.
select_pipeline = Pipeline(steps=[
    ("preprocessing" , preprocessing),
    ("attrselector" , SelectFromModel(
        estimator=RandomForestRegressor(random_state=42),
        threshold=0.005,
    )),
    ("svm_regressor" , SVR(
        C=rnd_search_svm.best_params_["svm_regressor__C"],
        gamma=rnd_search_svm.best_params_["svm_regressor__gamma"],
        kernel=rnd_search_svm.best_params_["svm_regressor__kernel"],
    )),
])

In [158]:
#evaluate using cross_val_score
from sklearn.model_selection import cross_val_score

# The scorer yields negated RMSE values; flipping the sign gives one positive
# RMSE per cross-validation fold.
selector_rmses = -cross_val_score(
    estimator=select_pipeline,
    X=housing,
    y=housing_labels,
    cv=3,
    scoring="neg_root_mean_squared_error",
)
print(selector_rmses)

[53160.70373894 53695.62656734 56047.92108411]


In [165]:
#exercise 4)
# Create a custom transformer that takes a regressor and the features it should be trained to predict
from sklearn.base import MetaEstimatorMixin, clone

class FeaturesFromRegressor(BaseEstimator , TransformerMixin , MetaEstimatorMixin):
    """Add a regressor's predictions of selected columns as extra features.

    ``fit`` trains a clone of ``regressor`` to predict the columns listed in
    ``features`` from all other columns of ``X``. ``transform`` returns ``X``
    as a DataFrame with one additional ``pred_<column>`` column per entry of
    ``features``.

    Parameters
    ----------
    regressor : estimator
        Regressor to use; it is cloned in ``fit`` and never fitted itself.
    features : list
        Labels of the columns of ``X`` that the regressor learns to predict
        (column names for a DataFrame, integer positions for a plain array).
    """
    def __init__(self , regressor , features):
        self.regressor = regressor
        self.features = features

    def _as_frame(self , X):
        """Return X unchanged when it has columns, otherwise wrap it in a DataFrame."""
        return X if hasattr(X , "columns") else pd.DataFrame(X)

    def _prediction_names(self):
        """Names of the columns that ``transform`` appends."""
        return [f"pred_{t}" for t in self.features]

    def fit(self , X , y=None):
        """Fit a clone of the regressor on the non-target columns; ``y`` is ignored."""
        X_df = self._as_frame(X)
        if hasattr(X , "columns"):
            self.feature_names_in_ = list(X.columns)
        elif hasattr(self , "feature_names_in_"):
            # Refit on a plain array: forget names kept from an earlier DataFrame fit.
            del self.feature_names_in_
        self.n_features_in_ = X_df.shape[1]

        # Use X_df (not X) so that array input, which has no .columns, also works.
        self.input_features_ = [c for c in X_df.columns if c not in self.features]
        self.regressor_ = clone(self.regressor)
        self.regressor_.fit(X_df[self.input_features_] , X_df[self.features])

        return self


    def transform(self , X):
        """Return X as a DataFrame with the predicted target columns appended."""
        X_df = self._as_frame(X)
        predicts = np.asarray(self.regressor_.predict(X_df[self.input_features_]))
        if predicts.ndim == 1:
            # reshape returns a new array, so the result has to be assigned.
            predicts = predicts.reshape(-1 , 1)
        preds_df = pd.DataFrame(predicts , columns=self._prediction_names() , index=X_df.index)
        return pd.concat([X_df , preds_df] , axis=1)

    def get_feature_names_out(self , input_features=None):
        """Return the input feature names followed by the ``pred_*`` names.

        When ``input_features`` is None, the names seen in ``fit`` are used;
        for array input these are the positional labels 0..n_features-1.
        """
        if input_features is None:
            input_features = getattr(self , "feature_names_in_" , range(self.n_features_in_))
        return list(input_features) + self._prediction_names()

In [166]:
#for exercise 4: try a KNN regressor inside our FeaturesFromRegressor transformer
from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(weights="distance" , n_neighbors=3)
knn_transf = FeaturesFromRegressor(regressor=knn , features=["median_income"])
# median_income is the column to predict; latitude and longitude are the inputs.
features = housing[["latitude" , "longitude" , "median_income"]]
knn_transf.fit_transform(features , housing_labels)


Unnamed: 0,latitude,longitude,median_income,pred_median_income
13096,37.80,-122.42,2.0987,3.985767
14973,34.14,-118.38,6.0876,6.899400
3785,38.36,-121.98,2.4330,2.900900
14689,33.75,-117.11,2.2618,2.261800
20507,33.77,-118.15,3.5292,3.475633
...,...,...,...,...
14207,33.86,-118.40,4.7105,4.939100
13105,36.32,-119.31,2.5733,3.301550
19301,32.59,-117.06,4.0616,4.061600
19121,34.06,-118.40,4.1455,4.145500
