In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

class TrainTestSplitPipeline ():
    """
    Split data into training and testing sets. If stratification is
    requested, a stratified shuffle split is performed based on the
    selected column; otherwise a plain random split is used.

    If a column is selected as the stratification category and ``bins``
    is provided, the column is first cut into categories and the split
    is then stratified on those categories.

    Parameters
    ----------
    test_size: float, int, default=0.2
        If float, should be between 0.0 and 1.0, represent the percentage
        of groups to include the test split (rounded up).
        If int, should represent the absolute number of test groups.
    label: str, default=None
        Name of the label column. When given, the column is removed from
        the feature sets and returned separately; when None, the label
        part of each returned pair is None.

    Examples
    --------
    >>> split = TrainTestSplitPipeline(label='some_label')
    >>> split.fit(housing)
    >>> split.stratify(column='some_column', bins = [0, 1, 2, np.inf])
    >>> split.split()
    >>> train_features, train_labels = split.get_train_set()
    >>> test_features, test_labels = split.get_test_set()
    """
    def __init__(self, test_size:float=0.2, label:str=None):
        self.test_size = test_size
        self.label = label

        # For practice purpose, below parameters set to fixed value
        self.random_state = 42
        self.n_splits = 1

        # Populated by fit() / stratify(). Initialised here so that split()
        # can fall back to a plain random split when stratify() was never
        # called, instead of failing with an AttributeError.
        self.data = None
        self.stratify_cat = None

    def fit(self, X):
        """Store the DataFrame ``X`` that will later be split."""
        self.data = X

    def stratify(self, column=None, bins=None ):
        """
        Set stratify categories when perform the splitting

        Parameters
        ----------
        column: string
            Must be one of the column in the fitted data. Passing None
            clears any previously configured stratification.
        bins: array
            If provided, will try to perform a pd.cut() to the
            selected column, and use the cutted value as the
            stratify category. Label of 1 based index will be
            temporarily assigned to the category.

        Raises
        ------
        ValueError
            If a column is given before any data has been fitted.
        """
        if column is None:
            self.stratify_cat = None
            return

        if self.data is None:
            raise ValueError("No data fitted: call fit() before stratify().")

        if bins is not None:
            self.stratify_cat = pd.cut(
                self.data[column],
                bins=bins,
                # one label per interval, numbered from 1
                labels=list(range(1, len(bins)))
                )
        else:
            self.stratify_cat = self.data[column]

    def split(self):
        """
        Perform the split and store the resulting feature/label sets.

        Uses a stratified shuffle split when stratify() configured a
        category, otherwise a plain random split. Both honour
        ``test_size`` and ``random_state``.

        Raises
        ------
        ValueError
            If no data has been fitted.
        """
        if self.data is None:
            raise ValueError("No data to split: call fit() before split().")

        if self.stratify_cat is not None:
            splitter = StratifiedShuffleSplit(
                n_splits=self.n_splits,
                test_size=self.test_size,
                random_state=self.random_state
                )

            # n_splits is fixed to 1 in __init__; should it ever be raised,
            # the last generated split is the one that is kept.
            for train_idx, test_idx in splitter.split(self.data, self.stratify_cat):
                # The splitter yields positional indices, so select with
                # iloc; loc would look them up as index labels.
                train_set = self.data.iloc[train_idx]
                test_set = self.data.iloc[test_idx]
        else:
            train_set, test_set = train_test_split(
                self.data,
                test_size=self.test_size,
                random_state=self.random_state
                )

        if self.label is not None:
            self.train_feature = train_set.drop(self.label, axis=1)
            self.train_label = train_set[self.label].copy()

            self.test_feature = test_set.drop(self.label, axis=1)
            self.test_label = test_set[self.label].copy()
        else:
            self.train_feature = train_set.copy()
            self.test_feature = test_set.copy()
            self.train_label = None
            self.test_label = None

    def get_train_set(self):
        """Return ``(train_features, train_labels)`` produced by split()."""
        return self.train_feature, self.train_label

    def get_test_set(self):
        """Return ``(test_features, test_labels)`` produced by split()."""
        return self.test_feature, self.test_label

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin 
from sklearn.preprocessing import OneHotEncoder

class OneHotEncodingPipeline(BaseEstimator, TransformerMixin):
    """
    One-hot encode selected DataFrame columns.

    One ``OneHotEncoder`` is fitted per column in ``fit``. ``transform``
    appends one indicator column per category learned during ``fit``
    (named after the category value) and drops the original columns.

    Parameters
    ----------
    columns: list of str
        Names of the columns to encode.
    """
    def __init__(self, columns):
        self.columns = columns
        self.encoders = []

    def fit(self, X, y=None):
        """Fit one encoder per configured column on ``X``."""
        # Start from a clean list so that fitting again does not leave
        # encoders from a previous fit at the front of the list.
        self.encoders = []
        for col in self.columns:
            # Categories unseen during fit are encoded as all zeros at
            # transform time rather than raising.
            encoder = OneHotEncoder(handle_unknown="ignore")
            encoder.fit(X[[col]])
            self.encoders.append(encoder)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns one-hot encoded."""
        x = X.copy()
        for idx, col in enumerate(self.columns):
            encoder = self.encoders[idx]
            # Use the encoder fitted in fit(); re-fitting here would learn
            # categories from the data being transformed and could yield
            # different output columns for train and test data.
            x_onehot = encoder.transform(X[[col]])

            x[list(encoder.categories_[0])] = x_onehot.toarray()

        return x.drop(self.columns, axis=1)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin 

class FeatureSelectionPipeline(BaseEstimator, TransformerMixin):
    """
    Derive new features with a user-supplied function, then drop columns.

    Parameters
    ----------
    combination_function: callable
        Called with a copy of the input DataFrame; must return the
        DataFrame to continue with.
    drop_columns: list of str, default=None
        Columns removed from the result of ``combination_function``.
        When None, nothing is dropped.
    """
    def __init__(self, combination_function, drop_columns=None):
        # Stored under the same name as the init parameter so that
        # BaseEstimator.get_params() (used by clone() and repr()) can
        # find it.
        self.combination_function = combination_function
        self.drop_columns = drop_columns

    @property
    def comb_func(self):
        """Alias of ``combination_function`` kept for existing callers."""
        return self.combination_function

    @comb_func.setter
    def comb_func(self, func):
        self.combination_function = func

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X:pd.DataFrame):
        """Apply the combination function to a copy of ``X`` and drop columns."""
        x = self.combination_function(X.copy())

        if self.drop_columns is not None:
            return x.drop(self.drop_columns, axis=1)
        return x

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin 

class ImputePipeline(BaseEstimator, TransformerMixin):
    """
    Impute missing values in a DataFrame and return a DataFrame.

    Parameters
    ----------
    strategy: str
        Strategy passed to ``SimpleImputer``.
    columns: list of str, default=None
        Columns to impute. When None, every column of the input is
        imputed.
    """
    def __init__(self, strategy, columns=None):
        # Stored under the init parameter name so that get_params(),
        # set_params() and clone() work.
        self.strategy = strategy
        self.columns = columns
        self.imputer = SimpleImputer(strategy=strategy)

    def fit(self, X, y=None):
        """Fit the imputer on the configured columns (or on all of ``X``)."""
        # Keep the wrapped imputer in sync in case ``strategy`` was changed
        # through set_params() after construction.
        self.imputer.set_params(strategy=self.strategy)

        fit_data = X if self.columns is None else X[self.columns]
        self.imputer.fit(fit_data)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values filled in."""
        if self.columns is not None:
            df = X.copy()
            df[self.columns] = self.imputer.transform(X[self.columns])
            return df

        # Whole-frame imputation: rebuild a DataFrame around the imputed
        # array, keeping the original column names and index.
        imputed = self.imputer.transform(X)
        return pd.DataFrame(imputed, columns=X.columns, index=X.index)

In [None]:
# Relative directory that is searched for housing.csv by default.
HOUSING_PATH = os.path.join('datasets', 'housing')

In [None]:
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` into a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [None]:
# Load the raw housing data from the default HOUSING_PATH directory.
housing = load_housing_data()

In [None]:
# Split the data with 'median_house_value' separated out as the label,
# stratifying on 'median_income' cut into five income bands.
split = TrainTestSplitPipeline(label='median_house_value')
split.fit(housing)
split.stratify(column='median_income', bins = [0, 1.5, 3.0, 4.5, 6, np.inf])
split.split()

# Each getter returns a (features, labels) pair.
train_features, train_labels = split.get_train_set()
test_features, test_labels = split.get_test_set()

In [None]:
# Correlation of each training feature with the label.
# NOTE(review): train_features still contains 'ocean_proximity' here; if that
# column is non-numeric, recent pandas versions may need numeric_only=True — confirm.
train_features.corrwith(train_labels)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


def CombineAttributesFunc(data):
    """
    Return a copy of ``data`` with two derived ratio columns added.

    Adds ``rooms_per_household`` (``total_rooms / households``) and
    ``bedrooms_per_room`` (``total_bedrooms / total_rooms``). The input
    DataFrame is left unmodified.
    """
    # Work on a copy so the caller's DataFrame does not silently gain columns.
    combined = data.copy()
    combined['rooms_per_household'] = combined['total_rooms']/combined['households']
    combined['bedrooms_per_room'] = combined['total_bedrooms']/combined['total_rooms']
    return combined

# Columns removed by the feature selector after the combined attributes have been added.
drop_columns = ['households','total_bedrooms','population','longitude','housing_median_age']
# Only the derived 'bedrooms_per_room' ratio is imputed.
impute_columns = ['bedrooms_per_room']

# Steps run in order: one-hot encode 'ocean_proximity', add the combined
# attributes and drop `drop_columns`, median-impute `impute_columns`, then
# standardise the result.
feat_processing_pipeline = Pipeline([
    ('encoder', OneHotEncodingPipeline(columns=['ocean_proximity'])),
    ('feature_selector', FeatureSelectionPipeline(combination_function=CombineAttributesFunc, drop_columns=drop_columns)),
    ('imputer',ImputePipeline(strategy='median', columns=impute_columns)),
    ('std_scaler', StandardScaler())
])

Question:

- Suppose a new feature that combines two existing features (e.g. `bedrooms_per_room`) has a strong correlation with the label, but one of the source features (`total_rooms`) contains missing values. **Should I impute the missing values first, or should I compute the new feature first (so that both the new feature and the source feature are NA at the start) and then impute both of them?**


In [None]:
# Fit every pipeline step on the training features and transform them.
train_features_prepared = feat_processing_pipeline.fit_transform(train_features)

In [None]:
# Transform the test features with the pipeline fitted on the training features.
test_features_prepared = feat_processing_pipeline.transform(test_features)

In [None]:
# from sklearn.model_selection import cross_val_score
# from sklearn.svm import SVR

# svr = SVR(kernel='linear')

# scores = cross_val_score(svr, train_features_prepared, train_labels, cv=2)

In [None]:
# from sklearn.ensemble import RandomForestRegressor
# forest_reg = RandomForestRegressor()

# forest_reg_scores = cross_val_score(forest_reg, train_features_prepared, train_labels, cv=3)

In [None]:
# forest_reg_scores.mean()

In [None]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.svm import SVR

# sv_reg = SVR(kernel='linear')

# param_grid = [
#     {'kernel':['linear']},
#     {'kernel':['rbf'], "C":[1.0, 2.0], 'gamma':['scale', 'auto']}
# ]

# grid_search = GridSearchCV(sv_reg, param_grid, cv=3, scoring='neg_mean_squared_error', return_train_score=True)

In [None]:
# final_model = grid_search.best_estimator_

# final_predictions = final_model.predict(test_features_prepared)

In [None]:
# from sklearn.metrics import mean_squared_error

# final_mse = mean_squared_error(test_labels, final_predictions)