In [None]:
# System
import os
# import yaml
import datetime
from itertools import combinations

# Warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Data Analysis
import re
import numpy as np
import pandas as pd

# Modeling
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler,LabelEncoder,RobustScaler,MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge,Lasso,ElasticNet,LinearRegression
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Neural Network
import tensorflow
tensorflow.random.set_seed(1)
from tensorflow.python.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.wrappers.scikit_learn import KerasRegressor

In [None]:
class Preprocessor(object):
    """Prepare the Kaggle train/test CSV data for model fitting.

    The constructor loads ``kaggle/train.csv`` and ``kaggle/test.csv`` from the
    current working directory and classifies the columns into feature groups.
    Each remaining method fills in a set of instance attributes that later
    methods read, so they are meant to be called in sequence:
    ``clean`` -> ``dummy`` / ``encode`` / ``standardize`` -> ``combine`` ->
    ``select`` -> ``split``.
    """
    
    def __init__(self,target,stage):
        try:
            assert stage in ['dev','test','prod']
        except:
            raise Exception('Unknown stage')
        self.stage = stage
        
        cwd = os.getcwd()
        kaggle = os.path.join(cwd,'kaggle')
        
        self.target = target
        self.train_df = pd.read_csv(os.path.join(kaggle,'train.csv'),header=0,index_col=0)
        self.test_df = pd.read_csv(os.path.join(kaggle,'test.csv'),header=0,index_col=0)
        
        # with open(os.path.join(os.getcwd(),'features.yml'),'r') as file:
        #     self.features_yml = yaml.load(file, Loader=yaml.FullLoader)
        self.features = None
        self._get_features()
        
        self.X_train = None
        self.y_train = None
        self.idx_train = None
        self.X_test = None
        self.y_test = None
        self.idx_test = None
        
        self.train_dummies = None
        self.test_dummies = None
        self.dummies = None
        
        self.train_encoded = None
        self.test_encoded = None
        
        self.scaler = None
        self.train_standardized = None
        self.test_standardized = None
        
        self.train_combined = None
        self.test_combined = None
        
        self.selector = None
        self.estimator= None
        self.selected_features = None
        self.train_selected = None
        self.test_selected = None
        
        self.X_fit = None
        self.y_fit = None
        self.X_score = None
        self.y_score = None
        self.X_pred = None
        self.y_pred = None
        
                    
    def _get_features(self,refresh=False):
        # features = {'all':[]}
        # for feature,info in self.features_yml.items():
        #     features['all'].append(feature)
        #     if not features.get(info['dtype'],None):
        #         features[info['dtype']] = []
        #     features[info['dtype']].append(feature)
        # self.features = features
        drop = [
            # 'Alley','LandContour','LandSlope',
            # 'Condition2','MSSubClass','HouseStyle',
            # 'YearRemodAdd','RoofStyle','RoofMatl',
            # 'Exterior1st','Exterior2nd','MasVnrType',
            # 'MasVnrArea','BsmtExposure','BsmtFinType1',
            # 'BsmtFinSF1','BsmtFinType2','BsmtFinSF2',
            # 'BsmtUnfSF','TotRmsAbvGrd','Fireplaces',
            # 'FireplaceQu','GarageType','GarageYrBlt',
        ]
        
        features = {
            'all':[],
            'target':['Target'],
            'cat':[],
            'num':[],
            'encoded':[],
            'other':[]
        }
        if refresh:
            df = self.X_train
        else:
            df = self.train_df
        
        for c in df.columns:
            if c not in drop:
                features['all'].append(c)

                if c==self.target:
                    None # features['target'].append(c)
                elif len(df[c].unique()) > 30:
                    features['num'].append(c)
                elif type(df[c].dropna().values[0])==str:
                    features['cat'].append(c)
                elif type(df[c].dropna().values[0])==np.int64:
                    features['encoded'].append(c)
                else:
                    features['other'].append(c)
        
        if len(features['other']) > 0:
            raise Exception('Uncategorized features')
        
        self.features = features
    
    def clean(self):
        df = self.train_df[self.features['all']].rename({self.target:'Target'},axis=1)
        df = df.dropna(axis=1,thresh=(df.shape[0]*0.6)).fillna('0')
        df.index = df.index.astype(str)
        
        for dtype,fields in self.features.items():
            for field in fields:
                if field not in df.columns:
                    fields.remove(field)
        
        self.y_train = df.Target
        self.X_train = df.drop(self.features['target'],axis=1)
        self.idx_train = df.index
                
        df = self.test_df[self.features['all']].fillna('0')
        df.index = df.index.astype(str)
        self.X_test = df
        self.idx_test = self.X_test.index
        
        self._get_features(refresh=True)
        
    def dummy(self):
        self.train_dummies = pd.get_dummies(self.X_train[self.features['cat']])
        self.test_dummies = pd.get_dummies(self.X_test[self.features['cat']])
        self.dummies = list(set(list(self.train_dummies.columns)+list(self.test_dummies.columns)))
        
        for column in self.dummies:
            if column not in self.train_dummies:
                self.train_dummies[column] = 0
            if column not in self.test_dummies:
                self.test_dummies[column] = 0
        
        self.train_dummies = self.train_dummies[self.dummies]
        self.test_dummies = self.test_dummies[self.dummies]
    
    def encode(self):
        """Label-encode the categorical features of train and test.

        For every column in ``features['cat']`` a ``LabelEncoder`` is fit on
        the training values. Training values are transformed with it; test
        values are mapped through the fitted classes, and any test label that
        did not occur in training is encoded as ``-1`` (calling
        ``LabelEncoder.transform`` on such labels raises ``ValueError``).

        Results are stored in ``train_encoded`` / ``test_encoded``, indexed by
        ``idx_train`` / ``idx_test``.
        """
        train = pd.DataFrame(index=self.idx_train)
        test = pd.DataFrame(index=self.idx_test)
        for c in self.features['cat']:
            train_values = self.X_train[c].values
            encoder = LabelEncoder()
            encoder.fit(train_values)
            train[c] = encoder.transform(train_values)

            # classes_ is ordered so that position == code assigned by transform.
            codes = {label: code for code, label in enumerate(encoder.classes_)}
            test[c] = (
                self.X_test[c].map(codes).fillna(-1).astype(np.int64).to_numpy()
            )
        self.train_encoded = train
        self.test_encoded = test
    
    def standardize(self,scaler):
        """Scale the ``'num'`` and ``'encoded'`` features of train and test.

        The scaler is fit on ``X_train`` only and then applied to both
        ``X_train`` and ``X_test``. (A second ``fit`` on ``X_test`` used to
        follow the first one; ``fit`` discards any previous fit, so the
        training data ended up scaled with test-set statistics.)

        Args:
            scaler: ``'standard'`` (``StandardScaler``), ``'robust'``
                (``RobustScaler``) or ``'minmax'`` (``MinMaxScaler``).

        Raises:
            ValueError: ``'Unknown scaler option'`` for any other value.
                ``ValueError`` subclasses ``Exception`` so existing
                ``except Exception`` handlers still catch it.
        """
        scalers = {
            'standard': StandardScaler,
            'robust': RobustScaler,
            'minmax': MinMaxScaler,
        }
        # The isinstance guard keeps unhashable arguments on the same
        # "unknown option" path instead of a TypeError from the dict lookup.
        scaler_cls = scalers.get(scaler) if isinstance(scaler, str) else None
        if scaler_cls is None:
            raise ValueError('Unknown scaler option')
        self.scaler = scaler_cls()

        columns = self.features['num'] + self.features['encoded']
        self.scaler.fit(self.X_train[columns])

        self.train_standardized = pd.DataFrame(
            self.scaler.transform(self.X_train[columns]),
            columns=columns,
            index=self.idx_train,
        )
        self.test_standardized = pd.DataFrame(
            self.scaler.transform(self.X_test[columns]),
            columns=columns,
            index=self.idx_test,
        )
    
    def combine(self):
        self.train_combined = self.train_dummies.join(self.train_standardized,how='inner')
        self.test_combined = self.test_dummies.join(self.test_standardized,how='inner')
    
    def select(self,num_columns=120):
        """Select features by recursive feature elimination.

        An ``RFE`` selector wrapping a linear-kernel ``SVR`` is fit on
        ``train_combined`` against ``y_train``, removing one feature per step
        until ``num_columns`` remain. The selected column names are stored in
        ``selected_features`` and used to subset both combined frames.

        Args:
            num_columns: Number of features to keep.
        """
        svr = SVR(kernel='linear')
        rfe = RFE(svr, n_features_to_select=num_columns, step=1)
        self.estimator = svr
        self.selector = rfe

        rfe.fit(self.train_combined, self.y_train)
        chosen = rfe.get_feature_names_out(rfe.feature_names_in_)
        self.selected_features = chosen

        self.train_selected = self.train_combined[chosen]
        self.test_selected = self.test_combined[chosen]
        
    def split(self):
        """Split the selected training data into fit and score partitions.

        ``train_selected`` / ``y_train`` are split 80/20 with
        ``random_state=42`` into ``X_fit``, ``X_score``, ``y_fit`` and
        ``y_score``. ``X_pred`` is set to ``test_selected``.
        """
        partitions = train_test_split(
            self.train_selected,
            self.y_train,
            test_size=0.2,
            random_state=42,
        )
        self.X_fit, self.X_score, self.y_fit, self.y_score = partitions
        self.X_pred = self.test_selected
    
    def audit(self):
        None