# Data Analysis Libraries

In [19]:
%matplotlib inline 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

# Models

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn import neighbors
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import ElasticNet, Ridge, Lasso, LinearRegression

# Model Selection

In [21]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error
from sklearn import model_selection
from sklearn.feature_selection import SelectKBest, f_regression, RFE, SelectFromModel

# Preprocessing

In [22]:
from sklearn import preprocessing

# Utilities

In [23]:
from functools import reduce
from ipywidgets import interact, interact_manual

import os
import json

# PLOTTING CONFIGS

In [24]:
SMALL_SIZE = 12
MEDIUM_SIZE = 20
BIGGER_SIZE = 24

# Apply every font-size default in a single rcParams update.
plt.rcParams.update({
    'font.size': SMALL_SIZE,          # default text size
    'axes.titlesize': SMALL_SIZE,     # axes title
    'axes.labelsize': MEDIUM_SIZE,    # x and y axis labels
    'xtick.labelsize': SMALL_SIZE,    # x tick labels
    'ytick.labelsize': SMALL_SIZE,    # y tick labels
    'legend.fontsize': SMALL_SIZE,    # legend entries
    'figure.titlesize': BIGGER_SIZE,  # figure-level title
})

In [25]:
EXPERIMENT_VERSION = 'exp_v1'  # selects '<version>.json' in the configuration cell and prefixes result file names

# Load Configuration

In [26]:
CONFIGURATIONS = './configurations/'  # directory that holds the experiment configuration files
config_file_name = EXPERIMENT_VERSION + '.json'

# Pass directory and file name as separate components so the path is still
# valid if CONFIGURATIONS is ever written without its trailing slash.
config_file_name_path = os.path.join(CONFIGURATIONS, config_file_name)

# Read with an explicit encoding so the result does not depend on the
# platform's default locale.
with open(config_file_name_path, 'r', encoding='utf-8') as f:
    configuration = json.load(f)

In [27]:
configuration  # display the loaded experiment configuration for inspection

{'metadata': {'date': 'transactiondate', 'target': 'logerror', 'kfold': 10},
 'features': {'heatingorsystemtypeid': {'missing': 'Mean',
   'category_encoding': None,
   'feature_standarization': 'Standard'},
  'regionidcity': {'missing': 'Mean',
   'category_encoding': None,
   'feature_standarization': 'MinMax'},
  'yearbuilt': {'missing': 'Mean',
   'category_encoding': None,
   'feature_standarization': 'MinMax'},
  'lotsizesquarefeet': {'missing': 'Mean',
   'category_encoding': None,
   'feature_standarization': 'Standard'},
  'taxvaluedollarcnt': {'missing': 'Mean',
   'category_encoding': None,
   'feature_standarization': 'MinMax'},
  'rawcensustractandblock': {'missing': 'Mean',
   'category_encoding': None,
   'feature_standarization': 'Standard'},
  'buildingqualitytypeid': {'missing': 'Mean',
   'category_encoding': None,
   'feature_standarization': 'Standard'}},
 'feature_selection': {'operation': 'RFE', 'num_feats': 7},
 'features_to_analyze': ['transactiondate', 'logerr

# Read DATA

In [None]:
DATA_DIR = './data'  # input data root, relative to the working directory
TRAIN_DATA = os.path.join(DATA_DIR, "base_data/train.csv")
date_feature = configuration['metadata']['date']  # name of the date column, taken from the configuration
train_df = pd.read_csv(TRAIN_DATA, parse_dates= [date_feature])  # the date column is parsed at load time

# Results

In [13]:
RESULT_DIR = './results/'  # directory where evaluate_model() writes one CSV per regressor and fold

In [14]:
train_df.head(10)  # preview the first ten rows of the loaded training data

Unnamed: 0,heatingorsystemtypeid,buildingqualitytypeid,propertyzoningdesc,unitcnt,lotsizesquarefeet,finishedsquarefeet12,regionidcity,calculatedbathnbr,fullbathcnt,yearbuilt,...,fips,roomcnt,regionidcounty,rawcensustractandblock,propertylandusetypeid,propertycountylandusecode,longitude,transactiondate,logerror,parcelid
0,,,,,4506.0,3100.0,53571.0,3.5,3.0,1998.0,...,6059.0,0.0,1286.0,60590630.0,261.0,122,-117869207.0,2017-01-01,0.025595,14297519
1,,,,,12647.0,1465.0,13091.0,1.0,1.0,1967.0,...,6111.0,5.0,2061.0,61110010.0,261.0,1110,-119281531.0,2017-01-01,0.055619,17052889
2,,,,,8432.0,1243.0,21412.0,2.0,2.0,1962.0,...,6059.0,6.0,1286.0,60590220.0,261.0,122,-117823170.0,2017-01-01,0.005383,14186244
3,2.0,8.0,LCR110000*,1.0,13038.0,2376.0,396551.0,3.0,3.0,1970.0,...,6037.0,0.0,3101.0,60373000.0,261.0,0101,-118240722.0,2017-01-01,-0.10341,12177905
4,2.0,8.0,LAR3,1.0,278581.0,1312.0,12447.0,3.0,3.0,1964.0,...,6037.0,0.0,3101.0,60371240.0,266.0,010C,-118414640.0,2017-01-01,0.00694,10887214
5,,,,,903.0,1492.0,51239.0,2.0,2.0,1982.0,...,6111.0,6.0,2061.0,61110050.0,266.0,1129,-118993991.0,2017-01-01,-0.020526,17143294
6,2.0,9.0,PSR2,1.0,63000.0,2962.0,47019.0,3.0,3.0,1950.0,...,6037.0,0.0,3101.0,60374610.0,261.0,0101,-118179824.0,2017-01-01,-0.001011,12095076
7,,5.0,GLR4YY,1.0,4214.0,738.0,45457.0,1.0,1.0,1922.0,...,6037.0,0.0,3101.0,60373020.0,261.0,0100,-118239357.0,2017-01-01,0.101723,12069064
8,2.0,9.0,WHRE20000*,1.0,20028.0,3039.0,14634.0,3.0,3.0,1970.0,...,6037.0,0.0,3101.0,60375000.0,261.0,0100,-118006914.0,2017-01-02,-0.040966,12790562
9,2.0,8.0,LAR3,1.0,54048.0,1290.0,12447.0,3.0,3.0,1980.0,...,6037.0,0.0,3101.0,60372750.0,266.0,010C,-118416000.0,2017-01-02,-0.036763,11542646


# Preprocess Data

## Missing Values

In [30]:
def get_imputation_value(train, i, operation):
    """Return the value used to fill missing entries of column ``i``.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame the statistic is computed from.
    i : hashable
        Label of the column in ``train``.
    operation : str
        ``'Mean'`` for the column mean, ``'Mode'`` for its most frequent value
        (the first one when several values tie).

    Returns
    -------
    scalar
        The imputation value. NaN is returned when the column has no
        non-missing values, for both operations.

    Raises
    ------
    ValueError
        If ``operation`` is not one of the supported names.
    """
    column = train[i]
    if operation == 'Mean':
        return column.mean()
    if operation == 'Mode':
        modes = column.mode()
        # mode() is empty for an all-missing column; mirror what mean() does
        # in that case instead of failing on the positional lookup.
        return modes.iloc[0] if len(modes) else float('nan')
    # Previously an unknown operation fell through and returned None, which
    # only failed later inside fillna() with an unrelated message.
    raise ValueError(
        "Unsupported imputation operation %r; expected 'Mean' or 'Mode'" % (operation,)
    )
    

In [31]:
def fill_missing_values(train, val):
    """Impute missing values in every configured feature column.

    For each feature in ``configuration['features']`` the imputation value is
    computed from ``train`` only (per the feature's ``'missing'`` operation)
    and then used to fill both ``train`` and ``val``, so no validation
    statistics are used.

    Parameters
    ----------
    train, val : pandas.DataFrame
        Training and validation frames; their feature columns are replaced
        with the filled versions.

    Returns
    -------
    tuple
        ``(train, val)`` after imputation.
    """
    list_of_features = configuration['features']

    for feature, operations in list_of_features.items():
        operation = operations['missing']
        if not operation:
            # No imputation configured for this feature; same convention as
            # the other per-feature steps of the pipeline.
            continue
        imputing_value = get_imputation_value(train, feature, operation)
        # Assign the filled column back explicitly: fillna(inplace=True) on a
        # selected column is chained assignment and is not guaranteed to
        # update the parent frame.
        train[feature] = train[feature].fillna(imputing_value)
        val[feature] = val[feature].fillna(imputing_value)

    return train, val

## Feature Standardization

In [32]:
scaler_objects = {}  # fitted scalers from the training pass, keyed by str(series.name)


def data_standarization_train(series, feature, operation):
    """Fit a scaler on one training series, cache it, and return the scaled values.

    The fitted scaler is stored in the module-level ``scaler_objects`` under
    ``str(series.name)`` so that ``data_standarization_val`` can look it up.
    Returns a 1-D array of the scaled values.
    """
    train_frame = pd.DataFrame([series]).T
    train_frame.columns = [feature]

    fitted_scaler = get_scaler_object(train_frame, operation)
    scaler_objects[str(series.name)] = fitted_scaler

    scaled_frame = pd.DataFrame(fitted_scaler.transform(train_frame), columns=[feature])
    return scaled_frame[feature].values

In [33]:
def data_standarization_val(series, feature, operation):
    """Scale one validation series with the scaler fitted on the training data.

    Parameters
    ----------
    series : pandas.Series
        Validation values; ``str(series.name)`` selects the cached scaler.
    feature : str
        Column name used for the intermediate frame.
    operation : str
        Unused here; kept so the signature mirrors
        ``data_standarization_train``.

    Returns
    -------
    numpy.ndarray
        1-D array of scaled values.

    Raises
    ------
    KeyError
        If no scaler was cached under ``str(series.name)``.
    """
    df_val = pd.DataFrame([series]).T
    df_val.columns = [feature]
    scaler_object = scaler_objects[str(series.name)]

    # transform(), not fit_transform(): refitting on validation data would
    # discard the training statistics and overwrite the cached scaler.
    df_val = scaler_object.transform(df_val)
    df_val = pd.DataFrame(df_val, columns=[feature])
    return df_val[feature].values

In [34]:
def get_scaler_object(df, operation):
    """Create the scaler named by ``operation`` and fit it on ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data the scaler is fitted on.
    operation : str
        One of ``'MinMax'``, ``'Standard'``, ``'BoxCox'`` or
        ``'QuantileTransformer'``.

    Returns
    -------
    The fitted scaler object.

    Raises
    ------
    ValueError
        If ``operation`` is not a supported name (previously this surfaced as
        an UnboundLocalError on ``scaler``).
    """
    if operation == 'MinMax':
        scaler = preprocessing.MinMaxScaler()
    elif operation == 'Standard':
        scaler = preprocessing.StandardScaler()
    elif operation == 'BoxCox':
        # NOTE(review): PowerTransformer() with default arguments uses the
        # Yeo-Johnson method, not Box-Cox; left unchanged to keep results
        # stable. Confirm which transform is intended.
        scaler = preprocessing.PowerTransformer()
    elif operation == 'QuantileTransformer':
        scaler = preprocessing.QuantileTransformer()
    else:
        raise ValueError(
            "Unsupported standardization operation %r; expected 'MinMax', "
            "'Standard', 'BoxCox' or 'QuantileTransformer'" % (operation,)
        )

    return scaler.fit(df)

In [35]:
def numerical_transform(train, val, feature, operation):
    """Scale one feature column of ``train`` and ``val`` with a train-fitted scaler.

    The scaler is fitted on the training column only and then applied to both
    columns. Returns ``(train_values, val_values)`` as 1-D arrays.
    """
    train_frame = pd.DataFrame(train[feature])
    train_frame.columns = [feature]

    val_frame = pd.DataFrame(val[feature])
    val_frame.columns = [feature]

    fitted_scaler = get_scaler_object(train_frame, operation)

    def _scaled_values(frame):
        # Apply the already-fitted scaler and flatten the single column.
        scaled = pd.DataFrame(fitted_scaler.transform(frame), columns=[feature])
        return scaled[feature].values

    train_values = _scaled_values(train_frame)
    val_values = _scaled_values(val_frame)
    return train_values, val_values

In [36]:
def transform_by_group_feature(train, val,feature, operation, groupby_feature):
    """Scale ``feature`` separately within each ``groupby_feature`` group.

    Training groups go through ``data_standarization_train`` and validation
    groups through ``data_standarization_val``. The ``feature`` column of both
    frames is overwritten, and the two updated columns are returned.
    """
    def _scale_train_group(group):
        return data_standarization_train(group, feature, operation)

    def _scale_val_group(group):
        return data_standarization_val(group, feature, operation)

    train_groups = train.groupby(groupby_feature)[feature]
    train[feature] = train_groups.transform(_scale_train_group)

    val_groups = val.groupby(groupby_feature)[feature]
    val[feature] = val_groups.transform(_scale_val_group)

    return train[feature], val[feature]

In [37]:
def feature_standarization(train, val):
    """Scale every configured feature that has a standardization operation.

    For each feature in ``configuration['features']`` the
    ``'feature_standarization'`` entry names the scaler to use; features with
    an empty entry are left untouched. Scaling is delegated to
    ``numerical_transform``.

    Parameters
    ----------
    train, val : pandas.DataFrame
        Frames whose feature columns are overwritten with the scaled values.

    Returns
    -------
    tuple
        ``(train, val)`` after scaling.
    """
    # The unused lookup of the target name was dropped; this step only needs
    # the per-feature settings.
    for feature, operations in configuration['features'].items():
        operation = operations['feature_standarization']
        if not operation:
            continue
        train[feature], val[feature] = numerical_transform(train, val, feature, operation)
    return train, val

## Category Transformation

In [38]:
def transform(train, val, feature, operation):
    """Encode one categorical column of ``train`` and ``val``.

    The encoder is obtained from ``get_categorical_encoder`` using the
    training column, then applied to both columns.

    Parameters
    ----------
    train, val : pandas.DataFrame
        Frames containing ``feature``.
    feature : str
        Name of the column to encode.
    operation : str
        Encoding name from the configuration, forwarded to
        ``get_categorical_encoder``.

    Returns
    -------
    tuple
        ``(train_values, val_values)`` as 1-D arrays. If encoding the
        validation column fails, the error is printed and the validation
        values are returned unencoded.
    """
    df_train = pd.DataFrame(train[feature])
    df_train.columns = [feature]

    df_val = pd.DataFrame(val[feature])
    df_val.columns = [feature]

    # Forward the configured operation; the previous code passed the builtin
    # `type` here by mistake.
    categorical_encoder = get_categorical_encoder(df_train, operation)

    df_train = categorical_encoder.transform(df_train)
    df_train = pd.DataFrame(df_train, columns=[feature])

    try:
        df_val = categorical_encoder.transform(df_val)
        df_val = pd.DataFrame(df_val, columns=[feature])
    except Exception as e:
        # Best effort: report the failure and fall through with the raw
        # validation values still held in df_val.
        print('e', e)

    return df_train[feature].values, df_val[feature].values

In [39]:
def get_categorical_encoder(df_train, operation ):
    """Return a ``LabelEncoder`` fitted on ``df_train``.

    ``operation`` is accepted but not consulted: a label encoder is always
    used.
    """
    return preprocessing.LabelEncoder().fit(df_train)

In [40]:
def category_feature_standarization(train, val):
    """Encode every configured feature that has a category encoding.

    For each feature in ``configuration['features']`` the
    ``'category_encoding'`` entry names the encoding; features with an empty
    entry are skipped. Encoding is delegated to ``transform``.

    Parameters
    ----------
    train, val : pandas.DataFrame
        Frames whose feature columns are overwritten with the encoded values.

    Returns
    -------
    tuple
        ``(train, val)`` after encoding.
    """
    # The unused lookup of the target name was dropped; this step only needs
    # the per-feature settings.
    for feature, operations in configuration['features'].items():
        operation = operations['category_encoding']
        if not operation:
            continue
        train[feature], val[feature] = transform(train, val, feature, operation)

    return train, val

# Feature Selection

In [41]:
def feature_selection(train, val):
    """Keep only the features chosen by the configured selection method.

    The method and the number of features come from
    ``configuration['feature_selection']`` (``'operation'`` and
    ``'num_feats'``). Selection is fitted on ``train`` with the target and
    date columns excluded; both columns are added back to the result.

    Parameters
    ----------
    train, val : pandas.DataFrame
        Preprocessed training and validation frames.

    Returns
    -------
    tuple
        ``(train, val)`` restricted to the selected features plus the target
        and date columns.

    Raises
    ------
    ValueError
        If the configured operation is not ``'corr'``, ``'k_best'``,
        ``'RFE'`` or ``'select_from_model'`` (previously this surfaced as an
        unbound ``important_features``).
    """
    target = configuration['metadata']['target']
    operation = configuration['feature_selection']['operation']
    num_feats = configuration['feature_selection']['num_feats']
    drop_features = [target, configuration['metadata']['date']]

    X = train.drop(drop_features, axis=1)
    y = train[target]

    print("Number of features Inputed", len(X.columns.to_list()))
    print("Features-", X.columns)

    if operation == 'corr':
        important_features, _ = select_from_corr(X, y, num_feats)
    elif operation == 'k_best':
        important_features, _ = select_kBest(X, y, num_feats)
    elif operation == 'RFE':
        important_features, _ = select_from_rfe(X, y, num_feats)
    elif operation == 'select_from_model':
        # select_from_model requires an estimator; the call used to omit it
        # and failed with a TypeError. A random forest is used here to match
        # the estimator select_from_rfe relies on.
        important_features, _ = select_from_model(X, y, RandomForestRegressor(), num_feats)
    else:
        raise ValueError(
            "Unsupported feature selection operation %r; expected 'corr', "
            "'k_best', 'RFE' or 'select_from_model'" % (operation,)
        )

    print('number of feature selected', len(important_features))
    display('feature_selected', important_features)
    important_features = important_features + drop_features

    return train[important_features], val[important_features]

In [42]:
def select_from_corr(X, y, num_feats):
    """Pick the ``num_feats`` columns of ``X`` most correlated with ``y``.

    Correlation strength is the absolute Pearson coefficient; an undefined
    coefficient (NaN) counts as 0. Returns ``(names, mask)``: the selected
    column names ordered from weakest to strongest, and a boolean list
    aligned with ``X.columns``.
    """
    columns = X.columns.tolist()

    correlations = []
    for column in columns:
        coefficient = np.corrcoef(X[column], y)[0, 1]
        correlations.append(0 if np.isnan(coefficient) else coefficient)

    # argsort is ascending, so the strongest correlations sit at the end.
    strongest_positions = np.argsort(np.abs(correlations))[-num_feats:]
    cor_feature = X.iloc[:, strongest_positions].columns.tolist()
    cor_support = [column in cor_feature for column in columns]
    return cor_feature, cor_support

def select_kBest(X,y, num_feats):
    """Select ``num_feats`` columns of ``X`` with ``SelectKBest`` and ``f_regression``.

    Returns ``(names, mask)``: the selected column names and the selector's
    boolean support mask.
    """
    selector = SelectKBest(f_regression, k=num_feats)
    selector.fit(X, y)
    support_mask = selector.get_support()
    selected_names = X.loc[:, support_mask].columns.tolist()
    return selected_names, support_mask


def select_from_rfe(X,y, num_feats):
    """Select ``num_feats`` columns of ``X`` by recursive feature elimination.

    RFE wraps a default ``RandomForestRegressor`` and is configured with
    ``step=10`` and ``verbose=5``. Returns ``(names, mask)``: the selected
    column names and the selector's boolean support mask.
    """
    eliminator = RFE(
        estimator=RandomForestRegressor(),
        n_features_to_select=num_feats,
        step=10,
        verbose=5,
    )
    eliminator.fit(X, y)
    support_mask = eliminator.get_support()
    selected_names = X.loc[:, support_mask].columns.tolist()
    return selected_names, support_mask
   

def select_from_model(X,y, model, num_feats):
    """Select columns of ``X`` with ``SelectFromModel`` built around ``model``.

    ``SelectFromModel`` is imported with the other feature-selection helpers
    at the top of the notebook; it was previously missing, so this function
    raised a NameError.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features.
    y : array-like
        Target values.
    model : estimator
        Estimator handed to ``SelectFromModel``.
    num_feats : int
        Passed as ``max_features``.

    Returns
    -------
    tuple
        ``(names, mask)``: the selected column names and the selector's
        boolean support mask.
    """
    # NOTE(review): max_features is an upper bound; SelectFromModel's default
    # threshold still applies, so fewer than num_feats columns may come back.
    # Confirm whether threshold=-np.inf is wanted.
    embeded_lr_selector = SelectFromModel(model, max_features=num_feats)
    embeded_lr_selector.fit(X, y)

    embeded_lr_support = embeded_lr_selector.get_support()
    embeded_lr_feature = X.loc[:, embeded_lr_support].columns.tolist()
    return embeded_lr_feature, embeded_lr_support

## Feature Engineering

## Time Feature Extraction

In [43]:
def _week_of_year(dates):
    """Return the ISO week number of each datetime in ``dates``."""
    week = dates.dt.isocalendar().week
    # isocalendar() yields a nullable unsigned dtype; convert to the plain
    # dtypes the removed ``.dt.week`` accessor produced (int64, or float64
    # with NaN when dates are missing).
    if week.isna().any():
        return week.astype('float64')
    return week.astype('int64')


def time_feature_extraction(train_kf, val_kf):
    """Add ``'day'`` and ``'week'`` columns derived from the configured date column.

    The date column name is read from ``configuration['metadata']['date']``.
    ``'day'`` is the day of the month and ``'week'`` the ISO week number.

    Parameters
    ----------
    train_kf, val_kf : pandas.DataFrame
        Frames with a datetime column; the new columns are added to them.

    Returns
    -------
    tuple
        ``(train_kf, val_kf)`` with the two extra columns.
    """
    date_feature = configuration['metadata']['date']

    for frame in (train_kf, val_kf):
        frame['day'] = frame[date_feature].dt.day
        # Series.dt.week is deprecated and no longer exists in pandas 2.
        frame['week'] = _week_of_year(frame[date_feature])

    return train_kf, val_kf

In [44]:
def select_features():
    """Return the column names the pipeline works on.

    The list holds the configured feature names, followed by the date column
    and then the target column.
    """
    columns = [*configuration['features'].keys()]
    metadata = configuration['metadata']
    columns += [metadata['date'], metadata['target']]
    return columns


In [45]:
def drop_missing_target_values(train_kf, val_kf):
    """Drop the rows of both frames whose target value is missing.

    The target column name is read from ``configuration['metadata']['target']``.
    Returns the filtered ``(train_kf, val_kf)`` pair.
    """
    target = configuration['metadata']['target']
    labelled_train = train_kf.loc[train_kf[target].notnull()]
    labelled_val = val_kf.loc[val_kf[target].notnull()]
    return labelled_train, labelled_val
    

## Preprocessing

In [46]:
def get_fold_indexs(df, random_state=None):
    """Split ``df`` into shuffled K-fold train/validation index sets.

    The number of folds is read from ``configuration['metadata']['kfold']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split; only its length matters for the split.
    random_state : int or None, optional
        Seed for the shuffle. The default ``None`` keeps the previous
        behaviour (a different split on every call); pass an integer for
        reproducible folds.

    Returns
    -------
    dict
        Maps fold number to ``(train_index, val_index)``, both arrays of
        positional row indices.
    """
    n_splits = configuration['metadata']['kfold']
    kfold = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # The earlier stray kfold.split(train_df) call was removed: it built an
    # unused generator and tied this function to a global frame.
    return {
        fold: (train_index, val_index)
        for fold, (train_index, val_index) in enumerate(kfold.split(df))
    }
    

In [47]:
def data_preprocess(df, per_fold_indexes):
    """Run the preprocessing pipeline on every fold.

    This is the heart of the pipeline: each step is independent of the
    others, so steps can easily be added to or removed from the sequence.

    Parameters
    ----------
    df : pandas.DataFrame
        Full data set.
    per_fold_indexes : dict
        Maps fold number to ``(train_index, val_index)`` positional indices,
        as produced by ``get_fold_indexs``.

    Returns
    -------
    dict
        Maps fold number to ``[train_kf, val_kf]``, the preprocessed training
        and validation frames of that fold.
    """
    # Built locally: the function used to rely on a notebook-level
    # `preprocess_data` (and `n_splits`) that did not exist on a fresh kernel.
    preprocess_data = dict()
    features = select_features()

    for fold, (train_index, val_index) in per_fold_indexes.items():
        # The fold indices are positions, so select with iloc; copy so the
        # in-place steps below never write into `df`.
        train_kf = df.iloc[train_index][features].copy()
        val_kf = df.iloc[val_index][features].copy()

        train_kf, val_kf = drop_missing_target_values(train_kf, val_kf)
        train_kf, val_kf = fill_missing_values(train_kf, val_kf)
        train_kf, val_kf = time_feature_extraction(train_kf, val_kf)
        train_kf, val_kf = category_feature_standarization(train_kf, val_kf)
        train_kf, val_kf = feature_standarization(train_kf, val_kf)
        train_kf, val_kf = feature_selection(train_kf, val_kf)
        preprocess_data[fold] = [train_kf, val_kf]

    return preprocess_data


## Model Evaluation

In [35]:
regressors = {  # display name -> unfitted estimator; the name becomes part of each result file name
    'Linear': LinearRegression(),
    'Random': RandomForestRegressor(),
    'Lasso': Lasso(),
    'Elastic': ElasticNet()
}

In [36]:
def evaluate_model():
    """Fit every regressor on every preprocessed fold and save its predictions.

    Reads the notebook-level ``regressors``, ``preprocess_data``,
    ``configuration``, ``EXPERIMENT_VERSION`` and ``RESULT_DIR``. For each
    regressor and fold, the model is fitted on the training frame (target and
    date columns removed) and its predictions for the validation frame are
    written to ``<RESULT_DIR>/<EXPERIMENT_VERSION>_<name>_<fold>.csv`` as an
    extra column next to the validation data.

    Returns
    -------
    None
    """
    target = configuration['metadata']['target']
    drop = [configuration['metadata']['date'], target]

    # Make sure the output directory exists before the first to_csv call.
    os.makedirs(RESULT_DIR, exist_ok=True)

    for name, regressor in regressors.items():
        # Evaluate every fold that was preprocessed; the loop used to be
        # hard-coded to the first four folds.
        for fold in sorted(preprocess_data):
            train, val = preprocess_data[fold]

            # drop() returns new frames, so the stored folds stay untouched.
            xtr, xts = train.drop(drop, axis=1), val.drop(drop, axis=1)
            ytr = train[target]

            regressor.fit(xtr, ytr)
            p = regressor.predict(xts)

            prediction_alias = EXPERIMENT_VERSION + "_" + name + "_" + str(fold)
            val_copy = val.copy()
            val_copy[prediction_alias] = p

            name_of_file = os.path.join(RESULT_DIR, prediction_alias + '.csv')
            val_copy.to_csv(name_of_file, index=False)
       

In [1012]:
per_fold_indexs = get_fold_indexs(train_df)
# Pass the variable assigned on the line above; the call used to reference
# `per_fold_indexes`, a name this cell never defines.
preprocess_data = data_preprocess(train_df, per_fold_indexs)
evaluate_model()

