# Introduction

In [1]:
import pandas as pd
import numpy as np
import calendar
pd.set_option('display.max_columns', 100)

## Functions

### Preprocessing Data

In [2]:
def X_preprocessing(df, yrbuilt_min=None):
    """
    Apply the notebook's feature preprocessing to a raw housing dataframe.

    Relies on notebook-level names that must already be defined when this
    function is called: column_interval, column_nominal, col_major_missing
    and nominal_mode.

    Steps: drop 'Id'; fill interval columns with 0; encode 'CentralAir' as
    1/0; drop the nominal columns listed in col_major_missing, fill the
    remaining nominal nulls with nominal_mode and one-hot encode them;
    re-express the year columns relative to 'YearBuilt'; one-hot encode
    'MoSold'.

    input:
        df: raw dataframe, with or without the 'SalePrice' column
        yrbuilt_min: optional reference year subtracted from 'YearBuilt'.
            When None (default) the minimum 'YearBuilt' of df itself is used,
            so pass the training minimum to put another dataset on the same
            baseline.
    output:
        (X, y): X is the processed feature dataframe; y is the 'SalePrice'
        column, or None when df has no 'SalePrice'.
    """

    # checking response variable: it is absent from test data, so it is only
    # treated as an interval column when df actually contains it
    has_target = 'SalePrice' in df
    interval_cols = [col for col in column_interval if has_target or col != 'SalePrice']

    # drop useless column "Id"
    df = df.drop('Id', axis=1)

    df_nominal = df[column_nominal].copy()
    df_interval = df[interval_cols].copy()

    # preprocessing on interval dataframe
    ## simple null value imputation: interval to 0
    ## concat to the actual table
    df_interval = df_interval.fillna(0)
    df = df.drop(interval_cols, axis=1)
    df = pd.concat([df, df_interval], axis=1)

    # binary data conversion: 'Y' -> 1, anything else (including nulls) -> 0
    df['CentralAir'] = df['CentralAir'].apply(lambda x: 1 if x == 'Y' else 0)

    # preprocessing on nominal dataframe
    ## drop the mostly-missing columns, then fill remaining nulls with nominal_mode
    df_nominal = df_nominal.drop(col_major_missing, axis=1)
    df_nominal = df_nominal.fillna(nominal_mode)

    ## convert MSSubClass value to string
    df_nominal['MSSubClass'] = df_nominal['MSSubClass'].astype('string')
    df_nominal['MSSubClass'] = df_nominal['MSSubClass'].apply(lambda x: 'MSSC_'+x)

    ## all set, convert all the categorical variables to dummy variables
    df_nominal = pd.get_dummies(df_nominal)

    ## concat to the actual table
    df = df.drop(column_nominal, axis=1)
    df = pd.concat([df, df_nominal], axis=1)

    # preprocessing on time columns
    ## resolve the baseline before 'YearBuilt' is overwritten below
    if yrbuilt_min is None:
        yrbuilt_min = df['YearBuilt'].min()

    ## time approach-renovation: comparing to the year built
    ## time approach-soldyear: comparing to the year built
    df['YearRemodAdd'] = df['YearRemodAdd'] - df['YearBuilt']
    df['YrSold'] = df['YrSold'] - df['YearBuilt']

    ## time approach-built: comparing to the baseline year
    ## (must come last: the two lines above need the original 'YearBuilt')
    df['YearBuilt'] = df['YearBuilt'] - yrbuilt_min

    ## time approach-soldmonth: to dummy variables
    df['MoSold'] = df['MoSold'].apply(lambda x: 'sold_'+calendar.month_name[x])
    df_dum_months = pd.get_dummies(df['MoSold'])
    df = df.drop(['MoSold'], axis=1)

    df = pd.concat([df, df_dum_months], axis=1)

    if has_target:
        # final split between the features and the response
        X = df.drop('SalePrice', axis=1)
        y = df['SalePrice']

    else:
        print('This is test dataset return only X...')
        X = df.copy()
        y = None

    return X, y

In [3]:
def z_score_scaling(X_df, mean, stdev):
    '''
    Convert preprocessed data to z-scores using the supplied statistics.

    input: x df, mean, and stdev (mean and stdev are combined with X_df by
           ordinary arithmetic broadcasting/alignment)
    output: x df (scaled), computed as (X_df - mean) / stdev
    '''

    return (X_df - mean) / stdev

## Import data

In [4]:
# Load the training data and the per-column metadata table; the metadata's
# 'data types' and 'column' fields drive the column grouping further down.
train_df = pd.read_csv('data/train.csv')
df_cols_process = pd.read_csv('data/column_types_and_process_imputation.csv')

## Data Exploration

-> Numerical - Categorical (two general groups) <- 

- Useless (unique identifier usually useless)
- Nominal (as named or categorical)
- Binary (either 1/0)
- Ordinal (categories with a natural order)
- Count (Integer number starting from 0 exclude negative number)
- Time (consisting of seasonal and date values)
- Interval (Examples include percentages, temperatures, and income.)

In [5]:
# Trim surrounding whitespace from the data-type labels. The vectorised .str
# accessor passes missing values through instead of raising like a per-element
# x.strip() call would.
df_cols_process['data types'] = df_cols_process['data types'].str.strip()

In [6]:
# Show the distinct data-type labels present in the metadata table.
df_cols_process['data types'].unique()

array(['useless', 'nominal', 'interval', 'ordinal', 'time', 'binary'],
      dtype=object)

In [7]:
# Group the column names by the data type declared in the metadata table.
def _columns_of_type(type_label):
    """Return the 'column' entries whose 'data types' value equals type_label."""
    matches_type = df_cols_process['data types'] == type_label
    return list(df_cols_process[matches_type]['column'])


column_interval = _columns_of_type('interval')
column_nominal = _columns_of_type('nominal')
column_ordinal = _columns_of_type('ordinal')
column_binary = _columns_of_type('binary')
column_time = _columns_of_type('time')

## Data Pre-processing

### Drop useless data types

In [8]:
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# from sklearn.preprocessing import OrdinalEncoder

In [9]:
# Working copy of the training data without the 'Id' column.
train_df_temp = train_df.drop(columns='Id').copy()

In [10]:
# Independent copies of the nominal and interval column groups.
train_df_nominal, train_df_interval = (
    train_df_temp[group_cols].copy() for group_cols in (column_nominal, column_interval)
)

### Data Imputation

#### Nominal

In [11]:
# Build everything from the raw training frame so this cell gives the same
# result every time it is run. Computing the missing-value share from the
# already-pruned train_df_nominal would yield an empty col_major_missing on a
# second run.
nominal_raw = train_df[column_nominal]

# nominal columns with more than 30% missing values
nominal_missing_share = nominal_raw.isnull().mean()
col_major_missing = np.array(nominal_raw.columns[(nominal_missing_share > 0.30).to_numpy()])

# drop them
train_df_nominal = nominal_raw.drop(col_major_missing, axis=1)

# per-column mode of the remaining nominal columns (first mode row)
nominal_mode = train_df_nominal.mode().iloc[0]

# simple null value imputation: mode
train_df_nominal = train_df_nominal.fillna(nominal_mode)

#### Interval 

In [12]:
# Simple null-value imputation for interval columns: replace every null with 0.
train_df_interval = train_df_interval.fillna(0)

In [13]:
# Swap the original interval columns for their imputed versions.
train_df_temp = pd.concat(
    [train_df_temp.drop(columns=column_interval), train_df_interval],
    axis=1,
)

### Nominal data to Dummies

In [14]:
# Turn the MSSubClass values into text and tag each one with an 'MSSC_' prefix.
mssubclass_text = train_df_nominal['MSSubClass'].astype('string')
train_df_nominal['MSSubClass'] = mssubclass_text.apply(lambda code: 'MSSC_'+code)

In [15]:
# One-hot encode the nominal columns (author's note: from 43 to 281 columns).
train_df_nominal = pd.get_dummies(train_df_nominal)

In [16]:
# Swap the original nominal columns for their one-hot encoded versions.
train_df_temp = pd.concat(
    [train_df_temp.drop(columns=column_nominal), train_df_nominal],
    axis=1,
)

In [66]:
# Inspect the raw 'Condition2' column of the training data.
train_df['Condition2']

0       Norm
1       Norm
2       Norm
3       Norm
4       Norm
        ... 
1455    Norm
1456    Norm
1457    Norm
1458    Norm
1459    Norm
Name: Condition2, Length: 1460, dtype: object

In [64]:
# Inspect one of the dummy columns produced from 'Condition2'.
train_df_temp['Condition2_RRAe']

0       0
1       0
2       0
3       0
4       0
       ..
1455    0
1456    0
1457    0
1458    0
1459    0
Name: Condition2_RRAe, Length: 1460, dtype: uint8

### Change the value of binary data

In [17]:
# Encode 'CentralAir' as 1 for 'Y' and 0 otherwise. Reading from the raw
# train_df keeps the cell re-runnable: reading back from train_df_temp would
# turn every value into 0 on a second run.
train_df_temp['CentralAir'] = (train_df['CentralAir'] == 'Y').astype('int64')

### Change the value of time data

In [18]:
# Re-express the year columns as offsets. All inputs come from the raw
# train_df so the cell can be re-run safely: reading back from train_df_temp
# would subtract from already-transformed values on a second run.
# Remodel year and sale year are measured relative to the year built.
train_df_temp['YearRemodAdd'] = train_df['YearRemodAdd'] - train_df['YearBuilt']
train_df_temp['YrSold'] = train_df['YrSold'] - train_df['YearBuilt']

# Year built is measured relative to the earliest build year in the training data.
train_df_temp_yrbuilt_min = train_df['YearBuilt'].min()
train_df_temp['YearBuilt'] = train_df['YearBuilt'] - train_df_temp_yrbuilt_min

In [19]:
# Replace the numeric sale month with one dummy column per month name.
train_df_temp['MoSold'] = train_df_temp['MoSold'].apply(
    lambda month_no: 'sold_'+calendar.month_name[month_no]
)
train_df_temp_dum_months = pd.get_dummies(train_df_temp['MoSold'])

train_df_temp = pd.concat(
    [train_df_temp.drop(columns=['MoSold']), train_df_temp_dum_months],
    axis=1,
)

## Define Independent (Feature) and Response Data of Training

In [20]:
# Response is the 'SalePrice' column; every other column is a feature.
Y_train = train_df_temp['SalePrice']
X_train = train_df_temp.drop(columns='SalePrice')

---

## Feature Scaling

### Z-score

In [21]:
# Zscore using SKLearn

from sklearn.preprocessing import scale
# scale(X_orig, axis=0, with_mean=True, with_std=True, copy=True)

def zscore_normalize_features(X, mu = False, sigma = False):
    """
    computes X, z-score normalized by column

    Args:
      X (ndarray (m,n))     : input data, m examples, n features
      mu (ndarray (n,))     : optional precomputed mean of each feature;
                              computed from X when omitted (False or None)
      sigma (ndarray (n,))  : optional precomputed standard deviation of each
                              feature; computed from X when omitted (False or None)

    Returns:
      X_norm (ndarray (m,n)): input normalized by column
      mu (ndarray (n,))     : mean of each feature that was used
      sigma (ndarray (n,))  : standard deviation of each feature that was used
    """

    # Each statistic is resolved on its own, so supplying only one of them no
    # longer leaves the other as the False placeholder in the arithmetic below.
    if mu is None or mu is False:
        # find the mean of each column/feature
        mu = np.mean(X, axis=0)                     # mu will have shape (n,)
    if sigma is None or sigma is False:
        # find the standard deviation of each column/feature
        sigma = np.std(X, axis=0)                   # sigma will have shape (n,)

    # element-wise, subtract mu for that column from each example, divide by std for that column
    X_norm = (X - mu) / sigma

    return (X_norm, mu, sigma)

In [22]:
# Alternative using SKLearn (kept for reference):
# X_norm = scale(X_train, with_mean = True, with_std = True, copy = True)
# Scale the training features; mu and sigma are kept so the test set can be
# scaled with the same statistics later.
X_train_scale, mu, sigma = zscore_normalize_features(X_train)

## Fitting Linear Regression

In [23]:
from sklearn.linear_model import LinearRegression

In [24]:
# Linear regression estimator with default settings.
linear_model = LinearRegression()

In [31]:
# Convert the scaled training features to a NumPy array for fitting.
X_train_arr = np.array(X_train_scale)

In [32]:
# Fit the model on the scaled training features and the sale prices.
linear_model.fit(X_train_arr, Y_train)

LinearRegression()

In [33]:
# Keep the fitted intercept (b) and coefficients (w).
b = linear_model.intercept_
w = linear_model.coef_

In [35]:
# Predictions on the same data the model was fitted on (in-sample).
Y_train_pred = linear_model.predict(X_train_arr)

## Root Mean Squared Error

In [36]:
from sklearn.metrics import mean_squared_error
import math

In [37]:
# Training RMSE. Arguments follow sklearn's (y_true, y_pred) order; the value
# is the same either way for squared error, but the order matters for
# asymmetric metrics.
train_eval_metrics = math.sqrt(mean_squared_error(Y_train, Y_train_pred))

In [38]:
# Display the training RMSE.
train_eval_metrics

20949.493465736316

## Test Dataset

In [39]:
# Load the test data.
test_df = pd.read_csv('data/test.csv')

In [40]:
# Run the test data through the preprocessing function; y comes back as None
# when the frame has no 'SalePrice' column.
X_test, y = X_preprocessing(test_df)

This is test dataset return only X...


In [41]:
# Scale the test features with the training mu and sigma; mu_test and
# sigma_test are simply the values returned back by the function.
X_test_scale, mu_test, sigma_test = zscore_normalize_features(X_test, mu, sigma)

## Align the columns of the train and test datasets

In [48]:
# Align the test frame to the training frame's columns (left join on axis=1).
X_train_scale_final, X_test_scale_final = X_train_scale.align(X_test_scale, join = 'left', axis=1)

In [68]:
# Replace the nulls remaining in the aligned test frame with 0.
X_test_scale_final = X_test_scale_final.fillna(0)

In [72]:
# Sanity check: list any test columns that still contain nulls.
X_test_scale_final.columns[X_test_scale_final.isna().sum() != 0]

Index([], dtype='object')

# Resources

https://medium.com/analytics-vidhya/ways-to-handle-categorical-column-missing-data-its-implementations-15dc4a56893#:~:text=Step%201%3A%20Find%20which%20category,and%20keep%20newly%20imputed%20columns.&text=Advantage%3A%20Simple%20and%20easy%20to%20implement%20for%20categorical%20variables%2Fcolumns.

