# Introduction

In [1]:
import pandas as pd
import numpy as np
import calendar
pd.set_option('display.max_columns', 100)

## Function

### Preprocessing Data

In [2]:
def X_preprocessing(df, year_built_min=None):
    '''
    Convert a raw housing frame into a numeric feature matrix and target.

    The function reads these notebook-level names, which must exist before
    it is called: ``column_interval``, ``column_nominal``,
    ``col_major_missing`` and ``nominal_mode``.

    Steps, in order:
      1. drop the ``Id`` column;
      2. fill missing values of the interval columns with 0;
      3. encode ``CentralAir`` as 1 for ``'Y'`` and 0 for anything else;
      4. drop the nominal columns listed in ``col_major_missing``, fill the
         remaining nominal gaps from ``nominal_mode``, prefix ``MSSubClass``
         with ``'MSSC_'`` and one-hot encode every nominal column;
      5. express ``YearRemodAdd`` and ``YrSold`` as years since
         ``YearBuilt``, and ``YearBuilt`` as years since a baseline year;
      6. one-hot encode ``MoSold`` as ``sold_<MonthName>`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. It is not modified; all work happens on derived frames.
    year_built_min : number, optional
        Baseline subtracted from ``YearBuilt``. When omitted, the minimum
        ``YearBuilt`` of ``df`` itself is used. Pass the baseline of the
        training frame when transforming another frame so both share the
        same offset.

    Returns
    -------
    X : pandas.DataFrame
        Processed features, without ``SalePrice``.
    y : pandas.Series or None
        The ``SalePrice`` column when ``df`` has one, otherwise ``None``.
    '''
    has_target = 'SalePrice' in df

    # The response is listed with the interval columns; leave it out when the
    # frame has no response column. Filtering (instead of list.remove) also
    # copes with a column list that never contained 'SalePrice'.
    interval_cols = [col for col in column_interval
                     if has_target or col != 'SalePrice']

    # drop useless column "Id" (returns a new frame, the caller's is untouched)
    df = df.drop('Id', axis=1)

    df_nominal = df[column_nominal].copy()

    # interval columns: missing -> 0, then moved to the end of the frame
    df_interval = df[interval_cols].fillna(0)
    df = pd.concat([df.drop(interval_cols, axis=1), df_interval], axis=1)

    # binary conversion: 'Y' -> 1, everything else (including missing) -> 0
    df['CentralAir'] = (df['CentralAir'] == 'Y').astype('int64')

    # nominal columns: drop the mostly-missing ones, fill the rest by mode
    df_nominal = df_nominal.drop(col_major_missing, axis=1).fillna(nominal_mode)

    # MSSubClass is a numeric code, not a quantity: make it a labelled string
    # so that get_dummies treats it as a category
    df_nominal['MSSubClass'] = (
        df_nominal['MSSubClass'].astype('string').apply(lambda code: 'MSSC_' + code)
    )

    df_nominal = pd.get_dummies(df_nominal)
    df = pd.concat([df.drop(column_nominal, axis=1), df_nominal], axis=1)

    # time columns: keep a handle on the raw YearBuilt, because the relative
    # columns below must be computed before YearBuilt itself is rewritten
    year_built = df['YearBuilt']
    df['YearRemodAdd'] = df['YearRemodAdd'] - year_built
    df['YrSold'] = df['YrSold'] - year_built

    if year_built_min is None:
        year_built_min = year_built.min()
    df['YearBuilt'] = year_built - year_built_min

    # sold month: number -> 'sold_<MonthName>' -> dummy columns
    month_labels = df['MoSold'].apply(lambda month: 'sold_' + calendar.month_name[month])
    df = pd.concat([df.drop(['MoSold'], axis=1), pd.get_dummies(month_labels)], axis=1)

    if has_target:
        # split features from the response
        return df.drop('SalePrice', axis=1), df['SalePrice']

    print('This is test dataset return only X...')
    return df.copy(), None

## Import data

In [3]:
# Training rows, plus a lookup table with one row per dataset column
# (its 'data types' category, 'data process' and 'imputation' fields).
train_df = pd.read_csv('data/train.csv')
df_cols_process = pd.read_csv('data/column_types_and_process_imputation.csv')

In [4]:
# Preview the first rows of the column-description table.
df_cols_process.head()

Unnamed: 0,no,column,count null,data type,description,data types,data process,imputation
0,1,Id,1460 non-null,int64,unique,useless,drop,
1,2,MSSubClass,1460 non-null,int64,Identifies the type of dwelling involved in th...,nominal,one-hot-encoding,
2,3,MSZoning,1460 non-null,object,Identifies the general zoning classification o...,nominal,one-hot-encoding,
3,4,LotFrontage,1201 non-null,float64,Linear feet of street connected to property,interval,as is,
4,5,LotArea,1460 non-null,int64,Lot size in square feet,interval,as is,


## Data Exploration

-> Numerical - Categorical (two general groups) <- 

- Useless (unique identifier usually useless)
- Nominal (as named or categorical)
- Binary (either 1/0)
- Ordinal (categories with a meaningful order)
- Count (non-negative integers, starting from 0)
- Time (consisting of seasonal and date components)
- Interval (Examples include percentages, temperatures, and income.)

In [5]:
# Normalise the labels so the equality filters in the following cells match.
# The vectorised .str accessor leaves missing entries as NaN, whereas calling
# x.strip() through apply() would raise on them.
df_cols_process['data types'] = df_cols_process['data types'].str.strip()

In [6]:
# Distinct data-type labels present in the column-description table.
df_cols_process['data types'].unique()

array(['useless', 'nominal', 'interval', 'ordinal', 'time', 'binary'],
      dtype=object)

In [7]:
def _columns_of_type(type_label):
    '''Return the names of the columns whose 'data types' entry equals type_label.'''
    matches = df_cols_process['data types'] == type_label
    return df_cols_process.loc[matches, 'column'].tolist()


# One list of column names per data-type label.
column_interval = _columns_of_type('interval')  # no need processing
column_nominal = _columns_of_type('nominal')
column_ordinal = _columns_of_type('ordinal')  # no need processing
column_binary = _columns_of_type('binary')
column_time = _columns_of_type('time')

## Data Pre-processing

### Drop useless data types

In [8]:
# Working copy of the training data without the identifier column.
train_df_temp = train_df.drop(columns=['Id']).copy()

# Independent copies of the nominal and interval columns.
train_df_nominal = train_df_temp.loc[:, column_nominal].copy()
train_df_interval = train_df_temp.loc[:, column_interval].copy()

In [9]:
# Share of missing values per nominal column of the training data.
nominal_null_share = train_df_nominal.isnull().sum() / train_df_nominal.shape[0]

# Nominal columns with more than 30% missing values; X_preprocessing drops them.
col_major_missing = np.array(train_df_nominal.columns[nominal_null_share > 0.30])

# Most frequent value of each nominal column (first row of the mode table);
# X_preprocessing uses it to fill the remaining gaps.
nominal_mode = train_df_nominal.mode().iloc[0]

In [10]:
# Build the training feature matrix and the SalePrice target.
X_train, Y_train = X_preprocessing(train_df)

## Feature Scaling

### Z-score

In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
# Fit the z-score scaler on the training features only; the same fitted
# scaler is reused later for the test features.
scaler = StandardScaler()
scaler.fit(X_train)

# Scaled values come back as a bare array, so restore the column labels.
X_train_scale = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)

## Training Model

### Linear Regression

In [13]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares regressor with default settings.
linear_model = LinearRegression()

In [14]:
# Plain array copy of the scaled training features, shared by both models.
X_train_arr = X_train_scale.to_numpy(copy=True)
linear_model.fit(X_train_arr, Y_train)

LinearRegression()

In [15]:
# In-sample predictions of the linear model.
# NOTE(review): Y_train_pred is reassigned by the random-forest cell below
# before any metric is computed, so these predictions are never evaluated.
Y_train_pred = linear_model.predict(X_train_arr)

### Random Forest Regressor

In [16]:
from sklearn.ensemble import RandomForestRegressor

In [17]:
# Fix the seed: the forest uses random bootstrapping and feature sampling, so
# without it every run yields a different model, RMSE and submission.
random_forest = RandomForestRegressor(random_state=42)
random_forest.fit(X_train_arr, Y_train)

RandomForestRegressor()

In [18]:
# In-sample predictions of the random forest (replaces the earlier value of
# Y_train_pred).
Y_train_pred = random_forest.predict(X_train_arr)

## Root Mean Squared Error

In [19]:
from sklearn.metrics import mean_squared_error
import math

In [20]:
# RMSE on the training data itself (in-sample), for whichever model last
# assigned Y_train_pred. Arguments follow the documented (y_true, y_pred) order.
train_eval_metrics = math.sqrt(mean_squared_error(Y_train, Y_train_pred))

In [21]:
# Display the in-sample RMSE.
train_eval_metrics

11153.795874488806

# Test Dataset

In [22]:
# Rows to predict; this file has no SalePrice column.
test_df = pd.read_csv('data/test.csv')

In [23]:
# Same preprocessing as for training; y_test is None because the frame has
# no SalePrice column.
# NOTE(review): called like this, X_preprocessing derives the YearBuilt offset
# from the frame it is given, so the test offsets are relative to the oldest
# house in the test set rather than in the training set.
X_test, y_test = X_preprocessing(test_df)

This is test dataset return only X...


## Align the columns of the train and test datasets

In [24]:
# Columns the model was trained on that the test frame lacks (for example a
# dummy for a category that never occurs in the test data).
train_only_cols = X_train_scale.columns.difference(X_test.columns)

# Give the test frame exactly the training columns, in training order.
# Columns found only in the test frame are discarded by the left join.
X_train_scale_final, X_test = X_train_scale.align(X_test, join='left', axis=1)

# A category absent from the test data means a raw indicator of 0 on every
# row. Set it before scaling: left as NaN and zeroed after scaling, it would
# be encoded as the training mean instead.
if not train_only_cols.empty:
    X_test[train_only_cols] = 0

### Using z-scaler to transform it

In [25]:
# Apply the scaler fitted on the training data; it is not refitted here.
X_test_scale = scaler.transform(X_test)

# Restore the column labels, then replace any remaining missing entries by 0.
# NOTE(review): 0 in scaled space presumably corresponds to the training mean
# of that column (StandardScaler centring) - confirm this is the intended fill.
X_test_scale_final = pd.DataFrame(X_test_scale, columns=X_test.columns).fillna(0)

In [27]:
# Predict sale prices for the test rows with the fitted random forest.
X_test_arr = X_test_scale_final.to_numpy(copy=True)
Y_test_pred = random_forest.predict(X_test_arr)

In [28]:
# Label each prediction with the Id of the test row it belongs to. Reading the
# ids from test_df (rather than a hard-coded range) keeps the submission
# correct if the test file changes size or id numbering. Predictions are in
# the same row order as test_df because preprocessing does not reorder rows.
df_submission = pd.DataFrame(
    {'SalePrice': Y_test_pred},
    index=pd.Index(test_df['Id'], name='Id'),
)

In [29]:
# Write the submission file; the index is written as the 'Id' column.
df_submission.to_csv('submission_15_january_2023.csv')

# Resources

https://towardsdatascience.com/7-data-types-a-better-way-to-think-about-data-types-for-machine-learning-939fae99a689

https://medium.com/analytics-vidhya/ways-to-handle-categorical-column-missing-data-its-implementations-15dc4a56893#:~:text=Step%201%3A%20Find%20which%20category,and%20keep%20newly%20imputed%20columns.&text=Advantage%3A%20Simple%20and%20easy%20to%20implement%20for%20categorical%20variables%2Fcolumns.



https://stackoverflow.com/questions/62653114/how-can-i-deal-with-this-git-warning-pulling-without-specifying-how-to-reconci