## Objectives of this section:

1. Preprocessing of the data to get it ready for training the model 
2. Feature engineering 
    - See if we can combine some features
    - Then we will create some polynomial features using the columns with the highest correlation to our target variable
    - Try simplifying some existing features, e.g. for columns with a rating of 1-10, try splitting them into 3 bands of "Bad", "Average" & "Good"
3. Feature selection, using methods like Recursive Feature Elimination & Variance Inflation Factors

### Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import r2_score
# make the local helper-module directory importable (appended to the end of sys.path)
sys.path.append('/Users/ganeshsivam/Mods')
import corr

# Raise the pandas display limits so wide/long frames are printed in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Plot styling; the two lines below are IPython magics and only run inside a notebook/IPython session.
sns.set_style('darkgrid')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [2]:
# Working frame used by the exploratory cells below (read from ./data/train_cleaned.csv).
df = pd.read_csv("./data/train_cleaned.csv")

#### Create dummy variables of categorical columns

#### Categorical columns can be identified by their object dtype. However, a couple of numeric columns are actually categorical; these are added to the cat_cols list manually

In [3]:

def dummies(df, ref_year=2019):
    """One-hot encode the categorical columns and turn year columns into ages.

    Categorical columns are every column with an object/string dtype plus the
    numerically coded "MS SubClass" and "Mo Sold" (when present). Each one is
    replaced by its dummy columns, named ``<column>_<value>`` and placed after
    the remaining columns, in the order the categorical columns were found.

    Every remaining (non-encoded) column whose name contains "Year", "year"
    or "Yr" -- except names containing "rage", e.g. "Garage Yr Blt" -- is
    then replaced by ``ref_year - value``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    ref_year : int, default 2019
        Year that the year columns are subtracted from.

    Returns
    -------
    pandas.DataFrame
        A new frame with the dummy columns and converted year columns.
    """
    is_object = pd.api.types.is_object_dtype
    is_string = pd.api.types.is_string_dtype
    cat_cols = [
        name
        for name, dtype in df.dtypes.items()
        if is_object(dtype) or is_string(dtype)
    ]

    # Numerically coded columns that are really categories. Skip any that are
    # absent or were already picked up above, so no column is encoded twice
    # (encoding it a second time would fail because it has been dropped).
    for name in ("MS SubClass", "Mo Sold"):
        if name in df.columns and name not in cat_cols:
            cat_cols.append(name)

    out = pd.get_dummies(df, columns=cat_cols) if cat_cols else df.copy()

    # Only original, non-encoded columns are candidates: a dummy column whose
    # name happens to contain "Year"/"Yr" must stay a 0/1 indicator.
    year_cols = [
        c
        for c in df.columns
        if c not in cat_cols
        and ("Year" in c or "year" in c or "Yr" in c)
        and "rage" not in c
    ]
    for c in year_cols:
        out[c] = ref_year - out[c]
    return out
# Replace the working frame with its encoded version.
df = dummies(df)

#### Save the datasets once before feature engineering so later on we can test the effect of feature engineering on our metrics

In [4]:

# Reload both cleaned splits from disk so the baseline starts from untouched data.
train = pd.read_csv("./data/train_cleaned.csv")
test = pd.read_csv("./data/test_cleaned.csv")

# Encode each split independently; their dummy columns can differ until cols_sync is applied.
train = dummies(train)
test = dummies(test)

def cols_sync(df1,df2):
    """Give two frames the same columns in the same order.

    Columns present in only one frame are added to the other filled with 0.
    The resulting column order is df1's columns followed by the columns that
    only df2 had (in df2's order).

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to align. Neither is modified; new frames are returned.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df1_aligned, df2_aligned)`` with identical column labels and order.
    """
    only_in_df2 = [c for c in df2.columns if c not in df1.columns]
    all_cols = list(df1.columns) + only_in_df2

    # reindex creates the missing columns in one step (filled with 0) instead
    # of inserting them one by one into the caller's frames.
    df1 = df1.reindex(columns=all_cols, fill_value=0)
    df2 = df2.reindex(columns=all_cols, fill_value=0)
    return df1,df2

    

# Align the two encoded splits, then write the pre-feature-engineering baseline files.
# NOTE(review): unlike the later cells, SalePrice is not dropped from `test` before saving;
# if test_cleaned.csv has no SalePrice column, cols_sync will have added it as all zeros -- confirm this is intended.
train,test = cols_sync(train,test)
train.to_csv("./data/train_bef_FE.csv",index=False)
test.to_csv("./data/test_bef_FE.csv",index=False)

## Feature Engineering

### We will create some combined columns for the data, like "TotalBath" and "Total SF", each of which sums up several related columns

In [328]:
def comb_features(df):
    """Add the aggregate columns "TotalBath", "Total SF" and "AllPorchSF".

    The new columns are written onto the frame that was passed in, and that
    same frame is returned.
    """
    half = 0.5  # a half bath counts as half a bathroom

    baths = df["Bsmt Full Bath"] + (half * df["Bsmt Half Bath"])
    baths = baths + df["Full Bath"]
    df["TotalBath"] = baths + (half * df["Half Bath"])

    # Living area + basement + both floors.
    # NOTE(review): if "Gr Liv Area" already includes the 1st/2nd floor areas,
    # those are counted twice here -- confirm this is intended.
    area = df["Gr Liv Area"]
    for part in ("Total Bsmt SF", "1st Flr SF", "2nd Flr SF"):
        area = area + df[part]
    df["Total SF"] = area

    # Combined area of the four porch columns.
    porch = df["Open Porch SF"]
    for part in ("Enclosed Porch", "3Ssn Porch", "Screen Porch"):
        porch = porch + df[part]
    df["AllPorchSF"] = porch

    return df
# Add the combined columns to the working frame.
df = comb_features(df)

In [329]:
# Absolute correlation of every column with SalePrice, strongest first.
# The frame is built from the correlation Series itself, so its index always
# matches the columns that corr() actually returned.
corr_df = df.corr()['SalePrice'].abs().to_frame(name="corelataion_target")
corr_df.sort_values("corelataion_target",ascending=False,inplace=True)
# Exclude the target by name rather than by position, so it can never end up
# in the feature list even if another column ties with it at 1.0.
top_15 = [col for col in corr_df.index if col != 'SalePrice'][:15]


### Create a function that adds polynomial (squared, cubed) and square-root features for the given columns; in this case we will use the top 15 most correlated columns

In [330]:
def poly_features(df,cols):
    """Add squared, cubed and square-root versions of the given columns.

    For each name ``c`` in ``cols`` three columns are written onto ``df``:
    ``c + "s2"`` (square), ``c + "s3"`` (cube) and ``c + "sq"`` (square
    root). The frame that was passed in is modified and returned.
    """
    powers = (("s2", 2), ("s3", 3))
    for base in cols:
        for suffix, exponent in powers:
            df[str(base + suffix)] = df[base] ** exponent
        df[str(base + "sq")] = np.sqrt(df[base])
    return df
# Add the derived columns for the 15 columns selected above.
df = poly_features(df,top_15)

We will need a function to sync the columns of our train and test datasets: because of the dummy-variable columns, the two datasets may not share all of their columns

In [331]:

def cols_sync(df1,df2):
    """Give two frames the same columns in the same order.

    NOTE: this cell re-declares the cols_sync function that an earlier cell of
    this notebook already defines.

    Columns present in only one frame are added to the other filled with 0.
    The resulting column order is df1's columns followed by the columns that
    only df2 had (in df2's order).

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to align. Neither is modified; new frames are returned.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df1_aligned, df2_aligned)`` with identical column labels and order.
    """
    only_in_df2 = [c for c in df2.columns if c not in df1.columns]
    all_cols = list(df1.columns) + only_in_df2

    # reindex creates the missing columns in one step (filled with 0) instead
    # of inserting them one by one into the caller's frames.
    df1 = df1.reindex(columns=all_cols, fill_value=0)
    df2 = df2.reindex(columns=all_cols, fill_value=0)
    return df1,df2

    


#### We will save different iterations of our feature engineering to test for the best results

In [332]:
# Start again from the cleaned files so this variant is independent of earlier cells.
train = pd.read_csv("./data/train_cleaned.csv")
test = pd.read_csv("./data/test_cleaned.csv")

def eda_proc_no_poly(df):
    """Encode the frame with dummies() and add the combined columns.

    This is the pipeline variant without polynomial features.
    """
    return comb_features(dummies(df))

# Run the no-polynomial pipeline on each split, then align their columns.
train = eda_proc_no_poly(train)
test = eda_proc_no_poly(test)
train,test = cols_sync(train,test)

# Write this variant to disk; SalePrice is removed from the test frame before it is saved.
train.to_csv("./data/train_nopoly.csv",index=False)
test.drop("SalePrice",axis=1,inplace=True)
test.to_csv("./data/test_nopoly.csv",index=False)

In [333]:
# Start again from the cleaned files so this variant is independent of earlier cells.
train = pd.read_csv("./data/train_cleaned.csv")
test = pd.read_csv("./data/test_cleaned.csv")
                                                                                                                                                                                                           

def eda_proc_final(df):
    """Run the full pipeline: dummies(), comb_features(), then poly_features().

    The polynomial step uses the module-level ``top_15`` column list, so that
    list must already be defined when this function is called.
    """
    return poly_features(comb_features(dummies(df)), top_15)

# Run the full pipeline on each split, then align their columns.
# NOTE(review): poly_features runs before cols_sync, so every column in top_15 must already
# exist in each split after dummies()/comb_features() -- confirm for the test data.
train = eda_proc_final(train)
test = eda_proc_final(test)
train,test = cols_sync(train,test)

# Write the final variant to disk; SalePrice is removed from the test frame before it is saved.
train.to_csv("./data/train_final.csv",index=False)
test.drop("SalePrice",axis=1,inplace=True)
test.to_csv("./data/test_final.csv",index=False)