In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import statsmodels.api as sm
# Load the cleaned training data that the feature-engineering steps below operate on.
train_csv = './../data/train_clean.csv'
df = pd.read_csv(train_csv)

<p style="font-size:25px">In this notebook, I will be engineering features for this project. </p>
<br>
<p style="font-size:16px;">My process for feature engineering in this project consists of the following:</p>
<ol>
    <li><b>Remove more useless variables</b></li>
    <dd>After some analysis, I was able to find more variables with little to no predictive power so I dropped them.</dd>
    <li><b>Removing homes with outlier prices</b></li>
    <dd>As mentioned in the previous section, I want my model to be used for the average home because that is what is most popular, so I don't want my model to look at houses that are statistically abnormal in price.</dd>
    <li><b>Removing heavily imbalanced qualitative variables</b></li>
    <dd>Heavily imbalanced qualitative variables are essentially constants and don't offer any predictive power.</dd>
    <li><b>Creating Indicator Variables</b></li>
    <dd>Not all houses have the same features, so I created indicator variables that signal whether a house has a specific feature or not.</dd>  
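    <li><b>Scaling Independent Variables</b></li>
    <dd>Scaling the independent variables puts them on a common scale so that they carry equal weight, which can help with model performance. The dependent variable (SalePrice) is kept in its original units.</dd>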
    <li><b>Removing Influential Points</b></li>
    <dd>Influential points hurt model performance so it is best to remove them.</dd> 
</ol>    

In [2]:
# Removing more useless variables: these columns were judged to carry little
# to no predictive power, so they are dropped before any feature engineering.
useless_columns = ['MoSold', 'YrSold', 'Id']
df = df.drop(columns=useless_columns)

In [3]:
# Removing home price outliers: keep only the rows whose SalePrice lies
# strictly inside the 1.5 * IQR fences around the first and third quartiles.
price = df['SalePrice']
q1, q3 = price.quantile([.25, .75])
iqr = q3 - q1
under_upper_fence = price < q3 + iqr*1.5
over_lower_fence = price > q1 - iqr*1.5
df = df[under_upper_fence & over_lower_fence]

In [4]:
# Removing heavily imbalanced qualitative (categorical) variables.
# A categorical column whose most frequent level covers more than 90% of its
# non-missing values is close to a constant, so it is dropped.
IMBALANCE_THRESHOLD = .9

# Select the non-numeric columns by dtype. The previous approach (everything
# missing from df.corr().columns) raises on pandas >= 2.0, where corr() no
# longer silently skips string columns.
categorical = df.select_dtypes(exclude=['number', 'bool'])

# Share of the most frequent level among the non-missing values of each column;
# this is the same ratio as describe()'s freq / count. A column with no
# non-missing values yields NaN and is therefore never selected for removal.
top_level_share = pd.Series(
    {name: categorical[name].value_counts(normalize=True).max()
     for name in categorical.columns},
    dtype=float,
)

rem = top_level_share[top_level_share > IMBALANCE_THRESHOLD].index
df = df.drop(columns=rem)

In [5]:
#creating variables
import matplotlib.pyplot as plt  # not used in this cell; kept in case other cells rely on `plt`

# Names of the columns that received a has_<col> indicator below.
temp2 = []
n_rows = df.shape[0]

# Snapshot the numeric columns by dtype before the loop adds new ones.
# (df.corr().columns was used for this before, but corr() raises on pandas >= 2.0
# while the frame still contains string columns.)
numeric_cols = list(df.select_dtypes(include=['number', 'bool']).columns)

for col in numeric_cols:
    # Frequency of every distinct value, most frequent first; missing values
    # count as their own level.
    counts = df[col].value_counts(dropna=False)
    if counts.empty:
        continue
    # A column gets an indicator when a single value covers more than half of
    # the rows and the column still has more than 10 distinct values.
    # NOTE(review): the indicator assumes that dominant value is 0 ("feature
    # absent") -- confirm this holds for every column selected here.
    if counts.iloc[0] > .5*n_rows and len(counts) > 10:
        # 0 where the value is 0, 1 everywhere else (missing values map to 1).
        df['has_'+col] = (df[col] != 0).astype(int)
        temp2.append(col)

# 1 when the remodel year differs from the build year. In the Ames data
# dictionary YearRemodAdd equals the construction year when the house was never
# remodelled, so equality means "not remodelled" (the flag used to be inverted).
df['is_house_remod'] = (df['YearBuilt'] != df['YearRemodAdd']).astype(int)
# 1 when the garage was built in the same year as the house.
df['is_built_with_garage'] = (df['YearBuilt'] == df['GarageYrBlt']).astype(int)
# 1 when the garage year matches the remodel year.
df['remod_with_garage'] = (df['YearRemodAdd'] == df['GarageYrBlt']).astype(int)

In [6]:
# Keep the unscaled SalePrice aside in `temp`; a later cell writes it back
# after the whole frame has been scaled.
temp = df['SalePrice']

# One-hot encode the remaining categorical columns, dropping the first level of
# each so the dummy columns are not perfectly collinear.
df = pd.get_dummies(data=df, drop_first=True)

In [7]:
# Min-max scale every column. MinMaxScaler is already imported at the top of
# the notebook, so it is not imported again here.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df)

# fit_transform returns a plain array, so rebuild the frame with the original
# column names; the result gets a fresh default (0..n-1) index.
df = pd.DataFrame(scaled_values, columns=df.columns)

In [8]:
# Put the unscaled SalePrice (saved in `temp` before scaling) back. Raw values
# are assigned positionally because `temp` still carries the pre-scaling index.
df['SalePrice'] = temp.to_numpy()

In [9]:
# Removing influential points
# For every feature that is strongly correlated with SalePrice, fit a simple
# regression SalePrice ~ feature and drop the observations whose Cook's
# distance marks them as influential.
CORR_THRESHOLD = .5    # only features with a correlation to SalePrice above this are screened
COOKS_THRESHOLD = .5   # observations with a larger Cook's distance are removed

np.set_printoptions(suppress=True)

price_corr = df.corr()['SalePrice']
# Exclude the target by name. Slicing off the last entry assumed SalePrice was
# the last column, which no longer holds once indicator/dummy columns are added.
screened_features = [name for name in price_corr[price_corr > CORR_THRESHOLD].index
                     if name != 'SalePrice']

for col in screened_features:
    # Simple linear regression of SalePrice on this single feature (plus intercept).
    design = sm.add_constant(df[col])
    model = sm.OLS(df['SalePrice'], design).fit()

    # Cook's distance for each observation, in the current row order of df.
    influence = model.get_influence()
    cooks = np.asarray(influence.cooks_distance[0])

    # Work with a positional boolean mask: df's index has gaps after earlier
    # removals, so aligning the distances by label would pair them with the
    # wrong rows. Only the flagged rows are dropped, not every row that happens
    # to share the flagged feature value.
    influential = cooks > COOKS_THRESHOLD
    if influential.any():
        df = df.loc[~influential]

In [10]:
# Persist the engineered feature table; the row index is not written out.
output_csv = './../data/data.csv'
df.to_csv(output_csv, index=False)