### Libraries and Settings

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

In [2]:
# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.options.display.max_columns = None

### Import Data

In [3]:
# Training split; the 'Unnamed: 0' column present in the CSV is discarded.
train = pd.read_csv('../data/Model Train.csv').drop(columns='Unnamed: 0')

In [4]:
# Hold-out split; the 'Unnamed: 0' column present in the CSV is discarded.
test = pd.read_csv('../data/Model Test.csv').drop(columns='Unnamed: 0')

In [5]:
# Kaggle test set, read as-is (no column is dropped here, unlike train/test).
Ktest = pd.read_csv(
    '../data/Kaggle_test.csv'
)

In [6]:
# features = Ktest.isnull().sum().index

# missing = []
# missing.append(train.isnull().sum()[:-1])
# missing.append(test.isnull().sum()[:-1])
# missing.append(Ktest.isnull().sum())

# missingness = pd.DataFrame({'train':missing[0],'test':missing[1],'Ktest':missing[2]},
#                            index=features)

# pd.set_option('display.max_rows', None)
# missingness
# # pd.reset_option('display.max_rows')

# Nominal Categorical Features

In [7]:
# Columns treated as nominal (unordered) categoricals in this notebook.
cat_nom_cols = [
    'MSSubClass', 'MSZoning', 'Street', 'Alley',
    'LotShape', 'LandContour', 'Utilities', 'LotConfig',
    'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle',
]

### MSSubClass

Recode this column to 'PUD' if the house is a PUD and 'NotPUD' otherwise. The other information in this column is redundant because it is repeated in other columns.

In [8]:
def pp_MSSubClass(sr):
    """Collapse MSSubClass into a two-level 'PUD' / 'NotPUD' label.

    Parameters
    ----------
    sr : pandas.Series
        Raw MSSubClass codes, or the output of a previous call to this function.

    Returns
    -------
    pandas.Series
        New Series of the same length with 'PUD' where the value is one of the
        codes 120, 150, 160, 180 (or is already the label 'PUD') and 'NotPUD'
        everywhere else. Missing values therefore map to 'NotPUD'. The input
        Series is not modified.
    """
    # The label 'PUD' is accepted alongside the numeric codes so the function is
    # idempotent: re-running the notebook cell on an already converted column
    # keeps the labels instead of turning every row into 'NotPUD'.
    pud_values = (120, 150, 160, 180, 'PUD')
    return sr.apply(lambda value: 'PUD' if value in pud_values else 'NotPUD')

In [9]:
# Recode MSSubClass in place on all three data sets.
for _frame in (train, test, Ktest):
    _frame['MSSubClass'] = pp_MSSubClass(_frame['MSSubClass'])

### Alley

NA means no alley access. So impute with 'NoAlley'.

In [10]:
def pp_Alley(sr):
    """Return a copy of ``sr`` with missing entries replaced by 'NoAlley'.

    Parameters
    ----------
    sr : pandas.Series
        Alley column; missing values are filled, other values pass through.

    Returns
    -------
    pandas.Series
        New Series; the input is left unchanged.
    """
    placeholder = 'NoAlley'
    return sr.fillna(value=placeholder)

In [11]:
# Fill missing Alley values in place on all three data sets.
for _frame in (train, test, Ktest):
    _frame['Alley'] = pp_Alley(_frame['Alley'])

### Impute

In [12]:
# Mode imputer for the nominal columns, fitted on the training split only.
cat_nom_imp = SimpleImputer(strategy='most_frequent').fit(train[cat_nom_cols])

In [13]:
# Apply the train-fitted imputer to each data set and re-wrap the resulting
# array as a DataFrame carrying the original nominal column names.
cat_nom_imp_train, cat_nom_imp_test, cat_nom_imp_Ktest = (
    pd.DataFrame(cat_nom_imp.transform(frame[cat_nom_cols]), columns=cat_nom_cols)
    for frame in (train, test, Ktest)
)

### One Hot Encoding

In [14]:
# Reference level per feature: the most frequent category in the training data.
# That level gets no dummy column, so it acts as the baseline.
drop_col = train[cat_nom_cols].apply(lambda col: col.value_counts().index[0])

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword. Try the current name first and fall back for older installations,
# where the unknown keyword raises TypeError.
try:
    cat_nom_ohe = OneHotEncoder(categories='auto', drop=drop_col, sparse_output=False)
except TypeError:
    cat_nom_ohe = OneHotEncoder(categories='auto', drop=drop_col, sparse=False)
cat_nom_ohe = cat_nom_ohe.fit(cat_nom_imp_train)

# `get_feature_names` was removed in scikit-learn 1.2; `get_feature_names_out`
# is its replacement. Use whichever this installation provides.
if hasattr(cat_nom_ohe, 'get_feature_names_out'):
    new_cols = cat_nom_ohe.get_feature_names_out(cat_nom_cols)
else:
    new_cols = cat_nom_ohe.get_feature_names(cat_nom_cols)

In [15]:
# One-hot encode each imputed frame with the train-fitted encoder and label the
# dummy columns with the encoder's feature names.
cat_nom_clean_train, cat_nom_clean_test, cat_nom_clean_Ktest = (
    pd.DataFrame(cat_nom_ohe.transform(imputed), columns=new_cols)
    for imputed in (cat_nom_imp_train, cat_nom_imp_test, cat_nom_imp_Ktest)
)

### Condition1 & Condition2

Combine Condition1 and Condition2 into a single set of indicator columns: a condition is flagged if it appears in either column.

In [16]:
# Split the dummy column names by which original feature they came from.
Condition1_cols = list(filter(lambda name: 'Condition1' in name, new_cols))
Condition2_cols = list(filter(lambda name: 'Condition2' in name, new_cols))
for _names in (Condition1_cols, Condition2_cols):
    print(_names)

['Condition1_Artery', 'Condition1_Feedr', 'Condition1_PosA', 'Condition1_PosN', 'Condition1_RRAe', 'Condition1_RRAn', 'Condition1_RRNe', 'Condition1_RRNn']
['Condition2_Artery', 'Condition2_Feedr', 'Condition2_PosA', 'Condition2_PosN', 'Condition2_RRAe', 'Condition2_RRAn', 'Condition2_RRNn']


One feature, 'Condition1_RRNe', has no counterpart in Condition2_cols. Remove it from Condition1_cols so that the two lists pair up one-to-one; the column itself stays in the data.

In [17]:
# Drop 'Condition1_RRNe' from the pairing list. A filter is used instead of
# list.remove() so that re-running this cell is a no-op rather than a ValueError.
Condition1_cols = [col for col in Condition1_cols if col != 'Condition1_RRNe']

In [18]:
def combine_C1_C2(df):
    """Fold the Condition2 dummy columns into the matching Condition1 columns.

    For every column named 'Condition2_<level>', rows where it equals 1 are also
    set to 1 in 'Condition1_<level>' (when that column exists). All
    'Condition2_*' columns are then dropped.

    Columns are paired by their <level> suffix, read from ``df`` itself, rather
    than by position in externally maintained lists. This keeps the pairing
    correct when the two groups are ordered differently or one has extra levels,
    and makes a second call on an already combined frame a no-op.

    Parameters
    ----------
    df : pandas.DataFrame
        One-hot encoded frame containing 'Condition1_*' / 'Condition2_*' columns.

    Returns
    -------
    pandas.DataFrame
        A modified copy; ``df`` is not changed. A Condition2 level without a
        Condition1 counterpart is dropped without being merged.
    """
    c1_prefix, c2_prefix = 'Condition1_', 'Condition2_'
    df_temp = df.copy()
    c2_cols = [col for col in df_temp.columns if str(col).startswith(c2_prefix)]
    for c2_col in c2_cols:
        c1_col = c1_prefix + str(c2_col)[len(c2_prefix):]
        if c1_col in df_temp.columns:
            df_temp.loc[df_temp[c2_col] == 1, c1_col] = 1
    return df_temp.drop(columns=c2_cols)

In [19]:
# Merge the Condition2 dummies into Condition1 for every encoded frame.
cat_nom_clean_train, cat_nom_clean_test, cat_nom_clean_Ktest = map(
    combine_C1_C2,
    (cat_nom_clean_train, cat_nom_clean_test, cat_nom_clean_Ktest),
)

# Ordinal Categorical Features

In [20]:
# Columns treated as ordinal; the two Age* columns are derived in the cells below.
cat_ord_cols = [
    'OverallQual',
    'OverallCond',
    'AgeBuilt',
    'AgeRemodAdd',
]

### YearBuilt

Change this column to building age at sale.

In [21]:
# Building age at sale = year sold minus year built, added to each data set.
for _frame in (train, test, Ktest):
    _frame['AgeBuilt'] = _frame['YrSold'] - _frame['YearBuilt']

### YearRemodAdd

Change this column to remodeling age at sale.

In [22]:
# Remodel age at sale = year sold minus remodel year, added to each data set.
for _frame in (train, test, Ktest):
    _frame['AgeRemodAdd'] = _frame['YrSold'] - _frame['YearRemodAdd']

### Impute

In [23]:
# Median imputer for the ordinal columns, fitted on the training split only.
cat_ord_imp = SimpleImputer(strategy='median').fit(train[cat_ord_cols])

In [24]:
# Apply the train-fitted median imputer to each data set, keeping column names.
cat_ord_imp_train, cat_ord_imp_test, cat_ord_imp_Ktest = (
    pd.DataFrame(cat_ord_imp.transform(frame[cat_ord_cols]), columns=cat_ord_cols)
    for frame in (train, test, Ktest)
)

### MinMax Scale

In [25]:
# Min-max scaler for the ordinal columns, fitted on the imputed training frame.
cat_ord_scl = MinMaxScaler().fit(cat_ord_imp_train)

In [26]:
# Scale each imputed ordinal frame with the train-fitted scaler.
cat_ord_clean_train, cat_ord_clean_test, cat_ord_clean_Ktest = (
    pd.DataFrame(cat_ord_scl.transform(imputed), columns=cat_ord_cols)
    for imputed in (cat_ord_imp_train, cat_ord_imp_test, cat_ord_imp_Ktest)
)

# Numerical Features

In [27]:
# Continuous features handled in this section.
num_cols = [
    'LotFrontage',
    'LotArea',
]

For both columns, log transform first, then scale, then impute using knn on the whole dataset.

### LotArea

In [28]:
# Replace LotArea with its natural log on every data set.
# NOTE: the column is overwritten, so running this cell twice applies the log twice.
for _frame in (train, test, Ktest):
    _frame['LotArea'] = np.log(_frame['LotArea'])

### LotFrontage

In [29]:
# Replace LotFrontage with its natural log on every data set.
# NOTE: the column is overwritten, so running this cell twice applies the log twice.
for _frame in (train, test, Ktest):
    _frame['LotFrontage'] = np.log(_frame['LotFrontage'])

### Scale

In [30]:
# Standard scaler for the numerical columns, fitted on the training split only.
num_scl = StandardScaler().fit(train[num_cols])

In [31]:
# Standardise the numerical columns of each data set with the train-fitted scaler.
num_scl_train, num_scl_test, num_scl_Ktest = (
    pd.DataFrame(num_scl.transform(frame[num_cols]), columns=num_cols)
    for frame in (train, test, Ktest)
)

# Merge Together

In [32]:
# Place the nominal, ordinal and numerical blocks side by side for each data set.
clean_train, clean_test, clean_Ktest = (
    pd.concat(list(parts), axis=1)
    for parts in (
        (cat_nom_clean_train, cat_ord_clean_train, num_scl_train),
        (cat_nom_clean_test, cat_ord_clean_test, num_scl_test),
        (cat_nom_clean_Ktest, cat_ord_clean_Ktest, num_scl_Ktest),
    )
)

### Impute LotFrontage Using KNN

In [33]:
from math import sqrt

# k is the rounded square root of the number of training rows.
n_neighbors = round(sqrt(len(train)))
# Fit the KNN imputer on the merged training frame only.
num_imp = KNNImputer(n_neighbors=n_neighbors).fit(clean_train)

In [34]:
# transform() returns a bare array, so keep the column labels to rebuild the frames.
cols = clean_train.columns
clean_train, clean_test, clean_Ktest = (
    pd.DataFrame(num_imp.transform(frame), columns=cols)
    for frame in (clean_train, clean_test, clean_Ktest)
)

It looks like the whole data set is imputed with KNN. But since LotArea doesn't have missing values, and all the other features have been imputed before, this step actually only imputes LotFrontage.

### Output

In [35]:
# Prepend the Id column from each source frame to its cleaned counterpart.
clean_train, clean_test, clean_Ktest = (
    pd.concat([raw['Id'], cleaned], axis=1)
    for raw, cleaned in (
        (train, clean_train),
        (test, clean_test),
        (Ktest, clean_Ktest),
    )
)

In [36]:
# Write each cleaned frame to disk (default to_csv settings, index included).
for _path, _frame in (
    ('../data/clean_train_Ting', clean_train),
    ('../data/clean_test_Ting', clean_test),
    ('../data/clean_Ktest_Ting', clean_Ktest),
):
    _frame.to_csv(_path)