In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
plt.style.use('ggplot')
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [3]:
# Column windows applied after loading. The two modelling splits keep the 22
# columns that start 23 from the end (the very last column is excluded);
# the Kaggle file keeps its final 22 columns.
# NOTE(review): presumably the excluded last column of the modelling files is
# the target - confirm against the CSV headers.
_split_window = slice(-23, -1)
_kaggle_window = slice(-22, None)

model_train = (
    pd.read_csv('Model Train.csv', index_col=0)
    .set_index(['Id'])
    .iloc[:, _split_window]
)
model_test = (
    pd.read_csv('Model Test.csv', index_col=0)
    .set_index(['Id'])
    .iloc[:, _split_window]
)
# The Kaggle file is indexed directly by its first column (no separate
# set_index call is needed here).
kaggle_test = pd.read_csv('Kaggle_test.csv', index_col=0).iloc[:, _kaggle_window]

### Categorical vs Continuous (Numerical)

In [4]:
# Columns treated as categorical (these are one-hot encoded further down).
cat = [
    'GarageType',
    'GarageFinish',
    'PavedDrive',
    'Fence',
    'MiscFeature',
    'MoSold',
    'SaleType',
    'SaleCondition',
]

# Columns treated as numeric (these are standard-scaled further down).
# GarageQual is an ordinal grade but is kept in this group.
numeric = [
    'GarageAge',
    'GarageCars',
    'WoodDeckSF',
    'TotalPorch',
    'MiscVal',
    'YrSold',
    'GarageQual',
]
# ordinal = 'GarageQual'




### Imputations and Substitutes

#### GarageType

In [5]:
# Replace missing GarageType entries with the explicit label 'NoGarage'
# in every frame, so the gap becomes its own category level.
for frame in (model_train, model_test, kaggle_test):
    frame['GarageType'] = frame['GarageType'].fillna('NoGarage')

#### GarageFinish

In [6]:
# Replace missing GarageFinish entries with the explicit label 'NoFinish'
# in every frame.
for frame in (model_train, model_test, kaggle_test):
    frame['GarageFinish'] = frame['GarageFinish'].fillna('NoFinish')

#### Fence

In [7]:
# Replace missing Fence entries with the explicit label 'NoFence'
# in every frame.
for frame in (model_train, model_test, kaggle_test):
    frame['Fence'] = frame['Fence'].fillna('NoFence')

#### MiscFeature

In [8]:
# Replace missing MiscFeature entries with the explicit label 'NoFeature'
# in every frame.
for frame in (model_train, model_test, kaggle_test):
    frame['MiscFeature'] = frame['MiscFeature'].fillna('NoFeature')

#### GarageYrBlt

In [9]:
# Mean-impute GarageYrBlt. The mean is learned from model_train only and the
# fitted imputer is then applied unchanged to all three frames.
gar_yr_imp = SimpleImputer(strategy='mean').fit(model_train[['GarageYrBlt']])

for frame in (model_train, model_test, kaggle_test):
    # transform() returns a bare array; wrap it so the values are aligned
    # back onto the frame by its own index.
    frame[['GarageYrBlt']] = pd.DataFrame(
        gar_yr_imp.transform(frame[['GarageYrBlt']]),
        columns=['GarageYrBlt'],
        index=frame.index,
    )


#### Garage Quality

In [10]:
# Fill missing GarageQual with the most frequent grade. The grade is learned
# from model_train only and then applied to all three frames.
gar_qual_imp = SimpleImputer(strategy='most_frequent').fit(model_train[['GarageQual']])

for frame in (model_train, model_test, kaggle_test):
    # transform() returns a bare array; wrap it so the values are aligned
    # back onto the frame by its own index.
    frame[['GarageQual']] = pd.DataFrame(
        gar_qual_imp.transform(frame[['GarageQual']]),
        columns=['GarageQual'],
        index=frame.index,
    )


In [11]:
# Encode the GarageQual grades as integers (Po=1, Fa=2, TA=3, Gd=4, Ex=5),
# using one shared lookup so every frame is encoded the same way.
garage_qual_codes = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}

for frame in (model_train, model_test, kaggle_test):
    frame['GarageQual'] = frame['GarageQual'].replace(garage_qual_codes)


### Create New Columns

#### GarageAge

In [12]:
# GarageAge = |YrSold - GarageYrBlt|, computed identically for every frame.
#
# Previously only kaggle_test took the absolute value, so the feature was
# defined differently for the modelling splits. A garage year later than the
# sale year would give a negative age there, and the later np.log(age + 1)
# step turns any age <= -1 into NaN / -inf. Applying np.abs everywhere keeps
# one definition and leaves rows with a non-negative difference unchanged.
for frame in (model_train, model_test, kaggle_test):
    frame['GarageAge'] = np.abs(frame['YrSold'] - frame['GarageYrBlt'])

#### TotalPorch

In [13]:
# TotalPorch is the row-wise sum of the four porch area columns.
# Plain `+` is used so a NaN in any part yields a NaN total.
for frame in (model_train, model_test, kaggle_test):
    frame['TotalPorch'] = (
        frame['OpenPorchSF']
        + frame['EnclosedPorch']
        + frame['3SsnPorch']
        + frame['ScreenPorch']
    )


In [14]:
# The individual porch columns are dropped now that TotalPorch holds their sum.
_porch_parts = ['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']

model_train = model_train.drop(columns=_porch_parts)
model_test = model_test.drop(columns=_porch_parts)
kaggle_test = kaggle_test.drop(columns=_porch_parts)


### Variable Transformations (log,log^2, sqrt)

In [15]:
# Replace GarageAge with log(GarageAge + 1) in every frame; the +1 keeps an
# age of zero finite.
for frame in (model_train, model_test, kaggle_test):
    shifted_age = frame['GarageAge'] + 1
    frame['GarageAge'] = np.log(shifted_age)

In [16]:
# model_train[['EnclosedPorch','MiscVal']] = np.sqrt(model_train[['EnclosedPorch','MiscVal']])
# model_test[['EnclosedPorch','MiscVal']] = np.sqrt(model_test[['EnclosedPorch','MiscVal']])
# kaggle_test[['EnclosedPorch','MiscVal']] = np.sqrt(kaggle_test[['EnclosedPorch','MiscVal']])

In [17]:
# Replace MiscVal with its square root in every frame.
for frame in (model_train, model_test, kaggle_test):
    frame['MiscVal'] = np.sqrt(frame['MiscVal'])

In [18]:
# Replace TotalPorch with log(TotalPorch + 1) in every frame; the +1 keeps a
# total of zero finite.
for frame in (model_train, model_test, kaggle_test):
    shifted_porch = frame['TotalPorch'] + 1
    frame['TotalPorch'] = np.log(shifted_porch)

In [19]:
# Replace WoodDeckSF with (log(WoodDeckSF + 1)) ** 2 in every frame.
for frame in (model_train, model_test, kaggle_test):
    log_deck = np.log(frame['WoodDeckSF'] + 1)
    frame['WoodDeckSF'] = log_deck ** 2

### Handling NAs in Kaggle Dataset

In [20]:
# Fill whatever is still missing in kaggle_test.
#
# The fill values are taken from model_train rather than from kaggle_test
# itself, matching the imputers above that are fitted on model_train and
# then applied to the other frames. model_train has been through the same
# transformations at this point, so its statistics are on the same scale.
#   * categorical columns -> most frequent training value
#   * numeric columns     -> training mean
# Columns in neither list are left untouched.
for col in kaggle_test.columns:
    if col in cat:
        level_counts = model_train[col].value_counts()
        if level_counts.empty:
            # No observed value to borrow (column entirely NaN in training);
            # skip instead of failing on index[0].
            continue
        fill_value = level_counts.index[0]
    elif col in numeric:
        fill_value = model_train[col].mean()
    else:
        continue
    kaggle_test[col] = kaggle_test[col].fillna(fill_value)

### One Hot Encoding

In [21]:
# Fit the one-hot encoder on both modelling splits stacked together, so it
# knows every category level present in either of them.
full_train = pd.concat([model_train, model_test])

# For each categorical column, the most frequent level is the one that gets
# dropped from the encoding (it becomes the all-zeros reference level).
drop_col = []
for col in cat:
    level_counts = full_train[col].value_counts()
    drop_col.append(level_counts.index[0])

# NOTE(review): `sparse=` is the older spelling of this argument; newer
# scikit-learn releases call it `sparse_output` - confirm the pinned version.
cat_ohe = OneHotEncoder(categories='auto', drop=drop_col, sparse=False).fit(full_train[cat])


In [22]:
def _with_dummies(frame):
    """Return `frame` with the fitted one-hot columns for `cat` appended.

    The encoder output is a bare array, so it is wrapped in a DataFrame that
    carries the encoder's feature names and the frame's own index before
    being concatenated column-wise. The original `cat` columns are kept.
    """
    dummies = pd.DataFrame(
        cat_ohe.transform(frame[cat]),
        columns=cat_ohe.get_feature_names(cat),
        index=frame.index,
    )
    return pd.concat([frame, dummies], axis=1)


model_train = _with_dummies(model_train)
model_test = _with_dummies(model_test)
kaggle_test = _with_dummies(kaggle_test)



In [23]:
# Remove the raw categorical columns from every frame; their one-hot
# encoded counterparts remain.
model_train = model_train.drop(columns=cat)
model_test = model_test.drop(columns=cat)
kaggle_test = kaggle_test.drop(columns=cat)

### Scaling (Standardization)

#### Ordinal (min-max scaler)

In [24]:
# ord_scl = MinMaxScaler()
# ord_scl = ord_scl.fit(model_train[['GarageQual']])

# model_train[['GarageQual']] = pd.DataFrame(ord_scl.transform(model_train[['GarageQual']]), columns=['GarageQual'], index = model_train[['GarageQual']].index)
# model_test[['GarageQual']] = pd.DataFrame(ord_scl.transform(model_test[['GarageQual']]), columns=['GarageQual'], index = model_test[['GarageQual']].index)
# kaggle_test[['GarageQual']] = pd.DataFrame(ord_scl.transform(kaggle_test[['GarageQual']]), columns=['GarageQual'], index = kaggle_test[['GarageQual']].index)


#### Numeric (standard scaler)

In [25]:
# Standardise the `numeric` columns. The scaler is fitted on model_train only
# and the same fitted scaler is then applied to all three frames.
numeric_scl = StandardScaler().fit(model_train[numeric])

for frame in (model_train, model_test, kaggle_test):
    # transform() returns a bare array; wrap it so the scaled values are
    # written back under the same column names and row index.
    frame[numeric] = pd.DataFrame(
        numeric_scl.transform(frame[numeric]),
        columns=numeric,
        index=frame.index,
    )



In [27]:
# Show the list of columns that were passed to the StandardScaler.
numeric

['GarageAge',
 'GarageCars',
 'WoodDeckSF',
 'TotalPorch',
 'MiscVal',
 'YrSold',
 'GarageQual']

In [26]:
# Show the fitted scaler's per-column scale values (one entry per column in
# `numeric`, in the same order).
numeric_scl.scale_

array([ 1.24762129,  0.74586759, 13.96993207,  2.2122882 ,  6.63836321,
        1.33154647,  0.24119991])

### Drop Unnecessary Columns

In [710]:
# Columns that are not kept in the final feature set; the same list is
# removed from every frame.
_unused_cols = ['PoolArea', 'PoolQC', 'GarageArea', 'GarageCond', 'GarageYrBlt']

model_train = model_train.drop(columns=_unused_cols)
model_test = model_test.drop(columns=_unused_cols)
kaggle_test = kaggle_test.drop(columns=_unused_cols)

In [712]:
# Preview the first rows of the processed model_train frame.
model_train.head()

Unnamed: 0_level_0,GarageCars,GarageQual,WoodDeckSF,MiscVal,YrSold,GarageAge,TotalPorch,GarageType_2Types,GarageType_Basment,GarageType_BuiltIn,...,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Partial
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
134,0.301892,0.124236,1.069027,-0.156629,0.896964,-0.550922,0.063557,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
291,0.301892,4.270175,-0.91351,-0.156629,-1.356055,-2.312053,0.376874,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
25,-1.038829,0.124236,1.671029,-0.156629,1.647971,0.702644,0.629832,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1094,0.301892,0.124236,-0.91351,-0.156629,-1.356055,0.414093,0.477739,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1289,0.301892,0.124236,1.144013,-0.156629,0.896964,-0.466473,0.441558,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [713]:
# Preview the first rows of the processed model_test frame.
model_test.head()

Unnamed: 0_level_0,GarageCars,GarageQual,WoodDeckSF,MiscVal,YrSold,GarageAge,TotalPorch,GarageType_2Types,GarageType_Basment,GarageType_BuiltIn,...,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Partial
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1437,0.301892,0.124236,-0.91351,-0.156629,-0.605049,0.514414,-1.40917,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58,0.301892,0.124236,-0.91351,-0.156629,-1.356055,-1.431488,0.517649,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
781,0.301892,0.124236,1.17241,-0.156629,-0.605049,-0.256181,-0.011955,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
383,0.301892,0.124236,0.859432,-0.156629,-0.605049,-1.756478,0.393933,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1171,-1.038829,0.124236,1.11101,-0.156629,0.145958,0.465822,-1.40917,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [715]:
# Preview the first rows of the processed kaggle_test frame.
kaggle_test.head()

Unnamed: 0_level_0,GarageCars,GarageQual,WoodDeckSF,MiscVal,YrSold,GarageAge,TotalPorch,GarageType_2Types,GarageType_Basment,GarageType_BuiltIn,...,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Partial
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,-1.038829,0.124236,0.839557,-0.156629,1.647971,0.823532,0.758626,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1462,-1.038829,0.124236,1.643179,16.685384,1.647971,0.870236,0.22304,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1463,0.301892,0.124236,1.144013,-0.156629,1.647971,-0.196782,0.197921,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1464,0.301892,0.124236,1.568885,-0.156629,1.647971,-0.256181,0.22304,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1465,0.301892,0.124236,-0.91351,-0.156629,1.647971,0.047989,1.04302,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [720]:
# model_train.to_csv('model_train_Alex2.csv') 
# model_test.to_csv('model_test_Alex2.csv')
# kaggle_test.to_csv('kaggle_test_Alex2.csv')