In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import RidgeCV, LassoCV, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn import preprocessing

from scipy.stats import skew
%matplotlib inline

#import train and test dataset
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
df_all = pd.concat([train.drop(columns=['SalePrice', 'Id']), test.drop(columns='Id')])

print(f'Concated shape: {df_all.shape}')

df_all

Concated shape: (2919, 79)


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2006,WD,Normal
1455,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Abnorml
1456,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,9,2006,WD,Abnorml
1457,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [4]:
df_all[['BsmtFinSF2', 'BsmtFinType2']].loc[df_all['BsmtFinSF2'] == 0]

Unnamed: 0,BsmtFinSF2,BsmtFinType2
0,0.0,Unf
1,0.0,Unf
2,0.0,Unf
3,0.0,Unf
4,0.0,Unf
...,...,...
1454,0.0,Unf
1455,0.0,Unf
1456,0.0,Unf
1457,0.0,Unf


In [29]:
df_all[['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF']].loc[df_all['LowQualFinSF'] != 0]

Unnamed: 0,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF
51,0.0,0.0,816.0,816.0,816,0,360
88,0.0,0.0,1013.0,1013.0,1013,0,513
125,490.0,0.0,30.0,520.0,520,0,234
170,360.0,0.0,360.0,720.0,854,0,528
185,0.0,0.0,1107.0,1107.0,1518,1518,572
187,0.0,0.0,660.0,660.0,808,704,144
197,1036.0,184.0,140.0,1360.0,1360,1360,392
198,0.0,0.0,755.0,755.0,929,929,371
263,234.0,692.0,0.0,926.0,926,0,390
267,378.0,0.0,342.0,720.0,1052,720,420


In [13]:
#Total Finished Square Feet Above Grade
df_all['FloorFinSF'] = df_all['1stFlrSF'] + df_all['2ndFlrSF']

In [30]:
#Ratio of Low Quality Squared Feet to Total Finished Square Feet Above Grade
df_all['R_LowQual_FloorFinSF'] = df_all['LowQualFinSF'] / df_all['FloorFinSF']
df_all.R_LowQual_FloorFinSF.iloc[51]

0.4411764705882353

In [31]:
df_all['HasGarage'] = df_all['GarageArea'].apply(lambda x: 1 if x > 0 else 0)


In [32]:
df_all['HasGarage']

0       1
1       1
2       1
3       1
4       1
       ..
1454    0
1455    1
1456    1
1457    0
1458    1
Name: HasGarage, Length: 2919, dtype: int64

In [36]:
df_all['PoolArea'].loc[df_all['PoolArea']!=0]

197     512
810     648
1170    576
1182    555
1298    480
1386    519
1423    738
514     144
960     368
1043    444
1113    228
1139    561
1250    800
Name: PoolArea, dtype: int64

In [40]:
(df_all['HasGarage']>0).sum()

2761

In [42]:
df_all[['GarageYrBlt', 'GarageCars', 'GarageArea']].loc[df_all['HasGarage']>0]

Unnamed: 0,GarageYrBlt,GarageCars,GarageArea
0,2003.0,2.0,548.0
1,1976.0,2.0,460.0
2,2001.0,2.0,608.0
3,1998.0,3.0,642.0
4,2000.0,3.0,836.0
...,...,...,...
1451,1969.0,1.0,336.0
1452,1970.0,1.0,286.0
1455,1970.0,1.0,286.0
1456,1960.0,2.0,576.0


In [43]:
df_all.MoSold


0        2
1        5
2        9
3        2
4       12
        ..
1454     6
1455     4
1456     9
1457     7
1458    11
Name: MoSold, Length: 2919, dtype: int64

In [45]:
df_all.MoSold

0       0
1       2
2       6
3       0
4       7
       ..
1454    3
1455    1
1456    6
1457    4
1458    7
Name: MoSold, Length: 2919, dtype: int64

In [46]:
train.shape

(1460, 81)

In [47]:
import scipy.stats as st
y = train['SalePrice']
plt.figure(1); plt.title('Johnson SU')
sns.distplot(y, kde=False, fit=st.johnsonsu)
plt.figure(2); plt.title('Normal')
sns.distplot(y, kde=False, fit=st.norm)
plt.figure(3); plt.title('Log Normal')
sns.distplot(y, kde=False, fit=st.lognorm)

NameError: name 'plt' is not defined