In [1]:
#Importing libraries
import pandas as pd
import numpy as np

In [2]:
missing_values = ["n/a", "na", "--", ""]

df_train = pd.read_csv('train.csv', na_values = missing_values)
df_train['type'] = 'analysis'
df_test = pd.read_csv('test.csv', na_values = missing_values)
df_test['SalePrice'] = ""
df_test['type'] = 'scoring'

df = pd.concat([df_train, df_test])

In [3]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

df_cont = df_train.select_dtypes(include=numerics)
df_cat = df_train.select_dtypes(include = 'object')

In [4]:
df_cat.shape

(1460, 44)

In [5]:
df_cat.isnull().sum()

MSZoning            0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          8
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           37
BsmtCond           37
BsmtExposure       38
BsmtFinType1       37
BsmtFinType2       38
Heating             0
HeatingQC           0
CentralAir          0
Electrical          1
KitchenQual         0
Functional          0
FireplaceQu       690
GarageType         81
GarageFinish       81
GarageQual         81
GarageCond         81
PavedDrive          0
PoolQC           1453
Fence            1179
MiscFeature      1406
SaleType            0
SaleCondition       0
type                0
dtype: int64

In [6]:
# remove columns with too many missing values
df_cat.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature'], axis=1, inplace=True)

In [7]:
# no basement maybe?
df_cat['BsmtQual'] = df_cat['BsmtQual'].fillna('no_bsmt')
df_cat['BsmtCond'] = df_cat['BsmtCond'].fillna('no_bsmt')
df_cat['BsmtExposure'] = df_cat['BsmtExposure'].fillna('no_bsmt')
df_cat['BsmtFinType1'] = df_cat['BsmtFinType1'].fillna('no_bsmt')
df_cat['BsmtFinType2'] = df_cat['BsmtFinType2'].fillna('no_bsmt')

In [8]:
# no garage maybe?
df_cat['GarageType'] = df_cat['GarageType'].fillna('no_garage')
df_cat['GarageFinish'] = df_cat['GarageFinish'].fillna('no_garage')
df_cat['GarageQual'] = df_cat['GarageQual'].fillna('no_garage')
df_cat['GarageCond'] = df_cat['GarageCond'].fillna('no_garage')

In [9]:
#no fireplace maybe?
df_cat['FireplaceQu'] = df_cat['FireplaceQu'].fillna('no_fireplace')

In [10]:
df_cat['MasVnrType'].value_counts()

None       864
BrkFace    445
Stone      128
BrkCmn      15
Name: MasVnrType, dtype: int64

In [11]:
#impute with most common value
#df_cat['MasVnrType'] = df_cat['MasVnrType'].fillna('None')

In [12]:
#remove row with missing value
df_cat = df_cat.dropna()

In [13]:
df_cat.isnull().sum()

MSZoning         0
Street           0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinType2     0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
KitchenQual      0
Functional       0
FireplaceQu      0
GarageType       0
GarageFinish     0
GarageQual       0
GarageCond       0
PavedDrive       0
SaleType         0
SaleCondition    0
type             0
dtype: int64

In [14]:
df_cont.isnull().sum()

Id                 0
MSSubClass         0
LotFrontage      259
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea         8
BsmtFinSF1         0
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       0
BsmtHalfBath       0
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       81
GarageCars         0
GarageArea         0
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
SalePrice          0
dtype: int64

In [15]:
#impute a number for no garage
df_cont['GarageYrBlt'] = df_cont['GarageYrBlt'].fillna(9999)

In [16]:
df_cont = df_cont.fillna(df_cont.median())

In [17]:
df_cont.isnull().sum()

Id               0
MSSubClass       0
LotFrontage      0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
MasVnrArea       0
BsmtFinSF1       0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageYrBlt      0
GarageCars       0
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
SalePrice        0
dtype: int64

In [18]:
df_cont.drop(['Id'], axis=1, inplace=True)

In [19]:
df_cont

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,61,0,0,0,0,0,2,2008,208500
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,298,0,0,0,0,0,0,5,2007,181500
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,42,0,0,0,0,0,9,2008,223500
3,70,60.0,9550,7,5,1915,1970,0.0,216,0,...,0,35,272,0,0,0,0,2,2006,140000
4,60,84.0,14260,8,5,2000,2000,350.0,655,0,...,192,84,0,0,0,0,0,12,2008,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,62.0,7917,6,5,1999,2000,0.0,0,0,...,0,40,0,0,0,0,0,8,2007,175000
1456,20,85.0,13175,6,6,1978,1988,119.0,790,163,...,349,0,0,0,0,0,0,2,2010,210000
1457,70,66.0,9042,7,9,1941,2006,0.0,275,0,...,0,60,0,0,0,0,2500,5,2010,266500
1458,20,68.0,9717,5,6,1950,1996,0.0,49,1029,...,366,0,112,0,0,0,0,4,2010,142125


In [20]:
from numpy.linalg import norm

def center_normalize_df (dataframe):
    
    for column in dataframe:
        dataframe[column] = dataframe[column] - np.mean(dataframe[column])
        dataframe[column] = dataframe[column] / norm(dataframe[column])
    
    return dataframe

In [21]:
 #lasso with lasso cv

X_lasso = df_cont.loc[:, df_cont.columns != 'SalePrice']
y = df_cont['SalePrice']

from sklearn.model_selection import KFold
from sklearn.linear_model import LassoCV

cv = KFold(n_splits=10, shuffle=True, random_state=10)
model = LassoCV(cv=cv)
lassocv = model.fit(X_lasso, y)

B_lasso = lassocv.coef_
B_lasso = np.insert(B_lasso, 0, lassocv.intercept_, axis=0)

print('The Coef are')
print(B_lasso)
#print('\nlambda best is = %f' % lassocv.alpha_)

The Coef are
[-8.32805588e+05 -0.00000000e+00 -0.00000000e+00  2.88545204e-01
  0.00000000e+00  0.00000000e+00  3.12796739e+02  1.07480214e+02
  3.69335263e+01  1.21767180e+01 -0.00000000e+00  0.00000000e+00
  3.33140494e+01  0.00000000e+00  2.32085069e+00 -0.00000000e+00
  6.38936371e+01  0.00000000e+00 -0.00000000e+00  0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -0.00000000e+00  0.00000000e+00
  0.00000000e+00  7.28449327e-01  0.00000000e+00  7.21718300e+01
  2.95471235e+01  0.00000000e+00 -0.00000000e+00  0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -1.20265783e-01  0.00000000e+00
 -0.00000000e+00]


In [22]:
df_cont2 =  df_cont.loc[:, df_cont.columns != 'SalePrice'] * lassocv.coef_
df_cont2

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,-0.0,-0.0,2438.206975,0.0,0.0,626531.868167,215282.869374,7238.971158,8596.762919,-0.0,...,39550.162863,0.000000,0.0,-0.0,0.0,0.0,-0.0,-0.000000,0.0,-0.0
1,-0.0,-0.0,2770.033960,0.0,0.0,618086.356214,212380.903586,0.000000,11908.830220,-0.0,...,33199.041820,8805.042793,0.0,-0.0,0.0,0.0,-0.0,-0.000000,0.0,-0.0
2,-0.0,-0.0,3246.133547,0.0,0.0,625906.274689,215175.389159,5983.231263,5917.884956,-0.0,...,43880.472666,0.000000,0.0,-0.0,0.0,0.0,-0.0,-0.000000,0.0,-0.0
3,-0.0,-0.0,2755.606700,0.0,0.0,599005.755137,211736.022299,0.000000,2630.171091,-0.0,...,46334.314887,0.000000,0.0,-0.0,0.0,0.0,-0.0,-0.000000,0.0,-0.0
4,-0.0,-0.0,4114.654611,0.0,0.0,625593.477950,214960.428730,12926.734210,7975.750301,-0.0,...,60335.649916,5673.047706,0.0,-0.0,0.0,0.0,-0.0,-0.000000,0.0,-0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,-0.0,-0.0,2284.412381,0.0,0.0,625280.681211,214960.428730,0.000000,0.000000,-0.0,...,33199.041820,0.000000,0.0,-0.0,0.0,0.0,-0.0,-0.000000,0.0,-0.0
1456,-0.0,-0.0,3801.583065,0.0,0.0,618711.949692,213670.666158,4395.089632,9619.607233,-0.0,...,36085.915021,10311.946090,0.0,-0.0,0.0,0.0,-0.0,-0.000000,0.0,-0.0
1457,-0.0,-0.0,2609.025736,0.0,0.0,607138.470350,215605.310017,0.000000,3348.597454,-0.0,...,18187.301171,0.000000,0.0,-0.0,0.0,0.0,-0.0,-300.664458,0.0,-0.0
1458,-0.0,-0.0,2803.793749,0.0,0.0,609953.641001,214530.507873,0.000000,596.659183,-0.0,...,17321.239210,10814.247189,0.0,-0.0,0.0,0.0,-0.0,-0.000000,0.0,-0.0


In [23]:
df_cont2 = df_cont2.loc[:, (df_cont2 != 0).any(axis=0)]
df_cont2

Unnamed: 0,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,TotalBsmtSF,2ndFlrSF,GrLivArea,GarageYrBlt,GarageArea,WoodDeckSF,MiscVal
0,2438.206975,626531.868167,215282.869374,7238.971158,8596.762919,28516.826284,1982.006488,109258.119444,1459.084002,39550.162863,0.000000,-0.000000
1,2770.033960,618086.356214,212380.903586,0.000000,11908.830220,42042.330339,0.000000,80633.770022,1439.415870,33199.041820,8805.042793,-0.000000
2,3246.133547,625906.274689,215175.389159,5983.231263,5917.884956,30648.925445,2009.856697,114114.035863,1457.627104,43880.472666,0.000000,-0.000000
3,2755.606700,599005.755137,211736.022299,0.000000,2630.171091,25185.421344,1754.563121,109705.374903,1455.441756,46334.314887,0.000000,-0.000000
4,4114.654611,625593.477950,214960.428730,12926.734210,7975.750301,38144.586559,2443.855775,140438.214349,1456.898654,60335.649916,5673.047706,-0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
1455,2284.412381,625280.681211,214960.428730,0.000000,0.000000,31748.289075,1610.670378,105232.820306,1456.170205,33199.041820,0.000000,-0.000000
1456,3801.583065,618711.949692,213670.666158,4395.089632,9619.607233,51370.264170,0.000000,132451.509712,1440.872769,36085.915021,10311.946090,-0.000000
1457,2609.025736,607138.470350,215605.310017,0.000000,3348.597454,38377.784905,2673.619994,149511.110818,1413.920144,18187.301171,0.000000,-300.664458
1458,2803.793749,609953.641001,214530.507873,0.000000,596.659183,35912.545250,0.000000,68877.340796,1420.476188,17321.239210,10814.247189,-0.000000


In [24]:
def outliers (dataframe):
    
    for column in dataframe:
        Q1 = dataframe[column].quantile(0.25)
        Q3 = dataframe[column].quantile(0.75)
        IQR = Q3 - Q1
        Lower_Bound = Q1 - 1.5*IQR
        Upper_Bound = Q3 + 1.5*IQR
        
        for i in np.linspace(0,len(dataframe[column])-1, len(dataframe[column])):
            if dataframe[column][i] < Lower_Bound:
                dataframe[column][i] = Q1
            elif dataframe[column][i] > Upper_Bound:
                dataframe[column][i] = Q3
                
    return dataframe

In [25]:
np.linspace(0,len(df_cont2['LotArea'])-1,len(df_cont2['LotArea']))

array([0.000e+00, 1.000e+00, 2.000e+00, ..., 1.457e+03, 1.458e+03,
       1.459e+03])

In [26]:
df_cont2 = outliers(df_cont2)
df_cont2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[column][i] = Q3
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[column][i] = Q1


Unnamed: 0,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,TotalBsmtSF,2ndFlrSF,GrLivArea,GarageYrBlt,GarageArea,WoodDeckSF,MiscVal
0,2438.206975,626531.868167,215282.869374,7238.971158,8596.762919,28516.826284,1982.006488,109258.119444,1459.084002,39550.162863,0.000000,-0.0
1,2770.033960,618086.356214,212380.903586,0.000000,11908.830220,42042.330339,0.000000,80633.770022,1439.415870,33199.041820,8805.042793,-0.0
2,3246.133547,625906.274689,215175.389159,5983.231263,5917.884956,30648.925445,2009.856697,114114.035863,1457.627104,43880.472666,0.000000,-0.0
3,2755.606700,599005.755137,211736.022299,0.000000,2630.171091,25185.421344,1754.563121,109705.374903,1455.441756,46334.314887,0.000000,-0.0
4,4114.654611,625593.477950,214960.428730,12926.734210,7975.750301,38144.586559,2443.855775,140438.214349,1456.898654,60335.649916,5673.047706,-0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1455,2284.412381,625280.681211,214960.428730,0.000000,0.000000,31748.289075,1610.670378,105232.820306,1456.170205,33199.041820,0.000000,-0.0
1456,3801.583065,618711.949692,213670.666158,4395.089632,9619.607233,51370.264170,0.000000,132451.509712,1440.872769,36085.915021,10311.946090,-0.0
1457,2609.025736,607138.470350,215605.310017,0.000000,3348.597454,38377.784905,2673.619994,149511.110818,1413.920144,18187.301171,0.000000,-0.0
1458,2803.793749,609953.641001,214530.507873,0.000000,596.659183,35912.545250,0.000000,68877.340796,1420.476188,17321.239210,10814.247189,-0.0
