In [1]:
import numpy as np
import pandas as pd
import patsy

from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
import statsmodels as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sys
plt.style.use('fivethirtyeight')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 5000)
pd.set_option('display.width', 1000)
sys.path.append('/Users/ganeshsivam/Mods')
import corr

In [9]:
#cols = [c for c in train.columns if c not in ["SalePrice",'Id']]

In [10]:
train = pd.read_csv('./data/train.csv')

In [11]:
d = pd.read_csv('./data/sample_sub_reg.csv')

In [146]:
test = pd.read_csv('./data/test.csv')

In [13]:
c = train.columns[train.isna().any()].tolist()

In [120]:

def fill_na(df):
    df["Lot Frontage"] = df["Lot Frontage"].fillna(df["Lot Frontage"].mean())
    df["Alley"] = df["Alley"].fillna("No alley access")
    df["Mas Vnr Type"] = df["Mas Vnr Type"].fillna("None")
    df["Mas Vnr Area"] = df["Mas Vnr Area"].fillna(0)
    c = ['Bsmt Qual',
     'Bsmt Cond',
     'Bsmt Exposure',
     'BsmtFin Type 1',
     ]
    for i in df.index:
        if df.loc[i,"BsmtFin SF 1"] == 0:
            df.loc[i,c] = "No Basement"
        elif df.loc[i,"BsmtFin SF 1"] !=  df.loc[i,"BsmtFin SF 1"]:
            df.loc[i,c] = "No Basement"
            #df.loc[i,"BsmtFin SF 1"]
    c = ['BsmtFin Type 2']
    for i in df.index:
        if df.loc[i,"BsmtFin SF 2"] == 0:
            df.loc[i,c] = "No Basement"
        elif df.loc[i,"BsmtFin SF 2"] !=  df.loc[i,"BsmtFin SF 2"]:
            df.loc[i,c] = "No Basement"
        if df.loc[i,'BsmtFin Type 2'] != df.loc[i,'BsmtFin Type 2']:
            df.loc[i,'BsmtFin Type 2'] = df.loc[i,'BsmtFin Type 1']
    
    df['BsmtFin SF 1'] = df['BsmtFin SF 1'].fillna(0)
    df['BsmtFin SF 2'] = df['BsmtFin SF 2'].fillna(0)
    df['Bsmt Unf SF'] = df['Bsmt Unf SF'].fillna(0)
    df['Total Bsmt SF'] = df['Total Bsmt SF'].fillna(0)
    df['Bsmt Full Bath'] = df['Bsmt Full Bath'].fillna(0)
    df['Bsmt Half Bath'] = df['Bsmt Half Bath'].fillna(0)
    df['Fireplace Qu'] = df['Fireplace Qu'].fillna("No Fireplace")
    df['Garage Type'] = df['Garage Type'].fillna("No Garage")
    df['Garage Yr Blt'] = df['Garage Yr Blt'].fillna("No Garage")
    df['Garage Finish'] = df['Garage Finish'].fillna("No Garage")
    df['Garage Cars'] = df['Garage Cars'].fillna("No Garage")
    df['Garage Area'] = df['Garage Area'].fillna(0)
    df['Garage Qual'] = df['Garage Qual'].fillna("No Garage")
    df['Garage Cond'] = df['Garage Cond'].fillna("No Garage")
    df['Pool QC'] = df['Pool QC'].fillna("No Pool")
    df['Fence'] = df['Fence'].fillna("No Fence")
    df['Misc Feature'] = df['Misc Feature'].fillna("No Misc")
    return df


In [116]:
mp1 =  {'Ex':4,'Gd':3,'TA':2,'Fa':1,'Po':0}
mpb = {'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'No Basement':0}
mpbe = {'Gd':4,'Av':3,'Mn':2,'No':1,'No Basement':0}
mpbt = {'GLQ':6,'ALQ':5,'BLQ':4,'Rec':3,'LwQ':2,'Unf':1,'No Basement':0}
funct = {'Typ':7,'Min1':6,'Min2':5,'Mod':4,'Maj1':3,
     'Maj2':2,'Sev':1,'Sal':0}
fp = {'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'No Fireplace':0}
gf = {'Fin':3,'RFn':2,'Unf':1,'No Garage':0}
gq = {'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'No Garage':0}
gc = {'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'No Garage':0}
pool = {'Ex':4,'Gd':3,'TA':2,'Fa':1,'No Pool':0}
ls = {'Sev': 2, 'Mod': 1, 'Gtl': 0}
fence = {'GdPrv':4,'MnPrv':3,'GdWo':2,'MnWw':1,'No Fence':0}

ord_dict = {"Exter Qual":mp1,
           "Exter Cond":mp1,
           'Heating QC':mp1,
           'Kitchen Qual':mp1,
           'Bsmt Qual' : mpb,
           'Bsmt Cond': mpb,
           'Bsmt Exposure': mpbe,
            'BsmtFin Type 1':mpbt,
            'BsmtFin Type 2': mpbt,
            'Functional':funct,
            'Fireplace Qu': fp,
            'Garage Finish':gf,
            'Garage Qual':gq,
            'Garage Cond':gc,
            'Pool QC':pool,
            'Land Slope':ls,
            'Fence':fence
           }
def clean_ord(df,ord_dict): # Function to convert ordinal variables that are ranked for the model to deal with
    for key,value in ord_dict.items():
        df[key] = df[key].map(value)
    return df


In [164]:

def dummies(df):
    cat_cols = []
    for i in df.columns:
        if df.dtypes[i] == "O":
            cat_cols.append(i)
    cat_cols.append("MS SubClass")
    for c in cat_cols:
        dums = pd.get_dummies(df[c],prefix=c)
        df = pd.concat([df,dums],axis=1)
        df.drop(c,inplace=True,axis=1)
        
    year_cols = [c for c in df.columns if "Year" in c or "year" in c or 'Yr' in c]
    year_cols = [c for c in year_cols if "rage" not in c]
    #year_cols.remove('Garage Yr Blt')
    for c in year_cols:
        df[c] = 2019 - df[c]
    return df


In [329]:
def prep_data(df):
    X = train[[c for c in train.columns if c != "PID"and c != "Id" and c != 'SalePrice']]
    final_cols = list(X.columns)
    y = trains['SalePrice']
    X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=42)
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
    return X_train, X_test, y_train, y_test,final_cols,scaler


In [330]:

def mod_results(X_train, X_test, y_train, y_test):
    from sklearn.linear_model import RidgeCV
    from sklearn.linear_model import Lasso
    from sklearn.linear_model import ElasticNetCV
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import Ridge

    y_pred = pd.Series([float(y_train.mean()) for c in y_test])
    d = corr.reg_metrics(y_pred,y_test).transpose()
    d.index = ["Mean Baseline mondel"]
    
    lr = LinearRegression()
    lr = lr.fit(X_train,y_train)
    y_pred = lr.predict(X_test)
    train_score = lr.score(X_train,y_train)
    test_score = lr.score(X_test,y_test)
    #return(y_test,y_pred)
    temp_df = corr.reg_metrics(y_pred,y_test).transpose()
    
    temp_df.index = ["Liner Regression"]
    temp_df.loc['Liner Regression',"Model Train Score"] = train_score
    temp_df.loc['Liner Regression',"Model Test Score"] = test_score
    d = d.append(temp_df,sort=True)
    
    lasso = LassoCV(n_alphas=200,cv=5).fit(X_train, y_train)
    y_pred = lasso.predict(X_test)
    train_score = lasso.score(X_train,y_train)
    test_score = lasso.score(X_test,y_test)
    temp_df = corr.reg_metrics(y_pred,y_test).transpose()
    temp_df.index = ["Lasso"]
    temp_df.loc['Lasso',"Model Train Score"] = train_score
    temp_df.loc['Lasso',"Model Test Score"] = test_score
    temp_df.loc["Lasso",'Alpha'] = lasso.alpha_
    d = d.append(temp_df,sort=True)
    
    parameter_space = {
    "alpha": [ 10, 100, 290, 500,1000],
    "fit_intercept": [True, False],
    "solver": ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
    }

    clf = GridSearchCV(Ridge(random_state=3), parameter_space, n_jobs=4,
                   cv=3, scoring="neg_mean_absolute_error")
    clf.fit(X_train,y_train)
    #print("Best parameters:")
    #print(clf.best_params_)
    ridge = Ridge(random_state=3, **clf.best_params_)
    ridge.fit(X_train, y_train)
    y_pred = ridge.predict(X_test)
    train_score = ridge.score(X_train,y_train)
    test_score = ridge.score(X_test,y_test)
    temp_df = corr.reg_metrics(y_pred,y_test).transpose()
    temp_df.index = ["Ridge"]
    temp_df.loc['Ridge',"Model Train Score"] = train_score
    temp_df.loc['Ridge',"Model Test Score"] = test_score
    temp_df.loc["Ridge",'Alpha'] = clf.best_params_['alpha']
    d = d.append(temp_df,sort=True)
    
    
    return d



In [331]:
train = pd.read_csv('./data/train.csv')
trains = train.copy()
train = train[[c for c in trains.columns if c != "PID"and c != "Id" and c != 'SalePrice']]
train = fill_na(train)
train = clean_ord(train,ord_dict)
train = dummies(train)
X_train, X_test, y_train, y_test,final_cols,scaler = prep_data(train)

d = mod_results(X_train, X_test, y_train, y_test)
#d

In [328]:
def prep_test_data(df,scaler):
    
    from sklearn.preprocessing import StandardScaler
    x_cols = []
    for c in train.columns:
        if c not in df.columns:
            x_cols.append(c)
            df[c] = 0
            
    #scaler = StandardScaler()
    #scaler.fit(X_train)
    df = df[final_cols]
    df = df.values
    df = scaler.transform(df)
    
    #X_test = scaler.transform(X_test)
    return df

In [332]:
test = pd.read_csv('./data/test.csv')
ids = test['Id']
test = test[[c for c in test.columns if c != "PID"and c != "Id" and c != 'SalePrice']]
test = fill_na(test)
test = clean_ord(test,ord_dict)
test = dummies(test)
mod_test = prep_test_data(test,scaler)
lasso = LassoCV(n_alphas=200,cv=5).fit(X_train, y_train)
y_pred = lasso.predict(mod_test)


In [333]:
y_pred[:5]

array([138234.95482996, 160311.2650111 , 228340.1563468 , 111501.12140155,
       178315.00318718])

In [304]:
#mod_test = test.values
y_pred = lasso.predict(test)

In [320]:
d = pd.DataFrame()
d['Id'] = ids
d['SalePrice'] = y_pred

In [224]:
d['Id'] = ids
d['SalePrice'] = y_pred

In [316]:
d.to_csv("submission.csv",index=False)

In [317]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [321]:
d

Unnamed: 0,Id,SalePrice
0,2658,97375683.8593
1,2718,109137475.5551
2,2414,127802725.2305
3,1989,71707979.3758
4,625,96237131.6416
...,...,...
874,1662,98900660.1836
875,1234,132774907.4949
876,1373,78464928.1087
877,1672,75666266.7741


In [284]:
X_train[:5]

array([[-1.02491036, -0.55407391, -0.22106951, ..., -0.20957354,
        -0.070014  , -0.16012815],
       [ 1.86172625,  0.75984144, -0.22106951, ..., -0.20957354,
        -0.070014  , -0.16012815],
       [-0.01434926,  1.07046524,  3.80730817, ..., -0.20957354,
        -0.070014  , -0.16012815],
       [ 1.08279257,  0.37654136, -0.22106951, ..., -0.20957354,
        -0.070014  , -0.16012815],
       [ 0.48713739, -0.07028317, -0.22106951, ..., -0.20957354,
        -0.070014  , -0.16012815]])

In [280]:
scaler.mean_

array([6.93250752e+01, 9.95567620e+03, 6.10988296e+00, 5.55981795e+00,
       4.71391417e+01, 3.48315995e+01, 9.81853056e+01, 4.43600780e+02,
       4.80975293e+01, 5.65533160e+02, 1.05723147e+03, 1.16188427e+03,
       3.29815345e+02, 5.63784135e+00, 1.49733745e+03, 4.27178153e-01,
       6.04681404e-02, 1.57217165e+00, 3.77113134e-01, 2.83420026e+00,
       1.03836151e+00, 6.41417425e+00, 5.78673602e-01, 4.72576723e+02,
       9.40123537e+01, 4.75273082e+01, 2.21261378e+01, 2.61898570e+00,
       1.62984395e+01, 2.56957087e+00, 5.83381014e+01, 6.17750325e+00,
       1.12230169e+01, 1.30039012e-03, 9.10273082e-03, 5.46163849e-02,
       0.00000000e+00, 7.80234070e-03, 7.75032510e-01, 1.52145644e-01,
       3.25097529e-03, 9.96749025e-01, 4.42132640e-02, 9.27828349e-01,
       2.79583875e-02, 3.40702211e-01, 2.40572172e-02, 4.55136541e-03,
       6.30689207e-01, 3.96618986e-02, 4.29128739e-02, 2.01560468e-02,
       8.97269181e-01, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

In [275]:
mod_test[:5]

array([[ 6.90000000e+01,  9.14200000e+03,  1.51640218e-17, ...,
         9.96492861e-17,  1.08314441e-18,  1.00000000e+00],
       [ 6.96300417e+01,  9.66200000e+03,  1.51640218e-17, ...,
         9.96492861e-17,  1.08314441e-18, -1.13730163e-17],
       [ 5.80000000e+01,  1.71040000e+04,  1.51640218e-17, ...,
         9.96492861e-17,  1.08314441e-18, -1.13730163e-17],
       [ 6.00000000e+01,  8.52000000e+03,  1.51640218e-17, ...,
         9.96492861e-17,  1.08314441e-18, -1.13730163e-17],
       [ 6.96300417e+01,  9.50000000e+03,  1.51640218e-17, ...,
         9.96492861e-17,  1.08314441e-18, -1.13730163e-17]])

In [273]:
d.head()

Unnamed: 0,Id,SalePrice
0,2658,97320049.5585
1,2718,108617484.8846
2,2414,129602280.5058
3,1989,72279094.4963
4,625,96093478.3882


In [226]:
test = pd.read_csv('./data/test.csv')

In [228]:
test.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Roof Style,Roof Matl,Exterior 1st,Exterior 2nd,Mas Vnr Type,Mas Vnr Area,Exter Qual,Exter Cond,Foundation,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin SF 1,BsmtFin Type 2,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,Heating,Heating QC,Central Air,Electrical,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,Kitchen Qual,TotRms AbvGrd,Functional,Fireplaces,Fireplace Qu,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,2fmCon,2Story,6,8,1910,1950,Gable,CompShg,AsbShng,AsbShng,,0.0,TA,Fa,Stone,Fa,TA,No,Unf,0.0,Unf,0.0,1020.0,1020.0,GasA,Gd,N,FuseP,908,1020,0,1928,0.0,0.0,2,0,4,2,Fa,9,Typ,0,,Detchd,1910.0,Unf,1.0,440.0,Po,Po,Y,0,60,112,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Sawyer,Norm,Norm,Duplex,1Story,5,4,1977,1977,Gable,CompShg,Plywood,Plywood,,0.0,TA,TA,CBlock,Gd,TA,No,Unf,0.0,Unf,0.0,1967.0,1967.0,GasA,TA,Y,SBrkr,1967,0,0,1967,0.0,0.0,2,0,6,2,TA,10,Typ,0,,Attchd,1977.0,Fin,2.0,580.0,TA,TA,Y,170,0,0,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,7,5,2006,2006,Gable,CompShg,VinylSd,VinylSd,,0.0,Gd,TA,PConc,Gd,Gd,Av,GLQ,554.0,Unf,0.0,100.0,654.0,GasA,Ex,Y,SBrkr,664,832,0,1496,1.0,0.0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,2006.0,RFn,2.0,426.0,TA,TA,Y,100,24,0,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,1Fam,1Story,5,6,1923,2006,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,Gd,TA,CBlock,TA,TA,No,Unf,0.0,Unf,0.0,968.0,968.0,GasA,TA,Y,SBrkr,968,0,0,968,0.0,0.0,1,0,2,1,TA,5,Typ,0,,Detchd,1935.0,Unf,2.0,480.0,Fa,TA,N,0,0,184,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,5,1963,1963,Gable,CompShg,Plywood,Plywood,BrkFace,247.0,TA,TA,CBlock,Gd,TA,No,BLQ,609.0,Unf,0.0,785.0,1394.0,GasA,Gd,Y,SBrkr,1394,0,0,1394,1.0,0.0,1,1,3,1,TA,6,Typ,2,Gd,Attchd,1963.0,RFn,2.0,514.0,TA,TA,Y,0,76,0,0,185,0,,,,0,7,2009,WD


In [217]:
y_pred = lasso.predict(mod_test)

In [136]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

parameter_space = {
    "alpha": [1, 10, 100, 290, 500],
    "fit_intercept": [True, False],
    "solver": ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
}

clf = GridSearchCV(Ridge(random_state=3), parameter_space, n_jobs=4,
                   cv=3, scoring="neg_mean_absolute_error")
clf.fit(X_train,y_train)
#print("Best parameters:")
#print(clf.best_params_)
ridge_model = Ridge(random_state=3, **clf.best_params_)
ridge_model.fit(X_train, y_train)

Best parameters:
{'alpha': 500, 'fit_intercept': True, 'solver': 'svd'}


In [129]:
train = pd.read_csv('./data/train.csv')
train = fill_na(train)
train = clean_ord(train,ord_dict)

In [112]:
X_train, X_test, y_train, y_test = prep_data(train)

In [113]:
d = mod_results(X_train, X_test, y_train, y_test)



In [131]:
ridge_model.fit(X_train, y_train)

Unnamed: 0,Id,PID,MS SubClass,Lot Frontage,Lot Area,Land Slope,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,Exter Qual,Exter Cond,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin SF 1,BsmtFin Type 2,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,Heating QC,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,Kitchen Qual,TotRms AbvGrd,Functional,Fireplaces,Fireplace Qu,Garage Finish,Garage Area,Garage Qual,Garage Cond,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Val,Mo Sold,Yr Sold,SalePrice
Id,1.0,0.1758,0.0265,-0.023,0.0329,0.0142,-0.0615,-0.0261,-0.0644,-0.09,-0.0358,-0.0716,-0.0176,-0.0562,-0.03,0.0101,-0.0639,-0.0458,-0.0113,0.0007,0.0086,-0.0381,-0.0291,-0.0226,-0.0084,0.0117,-0.0239,-0.0333,0.0144,-0.0591,-0.0421,0.0106,-0.0024,-0.0655,-0.0091,-0.0208,-0.0376,-0.0258,-0.066,-0.0461,-0.0465,-0.045,-0.009,0.0014,0.0337,-0.0228,0.0098,0.0557,0.0484,-0.0148,-0.0127,0.1277,-0.9757,-0.0514
PID,0.1758,1.0,-0.0036,-0.0836,0.0241,0.1166,-0.2659,0.1069,-0.347,-0.1767,-0.2425,-0.2263,0.0398,-0.0971,-0.0773,-0.0023,-0.1046,-0.087,-0.0165,-0.0133,-0.1118,-0.2046,-0.1035,-0.1459,-0.0056,0.0723,-0.1129,-0.0313,-0.0022,-0.1794,-0.164,0.0091,0.0684,-0.1947,-0.085,-0.1014,-0.1207,-0.1645,-0.2749,-0.2017,-0.1749,-0.1699,-0.0412,-0.0811,0.1502,-0.0247,-0.042,0.0058,0.0252,0.091,0.0042,-0.0327,0.0085,-0.2551
MS SubClass,0.0265,-0.0036,1.0,-0.3643,-0.2455,-0.0421,0.0358,-0.0701,0.036,0.0448,-0.0028,0.0173,-0.0571,0.028,-0.0066,0.0531,0.0513,-0.0604,-0.049,-0.0681,-0.1393,-0.2247,-0.0242,-0.2462,0.3058,0.0311,0.0682,0.0257,-0.017,0.1421,0.1794,-0.0035,0.2521,-0.0195,0.0345,-0.0001,-0.0551,-0.0577,-0.0331,-0.1084,-0.0978,-0.1149,0.0016,-0.0203,-0.0398,-0.0301,-0.0388,-0.0046,-0.0075,-0.0914,-0.0275,0.013,-0.0329,-0.0873
Lot Frontage,-0.023,-0.0836,-0.3643,1.0,0.3895,0.055,0.1825,-0.0498,0.1047,0.0797,0.1985,0.1394,-0.0234,0.0536,0.029,0.1271,0.041,0.1977,0.0129,0.0326,0.1174,0.3308,0.081,0.4292,0.0123,-0.0003,0.3504,0.1002,-0.0288,0.1388,0.0324,0.2006,0.0147,0.1662,0.3326,-0.0192,0.2258,0.2153,0.1961,0.3394,0.0993,0.0957,0.1014,0.1703,-0.003,0.0286,0.068,0.1413,0.1508,0.0164,0.0483,-0.016,0.0084,0.3258
Lot Area,0.0329,0.0241,-0.2455,0.3895,1.0,0.3037,0.1058,-0.0192,0.036,0.0508,0.1675,0.0892,0.0196,0.0811,0.0568,0.1838,0.0488,0.2156,0.0291,0.0418,0.0415,0.2775,0.0221,0.3816,0.0294,0.0013,0.3274,0.1133,0.0292,0.1256,0.05,0.1391,-0.0135,0.1193,0.2387,-0.0864,0.2895,0.2014,0.1245,0.2631,0.1011,0.0917,0.1556,0.1409,0.0141,0.0196,0.0677,0.1151,0.128,-0.0336,0.0939,0.0032,-0.0295,0.2966
Land Slope,0.0142,0.1166,-0.0421,0.055,0.3037,1.0,-0.0352,0.0005,-0.0501,-0.051,0.0282,-0.0301,-0.0229,0.0759,0.0681,0.2152,0.0478,0.1028,0.0509,0.0708,-0.0995,0.0329,-0.0334,0.0564,-0.0379,-0.0083,0.0116,0.0874,0.0568,-0.074,-0.0083,-0.092,-0.0346,-0.0206,-0.0577,-0.079,0.1194,0.0614,-0.0024,0.011,0.0035,0.0013,0.0833,-0.0222,0.0301,0.0142,0.0483,-0.0137,-0.0133,-0.0219,-0.0053,-0.0144,0.0156,0.058
Overall Qual,-0.0615,-0.2659,0.0358,0.1825,0.1058,-0.0352,1.0,-0.0828,0.603,0.5847,0.43,0.7403,0.0204,0.2581,0.1054,0.2534,0.2497,0.2792,-0.0525,-0.028,0.2764,0.5494,0.4756,0.4771,0.2282,-0.0523,0.5667,0.1759,-0.047,0.5151,0.2749,0.0534,-0.171,0.6906,0.382,0.1817,0.3889,0.4808,0.5546,0.5639,0.2994,0.2859,0.2571,0.3089,-0.1546,0.0319,0.0488,0.0066,0.0227,-0.178,0.0221,0.0192,-0.0116,0.8002
Overall Cond,-0.0261,0.1069,-0.0701,-0.0498,-0.0192,0.0005,-0.0828,1.0,-0.371,0.0426,-0.1319,-0.1565,0.4124,-0.0313,0.0597,-0.0345,-0.0032,-0.0463,0.0958,0.0476,-0.1312,-0.1599,0.0024,-0.1509,0.0109,0.0048,-0.1098,-0.0401,0.0999,-0.2192,-0.0933,-0.0099,-0.0957,-0.0487,-0.0936,0.1293,-0.0065,-0.0429,-0.1674,-0.1382,0.0362,0.0436,0.011,-0.0523,0.1083,0.0269,0.0474,-0.0058,-0.0086,0.1758,0.0143,-0.0031,0.0477,-0.097
Year Built,-0.0644,-0.347,0.036,0.1047,0.036,-0.0501,0.603,-0.371,1.0,0.6291,0.3208,0.6164,-0.0848,0.3178,0.1817,0.2678,0.3445,0.2757,-0.0322,-0.0209,0.1371,0.4106,0.4638,0.3233,0.0223,-0.1594,0.2588,0.2156,-0.0313,0.4802,0.2832,-0.0421,-0.1272,0.5372,0.1378,0.1585,0.1688,0.2426,0.599,0.488,0.3007,0.2847,0.2163,0.2078,-0.3801,0.0161,-0.0379,0.0037,0.0098,-0.2119,0.0006,-0.0071,-0.0036,0.5718
Year Remod/Add,-0.09,-0.1767,0.0448,0.0797,0.0508,-0.051,0.5847,0.0426,0.6291,1.0,0.2042,0.6172,0.1027,0.1784,0.048,0.1581,0.2227,0.1673,-0.057,-0.0569,0.1609,0.3098,0.5637,0.2442,0.1596,-0.0666,0.3224,0.1538,-0.0378,0.4716,0.2241,-0.0197,-0.1359,0.6177,0.2024,0.1072,0.1389,0.2311,0.4584,0.3977,0.1801,0.1576,0.2163,0.2645,-0.2375,0.0404,-0.0412,-0.0224,-0.0193,-0.1862,-0.0017,0.0116,0.0427,0.5504


In [49]:
d1 = mod_results(X_train, X_test, y_train, y_test)



In [50]:
d1 #The first metrics for our 3 regression, before any data cleaning & cleansing has been done. 

Unnamed: 0,Alpha,Explained Variance Score,Max Errors,Mean Absolute Error,Mean Squared Errors,Mean Squared Log Erros,Median Absolute Error,Model Test Score,Model Train Score,R_squared
Mean Baseline mondel,,-7.248861747602752e+30,278938.0065,59200.1585,6142681753.1851,0.1850,46061.9935,,,-7.251998606544079e+30
Liner Regression,,-0.0,5.6851650823430406e+17,3102748225058680.5,1.0599254535240908e+33,Contains negatives,11668.5899,-1.726256000715519e+23,0.9502,-0.0
Lasso,948.8672,0.9173,111964.2152,14435.5153,460790892.3083,0.0188,9429.4931,0.925,0.9323,0.9172
Ridge,909.1818,0.8843,141282.2109,16585.2778,594281101.6972,0.0203,11192.5216,0.9032,0.9252,0.8843


In [52]:
cols = [c for c in train.columns if c not in ["SalePrice",'Id','PID']]
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(train[cols].values, i) for i in range(train[cols].shape[1])]
vif["features"] = train[cols].columns

TypeError: '>=' not supported between instances of 'float' and 'str'

In [None]:
vif["features"] = train[cols].columns

In [None]:
vif["features"] = train.columns

In [None]:
vif["VIF Factor"] = vif["VIF Factor"].astype(float)

In [None]:
vif.sort_values("VIF Factor",ascending=False)

In [None]:
cols = [c for c in train.columns if c not in ["SalePrice",'Id']]
variance_inflation_factor(train[cols],1)

In [None]:
len(cols)

In [None]:
co.count("Pave")

In [None]:
co = list(train.columns)
for i in co:
    if co.count(i) > 1:
        print(i)

In [None]:
train[train['PID'].isna()]

In [None]:
if train.dtypes["MS Zoning"] == "O":
    print('nhj')

In [None]:
train.info()

In [None]:
train.loc[train["BsmtFin Type 2"].isna()]

In [None]:
train.info()

In [2]:
train[c[15]] = train[c[15]].fillna("NA")

NameError: name 'train' is not defined

In [None]:
train.loc[(train["Fireplace Qu"].isna())&(train["Fireplaces"]>0)]

In [None]:
c[16]

In [None]:
train.loc[train[c[25]].isna()]

In [None]:
train.loc[(train["Pool QC"].isna())&(train["Pool Area"]>0)]

In [None]:
train.loc[(train[c[22]].isna())&(train["Garage Area"]>0)]

In [None]:
train[c[24]] =train[c[24]].fillna("No Fence")

In [None]:
train[c[25]] =train[c[25]].fillna("NA")

In [None]:
train.info()

In [None]:
m_d

In [None]:
m_d['Fireplace Qu'] = "Filled as NA for no fireplace"

In [None]:
list(m_d.keys())[16]

In [None]:
for i in range(16,23):
    m_d[c[i]] = "Filled as NA for no garage"

In [None]:
m_d[c[23]] = "Filled as NA for no pool"
m_d[c[24]] = "Filled as NA for no fence"
m_d[c[25]] = "Filled as NA for no none"


In [None]:
m_d.pop("BsmtFin Unf SF")

In [None]:
m_d

In [None]:
list(range(16,23))

In [None]:
train["BsmtFin SF 1"] = train["BsmtFin SF 1"].fillna(0)
train["BsmtFin SF 2"] = train["BsmtFin SF 2"].fillna(0)
train["Bsmt Unf SF"] = train["Bsmt Unf SF"].fillna(0)
train["Total Bsmt SF"] = train["Total Bsmt SF"].fillna(0)

train['BsmtFin Type 2'] = train['BsmtFin Type 2'].fillna("No Basement")

In [None]:
train