In [960]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier

In [961]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [962]:
data = train.copy()
test_data = test.copy()
data = data.drop(["PoolQC", "Id"], axis=1)

In [963]:
data = pd.concat([data, pd.get_dummies(data[["Alley","Fence", "MiscFeature", "FireplaceQu"]])], axis=1)

In [964]:
data = data.drop(["Alley","Fence", "MiscFeature", "FireplaceQu"], axis=1)

In [965]:
needLRImpute = data.loc[:,data.isnull().any()].select_dtypes("number")
LRImputeMissingRows = needLRImpute[needLRImpute.isnull().any(axis=1)] = needLRImpute[needLRImpute.isnull().any(axis=1)]
LRcompleteRows = needLRImpute[~needLRImpute.isnull().any(axis=1)]

In [966]:
def imputeLR(column):
    train = data.copy()
    target = data[column]
    train = data.select_dtypes("number").drop(needLRImpute.columns, axis=1)
    imputeLr = LinearRegression()
    imputeLr.fit(train.iloc[LRcompleteRows.index],target.iloc[LRcompleteRows.index])
    LRImputeMissingRow = LRImputeMissingRows[LRImputeMissingRows[column].isnull()]
    
    return pd.Series(imputeLr.predict(train.iloc[LRImputeMissingRows.index]), index=LRImputeMissingRows.index, dtype='float64')
    

In [967]:
for item in needLRImpute.columns:
    data[item] = data[item].fillna(imputeLR(item))
    test_data[item] = test_data[item].fillna(imputeLR(item))

In [968]:
labelKNNImpute = data.loc[:,data.isnull().any()]

In [969]:
def knnImpute(col):
    copyData = data.copy()
    copyData = copyData.drop(labelKNNImpute, axis=1)
    
    labelCols = copyData.select_dtypes("object")
    copyData = pd.concat([copyData, pd.get_dummies(labelCols)], axis=1)
    copyData = copyData.drop(labelCols, axis=1)
    KNN = KNeighborsClassifier(5, weights='distance')
    currentTarget = labelKNNImpute[col]
    currentMissing = labelKNNImpute[labelKNNImpute[col].isnull()]
    
    indexOfComplete = labelKNNImpute[labelKNNImpute[col].notnull()].index
    
    currentTrain = copyData.iloc[indexOfComplete]
    currentTarget = currentTarget.iloc[indexOfComplete]
    fit = KNN.fit(currentTrain, currentTarget)
    
    return pd.Series(fit.predict(copyData.iloc[currentMissing.index]), index=currentMissing.index)

In [970]:
for item in labelKNNImpute.columns:
    data[item] = data[item].fillna(knnImpute(item))
    test_data[item] = test_data[item].fillna(knnImpute(item))

In [971]:
from sklearn.model_selection import train_test_split

In [972]:
lr_data = data.copy()
drop_non_numeric = lr_data.select_dtypes("object")

In [973]:
lr_data = pd.concat([lr_data, pd.get_dummies(drop_non_numeric)], axis=1)

In [974]:
lr_data = lr_data.drop(drop_non_numeric, axis=1)

In [975]:
x_train, x_test, y_train, y_test = train_test_split(lr_data[lr_data.columns.difference(["SalePrice"])], lr_data["SalePrice"], test_size=0.2)

In [976]:
lr = LinearRegression()

In [977]:
lr_model = lr.fit(x_train, y_train)

In [978]:
pred = lr_model.predict(x_test)

In [979]:
from sklearn import metrics
from scipy.stats import linregress
from sklearn.linear_model import LassoCV

In [980]:
lasso = LassoCV()

In [981]:
lasso.fit(lr_data[lr_data.columns.difference(["SalePrice"])],lr_data["SalePrice"])



LassoCV(alphas=None, copy_X=True, cv='warn', eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=None, normalize=False,
    positive=False, precompute='auto', random_state=None,
    selection='cyclic', tol=0.0001, verbose=False)

In [982]:
chosenVar = pd.Series(lasso.coef_, index=x_train.columns)

In [983]:
chosenLR = lr_data[chosenVar[chosenVar > 0].index]

In [984]:
x_train2, x_test2, y_train2, y_test2 = train_test_split(chosenLR, lr_data["SalePrice"], test_size=0.2)

In [985]:
lr2 = LinearRegression()

In [986]:
lr2.fit(x_train2, y_train2)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [987]:
pred2 = lr2.predict(x_test2)

In [988]:
metrics.mean_squared_error(y_test2, pred2)

1540112164.3536162

In [989]:
true_lasso = LassoCV()

In [990]:
true_y = lr_data["SalePrice"]

In [991]:
true_x = lr_data.drop("SalePrice", axis=1)

In [992]:
true_lasso.fit(true_x, true_y)



LassoCV(alphas=None, copy_X=True, cv='warn', eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=None, normalize=False,
    positive=False, precompute='auto', random_state=None,
    selection='cyclic', tol=0.0001, verbose=False)

In [993]:
true_cols = pd.Series(true_lasso.coef_, index=true_x.columns)

In [994]:
true_train_lr = lr_data.loc[:, true_cols[true_cols > 0].index]

In [995]:
true_lr = LinearRegression()

In [996]:
true_lr.fit(true_train_lr, true_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [997]:
test.loc[:,true_train_lr.columns].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 10 columns):
LotArea         1459 non-null int64
YearBuilt       1459 non-null int64
YearRemodAdd    1459 non-null int64
MasVnrArea      1444 non-null float64
BsmtFinSF1      1458 non-null float64
TotalBsmtSF     1458 non-null float64
2ndFlrSF        1459 non-null int64
GrLivArea       1459 non-null int64
GarageArea      1458 non-null float64
WoodDeckSF      1459 non-null int64
dtypes: float64(4), int64(6)
memory usage: 114.1 KB


In [998]:
true_lr.predict(test.loc[:,true_train_lr.columns])

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').