In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from scipy.stats import skew
from scipy.stats.stats import pearsonr
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, make_scorer
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns

dataset_train = pd.read_csv("https://www.dropbox.com/s/i51whixi8pyxopr/train.csv?dl=1")
dataset_test = pd.read_csv("https://www.dropbox.com/s/5a86581dbi0yr18/test.csv?dl=1")

# Looking at categorical values
def cat_exploration(column):
    return dataset_train[column].value_counts()

# Imputing missing values
def cat_imputation(column, value):
    dataset_train.loc[dataset_train[column].isnull(),column] = value

dataset_train['LotFrontage'].corr(dataset_train['LotArea'])
dataset_train['SqrtLotArea']=np.sqrt(dataset_train['LotArea'])
dataset_train['LotFrontage'].corr(dataset_train['SqrtLotArea'])
cond = dataset_train['LotFrontage'].isnull()
dataset_train.LotFrontage[cond]=dataset_train.SqrtLotArea[cond]
del dataset_train['SqrtLotArea']

basement_cols=['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','BsmtFinSF1','BsmtFinSF2']
for cols in basement_cols:
    if 'FinSF'not in cols:
        cat_imputation(cols,'None')
        
garage_cols=['GarageType','GarageQual','GarageCond','GarageYrBlt','GarageFinish','GarageCars','GarageArea']
for cols in garage_cols:
    if dataset_train[cols].dtype==np.object:
        cat_imputation(cols,'None')
    else:
        cat_imputation(cols, 0)

cat_imputation('Alley','None')
cat_imputation('MasVnrType', 'None')
cat_imputation('MasVnrArea', 0.0)
cat_imputation('Electrical','SBrkr')
cat_imputation('FireplaceQu','None')
cat_imputation('PoolQC', 'None')
cat_imputation('Fence', 'None')
cat_imputation('MiscFeature', 'None')

all_data = pd.concat((dataset_train.loc[:,'MSSubClass':'SaleCondition'],
                      dataset_test.loc[:,'MSSubClass':'SaleCondition']))

dataset_train["SalePrice"] = np.log1p(dataset_train["SalePrice"])

#log transform skewed numeric features:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
skewed_feats = dataset_train[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
all_data = pd.get_dummies(all_data)

#filling NA's with the mean of the column:
all_data = all_data.fillna(all_data.mean())

#creating matrices for sklearn:
X_train = all_data[:dataset_train.shape[0]]
X_test = all_data[dataset_train.shape[0]:]
y = dataset_train.SalePrice

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [2]:
dataset_train.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,SimplBsmtFinType1,SimplBsmtFinType2,SimplBsmtCond,SimplBsmtQual,SimplExterCond,SimplExterQual,SaleCondition_PriceDown,BoughtOffPlan,1stFlr_2ndFlr_Sf,All_Liv_SF
0,856,854,0,,3,1Fam,3,0,706.0,0.0,...,2,1,1,2,1,2,0,0,7.444833,7.449175
1,1262,0,0,,3,1Fam,3,3,978.0,0.0,...,2,1,1,2,1,1,0,0,7.141245,7.146883
2,920,866,0,,3,1Fam,3,1,486.0,0.0,...,2,1,1,2,1,2,0,0,7.488294,7.492475
3,961,756,0,,3,1Fam,4,0,216.0,0.0,...,2,1,2,1,1,1,1,0,7.448916,7.453243
4,1145,1053,0,,4,1Fam,3,2,655.0,0.0,...,2,1,1,2,1,2,0,0,7.695758,7.699252


In [3]:
dataset_train.all()

1stFlrSF                    True
2ndFlrSF                   False
3SsnPorch                  False
Alley                       True
BedroomAbvGr               False
BldgType                    True
BsmtCond                   False
BsmtExposure               False
BsmtFinSF1                 False
BsmtFinSF2                 False
BsmtFinType1               False
BsmtFinType2               False
BsmtFullBath               False
BsmtHalfBath               False
BsmtQual                   False
BsmtUnfSF                  False
CentralAir                  True
Condition1                  True
Condition2                  True
Electrical                  True
EnclosedPorch              False
ExterCond                   True
ExterQual                   True
Exterior1st                 True
Exterior2nd                 True
Fence                       True
FireplaceQu                False
Fireplaces                 False
Foundation                  True
FullBath                   False
          

In [4]:
dataset_test.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,...,SimplBsmtFinType1,SimplBsmtFinType2,SimplBsmtCond,SimplBsmtQual,SimplExterCond,SimplExterQual,SaleCondition_PriceDown,BoughtOffPlan,1stFlr_2ndFlr_Sf,All_Liv_SF
0,896,0,0,,2,1Fam,3,0,468.0,144.0,...,1,1,1,1,1,1,0,0,6.799056,6.806607
1,1329,0,0,,3,1Fam,3,0,923.0,0.0,...,2,1,1,1,1,1,0,0,7.192934,7.198328
2,928,701,0,,3,1Fam,3,0,791.0,0.0,...,2,1,1,2,1,1,0,0,7.396335,7.400863
3,926,678,0,,3,1Fam,3,0,602.0,0.0,...,2,1,1,1,1,1,0,0,7.380879,7.385467
4,1280,0,0,,2,TwnhsE,3,0,263.0,0.0,...,2,1,1,2,1,2,0,0,7.155396,7.160967


In [5]:
dataset_train.describe()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,BsmtFinType2,...,SimplBsmtFinType1,SimplBsmtFinType2,SimplBsmtCond,SimplBsmtQual,SimplExterCond,SimplExterQual,SaleCondition_PriceDown,BoughtOffPlan,1stFlr_2ndFlr_Sf,All_Liv_SF
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,1162.626712,346.992466,3.409589,2.866438,2.934932,0.656164,443.639726,46.549315,3.54589,1.24726,...,1.513014,1.019178,1.019178,1.480822,1.102055,1.369863,0.093836,0.085616,7.264729,7.276714
std,386.587738,436.528436,29.317331,0.815778,0.552159,1.039123,456.098091,161.319273,2.107776,0.892332,...,0.548381,0.266296,0.26371,0.5482,0.302824,0.482933,0.2917,0.279893,0.333612,0.333907
min,334.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,5.814131,5.831337
25%,882.0,0.0,0.0,2.0,3.0,0.0,0.0,0.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,7.025316,7.036615
50%,1087.0,0.0,0.0,3.0,3.0,0.0,383.5,0.0,4.0,1.0,...,2.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0,7.285507,7.297628
75%,1391.25,728.0,0.0,3.0,3.0,1.0,712.25,0.0,6.0,1.0,...,2.0,1.0,1.0,2.0,1.0,2.0,0.0,0.0,7.48226,7.492475
max,4692.0,2065.0,508.0,8.0,4.0,3.0,5644.0,1474.0,6.0,6.0,...,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,8.638171,8.639701


In [6]:
X_train, y_train = dataset_train.OverallQual, dataset_train.SalePrice

In [7]:
X_test = dataset_test.OverallQual

In [8]:
def covariance(series_x, series_y):
    N = len(series_x)
    return sum([(series_x[i] - np.mean(series_x)) * (series_y[i] - np.mean(series_y)) for i in range(N)])


In [9]:
def coefficients(X, Y):
    beta_1 = covariance(X, Y) / np.var(X)
    beta_0 = np.mean(Y) - beta_1 * np.mean(X)
    return beta_0, beta_1

In [10]:
def prediction(X, coefficients):
    beta_0, beta_1 = coefficients
    return [beta_0 + beta_1 * x for x in X]

In [11]:
y_hat = prediction(X_test, coefficients(X_train, y_train))

y1_hat = [abs(g) for g in y_hat]

y_hat_df = pd.DataFrame({'Id': dataset_test.Id, 'SalePrice': np.around(y1_hat,2)})

y_hat_df.to_csv('submission.csv', index=False)