In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
significant_features = ['Gr Liv Area',
 'Overall Qual/Cond',
 'Total Bsmt SF',
 'BsmtFin SF 1',
 'Neighborhood_NridgHt',
 'Lot Area',
 'Neighborhood_StoneBr',
 'Mas Vnr Area',
 'Kitchen Qual',
 'Garage Area',
 'Bldg Type_1Fam',
 'Sale Type_New',
 'Bsmt Exposure',
 'Bsmt Qual',
 'Neighborhood_NoRidge',
 'Functional',
 'House Remod Yrs',
 'Lot Config_Corner',
 'Exterior 2nd_HdBoard',
 'Neighborhood_NWAmes',
 'Garage Qual/Cond',
 'Mas Vnr Type_BrkFace',
 'Foundation_CBlock',
 'Bedroom AbvGr',
 'House Age']

significant_features.sort()
significant_features

['Bedroom AbvGr',
 'Bldg Type_1Fam',
 'Bsmt Exposure',
 'Bsmt Qual',
 'BsmtFin SF 1',
 'Exterior 1st_BrkFace',
 'Exterior 1st_HdBoard',
 'Exterior Qual/Cond',
 'Foundation_PConc',
 'Functional',
 'Garage Area',
 'Gr Liv Area',
 'House Age',
 'Kitchen Qual',
 'Land Contour_HLS',
 'Lot Area',
 'Lot Frontage',
 'Mas Vnr Area',
 'Neighborhood_NoRidge',
 'Neighborhood_NridgHt',
 'Neighborhood_Somerst',
 'Neighborhood_StoneBr',
 'Overall Qual/Cond',
 'Roof Style_Mansard',
 'Sale Type_COD',
 'Sale Type_New',
 'Total Bsmt SF']

In [3]:
len(significant_features)

27

In [4]:
housing_train = pd.read_csv('datasets/Train_Cleaned.csv')
housing_train.head()

Unnamed: 0,Id,Lot Frontage,Lot Area,Lot Shape,Land Slope,Mas Vnr Area,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,...,Bldg Type_TwnhsE,Sale Type_COD,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_WD
0,109,68.0,13517,2,2,289.0,3,3,1,6,...,0,0,0,0,0,0,0,0,0,1
1,544,43.0,11492,2,2,132.0,4,3,1,6,...,0,0,0,0,0,0,0,0,0,1
2,153,68.0,7922,3,2,0.0,3,3,1,6,...,0,0,0,0,0,0,0,0,0,1
3,318,73.0,9802,3,2,0.0,4,3,1,1,...,0,0,0,0,0,0,0,0,0,1
4,255,82.0,14235,2,2,0.0,2,4,1,1,...,0,0,0,0,0,0,0,0,0,1


In [5]:
housing_train.set_index('Id', inplace = True)
housing_train.shape

(1948, 192)

In [6]:
X = housing_train[significant_features]
X.shape

(1948, 27)

In [7]:
y = housing_train['SalePrice']
y.shape

(1948,)

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)
X_train.shape

(1461, 27)

In [9]:
X_val.shape

(487, 27)

In [10]:
ss = StandardScaler()

X_train_scaled = ss.fit_transform(X_train)
X_val_scaled = ss.transform(X_val)

X_train_scaled.shape

(1461, 27)

In [11]:
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

lr_cv_scores = cross_val_score(lr, X_train_scaled, y_train, cv = 10)
lr_cv_scores.mean()

0.9087741511576122

In [12]:
ridge = RidgeCV(alphas = np.logspace(0, 3, 1500))
ridge.fit(X_train_scaled, y_train)
ridge.alpha_

13.575479245259912

In [13]:
ridge_cv_scores = cross_val_score(ridge, X_train_scaled, y_train, cv = 5)
ridge_cv_scores.mean()

0.9088481115548375

In [14]:
lasso = LassoCV(n_alphas = 200)
lasso.fit(X_train_scaled, y_train)

LassoCV(n_alphas=200)

In [15]:
lasso_cv_scores = cross_val_score(lasso, X_train_scaled, y_train, cv = 5)
lasso_cv_scores.mean()

0.9087655204222361

In [16]:
sample_submission = pd.read_csv('datasets/sample_submission_reg.csv')
sample_submission.head()

Unnamed: 0,Id,SalePrice
0,2,181479.1217
1,4,181479.1217
2,6,181479.1217
3,7,181479.1217
4,17,181479.1217


In [17]:
sample_submission.shape

(879, 2)

In [18]:
housing_test = pd.read_csv('datasets/Test_Cleaned.csv')
housing_test.head()

Unnamed: 0,Id,MS SubClass,Lot Frontage,Lot Area,Lot Shape,Land Slope,Mas Vnr Area,Bsmt Qual,Bsmt Cond,Bsmt Exposure,...,Sale Type_COD,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD
0,2658,190,69.0,9142,3,2,0.0,2,3,1,...,0,0,0,0,0,0,0,0,0,1
1,2718,90,69.6,9662,2,2,0.0,4,3,1,...,0,0,0,0,0,0,0,0,0,1
2,2414,60,58.0,17104,2,2,0.0,4,4,3,...,0,0,0,0,0,0,1,0,0,0
3,1989,30,60.0,8520,3,2,0.0,3,3,1,...,0,0,0,0,0,0,0,0,0,1
4,625,20,69.6,9500,2,2,247.0,4,3,1,...,0,0,0,0,0,0,0,0,0,1


In [19]:
housing_test.set_index('Id', inplace = True)
housing_test.shape

(879, 177)

In [20]:
housing_test.head(1)

Unnamed: 0_level_0,MS SubClass,Lot Frontage,Lot Area,Lot Shape,Land Slope,Mas Vnr Area,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,...,Sale Type_COD,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2658,190,69.0,9142,3,2,0.0,2,3,1,1,...,0,0,0,0,0,0,0,0,0,1


In [21]:
X_test = housing_test[significant_features]

In [22]:
X_test.head()

Unnamed: 0_level_0,Bedroom AbvGr,Bldg Type_1Fam,Bsmt Exposure,Bsmt Qual,BsmtFin SF 1,Exterior 1st_BrkFace,Exterior 1st_HdBoard,Exterior Qual/Cond,Foundation_PConc,Functional,...,Mas Vnr Area,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_Somerst,Neighborhood_StoneBr,Overall Qual/Cond,Roof Style_Mansard,Sale Type_COD,Sale Type_New,Total Bsmt SF
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2658,4,0,1,2,0,0,0,3,0,7,...,0.0,0,0,0,0,14,0,0,0,1020
2718,6,0,1,4,0,0,0,4,0,7,...,0.0,0,0,0,0,9,0,0,0,1967
2414,3,1,3,4,554,0,0,5,1,7,...,0.0,0,0,0,0,12,0,0,1,654
1989,2,1,1,3,0,0,0,5,0,7,...,0.0,0,0,0,0,11,0,0,0,968
625,3,1,1,4,609,0,0,4,0,7,...,247.0,0,0,0,0,11,0,0,0,1394


In [23]:
X_test.shape

(879, 27)

In [24]:
X_test_scaled = ss.fit_transform(X_test)

In [25]:
X_test_scaled.shape

(879, 27)

In [26]:
predictions = ridge.predict(X_test_scaled)

In [27]:
predictions

array([137842.47604239, 156874.17305091, 211504.18461787, 112324.81324956,
       171144.3469649 ,  89141.92307073, 112734.45617103, 166097.83656763,
       192777.36332664, 162505.85753612, 156458.26017571, 125908.50338835,
       149828.79172553, 243748.91639955, 165220.47353793, 117364.42439442,
       143272.72771265, 123009.11099765, 178000.00565706, 198128.26139343,
       153271.4165673 , 126216.42613099, 184934.16057056, 165866.84489725,
       185963.87976373, 125397.13230121, 116925.18023433, 132070.08410975,
       181466.53059663,  49782.59151956, 113452.42508908, 107190.41371888,
       197061.38433782, 165928.22853672, 205614.3247159 , 186379.40680919,
       127325.07607493,  80203.40168538, 118900.35310705, 190248.05250701,
       178993.27538331, 198185.96029393, 155087.66063182, 158439.84606489,
       211862.48351896,  90737.62914814, 218212.78353504, 115233.34313372,
       125529.07852391, 123214.6675832 , 118175.4388855 , 204407.19644499,
       229265.36106665, 1

In [28]:
X_test['SalePrice'] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [29]:
X_test.head()

Unnamed: 0_level_0,Bedroom AbvGr,Bldg Type_1Fam,Bsmt Exposure,Bsmt Qual,BsmtFin SF 1,Exterior 1st_BrkFace,Exterior 1st_HdBoard,Exterior Qual/Cond,Foundation_PConc,Functional,...,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_Somerst,Neighborhood_StoneBr,Overall Qual/Cond,Roof Style_Mansard,Sale Type_COD,Sale Type_New,Total Bsmt SF,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2658,4,0,1,2,0,0,0,3,0,7,...,0,0,0,0,14,0,0,0,1020,137842.476042
2718,6,0,1,4,0,0,0,4,0,7,...,0,0,0,0,9,0,0,0,1967,156874.173051
2414,3,1,3,4,554,0,0,5,1,7,...,0,0,0,0,12,0,0,1,654,211504.184618
1989,2,1,1,3,0,0,0,5,0,7,...,0,0,0,0,11,0,0,0,968,112324.81325
625,3,1,1,4,609,0,0,4,0,7,...,0,0,0,0,11,0,0,0,1394,171144.346965


In [30]:
submission = X_test[['SalePrice']]

In [31]:
submission.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2658,137842.476042
2718,156874.173051
2414,211504.184618
1989,112324.81325
625,171144.346965


In [32]:
submission.sort_index(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [33]:
submission.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2,131866.276477
4,273291.557725
6,192823.179642
7,230133.084327
17,216507.67617


In [34]:
submission.shape

(879, 1)

In [35]:
submission.to_csv('Projected Prices.csv')