In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
significant_features = ['Gr Liv Area',
 'Overall Qual',
 'Neighborhood_NridgHt',
 'Neighborhood_StoneBr',
 'Neighborhood_NoRidge',
 'Garage Cars',
 'Exter Qual',
 'Bsmt Exposure',
 'Kitchen Qual',
 'Bldg Type_1Fam',
 'Bsmt Qual',
 'Screen Porch',
 'BsmtFin Type 1',
 'Sale Type_New',
 'Bsmt Full Bath',
 'Roof Style_Hip',
 'Lot Area',
 'TotRms AbvGrd',
 'Fireplace Qu',
 'Neighborhood_Crawfor',
 'MS SubClass_20',
 'Mas Vnr Area',
 'Exterior 1st_BrkFace',
 'Full Bath',
 'House Style_SLvl',
 'Bldg Type_Twnhs',
 'Exterior 2nd_Stucco',
 'Roof Style_Mansard',
 'Exterior 1st_Stucco',
 'Land Contour_Bnk',
 'House Remod Yrs',
 'Neighborhood_Edwards']

In [3]:
len(significant_features)

32

In [4]:
housing_train = pd.read_csv('datasets/Train_Cleaned.csv')
housing_train.head()

Unnamed: 0,Id,Lot Frontage,Lot Area,Lot Shape,Overall Qual,Mas Vnr Area,Exter Qual,Bsmt Qual,Bsmt Exposure,BsmtFin Type 1,...,Exterior 1st_Stucco,Exterior 1st_VinylSd,Exterior 1st_Wd Sdng,Exterior 1st_WdShing,Roof Style_Flat,Roof Style_Gable,Roof Style_Gambrel,Roof Style_Hip,Roof Style_Mansard,Roof Style_Shed
0,109,68.0,13517,2,6,289.0,3,3,1,6,...,0,0,0,0,0,1,0,0,0,0
1,544,43.0,11492,2,7,132.0,3,4,1,6,...,0,1,0,0,0,1,0,0,0,0
2,153,68.0,7922,3,5,0.0,2,3,1,6,...,0,1,0,0,0,1,0,0,0,0
3,318,73.0,9802,3,5,0.0,2,4,1,1,...,0,1,0,0,0,1,0,0,0,0
4,255,82.0,14235,2,6,0.0,2,2,1,1,...,0,0,1,0,0,1,0,0,0,0


In [5]:
housing_train.set_index('Id', inplace = True)
housing_train.shape

(1969, 178)

In [6]:
X = housing_train[significant_features]
X.shape

(1969, 32)

In [7]:
y = housing_train['SalePrice']
y.shape

(1969,)

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 42)
X_train.shape

(1476, 32)

In [9]:
X_val.shape

(493, 32)

In [10]:
ss = StandardScaler()
ss.fit(X_train)

X_train_scaled = ss.transform(X_train)
X_val_scaled = ss.transform(X_val)

X_train_scaled.shape

(1476, 32)

In [11]:
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

lr_cv_scores = cross_val_score(lr, X_train_scaled, y_train, cv = 10)
lr_cv_scores.mean()

0.852478939453785

In [12]:
ridge = RidgeCV(alphas = np.logspace(0, 5, 1000))
ridge.fit(X_train_scaled, y_train)
ridge.alpha_

198.28839491270713

In [13]:
ridge_cv_scores = cross_val_score(ridge, X_train_scaled, y_train, cv = 5)
ridge_cv_scores.mean()

0.8514278814883947

In [14]:
lasso = LassoCV(n_alphas = 100)
lasso.fit(X_train_scaled, y_train)

LassoCV()

In [15]:
lasso_cv_scores = cross_val_score(lasso, X_train_scaled, y_train, cv = 5)
lasso_cv_scores.mean()

0.8496172230147193

In [16]:
sample_submission = pd.read_csv('datasets/sample_submission_reg.csv')
sample_submission.head()

Unnamed: 0,Id,SalePrice
0,2,181479.1217
1,4,181479.1217
2,6,181479.1217
3,7,181479.1217
4,17,181479.1217


In [17]:
sample_submission.shape

(879, 2)

In [18]:
housing_test = pd.read_csv('datasets/Test_Cleaned.csv')
housing_test.head()

Unnamed: 0,Id,Lot Frontage,Lot Area,Lot Shape,Overall Qual,Mas Vnr Area,Exter Qual,Bsmt Qual,Bsmt Exposure,BsmtFin Type 1,...,Sale Type_COD,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD
0,2658,69.0,9142,3,6,0.0,2,2,1,1,...,0,0,0,0,0,0,0,0,0,1
1,2718,69.6,9662,2,5,0.0,2,4,1,1,...,0,0,0,0,0,0,0,0,0,1
2,2414,58.0,17104,2,7,0.0,3,4,3,6,...,0,0,0,0,0,0,1,0,0,0
3,1989,60.0,8520,3,5,0.0,3,3,1,1,...,0,0,0,0,0,0,0,0,0,1
4,625,69.6,9500,2,6,247.0,2,4,1,4,...,0,0,0,0,0,0,0,0,0,1


In [19]:
housing_test.set_index('Id', inplace = True)
housing_test.shape

(879, 177)

In [20]:
X_test = housing_test[significant_features]

In [21]:
X_test.head()

Unnamed: 0_level_0,Gr Liv Area,Overall Qual,Neighborhood_NridgHt,Neighborhood_StoneBr,Neighborhood_NoRidge,Garage Cars,Exter Qual,Bsmt Exposure,Kitchen Qual,Bldg Type_1Fam,...,Exterior 1st_BrkFace,Full Bath,House Style_SLvl,Bldg Type_Twnhs,Exterior 2nd_Stucco,Roof Style_Mansard,Exterior 1st_Stucco,Land Contour_Bnk,House Remod Yrs,Neighborhood_Edwards
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2658,1928,6,0,0,0,1,2,1,1,0,...,0,2,0,0,0,0,0,0,70,0
2718,1967,5,0,0,0,2,2,1,2,0,...,0,2,0,0,0,0,0,0,43,0
2414,1496,7,0,0,0,2,3,3,3,1,...,0,2,0,0,0,0,0,0,14,0
1989,968,5,0,0,0,2,3,1,2,1,...,0,1,0,0,0,0,0,0,14,0
625,1394,6,0,0,0,2,2,1,2,1,...,0,1,0,0,0,0,0,0,57,0


In [22]:
X_test.shape

(879, 32)

In [23]:
X_test_scaled = ss.transform(X_test)

In [24]:
X_test_scaled.shape

(879, 32)

In [25]:
predictions = ridge.predict(X_test_scaled)

In [26]:
predictions

array([126555.28187603, 162094.40061665, 249525.58820621, 134046.02364339,
       186275.95844529,  86502.1598124 , 114080.29844348, 162804.99613107,
       167223.38202072, 155075.63718984, 165478.5398508 , 129940.09959883,
       146382.74910127, 268235.63289256, 134306.40319723, 136062.74634676,
       144414.67653645, 124093.4902447 , 189573.77677458, 209805.6077119 ,
       149899.90014652, 136560.39709289, 170891.17781448, 153823.09989786,
       185883.84150026, 139389.41322014, 142657.66236892, 153834.98742007,
       179487.38510142,  65152.20440102, 106185.23192048, 115138.40887445,
       180675.63181139, 152203.63638594, 229916.02795913, 201706.03985739,
       120992.64828715,  99712.40037453, 157883.34217108, 190298.11845619,
       181084.0550021 , 216746.90215156, 164990.81990485, 182194.96574618,
       203303.92349234,  81303.28522088, 227611.12250329, 130559.49290659,
       138233.10412858, 130146.32751433, 118494.27426429, 215945.37391424,
       238918.34820462, 1

In [27]:
X_test['SalePrice'] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [28]:
X_test.head()

Unnamed: 0_level_0,Gr Liv Area,Overall Qual,Neighborhood_NridgHt,Neighborhood_StoneBr,Neighborhood_NoRidge,Garage Cars,Exter Qual,Bsmt Exposure,Kitchen Qual,Bldg Type_1Fam,...,Full Bath,House Style_SLvl,Bldg Type_Twnhs,Exterior 2nd_Stucco,Roof Style_Mansard,Exterior 1st_Stucco,Land Contour_Bnk,House Remod Yrs,Neighborhood_Edwards,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2658,1928,6,0,0,0,1,2,1,1,0,...,2,0,0,0,0,0,0,70,0,126555.281876
2718,1967,5,0,0,0,2,2,1,2,0,...,2,0,0,0,0,0,0,43,0,162094.400617
2414,1496,7,0,0,0,2,3,3,3,1,...,2,0,0,0,0,0,0,14,0,249525.588206
1989,968,5,0,0,0,2,3,1,2,1,...,1,0,0,0,0,0,0,14,0,134046.023643
625,1394,6,0,0,0,2,2,1,2,1,...,1,0,0,0,0,0,0,57,0,186275.958445


In [29]:
submission = X_test[['SalePrice']]

In [30]:
submission.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2658,126555.281876
2718,162094.400617
2414,249525.588206
1989,134046.023643
625,186275.958445


In [31]:
submission.sort_index(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [32]:
submission.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2,125539.084681
4,263425.462712
6,182406.298366
7,257088.545089
17,218922.157266


In [33]:
submission.shape

(879, 1)

In [34]:
submission.to_csv('Projected Prices.csv')