In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

In [2]:
# File names of the training and test CSVs; the directory prefix is added
# where the files are read.
data, test = 'train.csv', 'test.csv'

In [3]:
# Load the training and test tables from the local ./datasets/ directory.
houses = pd.read_csv('./datasets/{}'.format(data))
houses_test = pd.read_csv('./datasets/{}'.format(test))

In [4]:
# Keep the test set's 'Id' column now: the labels are lower-cased and the id
# column is dropped further down, but the submission needs these values.
info = houses_test.loc[:, 'Id']

In [5]:
# Normalise column labels on both frames: lower-case, spaces -> underscores.
for frame in (houses, houses_test):
    frame.columns = [label.lower().replace(' ', '_') for label in frame.columns]

In [6]:
# Preview the first rows of the test frame after the column labels were normalised.
houses_test.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,...,0,185,0,,,,0,7,2009,WD


In [7]:
# Replace every missing value with 0 in both frames, then keep only training
# rows whose lot_area is at most 60000 (the test frame is not row-filtered).
# NOTE(review): fillna(0) is applied to all columns, so any missing entries in
# text columns also become 0 before one-hot encoding - confirm this is intended.
houses_test = houses_test.fillna(0)
houses = houses.fillna(0).loc[lambda frame: frame['lot_area'] <= 60000]

In [8]:
# Remove the columns that are not used as model features from the training
# frame. Each label is listed exactly once.
houses = houses.drop(columns=[
    'id',
    'pid',
    'mo_sold',
    'yr_sold',
    'bsmtfin_sf_2',
    'pool_qc',
    'misc_feature',
    'lot_shape',
    'land_contour',
    'land_slope',
    'misc_val',
    '3ssn_porch',
    'electrical',
    'condition_1',
    'condition_2',
    'garage_cars',
    'pool_area',
    'alley',
])

In [9]:
# Remove the same set of non-feature columns from the test frame as is
# removed from the training frame. Each label is listed exactly once.
houses_test = houses_test.drop(columns=[
    'id',
    'pid',
    'mo_sold',
    'yr_sold',
    'bsmtfin_sf_2',
    'pool_qc',
    'misc_feature',
    'lot_shape',
    'land_contour',
    'land_slope',
    'misc_val',
    '3ssn_porch',
    'electrical',
    'condition_1',
    'condition_2',
    'garage_cars',
    'pool_area',
    'alley',
])

In [10]:
# Labels of the object-dtype columns in each frame; these are the columns
# that get one-hot encoded next.
objects = houses.columns[houses.dtypes == object].tolist()
objects_test = houses_test.columns[houses_test.dtypes == object].tolist()

In [11]:
# One-hot encode the object columns of each frame independently, dropping the
# first level of every encoded column.
# NOTE(review): because the two frames are encoded separately they can end up
# with different dummy columns; only the shared ones are kept further down.
houses = pd.get_dummies(data=houses, columns=objects, drop_first=True)
houses_test = pd.get_dummies(data=houses_test, columns=objects_test, drop_first=True)

In [12]:
# Show (rows, columns) of the encoded train and test frames, one per line.
print(houses.shape, houses_test.shape, sep='\n')

(2047, 217)
(878, 208)


In [13]:
# Preview the first rows of the training frame after one-hot encoding.
houses.head()

Unnamed: 0,ms_subclass,lot_frontage,lot_area,overall_qual,overall_cond,year_built,year_remod/add,mas_vnr_area,bsmtfin_sf_1,bsmt_unf_sf,...,fence_MnPrv,fence_MnWw,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_WD
0,60,0.0,13517,6,8,1976,2005,289.0,533.0,192.0,...,0,0,0,0,0,0,0,0,0,1
1,60,43.0,11492,7,5,1996,1997,132.0,637.0,276.0,...,0,0,0,0,0,0,0,0,0,1
2,20,68.0,7922,5,7,1953,2007,0.0,731.0,326.0,...,0,0,0,0,0,0,0,0,0,1
3,60,73.0,9802,5,5,2006,2007,0.0,0.0,384.0,...,0,0,0,0,0,0,0,0,0,1
4,50,82.0,14235,6,8,1900,1993,0.0,0.0,676.0,...,0,0,0,0,0,0,0,0,0,1


In [14]:
# Columns present in both encoded frames, kept in the order they appear in
# `houses`. Building this from a set would yield an arbitrary order that can
# change between interpreter sessions (string hashing is randomised), which
# makes the feature order - and therefore the run - non-reproducible.
common_cols = [col for col in houses.columns if col in houses_test.columns]

In [15]:
# Restrict the test frame to the shared columns. Take an explicit copy:
# new columns are assigned onto this frame later on, and assigning into a
# column selection that is not an independent copy raises pandas'
# SettingWithCopyWarning.
houses_test = houses_test[common_cols].copy()

In [16]:
# Add the target to the column list used to select the training frame.
# Guarded so that re-running this cell does not append 'saleprice' twice
# (which would duplicate the column in the selection below).
if 'saleprice' not in common_cols:
    common_cols.append('saleprice')

In [17]:
# Restrict the training frame to the shared feature columns plus the target.
houses = houses.loc[:, common_cols]

In [18]:
# Split the training frame into target (y) and features (X).
y = houses['saleprice']
X = houses.drop(columns=['saleprice'])
# `test` is bound to the same object as `houses_test` (no copy is made), and
# from here on the name no longer refers to the test file name string.
test = houses_test

In [19]:
# Candidate regularisation strengths: 100 values log-spaced from 1e0 to 1e5.
r_alphas = np.logspace(start=0, stop=5, num=100)
# Fit a ridge regression, choosing alpha by 5-fold cross-validated R^2.
# NOTE(review): X is passed in unscaled (StandardScaler is imported but not
# applied in the visible cells) - confirm that is intended.
ridge = RidgeCV(alphas=r_alphas, scoring='r2', cv=5)
ridge.fit(X, y);

In [20]:
# Predict from exactly the columns (and column order) the model was fitted
# on. Selecting X.columns keeps this cell re-runnable: after the first run
# `test` also carries 'saleprice' (and later 'Id'), which must not be fed
# back into the model as features.
test['saleprice'] = ridge.predict(test[X.columns])

In [21]:
# Attach the 'Id' values that were saved in `info` before the id column was
# dropped. Assigning a Series as a column aligns on the index.
# NOTE(review): this relies on the test rows still having the index they had
# when `info` was taken - no row filtering of the test frame is visible here; confirm.
test['Id'] = info

In [22]:
# Build the two-column submission frame as an independent copy of `test`.
submission = test.loc[:, ['Id', 'saleprice']].copy()

In [23]:
# Preview the first rows of the submission frame before writing it out.
submission.head()

Unnamed: 0,Id,saleprice
0,2658,148469.382434
1,2718,142889.364332
2,2414,217946.149436
3,1989,106141.832345
4,625,173791.177732


In [24]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv(path_or_buf='./datasets/submission_5.csv', index=False)