In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

%matplotlib inline

In [2]:
# Read the cleaned training data and preview the first rows.
train_model = pd.read_csv(filepath_or_buffer='../datasets/train_cleaning.csv')
train_model.head(n=5)

Unnamed: 0,id,overall_qual,year_built,year_remod/add,mas_vnr_area,total_bsmt_sf,1st_flr_sf,gr_liv_area,full_bath,totrms_abvgrd,...,garage_type_Basment,garage_type_BuiltIn,garage_type_CarPort,garage_type_Detchd,garage_finish_RFn,garage_finish_Unf,garage_cond_Fa,garage_cond_Gd,garage_cond_Po,garage_cond_TA
0,109,6,1976,2005,289.0,725.0,725,1479,2,6,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,544,7,1996,1997,132.0,913.0,913,2122,2,8,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,153,5,1953,2007,0.0,1057.0,1057,1057,1,5,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
3,318,5,2006,2007,0.0,384.0,744,1444,2,7,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,255,6,1900,1993,0.0,676.0,831,1445,2,6,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [3]:
# Target is the sale price; the predictors are every other column except 'id'.
y = train_model['saleprice']
X = train_model.drop(columns=['id', 'saleprice'])

# No test_size is passed, so scikit-learn's default split proportion applies;
# the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

Modeling using LinearRegression 

In [4]:
# Baseline: ordinary least squares fitted on the unscaled training features.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [5]:
# Linear model score (R^2) on the training split.
lr.score(X=X_train, y=y_train)

0.8607828407527225

In [6]:
# Linear model score (R^2) on the held-out split.
lr.score(X=X_test, y=y_test)

0.8677778732496824

In [7]:
# Mean 3-fold cross-validated score of the linear model on the training split.
# NOTE(review): the ridge and lasso cells cross-validate with cv=5, so this
# mean is not computed on the same folds as theirs.
lr_scores = cross_val_score(estimator=lr, X=X_train, y=y_train, cv=3)
np.mean(lr_scores)

0.8036831226174179

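The R² score on the test data (0.868) is slightly higher than on the training data (0.861), so the model is not overfitting. If anything, it is slightly underfit and would likely benefit from more data or more informative features.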

In [8]:
# Standardise the features: the scaler is fitted on the training split only,
# and the same fitted transform is then applied to both splits.
ss = StandardScaler()
ss.fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

Modeling using Ridge

In [9]:
# Ridge regression that picks its own alpha from 100 log-spaced candidates
# between 1e1 and 1e10.
ridge = RidgeCV(alphas=np.logspace(start=1, stop=10, num=100))

In [10]:
# Fit on the standardised training features; the result of fit() is bound back
# to `ridge`, which also keeps the cell from echoing the estimator.
ridge = ridge.fit(X=X_train_sc, y=y_train)

In [26]:
#model 1 + model 2
#model 1 has fewer features than model 2
#ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))

In [12]:
# Ridge score (R^2) on the standardised training split.
ridge.score(X=X_train_sc, y=y_train)

0.860153747916093

In [13]:
# Ridge score (R^2) on the standardised held-out split.
ridge.score(X=X_test_sc, y=y_test)

0.8708378678333742

In [14]:
# Mean 5-fold cross-validated score of the ridge model on the standardised
# training split.
ridge_scores = cross_val_score(estimator=ridge, X=X_train_sc, y=y_train, cv=5)
np.mean(ridge_scores)

0.8185072398368156

In [15]:
# Regularisation strength that RidgeCV selected from its candidate alphas.
ridge.alpha_

15.199110829529339

Modeling using Lasso

In [16]:
#model 1 + model 2
#lasso = LassoCV(n_alphas=200)

In [17]:
# Lasso regression that picks its own alpha from 100 log-spaced candidates
# between 1e-3 and 1e0, fitted on the standardised training features.
# NOTE(review): the recorded lasso train score matches the plain linear
# model's to about six decimals, which suggests every alpha in this grid is
# too small to regularise a target on this scale; consider a wider grid.
lasso = LassoCV(alphas=np.logspace(start=-3, stop=0, num=100)).fit(
    X=X_train_sc,
    y=y_train,
)

In [18]:
# Mean 5-fold cross-validated score of the lasso model on the standardised
# training split.
lasso_scores = cross_val_score(estimator=lasso, X=X_train_sc, y=y_train, cv=5)
np.mean(lasso_scores)

0.820845776814228

In [19]:
# Lasso score (R^2) on the standardised training split.
lasso.score(X=X_train_sc, y=y_train)

0.8607826780613012

In [20]:
# Lasso score (R^2) on the standardised held-out split.
lasso.score(X=X_test_sc, y=y_test)

0.8678548339427786

The mean cross-validation score for lasso (0.821) is slightly higher than for ridge (0.819).

In [21]:
# Read the cleaned test data (the rows to predict) and preview the first rows.
test_model = pd.read_csv(filepath_or_buffer='../datasets/test_cleaning.csv')
test_model.head(n=5)

Unnamed: 0,overall_qual,year_built,year_remod/add,mas_vnr_area,total_bsmt_sf,1st_flr_sf,gr_liv_area,full_bath,totrms_abvgrd,garage_yr_blt,...,garage_type_Basment,garage_type_BuiltIn,garage_type_CarPort,garage_type_Detchd,garage_finish_RFn,garage_finish_Unf,garage_cond_Fa,garage_cond_Gd,garage_cond_Po,garage_cond_TA
0,6,1910,1950,0.0,1020,908,1928,2,9,1910.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,5,1977,1977,0.0,1967,1967,1967,2,10,1977.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,7,2006,2006,0.0,654,664,1496,2,7,2006.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,5,1923,2006,0.0,968,968,968,1,5,1935.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4,6,1963,1963,247.0,1394,1394,1394,1,6,1963.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [22]:
# Predict sale prices for the cleaned test set with the linear model.
#
# The test frame is indexed by the training feature names so that the matrix
# handed to predict() has exactly the columns, in exactly the order, that `lr`
# was fitted on. Extra columns in the test file are ignored; a training feature
# missing from the test file fails here with a clear message instead of
# producing misaligned predictions.
missing_cols = X_train.columns.difference(test_model.columns)
if len(missing_cols) > 0:
    raise ValueError(
        f"test_model is missing training features: {list(missing_cols)}"
    )

predictions = lr.predict(test_model[X_train.columns])

In [23]:
# Raw test file; it supplies the 'Id' column used when building the submission.
test = pd.read_csv(filepath_or_buffer='../datasets/test.csv')

In [24]:
# Pair each Id from the raw test file with its predicted price.
# Rows are matched by position, so this assumes the cleaned test set kept the
# raw file's row order and row count (TODO confirm against the cleaning step).
test_preds_df = test[['Id']].assign(SalePrice=predictions)

test_preds_df.head(n=5)

Unnamed: 0,Id,SalePrice
0,2658,118192.749611
1,2718,164732.206429
2,2414,216325.656846
3,1989,132949.416565
4,625,170653.728759


In [25]:
# Write the submission file without the DataFrame index column.
test_preds_df.to_csv(path_or_buf='../datasets/model_3.csv', index=False)