In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer

## Import Data 

In [3]:
train = pd.read_csv('train_new.csv')
test = pd.read_csv('test_new.csv')

In [4]:
y = train.SalePrice
train = train.drop(columns=['Id', 'SalePrice'])

In [5]:
print(f'Train shape: {train.shape}')
print(f'Test shape: {test.shape}')

Train shape: (1451, 341)
Test shape: (1459, 341)


In [6]:
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size = 0.30, random_state=0)
print(f'X_train : {X_train.shape}')
print(f'X_test : {X_test.shape}')
print(f'y_train : {y_train.shape}')
print(f'y_test : {y_test.shape}')

X_train : (1015, 341)
X_test : (436, 341)
y_train : (1015,)
y_test : (436,)


In [7]:
stdSc = StandardScaler()
X_train = pd.DataFrame(stdSc.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(stdSc.transform(X_test), columns=X_test.columns, index=X_test.index)

In [59]:
scorer = make_scorer(mean_squared_error, greater_is_better=False)

def rmse_train(model):
    r = np.sqrt(-cross_val_score(model, X_train, y_train, scoring = scorer, cv = 10))
    return(r)
    
def rmse_test(model):
    r = np.sqrt(-cross_val_score(model, X_test, y_test, scoring = scorer, cv = 10))
    return(r)

## RidgeCV Modeling 

In [46]:
alpha_list = [0.0001, 0.0003, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
ridge = RidgeCV(alphas=alpha_list, cv=10)
ridge.fit(X_train, y_train)
alpha = ridge.alpha_
print(f'First try alpha: {alpha}')

ranging = np.arange(0.5, 1.55, 0.05).astype('float16')
ridge = RidgeCV(alphas=(alpha*ranging), cv=10)
ridge.fit(X_train, y_train)
alpha = ridge.alpha_
print(f'Second try alpha: {alpha}')

ridge = RidgeCV(alphas=(alpha*ranging), cv=10)
ridge.fit(X_train, y_train)
alpha = ridge.alpha_
print(f'Third try alpha: {alpha}')

First try alpha: 100.0
Second try alpha: 150.0
Third try alpha: 225.0


## LassoCV Modeling

In [47]:
lasso = LassoCV(alphas=alpha_list, max_iter=10000 ,cv=10)
lasso.fit(X_train, y_train)
alpha = lasso.alpha_
print(f'First try alpha: {alpha}')

First try alpha: 0.003


In [48]:
lasso = LassoCV(alphas=(alpha*ranging), max_iter=10000 ,cv=10)
lasso.fit(X_train, y_train)
alpha = lasso.alpha_
print(f'Second try alpha: {alpha}')

Second try alpha: 0.003299713134765625


In [50]:
lasso = LassoCV(alphas=(alpha*ranging), max_iter=10000 ,cv=10)
lasso.fit(X_train, y_train)
alpha = lasso.alpha_
print(f'Third try alpha: {alpha}')

Third try alpha: 0.003299713134765625


## ElasticNet Modeling 

In [52]:
l1_ratio = np.arange(0.1, 1.1, 0.1)
elasticNet = ElasticNetCV(l1_ratio=l1_ratio, alphas=alpha_list, max_iter=100000, cv=10)
elasticNet.fit(X_train, y_train)
l1 = elasticNet.l1_ratio_
alpha = elasticNet.alpha_
print(f'First Try: l1 = {l1}, alpha = {alpha}')

First Try: l1 = 0.1, alpha = 0.03


In [53]:
elasticNet = ElasticNetCV(l1_ratio=(l1*ranging), alphas=(alpha*ranging), max_iter=100000, cv=10)
elasticNet.fit(X_train, y_train)
l1 = elasticNet.l1_ratio_
alpha = elasticNet.alpha_
print(f'Second Try: l1 = {l1}, alpha = {alpha}')

Second Try: l1 = 0.0799560546875, alpha = 0.040496826171875


In [54]:
elasticNet = ElasticNetCV(l1_ratio=(l1*ranging), alphas=(alpha*ranging), max_iter=100000, cv=10)
elasticNet.fit(X_train, y_train)
l1 = elasticNet.l1_ratio_
alpha = elasticNet.alpha_
print(f'Third Try: l1 = {l1}, alpha = {alpha}')

Third Try: l1 = 0.07598876953125, alpha = 0.042510986328125


## Train and Validation Prediction

In [63]:
lasso_stat = [rmse_train(lasso).mean(), rmse_test(lasso).mean()]
print(lasso_stat)

[0.1092846060226084, 0.11053330693666312]


In [64]:
ridge_stat = [rmse_train(ridge).mean(), rmse_test(ridge).mean()]
print(ridge_stat)

[0.11067888377500676, 0.11559875843961358]


In [65]:
elasticNet_stat = [rmse_train(elasticNet).mean(), rmse_test(elasticNet).mean()]
print(elasticNet_stat)

[0.10886286387175903, 0.11129260347211634]


In [76]:
df = pd.DataFrame([ridge_stat, lasso_stat, elasticNet_stat], 
                  columns=['rmse train mean', 'rmse test mean'],
                 index=['RidgeCv', 'LassoCV', 'ElasticNetCV'])
df

Unnamed: 0,rmse train mean,rmse test mean
RidgeCv,0.110679,0.115599
LassoCV,0.109285,0.110533
ElasticNetCV,0.108863,0.111293


## Submission

In [99]:
test = pd.read_csv('test_new.csv')
test = pd.DataFrame(stdSc.transform(test), columns=test.columns, index=test.index)
prediction = elasticNet.predict(test)
prediction

array([11.70529752, 11.97940076, 12.11069368, ..., 12.01758014,
       11.71059091, 12.34404321])

In [100]:
prediction = np.expm1(prediction)
submission = pd.DataFrame(prediction, columns=['SalePrice'], index=pd.read_csv('test.csv').Id)
submission.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,121211.14113
1462,159435.461602
1463,181804.676242
1464,192883.110479
1465,194835.424921


In [101]:
submission.to_csv('August17_v2.csv')