In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pp
import seaborn as sns

from sklearn.linear_model import RidgeCV, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, make_scorer

from scipy.stats import skew
%matplotlib inline

# 1. Import Dataset

In [None]:
# Read the labelled training data and the unlabelled test data from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Preview the first rows of the raw training data.
train.head()

In [None]:
# Report (rows, columns) of the training frame before any filtering.
print(f'Train shape : {train.shape}')

# 2. Outliers

Remove outliers: houses with a very high sale price (600k or more) or a very large living area (4000 or more in `GrLivArea`).

In [None]:
# Scatter SalePrice (x) against GrLivArea (y) to spot outliers visually.
sns.scatterplot(data=train, x='SalePrice', y='GrLivArea')

In [None]:
# Outlier removal, step 1: keep only rows whose SalePrice is below 600k.
price_in_range = train['SalePrice'] < 600000
train = train[price_in_range]

In [None]:
# Outlier removal, step 2: keep only rows whose GrLivArea is below 4000.
train = train.loc[train['GrLivArea'].lt(4000)]

In [None]:
# Report the shape again so the number of dropped rows can be compared with the earlier print.
print(f'Removed outliers, train shape : {train.shape}')

# 3. Study dataset

In [None]:
# Pairwise correlation matrix of the numeric columns only.
# The raw frame also holds string (object) columns; pandas >= 2.0 raises on
# those instead of silently dropping them, so select numeric columns explicitly.
corr = train.select_dtypes(include='number').corr()

In [None]:
# Visualise the full correlation matrix as a heatmap.
sns.heatmap(corr)

In [None]:
# Order the correlation matrix rows by their correlation with SalePrice (strongest first)
# and show the 11 highest entries.
corr = corr.sort_values(by='SalePrice', ascending=False)
corr['SalePrice'].head(11)

In [None]:
# Labels of the first 11 rows of the sorted SalePrice correlations
# (presumably SalePrice itself followed by its 10 most correlated features).
saleprice_corr = corr['SalePrice']
top10_col = saleprice_corr.iloc[:11].index
print(top10_col)

In [None]:
# Inspect dtypes and non-null counts of the selected columns.
train[top10_col].info()

In [None]:
# Keep only the selected columns. Take an explicit copy: new columns are
# assigned to this frame later, and assigning into a slice of another frame
# triggers SettingWithCopyWarning and may not behave as intended.
train = train[top10_col].copy()

In [None]:
# Preview the reduced training frame.
train.head()

In [None]:
# Display the selected labels without SalePrice. The result is not assigned,
# so top10_col itself still contains SalePrice.
top10_col.drop('SalePrice')

In [None]:
# Add polynomial variants of every feature (everything except the target):
# square, cube and square root, appended as <name>_s2 / _s3 / _sqrt.
feature_cols = train.columns.drop('SalePrice')
for col in feature_cols:
    col_values = train[col]
    train[col + '_s2'] = col_values ** 2
    train[col + '_s3'] = col_values ** 3
    train[col + '_sqrt'] = np.sqrt(col_values)

In [None]:
# List all columns, including the newly derived ones.
train.columns

# 5. Check skewness and log-transform skewed features

In [None]:
# Sample skewness of every column (scipy.stats.skew applied column-wise).
skewness = train.apply(skew)

In [None]:
# Keep only the columns whose absolute skewness exceeds 0.5.
is_skewed = skewness.abs() > 0.5
skewness = skewness[is_skewed]
print(skewness)

In [None]:
# Number of columns that passed the skewness threshold.
n_skewed = len(skewness)
print(str(n_skewed) + " skewed numerical features to log transform")

In [None]:
# Replace every skewed column by log(1 + x). This covers whatever is listed in
# `skewness`, which includes SalePrice if it passed the threshold.
skewed_features = skewness.index
train[skewed_features] = train[skewed_features].apply(np.log1p)

In [None]:
# Separate the target from the features. Not re-runnable: after this cell
# `train` no longer has a SalePrice column.
y = train['SalePrice']
train = train.drop('SalePrice', axis=1)

In [None]:
# Preview the final feature matrix (target removed).
train.head()

# 6. Ridge model 

In [None]:
# Hold out 20% of the labelled rows for validation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    y,
    test_size=0.2,
    random_state=0,
)
for split_label, split_data in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print(f'{split_label} : {split_data.shape}')

In [None]:
# Preview the training split before scaling.
X_train.head()

In [None]:
# Standardize numerical features.
# The scaler is fitted on the training split only; the validation split is
# transformed with those same statistics. Re-fitting on X_test would scale the
# two splits differently and leak validation statistics into preprocessing.
stdSc = StandardScaler()
X_train = pd.DataFrame(stdSc.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(stdSc.transform(X_test), columns=X_test.columns, index=X_test.index)

In [None]:
# X_train = X_train.drop(columns=['SalePrice'])

In [None]:
# Preview the training split after standardization.
X_train.head()

In [None]:
# Preview the validation split after standardization.
X_test.head()

In [None]:
# MSE wrapped as a scikit-learn scorer; greater_is_better=False makes the
# scorer return negated values, which the helpers below flip back.
scorer = make_scorer(mean_squared_error, greater_is_better = False)


def _cv_rmse(model, features, target):
    """Return the per-fold RMSE of `model` from 10-fold cross-validation on the given data."""
    neg_mse_per_fold = cross_val_score(model, features, target, scoring = scorer, cv = 10)
    return np.sqrt(-neg_mse_per_fold)


def rmse_cv_train(model):
    """Per-fold RMSE from 10-fold cross-validation of `model` on X_train / y_train."""
    return _cv_rmse(model, X_train, y_train)


def rmse_cv_test(model):
    """Per-fold RMSE from 10-fold cross-validation of `model` on X_test / y_test.

    Note: cross_val_score is run on the hold-out split itself, so this does
    not score predictions of an already-fitted model.
    """
    return _cv_rmse(model, X_test, y_test)

In [None]:
# Coarse search: let RidgeCV choose the best regularization strength from a wide grid.
coarse_alphas = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
ridge = RidgeCV(alphas=coarse_alphas)
ridge.fit(X_train, y_train)

alpha = ridge.alpha_
print("Best alpha :", alpha)

In [None]:
# Candidate alphas from 0.6x up to (but excluding) 1.4x the current best, in 0.05 steps.
alpha_multipliers = np.arange(0.6, 1.4, 0.05)
alpha_list = list(alpha_multipliers * alpha)

In [None]:
# Round the candidates to 4 decimals but keep them numeric: RidgeCV needs
# float alphas, and formatting them with '%.4f' would produce strings.
alpha_list = [round(float(elem), 4) for elem in alpha_list]
print(alpha_list)

In [None]:
# Finer search around the previous best alpha, scored with 10-fold cross-validation.
ridge = RidgeCV(alphas=alpha_list, cv=10)

In [None]:
# Fit on the training split and record the alpha selected by cross-validation.
alpha = ridge.fit(X_train, y_train).alpha_
print("Best alpha :", alpha)

In [None]:
# Third, hand-picked grid: 0.05x to just under 0.95x of 0.006 in 0.05 steps.
# Values are rounded to 4 decimals but kept as floats, because RidgeCV cannot
# work with alphas formatted as strings.
alpha_list = [round(float(elem), 4) for elem in 0.006 * np.arange(0.05, 0.95, 0.05)]
print(alpha_list)

In [None]:
# Re-create the estimator for the latest alpha grid, again with 10-fold cross-validation.
ridge = RidgeCV(cv=10, alphas=alpha_list)

In [None]:
# Final ridge fit; `alpha` is the regularization strength chosen from the latest grid.
fitted_ridge = ridge.fit(X_train, y_train)
alpha = fitted_ridge.alpha_
print("Best alpha :", alpha)

In [None]:
# --- Scores -----------------------------------------------------------------
# Mean of the per-fold cross-validated RMSE on each split.
ridge_rmse_train = rmse_cv_train(ridge).mean()
ridge_rmse_test = rmse_cv_test(ridge).mean()
print("Ridge RMSE on Training set :", ridge_rmse_train)
print("Ridge RMSE on Test set :", ridge_rmse_test)

# Predictions of the fitted model on both splits, used by the plots below.
y_train_rdg = ridge.predict(X_train)
y_test_rdg = ridge.predict(X_test)

ridge_title = "Linear regression with Ridge regularization"
ridge_series = [
    (y_train_rdg, y_train, "blue", "Training data"),
    (y_test_rdg, y_test, "lightgreen", "Validation data"),
]

# --- Residual plot: predicted value vs. (predicted - actual) ------------------
for predicted, actual, colour, legend_label in ridge_series:
    pp.scatter(predicted, predicted - actual, c=colour, marker="s", label=legend_label)
pp.title(ridge_title)
pp.xlabel("Predicted values")
pp.ylabel("Residuals")
pp.legend(loc="upper left")
pp.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
pp.show()

# --- Prediction plot: predicted vs. actual, with the identity line in red -----
for predicted, actual, colour, legend_label in ridge_series:
    pp.scatter(predicted, actual, c=colour, marker="s", label=legend_label)
pp.title(ridge_title)
pp.xlabel("Predicted values")
pp.ylabel("Real values")
pp.legend(loc="upper left")
pp.plot([10.5, 13.5], [10.5, 13.5], c="red")
pp.show()

# --- Coefficients: the 10 most negative and the 10 most positive ---------------
coefs = pd.Series(ridge.coef_, index=X_train.columns)
n_nonzero = sum(coefs != 0)
n_zero = sum(coefs == 0)
print("Ridge picked " + str(n_nonzero) + " features and eliminated the other " +
      str(n_zero) + " features")
coefs_ascending = coefs.sort_values()
imp_coefs = pd.concat([coefs_ascending.head(10), coefs_ascending.tail(10)])
imp_coefs.plot(kind="barh")
pp.title("Coefficients in the Ridge Model")
pp.show()

# 7. Test set prediction

In [None]:
# Load the unlabelled test data into its own frame and preview it.
df_test = pd.read_csv('test.csv')
df_test.head()

In [None]:
# Report (rows, columns) of the raw test frame.
print(f'Test dataset shape: {df_test.shape}')

In [None]:
# Restrict the test frame to the selected feature columns. top10_col[1:] skips
# the first label, which is assumed to be SalePrice (absent from test.csv).
test_feature_cols = top10_col[1:]
df_test = df_test.reindex(columns=test_feature_cols)
df_test.head()

In [None]:
# Count missing values per selected test column and bar-plot only the columns that have any.
null_sum = df_test.isnull().sum()
null_sum[null_sum > 0].plot.bar()

In [None]:
# Impute missing values in the test features: the most frequent value for
# object (string) columns, the column mean otherwise.
# Uses Series.fillna rather than replace(np.NAN, ...): the np.NAN alias was
# removed in NumPy 2.0, and the object branch used to assign a one-column
# DataFrame to the column.
null_columns = null_sum[null_sum > 0].index.tolist()

for c in null_columns:
    if df_test[c].dtype == 'object':
        fill_value = df_test[c].mode()[0]
    else:
        fill_value = df_test[c].mean()
    df_test[c] = df_test[c].fillna(fill_value)

# Sanity check: prints an empty Series when no missing values remain.
remaining_nulls = df_test.isnull().sum()
print(remaining_nulls[remaining_nulls > 0])

In [None]:
# Derive the same polynomial variants as for the training data: square, cube
# and square root of each base column, appended as <name>_s2 / _s3 / _sqrt.
base_features = list(df_test.columns)
for feature in base_features:
    feature_values = df_test[feature]
    df_test[feature + '_s2'] = feature_values ** 2
    df_test[feature + '_s3'] = feature_values ** 3
    df_test[feature + '_sqrt'] = np.sqrt(feature_values)

In [None]:
# List the test columns after adding the derived features.
print(df_test.columns)

In [None]:
# Columns to log-transform in the test set: every skewed training column
# except the target. SalePrice is removed by label rather than by position,
# so a real feature is never dropped if SalePrice is not the first entry.
skewed_features = skewness.index
skewed_features = skewed_features[skewed_features != 'SalePrice']
print(f'Features to be skewed: {skewed_features}')

In [None]:
# Apply log(1 + x) to the listed test columns.
df_test[skewed_features] = df_test[skewed_features].apply(np.log1p)

In [None]:
# Preview the test features after the log transform.
df_test.head()

In [None]:
# Scale the test features with the already-fitted scaler. Calling
# fit_transform here would re-fit it on the unlabeled test data, giving a
# different feature scaling than the one the models were trained with.
df_test = pd.DataFrame(stdSc.transform(df_test), columns=df_test.columns, index=df_test.index)

In [None]:
# Preview the standardized test features.
df_test.head()

In [None]:
# Predict the (transformed) target for the test set with the fitted ridge model.
test_pred = ridge.predict(df_test)
print(test_pred)

In [None]:
# Invert log1p with expm1 to get predictions on the original price scale
# (assumes SalePrice was among the log-transformed columns during training).
test_pred = np.expm1(test_pred)
print(test_pred)

In [None]:
# Assemble the submission: Id from test.csv next to the predicted SalePrice.
test_ids = pd.read_csv('test.csv').Id
predicted_prices = pd.DataFrame(test_pred, columns=['SalePrice'])
submission = pd.concat([test_ids, predicted_prices], axis=1)
submission.head()

In [None]:
# submission.to_csv('Prediction2.csv',index=False)

In [None]:
# pd.read_csv('Prediction2.csv').head()

# LASSO (L1) penalty 

In [None]:
from sklearn.linear_model import LassoCV

In [None]:
# 3* Lasso
# Stage 1: coarse search over a wide alpha grid with 10-fold cross-validation.
lasso = LassoCV(alphas = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
                          0.3, 0.6, 1],
                max_iter = 100000, cv = 10, tol = 0.001)
lasso.fit(X_train, y_train)
alpha = lasso.alpha_
print("Best alpha :", alpha)

# Stage 2: refine around the best alpha with an even grid of multipliers,
# 0.60, 0.65, ..., 1.40. Built from integers so the steps are exact; the
# previous hand-written list skipped the 1.20 multiplier.
print("Try again for more precision with alphas centered around " + str(alpha))
refine_multipliers = [m / 100 for m in range(60, 145, 5)]
lasso = LassoCV(alphas = [alpha * m for m in refine_multipliers],
                max_iter = 100000, cv = 10, tol = 0.001)
lasso.fit(X_train, y_train)

alpha = lasso.alpha_
print("Best alpha :", alpha)

# Mean per-fold cross-validated RMSE on each split, then predictions for the plots.
print("Lasso RMSE on Training set :", rmse_cv_train(lasso).mean())
print("Lasso RMSE on Test set :", rmse_cv_test(lasso).mean())
y_train_las = lasso.predict(X_train)
y_test_las = lasso.predict(X_test)

# Plot residuals (predicted - actual) against the predicted value
pp.scatter(y_train_las, y_train_las - y_train, c = "blue", marker = "s", label = "Training data")
pp.scatter(y_test_las, y_test_las - y_test, c = "lightgreen", marker = "s", label = "Validation data")
pp.title("Linear regression with Lasso regularization")
pp.xlabel("Predicted values")
pp.ylabel("Residuals")
pp.legend(loc = "upper left")
pp.hlines(y = 0, xmin = 10.5, xmax = 13.5, color = "red")
pp.show()

# Plot predictions against the real values; the red line is the identity
pp.scatter(y_train_las, y_train, c = "blue", marker = "s", label = "Training data")
pp.scatter(y_test_las, y_test, c = "lightgreen", marker = "s", label = "Validation data")
pp.title("Linear regression with Lasso regularization")
pp.xlabel("Predicted values")
pp.ylabel("Real values")
pp.legend(loc = "upper left")
pp.plot([10.5, 13.5], [10.5, 13.5], c = "red")
pp.show()

# Plot important coefficients: the 10 most negative and the 10 most positive
coefs = pd.Series(lasso.coef_, index = X_train.columns)
print("Lasso picked " + str(sum(coefs != 0)) + " features and eliminated the other " +  \
      str(sum(coefs == 0)) + " features")
coefs_sorted = coefs.sort_values()
imp_coefs = pd.concat([coefs_sorted.head(10),
                     coefs_sorted.tail(10)])
imp_coefs.plot(kind = "barh")
pp.title("Coefficients in the Lasso Model")
pp.show()

In [None]:
# Predict the (transformed) target for the test set with the fitted lasso model.
test_pred = lasso.predict(df_test)

In [None]:
# Invert log1p with expm1 to return to the original price scale
# (assumes SalePrice was among the log-transformed columns during training).
test_pred = np.expm1(test_pred)

In [None]:
# Assemble the lasso submission: Id from test.csv next to the predicted SalePrice.
lasso_ids = pd.read_csv('test.csv').Id
lasso_prices = pd.DataFrame(test_pred, columns=['SalePrice'])
submission = pd.concat([lasso_ids, lasso_prices], axis=1)
submission.head()

In [None]:
# Write the current submission frame to disk without the index column.
submission.to_csv('Prediction3.csv',index=False)

In [None]:
from sklearn.linear_model import ElasticNetCV

In [None]:
# 4* ElasticNet
# Three-stage search: (1) coarse grid over l1_ratio and alpha, (2) refine
# l1_ratio around the best value, (3) refine alpha with l1_ratio fixed.
coarse_alphas = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006,
                 0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6]

# Stage 1: coarse grid.
elasticNet = ElasticNetCV(l1_ratio = [0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 1],
                          alphas = coarse_alphas,
                          max_iter = 100000, cv = 10)
elasticNet.fit(X_train, y_train)
alpha = elasticNet.alpha_
ratio = elasticNet.l1_ratio_
print("Best l1_ratio :", ratio)
print("Best alpha :", alpha )

# Stage 2: refine l1_ratio. l1_ratio must lie in [0, 1], so candidates are
# capped at 1 *before* fitting (and de-duplicated); clamping the fitted
# attribute afterwards would not change what the estimator was trained with.
print("Try again for more precision with l1_ratio centered around " + str(ratio))
ratio_candidates = sorted({min(ratio * m, 1.0) for m in (.85, .9, .95, 1, 1.05, 1.1, 1.15)})
elasticNet = ElasticNetCV(l1_ratio = ratio_candidates,
                          alphas = coarse_alphas,
                          max_iter = 100000, cv = 10)
elasticNet.fit(X_train, y_train)
alpha = elasticNet.alpha_
ratio = elasticNet.l1_ratio_
print("Best l1_ratio :", ratio)
print("Best alpha :", alpha )

# Stage 3: refine alpha with an even grid of multipliers 0.60, 0.65, ..., 1.40
# (built from integers so the steps are exact; the previous hand-written list
# skipped the 1.20 multiplier).
print("Now try again for more precision on alpha, with l1_ratio fixed at " + str(ratio) +
      " and alpha centered around " + str(alpha))
refine_multipliers = [m / 100 for m in range(60, 145, 5)]
elasticNet = ElasticNetCV(l1_ratio = ratio,
                          alphas = [alpha * m for m in refine_multipliers],
                          max_iter = 100000, cv = 10)
elasticNet.fit(X_train, y_train)
alpha = elasticNet.alpha_
ratio = elasticNet.l1_ratio_
print("Best l1_ratio :", ratio)
print("Best alpha :", alpha )

# Mean per-fold cross-validated RMSE on each split, then predictions for the plots.
print("ElasticNet RMSE on Training set :", rmse_cv_train(elasticNet).mean())
print("ElasticNet RMSE on Test set :", rmse_cv_test(elasticNet).mean())
y_train_ela = elasticNet.predict(X_train)
y_test_ela = elasticNet.predict(X_test)

# Plot residuals (predicted - actual) against the predicted value
pp.scatter(y_train_ela, y_train_ela - y_train, c = "blue", marker = "s", label = "Training data")
pp.scatter(y_test_ela, y_test_ela - y_test, c = "lightgreen", marker = "s", label = "Validation data")
pp.title("Linear regression with ElasticNet regularization")
pp.xlabel("Predicted values")
pp.ylabel("Residuals")
pp.legend(loc = "upper left")
pp.hlines(y = 0, xmin = 10.5, xmax = 13.5, color = "red")
pp.show()

# Plot predictions: predicted values on x, real values on y, matching the
# axis labels and the Ridge/Lasso plots (the arguments used to be swapped).
pp.scatter(y_train_ela, y_train, c = "blue", marker = "s", label = "Training data")
pp.scatter(y_test_ela, y_test, c = "lightgreen", marker = "s", label = "Validation data")
pp.title("Linear regression with ElasticNet regularization")
pp.xlabel("Predicted values")
pp.ylabel("Real values")
pp.legend(loc = "upper left")
pp.plot([10.5, 13.5], [10.5, 13.5], c = "red")
pp.show()

# Plot important coefficients: the 10 most negative and the 10 most positive
coefs = pd.Series(elasticNet.coef_, index = X_train.columns)
print("ElasticNet picked " + str(sum(coefs != 0)) + " features and eliminated the other " +  str(sum(coefs == 0)) + " features")
coefs_sorted = coefs.sort_values()
imp_coefs = pd.concat([coefs_sorted.head(10),
                     coefs_sorted.tail(10)])
imp_coefs.plot(kind = "barh")
pp.title("Coefficients in the ElasticNet Model")
pp.show()

In [None]:
# Predict the (transformed) target for the test set with the fitted ElasticNet model.
test_pred = elasticNet.predict(df_test)

In [None]:
# Invert log1p with expm1 to return to the original price scale
# (assumes SalePrice was among the log-transformed columns during training).
test_pred = np.expm1(test_pred)

In [None]:
# Assemble the ElasticNet submission: Id from test.csv next to the predicted SalePrice.
enet_ids = pd.read_csv('test.csv').Id
enet_prices = pd.DataFrame(test_pred, columns=['SalePrice'])
submission = pd.concat([enet_ids, enet_prices], axis=1)
submission.head()

In [None]:
# Write the current submission frame to disk without the index column.
submission.to_csv('Prediction4.csv',index=False)