In [None]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib 
import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.stats.stats import pearsonr

# Load the training and test sets.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# SalePrice: keep the raw and log1p-transformed values side by side for
# inspection, then replace the target with its log1p transform.
prices = pd.DataFrame({'price':train['SalePrice'], 'log(price+1)':np.log1p(train['SalePrice'])})
# prices.hist()
train['SalePrice'] = np.log1p(train['SalePrice'])

# GrLivArea: drop the outlier rows (living area >= 4500) entirely.
# Assigning the filtered Series back to the column would only turn those
# values into NaN through index alignment; the rows would stay in the training
# set and be mean-imputed further down.
train = train[train['GrLivArea'] < 4500].reset_index(drop=True)

# Stack the train and test feature columns so both get identical transforms.
all_data = pd.concat((train.loc[:,"MSSubClass":"SaleCondition"],
                     test.loc[:, "MSSubClass":"SaleCondition"]))

# log1p-transform numeric features whose skewness on the training rows
# exceeds 0.5. Selecting by numeric dtype (rather than "not object") keeps
# string columns out even when they are not stored with the object dtype.
numeric_feats = all_data.select_dtypes(include=[np.number]).columns
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[skewed_feats > 0.5]
skewed_feats = skewed_feats.index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

# One-hot encode the categorical columns, then fill the remaining NaNs with
# the column mean (computed over the stacked train + test rows).
all_data = pd.get_dummies(all_data)
all_data = all_data.fillna(all_data.mean())

# Split back into train / test by position; train rows come first in all_data.
X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train["SalePrice"]





## Models

In [None]:
from sklearn.linear_model import Ridge, RidgeCV, ElasticNetCV, LassoCV, LassoLarsCV
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import make_pipeline 

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


def rmse_cv(model):
    """Return the per-fold RMSE of `model` under 5-fold cross-validation.

    The model is scored on the notebook-level `X_train` / `y` with the
    'neg_mean_squared_error' scorer; the scores are negated and square-rooted,
    giving one RMSE value per fold.
    """
    neg_mse_per_fold = cross_val_score(
        model,
        X_train,
        y,
        scoring='neg_mean_squared_error',
        cv=5,
    )
    return np.sqrt(-neg_mse_per_fold)


"""
    Ridge
"""
# Disabled experiment: mean cross-validated RMSE (via rmse_cv) of Ridge for
# each candidate alpha, plotted against alpha.
# model_ridge = Ridge()
# alphas = [0.03, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 100]
# cv_ridge = [rmse_cv(Ridge(alpha=alpha)).mean() for alpha in alphas]

# cv_ridge = pd.Series(cv_ridge, index=alphas)
# cv_ridge.plot(title = 'Validation - Just Do it')
# plt.xlabel('alpha')
# plt.ylabel('rmse')


"""
    Lasso
"""
# Disabled experiment: LassoCV over an explicit alpha grid, followed by a
# submission file built from expm1 of the predictions.
# alphas = [1,0.3,0.1,0.03,0.01,0.003,0.001,0.0003,0.0001,0.00003,0.00001,0.000003]
# model_lasso = LassoCV(alphas=alphas, max_iter=50000).fit(X_train, y)
# rmse_cv(model_lasso).mean()
# preds = model_lasso.predict(X_test)
# solution = pd.DataFrame({'id':test.Id, 'SalePrice':np.expm1(preds)})
# solution.to_csv('linear_col.csv', index=False)

# Disabled experiment: one LassoCV fit per value in `alphas`, printing the
# per-fold and mean RMSE, then a submission from a single chosen value.
# NOTE(review): these snippets pass the number as LassoCV's first positional
# argument. In scikit-learn that parameter is `eps`, not a single alpha -
# confirm the intent before re-enabling.
# for alpha in alphas:
#     model_lasso = LassoCV(alpha, max_iter=50000).fit(X_train, y)
#     res = rmse_cv(model_lasso)
#     print(alpha, res, res.mean())
# model_lasso = LassoCV(0.00001, max_iter=50000).fit(X_train, y)
# preds = model_lasso.predict(X_test)
# solution = pd.DataFrame({'id':test.Id, 'SalePrice':np.expm1(preds)})
# solution.to_csv('submission_2019_12_18.csv', index=False)

# Final model: robust-scaled features fed into a cross-validated Lasso.
# LassoCV's leading parameter is `eps` (the alpha_min / alpha_max ratio of the
# automatically generated regularisation path), not a single alpha, and
# current scikit-learn releases only accept it by keyword; a positional value
# raises TypeError. Passing eps=... explicitly keeps the model that older
# versions actually fitted and states what the number controls.
model_lasso = make_pipeline(RobustScaler(), LassoCV(eps=0.00001, max_iter=50000))
model_lasso = model_lasso.fit(X_train, y)

# Per-fold RMSE on the log1p target.
print(rmse_cv(model_lasso))

# Predictions are in log1p(SalePrice) space; expm1 maps them back to prices
# before the submission file is written.
preds = model_lasso.predict(X_test)
solution = pd.DataFrame({'id':test.Id, 'SalePrice':np.expm1(preds)})
solution.to_csv('submission_2019_12_18_2.csv', index=False)