# Introduction
**This will be your workspace for Kaggle's Machine Learning education track.**

You will build and continually improve a model to predict housing prices as you work through each tutorial.  Fork this notebook and write your code in it.

The data from the tutorial, the Melbourne data, is not available in this workspace.  You will need to translate the concepts to work with the data in this notebook, the Iowa data.

Come to the [Learn Discussion](https://www.kaggle.com/learn-forum) forum for any questions or comments. 

# Write Your Code Below



In [None]:
import pandas as pd

# Read the Iowa training set and print the summary table produced by describe().
main_file_path = '../input/train.csv'
data = pd.read_csv(filepath_or_buffer=main_file_path)
print(data.describe())

In [None]:
# List the column labels of the training frame.
print(data.columns)

In [None]:
# NOTE: despite the name, no .copy() is made here; the variable is bound to the
# SalePrice column taken straight from `data`.
data_price_copy = data['SalePrice']
print(data['SalePrice'].head())

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Target is the sale price; predictors are seven numeric columns
# (the variable is still called `five_columns`).
y = data['SalePrice']
five_columns = data.loc[:, ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF',
                            'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']]
X = five_columns

# Fit a decision tree on every labelled row (no hold-out set in this cell).
house_price_model = DecisionTreeRegressor()
house_price_model.fit(X, y)

# Show the first rows next to the model's predictions for them.
print('make some predictions:')
print(X.head())
print(house_price_model.predict(X.head()))

In [None]:
# Print the target Series (SalePrice) in full.
print(y)

In [None]:
from sklearn.metrics import mean_absolute_error

# In-sample error: the tree is scored on the same rows it was fitted on.
predict_house_price = house_price_model.predict(X)
mean_absolute_error(y_true=y, y_pred=predict_house_price)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out a validation set so the error is measured on rows the tree has not seen.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Seed the tree too (as get_mae does): an unseeded DecisionTreeRegressor can give
# a different validation MAE on every run, which makes comparisons meaningless.
house_price_model2 = DecisionTreeRegressor(random_state=0)
house_price_model2.fit(train_X, train_y)

# The bare last expression is what the notebook displays: MAE on the validation rows.
predict_house_price2 = house_price_model2.predict(val_X)
mean_absolute_error(val_y, predict_house_price2)

In [None]:
def get_mae(max_leaf_nodes, predictors_train, predictors_val, targ_train, targ_val):
    """Return the validation MAE of a decision tree limited to ``max_leaf_nodes`` leaves.

    The tree is built with ``random_state=0``, fitted on
    ``predictors_train`` / ``targ_train`` and scored with ``mean_absolute_error``
    against ``targ_val`` using its predictions for ``predictors_val``.
    """
    tree = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    tree.fit(predictors_train, targ_train)
    return mean_absolute_error(targ_val, tree.predict(predictors_val))

In [None]:
# Compare the validation error of trees with increasing leaf budgets.
for max_leaf_nodes in (5, 50, 500, 5000):
    my_mae = get_mae(
        max_leaf_nodes,
        predictors_train=train_X,
        predictors_val=val_X,
        targ_train=train_y,
        targ_val=val_y,
    )
    print("Max leaf nodes are %d:       Mean absolute error:%d" % (max_leaf_nodes, my_mae))

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest: bootstrap sampling and feature selection are random, so an
# unseeded forest reports a different validation MAE on every run.
forest_model = RandomForestRegressor(random_state=0)
forest_model.fit(train_X, train_y)

# Score on the held-out validation rows.
preds_y = forest_model.predict(val_X)
mae = mean_absolute_error(val_y, preds_y)
print(mae)

In [None]:
# Refit the forest on every labelled row before predicting the competition test set.
forest_model.fit(X, y)

# Read the test file and select the same predictor columns the model was fitted on.
test = pd.read_csv('../input/test.csv')
test_X = test[X.columns]

test_y = forest_model.predict(test_X)
my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': test_y})

# Write the submission file.
my_submission.to_csv('submission.csv', index=False)

In [None]:
# ----------------------------------------------------
# Level2
# ----------------------------------------------------

In [None]:
# import 
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer

In [None]:
# Load the data and keep only the numeric predictors.
train = pd.read_csv('../input/train.csv')
test_org = pd.read_csv('../input/test.csv')

train_target = train['SalePrice']

# Drop the target and the Id column, then discard object-typed columns.
train_predictors = (
    train
    .drop(['SalePrice', 'Id'], axis=1)
    .select_dtypes(exclude=['object'])
)
print(train_predictors.columns)

# Restrict the test frame to exactly the predictor columns kept from train.
test = test_org[train_predictors.columns]
print(test.head())

In [None]:
# Names of the columns that contain at least one missing value, for train and for test.
train_cols_with_miss = [
    name for name, has_gap in train_predictors.isnull().any().items() if has_gap
]
print(train_cols_with_miss)
test_cols_with_miss = [
    name for name, has_gap in test.isnull().any().items() if has_gap
]
print(test_cols_with_miss)

In [None]:
# Use the Imputer class to impute missing values

In [None]:
# Work on copies so the raw predictor frames stay untouched.
imputed_X_train_plus = train_predictors.copy()
imputed_X_test_plus = test.copy()

# For every column with gaps in train, add a boolean "<col>_with_missing" column
# to both frames recording which rows were missing before imputation.
for col in train_cols_with_miss:
    flag_name = col + '_with_missing'
    imputed_X_train_plus[flag_name] = train_predictors[col].isnull()
    imputed_X_test_plus[flag_name] = test[col].isnull()

print(imputed_X_train_plus.shape)
print(imputed_X_train_plus.columns)

# Fit the imputer on the training frame only, then apply it to both frames.
my_imputer = Imputer()
my_imputer.fit(imputed_X_train_plus)
imputed_X_train = my_imputer.transform(imputed_X_train_plus)
imputed_X_test = my_imputer.transform(imputed_X_test_plus)

# print(imputed_X_train)

# print(imputed_X_test)

def get_mae(n_estimators, train_X, train_y, val_X, val_y):
    """Return the validation MAE of a model built by ``train_model``.

    The model is trained with ``train_model(n_estimators, train_X, train_y)``,
    its predictions for ``val_X`` are obtained through ``get_preds`` and scored
    against ``val_y`` with ``mean_absolute_error``.

    NOTE: this definition replaces the earlier ``get_mae`` in this notebook,
    which takes (max_leaf_nodes, predictors_train, predictors_val, targ_train,
    targ_val); the argument order here is different.
    """
    fitted = train_model(n_estimators, train_X, train_y)
    return mean_absolute_error(val_y, get_preds(fitted, val_X))

def train_model(n_estimators, train_X, train_y, random_state=None):
    """Fit and return a ``RandomForestRegressor`` with ``n_estimators`` trees.

    Parameters
    ----------
    n_estimators : number of trees in the forest.
    train_X, train_y : predictors and target the forest is fitted on.
    random_state : optional seed forwarded to the forest. The default ``None``
        keeps the previous (unseeded) behaviour; pass an int to make runs
        with different ``n_estimators`` comparable and reproducible.

    Returns
    -------
    The fitted model.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(train_X, train_y)
    return model

def get_preds(model, val_X):
    """Return whatever ``model.predict`` yields for ``val_X``."""
    return model.predict(val_X)

# Re-attach the column labels to the imputed training values.
imputed_X_train = pd.DataFrame(imputed_X_train, columns=imputed_X_train_plus.columns)

# Add non-numeric features: one-hot encode a hand-picked set of categorical columns.
some_cols = [
    'MSZoning', 'Utilities', 'Condition1', 'BldgType', 'HouseStyle', 'MasVnrType',
    'HeatingQC', 'CentralAir', 'KitchenQual', 'PoolQC', 'MiscFeature',
]
imputed_X_train = pd.concat(
    [imputed_X_train, pd.get_dummies(train[some_cols])],
    axis=1,
)
print(imputed_X_train.shape)

# Split the labelled data into training and validation parts to choose the number
# of estimators; the final model is trained on all of the data afterwards.
train_X, val_X, train_y, val_y = train_test_split(imputed_X_train, train_target, random_state=0)

# Candidate forest sizes: 10, 15, ..., 50.
for n in range(10, 51, 5):
    mae = get_mae(n, train_X, train_y, val_X, val_y)
    print("n = %d: \t\tmae value:%d" % (n, mae))

# model = train_model(10, train_X, train_y)
# preds = get_preds(model, val_X)
# print(preds)
# mae = mean_absolute_error(val_y, preds)


In [None]:
# sub_data = pd.read_csv('submission2.csv')
# print(sub_data)

In [None]:
# Final model: 50 trees trained on every labelled row.
model = train_model(50, imputed_X_train, train_target)

# Re-attach the column labels to the imputed test values.
imputed_X_test = pd.DataFrame(imputed_X_test, columns=imputed_X_test_plus.columns)

# One-hot encode train and test separately, then align test to train's dummy
# columns (join='left'); dummy columns that test lacks come back as NaN and are set to 0.
train_dummies = pd.get_dummies(train[some_cols])
test_dummies = pd.get_dummies(test_org[some_cols])
train_dummies, test_dummies = train_dummies.align(test_dummies, join='left', axis=1)
test_dummies = test_dummies.fillna(0)
print(test_dummies.isnull().head())

imputed_X_test = pd.concat([imputed_X_test, test_dummies], axis=1)
preds = model.predict(imputed_X_test)

# my_submission = pd.DataFrame({'Id': test_org.Id, 'SalePrice': preds})
# my_submission.to_csv('submission4.csv', index=False)

In [None]:
###########################################
# Add non-numeric features #
###########################################

In [None]:
# Pair each test Id with its predicted price and write the submission file.
my_submission = test_org[['Id']].assign(SalePrice=preds)
my_submission.to_csv('submission4.csv', index=False)


In [None]:
# train_dum = pd.get_dummies(train[some_cols])
# print(list(train_dum.columns))
# # test_dum = pd.get_dummies(test_org[some_cols])
# # print(test_dum.shape)
# # print(list(train_dum.columns) - list(test_dum.shape))
# set1 = set(list(train_dum))
# set2 = set(list(test_dum))
# print(set1 & set2)

In [None]:
###########################################
# Using XGBoost #
###########################################

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

train_data = pd.read_csv('../input/train.csv')
test_org = pd.read_csv('../input/test.csv')

# Rows without a target cannot be used for training.
train_data = train_data.dropna(axis=0, subset=['SalePrice'])

# Numeric predictors (Id is an identifier, not a feature).
y = train_data.SalePrice
X = train_data.drop(['SalePrice', 'Id'], axis=1).select_dtypes(exclude=['object'])
final_test_data = test_org.drop(['Id'], axis=1).select_dtypes(exclude=['object'])

# Object-typed columns: one-hot encode train and test separately, then align
# test to train's dummy columns.
train_object = pd.get_dummies(train_data.select_dtypes(include=['object']))
test_object = pd.get_dummies(test_org.select_dtypes(include=['object']))
train_object, test_object = train_object.align(test_object, join='left', axis=1)
# Zero-fill ONLY the dummy columns. Filling the whole test frame (as before)
# also replaced missing numeric values with 0, so the imputer below never saw
# them and train/test were imputed differently.
test_object = test_object.fillna(0)

# Combine numeric and encoded categorical parts.
X = pd.concat([X, train_object], axis=1)
final_test_data = pd.concat([final_test_data, test_object], axis=1)
print(final_test_data.head())

# Hold-out split kept for validation experiments. `.values` replaces
# `.as_matrix()`, which no longer exists in current pandas; the seed makes the
# split reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    X.values, y.values, test_size=0.25, random_state=0)

my_imputer = Imputer()
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)

# Fit the imputer on all training rows, reuse it for the test rows, then train
# the final model on the full training set.
X = my_imputer.fit_transform(X)
print(final_test_data.shape)
final_test_data = my_imputer.transform(final_test_data)
my_model.fit(X, y, verbose=True)

In [None]:
# predictions = my_model.predict(test_X)
# print("mae: %d" % (mean_absolute_error(predictions, test_y)))

# Predict the prepared test matrix and write the Id/SalePrice submission file.
predictions = my_model.predict(final_test_data)

submission = test_org[['Id']].assign(SalePrice=predictions)
submission.to_csv('submission5.csv', index=False)

In [None]:
# Visualization

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble.partial_dependence import partial_dependence, plot_partial_dependence
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.ensemble import GradientBoostingRegressor
# from xgboost import XGBRegressor

In [None]:
# Reload both raw CSV files for the partial-dependence section.
train_org, test_org = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

print(train_org.shape)

In [None]:
# Display the labels of all columns whose dtype is not `object`.
train_org.select_dtypes(exclude=['object']).columns

In [None]:
def get_some_data(cols_to_use=None, path='../input/train.csv'):
    """Load a small imputed predictor matrix and the SalePrice target.

    Parameters
    ----------
    cols_to_use : optional list of column names to use as predictors. Defaults
        to ``['MSSubClass', 'LotFrontage', 'OverallQual']``.
    path : CSV file to read. Defaults to the competition training file.

    Returns
    -------
    (X, y) where ``X`` is the result of ``Imputer().fit_transform`` on the
    selected columns and ``y`` is the ``SalePrice`` column of the file.
    """
    if cols_to_use is None:
        cols_to_use = ['MSSubClass', 'LotFrontage', 'OverallQual']
    train_org = pd.read_csv(path)
    y = train_org.SalePrice
    # list(...) so any iterable of names selects a DataFrame, never a single Series.
    X = train_org[list(cols_to_use)]
    my_imputer = Imputer()
    X = my_imputer.fit_transform(X)
    return X, y

In [None]:
X, y = get_some_data()

# Fit a gradient-boosting model on the three predictors returned above.
my_model = GradientBoostingRegressor()
my_model.fit(X, y)

# One partial-dependence panel per predictor column (indices 0, 1, 2 of X),
# labelled with the matching column names.
my_plots = plot_partial_dependence(
    my_model,
    X=X,
    features=[0, 1, 2],
    feature_names=['MSSubClass', 'LotFrontage', 'OverallQual'],
    grid_resolution=10,
)

In [None]:
## Analysis

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline


In [None]:
# Reload the raw training data for the analysis section.
train_org = pd.read_csv('../input/train.csv')

In [None]:
# Per-column missing-value summary: absolute count ('Total') and the count
# divided by the number of rows ('Percent'), both sorted largest first.
null_mask = train_org.isnull()
total = null_mask.sum().sort_values(ascending=False)
count = null_mask.count()
percent = (null_mask.sum() / count).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

In [None]:
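# From the missing-value table: the features with more than 15% missing values are PoolQC, MiscFeature, Alley, Fence, FireplaceQu and LotFrontage. Note that the next cell is stricter than 15%: it drops every column with more than one missing value.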

In [None]:
# Drop every column with more than one missing value. `axis` is passed by
# keyword: the positional form drop(labels, 1) is rejected by current pandas.
train_data = train_org.drop(missing_data[missing_data['Total'] > 1].index, axis=1)
print(train_data.shape)
# 'Electrical' survives the column drop; remove the row(s) where it is missing.
# The mask and the frame it indexes are now the same object (train_data).
train_data = train_data.drop(train_data.loc[train_data['Electrical'].isnull()].index)
# Largest per-column null count left in the frame.
train_data.isnull().sum().max()

In [None]:
#standardizing data
# Take the underlying array before adding the column axis: indexing a pandas
# Series with [:, np.newaxis] is not supported by current pandas.
saleprice_scaled = StandardScaler().fit_transform(train_org['SalePrice'].values[:, np.newaxis])
print(saleprice_scaled.shape)
print(saleprice_scaled[:,0])
# Sort once, then take the ten smallest and ten largest standardized prices.
saleprice_sorted = saleprice_scaled[saleprice_scaled[:,0].argsort()]
low_range = saleprice_sorted[:10]
high_range = saleprice_sorted[-10:]
print('outer range (low) of the distribution:')
print(low_range)
print('\nouter range (high) of the distribution:')
print(high_range)

In [None]:
# Show the two rows with the largest GrLivArea.
train_data.sort_values(by = 'GrLivArea', ascending = False)[:2]

In [None]:

# Remove the two rows whose Id is 1299 or 524.
outlier_rows = train_data.index[train_data['Id'].isin([1299, 524])]
train_data = train_data.drop(outlier_rows)

In [None]:
# Distribution of SalePrice with a fitted normal curve (fit=norm).
sns.distplot(train_data['SalePrice'], fit=norm)
# Open a new figure so the probability plot is not drawn on the histogram's axes.
fig = plt.figure()
# Probability plot of SalePrice, drawn through pyplot on the new figure.
res = stats.probplot(train_data['SalePrice'], plot=plt)

In [None]:
# Work on a copy so the untransformed train_data stays available.
train_data_log = train_data.copy()
# Replace SalePrice by its natural logarithm.
train_data_log['SalePrice'] = np.log(train_data_log['SalePrice'])

# Same two diagnostics as before, now on the log-transformed target.
sns.distplot(train_data_log['SalePrice'], fit=norm)
fig = plt.figure()
res = stats.probplot(train_data_log['SalePrice'], plot=plt)

In [None]:
# Apply the same log transform to 'GrLivArea'.
# NOTE(review): the column is overwritten in place, so re-running this cell
# applies the log a second time.
train_data_log['GrLivArea'] = np.log(train_data_log['GrLivArea'])

In [None]:
# Binary indicator: 1 where TotalBsmtSF is greater than zero, otherwise 0.
train_data_log['hasBsmt'] = (train_data_log['TotalBsmtSF'] > 0).astype('int64')

In [None]:
# print(train_data_log.loc[train_data_log['hasBsmt']==1, 'TotalBsmtSF'].shape)
# print(np.log(train_data_log['TotalBsmtSF']))
# Log-transform TotalBsmtSF only for houses flagged as having a basement.
# The log is computed on those rows alone (the previous version evaluated
# log(0) for every basement-less house and discarded the result), and the
# column is cast to float first so the assignment does not rely on implicit
# int -> float upcasting.
# NOTE(review): the column is overwritten in place; re-running the cell logs it again.
bsmt_rows = train_data_log['hasBsmt'] == 1
train_data_log['TotalBsmtSF'] = train_data_log['TotalBsmtSF'].astype(float)
train_data_log.loc[bsmt_rows, 'TotalBsmtSF'] = np.log(train_data_log.loc[bsmt_rows, 'TotalBsmtSF'])

In [None]:
# Distribution of the positive TotalBsmtSF values with a fitted normal curve.
sns.distplot(train_data_log[train_data_log['TotalBsmtSF']>0]['TotalBsmtSF'], fit=norm);
# New figure for the probability plot of the same values.
fig = plt.figure()
res = stats.probplot(train_data_log[train_data_log['TotalBsmtSF']>0]['TotalBsmtSF'], plot=plt)

In [None]:
from sklearn.model_selection import train_test_split
# Drop rows without a target, then separate target and features.
train_data_log = train_data_log.dropna(axis=0, subset=['SalePrice'])
train_data_log_target = train_data_log.SalePrice
train_data_log_feature = train_data_log.drop(['SalePrice', 'Id'], axis=1)
# Fixed seed: without it every run draws a different split, so the validation
# MAE reported further down cannot be reproduced or compared.
train_X1, val_X1, train_y, val_y = train_test_split(
    train_data_log_feature, train_data_log_target, test_size=0.25, random_state=0)

# Numeric part of each split.
train_X = train_X1.select_dtypes(exclude=['object'])
val_X = val_X1.select_dtypes(exclude=['object'])

from sklearn.preprocessing import Imputer

# Fit the imputer on the training split only and reuse it for validation.
my_imputer = Imputer()
train_data_imputer = my_imputer.fit_transform(train_X)
val_data_imputer = my_imputer.transform(val_X)

# One-hot encode the object columns of each split, align validation to the
# training dummy columns and zero-fill the dummies validation lacks.
train_dum = pd.get_dummies(train_X1.select_dtypes(include=['object']))
val_dum = pd.get_dummies(val_X1.select_dtypes(include=['object']))
train_dum, val_dum = train_dum.align(val_dum, join='left', axis=1)
val_dum = val_dum.fillna(0)

# Re-attach column labels to the imputed arrays (row index restarts at 0).
train_data_imputer = pd.DataFrame(train_data_imputer, columns=train_X.columns)
val_data_imputer = pd.DataFrame(val_data_imputer, columns=val_X.columns)
print(type(train_data_imputer))
print(type(train_dum))

# print(train_data_imputer.shape)
# print(train_val_imputer.shape)

In [None]:
# print(train_dum.head())
# print(train_data_imputer.head())

In [None]:
# train_dum_copy = train_dum.copy()
# Join imputed numeric columns and dummy columns side by side. The dummy
# frames still carry the shuffled row labels from the split, so their index is
# reset to line up with the 0..n-1 index of the imputed frames. drop=True is
# essential: a plain reset_index() would insert the old row labels as an extra
# 'index' column that the model would then train on as if it were a feature.
train_dum_t = pd.concat([train_data_imputer, train_dum.reset_index(drop=True)], axis=1)
val_dum_t = pd.concat([val_data_imputer, val_dum.reset_index(drop=True)], axis=1)

In [None]:
# print(train_dum_copy.head())
# print(train_data_imputer.head())
# print(train_dum_t.shape)

In [None]:
from xgboost import XGBRegressor

# Load the competition test set and one-hot encode it (Id is not a feature).
test_data_org = pd.read_csv('../input/test.csv')
test_data = test_data_org.drop(['Id'], axis=1)
test_data = pd.get_dummies(test_data)

my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
# Actually fit the model on the training split. The fit call used to be
# commented out, so on a fresh kernel the next cell called predict() on an
# unfitted estimator and failed.
my_model.fit(train_dum_t, train_y, verbose=False)

In [None]:
from sklearn.metrics import mean_absolute_error
# Score the model on the validation split (targets first, predictions second).
preds = my_model.predict(val_dum_t)
mae = mean_absolute_error(y_true=val_y, y_pred=preds)
print("mae: %f" % mae)

In [None]:
def _log_transform_numeric(numeric):
    """Return a copy of ``numeric`` with log-scaled areas and a basement flag.

    * ``GrLivArea`` is replaced by its natural log.
    * ``HasBsmt`` (appended as the last column) is 1 where ``TotalBsmtSF > 0``, else 0.
    * ``TotalBsmtSF`` is replaced by its natural log on the ``HasBsmt`` rows
      only; other rows keep their value, so log(0) is never evaluated.
    """
    out = numeric.copy()
    out['GrLivArea'] = np.log(out['GrLivArea'])
    has_bsmt = out['TotalBsmtSF'] > 0
    out['HasBsmt'] = has_bsmt.astype('int64')
    # Cast first so the assignment below does not rely on implicit upcasting.
    out['TotalBsmtSF'] = out['TotalBsmtSF'].astype(float)
    out.loc[has_bsmt, 'TotalBsmtSF'] = np.log(out.loc[has_bsmt, 'TotalBsmtSF'])
    return out


X_train_org = pd.read_csv('../input/train.csv')
X_test_org = pd.read_csv('../input/test.csv')

# Columns with more than one missing value in train (from the missing_data
# table built earlier) are removed from both frames. `axis` is passed by
# keyword because current pandas rejects the positional form drop(labels, 1).
cols = (missing_data[missing_data['Total'] > 1]).index
X_train = X_train_org.drop(cols, axis=1)
# Same row clean-up as in the analysis section: missing 'Electrical' and the
# two rows with Id 1299 and 524.
X_train = X_train.drop(X_train.loc[X_train['Electrical'].isnull()].index)
X_train = X_train.drop(X_train[X_train['Id'] == 1299].index)
X_train = X_train.drop(X_train[X_train['Id'] == 524].index)

X_test = X_test_org.drop(cols, axis=1)

# The model is trained on log(SalePrice).
X_train_tg = np.log(X_train.SalePrice)

# Split each frame into a transformed numeric part and a raw object part.
X_train_features = X_train.drop(['SalePrice', 'Id'], axis=1)
X_train_num = _log_transform_numeric(X_train_features.select_dtypes(exclude=['object']))
X_train_obj = X_train_features.select_dtypes(include=['object'])

X_test_features = X_test.drop(['Id'], axis=1)
X_test_num = _log_transform_numeric(X_test_features.select_dtypes(exclude=['object']))
X_test_obj = X_test_features.select_dtypes(include=['object'])

# One-hot encode, align test to train's dummy columns, zero-fill the dummies test lacks.
X_train_dum = pd.get_dummies(X_train_obj)
X_test_dum = pd.get_dummies(X_test_obj)
X_train_dum, X_test_dum = X_train_dum.align(X_test_dum, join='left', axis=1)
X_test_dum = X_test_dum.fillna(0)

# Fit the imputer on train, reuse it on test, and re-attach column labels.
my_imputer = Imputer()
X_train_ipt = my_imputer.fit_transform(X_train_num)
X_test_ipt = my_imputer.transform(X_test_num)

X_train_ipt = pd.DataFrame(X_train_ipt, columns=X_train_num.columns)
X_test_ipt = pd.DataFrame(X_test_ipt, columns=X_test_num.columns)

# drop=True keeps the old row labels out of the feature matrix; a plain
# reset_index() added them as an 'index' column that the model trained on.
X_train = pd.concat([X_train_ipt, X_train_dum.reset_index(drop=True)], axis=1)
X_test = pd.concat([X_test_ipt, X_test_dum.reset_index(drop=True)], axis=1)

my_model = XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)
my_model.fit(X_train, X_train_tg, verbose=True)

# Predictions are on the log scale (the target was log-transformed above).
my_preds = my_model.predict(X_test)

In [None]:
# Undo the log transform of the target so the predictions are on the SalePrice scale.
# NOTE(review): my_preds is overwritten in place, so re-running this cell
# exponentiates it a second time; run it exactly once after predicting.
my_preds = np.exp(my_preds)
# print(np.exp(my_preds))

In [None]:
# Print the back-transformed predictions.
print(my_preds)

In [None]:
# Pair each test Id with its predicted price and write the submission file.
submission6 = X_test_org[['Id']].assign(SalePrice=my_preds)
submission6.to_csv('submission6.csv', index=False)