# 資料前處理

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import IsolationForest

data_path = "data"

train_df = pd.read_csv(data_path+"/train.csv")
test_df = pd.read_csv(data_path+"/test.csv")

# 保存ID
test_id = test_df.Id

# 填補缺失值，填補0
for col in test_df:
    if test_df[col].dtypes == 'object':
        train_df.fillna('None',inplace=True)
        test_df.fillna('None',inplace=True)
    else:
        train_df.fillna(0,inplace=True)
        test_df.fillna(0,inplace=True)
    

# 替換字串
LotShape_dict = {'Reg':0,'IR1':1,'IR2':2,'IR3':3,'None':-1}
score_dict = {'Ex':5 ,'Gd':4 ,'TA':3 ,'Fa':2 ,'Po':1, 'NA':-1, 'None':-1}
BsmtExposure_dict = {'Gd':4, 'Av':3, 'Mn':2, 'No':1, 'NA':-1, 'None':-1}
BsmtFinType_dict = {'GLQ':5 ,'ALQ':4 ,'BLQ':3 ,'Rec':2 ,'LwQ':1 ,'Unf':0 ,'NA':-1 ,'None':-1}
CentralAir_dict = {'N':0 , 'Y':1}

train_df['LotShape'] = train_df['LotShape'].apply(lambda x: LotShape_dict[x]).astype('int32')
test_df['LotShape'] = test_df['LotShape'].apply(lambda x: LotShape_dict[x]).astype('int32')

train_df['ExterQual'] = train_df['ExterQual'].apply(lambda x: score_dict[x]).astype('int32')
test_df['ExterQual'] = test_df['ExterQual'].apply(lambda x: score_dict[x]).astype('int32')
train_df['ExterCond'] = train_df['ExterCond'].apply(lambda x: score_dict[x]).astype('int32')
test_df['ExterCond'] = test_df['ExterCond'].apply(lambda x: score_dict[x]).astype('int32')
train_df['BsmtCond'] = train_df['BsmtCond'].apply(lambda x: score_dict[x]).astype('int32')
test_df['BsmtCond'] = test_df['BsmtCond'].apply(lambda x: score_dict[x]).astype('int32')
train_df['HeatingQC'] = train_df['HeatingQC'].apply(lambda x: score_dict[x]).astype('int32')
test_df['HeatingQC'] = test_df['HeatingQC'].apply(lambda x: score_dict[x]).astype('int32')
train_df['KitchenQual'] = train_df['KitchenQual'].apply(lambda x: score_dict[x]).astype('int32')
test_df['KitchenQual'] = test_df['KitchenQual'].apply(lambda x: score_dict[x]).astype('int32')
train_df['FireplaceQu'] = train_df['FireplaceQu'].apply(lambda x: score_dict[x]).astype('int32')
test_df['FireplaceQu'] = test_df['FireplaceQu'].apply(lambda x: score_dict[x]).astype('int32')
train_df['GarageQual'] = train_df['GarageQual'].apply(lambda x: score_dict[x]).astype('int32')
test_df['GarageQual'] = test_df['GarageQual'].apply(lambda x: score_dict[x]).astype('int32')
train_df['GarageCond'] = train_df['GarageCond'].apply(lambda x: score_dict[x]).astype('int32')
test_df['GarageCond'] = test_df['GarageCond'].apply(lambda x: score_dict[x]).astype('int32')
train_df['PoolQC'] = train_df['PoolQC'].apply(lambda x: score_dict[x]).astype('int32')
test_df['PoolQC'] = test_df['PoolQC'].apply(lambda x: score_dict[x]).astype('int32')

train_df['BsmtExposure'] = train_df['BsmtExposure'].apply(lambda x: BsmtExposure_dict[x]).astype('int32')
test_df['BsmtExposure'] = test_df['BsmtExposure'].apply(lambda x: BsmtExposure_dict[x]).astype('int32')

train_df['BsmtFinType1'] = train_df['BsmtFinType1'].apply(lambda x: BsmtFinType_dict[x]).astype('int32')
test_df['BsmtFinType1'] = test_df['BsmtFinType1'].apply(lambda x: BsmtFinType_dict[x]).astype('int32')
train_df['BsmtFinType2'] = train_df['BsmtFinType2'].apply(lambda x: BsmtFinType_dict[x]).astype('int32')
test_df['BsmtFinType2'] = test_df['BsmtFinType2'].apply(lambda x: BsmtFinType_dict[x]).astype('int32')

train_df['CentralAir'] = train_df['CentralAir'].apply(lambda x: CentralAir_dict[x]).astype('int32')
test_df['CentralAir'] = test_df['CentralAir'].apply(lambda x: CentralAir_dict[x]).astype('int32')

# 指選取非類別的feature
train_df = train_df.select_dtypes(exclude=['object'])
test_df = test_df.select_dtypes(exclude=['object'])

# 消除無意義的數值欄位
nomeans_num_col = ['MSSubClass']
train_df = train_df.drop(columns=nomeans_num_col)
test_df = test_df.drop(columns=nomeans_num_col)

train_len = train_df.shape[0]
test_len = test_df.shape[0]

# isolation forest
isof = IsolationForest()
isof.fit(train_df)
y_noano = isof.predict(train_df)
normal_index = pd.DataFrame(y_noano,columns=['normal_index'],index=train_df.index)
# 僅保留資料正常值，y_noano為1者
train_df = train_df[normal_index['normal_index']==1]


df_train_y = train_df['SalePrice'].values.reshape(-1,1)
price_scaler = MinMaxScaler()
price_scaler.fit(df_train_y)
norm_train_y = price_scaler.transform(df_train_y)
norm_train_y = pd.DataFrame(norm_train_y,columns=['price'])

train_df.drop(columns=['SalePrice','Id'],inplace=True)
test_df.drop(columns=['Id'],inplace=True)

# normalized
train_len = train_df.shape[0]
test_len = test_df.shape[0]

min_scaler = MinMaxScaler()
df = pd.concat([train_df,test_df],sort=False)

for col in df:
    df[col] = min_scaler.fit_transform(df[col].values.reshape(-1,1))

norm_train_x = df[0:train_len]
norm_test_x = df[train_len:]

# 建立validation set
train_x, val_x, train_y, val_y = train_test_split(norm_train_x,norm_train_y)
train_x.head()


KeyError: 0

# 建立模型

In [None]:
import keras


layer_list = [keras.layers.Dense(100,kernel_initializer='normal',input_shape=(train_x.shape[1],),activation='selu'),
              keras.layers.Dense(50,kernel_initializer='normal',activation='selu'),
              keras.layers.Dense(25,kernel_initializer='normal',activation='selu'),
             keras.layers.Dense(1)]

model = keras.Sequential(layer_list)
model.compile(loss='mean_squared_error',
             optimizer=keras.optimizers.Adam())

training_process = model.fit(train_x,train_y,
                            epochs=50,
                            batch_size=10,
                            validation_data=(val_x,val_y))


In [None]:
import matplotlib.pyplot as plt

plt.title('Loss')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.plot(training_process.history['loss'])
plt.plot(training_process.history['val_loss'])
plt.show()

In [None]:
predict_norm = model.predict(norm_test_x)
predict = price_scaler.inverse_transform(predict_norm)

# 做成Dataframe
predict = pd.DataFrame(predict,columns=['SalePrice'])
output_df = pd.concat([test_id,predict],axis=1)
output_df.to_csv('res.csv',index=False)
output_df.head()
