In [29]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np

In [22]:
train_data_path ='./dataset/train.csv'
train = pd.read_csv(train_data_path)

test_data_path ='./dataset/test.csv'
test = pd.read_csv(test_data_path)

num_of_train_data = train.shape[0]

In [23]:
# 房价，要拟合的目标值
target = train.SalePrice

# 输入特征，可以将SalePrice列扔掉
train.drop(['SalePrice'],axis = 1 , inplace = True)

# 将train和test合并到一起，一块进行特征工程，方便预测test的房价
combined = train.append(test)
combined.reset_index(inplace=True)
combined.drop(['index', 'Id'], inplace=True, axis=1)

  combined = train.append(test)


In [24]:
# 选出非空列
def get_cols_with_no_nans(df,col_type):
    '''
    Arguments :
    df : The dataframe to process
    col_type : 
          num : to only get numerical columns with no nans
          no_num : to only get nun-numerical columns with no nans
          all : to get any columns with no nans    
    '''
    if (col_type == 'num'):
        predictors = df.select_dtypes(exclude=['object'])
    elif (col_type == 'no_num'):
        predictors = df.select_dtypes(include=['object'])
    elif (col_type == 'all'):
        predictors = df
    else :
        print('Error : choose a type (num, no_num, all)')
        return 0
    cols_with_no_nans = []
    for col in predictors.columns:
        if not df[col].isnull().any():
            cols_with_no_nans.append(col)
    return cols_with_no_nans

In [25]:
num_cols = get_cols_with_no_nans(combined, 'num')
cat_cols = get_cols_with_no_nans(combined, 'no_num')

# 过滤掉含有缺失值的特征
combined = combined[num_cols + cat_cols]

print(num_cols[:5])
print ('Number of numerical columns with no nan values: ',len(num_cols))
print(cat_cols[:5])
print ('Number of non-numerical columns with no nan values: ',len(cat_cols))

['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt']
Number of numerical columns with no nan values:  25
['Street', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope']
Number of non-numerical columns with no nan values:  20


In [26]:
# 对分类特征进行One-Hot编码
def oneHotEncode(df,colNames):
    for col in colNames:
        if( df[col].dtype == np.dtype('object')):
            # pandas.get_dummies 可以对分类特征进行One-Hot编码
            dummies = pd.get_dummies(df[col],prefix=col)
            df = pd.concat([df,dummies],axis=1)

            # drop the encoded column
            df.drop([col],axis = 1 , inplace=True)
    return df


combined = oneHotEncode(combined,cat_cols)

In [28]:
# 训练数据集特征
train_features = torch.tensor(combined[:num_of_train_data].values, dtype=torch.float)
# 训练数据集目标
train_labels = torch.tensor(target.values, dtype=torch.float).view(-1, 1)
# 测试数据集特征
test_features = torch.tensor(combined[num_of_train_data:].values, dtype=torch.float)

print("train data size: ", train_features.shape)
print("label data size: ", train_labels.shape)
print("test data size: ", test_features.shape)

train data size:  torch.Size([1460, 149])
label data size:  torch.Size([1460, 1])
test data size:  torch.Size([1459, 149])


In [30]:
# 搭积木式 构建
model_sequential = nn.Sequential(
          nn.Linear(train_features.shape[1], 128),
          nn.ReLU(),
          nn.Linear(128, 256),
          nn.ReLU(),
          nn.Linear(256, 256),
          nn.ReLU(),
          nn.Linear(256, 256),
          nn.ReLU(),
          nn.Linear(256, 1)
        )

In [33]:
class Net(nn.Module):
      
    def __init__(self, features):
        super(Net, self).__init__()
    # 定义网络结构
        self.linear_relu1 = nn.Linear(features, 128)
        self.linear_relu2 = nn.Linear(128, 256)
        self.linear_relu3 = nn.Linear(256, 256)
        self.linear_relu4 = nn.Linear(256, 256)
        self.linear5 = nn.Linear(256, 1)
    
    # 前向传播    
    def forward(self, x):
        
        y_pred = self.linear_relu1(x)
        y_pred = nn.functional.relu(y_pred)

        y_pred = self.linear_relu2(y_pred)
        y_pred = nn.functional.relu(y_pred)

        y_pred = self.linear_relu3(y_pred)
        y_pred = nn.functional.relu(y_pred)

        y_pred = self.linear_relu4(y_pred)
        y_pred = nn.functional.relu(y_pred)

        y_pred = self.linear5(y_pred)
        return y_pred

In [34]:
# 初始化模型对象
model = Net(features=train_features.shape[1])

# 使用均方误差作为损失函数
criterion = nn.MSELoss(reduction='mean')

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [36]:
losses = []

# 训练500轮
for _ in range(500):
    # 每次迭代使用训练数据集中的所有样本 train_features
    y_pred = model(train_features)

    loss = criterion(y_pred, train_labels)
    # print(t, loss.item())
    losses.append(loss.item())

    if torch.isnan(loss):
        break

    # 将模型中各参数的梯度清零。
    # PyTorch的backward()方法计算梯度会默认将本次计算的梯度与缓存中已有的梯度加和。
    # 必须在反向传播前先清零。
    optimizer.zero_grad()

    # 反向传播，计算各参数对于损失loss的梯度
    loss.backward()

    # 根据刚刚反向传播得到的梯度更新模型参数
    optimizer.step()

In [38]:
predictions = model(test_features).detach().numpy()
my_submission = pd.DataFrame({'Id':pd.read_csv('./dataset/test.csv').Id,'SalePrice': predictions[:, 0]})
my_submission.to_csv('./dataset/submission.csv', index=False)