# MLP简单版

In [1]:
import torch
import torch.nn as nn
import wandb # 参数可视化
import pandas as pd
from torch.nn import functional as F
from tqdm import tqdm # 进度条
import numpy as np
from d2l import torch as d2l
from torch.utils import data

In [2]:
class MLP(nn.Module):
    """Funnel-shaped multilayer perceptron producing one value per sample.

    Three hidden linear layers (256 -> 64 -> 16 units), each followed by
    ReLU, feed a final linear layer with a single output unit.

    Args:
        in_features: Number of input features per sample.
    """

    def __init__(self, in_features):
        # Zero-argument super() is required here: `super(self)` passes an
        # instance where a type is expected and raises TypeError.
        super().__init__()
        # Layer widths shrink layer by layer, in a funnel shape.
        self.layer1 = nn.Linear(in_features, 256)
        self.layer2 = nn.Linear(256, 64)
        self.layer3 = nn.Linear(64, 16)
        self.out = nn.Linear(16, 1)

    def forward(self, X):
        """Run the forward pass.

        Args:
            X: Tensor whose last dimension has size ``in_features``.

        Returns:
            Tensor with the same leading dimensions as ``X`` and a last
            dimension of size 1. No activation is applied to the output.
        """
        X = F.relu(self.layer1(X))
        X = F.relu(self.layer2(X))
        X = F.relu(self.layer3(X))
        return self.out(X)
    
# Use the MPS backend when PyTorch reports it as available; otherwise
# fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

In [3]:
# Mean-squared-error criterion; 'mean' is the default reduction, spelled
# out here for readability.
loss = nn.MSELoss(reduction='mean')

In [4]:
def log_rmse(net, features, labels):
    """Return the root-mean-squared error between log-predictions and log-labels.

    Args:
        net: Callable mapping ``features`` to a tensor of predictions.
        features: Input passed to ``net``.
        labels: Tensor of targets; it is passed through ``torch.log``
            unchanged, so values are expected to be positive.

    Returns:
        The log-RMSE as a Python float.
    """
    # Evaluation only: no autograd graph is needed for a scalar metric.
    with torch.no_grad():
        # To keep the logarithm stable, predictions below 1 are raised to 1.
        clipped_preds = torch.clamp(net(features), min=1.0)
        # MSE is computed directly so the function does not depend on a
        # module-level criterion object being defined beforehand.
        mse = F.mse_loss(torch.log(clipped_preds), torch.log(labels))
    return torch.sqrt(mse).item()

In [5]:
# Load the train and test CSVs, then display both shapes as the cell output.
train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv')
(train_data.shape, test_data.shape)

((47439, 41), (31626, 40))

In [6]:
# Remove the columns treated as redundant from both frames.
redundant_data = ['Address', 'Summary', 'City', 'State', 'Zip']
# No loop variable named `data` is used: that name is the `torch.utils.data`
# module imported at the top of the notebook and must not be overwritten.
# errors='ignore' lets this cell be re-run after the columns are already gone.
train_data = train_data.drop(columns=redundant_data, errors='ignore')
test_data = test_data.drop(columns=redundant_data, errors='ignore')

In [7]:
# Preprocessing: log-transform the large-valued numeric columns.
# 'Sold Price' is intentionally not in this list: it exists only in the
# training frame (the test frame has no 'Sold Price' column).
large_vel_cols = ['Lot', 'Total interior livable area',
                  'Tax assessed value', 'Annual tax amount',
                  'Listed Price', 'Last Sold Price']

# log1p(x) == log(x + 1). The transform is applied to all listed columns at
# once, which also avoids a loop variable named `data` overwriting the
# `torch.utils.data` module imported at the top of the notebook.
# NOTE: re-running this cell applies the transform a second time.
train_data[large_vel_cols] = np.log1p(train_data[large_vel_cols])
test_data[large_vel_cols] = np.log1p(test_data[large_vel_cols])

In [8]:
# Preview the first four training rows: first four and last three columns.
print(
    train_data.iloc[:4, [0, 1, 2, 3, -3, -2, -1]]
)

   Id  Sold Price          Type  Year built  Listed Price Last Sold On  \
0   0   3825000.0  SingleFamily      1969.0     15.250119          NaN   
1   1    505000.0  SingleFamily      1926.0     13.171155   2019-08-30   
2   2    140000.0  SingleFamily      1958.0     12.100718          NaN   
3   3   1775000.0  SingleFamily      1947.0     14.454730   2016-08-30   

   Last Sold Price  
0              NaN  
1        12.700772  
2              NaN  
3        14.220976  


In [9]:
# Preview the first four test rows: first four and last three columns.
print(
    test_data.iloc[:4, [0, 1, 2, 3, -3, -2, -1]]
)

      Id          Type  Year built      Heating  Listed Price Last Sold On  \
0  47439  SingleFamily      2020.0      Central     13.592243   2020-07-01   
1  47440  SingleFamily      1924.0  Natural Gas     13.081439   2020-11-03   
2  47441  SingleFamily      2020.0      Central     13.641039          NaN   
3  47442  SingleFamily      2020.0      Central     13.604667   2020-09-21   

   Last Sold Price  
0        13.615841  
1         9.615872  
2              NaN  
3        13.604791  


In [10]:
# Stack train and test features into one frame. Train drops its first two
# columns (Id and the 'Sold Price' label); test drops its first column (Id).
all_features = pd.concat([train_data.iloc[:, 2:], test_data.iloc[:, 1:]])

# Parse the date columns into datetime values.
for date_col in ('Listed On', 'Last Sold On'):
    all_features[date_col] = pd.to_datetime(all_features[date_col], format="%Y-%m-%d")

all_features.shape

(79065, 34)

In [11]:
# pandas 'object' dtype holds Python str values, i.e. the categorical feature
# columns. For each such column, print its name (left-justified to 20
# characters) and its number of distinct values.
for col_name in all_features.columns[all_features.dtypes == 'object']:
    distinct_values = all_features[col_name].unique()
    print(col_name.ljust(20), len(distinct_values))

Type                 174
Heating              2660
Cooling              911
Parking              9913
Bedrooms             278
Region               1259
Elementary School    3568
Middle School        809
High School          922
Flooring             1740
Heating features     1763
Cooling features     596
Appliances included  11290
Laundry features     3031
Parking features     9695
