# MLP简单版

In [1]:
import torch
import torch.nn as nn
import wandb # 参数可视化
import pandas as pd
from torch.nn import functional as F
from tqdm import tqdm # 进度条
import numpy as np
from d2l import torch as d2l
from torch.utils import data

In [2]:
# A checkpoint is written every NUM_SAVE epochs by the training loop.
NUM_SAVE = 50
# Human-readable architecture summary, logged to wandb as run metadata.
# It must mirror the layers defined in MLP: in -> 256 -> 64 -> 16 -> 1
# (the previous value omitted the 16-unit layer).
net_list = "in->256->64->16->1"
# Seed torch's global RNG so weight initialisation and batch shuffling are
# repeatable after Restart Kernel -> Run All.
SEED = 42
torch.manual_seed(SEED)

In [3]:
class MLP(nn.Module):
    """Fully connected regressor: in_features -> 256 -> 64 -> 16 -> 1.

    Each hidden layer is followed by ReLU; the output layer is linear and
    produces one value per input row.
    """

    def __init__(self, in_features):
        super().__init__()
        # Funnel-shaped widths: every layer is narrower than the one before.
        self.layer1 = nn.Linear(in_features, 256)
        self.layer2 = nn.Linear(256, 64)
        self.layer3 = nn.Linear(64, 16)
        self.out = nn.Linear(16, 1)

    def forward(self, X):
        """Run X (last dimension `in_features`) through the network."""
        hidden = X
        for hidden_layer in (self.layer1, self.layer2, self.layer3):
            hidden = F.relu(hidden_layer(hidden))
        # No activation on the output: this is a regression head.
        return self.out(hidden)
    
# Train on Apple's Metal (MPS) backend when it is available, otherwise on the CPU.
if torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

In [4]:
def log_rmse(net, features, labels):
    """Return the RMSE between log(predictions) and log(labels) as a float.

    Args:
        net: callable mapping `features` to predictions (e.g. an nn.Module).
        features: input tensor passed to `net`.
        labels: target tensor; its logarithm is taken, so values are assumed
            to be positive (TODO confirm against the data).

    Returns:
        A Python float: sqrt(mean((log(clamped_pred) - log(label)) ** 2)).
    """
    # This is a metric only: skip autograd so evaluating the whole dataset
    # does not build (and hold in memory) a computation graph.
    with torch.no_grad():
        # Predictions below 1 are raised to 1 to keep the logarithm stable.
        clipped_preds = torch.clamp(net(features), min=1.0)
        # Use MSE explicitly rather than the notebook-level `criterion`, so the
        # metric does not depend on a global defined in a later cell.
        rmse = torch.sqrt(F.mse_loss(torch.log(clipped_preds),
                                     torch.log(labels)))
    return rmse.item()

In [5]:
# Load the train and test tables (paths are relative to this notebook).
train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv')
# Cell output: (rows, columns) of each table.
(train_data.shape, test_data.shape)

((47439, 41), (31626, 40))

In [6]:
# Remove the columns treated as redundant from both tables.
redundant_data = ['Address', 'Summary', 'City', 'State', 'Zip']
# DataFrame.drop replaces the former `for data in ...: del ...` loop, whose
# loop variable `data` overwrote the `torch.utils.data` module imported at the
# top of the notebook. errors='ignore' keeps the cell re-runnable: a second
# run no longer fails with KeyError on already-removed columns.
train_data = train_data.drop(columns=redundant_data, errors='ignore')
test_data = test_data.drop(columns=redundant_data, errors='ignore')

In [7]:
# Pre-processing: log-transform the large-valued numeric columns.
# 'Sold Price' (the label) is intentionally not listed: the test set has no
# such column, so it cannot be transformed in both tables.
large_vel_cols = ['Lot', 'Total interior livable area',
                  'Tax assessed value', 'Annual tax amount',
                  'Listed Price', 'Last Sold Price']

# np.log1p(x) equals log(x + 1) but stays accurate for small x. Transforming
# all columns in one vectorised step also avoids a loop variable named `data`,
# which would overwrite the imported `torch.utils.data` module.
# NOTE: this cell is not idempotent; re-running it applies the log again.
train_data[large_vel_cols] = np.log1p(train_data[large_vel_cols])
test_data[large_vel_cols] = np.log1p(test_data[large_vel_cols])

In [8]:
# Preview the first four training rows: first four and last three columns.
print(train_data.iloc[:4, [0, 1, 2, 3, -3, -2, -1]])

   Id  Sold Price          Type  Year built  Listed Price Last Sold On  \
0   0   3825000.0  SingleFamily      1969.0     15.250119          NaN   
1   1    505000.0  SingleFamily      1926.0     13.171155   2019-08-30   
2   2    140000.0  SingleFamily      1958.0     12.100718          NaN   
3   3   1775000.0  SingleFamily      1947.0     14.454730   2016-08-30   

   Last Sold Price  
0              NaN  
1        12.700772  
2              NaN  
3        14.220976  


In [9]:
# Preview the first four test rows: first four and last three columns.
print(test_data.iloc[:4, [0, 1, 2, 3, -3, -2, -1]])

      Id          Type  Year built      Heating  Listed Price Last Sold On  \
0  47439  SingleFamily      2020.0      Central     13.592243   2020-07-01   
1  47440  SingleFamily      1924.0  Natural Gas     13.081439   2020-11-03   
2  47441  SingleFamily      2020.0      Central     13.641039          NaN   
3  47442  SingleFamily      2020.0      Central     13.604667   2020-09-21   

   Last Sold Price  
0        13.615841  
1         9.615872  
2              NaN  
3        13.604791  


In [10]:
# Stack train and test into a single feature table. The positional slices skip
# the leading 'Id' column of both tables and, for train, the 'Sold Price'
# label in column 1 as well.
all_features = pd.concat([train_data.iloc[:, 2:], test_data.iloc[:, 1:]])

# Parse the two date columns ("YYYY-MM-DD" strings) into datetime values.
all_features = all_features.assign(**{
    'Listed On': lambda frame: pd.to_datetime(frame['Listed On'], format="%Y-%m-%d"),
    'Last Sold On': lambda frame: pd.to_datetime(frame['Last Sold On'], format="%Y-%m-%d"),
})

all_features.shape

(79065, 34)

In [11]:
# For every object-dtype (non-numeric) column, print its name and how many
# distinct values it holds (missing values count as one value).
is_object_column = all_features.dtypes == 'object'
for column_name in all_features.dtypes[is_object_column].index:
    distinct_count = all_features[column_name].nunique(dropna=False)
    print(column_name.ljust(20), distinct_count)

Type                 174
Heating              2660
Cooling              911
Parking              9913
Bedrooms             278
Region               1259
Elementary School    3568
Middle School        809
High School          922
Flooring             1740
Heating features     1763
Cooling features     596
Appliances included  11290
Laundry features     3031
Parking features     9695


In [12]:
# Find the numeric columns -> fill missing values -> standardise.
# Only float64 columns are treated as numeric features here.
numeric_features = all_features.dtypes[all_features.dtypes == 'float64'].index
# Missing values are first back-filled from the next row; whatever is still
# missing afterwards (trailing gaps) becomes 0.
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` argument
# is deprecated in recent pandas releases.
all_features = all_features.bfill(axis=0).fillna(0)
# Standardise each numeric column to zero mean and unit standard deviation.
all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))

In [13]:
# Keep every numeric column plus two categorical ones; all other columns are
# dropped from the feature table.
features = [*numeric_features, 'Type', 'Bedrooms']
all_features = all_features.loc[:, features]

In [14]:
# One-hot encode the categorical columns.
# dummy_na=True makes get_dummies add an extra indicator column per
# categorical feature that marks missing values (NaN); with False, missing
# values would simply be ignored.
# dtype=float: pandas >= 2.0 emits bool indicator columns by default, and a
# frame mixing bool and float columns yields an object array from `.values`,
# which torch.tensor cannot convert. Float indicators keep it numeric.
print('before one hot code', all_features.shape)
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)
print('after one hot code', all_features.shape)

before one hot code (79065, 19)
after one hot code (79065, 470)


In [15]:
# Split the combined table back into its train and test rows (train rows come
# first) and convert everything to float32 tensors.
n_train = len(train_data)
train_features = torch.tensor(all_features.iloc[:n_train].values, dtype=torch.float32)
print('train feature shape:', train_features.shape)
test_features = torch.tensor(all_features.iloc[n_train:].values, dtype=torch.float32)
print('test feature shape:', test_features.shape)
# Selecting with a list keeps the labels two-dimensional: (n_train, 1).
train_labels = torch.tensor(train_data[['Sold Price']].values, dtype=torch.float32)
print('train label shape:', train_labels.shape)

train feature shape: torch.Size([47439, 470])
test feature shape: torch.Size([31626, 470])
train label shape: torch.Size([47439, 1])


In [16]:
# Training loss: mean squared error.
criterion = nn.MSELoss()
# Network input width = number of feature columns.
in_features = train_features.size(1)
net = MLP(in_features)
net = net.to(device)

In [17]:
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """Train `net` with Adam on mini-batches and record log-RMSE per epoch.

    Args:
        net: model to optimise; it is moved between `device` (training) and
            the CPU (evaluation) and ends up on `device`.
        train_features, train_labels: full training tensors.
        test_features, test_labels: optional held-out tensors; pass None for
            both to skip held-out evaluation.
        num_epochs: number of passes over the training data.
        learning_rate, weight_decay: Adam hyper-parameters.
        batch_size: mini-batch size.

    Returns:
        (train_ls, test_ls): per-epoch log-RMSE on the training set and on the
        held-out set; test_ls is empty when `test_labels` is None.

    Side effects: logs to the active wandb run (and finishes it), and writes
    'checkpoint_<epoch>' files every NUM_SAVE epochs and after the last epoch.
    """
    wandb.watch(net)
    train_ls, test_ls = [], []
    train_iter = d2l.load_array((train_features, train_labels), batch_size)
    # Adam optimiser.
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate,
                                 weight_decay=weight_decay)
    try:
        for epoch in tqdm(range(num_epochs)):
            for X, y in train_iter:
                X, y = X.to(device), y.to(device)
                optimizer.zero_grad()
                loss = criterion(net(X), y)
                loss.backward()
                optimizer.step()

            # Evaluate on the CPU copy of the model, as the full feature
            # tensors are passed in without being moved to `device`.
            net.to('cpu')
            with torch.no_grad():  # metrics only: do not build a graph
                record_loss = log_rmse(net, train_features, train_labels)
                metrics = {'loss': record_loss, 'epoch': epoch}
                # Previously test_ls was returned but never filled.
                if test_labels is not None:
                    test_loss = log_rmse(net, test_features, test_labels)
                    test_ls.append(test_loss)
                    metrics['test_loss'] = test_loss
            wandb.log(metrics)
            train_ls.append(record_loss)
            if (epoch % NUM_SAVE == 0 and epoch != 0) or (epoch == num_epochs - 1):
                torch.save(net.state_dict(), 'checkpoint_' + str(epoch))
                print('save checkpoints on:', epoch, 'rmse loss value is:', record_loss)
            # Back to the training device for the next epoch / the caller.
            net.to(device)
    finally:
        # Close the wandb run even if training is interrupted or fails.
        wandb.finish()
    return train_ls, test_ls

In [18]:
# Hyper-parameters for this run.
# NOTE(review): `k` is not used by any cell visible in this notebook.
k = 5
num_epochs = 2000
lr = 0.005
weight_decay = 0.05
batch_size = 256

# Open a wandb run and record the hyper-parameters as its config.
# NOTE(review): the project name looks like a typo for "california"; it is
# left unchanged because renaming would log to a different wandb project.
wandb.init(
    project="calofornia_house_predict",
    config=dict(
        learning_rate=lr,
        weight_decay=weight_decay,
        batch_size=batch_size,
        total_run=num_epochs,
        network=net_list,
    ),
)
print("network:",net)

[34m[1mwandb[0m: Currently logged in as: [33mzengchen[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011168883800000006, max=1.0…

network: MLP(
  (layer1): Linear(in_features=470, out_features=256, bias=True)
  (layer2): Linear(in_features=256, out_features=64, bias=True)
  (layer3): Linear(in_features=64, out_features=16, bias=True)
  (out): Linear(in_features=16, out_features=1, bias=True)
)


In [19]:
# NOTE(review): no training cell precedes this one, so on a fresh
# Run All these predictions come from the untrained network; the file is
# overwritten by the later train-and-export cell.
net.to('cpu')
# Inference only, so gradients are not tracked.
with torch.no_grad():
    preds = net(test_features).numpy()
# Reformat for export to Kaggle: one 'Sold Price' prediction per test 'Id'.
test_data['Sold Price'] = pd.Series(preds.ravel())
submission = test_data[['Id', 'Sold Price']]
submission.to_csv('submission.csv', index=False)

In [20]:
# Train the network, then export predictions. Resuming from a saved
# checkpoint is possible but currently disabled (see the commented line).
k = 5
num_epochs = 500
lr = 0.01
weight_decay = 0.01
batch_size = 256

wandb.init(
    project="kaggle_predict",
    config=dict(
        learning_rate=lr,
        weight_decay=weight_decay,
        batch_size=batch_size,
        total_run=num_epochs,
        network=net_list,
    ),
)
# Uncomment to continue training from an existing checkpoint:
#net.load_state_dict(torch.load('checkpoint_19676'))
print("network:",net)
net.to(device)
# No held-out set is supplied, so the second returned list stays empty.
train_ls, valid_ls = train(
    net, train_features, train_labels,
    test_features=None, test_labels=None,
    num_epochs=num_epochs, learning_rate=lr,
    weight_decay=weight_decay, batch_size=batch_size,
)

# Predict on the CPU; inference needs no gradient tracking.
net.to('cpu')
with torch.no_grad():
    preds = net(test_features).numpy()
# Reformat for export to Kaggle: one 'Sold Price' prediction per test 'Id'.
test_data['Sold Price'] = pd.Series(preds.ravel())
submission = test_data[['Id', 'Sold Price']]
submission.to_csv('submission.csv', index=False)

VBox(children=(Label(value='0.000 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011153214811111114, max=1.0…

network: MLP(
  (layer1): Linear(in_features=470, out_features=256, bias=True)
  (layer2): Linear(in_features=256, out_features=64, bias=True)
  (layer3): Linear(in_features=64, out_features=16, bias=True)
  (out): Linear(in_features=16, out_features=1, bias=True)
)


 10%|████▎                                     | 51/500 [00:52<07:52,  1.05s/it]

save checkpoints on: 50 rmse loss value is: 0.2873043119907379


 20%|████████▎                                | 101/500 [01:43<06:49,  1.03s/it]

save checkpoints on: 100 rmse loss value is: 0.23667097091674805


 30%|████████████▍                            | 151/500 [02:33<05:43,  1.01it/s]

save checkpoints on: 150 rmse loss value is: 0.24659688770771027


 40%|████████████████▍                        | 201/500 [03:21<04:57,  1.00it/s]

save checkpoints on: 200 rmse loss value is: 0.32715266942977905


 50%|████████████████████▌                    | 251/500 [04:10<03:56,  1.05it/s]

save checkpoints on: 250 rmse loss value is: 0.2188827097415924


 60%|████████████████████████▋                | 301/500 [04:59<03:15,  1.02it/s]

save checkpoints on: 300 rmse loss value is: 0.21153587102890015


 70%|████████████████████████████▊            | 351/500 [05:49<02:29,  1.00s/it]

save checkpoints on: 350 rmse loss value is: 0.21063372492790222


 80%|████████████████████████████████▉        | 401/500 [06:39<01:36,  1.03it/s]

save checkpoints on: 400 rmse loss value is: 0.2070220410823822


 90%|████████████████████████████████████▉    | 451/500 [07:29<00:49,  1.00s/it]

save checkpoints on: 450 rmse loss value is: 0.23822523653507233


100%|█████████████████████████████████████████| 500/500 [08:17<00:00,  1.01it/s]

save checkpoints on: 499 rmse loss value is: 0.19866378605365753





0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,█▄▃▃▂▂▂▂▂▂▂▂▂▂▂▃▃▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,499.0
loss,0.19866
