In [1]:
import torch
from torch import nn
import pandas as pd
from torch.nn import functional as F
from torch.utils import data
import wandb
from tqdm import tqdm

In [2]:
# Load the raw training and test tables from their CSV files.
train_csv_path = "../data/train.csv/train.csv"
test_csv_path = "../data/test.csv/test.csv"
train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

In [3]:
# Preview the first two rows of the training table (all columns).
train_data.head(2)

Unnamed: 0,Id,Address,Sold Price,Summary,Type,Year built,Heating,Cooling,Parking,Lot,...,Parking features,Tax assessed value,Annual tax amount,Listed On,Listed Price,Last Sold On,Last Sold Price,City,Zip,State
0,0,540 Pine Ln,3825000.0,"540 Pine Ln, Los Altos, CA 94022 is a single f...",SingleFamily,1969.0,"Heating - 2+ Zones, Central Forced Air - Gas","Multi-Zone, Central AC, Whole House / Attic Fan","Garage, Garage - Attached, Covered",1.0,...,"Garage, Garage - Attached, Covered",886486.0,12580.0,2019-10-24,4198000.0,,,Los Altos,94022,CA
1,1,1727 W 67th St,505000.0,"HURRY, HURRY.......Great house 3 bed and 2 bat...",SingleFamily,1926.0,Combination,"Wall/Window Unit(s), Evaporative Cooling, See ...","Detached Carport, Garage",4047.0,...,"Detached Carport, Garage",505000.0,6253.0,2019-10-16,525000.0,2019-08-30,328000.0,Los Angeles,90047,CA


In [4]:
# Preview the first two rows of the test table (all columns).
test_data.head(2)

Unnamed: 0,Id,Address,Summary,Type,Year built,Heating,Cooling,Parking,Lot,Bedrooms,...,Parking features,Tax assessed value,Annual tax amount,Listed On,Listed Price,Last Sold On,Last Sold Price,City,Zip,State
0,47439,3034 N Coolidge Ave,Live within steps to the scenic views on the L...,SingleFamily,2020.0,Central,Central Air,Tandem Uncovered,940.0,2,...,Tandem Uncovered,,,2020-11-06,799900.0,2020-07-01,819000.0,Dodgertown,90090,CA
1,47440,565 Kenilworth Ave,duplex fixer. Input for comps only,SingleFamily,1924.0,Natural Gas,,Detached,10018.8,3,...,Detached,521977.0,7494.0,2014-04-04,479950.0,2020-11-03,15000.0,San Leandro,94577,CA


In [5]:
# (rows, columns) of the raw training and test tables.
train_data.shape, test_data.shape

((47439, 41), (31626, 40))

In [6]:
# Stack the train and test feature columns into one frame so both splits get
# identical preprocessing. The positional slices skip the leading columns up
# to (but not including) 'Type' -- four in the training table, which also
# carries the 'Sold Price' label, three in the test table -- and drop the
# final column of each table.
train_feature_columns = train_data.iloc[:, 4:-1]
test_feature_columns = test_data.iloc[:, 3:-1]
all_features = pd.concat([train_feature_columns, test_feature_columns])
all_features.head(2)

Unnamed: 0,Type,Year built,Heating,Cooling,Parking,Lot,Bedrooms,Bathrooms,Full bathrooms,Total interior livable area,...,Laundry features,Parking features,Tax assessed value,Annual tax amount,Listed On,Listed Price,Last Sold On,Last Sold Price,City,Zip
0,SingleFamily,1969.0,"Heating - 2+ Zones, Central Forced Air - Gas","Multi-Zone, Central AC, Whole House / Attic Fan","Garage, Garage - Attached, Covered",1.0,"Ground Floor Bedroom, Master Bedroom on Ground...",0.0,,1.0,...,"Washer / Dryer, Inside, In Utility Room","Garage, Garage - Attached, Covered",886486.0,12580.0,2019-10-24,4198000.0,,,Los Altos,94022
1,SingleFamily,1926.0,Combination,"Wall/Window Unit(s), Evaporative Cooling, See ...","Detached Carport, Garage",4047.0,3,2.0,2.0,872.0,...,Inside,"Detached Carport, Garage",505000.0,6253.0,2019-10-16,525000.0,2019-08-30,328000.0,Los Angeles,90047


In [7]:
# (rows, columns) of the combined train + test feature frame.
all_features.shape

(79065, 36)

In [8]:
# Every column whose dtype is not `object` is treated as a numeric feature.
column_dtypes = all_features.dtypes
numeric_features = column_dtypes[column_dtypes != 'object'].index


def _zscore(column):
    """Return the column shifted to zero mean and scaled by its standard deviation."""
    return (column - column.mean()) / (column.std())


# Standardize each numeric column, then replace missing entries with 0
# (the post-standardization mean).
all_features[numeric_features] = (
    all_features[numeric_features].apply(_zscore).fillna(0)
)

In [9]:
# Fill missing dates with the value from the previous row. `.ffill()` replaces
# the deprecated `fillna(method='ffill')` spelling. Leading gaps that have no
# previous row are filled with the Unix epoch written as a '%Y-%m-%d' string,
# so the column stays uniformly date-formatted text for the later
# `pd.to_datetime(..., format='%Y-%m-%d')` call instead of mixing in an
# integer 0 placeholder.
for date_column in ('Last Sold On', 'Listed On'):
    all_features[date_column] = (
        all_features[date_column].ffill().fillna('1970-01-01')
    )

In [10]:
# Print the number of distinct values in every object-typed column.
object_columns = all_features.dtypes[all_features.dtypes == 'object'].index
for column_name in object_columns:
    distinct_values = all_features[column_name].unique()
    print(column_name.ljust(20), len(distinct_values))

Type                 174
Heating              2660
Cooling              911
Parking              9913
Bedrooms             278
Region               1259
Elementary School    3568
Middle School        809
High School          922
Flooring             1740
Heating features     1763
Cooling features     596
Appliances included  11290
Laundry features     3031
Parking features     9695
Listed On            2815
Last Sold On         6949
City                 1122


In [11]:
# Parse both date columns from 'YYYY-MM-DD' text into datetime64 values.
for date_column in ('Listed On', 'Last Sold On'):
    all_features[date_column] = pd.to_datetime(all_features[date_column],
                                               format='%Y-%m-%d')
all_features['Last Sold On'].head()

0   1970-01-01
1   2019-08-30
2   2019-08-30
3   2016-08-30
4   2016-06-27
Name: Last Sold On, dtype: datetime64[ns]

In [12]:
# Distinct column dtypes present in the feature frame.
all_features.dtypes.unique()

array([dtype('O'), dtype('float64'), dtype('<M8[ns]')], dtype=object)

In [13]:
# Keep only the float64 columns as numeric model inputs.
column_dtypes = all_features.dtypes
numeric_features = column_dtypes[column_dtypes == 'float64'].index
numeric_features

Index(['Year built', 'Lot', 'Bathrooms', 'Full bathrooms',
       'Total interior livable area', 'Total spaces', 'Garage spaces',
       'Elementary School Score', 'Elementary School Distance',
       'Middle School Score', 'Middle School Distance', 'High School Score',
       'High School Distance', 'Tax assessed value', 'Annual tax amount',
       'Listed Price', 'Last Sold Price', 'Zip'],
      dtype='object')

In [14]:
# Standardize the float columns again. This is a second pass: an earlier cell
# already standardized the non-object columns and filled their gaps with 0,
# so this rescales the columns with those filled values included.
restandardized = all_features[numeric_features].apply(
    lambda column: (column - column.mean()) / (column.std()))
all_features[numeric_features] = restandardized

In [15]:
# Model inputs: every float column plus the categorical 'Type' column.
features = [*numeric_features, 'Type']
features

['Year built',
 'Lot',
 'Bathrooms',
 'Full bathrooms',
 'Total interior livable area',
 'Total spaces',
 'Garage spaces',
 'Elementary School Score',
 'Elementary School Distance',
 'Middle School Score',
 'Middle School Distance',
 'High School Score',
 'High School Distance',
 'Tax assessed value',
 'Annual tax amount',
 'Listed Price',
 'Last Sold Price',
 'Zip',
 'Type']

In [16]:
# Restrict the frame to the selected model-input columns.
all_features = all_features.loc[:, features]
all_features.shape

(79065, 19)

In [17]:
# Preview the first two rows of the columns from position 10 onward.
all_features.iloc[:2,10:]

Unnamed: 0,Middle School Distance,High School Score,High School Distance,Tax assessed value,Annual tax amount,Listed Price,Last Sold Price,Zip,Type
0,-8.38727e-17,0.978758,-0.333677,0.20307,0.320225,1.407144,1.6352040000000003e-17,0.411159,SingleFamily
1,-0.3282366,-2.212942,-0.333677,-0.182542,-0.197337,-0.28241,-0.4613161,-1.492275,SingleFamily


In [18]:
# One-hot encode the object-typed columns, with an extra indicator column for
# missing values (dummy_na=True). The indicator dtype is pinned to float so
# the frame stays purely numeric: recent pandas versions emit bool indicators
# by default, which turns `.values` on this mixed frame into an object array
# that `torch.tensor` cannot convert.
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)

In [19]:
# (rows, columns) after one-hot encoding.
all_features.shape

(79065, 193)

In [20]:
class MLP(nn.Module):
    """Fully connected regressor with two ReLU hidden layers and one output.

    Args:
        input_size: Number of input features per sample.
        hidden1_size: Width of the first hidden layer (default 256).
        hidden2_size: Width of the second hidden layer (default 128).
    """

    def __init__(self, input_size, hidden1_size=256, hidden2_size=128):
        super().__init__()
        # Attribute names are part of the state_dict keys; keep them stable so
        # previously saved checkpoints remain loadable.
        self.hidden1 = nn.Linear(input_size, hidden1_size)
        self.hidden2 = nn.Linear(hidden1_size, hidden2_size)
        self.out_layer = nn.Linear(hidden2_size, 1)

    def forward(self, X):
        """Map a (batch, input_size) tensor to a (batch, 1) prediction tensor."""
        X = F.relu(self.hidden1(X))
        X = F.relu(self.hidden2(X))
        # No activation on the output: this is an unbounded regression head.
        return self.out_layer(X)
# Train on the first CUDA GPU when one is available; otherwise fall back to
# the CPU so the notebook still runs on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [21]:
# The first n_train rows of all_features belong to the training table; the
# remaining rows belong to the test table.
n_train = len(train_data)

train_rows = all_features.iloc[:n_train]
test_rows = all_features.iloc[n_train:]

train_features = torch.tensor(train_rows.values, dtype=torch.float32)
print(train_features.shape)

# Labels are kept as a column vector of shape (n_train, 1).
train_labels = torch.tensor(train_data[['Sold Price']].values,
                            dtype=torch.float32)
print(train_labels.shape)

test_features = torch.tensor(test_rows.values, dtype=torch.float32)
print(test_features.shape)

torch.Size([47439, 193])
torch.Size([47439, 1])
torch.Size([31626, 193])


In [22]:
# The network's input width is the number of feature columns.
input_size = train_features.size(1)

net = MLP(input_size=input_size).to(device)

In [23]:
# Mean-squared-error loss, shared by the training loop and by log_rmse.
criterion = nn.MSELoss()


def load_array(data_array, batch_size, is_train=True):
    """Wrap a tuple of tensors in a mini-batch DataLoader.

    Args:
        data_array: Tuple of tensors sharing the same first dimension.
        batch_size: Number of samples per mini-batch.
        is_train: When True, reshuffle the samples every epoch.

    Returns:
        A DataLoader yielding one tensor per entry of `data_array` per batch.
    """
    # TensorDataset bundles the tensors so they are indexed together.
    dataset = data.TensorDataset(*data_array)
    return data.DataLoader(dataset, batch_size, shuffle=is_train)


def log_rmse(net, features, labels):
    """Return the RMSE between log-predictions and log-labels as a float.

    Predictions are clamped to a minimum of 1 before the logarithm, so the
    log of a prediction is never below 0.

    Args:
        net: Model mapping `features` to predictions.
        features: Input tensor passed to `net`.
        labels: Target tensor; its logarithm is taken as-is (not clamped).
    """
    # This is a pure evaluation: skip autograd bookkeeping so a forward pass
    # over a large tensor does not build and retain a computation graph.
    with torch.no_grad():
        clipped_preds = torch.clamp(net(features), 1, float('inf'))
        rmse = torch.sqrt(criterion(torch.log(clipped_preds),
                                    torch.log(labels)))
    return rmse.item()

In [24]:
# A checkpoint is written every NUM_SAVE epochs (and after the final epoch).
NUM_SAVE=40
def train(net, train_features, train_labels, test_features,
          test_labels, num_epochs, learning_rate, weight_decay,
          batch_size):
    """Train `net` with Adam on an MSE loss and log the log-RMSE every epoch.

    Args:
        net: Model to optimize; expected to already live on `device`.
        train_features: Training inputs (evaluated on the CPU each epoch).
        train_labels: Training targets.
        test_features: Optional validation inputs, or None.
        test_labels: Optional validation targets, or None.
        num_epochs: Number of passes over the training data.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight-decay coefficient.
        batch_size: Mini-batch size.

    Returns:
        (train_ls, test_ls): per-epoch log-RMSE on the training data and on
        the validation data. `test_ls` is empty when no validation data is
        supplied.
    """
    wandb.watch(net)
    train_ls, test_ls = [], []
    train_iter = load_array((train_features, train_labels),batch_size)
    optimizer = torch.optim.Adam(net.parameters(), lr = learning_rate,
                                 weight_decay=weight_decay)
    has_validation = test_features is not None and test_labels is not None
    for epoch in range(num_epochs):
        for X, y in train_iter:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            outputs = net(X)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()
        # Evaluate and checkpoint on the CPU: the full feature tensors stay in
        # host memory and the saved state_dict holds CPU tensors.
        net.to('cpu')
        # Metrics only -- no autograd graph is needed for these forward passes.
        with torch.no_grad():
            record_loss = log_rmse(net, train_features, train_labels)
            metrics = {'loss':record_loss, 'epoch':epoch}
            if has_validation:
                valid_loss = log_rmse(net, test_features, test_labels)
                test_ls.append(valid_loss)
                metrics['valid_loss'] = valid_loss
        wandb.log(metrics)
        train_ls.append(record_loss)
        if (epoch%NUM_SAVE==0 and epoch!=0) or (epoch==num_epochs-1):
            # The file-name prefix is kept exactly as-is so code that loads
            # existing checkpoints keeps working.
            torch.save(net.state_dict(), 'checkpoin_'+str(epoch))
            print('save checkpoints on:',epoch,'rmse loss value is:',
                  record_loss)
        # Move the model back to the training device for the next epoch.
        net.to(device)
    wandb.finish()
    return train_ls, test_ls   

In [25]:
# Training hyperparameters.
num_epochs = 120
lr = 0.01
weight_decay = 0.05
batch_size = 256

# Record the hyperparameters with the Weights & Biases run.
run_config = {
    "learning_rate": lr,
    "weight_decay": weight_decay,
    "batch_size": batch_size,
    "total_run": num_epochs,
}
wandb.init(project='kaggle_com1', config=run_config)
print("netword:",net)

wandb: Currently logged in as: wangfengcxz (use `wandb login --relogin` to force relogin)


netword: MLP(
  (hidden1): Linear(in_features=193, out_features=256, bias=True)
  (hidden2): Linear(in_features=256, out_features=128, bias=True)
  (out_layer): Linear(in_features=128, out_features=1, bias=True)
)


In [26]:
# Train on the full training set; no validation split is passed.
train_ls, valid_ls = train(
    net,
    train_features,
    train_labels,
    test_features=None,
    test_labels=None,
    num_epochs=num_epochs,
    learning_rate=lr,
    weight_decay=weight_decay,
    batch_size=batch_size,
)

save checkpoints on: 40 rmse loss value is: 0.4506903290748596
save checkpoints on: 80 rmse loss value is: 0.4205334782600403
save checkpoints on: 119 rmse loss value is: 0.28159475326538086


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,█▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,119.0
loss,0.28159


In [27]:
# Predict on the CPU, where test_features lives. no_grad() skips autograd
# bookkeeping, so the output can be converted to NumPy directly.
net.to('cpu')
with torch.no_grad():
    preds = net(test_features).numpy()

# Build the submission table (Id, Sold Price) directly instead of appending a
# 'Sold Price' column to test_data: an extra trailing column would change what
# the positional slice `test_data.iloc[:,3:-1]` selects if the feature cell is
# re-run.
submission = pd.DataFrame({'Id': test_data['Id'],
                           'Sold Price': preds.reshape(-1)})

In [29]:
# Write the submission table to CSV without the DataFrame index column.
submission.to_csv('../Kaggle/加州房价预测/submission.csv',index=False)