In [1]:
import pandas as pd
from pathlib import Path
path = Path("~/Desktop/data/house-price")
train_data  = pd.read_csv(path/"train.csv")
test_data  = pd.read_csv(path/"test.csv")

all_data = pd.concat((train_data, test_data))
all_data[:3].transpose()

Unnamed: 0,0,1,2
Id,0,1,2
Address,540 Pine Ln,1727 W 67th St,28093 Pine Ave
Sold Price,3825000.0,505000.0,140000.0
Summary,"540 Pine Ln, Los Altos, CA 94022 is a single f...","HURRY, HURRY.......Great house 3 bed and 2 bat...",'THE PERFECT CABIN TO FLIP! Strawberry deligh...
Type,SingleFamily,SingleFamily,SingleFamily
Year built,1969.0,1926.0,1958.0
Heating,"Heating - 2+ Zones, Central Forced Air - Gas",Combination,Forced air
Cooling,"Multi-Zone, Central AC, Whole House / Attic Fan","Wall/Window Unit(s), Evaporative Cooling, See ...",
Parking,"Garage, Garage - Attached, Covered","Detached Carport, Garage",0 spaces
Lot,1.0,4047.0,9147.0


In [2]:
# all_data[["Listed On Year","Listed On Month","Listed On Day"]] = all_data["Listed On"].str.split("-",2,expand=True)
# all_data[["Last Sold On Year","Last Sold On Month","Last Sold On Day"]] = all_data["Last Sold On"].str.split("-",2,expand=True)
# all_data[:3].transpose()

In [3]:
features = ["Type", "Year built", "Lot",
            "Bathrooms", "Full bathrooms", "Total interior livable area", 
            "Total spaces", "Garage spaces", "Tax assessed value",
            "High School Score","Middle School Distance","Middle School Score","Elementary School Distance","Elementary School Score",
            "Annual tax amount","Listed Price","Last Sold Price","City","State",
            # "Zip",
            # "Listed On Year","Listed On Month","Listed On Day","Last Sold On Year","Last Sold On Month","Last Sold On Day",
            ]
all_features = all_data[features]
num_train, num_test = len(train_data), len(test_data)

In [5]:

num_features = all_features.select_dtypes("number")
num_features = num_features.fillna(num_features.mean())
num_features = (num_features - num_features.mean())/num_features.std()
all_features.loc[num_features.columns.to_list()] = num_features

all_data = pd.get_dummies(all_features, dummy_na = True)
all_data.info()

KeyError: "None of [Index(['Year built', 'Lot', 'Bathrooms', 'Full bathrooms',\n       'Total interior livable area', 'Total spaces', 'Garage spaces',\n       'Tax assessed value', 'High School Score', 'Middle School Distance',\n       'Middle School Score', 'Elementary School Distance',\n       'Elementary School Score', 'Annual tax amount', 'Listed Price',\n       'Last Sold Price'],\n      dtype='object')] are in the [index]"

In [None]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

val_ratio = 0.2
val_num = int(num_train * val_ratio)

def create_dataset(X,Y,range):
    to_tensor = lambda d: torch.tensor(d.to_numpy(), dtype = torch.float32)
    X = to_tensor(X[range])
    Y = to_tensor(Y[range]) if Y is not None else None
    
    return TensorDataset(X, Y) if Y is not None else TensorDataset(X)

labels = train_data["Sold Price"]
train_dataset = create_dataset(all_data[:num_train], labels, slice(val_num, num_train))
val_dataset = create_dataset(all_data[:num_train], labels, slice(0,val_num))
test_dataset = create_dataset(all_data[-num_test : ], None, slice(0, num_test))

len(train_dataset), len(val_dataset), len(test_dataset)


(37952, 9487, 31626)

In [None]:
import torch.nn.functional as F
from tqdm import tqdm

class Model(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(64,1)
        )

    def forward(self, X):
        y = self.mlp(X)
        y = torch.clamp_min(y, 1) # Too small prediction make no sense.
        return y

def loss(y_hat, y):
    eps = 1e-8
    # eps = 0
    mse_loss = F.mse_loss(torch.log(y_hat) , torch.log(y), reduction = "none")
    rmse = torch.sqrt(mse_loss+eps)
    return rmse
    
def evaluate(model, criterion, dataloader, device):
    training = model.training
    model.eval()
    total_loss, total_count = 0,0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            y_hat = model(X)
            l = criterion(y_hat.reshape(-1), y)
            total_loss+= l.sum().item()
            total_count += y.numel()
        
    model.train(training)

    return total_loss/total_count


def train(model, epochs, optimizer, criterion,train_loader, val_loader, device):
    model.to(device)

    for epoch in range(epochs):
        with tqdm(total = len(train_loader)) as p:
            total_loss, total_count, eval_loss = 0, 0, 0
            for X, y in train_loader:
                model.train()
                X, y = X.to(device), y.to(device)

                optimizer.zero_grad()
                y_hat = model(X)
                l = criterion(y_hat.reshape(-1), y)
                l.mean().backward()
                optimizer.step()

                total_loss+= l.sum().item()
                total_count+= y.numel()

                p.update()
                if p.n == p.total:
                    eval_loss = evaluate(model, criterion, val_loader, device)
                p.set_description(f"{epoch+1}/{epochs} train loss: {total_loss/total_count:.4f}"
                                    f", eval loss: {eval_loss:.4f}")

In [None]:
batch_size, epochs = 128, 100
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size = train_dataset[0][0].size(0)

print("Use device:", device)

train_loader = DataLoader(train_dataset, batch_size, True)
val_loader = DataLoader(val_dataset, batch_size, True)

model  = Model(input_size)
def weight_init(m):
    if isinstance(m, nn.Linear):
        nn.init.kaiming_normal_(m.weight)
model.apply(weight_init)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)


train(model,epochs, optimizer, loss, train_loader, val_loader, device)

Use device: cuda


1/100 train loss: 6.7135, eval loss: 3.8389: 100%|██████████| 297/297 [00:04<00:00, 74.06it/s]
2/100 train loss: 2.7041, eval loss: 1.8694: 100%|██████████| 297/297 [00:03<00:00, 80.11it/s]
3/100 train loss: 1.2951, eval loss: 0.8804: 100%|██████████| 297/297 [00:03<00:00, 80.11it/s]
4/100 train loss: 0.6586, eval loss: 0.4563: 100%|██████████| 297/297 [00:03<00:00, 80.50it/s]
5/100 train loss: 0.4107, eval loss: 0.3115: 100%|██████████| 297/297 [00:03<00:00, 78.98it/s]
6/100 train loss: 0.3529, eval loss: 0.0000:  16%|█▌        | 47/297 [00:00<00:03, 80.69it/s]

In [None]:
train(model,epochs, optimizer, loss, train_loader, val_loader, device)

In [None]:

def predict(model, dataloader, device):
    model.to(device)
    result = []
    training = model.training
    model.eval()
    
    with torch.no_grad():
        for (X,) in dataloader:
            X = X.to(device)
            y_hat = model(X)
            result.append(y_hat.reshape(-1))
    model.train(training)

    return torch.cat(result)

testloader = DataLoader(test_dataset, 2048)
res = predict(model, testloader, device)
res = res.cpu()

data = pd.DataFrame({"Id": test_data["Id"],
               "Sold Price": res})
data.to_csv("submission.csv", index = False)

len(data), len(test_data)