In [1]:
import numpy as np
import math
import torch
from flsim.utils.example_utils import SimpleConvNet
from medmnist import OrganAMNIST, OrganSMNIST
from torch.utils.data import Dataset
from flsim.data.data_sharder import SequentialSharder, PowerLawSharder
from flsim.utils.example_utils import DataLoader, DataProvider
from torchvision.models import resnet18

import argparse

from torch import nn
import torch.nn.functional as F


from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor

from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder


In [2]:
# Load the NYC Airbnb listings and build the feature frame X and target y.
data_file = 'airbnb/AB_NYC_2019.csv'

# `last_review` is expressed as a (negative) number of days relative to
# December 31, 2019.
reference_date = pd.to_datetime('2019-12-31')

# Build the cleaned frame in one non-mutating chain. The original assigned
# `df['last_review'] = ...` onto the result of a boolean filter, which is a
# chained-assignment hazard (SettingWithCopyWarning when pandas copy-on-write
# is off). `.assign` returns a fresh frame, so nothing is written to a slice.
df = (
    pd.read_csv(data_file)
    .dropna()                                    # drop rows with any missing value
    .loc[lambda frame: frame['price'] != 0]      # zero-price listings are excluded
    .assign(
        last_review=lambda frame: (
            pd.to_datetime(frame['last_review']) - reference_date
        ).dt.days
    )
)

# Free-text / identifier columns are removed from the features; `price` is the
# regression target.
X = df.drop(['price', 'name', 'host_name', 'id', 'host_id'], axis=1)
y = df['price']



In [4]:
# Display the feature frame as built from df (categorical columns are still strings here).
X

Unnamed: 0,neighbourhood_group,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,Kensington,40.64749,-73.97237,Private room,1,9,-438,0.21,6,365
1,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,1,45,-224,0.38,2,355
3,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,1,270,-179,4.64,1,194
4,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,10,9,-407,0.10,1,0
5,Manhattan,Murray Hill,40.74767,-73.97500,Entire home/apt,3,74,-192,0.59,1,129
...,...,...,...,...,...,...,...,...,...,...,...
48782,Manhattan,Upper East Side,40.78099,-73.95366,Private room,1,1,-177,1.00,1,147
48790,Queens,Flushing,40.75104,-73.81459,Private room,1,1,-177,1.00,6,339
48799,Staten Island,Great Kills,40.54179,-74.14275,Private room,1,1,-177,1.00,1,87
48805,Bronx,Mott Haven,40.80787,-73.92400,Entire home/apt,1,2,-177,2.00,1,40


In [5]:
# Label-encode every object-dtype column of X in place and remember, for each
# one, its name and the number of distinct classes the encoder produced.
categorical_columns = []
categorical_dims = {}

object_columns = X.columns[X.dtypes == object]  # selected once, before X is mutated
for col in object_columns:
    print(col, X[col].nunique())

    # "VV_likely" is the placeholder category for any missing value.
    l_enc = LabelEncoder()
    X[col] = l_enc.fit_transform(X[col].fillna("VV_likely").values)

    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)

print("categorical columns:", categorical_columns)
print("categorical dims:", categorical_dims)




neighbourhood_group 5
neighbourhood 218
room_type 3
categorical columns: ['neighbourhood_group', 'neighbourhood', 'room_type']
categorical dims: {'neighbourhood_group': 5, 'neighbourhood': 218, 'room_type': 3}


In [6]:
# Display X again: the object-dtype columns have been replaced by integer label codes.
X

Unnamed: 0,neighbourhood_group,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,1,107,40.64749,-73.97237,1,1,9,-438,0.21,6,365
1,2,126,40.75362,-73.98377,0,1,45,-224,0.38,2,355
3,1,41,40.68514,-73.95976,0,1,270,-179,4.64,1,194
4,2,61,40.79851,-73.94399,0,10,9,-407,0.10,1,0
5,2,136,40.74767,-73.97500,0,3,74,-192,0.59,1,129
...,...,...,...,...,...,...,...,...,...,...,...
48782,2,199,40.78099,-73.95366,1,1,1,-177,1.00,1,147
48790,3,77,40.75104,-73.81459,1,1,1,-177,1.00,6,339
48799,4,89,40.54179,-74.14275,1,1,1,-177,1.00,1,87
48805,0,133,40.80787,-73.92400,0,1,2,-177,2.00,1,40


In [14]:
# Hold out 20% of the rows (fixed seed for a repeatable split). The target is
# shaped as a single column, (n_samples, 1), before splitting so both target
# arrays come out two-dimensional.
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y.values.reshape(-1, 1),
    test_size=0.2,
    random_state=42,
)


In [15]:
# Display the training targets (reshaped to a single column by the split cell).
y_train

array([[ 75],
       [130],
       [100],
       ...,
       [100],
       [219],
       [169]])

In [23]:
# Configure the TabNet regressor: tell it which feature positions are
# categorical, how many classes each has, and how wide each embedding is.
features = list(X.columns)

# Positions and cardinalities of the label-encoded columns, in feature order.
cat_idxs = [i for i, f in enumerate(features) if f in categorical_columns]
cat_dims = [categorical_dims[f] for f in features if f in categorical_columns]

neighbourhood_embed_dim = 5  # embedding size for the `neighbourhood` column

# Embedding size per categorical column, looked up by column name rather than
# written as a positional literal ([1, 5, 1]), so the list always has the same
# length and order as cat_idxs / cat_dims. `neighbourhood` gets the wider
# embedding; every other categorical column gets size 1.
cat_emb_dim = [
    neighbourhood_embed_dim if f == 'neighbourhood' else 1
    for f in features
    if f in categorical_columns
]

# Learning-rate scheduler settings; currently NOT passed to the model (see the
# commented-out arguments below).
scheduler_params = {"step_size": 10,
                    "gamma": 0.9}
scheduler_fn = torch.optim.lr_scheduler.StepLR

clf = TabNetRegressor(
    cat_dims=cat_dims,
    cat_emb_dim=cat_emb_dim,
    cat_idxs=cat_idxs,
    n_steps=10,
    # scheduler_params=scheduler_params, scheduler_fn=scheduler_fn,
)



In [24]:
# Display the configured (not yet fitted) regressor.
clf

In [11]:
# Upper bound on training epochs; passed to clf.fit as `max_epochs`.
# NOTE(review): the original note says "default: 300" - it is not clear from this
# notebook which default that refers to; confirm before relying on it.
max_epochs = 100 # default: 300

In [25]:
# Optional training-time augmentation. `aug` is only referenced by the
# commented-out `augmentations=aug` argument in the clf.fit cell, so it has no
# effect unless that line is re-enabled.
from pytorch_tabnet.augmentations import RegressionSMOTE
aug = RegressionSMOTE(p=0.2)

In [26]:
# Train the regressor. Metrics are reported each epoch on both evaluation sets;
# note that the set named 'valid' is the held-out X_test / y_test split.
clf.fit(
    # training data
    X_train=X_train,
    y_train=y_train,
    # evaluation sets, their display names, and the metrics logged for each
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_name=['train', 'valid'],
    eval_metric=['rmsle', 'mae', 'rmse', 'mse'],
    # training length / early stopping
    max_epochs=max_epochs,
    patience=50,
    # batching
    batch_size=512,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False,
    # augmentations=aug,  # re-enable to train with the RegressionSMOTE augmentation
)

epoch 0  | loss: 50500.81585| train_rmsle: 1.31451 | train_mae: 240.85521| train_rmse: 319.80053| train_mse: 102272.37847| valid_rmsle: 1.31185 | valid_mae: 238.13752| valid_rmse: 305.12703| valid_mse: 93102.50473|  0:00:06s
epoch 1  | loss: 37596.93317| train_rmsle: 1.14874 | train_mae: 185.30075| train_rmse: 259.37429| train_mse: 67275.0228| valid_rmsle: 1.14592 | valid_mae: 182.0934| valid_rmse: 238.20641| valid_mse: 56742.29518|  0:00:13s
epoch 2  | loss: 36732.99072| train_rmsle: 1.52225 | train_mae: 362.37638| train_rmse: 460.24067| train_mse: 211821.47633| valid_rmsle: 1.52927 | valid_mae: 363.04306| valid_rmse: 453.19573| valid_mse: 205386.37415|  0:00:19s
epoch 3  | loss: 36434.91944| train_rmsle: 0.89265 | train_mae: 128.94713| train_rmse: 227.84339| train_mse: 51912.61095| valid_rmsle: 0.88947 | valid_mae: 126.89145| valid_rmse: 204.32984| valid_mse: 41750.68223|  0:00:26s
epoch 4  | loss: 35948.42302| train_rmsle: 0.6877  | train_mae: 89.82832| train_rmse: 201.23193| train_

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import mean_squared_error

# Score the fitted regressor with mean squared error.
# NOTE: X_test / y_test is the same split that was passed to clf.fit as the
# 'valid' evaluation set, so this is not an independent test score.
y_true = y_test
preds = clf.predict(X_test)
test_score = mean_squared_error(y_true=y_true, y_pred=preds)

print(f"BEST VALID SCORE : {clf.best_cost}")
print(f"FINAL TEST SCORE : {test_score}")