solafune:「夜間光データから土地価格を予測」のLSTMベースライン(by daikiclimate)

https://qiita.com/daikiclimate/items/9a4686a8732a35fd14da の写経

In [39]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [40]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib_venn import venn2
#from ptitprince import RainCloud

from pandas_profiling import ProfileReport
%matplotlib inline

print(os.getcwd())

/content


In [53]:
INPUT_DIR = '/content/drive/MyDrive/Colab Notebooks/Solafune/夜間光データから土地価格を予測/data/inputs/'
OUTPUT_DIR = '/content/drive/MyDrive/Colab Notebooks/Solafune/夜間光データから土地価格を予測/data/outputs/'
NB_NAME = 'nb005'

os.makedirs(OUTPUT_DIR, exist_ok=True)

train = pd.read_csv(os.path.join(INPUT_DIR, 'TrainDataSet.csv'))
test = pd.read_csv(os.path.join(INPUT_DIR, 'EvaluationData.csv'))
submission = pd.read_csv(os.path.join(INPUT_DIR, 'UploadFileTemplate.csv'))

In [54]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold
from copy import deepcopy
from sklearn.preprocessing import StandardScaler

In [55]:
count_ids = train['PlaceID'].value_counts().reset_index()
count_ids

Unnamed: 0,index,PlaceID
0,1631,22
1,1158,22
2,1526,22
3,1494,22
4,1462,22
...,...,...
1013,1508,3
1014,327,2
1015,494,2
1016,1622,2


In [56]:
valid_PlaceID = count_ids[count_ids['PlaceID'] == 22]['index']
valid_PlaceID

0      1631
1      1158
2      1526
3      1494
4      1462
       ... 
895     363
896      75
897     331
898     635
899     619
Name: index, Length: 900, dtype: int64

In [57]:
n_train = len(train)
train = train[train['PlaceID'].isin(valid_PlaceID)]
train = train.reset_index().drop(columns = 'index')
print(f'Dropped : {round(100 - 100 * len(train)/n_train, 1)}%')

Dropped : 9.5%


In [84]:
from contextlib import contextmanager
from time import time

@contextmanager
def timer(logger=None, format_str='{:.3f}[s]', prefix=None, suffix=None):
    if prefix: format_str = str(prefix) + format_str
    if suffix: format_str = format_str + str(suffix)
    start = time()
    yield
    d = time() - start
    out_str = format_str.format(d)
    if logger:
        logger.info(out_str)
    else:
        print(out_str)

In [60]:
feature_cols = ['MeanLight', 'SumLight']

for f in feature_cols:
    ss = StandardScaler()
    # DataFrameでtransformする
    train[f] = ss.fit_transform(train[[f]])
    test[f] = ss.transform(test[[f]])

In [61]:
import torch
from torch.utils.data import Dataset
from torch import nn
import torch.optim as optim

In [66]:
class SeqDataset(Dataset):
    def __init__(self, df, place_ids, feature_col, target_col, is_log=True, test=False):
        self.df = df
        self.place_ids = place_ids
        self.feature_col = feature_col
        self.target_col = target_col
        self.is_log = is_log
        if test:
            self.df[target_col] = 0

    def __len__(self):
        return len(self.place_ids)

    def __getitem__(self, idx):
        place_id = self.place_ids[idx]
        seq = self.df.loc[self.df['PlaceID'] == place_id, self.feature_col].values
        label = self.df.loc[self.df['PlaceID'] == place_id, self.target_col].values
        seq, label = torch.tensor(seq).float(), torch.tensor(label).float()
        if self.is_log:
            label = torch.log1p(label)
        return seq, label

In [67]:
#dataset = SeqDataset(df = train, place_ids = train.PlaceID.unique(),
#                     feature_col = feature_cols, target_col = "AverageLandPrice")
testset = SeqDataset(df = test, place_ids = test.PlaceID.unique(),
                     feature_col = feature_cols, target_col = "AverageLandPrice", test = True)
#dataloader = torch.utils.data.DataLoader(dataset, batch_size = 5)
testloader =  torch.utils.data.DataLoader(testset, batch_size = 1)

In [73]:
testset

<__main__.SeqDataset at 0x7f160948e410>

In [69]:
class BaselineLSTM(nn.Module):
    def __init__(
        self,
        emb_dim=128,
        rnn_dim=128,
        hidden_size=128,
        num_layers=2,
        dropout=0.3,
        rnn_dropout=0.3,
    ):
        super().__init__()

        self.emb_dim = emb_dim
        self.rnn_dim = rnn_dim
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.rnn_dropout = rnn_dropout
        self.lstm = nn.LSTM(
            input_size=2,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=rnn_dropout,
            bidirectional=False,
            batch_first=True,
        )

        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        x, _ = self.lstm(x)
        x = self.fc(x)
        return x

In [70]:
EPOCHS = 10
LEARNING_LATE = 0.01

In [87]:
def fit_LSTM(X, y, cv=None):
    models = []
    oof_pred = np.zeros_like(y, dtype=np.float)

    for i, (idx_train, idx_valid) in enumerate(cv.split(X, y, train['PlaceID'])):
        trainloader = torch.utils.data.DataLoader(
            SeqDataset(
                df = X.iloc[idx_train],
                place_ids = X.iloc[idx_train]['PlaceID'].unique(),
                feature_col = feature_cols,
                target_col = 'AverageLandPrice'
            ),
            batch_size=4
        )
        validloader = torch.utils.data.DataLoader(
            SeqDataset(
                df = X.iloc[idx_valid],
                place_ids = X.iloc[idx_valid]['PlaceID'].unique(),
                feature_col = feature_cols,
                target_col = 'AverageLandPrice'
            ),
            batch_size = 1
        )
        
        model = BaselineLSTM()
        
        criterion = nn.MSELoss()
        #optimizer = optim.SGD(model.parameters(), lr=LEARNING_LATE)
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_LATE)

        with timer(prefix='fit fold={}'.format(i + 1)):
            for epoch in range(EPOCHS):
                for data in trainloader:
                    seq_data, label = data
                    optimizer.zero_grad()
                    pred = model(seq_data)
                    loss = torch.sqrt(criterion(pred.squeeze(2), label))
                    loss.backward()
                    optimizer.step()
                    print(f'epoch:{epoch}  loss{loss.item()}', end='')
        
        pred_i = []
        for data in validloader:
            seq_data, _ = data
            with torch.no_grad():
                model.eval()
                pred_i.append(model(seq_data).squeeze(2).detach().numpy())
        pred_i = np.array(pred_i).reshape(-1)
        oof_pred[idx_valid] = pred_i
        models.append(model)

        print(f'Fold {i} RMSLE: {mean_squared_error(np.log1p(X.iloc[idx_valid].AverageLandPrice.values), pred_i):.4f}')

    score = mean_squared_error(np.log1p(y), oof_pred)
    print('FINISHED \ whole score: {:.4f}'.format(score))
    return oof_pred, models

In [80]:
def create_predict(models, testloader):
    pred = []
    for data in testloader:
        seq_data, _ = data
        with torch.no_grad():
            p = []
            for model in models:
                model.eval()
                p.append(
                    model(seq_data).detach().numpy().reshape(-1)
                )
            pred.append(p)

    pred = np.mean(pred, axis=1)
    return pred

def fit_and_predict(train_df, target_df):
    target_name = 'AverageLandPrice'
    print('-' * 20 + ' start {} '.format(target_name) + '-' * 20)
    y = target_df.values
    cv = GroupKFold(n_splits=5)

    oof, models = fit_LSTM(train_df, y, cv=cv)
    return oof, models

In [88]:
oof, models = fit_and_predict(train_df=train, target_df=train['AverageLandPrice'])

-------------------- start AverageLandPrice --------------------
epoch:0  loss7.040541648864746epoch:0  loss5.120565414428711epoch:0  loss4.953586101531982epoch:0  loss2.7442312240600586epoch:0  loss2.053506374359131epoch:0  loss2.3498194217681885epoch:0  loss2.292112112045288epoch:0  loss2.1970043182373047epoch:0  loss1.759461760520935epoch:0  loss2.14237117767334epoch:0  loss1.4654591083526611epoch:0  loss1.1215418577194214epoch:0  loss1.2649353742599487epoch:0  loss1.4759150743484497epoch:0  loss1.361801028251648epoch:0  loss1.3524428606033325epoch:0  loss1.3283807039260864epoch:0  loss1.5348097085952759epoch:0  loss2.0002498626708984epoch:0  loss1.4271987676620483epoch:0  loss1.7747471332550049epoch:0  loss1.4387387037277222epoch:0  loss1.3832151889801025epoch:0  loss1.4605549573898315epoch:0  loss1.4130640029907227epoch:0  loss0.7068877816200256epoch:0  loss0.6352965831756592epoch:0  loss1.20090913772583epoch:0  loss1.0541502237319946epoch:0  loss1.0000032186508179epoch:0  loss1.8

In [89]:
with timer(prefix='predict'):
    pred = create_predict(models, testloader)

predict7.618[s]


In [90]:
submission['LandPrice'] = np.expm1(pred.reshape(-1))
submission.to_csv(os.path.join(OUTPUT_DIR, 'sub_' + NB_NAME +'.csv'), index=False)