# README
- Notebook版
- このNotebook単体で完結するようにしています
    - Kaggle Notebook や Colaboratory で実行しやすくなります
    - 継続的な改善はしにくいので、使い捨てと割り切ったほうがよいです

In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm

# DataPath
- データファイルのパス管理は `DataPath` に任せる
- KaggleやColaboratoryで実行する場合は、環境に合わせて `DataPath` のパス(`Root` など)の修正が必要

In [2]:
from enum import Enum
from pathlib import Path


class DataPath(Enum):
    """Filesystem locations used by this notebook.

    Every member's ``value`` is a ``pathlib.Path`` built relative to
    ``Root``. Note that ``SubmissionCsv`` is the sample submission file
    located in the input directory, while ``Submission`` is the directory
    that generated submission files are written to.
    """

    # Directories.
    Root = Path('../')
    Input = Root.joinpath('input')
    Submission = Root.joinpath('submissions')

    # Input CSV files.
    TrainCsv = Input.joinpath('train.csv')
    TestCsv = Input.joinpath('test.csv')

    # Template that submissions are based on.
    SubmissionCsv = Input.joinpath('sample_submission.csv')

# Dataset

In [3]:
import numpy as np
from torch.utils.data import Dataset


class TrainDataset(Dataset):
    """Dataset that pairs each feature row with its label.

    Args:
        X: Feature array, indexed by sample along its first axis.
        y: Labels, indexed the same way as ``X``.
    """

    def __init__(self, X: np.ndarray, y: np.ndarray):
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of samples, measured on ``X``."""
        sample_count = len(self.X)
        return sample_count

    def __getitem__(self, idx):
        """Return ``(features, label)`` for ``idx``.

        The features are cast to float32; the label is returned as stored.
        """
        features = self.X[idx].astype(np.float32)
        label = self.y[idx]
        return features, label


class TestDataset(Dataset):
    """Label-free dataset that serves feature rows only.

    Args:
        X: Feature array, indexed by sample along its first axis.
    """

    def __init__(self, X: np.ndarray):
        self.X = X

    def __len__(self):
        """Return the number of samples in ``X``."""
        sample_count = len(self.X)
        return sample_count

    def __getitem__(self, idx):
        """Return the features at ``idx`` cast to float32."""
        features = self.X[idx]
        return features.astype(np.float32)


In [4]:
def prepare_data_loaders(batch_size):
    """Build the training and validation DataLoaders from the training CSV.

    The CSV at ``DataPath.TrainCsv`` is read; its first column is used as
    the label and all remaining columns as features. The rows are split
    80% / 20% into training and validation subsets with a fixed
    ``random_state`` of 0.

    Args:
        batch_size: Batch size used for both loaders.

    Returns:
        A ``(train_loader, valid_loader)`` tuple. Only the training loader
        shuffles its samples.
    """
    frame = pd.read_csv(DataPath.TrainCsv.value)

    features = frame.iloc[:, 1:].values
    labels = frame.iloc[:, 0].values

    X_train, X_valid, y_train, y_valid = train_test_split(
        features, labels, train_size=0.8, random_state=0)

    # Shuffle the training samples only; validation keeps its order.
    train_loader = DataLoader(TrainDataset(X_train, y_train),
                              batch_size=batch_size,
                              shuffle=True)
    valid_loader = DataLoader(TrainDataset(X_valid, y_valid),
                              batch_size=batch_size,
                              shuffle=False)

    return train_loader, valid_loader


In [5]:
# Seed PyTorch's global random number generator.
torch.manual_seed(0)

# Batch size passed to the DataLoaders built in this notebook.
BATCH_SIZE = 16

# 1. Dataset and DataLoader
train_loader, valid_loader = prepare_data_loaders(BATCH_SIZE)

# Model

In [6]:
import torch
from torch import nn


class MySimpleNet(nn.Module):
    """Small fully connected network: 784 inputs -> 60 hidden -> 10 outputs.

    A ReLU is applied after the first linear layer. The second linear
    layer's output is returned as-is, with no activation applied to it.
    """

    def __init__(self):
        super().__init__()

        # Arguments are (in_features, out_features).
        self.fc1 = nn.Linear(784, 60)
        self.fc2 = nn.Linear(60, 10)

    def forward(self, x):
        """Return the output of ``fc2`` applied to ``relu(fc1(x))``."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

In [7]:
# 2. Model (network)
model: nn.Module = MySimpleNet()

# Optimization algorithm and loss function
# Adam is created with only the model parameters; no other arguments are set.
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

# 学習


In [8]:
def run_train_epoch(model, train_loader, criterion, optimizer, epoch):
    """Train ``model`` for one pass over ``train_loader`` and print the loss.

    Args:
        model: Module to train; it is switched to training mode.
        train_loader: Loader yielding ``(images, labels)`` batches and
            exposing a sized ``dataset`` attribute.
        criterion: Callable computing the loss from ``(outputs, labels)``.
        optimizer: Optimizer that updates the model parameters.
        epoch: Epoch number, used only in the printed message.
    """
    model.train()

    loss_total = 0.0
    for batch_inputs, batch_targets in train_loader:
        # Clear the gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass and loss.
        batch_loss = criterion(model(batch_inputs), batch_targets)

        # Backward pass, then update the weights.
        batch_loss.backward()
        optimizer.step()

        # Weight the batch loss by the batch size so that the final
        # division by the dataset size gives a per-sample value.
        loss_total += batch_loss.item() * batch_inputs.size(0)

    sample_count = len(train_loader.dataset)
    epoch_loss = loss_total / sample_count
    print(f'Epoch: {epoch} Train Loss: {epoch_loss:.4f}')


def run_valid_epoch(model, valid_loader, criterion, epoch):
    """Evaluate ``model`` over ``valid_loader`` and print the loss.

    Args:
        model: Module to evaluate; it is switched to evaluation mode.
        valid_loader: Loader yielding ``(images, labels)`` batches and
            exposing a sized ``dataset`` attribute.
        criterion: Callable computing the loss from ``(outputs, labels)``.
        epoch: Epoch number, used only in the printed message.
    """
    model.eval()

    loss_total = 0.0
    # Gradients are not needed for validation.
    with torch.no_grad():
        for batch_inputs, batch_targets in valid_loader:
            batch_loss = criterion(model(batch_inputs), batch_targets)
            # Weight by batch size; divided by the dataset size below.
            loss_total += batch_loss.item() * batch_inputs.size(0)

    sample_count = len(valid_loader.dataset)
    epoch_loss = loss_total / sample_count
    print(f'Epoch: {epoch} Valid Loss: {epoch_loss:.4f}')

In [9]:
# 3. Training
NUM_EPOCHS = 2

# Epochs are numbered from 1; each epoch runs one training pass followed by
# one validation pass.
for epoch in tqdm(range(1, NUM_EPOCHS + 1)):
    run_train_epoch(model, train_loader, criterion, optimizer, epoch)
    run_valid_epoch(model, valid_loader, criterion, epoch)

 50%|█████     | 1/2 [00:02<00:02,  2.24s/it]

Epoch: 1 Train Loss: 0.6391
Epoch: 1 Valid Loss: 0.3832


100%|██████████| 2/2 [00:04<00:00,  2.39s/it]

Epoch: 2 Train Loss: 0.3121
Epoch: 2 Valid Loss: 0.3274





# テストデータでの予測とSubmission

In [10]:
def make_predictions(model, test_loader):
    """Predict a class index for every sample yielded by ``test_loader``.

    Args:
        model: Module mapping a batch of inputs to per-class scores; it is
            switched to evaluation mode.
        test_loader: Iterable yielding input batches (no labels).

    Returns:
        A 1-D integer NumPy array holding, for each sample in iteration
        order, the index of the highest score along dimension 1. The array
        is empty (dtype int64) when ``test_loader`` yields no batches.
    """
    model.eval()

    # Collect per-batch results and join them once at the end; np.append
    # inside the loop would re-copy the whole array on every batch.
    batch_predictions = []

    with torch.no_grad():
        for images in test_loader:
            outputs = model(images)

            _, y_pred = torch.max(outputs, dim=1)
            # .cpu() is a no-op for CPU tensors and makes .numpy() valid
            # for tensors living on another device.
            batch_predictions.append(y_pred.cpu().numpy())

    if not batch_predictions:
        # The np.int alias was removed from NumPy (1.24), so an explicit
        # integer dtype is used for the empty result.
        return np.array([], dtype=np.int64)

    # axis=None flattens each batch before joining, as np.append did.
    return np.concatenate(batch_predictions, axis=None)

In [11]:
# 4. Prediction on the test data
df_test = pd.read_csv(DataPath.TestCsv.value)
# Every column of the test CSV is used as a feature.
X_test = df_test.values

test_dataset = TestDataset(X_test)
# shuffle=False keeps the batches in the row order of the CSV.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

predictions = make_predictions(model, test_loader)

In [12]:
import datetime


def make_submission_file(model, predictions):
    """Write ``predictions`` to a timestamped submission CSV.

    The sample submission at ``DataPath.SubmissionCsv`` is loaded, its
    ``Label`` column is replaced by ``predictions``, and the result is
    saved (without the index) under ``DataPath.Submission`` as
    ``<YYYYmmdd_HHMMSS>_<model class name>.csv``. The saved path is
    printed.

    Args:
        model: Model that produced the predictions; only its class name is
            used, for the file name.
        predictions: Values assigned to the ``Label`` column.
    """
    submit_data = pd.read_csv(DataPath.SubmissionCsv.value)
    submit_data['Label'] = predictions

    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    model_name = model.__class__.__name__

    # to_csv does not create missing parent directories, so make sure the
    # output directory exists instead of failing after training has run.
    submission_dir = DataPath.Submission.value
    submission_dir.mkdir(parents=True, exist_ok=True)

    # e.g. '20201231_174530_MySimpleNet.csv'
    save_submission_path = submission_dir / f'{timestamp}_{model_name}.csv'

    submit_data.to_csv(save_submission_path, index=False)
    print(f'Saved {save_submission_path}')


In [13]:
# Create the submission file
make_submission_file(model, predictions)

Saved ../submissions/20191014_204326_MySimpleNet.csv


おわり