# Dataset Description
__Fashion-MNIST__ – аналог знаменитого датасета _MNIST_, который состоит не из рукописных символов, а из предметов одежды. Он был создан, чтобы актуализировать учебные задачи в компьютерном зрении и приблизить их к реальности.

Каждое изображение представляет собой набор из 784 пикселей (28x28). Каждый пиксель принимает значение от 0 до 255, обозначая его яркость в оттенках серого. Обучающая выборка содержит 786 столбцов, где первый (`label`) – принадлежность изображения к классу, а последний (`Id`) – его уникальный номер. Остальные столбцы – пиксели изображения.
Проведите первичный анализ данных, чтобы определить наличие NaN значений.
Тестовая выборка (на основе которой должно быть получено предсказание) содержит все те же столбцы, кроме `label`.

Расшифровка классов (`label`), к которым принадлежат изображения:

0 T-shirt/top – футболка  
1 Trouser – брюки  
2 Pullover – свитер  
3 Dress – платье  
4 Coat – пальто  
5 Sandal – сандалия  
6 Shirt – рубашка  
7 Sneaker – кроссовок  
8 Bag – сумка  
9 Ankle boot – сапог  
  
Структура данных:

`fmnist_train.csv` – обучающая выборка  
`fmnist_test.csv` – тестовая выборка (на которой должно быть получено предсказание)  
`sample_submission.csv` – пример решения

In [16]:
import numpy as np
import pandas as pd

import torch
from torch import nn

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [39]:
# Load the labelled training set, the unlabelled test set and the
# example submission file from the working directory.
train_df = pd.read_csv('fmnist_train.csv')
test_df = pd.read_csv('fmnist_test.csv')
sample_df = pd.read_csv('sample_submission.csv')

In [None]:
# Bare expression: the notebook renders a preview of the training frame.
train_df

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784,Id
0,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,9,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,6,0,0,0,0,0,0,0,5,0,...,0.0,0.0,30.0,43.0,0.0,0.0,0.0,0.0,0.0,2
3,0,0,0,0,1,2,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3
4,3,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17035,4,0,0,0,0,0,0,0,1,0,...,186.0,16.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,17035
17036,4,0,0,0,0,0,0,5,4,3,...,0.0,33.0,48.0,37.0,3.0,0.0,0.0,0.0,0.0,17036
17037,4,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,17037
17038,6,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17038


In [None]:
# Count how many columns there are of each dtype.
train_df.dtypes.value_counts()

float64    424
int64      362
Name: count, dtype: int64

In [None]:
# Number of NaN values in every column, shown with the largest counts first.
nan_counts = train_df.isna().sum()
nan_counts.sort_values(ascending=False)

pixel784    1
pixel768    1
pixel767    1
pixel766    1
pixel765    1
           ..
pixel15     0
pixel14     0
pixel13     0
pixel12     0
pixel11     0
Length: 786, dtype: int64

In [None]:
# Boolean mask marking every row that has at least one NaN value.
rows_with_nan = train_df.isna().any(axis=1)

print(f'Rows with NaN: {rows_with_nan.sum()}')
# Show the affected rows themselves.
train_df[rows_with_nan]

Rows with NaN: 1


Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784,Id
17039,5,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,17039


В обучающем датасете (`train`) 424 признака содержат NaN. Все эти пропуски относятся к одной записи (`Id` = 17039), поэтому она подлежит исключению.

In [None]:
# Drop every row that contains at least one NaN value.
train_df = train_df.dropna()

In [None]:
# Separate the pixel features from the target. `Id` is only a row identifier,
# so it is dropped together with `label`.
X_ = train_df.drop(columns=['label', 'Id'])
y = train_df['label']

# Report the value range of the raw (unscaled) features. This must read `X_`:
# `X` is only created later by the normalisation step, so referencing it here
# raises NameError on a clean top-to-bottom run.
print(f'Крайние значения фичей: \nmin - {X_.min().min()}\nmax - {X_.max().max()}')

Крайние значения фичей: 
min - 0.0
max - 255.0


Как и ожидалось, все значения пикселей лежат от 0 до 255. Нормализуем их.

In [24]:
# Scale pixel intensities from the 0..255 range down to 0..1.
X = X_ / 255

In [None]:
# Convert the pandas objects to tensors; every image becomes a
# single-channel 28x28 map, i.e. the feature tensor is (N, 1, 28, 28).
pixel_values = X.values
label_values = y.values
X = torch.tensor(pixel_values, dtype=torch.float32).reshape(-1, 1, 28, 28)
y = torch.tensor(label_values, dtype=torch.long)

# Hold out 20% of the labelled data for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Wrap the tensor pairs into datasets.
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
test_dataset = torch.utils.data.TensorDataset(X_test, y_test)

# Batch them: the training loader reshuffles, the evaluation loader keeps order.
batch_size = 32
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True
)
test_data_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False
)

In [26]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using {device} device")

Using cuda device


In [None]:
# Baseline MLP: three fully connected layers with ReLU between them.
class NeuralNetwork(nn.Module):
    """Multilayer perceptron that maps a flattened image to class logits.

    The stack is Linear -> ReLU -> Linear -> ReLU -> Linear. With the default
    arguments the layer sizes are 784 -> 512 -> 512 -> 10.

    Args:
        in_features: Number of values per sample after flattening
            (28 * 28 for a single-channel 28x28 image).
        hidden_features: Width of both hidden layers.
        num_classes: Number of output logits.
    """

    def __init__(
        self,
        in_features: int = 28 * 28,
        hidden_features: int = 512,
        num_classes: int = 10,
    ) -> None:
        super().__init__()
        # Collapses every dimension except the batch one.
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return raw, unnormalised logits of shape (batch, num_classes)."""
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits


# Instantiate the network and move it to the selected device.
model = NeuralNetwork().to(device)
# Bare expression: the notebook renders the module structure as cell output.
model

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)

In [None]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one pass over ``dataloader``, updating ``model`` after each batch.

    Batches are moved to the module-level ``device``. The loss of every
    100th batch is printed together with the number of samples seen so far.
    """
    size = len(dataloader.dataset)
    model.train()

    for step, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        batch_loss = loss_fn(model(inputs), targets)

        # Back-propagate to obtain gradients for the model parameters.
        batch_loss.backward()
        # Apply the parameter update ...
        optimizer.step()
        # ... then clear the gradients so they do not leak into the next batch.
        optimizer.zero_grad()

        # Progress is only reported on every 100th batch.
        if step % 100 != 0:
            continue
        loss = batch_loss.item()
        curr = (step + 1) * len(inputs)
        print(f'loss: {loss:>7f} [{curr:>5d}/{size:>5d}]')


def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and print accuracy and mean loss.

    The loss is averaged over batches; accuracy is the share of correctly
    classified samples in ``dataloader.dataset``. Batches are moved to the
    module-level ``device`` and no gradients are tracked.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()

    loss_total = 0
    hits = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            logits = model(inputs)
            loss_total += loss_fn(logits, targets).item()
            # The predicted class is the index of the largest logit.
            matches = logits.argmax(1) == targets
            hits += matches.type(torch.float).sum().item()

    test_loss = loss_total / num_batches
    correct = hits / size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [None]:
# Training hyper-parameters; the learning rate was tuned by hand.
num_epochs = 20
learning_rate = 0.0105

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Each epoch is one training pass followed by an evaluation on the
# held-out split.
for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}\n=============================')
    train(train_data_loader, model, loss_fn, optimizer)
    test(test_data_loader, model, loss_fn)
print('Done!')

Epoch 1
loss: 0.594981 [   32/13631]
loss: 0.469878 [ 3232/13631]
loss: 0.600874 [ 6432/13631]
loss: 0.372308 [ 9632/13631]
loss: 0.237030 [12832/13631]
Test Error: 
 Accuracy: 85.2%, Avg loss: 0.429552 

Epoch 2
loss: 0.289967 [   32/13631]
loss: 0.503066 [ 3232/13631]
loss: 0.381106 [ 6432/13631]
loss: 0.285540 [ 9632/13631]
loss: 0.440506 [12832/13631]
Test Error: 
 Accuracy: 85.2%, Avg loss: 0.432136 

Epoch 3
loss: 0.196587 [   32/13631]
loss: 0.339373 [ 3232/13631]
loss: 0.309122 [ 6432/13631]
loss: 0.332502 [ 9632/13631]
loss: 0.713636 [12832/13631]
Test Error: 
 Accuracy: 85.9%, Avg loss: 0.416777 

Epoch 4
loss: 0.444905 [   32/13631]
loss: 0.215707 [ 3232/13631]
loss: 0.175055 [ 6432/13631]
loss: 0.414788 [ 9632/13631]
loss: 0.457021 [12832/13631]
Test Error: 
 Accuracy: 85.4%, Avg loss: 0.420158 

Epoch 5
loss: 0.410168 [   32/13631]
loss: 0.333813 [ 3232/13631]
loss: 0.244713 [ 6432/13631]
loss: 0.787889 [ 9632/13631]
loss: 0.269218 [12832/13631]
Test Error: 
 Accuracy: 85.

___
# Предсказания

In [40]:
# NOTE: `X_test`, `test_dataset` and `test_data_loader` are rebound here to the
# unlabelled submission data, replacing the held-out validation objects
# created earlier in the notebook.

# Keep the identifiers for the submission file; everything else is pixels.
test_ids = test_df['Id']

# Same preprocessing as for training: scale by 255, then reshape each row
# into a single-channel 28x28 image tensor.
test_pixels = test_df.drop(columns=['Id']) / 255
X_test = torch.tensor(test_pixels.values, dtype=torch.float32).reshape(-1, 1, 28, 28)

# Features only (no labels); order is preserved so predictions align with
# `test_ids`.
test_dataset = torch.utils.data.TensorDataset(X_test)
test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
)

In [41]:
# Inference: evaluation mode, no gradient tracking.
model.eval()

with torch.no_grad():
    # Each dataset item is a 1-tuple, so the features sit at index 0.
    # The predicted class is the arg-max over the logits, moved back to CPU.
    per_batch_labels = [
        model(batch[0].to(device)).argmax(dim=1).cpu()
        for batch in test_data_loader
    ]

# Join the per-batch results into one flat NumPy array.
predicted_labels = torch.cat(per_batch_labels).numpy()

In [None]:
# Pair every test identifier with its predicted class.
submission_columns = {'Id': test_ids, 'label': predicted_labels}
submission_df = pd.DataFrame(submission_columns)

# Show the first rows in the notebook output.
display(submission_df.head())

# Write the result without the DataFrame index column.
submission_df.to_csv('predicted_labels.csv', index=False)

Unnamed: 0,Id,label
0,0,0
1,1,1
2,2,2
3,3,2
4,4,4
