# Kaggle Fashion MNIST

## Либы

In [2]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from tqdm import tqdm

## Данные

In [4]:

# Load the training CSV (contains a `label` column used further below).
train_csv_path = "dataset/fmnist_train.csv"
train_df = pd.read_csv(train_csv_path)

In [5]:
# Total number of missing cells across the whole frame.
missing_cells = train_df.isna().to_numpy().sum()
print(missing_cells)

424


В данных есть пропуски: 424 значения `NaN`. Проверим, сколько строк будет потеряно при их удалении.

In [6]:
# Compare the frame's shape before and after discarding rows with any NaN.
train_df_cleaned = train_df.dropna()
print(train_df.shape, train_df_cleaned.shape, sep="\n")

(17040, 786)
(17039, 786)


In [7]:
# Per-class row counts in the raw frame (displayed as the cell result).
train_df.loc[:, "label"].value_counts()

label
0    1770
7    1761
3    1725
6    1704
1    1700
5    1695
9    1694
2    1677
8    1675
4    1639
Name: count, dtype: int64

In [8]:
# Per-class row counts once rows containing NaN have been dropped.
train_df_cleaned.loc[:, "label"].value_counts()

label
0    1770
7    1761
3    1725
6    1704
1    1700
9    1694
5    1694
2    1677
8    1675
4    1639
Name: count, dtype: int64

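Удаление строк с `NaN` убирает всего одну строку (класс 5: 1695 → 1694), распределение классов практически не меняется, поэтому такие строки можно отбросить.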

In [9]:
# Make the NaN-free frame the working copy; the temporary comparison frame
# is no longer needed afterwards.
train_df = train_df.dropna(axis=0, how="any")
del train_df_cleaned

## Кастомный Dataset под Fashion MNIST

In [10]:
class FashionMNISTDataset(Dataset):
    """In-memory image dataset built from a Fashion-MNIST style DataFrame.

    Every column other than the identifier column (and, for training data,
    the ``label`` column) is treated as pixel data. Each row is reshaped to
    28x28, cast to float32, divided by 255 and stored with a leading channel
    axis, so one sample has shape ``(1, 28, 28)``.

    Args:
        dataframe: Frame with one row per image. The identifier column may
            be spelled ``id`` or ``Id`` (or be absent); when ``train`` is
            true a ``label`` column is required.
        train: When true, samples are ``(image, label)`` pairs; otherwise
            only the image tensor is returned and ``labels`` is ``None``.

    Raises:
        KeyError: If ``train`` is true and there is no ``label`` column.
    """

    # Accepted spellings of the identifier column, so a raw CSV with an
    # ``Id`` header can be used without renaming it first.
    ID_COLUMNS = ("id", "Id")

    def __init__(self, dataframe, train=True):
        # Collect the columns that are NOT pixels; only identifier columns
        # that are actually present are dropped.
        non_pixel = [col for col in dataframe.columns if col in self.ID_COLUMNS]
        if train:
            non_pixel.append("label")

        pixels = dataframe.drop(columns=non_pixel).values
        pixels = pixels.reshape(-1, 28, 28).astype("float32") / 255.0
        self.images = torch.tensor(pixels).unsqueeze(1)  # (N, 1, 28, 28)
        self.labels = torch.tensor(dataframe["label"].values).long() if train else None
        self.train = train

    def __len__(self):
        """Return the number of images."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(image, label)`` in train mode, otherwise just ``image``."""
        if self.train:
            return self.images[idx], self.labels[idx]
        return self.images[idx]

## Код модельки

In [11]:
class SimpleCNN(nn.Module):
    """Small two-block convolutional classifier for 1x28x28 inputs.

    Two conv(3x3, padding=1) -> ReLU -> 2x2 max-pool stages reduce the
    spatial size 28 -> 14 -> 7, followed by a 128-unit hidden layer and a
    linear output layer producing one logit per class.

    Args:
        num_classes: Size of the output layer. Defaults to 10, which keeps
            the original architecture.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        # Layer order is kept flat inside one Sequential so state_dict keys
        # (network.0, network.3, network.7, network.9) stay unchanged.
        self.network = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),  # 14x14
            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),  # 7x7
            nn.Flatten(),
            nn.Linear(64 * 7 * 7, 128), nn.ReLU(),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        """Return raw class logits of shape ``(batch, num_classes)``."""
        return self.network(x)

## Обучим модельку

In [12]:
# Normalise the identifier column name to lowercase `id`, which is what
# FashionMNISTDataset drops by name. `rename` is a no-op when `Id` is already
# gone, so re-running this cell does not raise KeyError.
train_df = train_df.rename(columns={"Id": "id"})

In [13]:
# Wrap the cleaned training frame in the tensor-backed dataset.
dataset = FashionMNISTDataset(train_df)

# 90/10 train/validation split. The generator is seeded so the same rows end
# up in the validation set on every run and accuracies stay comparable.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
split_generator = torch.Generator().manual_seed(42)
train_data, val_data = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

# Only the training loader shuffles; validation order does not matter.
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(val_data, batch_size=64)

model = SimpleCNN()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [14]:
for epoch in tqdm(range(10), total=10):
    # --- optimisation pass over the training split ---
    model.train()
    for x_batch, y_batch in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(x_batch), y_batch)
        loss.backward()
        optimizer.step()

    # --- validation: gather predictions and targets, then score once ---
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for x_batch, y_batch in val_loader:
            all_labels += list(y_batch.numpy())
            all_preds += list(model(x_batch).argmax(dim=1).numpy())
    acc = accuracy_score(all_labels, all_preds)
    print(f"Epoch {epoch+1}, val accuracy: {acc:.4f}")


 10%|█         | 1/10 [00:14<02:07, 14.16s/it]

Epoch 1, val accuracy: 0.8316


 20%|██        | 2/10 [00:29<01:58, 14.80s/it]

Epoch 2, val accuracy: 0.8498


 30%|███       | 3/10 [00:46<01:49, 15.66s/it]

Epoch 3, val accuracy: 0.8773


 40%|████      | 4/10 [01:01<01:32, 15.50s/it]

Epoch 4, val accuracy: 0.8768


 50%|█████     | 5/10 [01:14<01:13, 14.78s/it]

Epoch 5, val accuracy: 0.8955


 60%|██████    | 6/10 [01:28<00:57, 14.35s/it]

Epoch 6, val accuracy: 0.8891


 70%|███████   | 7/10 [01:41<00:42, 14.07s/it]

Epoch 7, val accuracy: 0.8914


 80%|████████  | 8/10 [01:55<00:27, 13.93s/it]

Epoch 8, val accuracy: 0.8903


 90%|█████████ | 9/10 [02:08<00:13, 13.79s/it]

Epoch 9, val accuracy: 0.8914


100%|██████████| 10/10 [02:22<00:00, 14.26s/it]

Epoch 10, val accuracy: 0.8856





## Получаем предсказания на тестовой выборке

In [15]:
# Load the test CSV and normalise the identifier column name to lowercase
# `id`. `rename` is a no-op if `Id` is absent, so the cell is safe to re-run.
test_df = pd.read_csv('dataset/fmnist_test.csv')
test_df = test_df.rename(columns={"Id": "id"})

In [16]:
# Test data has no labels: the dataset yields image tensors only, and the
# loader keeps the original row order (no shuffling).
test_dataset = FashionMNISTDataset(dataframe=test_df, train=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=64)

In [17]:
import os

# Put the model in inference mode explicitly rather than relying on the state
# left behind by the last training epoch.
model.eval()
all_preds = []

with torch.no_grad():
    for batch in test_loader:
        preds = model(batch).argmax(dim=1)
        all_preds.extend(preds.cpu().numpy())

# Predictions are in loader order, which matches the row order of test_df
# because the test loader does not shuffle.
submission = pd.DataFrame({
    "id": test_df["id"],
    "label": all_preds
})

# to_csv does not create missing parent directories.
os.makedirs("submission", exist_ok=True)
submission.to_csv("submission/submission.csv", index=False)

Отправим результат в Kaggle