## t-SNE 表征学习
- t-SNE
- 聚类结果可视化
- sklearn.manifold.TSNE

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from sklearn.manifold import TSNE
from torch import nn
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [None]:
# Preprocessing applied to every image: convert to a tensor, then normalize
# with mean 0.1307 and std 0.3081.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
        # transforms.Grayscale(3),
        # transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))  # map the (0, 1) range to (-1, 1)
    ]
)

# dataset1 is the training split (downloaded on demand); dataset2 is the test
# split, read from the same root without triggering a download.
dataset1 = torchvision.datasets.MNIST(
    root="../../data",
    train=True,
    download=True,
    transform=transform,
)
dataset2 = torchvision.datasets.MNIST(
    root="../../data",
    train=False,
    download=False,
    transform=transform,
)

# Leave DataLoader's num_workers unset here; on CPU: num_workers=2.
train_loader: DataLoader = DataLoader(dataset=dataset1, batch_size=64, shuffle=True)
test_loader: DataLoader = DataLoader(dataset=dataset2, batch_size=1000, shuffle=True)

In [3]:
def train(
    model: nn.Module,
    device,
    train_loader,
    optimizer: torch.optim.Optimizer,
    epoch,
    log_interval: int = 10,
):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Network to optimize. Its output is fed to ``F.nll_loss``, which
            expects log-probabilities (e.g. the result of ``log_softmax``).
        device: Device that each batch is moved to before the forward pass.
        train_loader: Iterable of ``(data, target)`` batches that also exposes
            ``__len__`` and a ``.dataset`` attribute (used only for logging).
        optimizer: Optimizer that updates the parameters of ``model``.
        epoch: Epoch number; only used in the progress message.
        log_interval: Print a progress line every ``log_interval`` batches.
            A value <= 0 disables progress output. Defaults to 10.
    """
    model.train()

    # Loop-invariant sizes used only by the progress message.
    num_batches = len(train_loader)
    num_samples = len(train_loader.dataset)

    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)

        # Gradients accumulate by default, so clear them before each step.
        optimizer.zero_grad()

        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()

        optimizer.step()

        if log_interval > 0 and batch_idx % log_interval == 0:
            # Example: Train Epoch: 2 [16640/60000 (28%)]	Loss: 0.142410
            # The sample counter is batch_idx * current batch size, i.e. the
            # number of samples seen before this batch.
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch,
                batch_idx * len(data), num_samples, 100. * batch_idx / num_batches,
                loss.item()))


def test(model: nn.Module, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.argmax(dim=1, keepdims=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    # Test set: Average loss: 0.0483, Accuracy: 9849/10000 (98%)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss,
        correct, len(test_loader.dataset), 100. * correct / len(test_loader.dataset)))

In [None]:
# Fit t-SNE on a feature matrix and persist the resulting embedding to disk.
# NOTE(review): `features = 123` looks like a placeholder — a scalar is not a
# valid feature matrix and pd.DataFrame(123) is expected to raise ValueError.
# Presumably this should be an (n_samples, n_features) array of learned
# representations; confirm the intended source before running this cell.
features = 123
data = pd.DataFrame(features)
# TSNE() is constructed with library defaults (no arguments passed).
tsne = TSNE()
# The return value is discarded; the fitted embedding is read back from
# `tsne.embedding_` on the next line.
tsne.fit_transform(data)
# Rebinds `data` to the embedding, keeping the row index of the input frame.
data = pd.DataFrame(tsne.embedding_, index=data.index)
data.to_pickle('./tsne.pkl')

In [None]:
# Load the embedding saved to ./tsne.pkl and draw it as a scatter plot with
# one color per digit class.
tsne_data = pd.read_pickle('./tsne.pkl')

# MNIST labels are already integer class ids, so they can be used as colors
# directly (no one-hot decoding, no rescaling). `targets` replaces the
# deprecated `test_labels` alias.
# NOTE(review): labels are taken from dataset1 as in the original code; they
# must correspond row-for-row to the samples that were embedded — confirm
# which split produced tsne.pkl.
y_data = np.asarray(dataset1.targets)

# Columns 0 and 1 of the loaded frame are the two embedding coordinates
# (the original indexed the TSNE estimator object instead of this frame).
plt.scatter(tsne_data[0], tsne_data[1], c=y_data, s=1, cmap=plt.get_cmap('jet', 10))
plt.colorbar(ticks=range(10))
# Color limits of -0.5..9.5 with a 10-color map give each integer label its own bin.
plt.clim(-.5, 9.5)
plt.show()