In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
import time
import pandas as pd
%matplotlib inline

In [None]:
# Root directory that contains one sub-folder per split ("train" and "val").
path = "./data"

# Per-channel mean/std used to normalise inputs for torchvision's pretrained models.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# RandomSizedCrop / Scale are deprecated aliases that were removed from
# torchvision; RandomResizedCrop / Resize are their direct replacements.
transform = {
    # Training: random crop + horizontal flip as augmentation.
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize
    ]),
    # Validation: deterministic resize + centre crop.
    'val': transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize
    ]),
}

# One ImageFolder dataset per split, each with its matching transform.
data_image = {x:datasets.ImageFolder(root = os.path.join(path,x),
                                     transform = transform[x])
              for x in ["train", "val"]}

data_loader_image = {x:torch.utils.data.DataLoader(dataset=data_image[x],
                                                num_workers=4,
                                                batch_size=24,
                                                pin_memory=True,
                                                shuffle = True)
                     for x in ["train", "val"]}

# NOTE: len(DataLoader) is the number of *batches* per epoch, not the number
# of images; use len(data_image[x]) when a per-image count is needed.
dataset_sizes = {x: len(data_loader_image[x]) for x in ['train', 'val']}

In [None]:
# Flag read by later cells to decide whether model and batches are moved to CUDA.
use_gpu = torch.cuda.is_available()
print(use_gpu)

In [None]:
# Class labels as discovered by ImageFolder for the training split.
class_names = data_image['train'].classes
# Mapping from class name to the integer label used as the training target.
classes_index = data_image["train"].class_to_idx
print(class_names)

In [None]:
# Report the number of images in the training and validation datasets.
print(u"训练集个数:", len(data_image["train"]))
print(u"验证集个数:", len(data_image["val"]))

In [None]:
def imshow(inp, title=None):
    """Display a normalised image tensor in a new matplotlib figure.

    Args:
        inp: CPU tensor in channel-first layout (it is transposed with
            ``(1, 2, 0)`` before plotting) that was normalised with the
            mean/std constants below.
        title: optional title passed to ``plt.title``.
    """
    inp = inp.numpy().transpose((1, 2, 0))
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    # Undo the normalisation, then clamp: float images handed to
    # plt.imshow must lie in [0, 1], and de-normalised values can fall
    # slightly (or, for unusual inputs, far) outside that range.
    inp = np.clip(std * inp + mean, 0, 1)
    plt.figure(figsize=(17, 10))
    plt.imshow(inp)
    if title is not None:
        plt.title(title)
    

# Fetch one batch from the training loader
inputs, classes = next(iter(data_loader_image['train']))

# Tile the images of the batch into a single grid image
out = torchvision.utils.make_grid(inputs)

# Show the grid, titled with the class name of every image in the batch
imshow(out, title=[class_names[i] for i in classes])



In [None]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """Train ``model`` and return it loaded with its best validation weights.

    Every epoch runs one 'train' pass and one 'val' pass over the
    module-level ``data_loader_image`` dict and prints the per-sample
    average loss and accuracy of each pass.

    Args:
        model: network to optimise. Batches are moved to the device of its
            first parameter (CPU if it has no parameters).
        criterion: callable ``criterion(outputs, labels)``; assumed to return
            the mean loss over the batch (the nn.CrossEntropyLoss default),
            which is why it is re-weighted by the batch size below.
        optimizer: optimizer stepped after every training batch.
        scheduler: learning-rate scheduler stepped once per epoch, after the
            training pass.
        num_epochs: number of epochs to run.

    Returns:
        The same ``model`` object, loaded with the weights from the epoch
        that reached the highest validation accuracy.
    """
    train_start = time.time()

    # state_dict() returns references to the live parameter tensors, so a
    # snapshot has to be deep-copied or it silently follows later updates.
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    for epoch in range(num_epochs):
        # Separate timer so the per-epoch report does not clobber the total.
        epoch_start = time.time()
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training pass followed by a validation pass.
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            model.train(is_train)  # toggles dropout / batch-norm behaviour

            running_loss = 0.0
            running_corrects = 0
            num_samples = 0

            for inputs, labels in data_loader_image[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
                batch_size = inputs.size(0)

                optimizer.zero_grad()

                # Only build the autograd graph while training.
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                # Accumulate per-sample totals so a smaller final batch is
                # weighted correctly.
                running_loss += loss.item() * batch_size
                running_corrects += (preds == labels).sum().item()
                num_samples += batch_size

            # The scheduler must be stepped after the optimizer, once per epoch.
            if is_train:
                scheduler.step()

            # Average over the images actually seen (guard against an empty loader).
            denom = max(num_samples, 1)
            epoch_loss = running_loss / denom
            epoch_acc = float(running_corrects) / denom

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # Keep a snapshot of the best-performing weights.
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

            print()

        now_time = time.time() - epoch_start
        print("Training time is:{:.0f}m {:.0f}s".format(now_time//60, now_time%60))

    time_elapsed = time.time() - train_start
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # Restore the best weights before handing the model back.
    model.load_state_dict(best_model_wts)
    return model


In [None]:
# List the lowercase, non-dunder names exposed by torchvision.models
# (the model constructor functions are among them).
# NOTE(review): `models` is already imported in the first cell; this re-import is redundant.
import torchvision.models as models
model_names = sorted(name for name in models.__dict__ if name.islower() and not name.startswith("__"))
model_names

In [None]:
# Name of the torchvision model constructor to use; also embedded in the
# checkpoint and result file names written by later cells.
arch = 'resnet101'

In [None]:
# Look up the constructor by name and build the network with pretrained weights.
model_conv = models.__dict__[arch](pretrained=True)

In [None]:
# Adapt the pretrained model: freeze the backbone and replace the classifier head.

# Freeze every existing parameter so no gradients are computed for them.
for param in model_conv.parameters():
    param.requires_grad = False

# Newly constructed layers have requires_grad=True by default,
# so only the replacement head below remains trainable.
num_ftrs = model_conv.fc.in_features
model_conv.fc = nn.Linear(num_ftrs, 2) # replace the final fully connected layer with a 2-output head

if use_gpu:
    model_conv = model_conv.cuda()

criterion = nn.CrossEntropyLoss()

# Optimise only the parameters of the final layer
optimizer_conv = optim.SGD(model_conv.fc.parameters(), lr=0.001, momentum=0.9)

# Multiply the learning rate by 0.1 every 7 scheduler steps (one step per epoch)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_conv, step_size=7, gamma=0.1)

In [None]:
# Display the model's repr to inspect the layers, including the new `fc` head.
model_conv

In [None]:
# Train for 25 epochs; train_model returns the model it was given.
model_conv = train_model(model_conv, criterion, optimizer_conv,
                         exp_lr_scheduler, num_epochs=25)

In [None]:
def visualize_model(model, num_images=6):
    """Plot validation images in a two-column grid titled with the predicted class.

    Images are drawn straight onto the subplot axes. Going through the
    module-level ``imshow`` helper would open a fresh figure per image and
    leave the titled subplots empty.

    Args:
        model: trained classifier; inputs are moved to the device of its
            first parameter (CPU if it has no parameters).
        num_images: how many validation images to show. Fewer are shown if
            the validation loader runs out first.
    """
    if num_images <= 0:
        return

    # Same constants used by the Normalize transform; needed to undo it for display.
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])

    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Round up so an odd num_images still fits in the two-column grid.
    rows = (num_images + 1) // 2
    fig = plt.figure(figsize=(8, 4 * rows))
    images_so_far = 0

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs, _ in data_loader_image['val']:
                outputs = model(inputs.to(device))
                _, preds = torch.max(outputs, 1)
                preds = preds.cpu()

                for j in range(inputs.size(0)):
                    images_so_far += 1
                    ax = fig.add_subplot(rows, 2, images_so_far)
                    ax.axis('off')
                    ax.set_title('predicted: {}'.format(class_names[int(preds[j])]))

                    # Channel-first tensor -> H x W x C array, de-normalised and clamped.
                    img = inputs[j].numpy().transpose((1, 2, 0))
                    ax.imshow(np.clip(std * img + mean, 0, 1))

                    if images_so_far == num_images:
                        return
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)


In [None]:
# Show predictions for a few validation images (default num_images=6).
visualize_model(model_conv)

In [None]:
# Persist only the weights (state_dict); the file name embeds the architecture name.
torch.save(model_conv.state_dict(), "model_%s_finetune.pkl" % arch)

In [None]:
# Reload the weights saved above into the fine-tuned network. The notebook
# never defines a variable called `model`; the network is `model_conv`.
model_conv.load_state_dict(torch.load('model_%s_finetune.pkl' % arch))

In [None]:
# NOTE(review): lists every attribute of the torch module; looks like leftover
# exploration rather than part of the pipeline — consider removing.
dir(torch)

In [None]:
class ImageFolder(datasets.ImageFolder):
    """ImageFolder variant whose items also carry the path of the source file."""

    def __getitem__(self, index):
        """Load one sample.

        Args:
            index (int): position of the sample in ``self.imgs``.

        Returns:
            tuple: ``(image, target, path)`` where ``target`` is the class
            index of the sample and ``path`` is the file it was loaded from.
        """
        file_path, label = self.imgs[index]
        image = self.loader(file_path)

        if self.transform is not None:
            image = self.transform(image)

        if self.target_transform is not None:
            label = self.target_transform(label)

        return image, label, file_path
    

In [None]:
# Test images go through the validation transform; the path-aware
# ImageFolder subclass is used so every item also reports its file path.
data_test_img = ImageFolder(root="./test/", transform=transform['val'])

# One image per batch, default (unshuffled) order.
data_loader_test_img = torch.utils.data.DataLoader(
    dataset=data_test_img,
    batch_size=1,
)

In [None]:
# Peek at the first batch of the test loader: a = image tensor, b = label,
# c = file path (the three values returned by the ImageFolder subclass).
for a,b,c in data_loader_test_img:
    print(a)
    print(b)
    print(c)
    break

In [None]:
import collections  # not referenced in this cell; kept because other cells may rely on it

# Probabilities are clamped to [clip, 1 - clip] so no prediction is exactly 0 or 1.
clip = 0.005
cnt = 1  # not referenced in this cell
csv_map = {}

# Inference setup: eval mode fixes dropout / batch-norm behaviour, and the
# inputs follow the model's device instead of assuming CUDA is present.
model_conv.eval()
device = next(model_conv.parameters()).device

with torch.no_grad():
    for image, label, path in data_loader_test_img:
        y_pred = model_conv(image.to(device))
        # Explicit dim: softmax over the class axis of the (batch, classes) output.
        smax_out = torch.softmax(y_pred, dim=1)[0]

        # NOTE(review): assumes class index 0 is "cat" and 1 is "dog" —
        # confirm against data_image['train'].class_to_idx.
        cat_prob = smax_out[0].item()
        dog_prob = smax_out[1].item()
        prob = dog_prob

        if cat_prob > dog_prob:
            prob = 1 - cat_prob
        prob = np.around(prob, decimals=4)
        prob = np.clip(prob, clip, 1-clip)

        # File name without directory or extension, e.g. "./test/x/12.jpg" -> "12".
        filepath = os.path.splitext(os.path.basename(path[0]))[0]

        csv_map[filepath] = prob
    


In [None]:
# Flatten the id -> probability mapping into a list of (id, probability) tuples.
csv_list = list(csv_map.items())

len(csv_list)

In [None]:
# Build the submission table: integer ids, rows ordered by id.
df = pd.DataFrame(csv_list)
df.columns = ['id', 'label']
df = df.assign(id=df['id'].astype(int)).sort_values(by='id')

In [None]:
# Write the table to CSV without the DataFrame index; the file name embeds the architecture name.
df.to_csv('result_%s.csv' % arch,index=False)

In [None]:
# Echo the name of the CSV file written above.
'result_%s.csv' % arch

In [None]:
# Count rows per id.
# NOTE(review): presumably a sanity check that every id appears exactly once — confirm.
df.groupby('id').count()