In [2]:
import os

from config import *
from utils import *
from data import Fashion_attr_prediction
from net import f_model, c_model, p_model

from torch.autograd import Variable
import torch.nn as nn

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

In [2]:
class FeatureExtractor(nn.Module):
    """Bundle the deep, color and pooling networks into one feature extractor.

    The three sub-modules are stored as attributes and switched to eval mode
    when the extractor is constructed.
    """

    def __init__(self, deep_module, color_module, pooling_module):
        super(FeatureExtractor, self).__init__()
        self.deep_module = deep_module
        self.color_module = color_module
        self.pooling_module = pooling_module
        for sub_module in (self.deep_module, self.color_module, self.pooling_module):
            sub_module.eval()

    def forward(self, x):
        """Return ``(deep_features, color_features)`` for the batch ``x``.

        ``deep_features`` is the second output of ``deep_module`` converted to
        a numpy array. ``color_features`` is a list with one flat numpy vector
        per sample: the color-map values at the ``COLOR_TOP_N`` positions that
        received the largest pooling weights.
        """
        cls, feat, conv_out = self.deep_module(x)
        color_maps = self.color_module(x).cpu().data.numpy()  # N * C * 7 * 7
        attention = self.pooling_module(conv_out).cpu().data.numpy()  # N * 1 * 7 * 7
        n_channels = color_maps.shape[1]

        def select_colors(n):
            # Flatten the spatial weights of sample n and pick the positions
            # holding the COLOR_TOP_N largest values. argpartition only
            # guarantees membership of that tail slice, not a sorted order.
            flat_weight = attention[n].reshape(-1)
            top_idx = np.argpartition(flat_weight, -COLOR_TOP_N)[-COLOR_TOP_N:][::-1]
            # One row per channel, one column per spatial position.
            per_channel = color_maps[n].reshape(n_channels, -1)
            return per_channel[:, top_idx].reshape(-1)

        color_feats = [select_colors(n) for n in range(cls.size(0))]
        return feat.cpu().data.numpy(), color_feats

In [3]:
# Instantiate the three networks on the GPU; the main model is built from the
# path given by DUMPED_MODEL.
main_model = f_model(model_path=DUMPED_MODEL).cuda()
color_model = c_model().cuda()
pooling_model = p_model().cuda()
# `extractor` is read as a global by dump_dataset() below.
extractor = FeatureExtractor(main_model, color_model, pooling_model)

Downloading: "https://download.pytorch.org/models/resnet50-19c8e357.pth" to /root/.torch/models/resnet50-19c8e357.pth
102502400it [00:02, 41395473.50it/s]


## Extract deep features and color features from the trained model

In [4]:
def dump_dataset(loader, deep_feats, color_feats, labels):
    """Run the global ``extractor`` over ``loader`` and collect the results.

    The three list arguments are extended in place, one entry per sample:
    the squeezed deep feature, the color feature, and the sample's path.
    Progress is printed every ``LOG_INTERVAL`` batches.
    """
    for batch_idx, (data, data_path) in enumerate(loader):
        batch_deep, batch_color = extractor(Variable(data).cuda())

        # Walk the batch sample by sample, driven by the paths the loader yielded.
        for path, deep_row, color_row in zip(data_path, batch_deep, batch_color):
            deep_feats.append(deep_row.squeeze())
            color_feats.append(color_row)
            labels.append(path)

        if batch_idx % LOG_INTERVAL == 0:
            processed = batch_idx * EXTRACT_BATCH_SIZE
            print("{} / {}".format(processed, len(loader.dataset)))

In [5]:
def dump(pca=False, n_components=30):
    """Extract features for the whole dataset and save them under ``output/``.

    Three files are written:
      * ``all_feat_pca.npy`` (when ``pca`` is truthy) or ``all_feat.npy``
        (otherwise) -- the deep features,
      * ``all_color_feat.npy`` -- the color features,
      * ``all_feat.list`` -- one sample path per line.

    Args:
        pca: when truthy, the deep features are min-max scaled to [-1, 1] and
            reduced with PCA before saving; otherwise they are saved at full
            dimensionality.
        n_components: number of PCA components kept when ``pca`` is truthy.
    """
    out_dir = 'output'
    # Create the target directory up front so a missing folder is not only
    # discovered after the whole dataset has been processed.
    os.makedirs(out_dir, exist_ok=True)

    # The deep-feature file name depends on the mode, so a full-feature dump
    # does not overwrite the PCA dump and load_feat_db() can find all_feat.npy.
    feat_name = 'all_feat_pca.npy' if pca else 'all_feat.npy'
    feat_all = os.path.join(out_dir, feat_name)
    color_feat_all = os.path.join(out_dir, 'all_color_feat.npy')
    feat_list = os.path.join(out_dir, 'all_feat.list')

    all_loader = torch.utils.data.DataLoader(
        Fashion_attr_prediction(type="all", transform=data_transform_test),
        batch_size=EXTRACT_BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=True
    )
    deep_feats, color_feats, labels = [], [], []
    dump_dataset(all_loader, deep_feats, color_feats, labels)

    if pca:
        # Reduce dimensionality on deep features
        scaler = MinMaxScaler(feature_range=(-1, 1))
        deep_feats_rescaled = scaler.fit_transform(deep_feats)
        # Named `reducer` so the `pca` flag is not shadowed by the estimator.
        reducer = PCA(n_components=n_components)
        deep_feats_out = reducer.fit_transform(deep_feats_rescaled)
    else:
        deep_feats_out = np.vstack(deep_feats)

    with open(feat_list, "w") as fw:
        fw.write("\n".join(labels))
    np.save(feat_all, deep_feats_out)
    np.save(color_feat_all, np.vstack(color_feats))
    print("Dumped to {}, all_color_feat.npy and all_feat.list.".format(feat_name))

### Option 1. (Most efficient) PCA + Naive Query (in the 03_retrieval notebook)
#### Call `dump(pca=True)` to save the deep features with dimensionality reduction

In [7]:
# Extract features for the whole dataset and save PCA-reduced deep features.
dump(pca=True)

0 / 139709
12800 / 139709
25600 / 139709
38400 / 139709
51200 / 139709
64000 / 139709
76800 / 139709
89600 / 139709
102400 / 139709
115200 / 139709
128000 / 139709
Dumped to all_feat_pca.npy, all_color_feat.npy and all_feat.list.


### Option 2. (Less efficient) Full Features + KMeans Query
#### Call `dump(pca=False)` to save the full-dimensional deep features

In [5]:
from sklearn.cluster import KMeans
from sklearn.externals import joblib

In [9]:
# Extract features for the whole dataset again, keeping the deep features at full dimensionality.
dump(pca=False)

0 / 139709
12800 / 139709
25600 / 139709
38400 / 139709
51200 / 139709
64000 / 139709
76800 / 139709
89600 / 139709
102400 / 139709
115200 / 139709
128000 / 139709
Dumped to all_feat.npy, all_color_feat.npy and all_feat.list.


In [3]:
@timer_with_task("Loading feature database")
def load_feat_db():
    """Load the saved feature database from ``FEATURES_DIR``.

    Returns:
        ``(deep_feats, color_feats, labels)`` where the first two are the
        arrays loaded from ``all_feat.npy`` and ``all_color_feat.npy`` and
        ``labels`` is the list of stripped lines of ``all_feat.list``.
        Returns ``None`` (after printing a hint) if any of the three files
        is missing.
    """
    list_path = os.path.join(FEATURES_DIR, 'all_feat.list')
    deep_path = os.path.join(FEATURES_DIR, 'all_feat.npy')
    color_path = os.path.join(FEATURES_DIR, 'all_color_feat.npy')

    # All three files must be present before anything is loaded.
    required_files = (list_path, deep_path, color_path)
    if not all(os.path.isfile(path) for path in required_files):
        print("No feature db file! Please run feature_extraction notebook first.")
        return None

    deep_feats = np.load(deep_path)
    color_feats = np.load(color_path)
    with open(list_path) as f:
        labels = [line.strip() for line in f]
    return deep_feats, color_feats, labels

### Fit the KMeans clustering model and save it

In [4]:
# Load the saved deep features, color features and labels back from disk.
# NOTE(review): load_feat_db() has a bare `return` when a file is missing; if the
# decorator passes that None through, this unpacking raises TypeError.
feats, color_feats,labels = load_feat_db()

Loading feature database...
Loading feature database Done. Time: 2.346 sec


In [7]:
# Cluster the deep features into N_CLUSTERS groups and persist the fitted model.
# NOTE(review): OUTPUT_DIR is assigned in a cell that appears *after* this one in
# the notebook (and may also come from `from config import *`); confirm it is
# defined before this cell on a fresh top-to-bottom run.
model = KMeans(n_clusters=N_CLUSTERS, random_state=0, n_jobs=-1).fit(feats)
model_path = os.path.join(OUTPUT_DIR, r'kmeans.m')
joblib.dump(model, model_path)

['output/kmeans.m']

In [6]:
# Directory the KMeans model is written to.
# NOTE(review): this cell sits below the cell that reads OUTPUT_DIR although its
# execution count is lower; move it above that cell so Run All works in order.
OUTPUT_DIR = "output/"