# Data Preparation
## Image Tag Vector

In [18]:
import os
from sets import Set
import numpy as np

# Build the tag vocabulary: every distinct tag found in the training tag files.
# Each line is split on ':' and the second field is taken as the tag.
allTagFiles = [filename for filename in os.listdir('data/tags_train') if filename.endswith('.txt')]
tags = set()  # built-in set; the `sets` module is deprecated and absent in Python 3
for tagFile in allTagFiles:
    # Read-only access is sufficient, and `with` guarantees the handle is closed.
    with open(os.path.join('data/tags_train', tagFile), 'r') as f:
        for line in f:
            fields = line.split(':')
            if len(fields) < 2:
                continue  # blank or colon-less line: there is no tag field to read
            tag = fields[1]
            if tag.endswith('\n'):
                tag = tag[:-1]
            tags.add(tag)
# Sort so the column order of every tag vector is reproducible across runs
# (set iteration order is not guaranteed to be stable).
tagList = sorted(tags)

def getTagVecs(tagPath, tagList):
    """Build one tag-count vector per ``.txt`` file found in ``tagPath``.

    Each line of a tag file is split on ':' and its second field (minus a
    trailing newline) is treated as the tag. The vector for a file has one
    slot per entry of ``tagList``; a slot holds how many lines of the file
    carried that tag.

    Parameters
    ----------
    tagPath : str
        Directory containing the tag files.
    tagList : list
        Vocabulary; position ``i`` of every returned vector counts ``tagList[i]``.

    Returns
    -------
    list of list of int
        One count vector per tag file, in sorted file-name order.

    Notes
    -----
    Lines without a ':' and tags that are not in ``tagList`` are skipped
    instead of raising, so a split containing unseen tags can still be
    vectorised.
    """
    # First position of each tag (the same column tagList.index would return),
    # precomputed so every lookup is O(1) instead of a list scan.
    tagColumn = {}
    for column, knownTag in enumerate(tagList):
        tagColumn.setdefault(knownTag, column)

    # os.listdir yields names in arbitrary order; sorting makes row order reproducible.
    # NOTE(review): confirm this row order matches the feature rows these vectors are paired with.
    tagFiles = sorted(name for name in os.listdir(tagPath) if name.endswith('.txt'))

    tagVecs = []
    for tagFile in tagFiles:
        tagVec = [0] * len(tagList)
        # Read-only mode is enough; `with` closes the handle deterministically.
        with open(os.path.join(tagPath, tagFile), 'r') as f:
            for line in f:
                fields = line.split(':')
                if len(fields) < 2:
                    continue  # blank or colon-less line
                tag = fields[1]
                if tag.endswith('\n'):
                    tag = tag[:-1]
                column = tagColumn.get(tag)
                if column is None:
                    continue  # tag outside the vocabulary
                tagVec[column] += 1
        tagVecs.append(tagVec)
    return tagVecs
        
# Vectorise the tag files of both splits against the shared vocabulary and
# wrap each result in a numpy array (training split first, then test split).
tagVecs_train, tagVecs_test = (
    np.array(getTagVecs(tagDir, tagList))
    for tagDir in ('data/tags_train', 'data/tags_test')
)


In [40]:
from sklearn import preprocessing
def _loadBOW(path):
    """Read a comma-separated bag-of-words file and L1-normalise each row.

    Every non-blank line is split on ',' and converted to floats; the rows are
    stacked into one array and passed through
    ``preprocessing.normalize(..., norm='l1')``.
    """
    # Read-only mode suffices, and `with` closes the handle (the original
    # one-liners opened the files 'r+' and never closed them).
    with open(path, 'r') as f:
        rows = [np.array(line.split(',')).astype(float) for line in f if line.strip()]
    return preprocessing.normalize(np.array(rows), norm='l1')

X_train = _loadBOW('data/BOW_vectors_train/BOW_train.txt')
X_test = _loadBOW('data/BOW_vectors_test/BOW_test.txt')

## DNN model (fully connected network, built by `getCNNModel`)

In [61]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD
from sklearn import linear_model, cross_validation, metrics

def trainDNNModel_CV(fold, model, nb_epoch=10, batch_size=32, data=None, labels=None):
    """Fit ``model`` on one cross-validation fold and print its held-out score.

    Parameters
    ----------
    fold : tuple
        ``(train_indices, test_indices)`` used to index the rows of the data.
    model : object
        Model exposing ``fit(X, Y, nb_epoch=..., batch_size=...)`` and
        ``evaluate(X, Y, batch_size=...)``.
    nb_epoch : int, optional
        Number of training epochs forwarded to ``model.fit`` (default 10).
    batch_size : int, optional
        Batch size forwarded to both ``fit`` and ``evaluate`` (default 32).
    data, labels : array-like, optional
        Feature and target arrays. When omitted, the notebook-level globals
        ``X`` and ``Y`` are used, which is the original behaviour.

    Returns
    -------
    The same ``model`` instance, after fitting.
    """
    train_indices, test_indices = fold
    # Fall back to the notebook globals only when no arrays are passed in,
    # so existing two-argument callers keep working unchanged.
    if data is None:
        data = X
    if labels is None:
        labels = Y
    model.fit(data[train_indices], labels[train_indices],
              nb_epoch=nb_epoch, batch_size=batch_size)
    score = model.evaluate(data[test_indices], labels[test_indices],
                           batch_size=batch_size)
    print(score)
    return model

def getCNNModel(input_dim=24471, output_dim=80, hidden_units=(1000, 500, 250), dropout_rate=0.1):
    """Build and compile a feed-forward multi-label classifier.

    Despite the name, the network contains only ``Dense`` and ``Dropout``
    layers (no convolutions): each hidden ``Dense`` layer uses ReLU and is
    followed by ``Dropout``, and the final ``Dense`` layer uses a sigmoid.
    The model is compiled with binary cross-entropy and Nesterov-momentum SGD.

    Parameters
    ----------
    input_dim : int, optional
        Width of the input feature vector (default 24471).
    output_dim : int, optional
        Number of sigmoid output units (default 80).
    hidden_units : sequence of int, optional
        Sizes of the hidden layers, in order (default ``(1000, 500, 250)``).
    dropout_rate : float, optional
        Dropout rate applied after every hidden layer (default 0.1).

    Returns
    -------
    A compiled ``Sequential`` model. With the defaults it has exactly the
    architecture that was previously hard-coded.
    """
    model = Sequential()
    layer_sizes = list(hidden_units) + [output_dim]
    last = len(layer_sizes) - 1
    for position, units in enumerate(layer_sizes):
        layer_kwargs = {'activation': 'sigmoid' if position == last else 'relu'}
        if position == 0:
            # Only the first layer has to declare the input width.
            layer_kwargs['input_dim'] = input_dim
        model.add(Dense(units, **layer_kwargs))
        if position != last:
            model.add(Dropout(dropout_rate))

    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='binary_crossentropy',
                  optimizer=sgd)
    return model

# trainDNNModel_CV reads the notebook-level X / Y, so bind them to the training data.
X = X_train
Y = tagVecs_train
n_fold = 4

# One freshly built model per fold, created up front as before.
foldModels = [getCNNModel() for i in range(n_fold)]
# Use an explicit list: a bare map() is lazy on Python 3, so no training would
# run and `models[i]` would fail. On Python 2 the result is the same list.
models = [
    trainDNNModel_CV(fold, foldModel)
    for fold, foldModel in zip(cross_validation.KFold(len(X), n_fold), foldModels)
]

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

KeyboardInterrupt: 

In [60]:
# `models` holds the trained models themselves (trainDNNModel_CV returns the
# model), so it is indexed once; `models[4][0]` is out of range for the 4 folds
# and would subscript a model object.
# NOTE(review): the last fold's model is used here; confirm which fold was intended.
Y_prob = models[-1].predict(X_test)
# Threshold the sigmoid outputs at 0.5 into hard 0/1 tag predictions.
Y_pred = (Y_prob >= 0.5).astype(Y_prob.dtype)

from sklearn.metrics import pairwise_distances
def rank20(myRegressor, X_test, Y_test, metri = 'euclidean', cheat = False, k = 20):
    """Rank the rows of ``Y_test`` by distance to each prediction; keep the closest ``k``.

    Parameters
    ----------
    myRegressor : object
        Fitted model exposing ``predict(X_test)``.
    X_test : array-like
        Inputs passed to ``myRegressor.predict``.
    Y_test : array-like
        Candidate rows; every prediction is compared against every row.
    metri : str, optional
        Metric name forwarded to ``pairwise_distances``. The original author
        listed 'euclidean', 'cityblock', 'cosine', 'hamming', 'sqeuclidean'
        and 'seuclidean' as options.
    cheat : bool, optional
        Unused; kept only so existing calls remain valid.
    k : int, optional
        Number of nearest candidates to return per prediction (default 20).

    Returns
    -------
    numpy.ndarray
        Integer array with one row per prediction and up to ``k`` columns,
        holding indices into ``Y_test`` ordered from nearest to farthest.

    Notes
    -----
    The original inline note gave the shapes as X_test 2000 x m, Y_test and
    predictions 2000 x 2048, result 2000 x 20. These are the author's figures
    and cannot be confirmed from this notebook.
    """
    Y_pred = myRegressor.predict(X_test)
    distanceM = pairwise_distances(Y_pred, Y_test, metric = metri)
    # A stable sort keeps equally distant candidates in ascending index order,
    # so the ranking is deterministic when distances tie.
    return np.argsort(distanceM, axis = 1, kind = 'mergesort')[:, :k]

# trainDNNModel_CV returns the model itself, so `models[2]` is already the
# fitted model; the former extra `[0]` would subscript the model object.
r20 = rank20(models[2], X_test, tagVecs_test)

[[ 0.02560224  0.01794526  0.03230866 ...,  0.01880714  0.03034083
   0.01529059]
 [ 0.02556997  0.0179229   0.03228077 ...,  0.01878203  0.0303097
   0.01527175]
 [ 0.02557426  0.01792328  0.03227043 ...,  0.01878283  0.03029902
   0.0152684 ]
 ..., 
 [ 0.02558721  0.01793737  0.03229018 ...,  0.01879061  0.03031952
   0.01528253]
 [ 0.02559263  0.01794122  0.0323028  ...,  0.0188003   0.03033593
   0.01528688]
 [ 0.02557421  0.01792504  0.03228446 ...,  0.01878626  0.03031887
   0.01527514]]


In [58]:
# Inspect the L1-normalised test feature matrix (numpy abbreviates large arrays with "...").
X_test

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.00028985, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        ,  0.        ,  0.00096433, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])