In [1]:
import numpy as np
import pandas as pd

# Data Import and Preprocessing

In [2]:
# Read the four ResNet feature tables. header=None makes pandas number the
# columns positionally instead of consuming the first row as column names.
train_f = pd.read_csv("data/features_train/features_resnet1000_train.csv", header=None)
train_f_med = pd.read_csv("data/features_train/features_resnet1000intermediate_train.csv", header=None)
test_f = pd.read_csv("data/features_test/features_resnet1000_test.csv", header=None)
test_f_med = pd.read_csv("data/features_test/features_resnet1000intermediate_test.csv", header=None)

In [16]:
# Dimensions used to size the arrays and loops in the cells below.
train_size, test_size = 10000, 2000  # number of training / test samples
v_len = 1000  # number of feature values stored per image

## Getting Y_train and Y_test ready

In [20]:
def _features_by_image_id(raw_rows, n_rows, n_cols):
    """Arrange raw feature rows into a dense matrix indexed by image number.

    Parameters
    ----------
    raw_rows : 2-D array whose first column is a path string of the form
        ``<dir>/<number>.<ext>`` and whose remaining columns are the feature
        values for that image.
    n_rows, n_cols : shape of the matrix to allocate.

    Returns
    -------
    numpy.ndarray of shape (n_rows, n_cols) in which row ``number`` holds the
    features of image ``number``; rows never mentioned in ``raw_rows`` stay zero.
    """
    ordered = np.zeros((n_rows, n_cols))
    for record in raw_rows:
        # The image number is the file stem of the path in column 0.
        image_id = int(record[0].split('/')[1].split('.')[0])
        ordered[image_id] = record[1:]
    return ordered


# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray and is available in both old and new pandas releases.
Y_train_tmp = train_f.values
Y_test_tmp = test_f.values
Y_train = _features_by_image_id(Y_train_tmp, train_size, v_len)
Y_test = _features_by_image_id(Y_test_tmp, test_size, v_len)
print("Y_train has shape: " + str(Y_train.shape))
print("Y_test has shape: " + str(Y_test.shape))

Y_train has shape: (10000, 1000)
Y_test has shape: (2000, 1000)


## Getting Desp_train and Desp_test ready
Note: not every .txt file has 5 lines; some contain empty lines.

In [21]:
def _read_descriptions(folder, count):
    """Read ``<folder>0.txt`` .. ``<folder>{count-1}.txt`` into a list of line lists.

    Each file contributes one list holding its lines exactly as read,
    trailing newline characters included.
    """
    collected = []
    for idx in range(count):
        with open(folder + str(idx) + ".txt", "r") as handle:
            collected.append(handle.readlines())
    return collected


Desp_train = _read_descriptions("data/descriptions_train/", train_size)
Desp_test = _read_descriptions("data/descriptions_test/", test_size)

In [23]:
# Sanity check: how many training descriptions were loaded, plus one sample entry.
for shown in (len(Desp_train), Desp_train[5]):
    print(shown)

10000
['A small bedroom with a desk and computer in it.\n', 'A bedroom that has a desk, chair, and bed in it.\n', 'A chair sitting in front of a brown desk.\n', 'A desk beside a window in a bedroom\n', 'a bed and a desk and chair by a big window\n']


## Convert Desp_train, Desp_test to X_train, X_test

In [31]:
def desp2vec(Desp):
    """Turn a collection of descriptions into an N x M feature matrix.

    Desp is an array of arrays of strings; N is len(Desp) and M is a width
    chosen by design (currently 2000).

    NOTE: this is still a placeholder -- the text is not inspected and the
    returned matrix is all zeros.
    """
    n_samples = len(Desp)
    n_features = 2000
    return np.zeros((n_samples, n_features))

In [32]:
# Vectorize the training and test descriptions with the same encoder.
X_train, X_test = (desp2vec(desp) for desp in (Desp_train, Desp_test))

# Training with Regression Models
Multioutput regression
See http://scikit-learn.org/stable/modules/multiclass.html for more regression models

In [44]:
from sklearn.tree import DecisionTreeRegressor

# Baseline multi-output regressor: a depth-limited tree fitted on the whole
# training set (fit() returns the estimator itself, so the call can be chained).
tree_1 = DecisionTreeRegressor(max_depth=5).fit(X_train, Y_train)
# Confirm the prediction has one output row per test sample.
print(tree_1.predict(X_test).shape)

(2000, 1000)


## KFold Cross-validation

In [43]:
from sklearn.model_selection import KFold

# sklearn.cross_validation was removed in scikit-learn 0.20. Its replacement,
# model_selection.KFold, takes n_splits and produces the (train, test) index
# arrays through .split(); with the default shuffle=False the folds are the
# same contiguous blocks as before.
K = 5
for trainKF, testKF in KFold(n_splits=K).split(X_train):
    # Slice out this fold's fitting and held-out rows.
    Xr_KF = X_train[trainKF, :]
    Yr_KF = Y_train[trainKF, :]
    Xt_KF = X_train[testKF, :]
    Yt_KF = Y_train[testKF, :]
    # A fresh tree per fold; after the loop `tree` stays bound to the model
    # from the last fold.
    tree = DecisionTreeRegressor(max_depth=5)
    tree.fit(Xr_KF, Yr_KF)
    print('RT Score: %f' % tree.score(Xt_KF, Yt_KF))

RT Score: -0.000666
RT Score: -0.000737
RT Score: -0.000695
RT Score: -0.000472
RT Score: -0.000511


# Predict the result

In [45]:
from sklearn.metrics.pairwise import pairwise_distances
def rank20(myRegressor, X_test, Y_test, metri = 'euclidean', top_k = 20):
    """Rank the test images for every test description by predicted-feature distance.

    Parameters
    ----------
    myRegressor : fitted estimator exposing ``predict``.
    X_test : description features, one row per description (n x m).
    Y_test : image feature vectors, one row per image (n_images x d).
    metri : metric name forwarded to ``pairwise_distances``.
    top_k : how many nearest images to keep per description (default 20,
        the value that used to be hard-coded).

    Returns
    -------
    Integer array of shape (n, top_k): row i lists the row indices of
    ``Y_test`` closest to the prediction for description i, nearest first.
    """
    Y_predict = myRegressor.predict(X_test)
    distanceM = pairwise_distances(Y_predict, Y_test, metric = metri)
    # argsort orders every row by ascending distance; keep the leading top_k.
    return np.argsort(distanceM, axis = 1)[:, :top_k]

In [55]:
def render(Y_rank, outName = 'result.csv', top_k = 20):
    """Write ranked image ids to a submission CSV.

    Parameters
    ----------
    Y_rank : 2-D array; row i holds the ranked image indices for description i.
    outName : path of the CSV file to write.
    top_k : maximum number of ids written per row (default 20).

    One CSV row is written per row of ``Y_rank``, so the function no longer
    depends on the global ``test_size`` and accepts inputs of any height.
    Rows with fewer than ``top_k`` entries are written as they are.
    """
    head = []
    content = []
    for i, row in enumerate(Y_rank):
        head.append(str(i) + ".txt")
        content.append(" ".join(str(img) + ".jpg" for img in row[:top_k]))
    # NOTE(review): "Descritpion_ID" is kept byte-for-byte; presumably it has to
    # match the header expected by whatever consumes this file -- confirm
    # before correcting the spelling.
    submission = pd.DataFrame({
            "Descritpion_ID": head,
            "Top_20_Image_IDs": content
        })
    submission.to_csv(outName, index=False)

In [56]:
# Rank with tree_1, the regressor fitted on the full training set. The name
# `tree` is only a leftover of the cross-validation loop (the model from its
# last fold, fitted on a subset of the data), so using it here made the result
# depend on hidden loop state.
Y_rank = rank20(tree_1, X_test, Y_test)
render(Y_rank, 'frame.csv')