# CS5785 final
## Xianhui Li, Zimeng Zhu

### First we load libraries, define our train/test split, and load the word2vec dictionary using gensim

In [1]:
import os
import csv
import random
import gensim
import numpy as np

num_train = 8000
num_dev = 2000
num_test = 2000
split_idx = list(range(num_train + num_dev))
random.shuffle(split_idx)
word2vec = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)
print("Loaded word vectors successfully!")

Loaded word vectors successfully!



### Next we parse the descriptions to form the X matrices


In [73]:
def parse_descriptions(data_dir, num_doc):
    docs = []
    for i in range(num_doc):
        path = os.path.join(data_dir, "%d.txt" % i)
        with open(path) as f:
            docs.append(f.read())
    return docs

def doc_to_vec(sentence, word2vec):
    # get list of word vectors in sentence
    word_vecs = [word2vec.get_vector(w) for w in sentence.split() if w in word2vec.vocab]
    # return average
    return np.stack(word_vecs).mean(0)

# build x matrices
train_dev_desc = parse_descriptions("descriptions_train", num_doc=(num_train+num_dev))
test_desc = parse_descriptions("descriptions_test", num_doc=num_test)
x_train = np.array([doc_to_vec(train_dev_desc[i], word2vec) for i in split_idx[:num_train]])
x_dev = np.array([doc_to_vec(train_dev_desc[i], word2vec) for i in split_idx[num_train:]])
x_test = np.array([doc_to_vec(d, word2vec) for d in test_desc])

print("Built all x matrices!")
print("x_train shape:", x_train.shape)
print("x_dev shape:", x_dev.shape)
print("x_test shape:", x_test.shape)


Built all x matrices!
x_train shape: (8000, 300)
x_dev shape: (2000, 300)
x_test shape: (2000, 300)



### In addition we parse the ResNet features to form the Y matrices


In [74]:
def parse_features(features_path):
    vec_map = {}
    with open(features_path) as f:
        for row in csv.reader(f):
            img_id = int(row[0].split("/")[1].split(".")[0])
            vec_map[img_id] = np.array([float(x) for x in row[1:]])
    return np.array([v for k, v in sorted(vec_map.items())])

# build y matrices
p = np.random.randn(1000, 400)
y_train_dev = parse_features("features_train/features_resnet1000_train.csv") @ p
y_train = y_train_dev[split_idx[:num_train]]
y_dev = y_train_dev[split_idx[num_train:]]
y_test = parse_features("features_test/features_resnet1000_test.csv") @ p

print("Built all y matrices!")
print("y_train shape:", y_train.shape)
print("y_dev shape:", y_dev.shape)
print("y_test shape:", y_test.shape)

Built all y matrices!
y_train shape: (8000, 400)
y_dev shape: (2000, 400)
y_test shape: (2000, 400)



### Parse the image tags to form the Z matrices


In [75]:
def tag_to_vec(sentence, word2vec):
    # get list of word vectors in sentence
    word_vecs = [word2vec.get_vector(w) for w in sentence.split(':') if w in word2vec.vocab]
    # return average
    if len(word_vecs) == 0:
        return [0 for x in list(range(300))]
    else:
        return np.stack(word_vecs).mean(0)

# build z matrices
train_dev_tags = parse_descriptions("tags_train", num_doc=(num_train+num_dev))
test_tags = parse_descriptions("tags_test", num_doc=num_test)
z_train = np.array([tag_to_vec(train_dev_tags[i], word2vec) for i in split_idx[:num_train]])
z_dev = np.array([tag_to_vec(train_dev_tags[i], word2vec) for i in split_idx[num_train:]])
z_test = np.array([tag_to_vec(d, word2vec) for d in test_tags])

print("Built all z matrices!")
print("z_train shape:", z_train.shape)
print("z_dev shape:", z_dev.shape)
print("z_test shape:", z_test.shape)

Built all z matrices!
z_train shape: (8000, 300)
z_dev shape: (2000, 300)
z_test shape: (2000, 300)


### Combine ResNet features (Y) and tags (Z)

In [28]:
y_train = np.hstack((y_train,z_train))
y_dev = np.hstack((y_dev,z_dev))
y_test = np.hstack((y_test,z_test))

print("Built all new x matrices!")
print("new x_train shape:", y_train.shape)
print("new x_dev shape:", y_dev.shape)
print("new x_test shape:", y_test.shape)

Built all new x matrices!
new x_train shape: (8000, 400)
new x_dev shape: (2000, 400)
new x_test shape: (2000, 400)



### Now we train a linear model to predict the ResNet features from the mean word vectors


In [76]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# train OLS model with regression
parameters = {"alpha": [0.1, 0.5, 1.0, 5.0]}
reg = GridSearchCV(Ridge(), parameters, cv=10)
reg.fit(x_train, y_train)
reg_best = reg.best_estimator_

print("Trained linear regression model!")
print("Summary of best model:")
print(reg_best)

Trained linear regression model!
Summary of best model:
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)



### Next we test out our linear model on our development data, computing its MAP@20, and investigating the quality of the rankings


In [78]:
def dist_matrix(x1, x2):
    return ((np.expand_dims(x1, 1) - np.expand_dims(x2, 0)) ** 2).sum(2) ** 0.5

# test performance on development set
y_dev_pred = reg.predict(x_dev)
dev_distances = dist_matrix(y_dev_pred, y_dev)
dev_scores = []
dev_pos_list = []

for i in range(num_dev):
    pred_dist_idx = list(np.argsort(dev_distances[i]))
    dev_pos = pred_dist_idx.index(i)
    dev_pos_list.append(dev_pos)
    if dev_pos < 20:
        dev_scores.append(1 / (dev_pos + 1))
    else:
        dev_scores.append(0.0)

print("Development MAP@20:", np.mean(dev_scores))
print("Mean index of true image", np.mean(dev_pos_list))
print("Median index of true image", np.median(dev_pos_list))

Development MAP@20: 0.13089665989540214
Mean index of true image 89.951
Median index of true image 26.0


### Train Random Forest

In [80]:
from sklearn.ensemble import RandomForestRegressor

# train RF model
param = { 
    'n_estimators': [50,100],
    'max_depth' : [5,10,20,30]
}
#clf = GridSearchCV(RandomForestRegressor(), param)
clf = RandomForestRegressor()
clf.fit(x_train, y_train)
#clf_best = clf.best_estimator_
#print("Trained linear regression model!")
#print("Summary of best model:")
#print(clf_best)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [81]:
# MAP@20 of random forest
y_dev_pred = clf.predict(x_dev)
dev_distances = dist_matrix(y_dev_pred, y_dev)
dev_scores = []
dev_pos_list = []
print(y_dev_pred.shape)
for i in range(num_dev):
    pred_dist_idx = list(np.argsort(dev_distances[i]))
    dev_pos = pred_dist_idx.index(i)
    dev_pos_list.append(dev_pos)
    if dev_pos < 20:
        dev_scores.append(1 / (dev_pos + 1))
    else:
        dev_scores.append(0.0)

print("Development MAP@20:", np.mean(dev_scores))
print("Mean index of true image", np.mean(dev_pos_list))
print("Median index of true image", np.median(dev_pos_list))

(2000, 400)
Development MAP@20: 0.06815187958326503
Mean index of true image 209.146
Median index of true image 70.0


### predict tags

In [83]:
# train OLS model with ridge regression
parameters = {"alpha": [0.1, 0.5, 1.0, 5.0]}
reg = GridSearchCV(Ridge(), parameters, cv=10)
reg.fit(x_train, z_train)
reg_best = reg.best_estimator_

print("Trained linear regression model!")
print("Summary of best model:")
print(reg_best)

Trained linear regression model!
Summary of best model:
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)


In [84]:
# test performance on development set
z_dev_pred = reg.predict(x_dev)
dev_distances = dist_matrix(z_dev_pred, z_dev)
dev_scores = []
dev_pos_list = []

for i in range(num_dev):
    pred_dist_idx = list(np.argsort(dev_distances[i]))
    dev_pos = pred_dist_idx.index(i)
    dev_pos_list.append(dev_pos)
    if dev_pos < 20:
        dev_scores.append(1 / (dev_pos + 1))
    else:
        dev_scores.append(0.0)

print("Development MAP@20:", np.mean(dev_scores))
print("Mean index of true image", np.mean(dev_pos_list))
print("Median index of true image", np.median(dev_pos_list))

Development MAP@20: 0.013009099637892207
Mean index of true image 293.384
Median index of true image 196.0


In [86]:
# train RF model
clf = RandomForestRegressor()
clf.fit(x_train, z_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [87]:
# MAP@20 of random forest
z_dev_pred = clf.predict(x_dev)
dev_distances = dist_matrix(z_dev_pred, z_dev)
dev_scores = []
dev_pos_list = []
for i in range(num_dev):
    pred_dist_idx = list(np.argsort(dev_distances[i]))
    dev_pos = pred_dist_idx.index(i)
    dev_pos_list.append(dev_pos)
    if dev_pos < 20:
        dev_scores.append(1 / (dev_pos + 1))
    else:
        dev_scores.append(0.0)

print("Development MAP@20:", np.mean(dev_scores))
print("Mean index of true image", np.mean(dev_pos_list))
print("Median index of true image", np.median(dev_pos_list))

Development MAP@20: 0.010140175955365585
Mean index of true image 380.895
Median index of true image 230.0



### Finally we use our model to compute top-20 predictions on the test data that can be submitted to Kaggle


In [31]:
# create test predictions
x_train_all = np.concatenate([x_train, x_dev])
y_train_all = np.concatenate([y_train, y_dev])
reg_best.fit(x_train_all, y_train_all)
y_test_pred = reg_best.predict(x_test)
test_distances = dist_matrix(y_test_pred, y_test)
pred_rows = []

for i in range(num_test):
    test_dist_idx = list(np.argsort(test_distances[i]))
    top_20 = test_dist_idx[:20]
    row = ["%d.jpg" % i for i in test_dist_idx[:20]]
    pred_rows.append(" ".join(row))

with open("test_submission.csv", "w") as f:
    f.write("Descritpion_ID,Top_20_Image_IDs\n")
    for i, row in enumerate(pred_rows):
        f.write("%d.txt,%s\n" % (i, row))

print("Output written!")

Output written!
