### 0. First load libraries, define train/test split

In [21]:
import csv
import os
import pickle
import warnings

import numpy as np
import seaborn
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
warnings.filterwarnings('ignore')

# Project root. Defaults to the original author's local checkout; set the
# CS5785_WORK_DIR environment variable to run the notebook elsewhere.
# os.path.join(..., '') guarantees a trailing separator, which the rest of the
# notebook relies on when it builds paths by string concatenation.
work_dir = os.path.join(
    os.environ.get(
        'CS5785_WORK_DIR',
        '/Users/ronghao/Mirror/Cornell-Tech/2018-Fa-Course/CS-5785/Homework/Final/',
    ),
    '',
)
# Directory holding the feature, description and tag files read by later cells.
data_dir = os.path.join(work_dir, 'all', '')

In [2]:
# Load the pre-computed image-feature dictionary. The context manager closes the
# file handle, which the bare open() call never did.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles produced by this project.
with open(work_dir + "modeldata/X_dict.p", "rb") as f:
    X_dict = pickle.load(f)
# Y_dict is read by overfit_learner() and fullset_learner(); re-enable this load
# before running those cells.
# Y_dict = pickle.load(open(work_dir + "modeldata/Y_dict.p", "rb"))

In [115]:
# Keys available in the pickled dictionaries:
# X_data = 'only_fc', 'only_pool', 'fc_pool'
# Y_data = 'full_w2v_mean', 'nv_w2v_mean', 'n_w2v_mean', 'av_w2v_mean', 
#          'BOW_tagEnhanced_1', 'BOW_tagEnhanced_2', 'BOW_tagEnhanced_5'
#          'bow_1294', 'bow_4291'

# Select one image-feature variant and pull out its train+dev and test entries.
# NOTE(review): the parse_features cell further down assigns x_train_dev / x_test
# as well, so whichever of the two cells ran last decides the features in use.
X_data = 'only_pool'
x_train_dev = X_dict[X_data]['train_dev']
x_test = X_dict[X_data]['test']

# Target selection from Y_dict is disabled; the targets are loaded from the
# 800noun pickles in a later cell instead.
# Y_data = 'full_w2v_mean'
# y_train_dev = Y_dict[Y_data]['train_dev']
# y_test = Y_dict[Y_data]['test']

In [22]:
def parse_features(features_path):
    """Load an image-feature CSV into a 2-D array ordered by image id.

    Each non-empty row holds an image path such as ``images_train/12.jpg``
    followed by that image's feature values.

    Parameters
    ----------
    features_path : str
        Path of the CSV file to read.

    Returns
    -------
    numpy.ndarray
        One row of floats per image, sorted by the integer image id taken from
        the file name (so ``2.jpg`` precedes ``10.jpg``).

    Raises
    ------
    ValueError
        If a file name does not start with an integer id or a feature value is
        not a valid float.
    """
    vec_map = {}
    # newline='' is what the csv module requires so that it can do its own
    # line-ending handling.
    with open(features_path, newline='') as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields [] for blank lines; skip instead of crashing.
                continue
            # Use the last path component so the id is found no matter how many
            # directory levels precede the file name.
            file_name = row[0].rsplit("/", 1)[-1]
            img_id = int(file_name.split(".")[0])
            vec_map[img_id] = np.array([float(x) for x in row[1:]])
    return np.array([vec_map[img_id] for img_id in sorted(vec_map)])

# build x matrices
# Read the ResNet "intermediate" feature CSVs into arrays whose rows are ordered
# by image id (see parse_features).
# NOTE(review): these assignments reuse the names x_train_dev / x_test that the
# X_dict cell also sets; the cell executed last wins.
x_train_dev = parse_features(data_dir + "features_train/features_resnet1000intermediate_train.csv")
x_test = parse_features(data_dir + "features_test/features_resnet1000intermediate_test.csv")

In [4]:
# Load the description-side target matrices for train+dev and test. Context
# managers close the file handles, which the bare open() calls left open.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles produced by this project.
with open(work_dir + "data/800noun/y_train_dev.p", "rb") as f:
    y_train_dev = pickle.load(f)
with open(work_dir + "data/800noun/y_test.p", "rb") as f:
    y_test = pickle.load(f)

In [5]:
# Compress the raw target values into three levels: 0 stays 0, a value in (0, 1]
# becomes 1, and anything above 1 becomes 1.5 (presence plus a 0.5 bonus).
# The same mapping is applied to the train+dev and the test targets.
y_train_dev, y_test = [
    (targets > 0).astype(int) + (targets > 1).astype(int) * 0.5
    for targets in (y_train_dev, y_test)
]

In [116]:
# Sizes of the three splits; train and dev are both carved out of the
# train_dev matrices.
num_train, num_dev, num_test = 8000, 2000, 2000

# Shuffle the train_dev row indices once with a fixed seed so the split is
# reproducible: the first num_train indices form train, the rest form dev.
split_idx = shuffle(list(range(num_train + num_dev)), random_state=0)

x_train, y_train = (
    data[split_idx[:num_train]] for data in (x_train_dev, y_train_dev)
)
x_dev, y_dev = (
    data[split_idx[num_train:]] for data in (x_train_dev, y_train_dev)
)

### 1. Train different models to predict the description from the image features

In [96]:
from sklearn.linear_model import Ridge

# Tune the ridge penalty with a 10-fold cross-validated grid search on the
# training split and keep the refitted best estimator.
parameters = {"alpha": [30, 90, 150, 210]}
ridge_reg = GridSearchCV(Ridge(), parameters, cv=10)
ridge_reg_best = ridge_reg.fit(x_train, y_train).best_estimator_
print(
    "Trained linear regression model!",
    "Summary of best model:",
    ridge_reg_best,
    sep="\n",
)

# ridge_reg = Ridge(alpha=90)
# ridge_reg.fit(x_train, y_train)

Trained linear regression model!
Summary of best model:
Ridge(alpha=210, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)


In [117]:
from sklearn.linear_model import Ridge

# Fit a ridge regression (not plain OLS) on the training split with the penalty
# fixed at alpha=210, the value the recorded grid-search output reports as best.
# This rebinds ridge_reg, which the grid-search cell uses for the GridSearchCV object.

ridge_reg = Ridge(alpha=210)
ridge_reg.fit(x_train, y_train)

Ridge(alpha=210, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [58]:
# Tokenised descriptions for the dev split, in split order. Commas and periods
# are turned into spaces and the text is split on any whitespace, so words on
# adjacent lines are not fused together and no empty-string tokens are produced.
desc_file = []
for desc_file_index in split_idx[num_train:]:
    # find description
    with open(data_dir + 'descriptions_train/' + str(desc_file_index) + '.txt') as f:
        desc_file.append(f.read().lower().replace(',', ' ').replace('.', ' ').split())

# Tag words for the dev split, in the same order. Each line is "category:name";
# only the part after the first ':' is kept, and multi-word names contribute one
# entry per word.
tag_file = []
for tag_file_index in split_idx[num_train:]:
    with open(data_dir + 'tags_train/' + str(tag_file_index) + '.txt') as f:
        tag_txt = f.read()
    tags = []
    # splitlines() keeps the last tag even when the file has no trailing newline.
    for pair in tag_txt.splitlines():
        tags += pair[pair.find(':') + 1:].split()
    tag_file.append(tags)

In [62]:
def tag_discount_dev(descriptions=None, tag_lists=None):
    """Count, for every (description, image) pair, how many image tags occur in the description.

    Parameters
    ----------
    descriptions : list of list of str, optional
        Tokenised descriptions; defaults to the global ``desc_file``.
    tag_lists : list of list of str, optional
        Tag words per image; defaults to the global ``tag_file``.

    Returns
    -------
    list of list of int
        ``table[d][t]`` is the number of entries of ``tag_lists[t]`` that appear
        in ``descriptions[d]``. Repeated tags are counted once per occurrence.
    """
    if descriptions is None:
        descriptions = desc_file
    if tag_lists is None:
        tag_lists = tag_file

    table = []
    for desc in descriptions:
        # Build the set once per description: membership tests inside the inner
        # loop become O(1) instead of scanning the token list every time.
        desc_tokens = set(desc)
        table.append(
            [sum(1 for tag in tags if tag in desc_tokens) for tags in tag_lists]
        )
    return table

In [46]:
# Predict the description-side targets for the dev images with the current ridge_reg.
y_dev_pred = ridge_reg.predict(x_dev)

In [63]:
# Tag-overlap counts for every (dev description, dev image) pair; used as exponents below.
discount_expo_table_dev = tag_discount_dev()

In [70]:
def discount_table_dev(expo_table, exp_rate=0.8):
    """Turn a table of exponents into a table of multiplicative discounts.

    Parameters
    ----------
    expo_table : iterable of iterables of numbers
        Exponents, e.g. tag-overlap counts per (description, image) pair.
    exp_rate : float, optional
        Base of the discount; each entry becomes ``exp_rate ** exponent``.

    Returns
    -------
    list of list
        Same layout as ``expo_table`` with every exponent replaced by its discount.
    """
    return [[exp_rate ** expo for expo in expo_row] for expo_row in expo_table]

In [73]:
# Per-pair multiplier 0.8 ** (tag-overlap count); test_model_on_dev multiplies the
# dev distance matrix by this array.
dist_discount = np.array(discount_table_dev(discount_expo_table_dev, 0.8))

### 2. Test out model on development data, computing its MAP@20, and investigating the quality of the rankings

In [144]:
def test_model_on_train(model):
    y_train_pred = model.predict(x_train)
    train_distances = pairwise_distances(y_train, y_train_pred)
    train_scores = []
    train_pos_list = []

    for i in range(num_train):
        pred_dist_idx = list(np.argsort(train_distances[i]))
        train_pos = pred_dist_idx.index(i)
        train_pos_list.append(train_pos)
        if train_pos < 20:
            train_scores.append(1 / (train_pos + 1))
        else:
            train_scores.append(0.0)

    print("Train MAP@20:", np.mean(train_scores))
    print("Mean position of true image", np.mean(train_pos_list))
    print("Median position of true image", np.median(train_pos_list))
    return train_distances
    
def test_model_on_dev(model):
    y_dev_pred = model.predict(x_dev)
    dev_distances = pairwise_distances(y_dev, y_dev_pred) * dist_discount
    dev_scores = []
    dev_pos_list = []

    for i in range(num_dev):
        pred_dist_idx = list(np.argsort(dev_distances[i]))
        dev_pos = pred_dist_idx.index(i)
        dev_pos_list.append(dev_pos)
        if dev_pos < 20:
            dev_scores.append(1 / (dev_pos + 1))
        else:
            dev_scores.append(0.0)

    print("Development MAP@20:", np.mean(dev_scores))
    print("Mean position of true image", np.mean(dev_pos_list))
    print("Median position of true image", np.median(dev_pos_list))
    return dev_distances

In [97]:
# +0.5, alpha=CV, fc
# Author's run label — presumably: 0.5 bonus in the targets, alpha picked by the
# grid search, fc features. The label omits dist_discount, so the recorded dev
# score was presumably produced before test_model_on_dev applied it — TODO confirm.
# Prints train metrics, a blank line, then dev metrics; keeps both distance matrices.
train_dist = test_model_on_train(ridge_reg_best)
print()
dev_dist = test_model_on_dev(ridge_reg_best)

Train MAP@20: 0.4341711737512144
Mean position of true image 21.3235
Median position of true image 3.0

Development MAP@20: 0.3177371248741798
Mean position of true image 17.744
Median position of true image 5.0


In [100]:
# +0.5, alpha=CV, dist_discount, fc
# Author's run label — presumably the grid-search model on fc features with the
# tag-overlap discount applied to the dev distances.
# Prints train metrics, a blank line, then dev metrics; keeps both distance matrices.
train_dist = test_model_on_train(ridge_reg_best)
print()
dev_dist = test_model_on_dev(ridge_reg_best)

Train MAP@20: 0.4341711737512144
Mean position of true image 21.3235
Median position of true image 3.0

Development MAP@20: 0.37578227602505854
Mean position of true image 13.513
Median position of true image 4.0


In [141]:
# +0.5, alpha=210, dist_discount, pool
# Author's run label — presumably the fixed alpha=210 ridge on pool features with
# the tag-overlap discount applied to the dev distances.
# Prints train metrics, a blank line, then dev metrics; keeps both distance matrices.
train_dist = test_model_on_train(ridge_reg)
print()
dev_dist = test_model_on_dev(ridge_reg)

Train MAP@20: 0.6299895637574284
Mean position of true image 9.29025
Median position of true image 0.0

Development MAP@20: 0.3956479965102837
Mean position of true image 12.853
Median position of true image 4.0


In [143]:
# +0.5, alpha=210, dist_discount, pool
# Same label and same code as the previous cell; the recorded output is identical.
# Prints train metrics, a blank line, then dev metrics; keeps both distance matrices.
train_dist = test_model_on_train(ridge_reg)
print()
dev_dist = test_model_on_dev(ridge_reg)

Train MAP@20: 0.6299895637574284
Mean position of true image 9.29025
Median position of true image 0.0

Development MAP@20: 0.3956479965102837
Mean position of true image 12.853
Median position of true image 4.0


In [10]:
# +0.5, alpha=210
# Author's run label — presumably the fixed alpha=210 ridge evaluated without the
# tag-overlap discount (the label does not mention it) — TODO confirm.
# Prints train metrics, a blank line, then dev metrics; keeps both distance matrices.
train_dist = test_model_on_train(ridge_reg)
print()
dev_dist = test_model_on_dev(ridge_reg)

Train MAP@20: 0.6299895637574284
Mean position of true image 9.29025
Median position of true image 0.0

Development MAP@20: 0.3403670830803184
Mean position of true image 16.292
Median position of true image 5.0


In [85]:
# +0.5, alpha=CV
# Author's run label — evaluates the grid-search best estimator. The recorded
# numbers equal those of the "+0.5, alpha=210" cell.
# Prints train metrics, a blank line, then dev metrics; keeps both distance matrices.
train_dist = test_model_on_train(ridge_reg_best)
print()
dev_dist = test_model_on_dev(ridge_reg_best)

Train MAP@20: 0.6299895637574284
Mean position of true image 9.29025
Median position of true image 0.0

Development MAP@20: 0.3403670830803184
Mean position of true image 16.292
Median position of true image 5.0


In [79]:
# +0.2, alpha=90
# Author's run label — presumably a 0.2 bonus in the targets and Ridge(alpha=90);
# neither setting is visible in the cells as they stand now — TODO confirm.
# Prints train metrics, a blank line, then dev metrics; keeps both distance matrices.
train_dist = test_model_on_train(ridge_reg)
print()
dev_dist = test_model_on_dev(ridge_reg)

Train MAP@20: 0.7850865268004054
Mean position of true image 3.858375
Median position of true image 0.0

Development MAP@20: 0.33340503282192524
Mean position of true image 18.2465
Median position of true image 5.0


In [74]:
# +0.5, alpha=90
# Author's run label — presumably a 0.5 bonus in the targets and Ridge(alpha=90).
# Prints train metrics, a blank line, then dev metrics; keeps both distance matrices.
train_dist = test_model_on_train(ridge_reg)
print()
dev_dist = test_model_on_dev(ridge_reg)

Train MAP@20: 0.7422200892411468
Mean position of true image 4.89675
Median position of true image 0.0

Development MAP@20: 0.34154812759479947
Mean position of true image 17.5565
Median position of true image 5.0


In [69]:
# +0.6, alpha=90
# Author's run label — presumably a 0.6 bonus in the targets and Ridge(alpha=90);
# the bonus value is not visible in the cells as they stand now — TODO confirm.
# Prints train metrics, a blank line, then dev metrics; keeps both distance matrices.
train_dist = test_model_on_train(ridge_reg)
print()
dev_dist = test_model_on_dev(ridge_reg)

Train MAP@20: 0.728070688851697
Mean position of true image 5.247875
Median position of true image 0.0

Development MAP@20: 0.34055813558216813
Mean position of true image 17.567
Median position of true image 4.5


In [35]:
# Unlabelled evaluation run; the configuration behind the recorded output is not
# stated in the notebook.
# Prints train metrics, a blank line, then dev metrics; keeps both distance matrices.
train_dist = test_model_on_train(ridge_reg)
print()
dev_dist = test_model_on_dev(ridge_reg)

Train MAP@20: 0.6828545557705968
Mean position of true image 6.5955
Median position of true image 0.0

Development MAP@20: 0.33738035383733916
Mean position of true image 18.1845
Median position of true image 5.0


In [20]:
# Unlabelled evaluation run; the configuration behind the recorded output is not
# stated in the notebook.
# Prints train metrics, a blank line, then dev metrics; keeps both distance matrices.
train_dist = test_model_on_train(ridge_reg)
print()
dev_dist = test_model_on_dev(ridge_reg)

Train MAP@20: 0.8159132326667784
Mean position of true image 3.18225
Median position of true image 0.0

Development MAP@20: 0.32316708317052745
Mean position of true image 19.8755
Median position of true image 5.0


In [None]:
# X_data = 'only_fc', 'only_pool', 'fc_pool'
# Y_data = 'full_w2v_mean', 'nv_w2v_mean', 'n_w2v_mean', 'av_w2v_mean', 
#          'BOW_tagEnhanced_1', 'BOW_tagEnhanced_2', 'BOW_tagEnhanced_5'
#          'bow_1294', 'bow_4291'

In [90]:
def overfit_learner(X_data, Y_data, alpha=90):
    """Train a ridge model on one feature/target pairing and score it on the dev split.

    Parameters
    ----------
    X_data : str
        Key into the global ``X_dict`` selecting the image-feature variant.
    Y_data : str
        Key into the global ``Y_dict`` selecting the description-target variant.
    alpha : float, optional
        Ridge penalty; defaults to the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(dev_distances, score)``: the pairwise distance matrix between the dev
        targets (rows) and the dev predictions (columns), and the MAP@20 computed
        from it (reciprocal 0-based rank + 1 of the true image, zero beyond the
        top 20).

    Notes
    -----
    Uses the global ``split_idx`` / ``num_train`` so every pairing is evaluated on
    the same train/dev partition.
    """
    x_train_dev = X_dict[X_data]['train_dev']
    y_train_dev = Y_dict[Y_data]['train_dev']

    train_idx = split_idx[:num_train]
    # The dev slice previously referenced the misspelt name `ncombum_train`,
    # which raised NameError on every call.
    dev_idx = split_idx[num_train:]
    x_train, y_train = x_train_dev[train_idx], y_train_dev[train_idx]
    x_dev, y_dev = x_train_dev[dev_idx], y_train_dev[dev_idx]

    ridge_reg = Ridge(alpha=alpha)
    ridge_reg.fit(x_train, y_train)

    dev_distances = pairwise_distances(y_dev, ridge_reg.predict(x_dev))
    dev_scores = []
    # Iterate over the rows actually present rather than a global row count.
    for i, row in enumerate(dev_distances):
        dev_pos = np.argsort(row).tolist().index(i)
        dev_scores.append(1 / (dev_pos + 1) if dev_pos < 20 else 0.0)

    return dev_distances, np.mean(dev_scores)

In [122]:
# Every (image-feature variant, description-target variant) pairing to evaluate,
# pool features first, then fc features.
learner_combination = [
    (x_name, y_name)
    for x_name in ('only_pool', 'only_fc')
    for y_name in ('bow_4291', 'n_w2v_mean', 'av_w2v_mean')
]

# Map each pairing to (dev distance matrix, dev MAP@20).
learners = {}
for comb in learner_combination:
    dist_matrix, score = overfit_learner(*comb)
    learners[comb] = (dist_matrix, score)

In [125]:
# Ensemble weight of each pairing = its dev MAP@20 (second entry of the learners tuple).
learner_weight = {comb: result[1] for comb, result in learners.items()}

In [107]:
# Ensemble on the dev split: add up every learner's distance matrix, each scaled
# by that learner's own dev MAP@20. The accumulator is sized from num_dev so it
# stays consistent with the ranking loop below instead of a hard-coded 2000.
dev_distances = np.zeros((num_dev, num_dev))
for result in learners.values():
    dev_distances += result[0] * result[1]


dev_scores = []
dev_pos_list = []

for i in range(num_dev):
    # 0-based rank of the true image in the ascending-distance ordering of row i.
    pred_dist_idx = list(np.argsort(dev_distances[i]))
    dev_pos = pred_dist_idx.index(i)
    dev_pos_list.append(dev_pos)
    if dev_pos < 20:
        dev_scores.append(1 / (dev_pos + 1))
    else:
        dev_scores.append(0.0)

print("Development MAP@20:", np.mean(dev_scores))
print("Mean position of true image", np.mean(dev_pos_list))
print("Median position of true image", np.median(dev_pos_list))

Development MAP@20: 0.3600585244691669
Mean position of true image 15.89
Median position of true image 4.0


In [None]:
def inspect_model_performance(image_index, model):
    """Placeholder for inspecting a model's ranking for a single image.

    The original cell contained only the ``def`` line, which is a syntax error.
    The stub keeps the intended signature and fails loudly until it is written.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError("inspect_model_performance has not been implemented yet")

### 4. Finally use a model to compute top-20 predictions on the test data that can be submitted to Kaggle

In [126]:
def fullset_learner(X_data, Y_data):
    """Train on the full train+dev data for one pairing and score the test set.

    Parameters
    ----------
    X_data : str
        Key into the global ``X_dict`` selecting the image-feature variant.
    Y_data : str
        Key into the global ``Y_dict`` selecting the description-target variant.

    Returns
    -------
    The pairwise distance matrix between the test targets (rows) and the
    ``Ridge(alpha=90)`` predictions for the test images (columns).
    """
    features = X_dict[X_data]
    x_train_dev = features['train_dev']
    x_test = features['test']
    targets = Y_dict[Y_data]
    y_train_dev = targets['train_dev']
    y_test = targets['test']

    ridge_reg = Ridge(alpha=90)
    ridge_reg.fit(x_train_dev, y_train_dev)

    return pairwise_distances(y_test, ridge_reg.predict(x_test))

In [127]:
# Same pairings as in the dev experiment, pool features first, then fc features.
learner_combination = [
    (x_name, y_name)
    for x_name in ('only_pool', 'only_fc')
    for y_name in ('bow_4291', 'n_w2v_mean', 'av_w2v_mean')
]

# Test-set distance matrix per pairing; pairings with the 'av_w2v_mean' targets
# are left out.
test_dist = {}
for comb in learner_combination:
    if comb[1] == 'av_w2v_mean':
        continue
    dist_matrix = fullset_learner(*comb)
    test_dist[comb] = dist_matrix

In [129]:
# Ensemble on the test set: add up the per-pairing test distance matrices, each
# scaled by the dev MAP@20 of the same pairing. The accumulator is sized from
# num_test instead of a hard-coded 2000; 'av_w2v_mean' pairings are skipped
# because no test matrix was computed for them.
test_distances = np.zeros((num_test, num_test))
for comb in learner_combination:
    if comb[1]!='av_w2v_mean':
        test_distances += test_dist[comb] * learner_weight[comb]

In [130]:
# Display the weighted ensemble distance matrix (NumPy abbreviates the printout).
test_distances

array([[5.53844671, 5.63639348, 5.40675457, ..., 5.48945771, 5.91745964,
        6.32867515],
       [5.61504727, 5.61525129, 5.71905203, ..., 5.63511242, 5.73045421,
        4.98068698],
       [3.92571399, 4.13838268, 3.68707737, ..., 3.79472255, 3.94624768,
        4.81389516],
       ...,
       [3.80531132, 4.27495197, 3.91405698, ..., 3.89063719, 4.38231861,
        4.04416667],
       [4.27510325, 4.78701014, 4.38586372, ..., 4.49792071, 4.90364611,
        5.20709396],
       [4.09080414, 3.67006583, 3.65421024, ..., 3.93422112, 4.33886528,
        4.55403355]])

In [131]:
# For every test description keep the 20 closest images, nearest first.
pred_rows = []
for i in range(num_test):
    test_dist_idx = list(np.argsort(test_distances[i]))
    top_20 = test_dist_idx[:20]
    pred_rows.append(" ".join("%d.jpg" % image_id for image_id in top_20))

# Write the submission file: one "<description>.txt,<20 image names>" line per
# description. The header string is reproduced exactly as the author wrote it.
with open("ojbk_submission.csv", "w") as f:
    f.write("Descritpion_ID,Top_20_Image_IDs\n")
    f.writelines("%d.txt,%s\n" % (i, row) for i, row in enumerate(pred_rows))

print("Output written!")

Output written!


In [16]:
# Sanity check of the test target matrix dimensions.
y_test.shape

(2000, 800)

In [25]:
# create test predictions
def output_submission(model, out_path="test_submission.csv", discount=None):
    """Refit ``model`` on train+dev, rank the test images and write a submission CSV.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``
        Refitted in place on the concatenation of the global train and dev splits.
    out_path : str, optional
        File to write; defaults to the name that was previously hard-coded.
    discount : array-like, optional
        If given, the test distance matrix is multiplied element-wise by it
        before ranking. ``None`` (default) leaves the distances untouched.

    Writes one line per row of the distance matrix between the global ``y_test``
    and the predictions for ``x_test``: ``<row>.txt`` followed by the 20 nearest
    image names, closest first. Prints a confirmation when done.
    """
    x_train_all = np.concatenate([x_train, x_dev])
    y_train_all = np.concatenate([y_train, y_dev])
    model.fit(x_train_all, y_train_all)

    test_distances = pairwise_distances(y_test, model.predict(x_test))
    if discount is not None:
        test_distances = test_distances * discount

    with open(out_path, "w") as f:
        # NOTE(review): header spelling kept byte-for-byte; presumably it has to
        # match the competition's sample submission — confirm before changing.
        f.write("Descritpion_ID,Top_20_Image_IDs\n")
        # Iterate over the rows actually present rather than a global row count.
        for i, row_distances in enumerate(test_distances):
            top_20 = np.argsort(row_distances)[:20]
            images = " ".join("%d.jpg" % image_id for image_id in top_20)
            f.write("%d.txt,%s\n" % (i, images))

    print("Output written!")

In [27]:
# Refit ridge_reg on train+dev and write the plain (undiscounted) submission file.
output_submission(ridge_reg)

Output written!


### Discount

In [78]:
# Tokenised descriptions for the test set, in file-index order. Commas and
# periods are turned into spaces and the text is split on any whitespace, so
# words on adjacent lines are not fused together and no empty-string tokens are
# produced.
desc_file_test = []
for desc_file_index in range(num_test):
    # find description
    with open(data_dir + 'descriptions_test/' + str(desc_file_index) + '.txt') as f:
        desc_file_test.append(f.read().lower().replace(',', ' ').replace('.', ' ').split())

# Tag words for the test images, in the same order. Each line is "category:name";
# only the part after the first ':' is kept, and multi-word names contribute one
# entry per word.
tag_file_test = []
for tag_file_index in range(num_test):
    with open(data_dir + 'tags_test/' + str(tag_file_index) + '.txt') as f:
        tag_txt = f.read()
    tags = []
    # splitlines() keeps the last tag even when the file has no trailing newline.
    for pair in tag_txt.splitlines():
        tags += pair[pair.find(':') + 1:].split()
    tag_file_test.append(tags)

In [81]:
def tag_discount_test(descriptions=None, tag_lists=None):
    """Count, for every (description, image) pair, how many image tags occur in the description.

    Parameters
    ----------
    descriptions : list of list of str, optional
        Tokenised descriptions; defaults to the global ``desc_file_test``.
    tag_lists : list of list of str, optional
        Tag words per image; defaults to the global ``tag_file_test``.

    Returns
    -------
    list of list of int
        ``table[d][t]`` is the number of entries of ``tag_lists[t]`` that appear
        in ``descriptions[d]``. Repeated tags are counted once per occurrence.
    """
    if descriptions is None:
        descriptions = desc_file_test
    if tag_lists is None:
        tag_lists = tag_file_test

    table = []
    for desc in descriptions:
        # Build the set once per description: membership tests inside the inner
        # loop become O(1) instead of scanning the token list every time.
        desc_tokens = set(desc)
        table.append(
            [sum(1 for tag in tags if tag in desc_tokens) for tags in tag_lists]
        )
    return table

In [82]:
# Tag-overlap counts for every (test description, test image) pair; used as exponents below.
discount_expo_table_test = tag_discount_test()

In [84]:
def discount_table_dev(expo_table, exp_rate=0.8):
    """Convert a table of exponents into multiplicative discounts.

    This is a second definition of the same function that appears earlier in the
    notebook; despite the ``_dev`` suffix it is applied to the test table here.

    Parameters
    ----------
    expo_table : iterable of iterables of numbers
        Exponents, e.g. tag-overlap counts per (description, image) pair.
    exp_rate : float, optional
        Base of the discount; each entry becomes ``exp_rate ** exponent``.

    Returns
    -------
    list of list
        Same layout as ``expo_table`` with every exponent replaced by its discount.
    """
    def _discount(expo):
        return exp_rate ** expo

    return [list(map(_discount, expo_row)) for expo_row in expo_table]

In [85]:
# Per-pair multiplier 0.8 ** (tag-overlap count) for the test set.
dist_discount_test = np.array(discount_table_dev(discount_expo_table_test, 0.8))

In [86]:
# create test predictions
def output_submission(model, out_path="discount_submission.csv", discount=None):
    """Refit ``model`` on train+dev, rank the test images with a discount and write a CSV.

    This definition replaces the earlier ``output_submission`` in the notebook;
    it differs by multiplying the distances by a discount table.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``
        Refitted in place on the concatenation of the global train and dev splits.
    out_path : str, optional
        File to write; defaults to the name that was previously hard-coded.
    discount : array-like, optional
        Element-wise multiplier for the test distance matrix. ``None`` (default)
        uses the global ``dist_discount_test``, as before.

    Writes one line per row of the discounted distance matrix between the global
    ``y_test`` and the predictions for ``x_test``: ``<row>.txt`` followed by the
    20 nearest image names, closest first. Prints a confirmation when done.
    """
    if discount is None:
        discount = dist_discount_test

    x_train_all = np.concatenate([x_train, x_dev])
    y_train_all = np.concatenate([y_train, y_dev])
    model.fit(x_train_all, y_train_all)

    test_distances = pairwise_distances(y_test, model.predict(x_test)) * discount

    with open(out_path, "w") as f:
        # NOTE(review): header spelling kept byte-for-byte; presumably it has to
        # match the competition's sample submission — confirm before changing.
        f.write("Descritpion_ID,Top_20_Image_IDs\n")
        # Iterate over the rows actually present rather than a global row count.
        for i, row_distances in enumerate(test_distances):
            top_20 = np.argsort(row_distances)[:20]
            images = " ".join("%d.jpg" % image_id for image_id in top_20)
            f.write("%d.txt,%s\n" % (i, images))

    print("Output written!")

In [None]:
# Refit ridge_reg on train+dev and write the discounted submission file
# (this cell has no execution count, so it was not run in the saved session).
output_submission(ridge_reg)

In [89]:
# Refit ridge_reg on train+dev and keep its discounted test distance matrix.
# The "pool" in the name presumably refers to the feature variant that was loaded
# when this cell ran; the code itself uses whatever x_* arrays are current.
x_train_all, y_train_all = (
    np.concatenate(parts) for parts in ([x_train, x_dev], [y_train, y_dev])
)
y_test_pred = ridge_reg.fit(x_train_all, y_train_all).predict(x_test)
pool_test_distances = pairwise_distances(y_test, y_test_pred) * dist_discount_test

In [102]:
# Refit the grid-search best estimator on train+dev and keep its discounted test
# distance matrix. The "fc" in the name presumably refers to the feature variant
# that was loaded when this cell ran; the code itself uses whatever x_* arrays
# are current.
x_train_all, y_train_all = (
    np.concatenate(parts) for parts in ([x_train, x_dev], [y_train, y_dev])
)
y_test_pred = ridge_reg_best.fit(x_train_all, y_train_all).predict(x_test)
fc_test_distances = pairwise_distances(y_test, y_test_pred) * dist_discount_test

In [111]:
# Weighted sum of the two discounted test distance matrices.
# NOTE(review): the weights 0.6702 / 0.3757 are unexplained constants —
# presumably scores from earlier runs; record where they come from.
ensemble_test_distances = pool_test_distances*0.6702 + fc_test_distances*0.3757

In [112]:
# NOTE(review): pool_test_distances and fc_test_distances were each multiplied by
# dist_discount_test when they were built, so this applies the discount a second
# time — confirm that is intended. The cell is also not idempotent: every re-run
# multiplies the discount in once more.
ensemble_test_distances = ensemble_test_distances * dist_discount_test

In [113]:
# For every test description keep the 20 closest images under the ensemble
# distances, nearest first.
pred_rows = []
for i in range(num_test):
    test_dist_idx = list(np.argsort(ensemble_test_distances[i]))
    top_20 = test_dist_idx[:20]
    pred_rows.append(" ".join("%d.jpg" % image_id for image_id in top_20))

# Write the submission file: one "<description>.txt,<20 image names>" line per
# description. The header string is reproduced exactly as the author wrote it.
with open("ensemble_discount_submission.csv", "w") as f:
    f.write("Descritpion_ID,Top_20_Image_IDs\n")
    f.writelines("%d.txt,%s\n" % (i, row) for i, row in enumerate(pred_rows))

print("Output written!")

Output written!


In [1]:
# Scratch calculation: a 0.8 discount divided by 0.8**0.65 equals 0.8**0.35.
0.8 / 0.8**0.65

0.9248717100187196

In [2]:
# Soften the discount already contained in pool_test_distances: dividing by
# dist_discount_test**0.65 leaves a net factor of dist_discount_test**0.35.
# Requires pool_test_distances from the earlier refit cell; the recorded
# NameError shows this cell was run in a fresh kernel without it.
pool_test_distances_maga = pool_test_distances / dist_discount_test**0.65

# For every test description keep the 20 closest images, nearest first.
pred_rows = []
for i in range(num_test):
    test_dist_idx = list(np.argsort(pool_test_distances_maga[i]))
    top_20 = test_dist_idx[:20]
    pred_rows.append(" ".join("%d.jpg" % image_id for image_id in top_20))

# Write the submission file; the header string is reproduced exactly as the
# author wrote it.
with open("pool_discount_maga_submission.csv", "w") as f:
    f.write("Descritpion_ID,Top_20_Image_IDs\n")
    f.writelines("%d.txt,%s\n" % (i, row) for i, row in enumerate(pred_rows))

print("Output written!")

NameError: name 'pool_test_distances' is not defined