In [1]:
import csv
import numpy as np
xtrain_glove = np.loadtxt('query_glove.csv',delimiter=",", dtype=float)

from tqdm import tqdm


# Reading 2048-d train features 
features_train_2048 = np.zeros((10000,2048))
with open('./all/data/features_train/features_resnet1000intermediate_train.csv', 'r') as csvfile:
    csv_reader = csv.reader(csvfile)
    for line in csv_reader:
        image_name = line[0].strip(".jpg")[13:]
        row = []
        for i in range(len(line)):
            if i > 0:
                row.append(float(line[i]))
        features_train_2048[int(image_name)] = row
        
# Reading 1000-d train features 
features_train_1000 = np.zeros((10000,1000))
with open('./all/data/features_train/features_resnet1000_train.csv', 'r') as csvfile:
    csv_reader = csv.reader(csvfile)
    for line in csv_reader:
        image_name = line[0].strip(".jpg")[13:]
        row = []
        for i in range(len(line)):
            if i > 0:
                row.append(float(line[i]))
        features_train_1000[int(image_name)] = row
        
# Find set of supercategories, categories 
supercategory_set = set()
category_set = set()
for i in range (10000):
    file = open("./all/data/tags_train/" + str(i) + ".txt", "r")
    lines = file.readlines() 
    for  line in lines:
        words = line.strip().split(':')
        supercategory_set.add(words[0])
        category_set.add(words[1])
    file.close()

# mapping from (super) category to index
supercategory_dict = {item:val for val, item in enumerate(supercategory_set)}
category_dict = {item:val+1 for val, item in enumerate(category_set)}

# Vectorize train tags
train_tags = []
for i in range (10000):
    file = open("./all/data/tags_train/" + str(i) + ".txt", "r")
    lines = file.readlines() 
    row = np.zeros(len(supercategory_set))
    for line in lines:
        words = line.strip().split(':')
        supercategory_column = supercategory_dict.get(words[0])
        category_index = category_dict.get(words[1])
        row[supercategory_column] = category_index
    train_tags.append(row)
    file.close()

In [2]:
features_test_2048 = np.zeros((2000,2048))
with open('./all/data/features_test/features_resnet1000intermediate_test.csv', 'r') as csvfile:
    csv_reader = csv.reader(csvfile)
    for line in csv_reader:
        image_name = line[0].strip(".jpg")[12:]
        row = []
        for i in range(len(line)):
            if i > 0:
                row.append(float(line[i]))
        features_test_2048[int(image_name)] = row

features_test_1000 = np.zeros((2000,1000))    
with open('./all/data/features_test/features_resnet1000_test.csv', 'r') as csvfile:
    csv_reader = csv.reader(csvfile)
    for line in csv_reader:
        image_name = line[0].strip(".jpg")[12:]
        row = []
        for i in range(len(line)):
            if i > 0:
                row.append(float(line[i]))
        features_test_1000[int(image_name)] = row

# Vectorize test tags
test_tags = []
for i in range (2000):
    file = open("./all/data/tags_test/" + str(i) + ".txt", "r")
    lines = file.readlines() 
    row = np.zeros(len(supercategory_set))
    for line in lines:
        words = line.strip().split(':')
        supercategory_column = supercategory_dict.get(words[0])
        category_index = category_dict.get(words[1])
        row[supercategory_column] = category_index
    test_tags.append(row)
    file.close()
    

xtest_glove = np.loadtxt('query_glove_test.csv',delimiter=",", dtype=float)

In [3]:
from sklearn.neighbors import KNeighborsRegressor

features_train_concat = np.concatenate((np.array(features_train_2048),np.array(features_train_1000)), axis=1)
features_train_concat = np.concatenate((features_train_concat, np.array(train_tags)), axis=1)

In [4]:
clf_knn_reverse = KNeighborsRegressor(n_neighbors=5,n_jobs=-1)
clf_knn_reverse.fit(features_train_concat,xtrain_glove)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
          weights='uniform')

In [5]:
features_test_concat = np.concatenate((np.array(features_test_2048),np.array(features_test_1000)), axis=1)
features_test_concat = np.concatenate((features_test_concat, np.array(test_tags)), axis=1)

In [8]:
from sklearn.preprocessing import StandardScaler

features_train_concat = StandardScaler().fit_transform(features_train_concat)
features_test_concat = StandardScaler().fit_transform(features_test_concat)


clf_knn_reverse = KNeighborsRegressor(n_neighbors=5,n_jobs=-1)
clf_knn_reverse.fit(features_train_concat,xtrain_glove)


knn_predict_reverse= clf_knn_reverse.predict(features_test_concat)
rank_list = []
for i in range (2000):
    current_query = xtest_glove[i]
    dist = []
    for j in range(2000):
        dist.append((np.linalg.norm(knn_predict_reverse[j]-current_query),j))
    dist.sort()
    rank_list.append([v for (k,v) in dist[:20]])

def mergeTwenty(lst):
    result = str(lst[0])+".jpg"
    for i in range(1,len(lst)):
        result += " " + str(lst[i]) + ".jpg"
    return result

with open('submission_knn_std_reverse.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',')
    csv_writer.writerow(["Descritpion_ID", "Top_20_Image_IDs"])
    for i in range(len(rank_list)):
        csv_writer.writerow([str(i) + ".txt", mergeTwenty(rank_list[i])])

In [9]:
## DOES NOT WORK

from sklearn.model_selection import cross_val_score
from sklearn.metrics.scorer import make_scorer

def my_scorer(ground_truth, predictions):
    score = 0.0
    length = len(ground_truth)
    for i in range (length):
        dist = []
        for j in range(length):
            dist.append((np.linalg.norm(predictions[i]-ground_truth[j]),j))
        dist.sort()
        rank_list = [v for (k,v) in dist[:20]]
        if i in rank_list:
            rank = rank_list.index(i)
            score = score + (21.0-rank)/20.0
    return score/length

my_scorer = make_scorer(my_scorer, greater_is_better=True)
scores = cross_val_score(clf_knn_reverse, features_train_concat,xtrain_glove, cv=5,scoring=my_scorer)

KeyboardInterrupt: 

In [11]:
## 15% score

clf_knn = KNeighborsRegressor(n_neighbors=5,n_jobs=-1)
clf_knn.fit(xtrain_glove, features_train_concat)


knn_predict= clf_knn.predict(xtest_glove)



rank_list = []
for i in range (2000):
    guess_image = knn_predict[i]
    dist = []
    for j in range(2000):
        dist.append((np.linalg.norm(guess_image-features_test_concat[j]),j))
    dist.sort()
    rank_list.append([v for (k,v) in dist[:20]])

def mergeTwenty(lst):
    result = str(lst[0])+".jpg"
    for i in range(1,len(lst)):
        result += " " + str(lst[i]) + ".jpg"
    return result

with open('s5st.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',')
    csv_writer.writerow(["Descritpion_ID", "Top_20_Image_IDs"])
    for i in range(len(rank_list)):
        csv_writer.writerow([str(i) + ".txt", mergeTwenty(rank_list[i])])

In [13]:
clf_knn_reverse = KNeighborsRegressor(n_neighbors=5,n_jobs=-1)
clf_knn_reverse.fit(features_train_concat,xtrain_glove)


knn_predict_reverse= clf_knn_reverse.predict(features_test_concat)
rank_list = []
for i in range (2000):
    current_query = xtest_glove[i]
    dist = []
    for j in range(2000):
        dist.append((np.linalg.norm(knn_predict_reverse[j]-current_query, ord=1),j))
    dist.sort()
    rank_list.append([v for (k,v) in dist[:20]])

def mergeTwenty(lst):
    result = str(lst[0])+".jpg"
    for i in range(1,len(lst)):
        result += " " + str(lst[i]) + ".jpg"
    return result

with open('l1.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',')
    csv_writer.writerow(["Descritpion_ID", "Top_20_Image_IDs"])
    for i in range(len(rank_list)):
        csv_writer.writerow([str(i) + ".txt", mergeTwenty(rank_list[i])])

In [14]:
def my_scorer(ground_truth, predictions):
    score = 0.0
    length = len(ground_truth)
    for i in range (length):
        dist = []
        for j in range(length):
            dist.append((np.linalg.norm(predictions[i]-ground_truth[j]),j))
        dist.sort()
        rank_list = [v for (k,v) in dist[:20]]
        if i in rank_list:
            rank = rank_list.index(i)
            score = score + (21.0-rank)/20.0
    return score/length

In [15]:
clf_knn_reverse = KNeighborsRegressor(n_neighbors=5,n_jobs=-1)
clf_knn_reverse.fit(features_train_concat,xtrain_glove)

def my_scorer_reverse(ground_truth, predictions):
    score = 0.0
    length = len(ground_truth)
    for i in range (length):
        dist = []
        current_query = ground_truth[i]
        for j in range(length):
            dist.append((np.linalg.norm(predictions[j]-current_query),j))
        dist.sort()
        rank_list = [v for (k,v) in dist[:20]]
        if i in rank_list:
            rank = rank_list.index(i)
            score = score + (21.0-rank)/20.0
    return score/length

my_scorer_reverse = make_scorer(my_scorer_reverse, greater_is_better=True)
scores = cross_val_score(clf_knn_reverse, features_train_concat,xtrain_glove, cv=5,scoring=my_scorer_reverse)

In [17]:
print(scores.mean())

0.509219999999998


In [20]:
clf_knn = KNeighborsRegressor(n_neighbors=5,n_jobs=-1)
clf_knn.fit(xtrain_glove, features_train_concat)

def my_scorer(ground_truth, predictions):
    score = 0.0
    length = len(ground_truth)
    for i in range (length):
        dist = []
        for j in range(length):
            dist.append((np.linalg.norm(predictions[i]-ground_truth[j]),j))
        dist.sort()
        rank_list = [v for (k,v) in dist[:20]]
        if i in rank_list:
            rank = rank_list.index(i)
            score = score + (21.0-rank)/20.0
    return score/length

my_scorer = make_scorer(my_scorer, greater_is_better=True)
scores = cross_val_score(clf_knn, xtrain_glove, features_train_concat, cv=5,scoring=my_scorer)
print(scores.mean())

0.3492799999999996


In [21]:
clf_knn_reverse = KNeighborsRegressor(n_neighbors=5,n_jobs=-1)
clf_knn_reverse.fit(features_train_concat,xtrain_glove)

def my_scorer_reverse(ground_truth, predictions):
    score = 0.0
    length = len(ground_truth)
    for i in range (length):
        dist = []
        current_query = ground_truth[i]
        for j in range(length):
            dist.append((np.linalg.norm(predictions[j]-current_query, ord=1),j))
        dist.sort()
        rank_list = [v for (k,v) in dist[:20]]
        if i in rank_list:
            rank = rank_list.index(i)
            score = score + (21.0-rank)/20.0
    return score/length

my_scorer_reverse = make_scorer(my_scorer_reverse, greater_is_better=True)
scores = cross_val_score(clf_knn_reverse, features_train_concat,xtrain_glove, cv=5,scoring=my_scorer_reverse)
print(scores.mean())

0.508479999999998


In [22]:
from sklearn.linear_model import LinearRegression

clf_lr = LinearRegression(n_jobs=-1)
clf_lr.fit(features_train_concat,xtrain_glove)

def my_scorer_reverse(ground_truth, predictions):
    score = 0.0
    length = len(ground_truth)
    for i in range (length):
        dist = []
        current_query = ground_truth[i]
        for j in range(length):
            dist.append((np.linalg.norm(predictions[j]-current_query),j))
        dist.sort()
        rank_list = [v for (k,v) in dist[:20]]
        if i in rank_list:
            rank = rank_list.index(i)
            score = score + (21.0-rank)/20.0
    return score/length

my_scorer_reverse = make_scorer(my_scorer_reverse, greater_is_better=True)
scores = cross_val_score(clf_lr, features_train_concat,xtrain_glove, cv=5,scoring=my_scorer_reverse)
print(scores.mean())

0.564239999999998


In [23]:
clf_lr = LinearRegression(n_jobs=-1)
clf_lr.fit(xtrain_glove, features_train_concat)


def my_scorer(ground_truth, predictions):
    score = 0.0
    length = len(ground_truth)
    for i in range (length):
        dist = []
        for j in range(length):
            dist.append((np.linalg.norm(predictions[i]-ground_truth[j]),j))
        dist.sort()
        rank_list = [v for (k,v) in dist[:20]]
        if i in rank_list:
            rank = rank_list.index(i)
            score = score + (21.0-rank)/20.0
    return score/length


my_scorer = make_scorer(my_scorer, greater_is_better=True)
scores = cross_val_score(clf_lr, xtrain_glove, features_train_concat, cv=5,scoring=my_scorer)
print(scores.mean())

0.3548749999999997


In [25]:
from sklearn.linear_model import LogisticRegression

clf_lr = LogisticRegression(n_jobs=-1)
clf_lr.fit(features_train_concat,xtrain_glove)

def my_scorer_reverse(ground_truth, predictions):
    score = 0.0
    length = len(ground_truth)
    for i in range (length):
        dist = []
        current_query = ground_truth[i]
        for j in range(length):
            dist.append((np.linalg.norm(predictions[j]-current_query),j))
        dist.sort()
        rank_list = [v for (k,v) in dist[:20]]
        if i in rank_list:
            rank = rank_list.index(i)
            score = score + (21.0-rank)/20.0
    return score/length

my_scorer_reverse = make_scorer(my_scorer_reverse, greater_is_better=True)
scores = cross_val_score(clf_lr, features_train_concat,xtrain_glove, cv=5,scoring=my_scorer_reverse)
print(scores.mean())

ValueError: bad input shape (10000, 300)

In [28]:
clf_lr = LinearRegression(n_jobs=-1)
clf_lr.fit(features_train_concat,xtrain_glove)


# lr_predict_reverse = clf_lr.predict(features_test_concat)

# rank_list = []
# for i in range (2000):
#     current_query = xtest_glove[i]
#     dist = []
#     for j in range(2000):
#         dist.append((np.linalg.norm(lr_predict_reverse[j]-current_query),j))
#     dist.sort()
#     rank_list.append([v for (k,v) in dist[:20]])

# def mergeTwenty(lst):
#     result = str(lst[0])+".jpg"
#     for i in range(1,len(lst)):
#         result += " " + str(lst[i]) + ".jpg"
#     return result

# with open('linearR_std_reverse.csv', 'w', newline='') as csvfile:
#     csv_writer = csv.writer(csvfile, delimiter=',')
#     csv_writer.writerow(["Descritpion_ID", "Top_20_Image_IDs"])
#     for i in range(len(rank_list)):
#         csv_writer.writerow([str(i) + ".txt", mergeTwenty(rank_list[i])])

def my_scorer_reverse(ground_truth, predictions):
    score = 0.0
    length = len(ground_truth)
    for i in range (length):
        dist = []
        current_query = ground_truth[i]
        for j in range(length):
            dist.append((np.linalg.norm(predictions[j]-current_query),j))
        dist.sort()
        rank_list = [v for (k,v) in dist[:20]]
        if i in rank_list:
            rank = rank_list.index(i)
            score = score + (21.0-rank)/20.0
    return score/length

my_scorer_reverse = make_scorer(my_scorer_reverse, greater_is_better=True)
scores = cross_val_score(clf_lr, features_train_concat,xtrain_glove, cv=5,scoring=my_scorer_reverse)
print(scores.mean())

0.564239999999998


In [29]:
features_test_concat = np.concatenate((np.array(features_test_2048),np.array(features_test_1000)), axis=1)
features_test_concat = np.concatenate((features_test_concat, np.array(test_tags)), axis=1)
features_test_concat = StandardScaler().fit_transform(features_test_concat)

In [30]:
print (features_test_concat)

[[-0.23401783  1.39268043 -0.82493173 ... -0.24929769  1.57460707
   1.21773325]
 [-0.44294778  1.45647002  1.12924375 ... -0.24929769 -0.51405711
  -0.34727543]
 [ 1.65110503 -0.86512397 -0.74672062 ... -0.24929769 -0.51405711
  -0.34727543]
 ...
 [-0.53760116 -0.37934819 -0.11341225 ... -0.24929769 -0.51405711
  -0.34727543]
 [ 0.36246406 -0.78344112 -0.81850142 ... -0.24929769 -0.51405711
  -0.34727543]
 [-0.86554779  1.21273214 -0.10830593 ... -0.24929769  1.08315667
  -0.34727543]]


In [34]:
lr_predict_reverse = clf_lr.predict(features_test_concat)



rank_list = []
for i in range (2000):
    current_query = xtest_glove[i]
    dist = []
    for j in range(2000):
        dist.append((np.linalg.norm(lr_predict_reverse[j]-current_query),j))
    dist.sort()
    rank_list.append([v for (k,v) in dist[:20]])

def mergeTwenty(lst):
    result = str(lst[0])+".jpg"
    for i in range(1,len(lst)):
        result += " " + str(lst[i]) + ".jpg"
    return result

with open('linearR_std_reverse.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',')
    csv_writer.writerow(["Descritpion_ID", "Top_20_Image_IDs"])
    for i in range(len(rank_list)):
        csv_writer.writerow([str(i) + ".txt", mergeTwenty(rank_list[i])])

In [33]:
print(xtest_glove[0][0])
print(xtest_glove[1][0])

2.1822255803272128e-05
-0.03950096294283867


In [38]:
rank_list = []
for i in range (2):
    current_query = xtest_glove[i]
    dist = []
    for j in range(2000):
        dist.append((np.linalg.norm(lr_predict_reverse[j]-current_query),j))
    rank_list.append(dist)

In [40]:
print (rank_list[0])

[(6063.846914288767, 0), (6137.761288000624, 1), (6410.282694850227, 2), (5249.765963469722, 3), (7769.853914525853, 4), (7213.273625559771, 5), (7409.5585550184305, 6), (8466.368211070378, 7), (7221.0072992937, 8), (7785.566193431518, 9), (6929.403529837654, 10), (6735.397511361993, 11), (8004.941031876703, 12), (5046.606510801163, 13), (9984.216070745792, 14), (6473.533981870391, 15), (8690.74687158556, 16), (6479.289264777016, 17), (10962.463843325688, 18), (5196.034910388163, 19), (7013.449027171948, 20), (6742.035147794993, 21), (7748.688857814173, 22), (7285.4891099515025, 23), (7176.938421090439, 24), (6586.127125518302, 25), (6797.262408010801, 26), (6535.006572911867, 27), (8135.7146958415115, 28), (7725.153644850838, 29), (11034.496687007912, 30), (7765.226591257152, 31), (8133.978559770079, 32), (9254.924899751028, 33), (7059.511496897138, 34), (7875.204607736954, 35), (6638.093452207155, 36), (7151.96104663333, 37), (7148.386082813891, 38), (6402.816717908674, 39), (5983.43

In [41]:
print (rank_list[1])

[(6063.884720247062, 0), (6137.826969357267, 1), (6410.105742488011, 2), (5249.663463716231, 3), (7769.8635695944395, 4), (7213.280879483748, 5), (7409.536344284558, 6), (8466.306463912131, 7), (7221.093526523913, 8), (7785.479989438215, 9), (6929.458532729958, 10), (6735.437300759289, 11), (8004.913438627248, 12), (5046.517246887604, 13), (9984.368539636182, 14), (6473.549760435899, 15), (8690.72021033899, 16), (6479.303373903211, 17), (10962.449954279378, 18), (5195.951001506703, 19), (7013.230983309842, 20), (6741.9679331346715, 21), (7748.672257314023, 22), (7285.413360497712, 23), (7176.962473665231, 24), (6585.968663588039, 25), (6797.281475949215, 26), (6534.957580124284, 27), (8135.503318853466, 28), (7724.998351479805, 29), (11034.49166452891, 30), (7765.272027689968, 31), (8133.923329050224, 32), (9254.821069711901, 33), (7059.444242532565, 34), (7875.195634263937, 35), (6638.053676790435, 36), (7151.931335103064, 37), (7148.458389048461, 38), (6402.896839316679, 39), (5983.4

In [42]:
print(features_test_concat[0])
print(features_test_concat[1])

[-0.23401783  1.39268043 -0.82493173 ... -0.24929769  1.57460707
  1.21773325]
[-0.44294778  1.45647002  1.12924375 ... -0.24929769 -0.51405711
 -0.34727543]
