In [1]:
import csv
import numpy as np
from tqdm import tqdm

xtrain_glove = np.loadtxt('query_glove.csv',delimiter=",", dtype=float)

# Reading 2048-d train features 
features_train_2048 = np.zeros((10000,2048))
with open('./all/data/features_train/features_resnet1000intermediate_train.csv', 'r') as csvfile:
    csv_reader = csv.reader(csvfile)
    for line in csv_reader:
        image_name = line[0].strip(".jpg")[13:]
        row = []
        for i in range(len(line)):
            if i > 0:
                row.append(float(line[i]))
        features_train_2048[int(image_name)] = row
        
# Reading 1000-d train features 
features_train_1000 = np.zeros((10000,1000))
with open('./all/data/features_train/features_resnet1000_train.csv', 'r') as csvfile:
    csv_reader = csv.reader(csvfile)
    for line in csv_reader:
        image_name = line[0].strip(".jpg")[13:]
        row = []
        for i in range(len(line)):
            if i > 0:
                row.append(float(line[i]))
        features_train_1000[int(image_name)] = row
        
# Find set of supercategories, categories 
supercategory_set = set()
category_set = set()
for i in range (10000):
    file = open("./all/data/tags_train/" + str(i) + ".txt", "r")
    lines = file.readlines() 
    for  line in lines:
        words = line.strip().split(':')
        supercategory_set.add(words[0])
        category_set.add(words[1])
    file.close()

# mapping from (super) category to index
supercategory_dict = {item:val for val, item in enumerate(supercategory_set)}
category_dict = {item:val+1 for val, item in enumerate(category_set)}

# Vectorize train tags
train_tags = []
for i in range (10000):
    file = open("./all/data/tags_train/" + str(i) + ".txt", "r")
    lines = file.readlines() 
    row = np.zeros(len(supercategory_set))
    for line in lines:
        words = line.strip().split(':')
        supercategory_column = supercategory_dict.get(words[0])
        category_index = category_dict.get(words[1])
        row[supercategory_column] = category_index
    train_tags.append(row)
    file.close()

In [2]:
features_test_2048 = np.zeros((2000,2048))
with open('./all/data/features_test/features_resnet1000intermediate_test.csv', 'r') as csvfile:
    csv_reader = csv.reader(csvfile)
    for line in csv_reader:
        image_name = line[0].strip(".jpg")[12:]
        row = []
        for i in range(len(line)):
            if i > 0:
                row.append(float(line[i]))
        features_test_2048[int(image_name)] = row

features_test_1000 = np.zeros((2000,1000))    
with open('./all/data/features_test/features_resnet1000_test.csv', 'r') as csvfile:
    csv_reader = csv.reader(csvfile)
    for line in csv_reader:
        image_name = line[0].strip(".jpg")[12:]
        row = []
        for i in range(len(line)):
            if i > 0:
                row.append(float(line[i]))
        features_test_1000[int(image_name)] = row

# Vectorize test tags
test_tags = []
for i in range (2000):
    file = open("./all/data/tags_test/" + str(i) + ".txt", "r")
    lines = file.readlines() 
    row = np.zeros(len(supercategory_set))
    for line in lines:
        words = line.strip().split(':')
        supercategory_column = supercategory_dict.get(words[0])
        category_index = category_dict.get(words[1])
        row[supercategory_column] = category_index
    test_tags.append(row)
    file.close()
    

xtest_glove = np.loadtxt('query_glove_test.csv',delimiter=",", dtype=float)

In [3]:
features_train_concat = np.concatenate((np.array(features_train_2048),np.array(features_train_1000)), axis=1)
features_train_concat = np.concatenate((features_train_concat, np.array(train_tags)), axis=1)

features_test_concat = np.concatenate((np.array(features_test_2048),np.array(features_test_1000)), axis=1)
features_test_concat = np.concatenate((features_test_concat, np.array(test_tags)), axis=1)

from sklearn.preprocessing import StandardScaler

features_train_concat = StandardScaler().fit_transform(features_train_concat)
features_test_concat = StandardScaler().fit_transform(features_test_concat)

In [8]:
from sklearn.linear_model import LinearRegression

clf_lr = LinearRegression(n_jobs=-1, fit_intercept = False)
clf_lr.fit(features_train_concat,xtrain_glove)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=-1, normalize=False)

In [10]:
lr_predict= clf_lr.predict(features_test_concat)

rank_list = []
for i in range (2000):
    current_query = xtest_glove[i]
    dist = []
    for j in range(2000):
        dist.append((np.linalg.norm(lr_predict[j]-current_query),j))
    dist.sort()
    rank_list.append([v for (k,v) in dist[:20]])

def mergeTwenty(lst):
    result = str(lst[0])+".jpg"
    for i in range(1,len(lst)):
        result += " " + str(lst[i]) + ".jpg"
    return result

with open('linearR_std_reverse.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',')
    csv_writer.writerow(["Descritpion_ID", "Top_20_Image_IDs"])
    for i in range(len(rank_list)):
        csv_writer.writerow([str(i) + ".txt", mergeTwenty(rank_list[i])])

In [7]:
# this works.....
from sklearn.neighbors import KNeighborsRegressor
clf_knn_reverse = KNeighborsRegressor(n_neighbors=5,n_jobs=-1)
clf_knn_reverse.fit(features_train_concat,xtrain_glove)
knn_predict_reverse= clf_knn_reverse.predict(features_test_concat)

rank_list = []
for i in range (2000):
    current_query = xtest_glove[i]
    dist = []
    for j in range(2000):
        dist.append((np.linalg.norm(knn_predict_reverse[j]-current_query),j))
    dist.sort()
    rank_list.append([v for (k,v) in dist[:20]])

def mergeTwenty(lst):
    result = str(lst[0])+".jpg"
    for i in range(1,len(lst)):
        result += " " + str(lst[i]) + ".jpg"
    return result

with open('knn_std_reverse.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',')
    csv_writer.writerow(["Descritpion_ID", "Top_20_Image_IDs"])
    for i in range(len(rank_list)):
        csv_writer.writerow([str(i) + ".txt", mergeTwenty(rank_list[i])])

In [12]:
from sklearn.kernel_ridge import KernelRidge
cls = KernelRidge(kernel='rbf',alpha=1.0)


cls.fit(features_train_concat,xtrain_glove)
lr_predict= cls.predict(features_test_concat)

rank_list = []
for i in range (2000):
    current_query = xtest_glove[i]
    dist = []
    for j in range(2000):
        dist.append((np.linalg.norm(lr_predict[j]-current_query),j))
    dist.sort()
    rank_list.append([v for (k,v) in dist[:20]])

def mergeTwenty(lst):
    result = str(lst[0])+".jpg"
    for i in range(1,len(lst)):
        result += " " + str(lst[i]) + ".jpg"
    return result

with open('linearR_std_reverse.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',')
    csv_writer.writerow(["Descritpion_ID", "Top_20_Image_IDs"])
    for i in range(len(rank_list)):
        csv_writer.writerow([str(i) + ".txt", mergeTwenty(rank_list[i])])

In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics.scorer import make_scorer

cls = KernelRidge(kernel='rbf',alpha=1.0)
cls.fit(features_train_concat,xtrain_glove)

def my_scorer_reverse(ground_truth, predictions):
    score = 0.0
    length = len(ground_truth)
    for i in range (length):
        dist = []
        current_query = ground_truth[i]
        for j in range(length):
            dist.append((np.linalg.norm(predictions[j]-current_query),j))
        dist.sort()
        rank_list = [v for (k,v) in dist[:20]]
        if i in rank_list:
            rank = rank_list.index(i)
            score = score + (21.0-rank)/20.0
    return score/length

my_scorer_reverse = make_scorer(my_scorer_reverse, greater_is_better=True)
scores = cross_val_score(cls, features_train_concat,xtrain_glove, cv=5,scoring=my_scorer_reverse)

In [15]:
print (scores.mean())

0.6697999999999953


In [16]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics.scorer import make_scorer

cls = KernelRidge(kernel='rbf',alpha=1.0)

def my_scorer_reverse(ground_truth, predictions):
    score = 0.0
    length = len(ground_truth)
    for i in range (length):
        dist = []
        current_query = ground_truth[i]
        for j in range(length):
            dist.append((np.linalg.norm(predictions[j]-current_query),j))
        dist.sort()
        rank_list = [v for (k,v) in dist[:20]]
        if i in rank_list:
            rank = rank_list.index(i)
            score = score + (21.0-rank)/20.0
    return score/length

my_scorer_reverse = make_scorer(my_scorer_reverse, greater_is_better=True)
scores = cross_val_score(cls, features_train_concat,xtrain_glove, cv=5,scoring=my_scorer_reverse)

In [17]:
print (scores)

[0.665725 0.67845  0.68255  0.657625 0.66465 ]


In [18]:
print (scores.mean())

0.6697999999999953
