In [1]:
import numpy as np
import csv
from sklearn.ensemble import RandomForestRegressor

# Find set of supercategories, categories 
supercategory_set = set()
category_set = set()
for i in range (10000):
	file = open("tags_train/" + str(i) + ".txt", "r")
	lines = file.readlines() 
	for  line in lines:
		words = line.strip().split(':')
		supercategory_set.add(words[0])
		category_set.add(words[1])
	file.close()

# mapping from (super) category to index
supercategory_dict = {item:val for val, item in enumerate(supercategory_set)}
category_dict = {item:val+1 for val, item in enumerate(category_set)}

# Vectorize train tags
train_tags = []
for i in range (10000):
	file = open("tags_train/" + str(i) + ".txt", "r")
	lines = file.readlines() 
	row = np.zeros(len(supercategory_set))
	for line in lines:
		words = line.strip().split(':')		
		supercategory_column = supercategory_dict.get(words[0])
		category_index = category_dict.get(words[1])
		row[supercategory_column] = category_index
	train_tags.append(row)
	file.close()
    
print ("Finished loading tags")

  from numpy.core.umath_tests import inner1d


Finished loading tags


In [2]:
# Load train queries
query = []
with open('query_glove.csv', 'r') as csvfile:
	csv_reader = csv.reader(csvfile)
	for line in csv_reader:
		query.append(line)
print ("Finished loading queries")

Finished loading queries


In [5]:
# Train tags with RandomForest
clf_tags = RandomForestRegressor(max_depth=2, random_state=0,n_estimators=100)
clf_tags.fit(query,train_tags)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [4]:
# Reading 1000-d train features 
features_train = np.zeros((10000,1000))
with open('features_train/features_resnet1000_train.csv', 'r') as csvfile:
	csv_reader = csv.reader(csvfile)
	for line in csv_reader:
		image_name = line[0].strip(".jpg")[13:]
		row = []
		for i in range(len(line)):
			if i > 0:
				row.append(float(line[i]))		
		features_train[int(image_name)] = row

In [6]:
# Reading 2048-d train features 
features_train_2048 = np.zeros((10000,2048))
with open('features_train/features_resnet1000intermediate_train.csv', 'r') as csvfile:
	csv_reader = csv.reader(csvfile)
	for line in csv_reader:
		image_name = line[0].strip(".jpg")[13:]
		row = []
		for i in range(len(line)):
			if i > 0:
				row.append(float(line[i]))		
		features_train_2048[int(image_name)] = row

In [7]:
# Train 2048-d with RandomForest
clf_2048 = RandomForestRegressor(max_depth=2, random_state=0,n_estimators=100,verbose=3)
clf_2048.fit(query,features_train_2048)

building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   16.5s remaining:    0.0s


building tree 2 of 100


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   33.2s remaining:    0.0s


building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 27.9min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=0, verbose=3, warm_start=False)

In [19]:
# Load test queries
query_test = []
with open('query_glove_test.csv', 'r') as csvfile:
	csv_reader = csv.reader(csvfile)
	for line in csv_reader:
		query_test.append(line)
print ("Finished loading queries")
tags_predict = clf_tags.predict(query_test)

# Vectorize test tags
test_tags = []
for i in range (2000):
	file = open("tags_test/" + str(i) + ".txt", "r")
	lines = file.readlines() 
	row = np.zeros(len(supercategory_set))
	for line in lines:
		words = line.strip().split(':')		
		supercategory_column = supercategory_dict.get(words[0])
		category_index = category_dict.get(words[1])
		row[supercategory_column] = category_index
	test_tags.append(row)
	file.close()

Finished loading queries


In [29]:
rank_list_tags = []
for i in range (2000):
    guess = tags_predict[i]
    dist = []
    for j in range(2000):
        dist.append((np.linalg.norm(guess-test_tags[j]),j))
    dist.sort()
    rank_list_tags.append([v for (k,v) in dist[:20]])
with open('tags_submission.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',')
    csv_writer.writerow(["Descritpion_ID", "Top_20_Image_IDs"])
    for i in range(len(rank_list_tags)):
        csv_writer.writerow([str(i) + ".txt", mergeTwenty(rank_list_tags[i])])

In [28]:
feature_2048_predict = clf_2048.predict(query_test)
#Reading 2048-d test features 
features_test_2048 = np.zeros((2000,2048))
with open('features_test/features_resnet1000intermediate_test.csv', 'r') as csvfile:
	csv_reader = csv.reader(csvfile)
	for line in csv_reader:
		image_name = line[0].strip(".jpg")[12:]
		row = []
		for i in range(len(line)):
			if i > 0:
				row.append(float(line[i]))		
		features_test_2048[int(image_name)] = row


rank_list = []
for i in range (2000):
    guess = feature_2048_predict[i]
    dist = []
    for j in range(2000):
        dist.append((np.linalg.norm(guess-features_test_2048[j]),j))
    dist.sort()
    rank_list.append([v for (k,v) in dist[:20]])

def mergeTwenty(lst):
    result = str(lst[0])+".jpg"
    for i in range(1,len(lst)):
        result += " " + str(lst[i]) + ".jpg"
    return result

with open('2048_submission.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',')
    csv_writer.writerow(["Descritpion_ID", "Top_20_Image_IDs"])
    for i in range(len(rank_list)):
        csv_writer.writerow([str(i) + ".txt", mergeTwenty(rank_list[i])])

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.7s finished


In [46]:
from sklearn.neighbors import KNeighborsRegressor
clf_knn = KNeighborsRegressor(n_neighbors=5)
features_train_concat = np.concatenate((np.array(features_train_2048),np.array(features_train)), axis=1)
features_train_concat = np.concatenate((features_train_concat, np.array(train_tags)), axis=1)
clf_knn.fit(query,features_train_concat)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [47]:
knn_predict_2048 = clf_knn.predict(query_test)
features_test_2048 = np.zeros((2000,2048))
with open('features_test/features_resnet1000intermediate_test.csv', 'r') as csvfile:
	csv_reader = csv.reader(csvfile)
	for line in csv_reader:
		image_name = line[0].strip(".jpg")[12:]
		row = []
		for i in range(len(line)):
			if i > 0:
				row.append(float(line[i]))		
		features_test_2048[int(image_name)] = row

features_test_1000 = np.zeros((2000,1000))    
with open('features_test/features_resnet1000_test.csv', 'r') as csvfile:
	csv_reader = csv.reader(csvfile)
	for line in csv_reader:
		image_name = line[0].strip(".jpg")[12:]
		row = []
		for i in range(len(line)):
			if i > 0:
				row.append(float(line[i]))		
		features_test_1000[int(image_name)] = row
        
features_test_concat = np.concatenate((np.array(features_test_2048),np.array(features_test_1000)), axis=1)
features_test_concat = np.concatenate((features_test_concat, np.array(test_tags)), axis=1)

rank_list = []
for i in range (2000):
    guess = knn_predict_2048[i]
    dist = []
    for j in range(2000):
        dist.append((np.linalg.norm(guess-features_test_concat[j]),j))
    dist.sort()
    rank_list.append([v for (k,v) in dist[:20]])

def mergeTwenty(lst):
    result = str(lst[0])+".jpg"
    for i in range(1,len(lst)):
        result += " " + str(lst[i]) + ".jpg"
    return result

with open('2048_submission_knn_concat.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',')
    csv_writer.writerow(["Descritpion_ID", "Top_20_Image_IDs"])
    for i in range(len(rank_list)):
        csv_writer.writerow([str(i) + ".txt", mergeTwenty(rank_list[i])])

In [130]:
from sklearn import svm
clf_svm = svm.SVC(gamma=0.001)
train_query = np.array(query,dtype=float)
X_train_1 = np.concatenate((train_query,features_train_concat),axis=1)
Y_train_1 = np.ones(10000)

In [131]:
train_query_reordered = np.concatenate((train_query[5000:10000],train_query[0:5000]),axis=0)
X_train_0 = np.concatenate((train_query_reordered,features_train_concat),axis=1)

In [132]:
Y_train_0 = np.zeros(10000)
X_train = np.concatenate((X_train_1,X_train_0),axis=0)
Y_train = np.concatenate((Y_train_1,Y_train_0),axis=0)

In [None]:
clf_svm = svm.SVC(gamma=0.001,verbose=True,probability=True)
clf_svm.fit(X_train,Y_train)

[LibSVM]

In [153]:
rank_list = []
test_query = np.array(query_test,dtype=float)  #2000,300
#features_test_concat  => (2000,3060)
for i in range (2000):
    if ((i % 20) == 0):
        print ("Running Test"+ str(i))
    query_repeat = np.tile(test_query[i],(2000,1))
    X_test = np.concatenate((query_repeat,features_test_concat),axis=1)
    svm_predictions = clf_svm.predict(X_test)
    predictions_indices = np.concatenate((svm_predictions,np.arange(2000)),axis=1)
    predictions_indices.sort()
    rank_list.append([v for (k,v) in dist[:20]])

Running Test0


AxisError: axis 1 is out of bounds for array of dimension 1

In [None]:
def mergeTwenty(lst):
    result = str(lst[0])+".jpg"
    for i in range(1,len(lst)):
        result += " " + str(lst[i]) + ".jpg"
    return result

with open('svm_submission.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',')
    csv_writer.writerow(["Descritpion_ID", "Top_20_Image_IDs"])
    for i in range(len(rank_list)):
        csv_writer.writerow([str(i) + ".txt", mergeTwenty(rank_list[i])])