In [None]:
import numpy as np
import csv
from sklearn.ensemble import RandomForestRegressor

# Find the set of supercategories and categories used by the training tags.
# Every non-blank line of a tag file is split on ':'; the first field is added
# to supercategory_set and the second to category_set.
supercategory_set = set()
category_set = set()
for i in range(10000):
    # `with` closes the handle even if a line fails to parse.
    with open("tags_train/" + str(i) + ".txt", "r") as tag_file:
        for line in tag_file:
            line = line.strip()
            if not line:
                # A blank line carries no tag; indexing words[1] on it would raise.
                continue
            words = line.split(':')
            supercategory_set.add(words[0])
            category_set.add(words[1])

# Mapping from (super)category name to index.
# The names are sorted before numbering: the iteration order of a set of
# strings can change from one interpreter start to the next (string hash
# randomization), which would silently assign different columns/indices to the
# same names each time the kernel is restarted.
supercategory_dict = {item: val for val, item in enumerate(sorted(supercategory_set))}
# Category indices start at 1; the tag vectors built from this mapping are
# zero-initialised, so 0 stays distinguishable from every real category.
category_dict = {item: val + 1 for val, item in enumerate(sorted(category_set))}

# Vectorize train tags: one row per image with one column per supercategory.
# A column holds the index of the category tagged under that supercategory and
# stays 0 when the supercategory does not occur in the image's tag file.
train_tags = []
for i in range(10000):
    row = np.zeros(len(supercategory_set))
    # `with` closes the handle even if a line fails to parse.
    with open("tags_train/" + str(i) + ".txt", "r") as tag_file:
        for line in tag_file:
            line = line.strip()
            if not line:
                continue
            words = line.split(':')
            # Direct lookup: a name missing from the mapping raises KeyError
            # instead of yielding None, which numpy would treat as np.newaxis
            # and use to overwrite the entire row.
            supercategory_column = supercategory_dict[words[0]]
            category_index = category_dict[words[1]]
            # A later tag with the same supercategory overwrites an earlier one.
            row[supercategory_column] = category_index
    train_tags.append(row)

print ("Finished loading tags")

In [4]:
# Load train queries: every CSV row is kept as a list of strings.
query = []
with open('query_glove.csv', 'r') as csvfile:
    query.extend(csv.reader(csvfile))
print ("Finished loading queries")

Finished loading queries


In [5]:
# Reading 1000-d train features.
# Each CSV row holds an image name in its first field followed by the feature
# values; the numeric part of the name selects the row of features_train.
features_train = np.zeros((10000,1000))
with open('features_train/features_resnet1000_train.csv', 'r') as csvfile:
    for line in csv.reader(csvfile):
        image_name = line[0]
        # Remove the extension explicitly. str.strip(".jpg") strips any of the
        # characters '.', 'j', 'p', 'g' from BOTH ends rather than the suffix.
        if image_name.endswith(".jpg"):
            image_name = image_name[:-len(".jpg")]
        # Drop the first 13 characters (presumably the "images_train/" prefix
        # -- confirm against the CSV) to leave the image number.
        image_name = image_name[13:]
        features_train[int(image_name)] = [float(value) for value in line[1:]]

In [6]:
# Reading 2048-d train features.
# Each CSV row holds an image name in its first field followed by the feature
# values; the numeric part of the name selects the row of features_train_2048.
features_train_2048 = np.zeros((10000,2048))
with open('features_train/features_resnet1000intermediate_train.csv', 'r') as csvfile:
    for line in csv.reader(csvfile):
        image_name = line[0]
        # Remove the extension explicitly. str.strip(".jpg") strips any of the
        # characters '.', 'j', 'p', 'g' from BOTH ends rather than the suffix.
        if image_name.endswith(".jpg"):
            image_name = image_name[:-len(".jpg")]
        # Drop the first 13 characters (presumably the "images_train/" prefix
        # -- confirm against the CSV) to leave the image number.
        image_name = image_name[13:]
        features_train_2048[int(image_name)] = [float(value) for value in line[1:]]

In [8]:
# Load test queries: every CSV row is kept as a list of strings.
query_test = []
with open('query_glove_test.csv', 'r') as csvfile:
    query_test.extend(csv.reader(csvfile))
print ("Finished loading queries")

# Vectorize test tags using the name -> index mappings built from the training
# tags: one row per test image, one column per supercategory, holding the
# category index (0 when the supercategory is absent).
test_tags = []
for i in range(2000):
    row = np.zeros(len(supercategory_set))
    # `with` closes the handle even if a line fails to parse.
    with open("tags_test/" + str(i) + ".txt", "r") as tag_file:
        for line in tag_file:
            line = line.strip()
            if not line:
                continue
            words = line.split(':')
            supercategory_column = supercategory_dict.get(words[0])
            category_index = category_dict.get(words[1])
            if supercategory_column is None or category_index is None:
                # Name never seen in the training tags. Indexing with None is
                # np.newaxis, so `row[None] = x` would overwrite the whole
                # row, and assigning None as the value raises; skip the tag.
                continue
            # A later tag with the same supercategory overwrites an earlier one.
            row[supercategory_column] = category_index
    test_tags.append(row)

Finished loading queries


In [11]:
from sklearn.neighbors import KNeighborsRegressor
# Per-image training matrix: the 2048-d features, the 1000-d features and the
# tag vector placed side by side, in that column order.
features_train_concat = np.concatenate(
    (np.array(features_train_2048), np.array(features_train), np.array(train_tags)),
    axis=1,
)

In [9]:
# Reading 2048-d test features.
# Each CSV row holds an image name in its first field followed by the feature
# values; the numeric part of the name selects the row of features_test_2048.
features_test_2048 = np.zeros((2000,2048))
with open('features_test/features_resnet1000intermediate_test.csv', 'r') as csvfile:
    for line in csv.reader(csvfile):
        image_name = line[0]
        # Remove the extension explicitly. str.strip(".jpg") strips any of the
        # characters '.', 'j', 'p', 'g' from BOTH ends rather than the suffix.
        if image_name.endswith(".jpg"):
            image_name = image_name[:-len(".jpg")]
        # Drop the first 12 characters (presumably the "images_test/" prefix
        # -- confirm against the CSV) to leave the image number.
        image_name = image_name[12:]
        features_test_2048[int(image_name)] = [float(value) for value in line[1:]]

# Reading 1000-d test features.
# Each CSV row holds an image name in its first field followed by the feature
# values; the numeric part of the name selects the row of features_test_1000.
features_test_1000 = np.zeros((2000,1000))
with open('features_test/features_resnet1000_test.csv', 'r') as csvfile:
    for line in csv.reader(csvfile):
        image_name = line[0]
        # Remove the extension explicitly. str.strip(".jpg") strips any of the
        # characters '.', 'j', 'p', 'g' from BOTH ends rather than the suffix.
        if image_name.endswith(".jpg"):
            image_name = image_name[:-len(".jpg")]
        # Drop the first 12 characters (presumably the "images_test/" prefix
        # -- confirm against the CSV) to leave the image number.
        image_name = image_name[12:]
        features_test_1000[int(image_name)] = [float(value) for value in line[1:]]
        
# Per-image test matrix with the same column order as the training matrix:
# 2048-d features, then 1000-d features, then the tag vector.
features_test_concat = np.concatenate(
    (np.array(features_test_2048), np.array(features_test_1000), np.array(test_tags)),
    axis=1,
)


In [12]:
from sklearn import svm
# Rows labelled 1: query row i placed next to image-feature row i
# (presumably query i describes image i -- confirm against the data files).
train_query = np.asarray(query, dtype=float)
X_train_1 = np.hstack((train_query, features_train_concat))
Y_train_1 = np.ones(10000)

# SVC instance; it is only constructed here, not fitted.
clf_svm = svm.SVC(gamma=0.001)

In [13]:
# Swap the two halves of the query matrix (rows 5000-9999 first, then rows
# 0-4999) so that every query row lands next to the features of a different
# image row; these rows are labelled 0 by Y_train_0.
second_half = train_query[5000:10000]
first_half = train_query[0:5000]
train_query_reordered = np.vstack((second_half, first_half))
X_train_0 = np.hstack((train_query_reordered, features_train_concat))

In [14]:
# Stack the label-1 rows on top of the label-0 rows; Y_train follows the same order.
Y_train_0 = np.zeros(10000)
X_train = np.vstack((X_train_1, X_train_0))
Y_train = np.hstack((Y_train_1, Y_train_0))

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest on the raw (non-PCA) query + image feature rows.
clf_RF = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    random_state=0,
    verbose=3,
)
clf_RF.fit(X_train, Y_train)

In [None]:
# For every test query, score all 2000 test images with the random forest and
# keep the indices of the 20 highest-scoring images.
rank_list = []
test_query = np.array(query_test, dtype=float)  # noted as (2000, 300) in the original
# features_test_concat is noted as (2000, 3060) in the original
for i in range(2000):
    # Repeat query i once per test image so it can be paired with each of them.
    query_repeat = np.tile(test_query[i], (2000, 1))
    X_test = np.concatenate((query_repeat, features_test_concat), axis=1)
    svm_predictions = clf_RF.predict_proba(X_test)
    # Order by (column 1 of predict_proba, image index), largest first.
    predictions_indices = sorted(
        ((svm_predictions[index][1], index) for index in range(2000)),
        reverse=True,
    )
    rank_list.append([image_index for (_score, image_index) in predictions_indices[:20]])
def mergeTwenty(lst):
    """Format image ids as a space-separated list of ".jpg" file names.

    Args:
        lst: iterable of image ids; each one is converted with str().

    Returns:
        A string such as "3.jpg 1.jpg 2.jpg". An empty input yields "" instead
        of raising IndexError.
    """
    return " ".join(str(image_id) + ".jpg" for image_id in lst)

# Write the random-forest ranking: a header row, then one row per query
# holding "<query index>.txt" and its ranked image file names.
with open('RF_full_data_submission.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',')
    # Header text is reproduced verbatim, spelling included -- presumably it
    # has to match the expected submission format; confirm before changing it.
    csv_writer.writerow(["Descritpion_ID", "Top_20_Image_IDs"])
    csv_writer.writerows(
        [str(i) + ".txt", mergeTwenty(ranked)] for i, ranked in enumerate(rank_list)
    )

In [108]:
from sklearn.neighbors import KNeighborsClassifier
# 20-nearest-neighbour classifier on the raw query + image feature rows,
# using all available cores (n_jobs=-1).
clf_KNN = KNeighborsClassifier(
    n_neighbors=20,
    n_jobs=-1,
)
clf_KNN.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
           weights='uniform')

In [109]:
# Score the first test query against every test image with the KNN classifier.
test_query = np.array(query_test, dtype=float)
query_repeat = np.tile(test_query[0], (2000, 1))
X_test = np.concatenate((query_repeat, features_test_concat), axis=1)
svm_predictions = clf_KNN.predict_proba(X_test)
# Order by (column 1 of predict_proba, image index), largest first.
predictions_indices = sorted(
    ((svm_predictions[index][1], index) for index in range(2000)),
    reverse=True,
)
# NOTE(review): this appends to whatever rank_list currently holds, so running
# the cell repeatedly keeps growing that list -- confirm this is intended.
rank_list.append([image_index for (_score, image_index) in predictions_indices[:20]])

In [None]:
# For every test query, score all 2000 test images with the KNN classifier and
# keep the indices of the 20 highest-scoring images.
rank_list = []
test_query = np.array(query_test, dtype=float)  # noted as (2000, 300) in the original
# features_test_concat is noted as (2000, 3060) in the original
for i in range(2000):
    # Progress message every 40 queries.
    if i % 40 == 0:
        print ("Running image" + str(i))
    # Repeat query i once per test image so it can be paired with each of them.
    query_repeat = np.tile(test_query[i], (2000, 1))
    X_test = np.concatenate((query_repeat, features_test_concat), axis=1)
    svm_predictions = clf_KNN.predict_proba(X_test)
    # Order by (column 1 of predict_proba, image index), largest first.
    predictions_indices = sorted(
        ((svm_predictions[index][1], index) for index in range(2000)),
        reverse=True,
    )
    rank_list.append([image_index for (_score, image_index) in predictions_indices[:20]])

def mergeTwenty(lst):
    """Format image ids as a space-separated list of ".jpg" file names.

    Args:
        lst: iterable of image ids; each one is converted with str().

    Returns:
        A string such as "3.jpg 1.jpg 2.jpg". An empty input yields "" instead
        of raising IndexError.
    """
    return " ".join(str(image_id) + ".jpg" for image_id in lst)

# Write the KNN ranking: a header row, then one row per query holding
# "<query index>.txt" and its ranked image file names.
with open('KNN_full_data_submission.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',')
    # Header text is reproduced verbatim, spelling included -- presumably it
    # has to match the expected submission format; confirm before changing it.
    csv_writer.writerow(["Descritpion_ID", "Top_20_Image_IDs"])
    csv_writer.writerows(
        [str(i) + ".txt", mergeTwenty(ranked)] for i, ranked in enumerate(rank_list)
    )

In [113]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the training queries (zero mean, unit variance per column).
train_query_std = StandardScaler().fit_transform(np.array(query,dtype=float))

# Standardize each image-feature matrix, then reduce it with PCA.
# random_state is fixed so the projection is reproducible across kernel
# restarts when PCA picks one of its randomized solvers.
pca_1000 = PCA(n_components=100, random_state=0)
features_train_1000_PCA = pca_1000.fit_transform(StandardScaler().fit_transform(features_train))

pca_2048 = PCA(n_components=200, random_state=0)
features_train_2048_PCA = pca_2048.fit_transform(StandardScaler().fit_transform(features_train_2048))

# Column order: reduced 2048-d features, reduced 1000-d features, standardized tags.
features_train_PCA = np.concatenate(
    (
        np.array(features_train_2048_PCA),
        np.array(features_train_1000_PCA),
        np.array(StandardScaler().fit_transform(train_tags)),
    ),
    axis=1,
)
print(features_train_PCA.shape)

(10000, 312)


In [114]:
# Same construction as the raw training matrix, on the standardized/PCA data:
# query row i next to feature row i, followed by the half-swapped queries next
# to the same feature rows.
X_train_1_PCA = np.hstack((train_query_std, features_train_PCA))
train_query_reordered_PCA = np.vstack((train_query_std[5000:10000], train_query_std[0:5000]))
X_train_0_PCA = np.hstack((train_query_reordered_PCA, features_train_PCA))
X_train_PCA = np.vstack((X_train_1_PCA, X_train_0_PCA))
print (X_train_PCA.shape)

(20000, 612)


In [None]:
# Random forest on the standardized/PCA-reduced rows, using all available
# cores (n_jobs=-1); labels are the same Y_train as for the raw matrix.
clf_RF_PCA = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    random_state=0,
    verbose=3,
    n_jobs=-1,
)
clf_RF_PCA.fit(X_train_PCA, Y_train)

In [118]:
# Standardize the test data with statistics learned from the TRAINING data.
# Fitting a fresh StandardScaler on the test set would centre and scale the
# test features differently from the training features that pca_1000,
# pca_2048 and the classifier were fitted on.
test_query_std = StandardScaler().fit(np.array(query, dtype=float)).transform(
    np.array(test_query, dtype=float)
)
features_test_1000_PCA = pca_1000.transform(
    StandardScaler().fit(features_train).transform(features_test_1000)
)
features_test_2048_PCA = pca_2048.transform(
    StandardScaler().fit(features_train_2048).transform(features_test_2048)
)
# Column order matches features_train_PCA: reduced 2048-d features, reduced
# 1000-d features, standardized tags.
features_test_PCA = np.concatenate(
    (
        np.array(features_test_2048_PCA),
        np.array(features_test_1000_PCA),
        np.array(StandardScaler().fit(train_tags).transform(test_tags)),
    ),
    axis=1,
)
print(features_test_PCA.shape)

(2000, 312)


In [None]:
# For every test query, score all 2000 test images with the PCA random forest
# and keep the indices of the 20 highest-scoring images.
rank_list = []
for i in range (2000):
    # Progress message every 40 queries.
    if i%40 == 0:
        print ("Running image" + str(i))
    # Use the standardized query: clf_RF_PCA was fitted on train_query_std, so
    # feeding it the raw test_query row would put the query columns on a
    # different scale from the one the model was trained on.
    query_repeat = np.tile(test_query_std[i],(2000,1))
    X_test = np.concatenate((query_repeat,features_test_PCA),axis=1)
    svm_predictions = clf_RF_PCA.predict_proba(X_test)
    # Order by (column 1 of predict_proba, image index), largest first.
    predictions_indices = [(svm_predictions[index][1],index) for index in range(2000)]
    predictions_indices.sort(reverse=True)
    rank_list.append([v for (k,v) in predictions_indices[:20]])

def mergeTwenty(lst):
    """Format image ids as a space-separated list of ".jpg" file names.

    Args:
        lst: iterable of image ids; each one is converted with str().

    Returns:
        A string such as "3.jpg 1.jpg 2.jpg". An empty input yields "" instead
        of raising IndexError.
    """
    return " ".join(str(image_id) + ".jpg" for image_id in lst)

# Write the PCA random-forest ranking: a header row, then one row per query
# holding "<query index>.txt" and its ranked image file names.
with open('RF_PCA_submission.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',')
    # Header text is reproduced verbatim, spelling included -- presumably it
    # has to match the expected submission format; confirm before changing it.
    csv_writer.writerow(["Descritpion_ID", "Top_20_Image_IDs"])
    csv_writer.writerows(
        [str(i) + ".txt", mergeTwenty(ranked)] for i, ranked in enumerate(rank_list)
    )

In [None]:
from sklearn.model_selection import cross_val_score
# make_scorer is exported from the public sklearn.metrics namespace; the
# sklearn.metrics.scorer module path was deprecated and later removed.
from sklearn.metrics import make_scorer

def my_scorer(ground_truth, predictions):
    """Average top-20 retrieval score of predictions against ground truth.

    For every index i, all ground-truth entries are ordered by their Euclidean
    distance to predictions[i] (ties broken by the smaller index). If entry i
    is among the 20 nearest at 0-based position `rank`, it contributes
    (20 - rank) / 20, so the nearest position is worth 1.0 and the 20th is
    worth 0.05; otherwise it contributes nothing.

    Args:
        ground_truth: indexable sequence of target values/vectors.
        predictions: indexable sequence of predicted values/vectors with the
            same length as ground_truth.

    Returns:
        The mean contribution over all entries, or 0.0 for empty input.
    """
    length = len(ground_truth)
    if length == 0:
        # Nothing to score; avoids dividing by zero below.
        return 0.0
    score = 0.0
    for i in range(length):
        dist = [
            (np.linalg.norm(predictions[i] - ground_truth[j]), j)
            for j in range(length)
        ]
        dist.sort()
        rank_list = [j for (_distance, j) in dist[:20]]
        if i in rank_list:
            # list.index() is 0-based, so (20 - rank) gives the top hit a
            # score of exactly 1.0 (the previous (21 - rank) gave 1.05).
            rank = rank_list.index(i)
            score += (20.0 - rank) / 20.0
    return score / length

# Wrap the scoring function as a scikit-learn scorer (larger values are
# better); the name my_scorer is rebound to the scorer object.
my_scorer = make_scorer(my_scorer, greater_is_better=True)
# NOTE(review): this cross-validates clf_RF_PCA on X_train rather than
# X_train_PCA -- confirm which feature matrix is intended.
scores = cross_val_score(
    clf_RF_PCA,
    X_train,
    Y_train,
    scoring=my_scorer,
    cv=5,
)