In [283]:
import os
import csv
import random
import gensim
import numpy as np

num_train = 8000
num_dev = 2000
num_test = 2000

def parse_descriptions(data_dir, num_doc):
    """Read the numbered text files ``0.txt`` .. ``<num_doc - 1>.txt`` from a directory.

    Parameters
    ----------
    data_dir : directory that contains the ``<i>.txt`` files.
    num_doc : number of files to read, starting at index 0.

    Returns
    -------
    list of str: the complete content of each file, in index order.
    """
    contents = []
    for doc_id in range(num_doc):
        doc_path = os.path.join(data_dir, "%d.txt" % doc_id)
        with open(doc_path) as handle:
            contents.append(handle.read())
    return contents

# Load the raw description and tag text files. The train directories are read for
# num_train + num_dev documents, so the dev split is loaded together with train.
train_desc = parse_descriptions("descriptions_train", num_doc=(num_train+num_dev))
train_tags = parse_descriptions("tags_train", num_doc=(num_train+num_dev))
# The test directories are read for num_test documents each.
test_desc =  parse_descriptions("descriptions_test", num_doc=(num_test))
test_tags = parse_descriptions("tags_test", num_doc=(num_test))

In [297]:
#parsing and building Y matrices
def parse_features(features_path):
    """Load a CSV of per-image feature vectors into a matrix ordered by image id.

    Every non-empty row is read as ``<path>/<id>.<ext>,v0,v1,...``: the integer
    ``<id>`` is taken from the file name in the first column and the remaining
    columns are parsed as floats.

    Parameters
    ----------
    features_path : path of the CSV file to read.

    Returns
    -------
    np.ndarray with one row per image id, sorted by ascending id.

    Raises
    ------
    ValueError if a file name does not start with an integer id or a value is
    not a valid float.
    """
    vec_map = {}
    # newline="" is what the csv module recommends for files handed to csv.reader.
    with open(features_path, newline="") as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields [] for blank lines; skip them instead of
                # failing with IndexError on row[0].
                continue
            # Use the last path component so entries without a directory prefix
            # are accepted too.
            file_name = row[0].split("/")[-1]
            img_id = int(file_name.split(".")[0])
            vec_map[img_id] = np.array([float(x) for x in row[1:]])
    # Sort on the ids only; the vectors themselves never need to be compared.
    return np.array([vec_map[img_id] for img_id in sorted(vec_map)])

# build y matrices
# One row per image, ordered by image id (see parse_features); the printed
# shapes below are a quick sanity check of the loaded files.
y_train = parse_features("features_train/features_resnet1000_train.csv") 
y_test = parse_features("features_test/features_resnet1000_test.csv") 

print("Built all y matrices!")
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

Built all y matrices!
y_train shape: (10000, 1000)
y_test shape: (2000, 1000)


In [285]:
#parsing and building Y matrices of pool5 vectors
def parse_features(features_path):
    """Load a CSV of per-image feature vectors into a matrix ordered by image id.

    NOTE(review): this cell redefines the ``parse_features`` already defined in
    the fc1000 cell above it in this notebook; the later definition shadows the
    earlier one, so the two must stay identical (or one should be removed).

    Every non-empty row is read as ``<path>/<id>.<ext>,v0,v1,...``: the integer
    ``<id>`` is taken from the file name in the first column and the remaining
    columns are parsed as floats.

    Parameters
    ----------
    features_path : path of the CSV file to read.

    Returns
    -------
    np.ndarray with one row per image id, sorted by ascending id.

    Raises
    ------
    ValueError if a file name does not start with an integer id or a value is
    not a valid float.
    """
    vec_map = {}
    # newline="" is what the csv module recommends for files handed to csv.reader.
    with open(features_path, newline="") as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields [] for blank lines; skip them instead of
                # failing with IndexError on row[0].
                continue
            # Use the last path component so entries without a directory prefix
            # are accepted too.
            file_name = row[0].split("/")[-1]
            img_id = int(file_name.split(".")[0])
            vec_map[img_id] = np.array([float(x) for x in row[1:]])
    # Sort on the ids only; the vectors themselves never need to be compared.
    return np.array([vec_map[img_id] for img_id in sorted(vec_map)])

# build y matrices
# Same loader as for the fc1000 features, applied to the "intermediate" feature
# files; the printed shapes below are a quick sanity check of the loaded files.
yint_train = parse_features("features_train/features_resnet1000intermediate_train.csv") 
yint_test = parse_features("features_test/features_resnet1000intermediate_test.csv") 

print("Built all y matrices!")
print("yint_train shape:", yint_train.shape)
print("yint_test shape:", yint_test.shape)

Built all y matrices!
yint_train shape: (10000, 2048)
yint_test shape: (2000, 2048)


In [298]:
#Description to fc1000
# Parse fc1000_labels.csv into `labels`: one list of word tokens per input line.
# The file is read inside a `with` block so the handle is closed deterministically
# (a bare open() in a comprehension leaves it to the garbage collector).
with open('fc1000_labels.csv') as labels_file:
    # Drop the newline, then the first and last character of every line.
    lines = [line.rstrip('\n')[1:-1] for line in labels_file]
# Split each line on ": "; the label text is the part right after the first ": ".
lines = [line.split(": ") for line in lines]
labels = []
for line in lines:
    # Strip the first and last character of the label text
    # (presumably the surrounding quotes -- confirm against the file).
    y = line[1][1:-1]
    # Treat ", " as a plain word separator.
    y = y.replace(', ',' ')
    # Remove hyphens, which joins hyphenated words into a single token.
    y = y.replace('-','')
    y = y.split(' ')
    labels.append(y)

#labels to word_vectors
# Each label becomes the mean of the word vectors of its in-vocabulary words.
# NOTE(review): `word2vec` is loaded by a cell that appears further down in this
# notebook; that cell has to run before this one.
label_vec = []
for y in labels:
    # `w in word2vec` is the membership test KeyedVectors supports across gensim
    # versions; the `.vocab` attribute is not available in gensim >= 4.
    vecs = [word2vec.get_vector(w) for w in y if w in word2vec]
    if vecs:
        label_vec.append(np.stack(vecs).mean(0))
    else:
        # No word of this label is known: np.stack([]) would raise, so fall back
        # to a zero vector to keep one row per label.
        label_vec.append(np.zeros(word2vec.vector_size))

label_vec = np.array(label_vec)

In [299]:
def normalize(y_set):
    """Clip every non-positive entry of ``y_set`` to zero (element-wise).

    Despite the name this does no rescaling: values ``> 0`` are kept as they
    are, everything else (including NaN) becomes 0.

    Parameters
    ----------
    y_set : array-like of numbers, typically a 2-D matrix with one row per sample.

    Returns
    -------
    np.ndarray of the same shape as ``y_set``; the input is not modified.
    """
    values = np.asarray(y_set)
    # Vectorized replacement for the per-element Python loop; it also keeps the
    # result dtype tied to the input instead of to which values were clipped.
    return np.where(values > 0, values, 0)

def normalize2(y_set):
    """Zero out every entry that is not above 20% of its own row's maximum.

    For each row, values strictly greater than ``0.2 * max(row)`` are kept
    unchanged and all other values are replaced by 0.

    Parameters
    ----------
    y_set : 2-D array-like of numbers, one row per sample.

    Returns
    -------
    np.ndarray of the same shape as ``y_set``; the input is not modified.
    """
    values = np.asarray(y_set)
    if values.size == 0:
        # Nothing to threshold; also avoids taking a maximum over an empty axis.
        return values
    # keepdims=True lets the per-row threshold broadcast against the matrix.
    row_max = values.max(axis=1, keepdims=True)
    return np.where(values > 0.2 * row_max, values, 0)

# Clip negative fc1000 activations to zero.
y_train_ = normalize(y_train)
y_test_ = normalize(y_test)

# Per-column maxima, used to scale every column into [0, 1].
y_train_max_ = np.max(y_train_, axis = 0)
y_test_max_ = np.max(y_test_, axis = 0)
# A column that is never positive has maximum 0 after clipping; dividing by it
# would give 0/0 = NaN (and a RuntimeWarning). `where=` skips those columns and
# `out=` leaves them at 0 instead.
# NOTE(review): the test rows are scaled by the test-set maxima rather than the
# training maxima -- confirm that this is intended.
y_train_norm = [
    np.divide(y, y_train_max_, out=np.zeros_like(y, dtype=float), where=y_train_max_ > 0)
    for y in y_train_
]
y_test_norm = [
    np.divide(y, y_test_max_, out=np.zeros_like(y, dtype=float), where=y_test_max_ > 0)
    for y in y_test_
]

#pool5 vectors
# Keep only entries above 20% of each row's maximum.
yint_train_ = normalize2(yint_train)
yint_test_ = normalize2(yint_test)

In [351]:
#get labels for y_train_norm
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list.

    Only one level of nesting is removed; the order of the items is preserved.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def get_label(y):
    """Collect the label words of every class whose value in ``y`` is positive.

    Parameters
    ----------
    y : sequence of numbers; position ``i`` is looked up in the module-level
        ``labels`` list.

    Returns
    -------
    list of str: the word lists ``labels[i]`` of all positions with ``y[i] > 0``,
    concatenated in index order (empty if no value is positive).
    """
    # The unused `max_val = np.max(y)` was dropped: it had no effect on the
    # result and made the function raise on an empty `y`.
    return flatten([labels[i] for i in range(len(y)) if y[i]>0])

# Label words of every positively-activated class, per image.
y_train_labels = [get_label(y) for y in y_train_norm]
y_test_labels = [get_label(y) for y in y_test_norm]
# Turn each image's label words into a single vector.
# NOTE(review): doc_to_vec and word2vec are defined in cells that appear further
# down in this notebook; those cells have to run before this one.
y_train_lv = np.array([doc_to_vec(tags, word2vec) for tags in y_train_labels])
y_test_lv = np.array([doc_to_vec(tags, word2vec) for tags in y_test_labels])

In [287]:
# Load the word vectors from the gzipped binary word2vec file (path is relative to
# the working directory). Several other cells use `word2vec` as a global.
word2vec = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

In [346]:
from collections import Counter

def filter(tags, n=10):
    """Return the ``n`` most frequent tags together with their counts.

    NOTE(review): this name shadows the built-in ``filter`` for the rest of the
    notebook; it is kept because other cells call it under this name.

    Parameters
    ----------
    tags : iterable of hashable items (words).
    n : maximum number of (tag, count) pairs to return; defaults to 10.

    Returns
    -------
    list of ``(tag, count)`` tuples, most frequent first.
    """
    return Counter(tags).most_common(n)
    
def doc_to_vec(tags, word2vec):
    """Embed a list of words as the count-weighted mean of their word vectors.

    Words unknown to ``word2vec`` are dropped, the rest is passed through
    ``get_lemmatize`` and reduced to the most frequent entries by ``filter``;
    the result is the average of the remaining word vectors, weighted by how
    often each word occurred.

    NOTE(review): ``get_lemmatize`` is not defined in the visible part of this
    notebook -- it has to exist in the kernel before this is called.

    Parameters
    ----------
    tags : list of str.
    word2vec : gensim ``KeyedVectors`` (needs ``in`` membership and ``get_vector``).

    Returns
    -------
    np.ndarray of length ``word2vec.vector_size`` (300 if that attribute is
    missing); all zeros when no usable word is left.
    """
    dim = getattr(word2vec, "vector_size", 300)
    # `w in word2vec` is the membership test KeyedVectors supports across gensim
    # versions; the `.vocab` attribute is not available in gensim >= 4.
    tags = [tag for tag in tags if tag in word2vec]
    tags = filter(get_lemmatize(tags))
    wordvecs = np.zeros(dim)
    n=0
    for w,i in tags:
        # A lemma may fall outside the vocabulary even though the surface form
        # was in it, hence the second membership check.
        if w in word2vec:
            n+=i
            wordvecs = wordvecs + i*word2vec.get_vector(w)
    if n == 0:
        # Covers both "no tags at all" and "every lemma was out of vocabulary";
        # dividing by n here would produce a NaN vector.
        return wordvecs
    return wordvecs/n

In [333]:
#Preprocessing descriptions
from textblob import TextBlob,Word
from gensim.parsing.preprocessing import strip_punctuation

# Words that are never kept, whatever their part-of-speech tag.
colors = ['blue','orange','yellow','green','black','brown','grey','white','purple']
def get_nouns(s):
    """Return the punctuation-stripped words of ``s`` that carry a selected POS tag.

    A word is kept when TextBlob tags it with one of the tags listed below and
    it is not one of the (case-sensitive) entries of ``colors``.

    NOTE(review): TextBlob's default tagger appears to emit Penn Treebank tags,
    where adjectives are "JJ"; "ADJ" may therefore never match -- confirm
    whether adjectives were meant to be included.
    """
    kept_tags = ["NN","NNS","NNP","ADJ"]
    selected = []
    for word, tag in TextBlob(s).tags:
        if tag not in kept_tags or word in colors:
            continue
        selected.append(strip_punctuation(word))
    return selected
    

# Transform every description into its list of selected words (see get_nouns),
# with punctuation stripped.
x_train = [get_nouns(s) for s in train_desc]
x_test = [get_nouns(s) for s in test_desc]

# Lemmatizing here is disabled; doc_to_vec lemmatizes the words itself.
#x_train = [get_lemmatize(tags) for tags in x_train]
#x_test = [get_lemmatize(tags) for tags in x_test]
print("x_train length", len(x_train))
print("x_test length", len(x_test))



x_train length 10000
x_test length 2000


NameError: name 'x_train_' is not defined

In [350]:
# Convert the word list of every description into one vector (see doc_to_vec).
# Note the capitalisation: X_train / X_test are the vector matrices, while
# x_train / x_test are the word lists they are built from.
X_train = np.array([doc_to_vec(tags, word2vec) for tags in x_train])
X_test = np.array([doc_to_vec(tags, word2vec) for tags in x_test])

print("x_train shape:", X_train.shape)
print("x_test shape:", X_test.shape)

x_train shape: (10000, 300)
x_test shape: (2000, 300)


In [366]:
#training_desc BOW
from sklearn.feature_extraction.text import CountVectorizer
# Build the bag-of-words vocabulary (English stop words removed) from every word
# that occurs in the train or test word lists; each word is fed as its own
# one-word "document", which is enough to learn the vocabulary.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(flatten(x_train)+flatten(x_test))
# len(vocabulary_) is the number of learned features; unlike get_feature_names()
# it is available in both old and current scikit-learn releases.
print("BOW dimension:", len(vectorizer.vocabulary_))

BOW dimension: 7143


In [388]:
# Re-join every word list into a single space-separated string, which is the
# document form the fitted vectorizer is applied to.
x_train_b = [" ".join(x) for x in x_train]
x_test_b = [" ".join(x) for x in x_test]
# Sparse count matrices over the vocabulary learned above (one row per description).
x_train_bow =vectorizer.transform(x_train_b)
x_test_bow = vectorizer.transform(x_test_b)
print(x_train_bow.shape)
print(x_test_bow.shape)

(10000, 7143)
(2000, 7143)


<10000x7143 sparse matrix of type '<class 'numpy.int64'>'
	with 102785 stored elements in Compressed Sparse Row format>

In [325]:
# Preprocess the tags associated with the images: strip punctuation from the
# whole tag file, split it on whitespace and embed the resulting words.
tag_train = np.array([doc_to_vec(strip_punctuation(s).split(),word2vec) for s in train_tags])
tag_test = np.array([doc_to_vec(strip_punctuation(s).split(),word2vec) for s in test_tags])

print("tag_train shape:", (tag_train.shape))
print("tag_test shape:", (tag_test.shape))

tag_train shape: (10000, 300)
tag_test shape: (2000, 300)


In [329]:
# Alternative tag parsing: for every line of a tag file keep only the words of the
# field right after the first ":" (i.e. drop whatever precedes the colon).
# `[:-1]` discards the last piece of the split, presumably the empty string left
# by a trailing newline -- confirm against the tag files.
train_tags_  = [flatten([tag.split(":")[1].split() for tag in tags.split("\n")[:-1]]) for tags in train_tags]
test_tags_  = [flatten([tag.split(":")[1].split() for tag in tags.split("\n")[:-1]]) for tags in test_tags]

# Embed the per-image word lists (see doc_to_vec).
train_tags_vec = np.array([doc_to_vec(s,word2vec) for s in train_tags_])
test_tags_vec = np.array([doc_to_vec(s,word2vec) for s in test_tags_])
print("tag_train shape:", (train_tags_vec.shape))
print("tag_test shape:", (test_tags_vec.shape))

tag_train shape: (10000, 300)
tag_test shape: (2000, 300)


In [294]:
#setting up k-fold evaluation 
from sklearn.model_selection import KFold
from sklearn.metrics.pairwise import cosine_distances

In [None]:
#Another combination approach

def eval_map20_3(dev_distances):
    """Score a query-by-candidate distance matrix where candidate ``i`` is the
    correct match for query ``i``.

    For every row the candidates are ranked by ascending distance. A query
    scores ``1 / (rank + 1)`` when its true candidate is among the first 20
    (0-based rank < 20) and 0 otherwise.

    Parameters
    ----------
    dev_distances : 2-D array-like, ``dev_distances[i][j]`` is the distance
        between query ``i`` and candidate ``j``. All rows are evaluated, so the
        function no longer depends on the global ``num_dev``.

    Returns
    -------
    tuple ``(mean score, mean rank of the true candidate, median rank)``; the
    same three values are also printed.
    """
    dev_distances = np.asarray(dev_distances)
    dev_scores = []
    dev_pos_list = []
    for i, row in enumerate(dev_distances):
        # Position of the true candidate in the ascending-distance ordering.
        dev_pos = int(np.where(np.argsort(row) == i)[0][0])
        dev_pos_list.append(dev_pos)
        if dev_pos < 20:
            dev_scores.append(1 / (dev_pos + 1))
        else:
            dev_scores.append(0.0)
    # Compute the summary statistics once and reuse them for print and return.
    result = (np.mean(dev_scores), np.mean(dev_pos_list), np.median(dev_pos_list))
    print(*result)
    return result

def cross_val3(reg, xt, yt):
    """Cross-validate a two-target retrieval model and print its ranking metrics.

    For every fold the regressor is fitted twice on the same inputs -- once to
    predict ``yt`` and once to predict the module-level ``train_tags_vec`` --
    and the two cosine-distance matrices (prediction vs. held-out target) are
    multiplied element-wise. The combined matrix is scored with
    ``eval_map20_3`` and the per-fold results are aggregated and printed.

    NOTE(review): the fold splitter ``kf`` is a module-level name that is not
    defined in the visible part of this notebook (only ``KFold`` is imported);
    it has to exist in the kernel before this is called.

    Parameters
    ----------
    reg : estimator with ``fit(X, y)`` / ``predict(X)``; it is refitted in place.
    xt : input matrix, indexable by the index arrays produced by ``kf.split``.
    yt : first target matrix, row-aligned with ``xt`` and ``train_tags_vec``.

    Returns
    -------
    None; results are printed.
    """
    def fold_distances(targets, train_index, test_index):
        # Fit on the training part of the fold, then compare the predictions for
        # the held-out part with the held-out targets.
        reg.fit(xt[train_index], targets[train_index])
        predicted = reg.predict(xt[test_index])
        return cosine_distances(predicted, targets[test_index])

    map20 = []
    mean_i = []
    median_i = []
    # Split the matrix that is actually being evaluated instead of the global
    # X_train, so a differently sized `xt` gets matching indices.
    for train_index, test_index in kf.split(xt):
        d1 = fold_distances(yt, train_index, test_index)
        d2 = fold_distances(train_tags_vec, train_index, test_index)
        # Two further distance terms (targets y_train_lv and yint_train) were
        # disabled in the previous version of this function and are left out.

        dev_distances = np.multiply(d1,d2)
        map20_score,mean,median = eval_map20_3(dev_distances)
        map20.append(map20_score)
        mean_i.append(mean)
        median_i.append(median)
    print("Development MAP@20:", np.mean(map20))
    print("Mean index of true image", np.mean(mean_i))
    print("Median index of true image", np.median(median_i))


In [None]:
from sklearn.linear_model import Ridge
# Cross-validate a lightly regularised ridge regression on the scaled fc1000
# targets; y_train_norm is a list of rows, hence the np.array(...) conversion.
reg= Ridge(alpha=0.001)
cross_val3(reg,X_train,np.array(y_train_norm))

from sklearn.neighbors import KNeighborsRegressor
# kNN alternative; it is only constructed here, its evaluation is disabled.
reg2= KNeighborsRegressor(n_neighbors=20)
#cross_val3(reg2, X_train,y_train_)

In [226]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

# Grid-search the number of neighbours (odd values 1..15) with k-fold CV and
# keep the refitted best estimator.
k = 5
parameters = {"n_neighbors": list(range(1, 16, 2))}
reg = GridSearchCV(KNeighborsRegressor(), parameters, cv=k)
reg.fit(X_train, y_train_)
reg_best = reg.best_estimator_

print("Trained kNN regression model!")
print("Summary of best model:")
print(reg_best)







Trained kNN regression model!
Summary of best model:
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                    weights='uniform')


In [83]:
from sklearn.model_selection import GridSearchCV

# Grid-search the ridge regularisation strength with k-fold CV and keep the
# refitted best estimator.
k = 5
parameters = dict(alpha=[0.1, 0.3, 0.5, 1, 3, 5])
reg = GridSearchCV(Ridge(), parameters, cv=k)
reg.fit(X_train, y_train_)
reg_best = reg.best_estimator_

print("Trained linear regression model!")
print("Summary of best model:")
print(reg_best)





Trained linear regression model!
Summary of best model:
Ridge(alpha=3, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='auto', tol=0.001)


In [81]:
# create test predictions
# Fit on the full training set, rank every test image for every test description
# and write the 20 closest images per description to a CSV file.
reg=Ridge()

xtrain = X_train
# X_test is used directly below and deliberately NOT aliased to `x_test`: that
# name holds the per-description word lists used by earlier cells, and
# overwriting it with the feature matrix breaks those cells when re-run.

# Distance term 1: predicted vs. actual clipped fc1000 activations.
ytrain1 =  y_train_
y_test1 =  y_test_
reg.fit(xtrain, ytrain1)
y_pred1 = reg.predict(X_test)
d1 = (cosine_distances(y_pred1, y_test1))

# Distance term 2: predicted vs. actual tag vectors (same regressor, refitted).
ytrain2 =  train_tags_vec
y_test2 =  test_tags_vec
reg.fit(xtrain, ytrain2)
y_pred2 = reg.predict(X_test)
d2 = (cosine_distances(y_pred2, y_test2))

# Combine the two terms by element-wise product; smaller means a better match.
test_distances = np.multiply(d1,d2)

pred_rows = []
pred_row_i = []

for i in range(num_test):
    # Image indices ordered by ascending combined distance to description i.
    test_dist_idx = list(np.argsort(test_distances[i]))
    top_20 = test_dist_idx[:20]
    row = ["%d.jpg" % img_id for img_id in top_20]
    pred_rows.append(" ".join(row))
    pred_row_i.append(top_20)

with open("test_submission_best.csv", "w") as f:
    # NOTE(review): the header spelling "Descritpion_ID" is kept byte-for-byte;
    # presumably it has to match the expected submission format -- confirm
    # before correcting it.
    f.write("Descritpion_ID,Top_20_Image_IDs\n")
    for i, row in enumerate(pred_rows):
        f.write("%d.txt,%s\n" % (i, row))

print("Output written!")

Output written!
