### 0. Load libraries, define our train/test split, and load the word2vec dictionary using gensim

In [1]:
import os
import csv
import random
import gensim
import numpy as np
import pickle
from collections import Counter
from sklearn.utils import shuffle

# Root folder of the project; every other location is derived from it.
# The trailing slash matters because later cells append relative paths with `+`.
work_dir = '/Users/ronghao/Mirror/Cornell-Tech/2018-Fa-Course/CS-5785/Homework/Final/'
# Folder the feature CSVs, descriptions and tags are read from.
data_dir = os.path.join(work_dir, 'all/')
# Folder the pickled word lists / bag-of-words matrices are read from.
processed_data_dir = os.path.join(work_dir, 'data/')

In [2]:
# Load the word vectors stored in binary word2vec format under library/.
word2vec_path = os.path.join(work_dir, "library/GoogleNews-vectors-negative300.bin")
word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
print("Loaded word vectors successfully!")

Loaded word vectors successfully!


In [3]:
# Split sizes; num_train + num_dev documents together form the "train_dev" pool.
num_train, num_dev, num_test = 8000, 2000, 2000

In [None]:
# X_data = 'only_fc', 'only_pool', 'fc_pool'
# Y_data = 'full_w2v_mean', 'n_full_w2v_mean', 'a_full_w2v_mean', 'v_full_w2v_mean',
#                           'n_top800_w2v_mean', 'a_top400_w2v_mean', 'v_top400_w2v_mean',
#                           'n_top400_w2v_mean', 'a_top200_w2v_mean', 'v_top200_w2v_mean',
#          'full_bow', 'n_full_bow', 'a_full_bow', 'v_full_bow',
#                      'n_top800_bow', 'a_top400_bow', 'v_top400_bow',
#                      'n_top400_bow', 'a_top200_bow', 'v_top200_bow',

### 1. Parse the ResNet features to form the X matrices

In [35]:
def parse_features(features_path):
    """Read a feature CSV into an array whose rows are ordered by image id.

    Every non-empty row is expected to look like ``<dir>/<id>.<ext>,f1,f2,...``:
    the first column names the image file, the remaining columns are floats.
    The integer id is the part of the file name before the first ``.``; any
    leading directory components are ignored, so bare file names work too.

    Args:
        features_path: Path of the CSV file to read.

    Returns:
        ``np.ndarray`` with one row per distinct image id, sorted by ascending
        id. If an id occurs more than once the last occurrence wins. A file
        without data rows yields an empty array.

    Raises:
        ValueError: If an id or a feature value cannot be parsed as a number.
    """
    vec_map = {}
    # newline="" lets the csv module do its own newline handling, as its
    # documentation asks for.
    with open(features_path, newline="") as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing one).
                continue
            file_name = row[0].rsplit("/", 1)[-1]
            img_id = int(file_name.split(".")[0])
            vec_map[img_id] = np.array([float(x) for x in row[1:]])
    # Sort on the ids only, so the vectors themselves are never compared.
    return np.array([vec_map[img_id] for img_id in sorted(vec_map)])

# Build the X matrices. For each split the two feature CSVs are parsed and
# their column-wise concatenation is stored as well, under 'fc_pool'.
feature_csvs = {
    'train_dev': {
        'only_fc': "features_train/features_resnet1000_train.csv",
        'only_pool': "features_train/features_resnet1000intermediate_train.csv",
    },
    'test': {
        'only_fc': "features_test/features_resnet1000_test.csv",
        'only_pool': "features_test/features_resnet1000intermediate_test.csv",
    },
}

X_dict = {}
for split_name, csv_by_key in feature_csvs.items():
    X_dict[split_name] = {
        key: parse_features(data_dir + rel_path)
        for key, rel_path in csv_by_key.items()
    }
    X_dict[split_name]['fc_pool'] = np.concatenate(
        (X_dict[split_name]['only_fc'], X_dict[split_name]['only_pool']), axis=1)

In [36]:
# Persist the X matrices. The context manager flushes and closes the file
# even if pickling fails part-way.
with open(work_dir + "modeldata/X_dict.p", "wb") as f:
    pickle.dump(X_dict, f)

In [None]:
# X_data = 'only_fc', 'only_pool', 'fc_pool'
# Y_data = 'full_w2v_mean', 'n_full_w2v_mean', 'a_full_w2v_mean', 'v_full_w2v_mean',
#                           'n_top800_w2v_mean', 'a_top400_w2v_mean', 'v_top400_w2v_mean',
#                           'n_top400_w2v_mean', 'a_top200_w2v_mean', 'v_top200_w2v_mean',
#          'full_bow', 'n_full_bow', 'a_full_bow', 'v_full_bow',
#                      'n_top800_bow', 'a_top400_bow', 'v_top400_bow',
#                      'n_top400_bow', 'a_top200_bow', 'v_top200_bow',

### 2. Parse the descriptions to form the Y matrices

In [4]:
# One dict of Y matrices per split; the cells below fill them in by feature name.
Y_dict = {'train_dev': {}, 'test': {}}

In [5]:
# w2v on full description
def parse_descriptions(data_dir, num_doc):
    """Return the raw text of ``0.txt`` ... ``<num_doc - 1>.txt`` found in ``data_dir``.

    Args:
        data_dir: Directory holding the numbered ``.txt`` files.
        num_doc: How many files to read, starting at index 0.

    Returns:
        List of file contents (str), in index order.
    """
    def _read_doc(index):
        # Files are named by their integer index.
        with open(os.path.join(data_dir, "%d.txt" % index)) as handle:
            return handle.read()

    return [_read_doc(index) for index in range(num_doc)]

def doc_to_vec(sentence, word2vec):
    """Embed a description as the mean of its in-vocabulary word vectors.

    Args:
        sentence: Either a string (split on whitespace) or a sequence of
            tokens.
        word2vec: gensim ``KeyedVectors``-like object. It must provide
            ``get_vector(word)`` and a vocabulary mapping, exposed as
            ``key_to_index`` (gensim >= 4) or ``vocab`` (older releases).

    Returns:
        1-D ``np.ndarray`` holding the mean vector. When ``sentence`` is empty
        or none of its tokens is in the vocabulary, a zero vector is returned
        whose length is ``word2vec.vector_size`` if that attribute exists and
        300 otherwise.
    """
    dim = getattr(word2vec, "vector_size", 300)
    if len(sentence) == 0:
        return np.zeros(dim)

    # gensim 4 removed `.vocab`; prefer the new mapping, fall back to the old.
    vocab = getattr(word2vec, "key_to_index", None)
    if vocab is None:
        vocab = word2vec.vocab

    tokens = sentence.split() if isinstance(sentence, str) else sentence
    word_vecs = [word2vec.get_vector(w) for w in tokens if w in vocab]
    if not word_vecs:
        return np.zeros(dim)
    return np.stack(word_vecs).mean(0)

In [6]:
# full words: read the raw text of every description, then embed each one as
# the mean of its word vectors.
train_dev_desc = parse_descriptions(data_dir + "descriptions_train", num_doc=num_train + num_dev)
test_desc = parse_descriptions(data_dir + "descriptions_test", num_doc=num_test)

for split_name, split_docs in (('train_dev', train_dev_desc), ('test', test_desc)):
    Y_dict[split_name]['full_w2v_mean'] = np.array(
        [doc_to_vec(doc, word2vec) for doc in split_docs])

In [7]:
# Load the pickled 'full_bow' matrices. `with` closes each file right after
# it has been read instead of leaving the handle to the garbage collector.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(processed_data_dir + "n_a_v_bow/y_train_all.p", "rb") as f:
    Y_dict['train_dev']['full_bow'] = np.array(pickle.load(f))
with open(processed_data_dir + "n_a_v_bow/y_test_all.p", "rb") as f:
    Y_dict['test']['full_bow'] = np.array(pickle.load(f))

In [8]:
# Load the separated word-list files (nouns, adjectives, verbs) for both splits.
def _load_word_lists(file_name):
    """Unpickle one file from ``n_a_v_list/`` and close it afterwards."""
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open(processed_data_dir + "n_a_v_list/" + file_name, "rb") as f:
        return pickle.load(f)

train_dev_desc_n = _load_word_lists("noun_list_train.p")
test_desc_n = _load_word_lists("noun_list_test.p")
train_dev_desc_a = _load_word_lists("adj_list_train.p")
test_desc_a = _load_word_lists("adj_list_test.p")
train_dev_desc_v = _load_word_lists("verb_list_train.p")
test_desc_v = _load_word_lists("verb_list_test.p")

In [9]:
# Flatten the pickled tag entries into single words: every entry is split on
# spaces and all pieces are collected in one list.
with open(processed_data_dir + "taglist.p", "rb") as f:
    t = pickle.load(f)
taglist = [piece for entry in t for piece in entry.split(' ')]
len(taglist)

95

In [10]:
def bag_of_words_feature(train_dev_desc, test_desc, vocabulary):
    """Build bag-of-words count matrices for the train_dev and test documents.

    Args:
        train_dev_desc: Iterable of documents, each an iterable of (hashable)
            words.
        test_desc: Same structure, for the test split.
        vocabulary: Words that define the columns, in column order.

    Returns:
        Tuple ``(y_train_dev, y_test)`` of ``np.ndarray``. Entry ``[i, j]`` is
        how many times ``vocabulary[j]`` occurs in document ``i``; words
        outside the vocabulary are ignored.
    """
    # Materialise once so the columns can be walked repeatedly.
    vocabulary = list(vocabulary)
    return _count_matrix(train_dev_desc, vocabulary), _count_matrix(test_desc, vocabulary)


def _count_matrix(docs, vocabulary):
    """Return one row of per-vocabulary-word counts for each document."""
    rows = []
    for doc in docs:
        counts = Counter(doc)
        # A Counter answers 0 for absent keys, so no membership test against
        # the vocabulary is needed and out-of-vocabulary words are never read.
        rows.append([counts[word] for word in vocabulary])
    return np.array(rows)

In [11]:
def top_words_desc(train_dev_desc_origin, vocab_top):
    """Drop every word that is not part of ``vocab_top`` from each document.

    Args:
        train_dev_desc_origin: Iterable of documents, each an iterable of
            words.
        vocab_top: Collection of (hashable) words to keep.

    Returns:
        List with one list per input document, holding the kept words in their
        original order (repeats included).
    """
    # A set gives constant-time lookups instead of scanning the vocabulary
    # list once per word.
    allowed = frozenset(vocab_top)
    return [[word for word in doc if word in allowed] for doc in train_dev_desc_origin]

In [12]:
# noun words: count every noun over train_dev + test, then derive the vocabularies.
flatten_train_dev_n = [item for sublist in train_dev_desc_n for item in sublist]
flatten_test_n = [item for sublist in test_desc_n for item in sublist]
n_vocab = Counter(flatten_train_dev_n + flatten_test_n)

n_full_vocab = list(n_vocab.keys())
# The reduced vocabularies are the most frequent nouns plus every tag word.
# sorted() pins the column order: the iteration order of a set of strings
# changes between interpreter sessions (string hashes are randomised), which
# made the columns of the resulting feature matrices differ from run to run.
# NOTE(review): 772 / 367 were presumably chosen so that the unions come out
# at about 800 / 400 words — confirm.
n_800_vocab = sorted(set(taglist) | {word for word, _ in n_vocab.most_common(772)})
n_400_vocab = sorted(set(taglist) | {word for word, _ in n_vocab.most_common(367)})

In [13]:
# noun bag of words feature: one (train_dev, test) pair of count matrices per vocabulary
for bow_key, noun_vocab in (('n_full_bow', n_full_vocab),
                            ('n_top800_bow', n_800_vocab),
                            ('n_top400_bow', n_400_vocab)):
    Y_dict['train_dev'][bow_key], Y_dict['test'][bow_key] = bag_of_words_feature(
        train_dev_desc_n, test_desc_n, noun_vocab)

In [14]:
# noun word2vector feature: embed the full noun lists and the lists filtered
# down to the top-800 / top-400 vocabularies
train_dev_desc_n_top800 = top_words_desc(train_dev_desc_n, n_800_vocab)
train_dev_desc_n_top400 = top_words_desc(train_dev_desc_n, n_400_vocab)
test_desc_n_top800 = top_words_desc(test_desc_n, n_800_vocab)
test_desc_n_top400 = top_words_desc(test_desc_n, n_400_vocab)

noun_docs_by_key = {
    'n_full_w2v_mean': (train_dev_desc_n, test_desc_n),
    'n_top800_w2v_mean': (train_dev_desc_n_top800, test_desc_n_top800),
    'n_top400_w2v_mean': (train_dev_desc_n_top400, test_desc_n_top400),
}
for w2v_key, (train_dev_docs, test_docs) in noun_docs_by_key.items():
    Y_dict['train_dev'][w2v_key] = np.array([doc_to_vec(d, word2vec) for d in train_dev_docs])
    Y_dict['test'][w2v_key] = np.array([doc_to_vec(d, word2vec) for d in test_docs])

In [15]:
# adjective words: count every adjective over train_dev + test
flatten_train_dev_a = [item for sublist in train_dev_desc_a for item in sublist]
flatten_test_a = [item for sublist in test_desc_a for item in sublist]
a_vocab = Counter(flatten_train_dev_a + flatten_test_a)

# Full vocabulary in first-seen order, plus the 400 / 200 most frequent words.
a_full_vocab = list(a_vocab.keys())
a_400_vocab = [word for word, _ in a_vocab.most_common(400)]
a_200_vocab = [word for word, _ in a_vocab.most_common(200)]

In [16]:
# adjective bag of words feature: one (train_dev, test) pair of count matrices per vocabulary
for bow_key, adj_vocab in (('a_full_bow', a_full_vocab),
                           ('a_top400_bow', a_400_vocab),
                           ('a_top200_bow', a_200_vocab)):
    Y_dict['train_dev'][bow_key], Y_dict['test'][bow_key] = bag_of_words_feature(
        train_dev_desc_a, test_desc_a, adj_vocab)

In [17]:
# adjective word2vector feature: embed the full adjective lists and the lists
# filtered down to the top-400 / top-200 vocabularies
train_dev_desc_a_top400 = top_words_desc(train_dev_desc_a, a_400_vocab)
train_dev_desc_a_top200 = top_words_desc(train_dev_desc_a, a_200_vocab)
test_desc_a_top400 = top_words_desc(test_desc_a, a_400_vocab)
test_desc_a_top200 = top_words_desc(test_desc_a, a_200_vocab)

adj_docs_by_key = {
    'a_full_w2v_mean': (train_dev_desc_a, test_desc_a),
    'a_top400_w2v_mean': (train_dev_desc_a_top400, test_desc_a_top400),
    'a_top200_w2v_mean': (train_dev_desc_a_top200, test_desc_a_top200),
}
for w2v_key, (train_dev_docs, test_docs) in adj_docs_by_key.items():
    Y_dict['train_dev'][w2v_key] = np.array([doc_to_vec(d, word2vec) for d in train_dev_docs])
    Y_dict['test'][w2v_key] = np.array([doc_to_vec(d, word2vec) for d in test_docs])

In [18]:
# verb words: count every verb over train_dev + test
flatten_train_dev_v = [item for sublist in train_dev_desc_v for item in sublist]
flatten_test_v = [item for sublist in test_desc_v for item in sublist]
v_vocab = Counter(flatten_train_dev_v + flatten_test_v)

# Full vocabulary in first-seen order, plus the 400 / 200 most frequent words.
v_full_vocab = list(v_vocab.keys())
v_400_vocab = [word for word, _ in v_vocab.most_common(400)]
v_200_vocab = [word for word, _ in v_vocab.most_common(200)]

In [19]:
# verb bag of words feature: one (train_dev, test) pair of count matrices per vocabulary
for bow_key, verb_vocab in (('v_full_bow', v_full_vocab),
                            ('v_top400_bow', v_400_vocab),
                            ('v_top200_bow', v_200_vocab)):
    Y_dict['train_dev'][bow_key], Y_dict['test'][bow_key] = bag_of_words_feature(
        train_dev_desc_v, test_desc_v, verb_vocab)

In [20]:
# verb word2vector feature: embed the full verb lists and the lists filtered
# down to the top-400 / top-200 vocabularies
train_dev_desc_v_top400 = top_words_desc(train_dev_desc_v, v_400_vocab)
train_dev_desc_v_top200 = top_words_desc(train_dev_desc_v, v_200_vocab)
test_desc_v_top400 = top_words_desc(test_desc_v, v_400_vocab)
test_desc_v_top200 = top_words_desc(test_desc_v, v_200_vocab)

verb_docs_by_key = {
    'v_full_w2v_mean': (train_dev_desc_v, test_desc_v),
    'v_top400_w2v_mean': (train_dev_desc_v_top400, test_desc_v_top400),
    'v_top200_w2v_mean': (train_dev_desc_v_top200, test_desc_v_top200),
}
for w2v_key, (train_dev_docs, test_docs) in verb_docs_by_key.items():
    Y_dict['train_dev'][w2v_key] = np.array([doc_to_vec(d, word2vec) for d in train_dev_docs])
    Y_dict['test'][w2v_key] = np.array([doc_to_vec(d, word2vec) for d in test_docs])

In [21]:
# Persist the Y matrices. The context manager flushes and closes the file
# even if pickling fails part-way.
with open(work_dir + "modeldata/Y_dict.p", "wb") as f:
    pickle.dump(Y_dict, f)

### 3. Compute discount table
* on the dev set of ten different random splits (rand_state = 0, ..., 9)
* on test set

In [21]:
# Base of the exponential discount; passed to discount_table as exp_rate when
# the dev-set discount tables are built.
beta = 0.8

In [22]:
def tag_discount(desc_file, tag_file):
    """Count, for every (description, tag list) pair, how many tags occur in the description.

    Args:
        desc_file: Sequence of descriptions. Each one is normally a list of
            (hashable) word tokens; a plain string is also accepted and is
            then searched by substring, as ``in`` does for strings.
        tag_file: Sequence of tag lists.

    Returns:
        List of lists ``table`` where ``table[i][j]`` is the number of entries
        of ``tag_file[j]`` found in ``desc_file[i]``. A tag that is listed
        twice is counted twice.
    """
    table = []
    for desc in desc_file:
        # Build the lookup once per description: a set answers `in` in
        # constant time, whereas a list is rescanned for every tag of every
        # tag list. Strings are left alone so `in` keeps substring semantics.
        lookup = desc if isinstance(desc, str) else set(desc)
        table.append([sum(1 for tag in tags if tag in lookup) for tags in tag_file])
    return table

In [9]:
def discount_table(expo_table, exp_rate=0.8):
    """Raise ``exp_rate`` to every exponent of a 2-D table.

    Args:
        expo_table: Iterable of rows, each an iterable of exponents.
        exp_rate: Base of the power (default 0.8).

    Returns:
        List of lists with the same layout as ``expo_table`` where each entry
        is ``exp_rate ** exponent``.
    """
    return [[exp_rate ** exponent for exponent in exponents] for exponents in expo_table]

In [17]:
# Compute Dev Discount on different split: one seeded permutation of all
# train_dev indices per random state 0..9.
num_train, num_dev, num_test = 8000, 2000, 2000
split_idx = {}
for rand_state in range(10):
    # The cell below uses the positions from num_train onward as the dev part.
    split_idx[rand_state] = shuffle(list(range(num_train + num_dev)), random_state=rand_state)

In [18]:
# For every random split, collect the tokenised descriptions and the tag
# words of the documents that fall into the dev part of that split.
desc_file = {}
tag_file = {}

for rand_state in range(10):
    dev_indices = split_idx[rand_state][num_train:]

    # Descriptions: lower-case, commas/periods become spaces, newlines are
    # removed, then the text is split on single spaces.
    dev_descs = []
    for doc_index in dev_indices:
        with open(data_dir+'descriptions_train/'+str(doc_index)+'.txt') as f:
            raw_text = f.read()
        cleaned = raw_text.lower().replace(',',' ').replace('.',' ').replace('\n','')
        dev_descs.append(cleaned.split(' '))
    desc_file[rand_state] = dev_descs

    # Tags: keep what follows the first ':' on each line and split it on spaces.
    dev_tags = []
    for doc_index in dev_indices:
        with open(data_dir+'tags_train/'+str(doc_index)+'.txt') as f:
            # NOTE(review): [:-1] drops whatever follows the last '\n', which
            # assumes every tag file ends with a newline — confirm.
            tag_lines = f.read().split('\n')[:-1]
        tag_words = []
        for line in tag_lines:
            tag_words += line[line.find(':')+1:].split(' ')
        dev_tags.append(tag_words)
    tag_file[rand_state] = dev_tags

In [23]:
# One discount matrix per random split: count the tags found in each dev
# description, then raise beta to those counts.
dist_discount_dev = {
    rand_state: np.array(
        discount_table(tag_discount(desc_file[rand_state], tag_file[rand_state]), beta))
    for rand_state in range(10)
}

In [3]:
# Compute Test Discount
# Tokenise every test description: lower-case, commas/periods become spaces,
# newlines are removed, then the text is split on single spaces.
# The document count comes from num_test rather than a repeated literal 2000.
desc_file_test = []
for desc_file_index in range(num_test):
    with open(data_dir+'descriptions_test/'+str(desc_file_index)+'.txt') as f:
        desc_file_test.append(f.read().lower().replace(',',' ').replace('.',' ').replace('\n','').split(' '))

# Collect the tag words of every test document: keep what follows the first
# ':' on each line and split it on spaces.
tag_file_test = []
for tag_file_index in range(num_test):
    with open(data_dir+'tags_test/'+str(tag_file_index)+'.txt') as f:
        tag_txt = f.read()
    tags = []
    # NOTE(review): [:-1] drops whatever follows the last '\n', which assumes
    # every tag file ends with a newline — confirm.
    for word in [pair[pair.find(':')+1:] for pair in tag_txt.split('\n')[:-1]]:
        tags += word.split(' ')
    tag_file_test.append(tags)

In [20]:
# Count the tags found in each test description, then turn the counts into
# discounts. The shared `beta` is used (instead of a second literal 0.8) so
# the test table always uses the same rate as the dev tables.
discount_expo_table_test = tag_discount(desc_file_test, tag_file_test)
dist_discount_test = np.array(discount_table(discount_expo_table_test, beta))

In [24]:
# Persist both discount tables. The context managers flush and close each
# file even if pickling fails part-way.
with open(work_dir + "modeldata/dist_discount_dev_dict.p", "wb") as f:
    pickle.dump(dist_discount_dev, f)
with open(work_dir + "modeldata/dist_discount_test.p", "wb") as f:
    pickle.dump(dist_discount_test, f)

### 4. Save or load current data

In [None]:
# from sklearn.cross_decomposition import PLSRegression

# # train PLSRegression model with regression
# parameters = {"n_components": [10, 20, 30, 40]}
# PLSreg = GridSearchCV(PLSRegression(), parameters, cv=10)
# PLSreg.fit(x_train, y_train)
# PLSreg_best = PLSreg.best_estimator_

# print("Trained PLS regression model!")
# print("Summary of best model:")
# print(PLSreg_best)

# BOW transformed description feature
# Y_dict['bow_1294'] = {}
# Y_dict['bow_1294']['train_dev'] = np.array(pickle.\
#                     load(open(processed_data_dir + "BOW1294/y_train.p", "rb")))
# Y_dict['bow_1294']['test'] = np.array(pickle.\
#                     load(open(processed_data_dir + "BOW1294/y_test.p", "rb")))

# Y_dict['BOW_tagEnhanced_1'] = {}
# Y_dict['BOW_tagEnhanced_1']['train_dev'] = np.array(pickle.\
#                     load(open(processed_data_dir + "BOW_tagEnhanced/y_train_v1.p", "rb")))
# Y_dict['BOW_tagEnhanced_1']['test'] = np.array(pickle.\
#                     load(open(processed_data_dir + "BOW_tagEnhanced/y_test_v1.p", "rb")))

# Y_dict['BOW_tagEnhanced_2'] = {}
# Y_dict['BOW_tagEnhanced_2']['train_dev'] = np.array(pickle.\
#                     load(open(processed_data_dir + "BOW_tagEnhanced/y_train_v2.p", "rb")))
# Y_dict['BOW_tagEnhanced_2']['test'] = np.array(pickle.\
#                     load(open(processed_data_dir + "BOW_tagEnhanced/y_test_v2.p", "rb")))

# Y_dict['BOW_tagEnhanced_5'] = {}
# Y_dict['BOW_tagEnhanced_5']['train_dev'] = np.array(pickle.\
#                     load(open(processed_data_dir + "BOW_tagEnhanced/y_train_v5.p", "rb")))
# Y_dict['BOW_tagEnhanced_5']['test'] = np.array(pickle.\
#                     load(open(processed_data_dir + "BOW_tagEnhanced/y_test_v5.p", "rb")))

# Y_dict['bow_4291'] = {}
# Y_dict['bow_4291']['train_dev'] = np.array(pickle.\
#                     load(open(processed_data_dir + "BOW4291/y_train_v3.p", "rb")))
# Y_dict['bow_4291']['test'] = np.array(pickle.\
#                     load(open(processed_data_dir + "BOW4291/y_test_v3.p", "rb")))