# CS5785 final
## Xianhui Li, Zimeng Zhu

In [1]:
import os
import csv
import random
import gensim
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords 
import string

# Dataset sizes: the labelled pool (train + dev) and the held-out test set.
num_train = 8000
num_dev = 2000
num_test = 2000

# Fixed seed so the train/dev partition is identical after every
# "Restart Kernel -> Run All"; change it to draw a different split.
SEED = 0

# Random permutation of the labelled document indices. The first `num_train`
# entries are used as the training split and the remaining `num_dev` entries
# as the development split. A dedicated generator is used so that the global
# `random` state is left untouched.
split_idx = list(range(num_train + num_dev))
random.Random(SEED).shuffle(split_idx)

In [2]:
def parse_descriptions(data_dir, num_doc):
    """Read the files ``0.txt`` .. ``<num_doc - 1>.txt`` from ``data_dir``.

    Args:
        data_dir: Directory that contains the numbered text files.
        num_doc: Number of files to read, starting at index 0.

    Returns:
        A list with the full text of each file, in index order.
    """
    def _read_doc(index):
        # Each document lives in its own file named after its index.
        with open(os.path.join(data_dir, "%d.txt" % index)) as handle:
            return handle.read()

    return [_read_doc(index) for index in range(num_doc)]

# Load every description once, then carve the labelled pool into train and
# dev with the shuffled indices; the test documents keep their on-disk order.
train_dev_desc = parse_descriptions("descriptions_train", num_doc=num_train + num_dev)
test_desc = parse_descriptions("descriptions_test", num_doc=num_test)
x_train = np.array([train_dev_desc[doc] for doc in split_idx[:num_train]])
x_dev = np.array([train_dev_desc[doc] for doc in split_idx[num_train:]])
x_test = np.array(list(test_desc))

In [43]:
def parse_features(features_path):
    """Load per-image feature vectors from a CSV file, ordered by image id.

    Every non-empty row must look like ``<dir>/<id>.<ext>,f1,f2,...``: the
    integer between the first "/" and the following "." of the first column
    is taken as the image id, the remaining columns are parsed as floats.

    Args:
        features_path: Path of the CSV file to read.

    Returns:
        np.ndarray with one row per image id, sorted by ascending id. If an
        id occurs more than once, the last occurrence wins.

    Raises:
        IndexError, ValueError: If a non-empty row does not follow the
            expected layout.
    """
    vec_map = {}
    # newline="" leaves newline handling to the csv module, as the csv
    # documentation asks for when passing a file object.
    with open(features_path, newline="") as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to
                # parse, and indexing row[0] would raise IndexError.
                continue
            img_id = int(row[0].split("/")[1].split(".")[0])
            vec_map[img_id] = np.array([float(x) for x in row[1:]])
    # Order by id only, so the feature arrays themselves are never compared.
    return np.array([vec_map[img_id] for img_id in sorted(vec_map)])

# build y matrices
# The image features are multiplied by a square random matrix `p` before being
# used as regression targets. `p` is drawn from a dedicated, seeded generator
# so that the y_* matrices (and every score computed from them) are the same
# after each kernel restart.
PROJECTION_SEED = 0
train_dev_features = parse_features("features_train/features_resnet1000intermediate_train.csv")
test_features = parse_features("features_test/features_resnet1000intermediate_test.csv")

# Size the projection from the data instead of hard-coding the column count.
feature_dim = train_dev_features.shape[1]
p = np.random.RandomState(PROJECTION_SEED).randn(feature_dim, feature_dim)

y_train_dev = train_dev_features @ p
# Rows are picked with the same shuffled indices that split the descriptions,
# so x_* and y_* stay aligned row by row.
y_train = y_train_dev[split_idx[:num_train]]
y_dev = y_train_dev[split_idx[num_train:]]
y_test = test_features @ p

print("Built all y matrices!")
print("y_train shape:", y_train.shape)
print("y_dev shape:", y_dev.shape)
print("y_test shape:", y_test.shape)

Built all y matrices!
y_train shape: (8000, 2048)
y_dev shape: (2000, 2048)
y_test shape: (2000, 2048)


In [4]:
## Preprocess x train dataset
# Everything that does not depend on the individual document is built once,
# before the loop. In particular the stop-word list is fetched a single time
# and turned into a set, instead of calling stopwords.words('english') again
# for every token.
stop_words = set(stopwords.words('english'))
# One translation table that deletes punctuation and digits in a single pass.
strip_table = str.maketrans('', '', string.punctuation + '1234567890')
lmtzr = WordNetLemmatizer()

new_x_train = []
for doc in x_train:
    # lowercase -> strip punctuation and digits -> split on whitespace
    tokens = doc.lower().translate(strip_table).split()
    # drop stop words, then lemmatize what is left
    new_x_train.append([lmtzr.lemmatize(tok) for tok in tokens if tok not in stop_words])

In [5]:
## Preprocess x_test dataset
# Everything that does not depend on the individual document is built once,
# before the loop. In particular the stop-word list is fetched a single time
# and turned into a set, instead of calling stopwords.words('english') again
# for every token.
stop_words = set(stopwords.words('english'))
# One translation table that deletes punctuation and digits in a single pass.
strip_table = str.maketrans('', '', string.punctuation + '1234567890')
lmtzr = WordNetLemmatizer()

new_x_test = []
for doc in x_test:
    # lowercase -> strip punctuation and digits -> split on whitespace
    tokens = doc.lower().translate(strip_table).split()
    # drop stop words, then lemmatize what is left
    new_x_test.append([lmtzr.lemmatize(tok) for tok in tokens if tok not in stop_words])

In [6]:
## Preprocess x_dev dataset
# Everything that does not depend on the individual document is built once,
# before the loop. In particular the stop-word list is fetched a single time
# and turned into a set, instead of calling stopwords.words('english') again
# for every token.
stop_words = set(stopwords.words('english'))
# One translation table that deletes punctuation and digits in a single pass.
strip_table = str.maketrans('', '', string.punctuation + '1234567890')
lmtzr = WordNetLemmatizer()

new_x_dev = []
for doc in x_dev:
    # lowercase -> strip punctuation and digits -> split on whitespace
    tokens = doc.lower().translate(strip_table).split()
    # drop stop words, then lemmatize what is left
    new_x_dev.append([lmtzr.lemmatize(tok) for tok in tokens if tok not in stop_words])

In [7]:
# Bag-of-words and TF-IDF features for the descriptions.
# The vocabulary and the IDF weights are learned from the training split only;
# dev and test are merely transformed, so all three matrices share the same
# columns and the same weighting. (Calling fit_transform on dev/test would
# re-estimate the IDF weights on each of those splits.)
vectorizer = CountVectorizer()
# The token lists are joined back into strings because CountVectorizer
# tokenizes raw text itself.
x_train = [' '.join(tokens) for tokens in new_x_train]

x_train_bow = vectorizer.fit_transform(x_train)
# Terms ordered by column index. Built from `vocabulary_` rather than
# get_feature_names(), which newer scikit-learn releases no longer provide.
vocab = np.array(sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get))

transformer = TfidfTransformer()
x_train_tfidf = transformer.fit_transform(x_train_bow).toarray()

x_test = [' '.join(tokens) for tokens in new_x_test]

# Fixed-vocabulary vectorizer: maps unseen text onto the training columns.
vectorizer2 = CountVectorizer(vocabulary = vocab)
x_test_bow = vectorizer2.transform(x_test)
x_test_tfidf = transformer.transform(x_test_bow).toarray()

x_dev = [' '.join(tokens) for tokens in new_x_dev]

x_dev_bow = vectorizer2.transform(x_dev)
x_dev_tfidf = transformer.transform(x_dev_bow).toarray()

In [8]:
# Sanity check: train and dev feature matrices must have the same number of
# columns. The bare last expression shows the container type of the dev counts.
print(x_train_tfidf.shape, x_dev_bow.shape, sep="\n")
type(x_dev_bow)

(8000, 7495)
(2000, 7495)


scipy.sparse.csr.csr_matrix

In [None]:
def parse_tags(data_dir, num_doc):
    """Return the contents of ``0.txt`` .. ``<num_doc - 1>.txt`` in ``data_dir``.

    Args:
        data_dir: Directory holding the numbered tag files.
        num_doc: How many files to read, counting up from index 0.

    Returns:
        A list of strings, one per file, ordered by file index.
    """
    file_names = ("%d.txt" % number for number in range(num_doc))
    texts = []
    for file_name in file_names:
        with open(os.path.join(data_dir, file_name)) as tag_file:
            texts.append(tag_file.read())
    return texts

# Read the tag files and partition them with the same shuffled indices used
# for the descriptions; the test tags keep their on-disk order.
train_dev_tag = parse_tags("tags_train", num_doc=num_train + num_dev)
test_tag = parse_tags("tags_test", num_doc=num_test)
z_train = np.array([train_dev_tag[doc] for doc in split_idx[:num_train]])
z_dev = np.array([train_dev_tag[doc] for doc in split_idx[num_train:]])
z_test = np.array(list(test_tag))

In [None]:
# Bag-of-words and TF-IDF features for the tags.
# The tags get their own vocabulary array and their own TF-IDF transformer, so
# this cell no longer overwrites `vocab` or refits `transformer`, both of which
# belong to the description features. As for the descriptions, vocabulary and
# IDF weights are learned on the training split and only applied to dev/test.
vectorizer_z = CountVectorizer()
z_train_bow = vectorizer_z.fit_transform(z_train)
# Terms ordered by column index. Built from `vocabulary_` rather than
# get_feature_names(), which newer scikit-learn releases no longer provide.
vocab_z = np.array(sorted(vectorizer_z.vocabulary_, key=vectorizer_z.vocabulary_.get))
transformer_z = TfidfTransformer()
z_train_tfidf = transformer_z.fit_transform(z_train_bow).toarray()

# Fixed-vocabulary vectorizer: maps unseen tags onto the training columns.
vectorizer_z2 = CountVectorizer(vocabulary = vocab_z)
z_test_bow = vectorizer_z2.transform(z_test)
z_test_tfidf = transformer_z.transform(z_test_bow).toarray()

z_dev_bow = vectorizer_z2.transform(z_dev)
z_dev_tfidf = transformer_z.transform(z_dev_bow).toarray()

In [49]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Fit a ridge regression from the bag-of-words counts to the image targets.
# The grid holds a single alpha, so the search just evaluates that one setting
# and exposes the fitted model as `best_estimator_`.
parameters = {"alpha": [10.0]}
reg = GridSearchCV(Ridge(), parameters).fit(x_train_bow, y_train)
reg_best = reg.best_estimator_

print("Trained linear regression model!", "Summary of best model:", reg_best, sep="\n")

Trained linear regression model!
Summary of best model:
Ridge(alpha=10.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)


In [50]:
import sklearn.metrics
def dist_matrix(x1, x2):
    """Return the pairwise cosine distances between the rows of x1 and x2.

    Thin wrapper around sklearn.metrics.pairwise.cosine_distances; the two
    arguments are forwarded unchanged.
    """
    pairwise_cosine = sklearn.metrics.pairwise.cosine_distances
    return pairwise_cosine(x1, x2)
# def dist_matrix(x1, x2):
#     return ((np.expand_dims(x1, 1) - np.expand_dims(x2, 0)) ** 2).sum(2) ** 0.5

# Evaluate on the development split: predict a target vector for every dev
# description and rank all dev images by their distance to that prediction.
y_dev_pred = reg.predict(x_dev_bow)
dev_distances = dist_matrix(y_dev_pred, y_dev)
dev_pos_list = []
dev_scores = []

for i in range(num_dev):
    # Image indices ordered from closest to farthest for description i.
    ranking = np.argsort(dev_distances[i])
    # 0-based rank at which the true image (same index i) appears.
    dev_pos = int(np.flatnonzero(ranking == i)[0])
    dev_pos_list.append(dev_pos)
    # Reciprocal rank if the true image is within the top 20, otherwise 0.
    dev_scores.append(1 / (dev_pos + 1) if dev_pos < 20 else 0.0)

print("Development MAP@20:", np.mean(dev_scores))
print("Mean index of true image", np.mean(dev_pos_list))
print("Median index of true image", np.median(dev_pos_list))

Development MAP@20: 0.28184251942373073
Mean index of true image 32.328
Median index of true image 7.0


In [46]:
# create test predictions
from sklearn.base import clone

# Refit on train + dev with the selected hyper-parameters. A clone is fitted
# instead of `reg_best` itself: `reg_best` is the very object stored in
# `reg.best_estimator_`, so refitting it in place would make any later re-run
# of the dev evaluation score a model that has already seen the dev split.
x_train_all = np.concatenate([x_train_bow.toarray(), x_dev_bow.toarray()])
y_train_all = np.concatenate([y_train, y_dev])
final_model = clone(reg_best).fit(x_train_all, y_train_all)
y_test_pred = final_model.predict(x_test_bow.toarray())
test_distances = dist_matrix(y_test_pred, y_test)

# One submission row per test description: the 20 closest test images,
# nearest first, written as "<image index>.jpg".
pred_rows = []
for i in range(num_test):
    top_20 = np.argsort(test_distances[i])[:20]
    pred_rows.append(" ".join("%d.jpg" % img_id for img_id in top_20))

with open("test_submission_f.csv", "w") as f:
    # NOTE(review): the header spelling is kept byte-for-byte; presumably it
    # has to match the expected submission format - confirm before "fixing".
    f.write("Descritpion_ID,Top_20_Image_IDs\n")
    for i, row in enumerate(pred_rows):
        f.write("%d.txt,%s\n" % (i, row))

print("Output written!")

Output written!
