In [1]:
import csv
import glob
import itertools
import math
import os
import re
import string

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import nltk
import numpy as np
from IPython.display import display, Markdown
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from PIL import Image
from scipy import misc
from sklearn import preprocessing
from sklearn import svm
from sklearn.cross_decomposition import PLSRegression,PLSCanonical,CCA
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import DistanceMetric
from sklearn.neighbors import NearestNeighbors as KNN
from sklearn.preprocessing import minmax_scale

nltk.download('stopwords')
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vianne/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  from numpy.core.umath_tests import inner1d


### Read Image features

In [2]:
# Load the image feature tables ("resnet1000" and "resnet1000intermediate") for
# the train and test splits.
def _load_feature_rows(path):
    """Read a comma-separated feature file into an array of string fields.

    Every line is stripped and split on ","; the fields are kept as raw
    strings (they are parsed later by get_feat_dict).  The file is opened in
    a ``with`` block so the handle is closed as soon as reading finishes.
    """
    with open(path) as feat_file:
        return np.array([line.strip().split(",") for line in feat_file])


train_feat_fc = _load_feature_rows("./data_final/features_train/features_resnet1000_train.csv")
test_feat_fc = _load_feature_rows("./data_final/features_test/features_resnet1000_test.csv")
train_feat_p = _load_feature_rows("./data_final/features_train/features_resnet1000intermediate_train.csv")
test_feat_p = _load_feature_rows("./data_final/features_test/features_resnet1000intermediate_test.csv")

In [3]:
# Lookup tables filled by get_feat_dict: sample id -> feature vector.
train_feat_fc_dict = {}
test_feat_fc_dict = {}
train_feat_p_dict = {}
test_feat_p_dict = {}


def get_feat_dict(feat,feat_dict):
    """Fill ``feat_dict`` in place with one entry per row of ``feat``.

    The key is taken from the first field of the row: the text between the
    first "/" and the next "." is converted to ``int``.  The value is the
    rest of the row parsed into a float numpy array.  Nothing is returned.
    """
    for row in feat:
        path_field, raw_values = row[0], row[1:]
        sample_id = int(path_field.split("/")[1].split(".")[0])
        feat_dict[sample_id] = np.array(raw_values, dtype=float)
        

In [4]:
# Populate each lookup table from its matching raw feature array.
for _feat_rows, _feat_table in (
    (train_feat_fc, train_feat_fc_dict),
    (test_feat_fc, test_feat_fc_dict),
    (train_feat_p, train_feat_p_dict),
    (test_feat_p, test_feat_p_dict),
):
    get_feat_dict(_feat_rows, _feat_table)

# img features in: train_feat_fc_dict, test_feat_fc_dict,train_feat_p_dict, test_feat_p_dict
# key: sample name, val: feature

### Descriptions to Bag of words

In [5]:
# Count how often each stemmed, non-stop word occurs over all training
# descriptions.  des_word_dict maps stem -> total number of occurrences.
des_word_dict = dict()
# Built once instead of calling stopwords.words() for every token; a set also
# makes the membership test constant-time.
des_stop_words = set(stopwords.words("english"))
stemming = PorterStemmer()
for filename in glob.glob(os.path.join("./data_final/descriptions_train/", "*.txt")):
    with open(filename, "r") as des_file:
        des = des_file.read().lower()
    # Every character that is neither a word character nor whitespace becomes a space.
    des = re.sub(r'[^\w\s]', ' ', des)
    for word in des.split():
        if word in des_stop_words:
            continue
        try:
            w = stemming.stem(word)
        except Exception:
            # Best effort: skip a token the stemmer cannot handle.  Unlike a
            # bare ``except`` this still lets KeyboardInterrupt/SystemExit through.
            continue
        des_word_dict[w] = des_word_dict.get(w, 0) + 1
# des_word_dict

In [6]:
# Vocabulary for the description bag-of-words: every stem counted at least
# twice gets a consecutive column index, in des_word_dict iteration order.
frequent_stems = [stem for stem in des_word_dict if des_word_dict[stem] >= 2]
word_dict = dict((stem, column) for column, stem in enumerate(frequent_stems))
ind = len(word_dict)

def get_bow_des(path,word_dict):
    """Build a bag-of-words count vector for every ``*.txt`` file in ``path``.

    Each file is lower-cased, characters that are neither word characters nor
    whitespace are replaced by spaces, English stop words are dropped and the
    remaining tokens are stemmed.  Stems that appear in ``word_dict`` are
    counted in the column given by ``word_dict[stem]``.

    Parameters
    ----------
    path : directory searched for ``*.txt`` files.
    word_dict : mapping stem -> column index; indices are expected to lie in
        ``range(len(word_dict))``.

    Returns
    -------
    dict mapping ``int(<file name without extension>)`` to a list of counts of
    length ``len(word_dict)``.

    Raises
    ------
    ValueError if a file name (without extension) is not an integer.
    """
    des_vec = dict()
    # Hoisted: the stop-word list and the stemmer do not change between files.
    stop_words = set(stopwords.words("english"))
    stemming = PorterStemmer()
    for filename in glob.glob(os.path.join(path, "*.txt")):
        with open(filename, "r") as des_file:
            des = des_file.read().lower()
        des = re.sub(r'[^\w\s]', ' ', des)
        sent_bow = [0] * len(word_dict)
        for word in des.split():
            if word in stop_words:
                continue
            try:
                w = stemming.stem(word)
            except Exception:
                # Best effort: ignore tokens the stemmer rejects, without
                # also swallowing KeyboardInterrupt as a bare except would.
                continue
            if w in word_dict:
                sent_bow[word_dict[w]] += 1
        # basename() handles the platform's path separator, unlike split('/').
        sample_id = int(os.path.splitext(os.path.basename(filename))[0])
        des_vec[sample_id] = sent_bow
    return des_vec

# Bag-of-words vectors for the train and test descriptions (same vocabulary).
train_des, test_des = (
    get_bow_des(des_dir, word_dict)
    for des_dir in ("./data_final/descriptions_train/", "./data_final/descriptions_test/")
)

# word_dict
# key: word, value: index

#train_des, test_des
#key: sample name, value: bow

In [9]:
# Freeze the row order of the description matrices: sample ids, ascending.
# sorted() returns a list, which is required on Python 3 where
# np.array(dict.keys()) yields a 0-d object array that cannot be iterated.
# Sorting also makes the order independent of glob/dict ordering.
# NOTE(review): rows are later paired by position with the tag matrices
# (clf.fit(xTr_des_norm, xTr_tag[:, i])) -- confirm both use the same order.
train_samplename = np.array(sorted(train_des.keys()))
test_samplename = np.array(sorted(test_des.keys()))

# train_samplename,test_samplename
# sample names in order

In [11]:
# TF-IDF re-weighting of the description counts.
# Rows follow train_samplename / test_samplename; the IDF weights are learned
# on the training matrix only and then applied to the test matrix.
xTr_des = np.array([train_des[sample] for sample in train_samplename])
xTe_des = np.array([test_des[sample] for sample in test_samplename])

transformer_des = TfidfTransformer(smooth_idf=False)
transformer_des.fit(xTr_des)

xTr_des_norm = transformer_des.transform(xTr_des).toarray()
xTe_des_norm = transformer_des.transform(xTe_des).toarray()

# xTr_des_norm, xTe_des_norm
# bow matrix after tfidf, order corresponds to train_samplename,test_samplename

Generated:
    xTr_des_norm, xTe_des_norm (description input)
    train_samplename, test_samplename (file name/label)

# Tag-based prediction

### Tags to Bag of words

In [12]:
# Count stemmed noun tokens over all training tag files.
# tag_word_dict maps stem -> total number of occurrences.
tag_word_dict = dict()
# Built once instead of calling stopwords.words() for every token.
tag_stop_words = set(stopwords.words("english"))
noun_pos_tags = ('NN', 'NNP', 'NNS', 'NNPS')
stemming = PorterStemmer()
for filename in glob.glob(os.path.join("./data_final/tags_train/", "*.txt")):
    with open(filename, "r") as tag_file:
        tag = tag_file.read().lower()
    # ':' is not a word character, so this one substitution also covers the
    # separate ':' -> ' ' replacement that used to follow it.
    tag = re.sub(r'[^\w\s]', ' ', tag)
    for word in tag.split():
        if word in tag_stop_words:  # stop word
            continue
        try:
            is_noun = nltk.pos_tag([word])[0][1] in noun_pos_tags
        except LookupError:
            # nltk signals a missing tagger model with LookupError; swallowing
            # it would silently leave the tag vocabulary empty.
            raise
        except Exception:
            continue  # best effort: skip tokens the tagger cannot handle
        if not is_noun:
            continue
        try:
            w = stemming.stem(word)
        except Exception:
            continue  # best effort: skip tokens the stemmer cannot handle
        tag_word_dict[w] = tag_word_dict.get(w, 0) + 1

# Vocabulary for the tag bag-of-words: every stem counted at least twice gets
# a consecutive column index, in tag_word_dict iteration order.
frequent_tag_stems = [stem for stem in tag_word_dict if tag_word_dict[stem] >= 2]
tag_dict = dict((stem, column) for column, stem in enumerate(frequent_tag_stems))
ind = len(tag_dict)
        
def get_bow_tag(path,tag_dict):
    """Build a binary presence vector for every ``*.txt`` tag file in ``path``.

    Each file is lower-cased, characters that are neither word characters nor
    whitespace are replaced by spaces, English stop words are dropped and the
    remaining tokens are stemmed.  For every stem found in ``tag_dict`` the
    column ``tag_dict[stem]`` is set to 1 (presence, not a count).

    Parameters
    ----------
    path : directory searched for ``*.txt`` files.
    tag_dict : mapping stem -> column index; indices are expected to lie in
        ``range(len(tag_dict))``.

    Returns
    -------
    dict mapping ``int(<file name without extension>)`` to a 0/1 list of
    length ``len(tag_dict)``.

    Raises
    ------
    ValueError if a file name (without extension) is not an integer.
    """
    tag_vec = dict()
    # Hoisted: the stop-word list and the stemmer do not change between files.
    stop_words = set(stopwords.words("english"))
    stemming = PorterStemmer()
    for filename in glob.glob(os.path.join(path, "*.txt")):
        with open(filename, "r") as tag_file:
            tag = tag_file.read().lower()
        tag = re.sub(r'[^\w\s]', ' ', tag)
        tag_bow = [0] * len(tag_dict)
        for word in tag.split():
            if word in stop_words:
                continue
            try:
                w = stemming.stem(word)
            except Exception:
                # Best effort: ignore tokens the stemmer rejects, without
                # also swallowing KeyboardInterrupt as a bare except would.
                continue
            if w in tag_dict:
                tag_bow[tag_dict[w]] = 1
        # basename() handles the platform's path separator, unlike split('/').
        sample_id = int(os.path.splitext(os.path.basename(filename))[0])
        tag_vec[sample_id] = tag_bow
    return tag_vec
# Binary tag vectors for the train and test tag files (same vocabulary).
train_tag, test_tag = (
    get_bow_tag(tag_dir, tag_dict)
    for tag_dir in ("./data_final/tags_train/", "./data_final/tags_test/")
)

# tag_dict
# key: word, value: index

#train_tag, test_tag
# key: sample, values: bow_tags

In [14]:
# Freeze the row order of the tag matrices: sample ids, ascending.
# sorted() returns a list, which is required on Python 3 where
# np.array(dict.keys()) yields a 0-d object array that cannot be iterated.
# Sorting also makes the order independent of glob/dict ordering.
# NOTE(review): rows are later paired by position with the description
# matrices (clf.fit(xTr_des_norm, xTr_tag[:, i])) -- confirm both use the same order.
train_tagname = np.array(sorted(train_tag.keys()))
test_tagname = np.array(sorted(test_tag.keys()))

#sample names

In [15]:
# Stack the binary tag vectors into matrices whose rows follow
# train_tagname / test_tagname.  The TF-IDF step below is disabled, so the
# raw 0/1 matrices are what later cells use.
xTr_tag = np.array([train_tag[sample] for sample in train_tagname])
xTe_tag = np.array([test_tag[sample] for sample in test_tagname])

#transformer_tag = TfidfTransformer(smooth_idf=False)
#xTr_tag_norm = transformer_tag.fit_transform(xTr_tag).toarray()
#xTe_tag_norm = transformer_tag.transform(xTe_tag).toarray()

# xTr_tag, xTe_tag
# bow_tag matrix with order in train_tagname, test_tagname

### Description_bow to tags_bow

In [16]:
# Predict tags from descriptions: one independent LinearSVC per tag column,
# trained on the TF-IDF description matrix.
svm_yTe = []
# Take the number of tags from the matrix itself rather than from the
# hard-coded sample id train_tag[4000], which need not exist in every data set.
n_tags = xTr_tag.shape[1]
for tag_idx in range(n_tags):
    clf = svm.LinearSVC(random_state=0, tol=1e-5,C=2)
#     clf = svm.SVC(random_state=0, tol=1e-5,C=1)
    clf.fit(xTr_des_norm, xTr_tag[:, tag_idx])
    svm_yTe.append(clf.predict(xTe_des_norm))
# The list holds one row per tag; transpose to one row per test description.
svm_yTe = np.array(svm_yTe).T

# svm_yTe: predicted tags for each description

### Predicted tags to img tags using kNN

In [25]:
# Queries: SVM-predicted tag vectors, one per test description (svm_yTe).
# Index:   tag vectors of the test tag files (xTe_tag, rows in test_tagname order).
# Mahalanobis distance uses the covariance of xTe_tag; its pseudo-inverse is
# passed as VI.
tag_cov = np.cov(xTe_tag,rowvar=False)
mahalanobis_params = {'V': tag_cov, 'VI': np.linalg.pinv(tag_cov)}
knn = KNN(n_neighbors=20, metric='mahalanobis', metric_params=mahalanobis_params)
knn = knn.fit(xTe_tag,test_tagname)
# kneighbors returns row indices into xTe_tag; translate them to sample names.
top20_rows = knn.kneighbors(svm_yTe, return_distance = False)
tag_predictions = test_tagname[top20_rows]



In [48]:
# Same Mahalanobis search as the 20-neighbour cell, but asking for 2000
# neighbours and keeping the distances as well.
tag_cov_full = np.cov(xTe_tag,rowvar=False)
knn_dist = KNN(
    n_neighbors=2000,
    metric='mahalanobis',
    metric_params={'V': tag_cov_full, 'VI': np.linalg.pinv(tag_cov_full)},
)
knn_dist = knn_dist.fit(xTe_tag,test_tagname)
tag_predictions_dist = knn_dist.kneighbors(svm_yTe, return_distance = True)
# Element 0 holds distances (smaller = closer, despite the "similarity" name),
# element 1 the matching row indices into xTe_tag.
tag_similarity, nn_2000_tag = tag_predictions_dist[0], tag_predictions_dist[1]

#tag_predictions_dist = test_tagname[tag_predictions_dist[1]]

In [42]:
# def write_dist(test_samplename,predictions,nn_2000,output='p5_no_noun_distance.csv'):
#     sorted_dist = []
#     for i in range(2000):
#         sorted_samplename, sorted_predictions = zip(*sorted(zip(nn_2000[i,:],predictions[i,:])))
#         sorted_dist.append(sorted_predictions)
    
#     nn_list = []
#     for i, row in enumerate(sorted_dist):
#         temp = str(test_samplename[i])
#         for j, val in enumerate(row):
#             temp = temp + ' ' + str(val)
#         nn_list.append(temp)
#     index = []
#     for i in sorted_samplename:
#         index.append(str(i))
#     with open(output, 'wb') as f:
#         writer = csv.writer(f)
#         writer.writerows(itertools.izip(index, nn_list))

In [None]:
# write_result(test_samplename,tag_predictions,output='final_tags_1205.csv')
# write_dist(test_samplename,tag_similarity,nn_2000_tag,'tag_distance.csv')
# sorted_tag_similarity = []
# for i in range(2000):
#     t_sorted_samplename, t_sorted_predictions = zip(*sorted(zip(nn_2000_tag[i,:],tag_similarity[i,:])))
#     sorted_tag_similarity.append(t_sorted_predictions)

In [None]:
# print bow for debugging
# print(test_samplename[0])
# print(test_des.keys()[0])

# for ind in (np.where(np.array(test_des[str(test_des.keys()[0])])!=0)[0]):
#     print(np.array(word_dict.keys())[np.where(np.array(word_dict.values()==ind))[0]])
# print("====")
# for ind in (np.where(td_tree_yTe[0]!=0)[0]):
#     print(tag_dict.keys()[ind])
# print("====")
# for ind in (np.where(xTe_tag_norm[0]!=0)[0]):
#     print(tag_dict.keys()[ind])

# Image-based prediction
## Image Pool5 features to Bag of words (all words)

In [29]:
# Assemble the inputs for the image -> description regression.
# Image features are looked up per sample so their rows line up with the
# TF-IDF description matrices (train_samplename / test_samplename order).
p5Tr = [train_feat_p_dict[sample] for sample in train_samplename]
p5Te = [test_feat_p_dict[sample] for sample in test_samplename]
desTr, desTe = xTr_des_norm, xTe_des_norm
yTr, yTe = train_samplename, test_samplename


In [30]:
# Fit a 400-component PLS regression from the image features in p5Tr to the
# TF-IDF description vectors in desTr (rows are paired by position).
pls2 = PLSRegression(n_components=400)
pls2.fit(p5Tr, desTr)



PLSRegression(copy=True, max_iter=500, n_components=400, scale=True,
       tol=1e-06)

In [32]:
# Project the test image features (p5Te) into the description space learned by pls2.
pls2_pred = pls2.predict(p5Te)

In [36]:
# Cosine nearest neighbours in description space.
# Index:   PLS-predicted description vectors of the test images (pls2_pred).
# Queries: TF-IDF vectors of the test descriptions (desTe).
yTe = np.array(yTe)
knn_cos = KNN(n_neighbors=20,metric='cosine').fit(pls2_pred,yTe)

# kneighbors returns row indices into pls2_pred; map them to sample names.
top20_img_rows = knn_cos.kneighbors(desTe, return_distance = False)
img_predictions = yTe[top20_img_rows]

###

###


In [37]:
# Same cosine search as the 20-neighbour cell, but asking for 2000 neighbours
# and keeping the distances.
knn_img_dist = KNN(n_neighbors=2000,metric='cosine').fit(pls2_pred,yTe)
img_predictions_dist = knn_img_dist.kneighbors(desTe, return_distance = True)
#####
# Element 0 holds cosine distances (smaller = closer, despite the
# "similarity" name), element 1 the matching row indices into pls2_pred.
img_similarity, nn_2000_img = img_predictions_dist[0], img_predictions_dist[1]

### Ensembling

In [38]:
# write_result(test_samplename,img_predictions,output='final_pool5_pls_allwords_results.csv')
# write_dist(test_samplename,img_similarity,nn_2000_img,'pool5_pls_all_distance.csv')
# sorted_img_similarity = []
# for i in range(2000):
#     t_sorted_samplename, t_sorted_predictions = zip(*sorted(zip(nn_2000_img[i,:],img_similarity[i,:])))
#     sorted_img_similarity.append(t_sorted_predictions)
    

In [31]:
# def write_result(test_samplename,predictions,output='tune_result_bowall.csv'):
#     sorted_test_samplename, sorted_predictions = zip(*sorted(zip(test_samplename,predictions)))
#     nn_list = []
#     for i, row in enumerate(sorted_predictions):
#         temp = ''
#         for j, val in enumerate(row):
#             temp = temp + ' ' + (str(val) + ".jpg")
#         nn_list.append(temp)
#     index = []
#     for i in sorted_test_samplename:
#         index.append(str(i)+ ".txt")
#     with open(output, 'wb') as f:
#         writer = csv.writer(f)
#         writer.writerows(itertools.izip(index, nn_list))
# def get_acc(test_samplename,predictions):
#     acc = 0
#     score = 0
#     for i in range(len(test_samplename)):
#         if test_samplename[i] in predictions[i]:
#             acc+=1
#             score += float(20-np.where(predictions[i]==test_samplename[i])[0])/20
#     print(float(score)/len(test_samplename))
#     print(acc)

In [69]:
# from sklearn.preprocessing import scale
# from sklearn.preprocessing import MinMaxScaler,minmax_scale

In [70]:
# #scaler = MinMaxScaler()
# scaled_img_sim,scaled_tag_sim = [],[]
# for row in sorted_img_similarity:
#     scaled_img_sim.append(minmax_scale(np.array(row)))
# for rov in sorted_tag_similarity:
#     scaled_tag_sim.append(minmax_scale(np.array(rov)))

In [82]:
# ensemble_sim = np.add(np.asarray(scaled_img_sim), np.asarray(scaled_tag_sim))
# ensemble_nn = np.argsort(ensemble_sim, axis=1)
# write_result(test_samplename,ensemble_nn[:,0:20],output='ensemble_tag_pls_all_v1.csv')

In [95]:
# fc_data = np.loadtxt("distance/distance_pls_400_fc.csv", delimiter=',', usecols=range(1,2001))
# #scaler = MinMaxScaler()
# scaled_fc_sim = []
# for rou in fc_data:
#     scaled_fc_sim.append(minmax_scale(np.array(rou)))
# ensemble_sim2 = np.add(ensemble_sim, np.asarray(scaled_fc_sim))
# ensemble_nn2 = np.argsort(ensemble_sim2, axis=1)
# write_result(test_samplename,ensemble_nn2[:,0:20],output='ensemble_tag_plsall_fc_v2.csv')

In [104]:
# p5nn_data = np.loadtxt("distance/pool5_pls_no_nouns_cosinedistance.csv", delimiter=' ', usecols=range(1,2001))
# scaled_p5nn_sim = []
# for rox in p5nn_data:
#     scaled_p5nn_sim.append(minmax_scale(np.array(rox)))
# ensemble_sim3 = np.add(ensemble_sim2, np.asarray(scaled_p5nn_sim))
# ensemble_nn3 = np.argsort(ensemble_sim3, axis=1)
# write_result(test_samplename,ensemble_nn3[:,0:20],output='ensemble_tag_plsall_plsnn_fc_v3.csv')



In [125]:
# Final ensemble: sum four row-wise min-max scaled score matrices (presumably
# distances, going by the file names) and rank the candidates per description.
# NOTE(review): scaled_tag_sim, scaled_fc_sim and write_result are only
# defined in cells that are currently commented out; those cells must be
# restored and run first for this cell to work on a fresh kernel.
tag2_data = np.loadtxt("distance/distance_pls_100_tag.csv", delimiter=',', usecols=range(1,2001))
p750_data = np.loadtxt("distance/distance_pls_750_pool.csv", delimiter=',', usecols=range(1,2001))

# axis=1 rescales every row to [0, 1] independently, replacing the explicit
# per-row loops.
scaled_tag2_sim = minmax_scale(tag2_data, axis=1)
scaled_p750_sim = minmax_scale(p750_data, axis=1)

# The two tag-based sources are weighted 0.5 each, the other two 1.0 each.
ensemble_sim4 = (
    0.5 * np.asarray(scaled_tag_sim)
    + np.asarray(scaled_fc_sim)
    + 0.5 * np.asarray(scaled_tag2_sim)
    + np.asarray(scaled_p750_sim)
)

# argsort is ascending, so the smallest combined score comes first.
# NOTE(review): the values written are column indices, not sample names --
# confirm the columns are ordered so that index == image id.
ensemble_nn4 = np.argsort(ensemble_sim4, axis=1)

write_result(test_samplename,ensemble_nn4[:,0:20],output='ensemble_tag_plsall_plstag_fc_v7.csv')
