In [None]:
## libraries
import json
import csv
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import scipy.spatial.distance
import nltk
from nltk.corpus import wordnet
import copy

In [None]:
## data loading
# Question/answer table: comma-separated, field names taken from the file's header row
# (names=True). Later cells read its 'que' and 'ans' fields.
# NOTE(review): text columns use fixed-width unicode dtypes (U256/U128); longer values are
# presumably truncated -- confirm against the data.
qa = np.genfromtxt("uid_qa.txt", delimiter = ",", names = True, dtype = [('int64'), ('int64'), ('int64'), ('U256'), ('U128'),])
# Second comma-separated table with the same layout (three integer columns, two text columns);
# its field names are replaced in the next cell.
fe = np.genfromtxt("uid_pre_elim.txt", delimiter = ",", names = True, dtype = [('int64'), ('int64'), ('int64'), ('U256'), ('U256'),])
# Face descriptions: ';'-separated, first line skipped, only the first two columns are read.
ftd = np.genfromtxt("face_id_descr.txt", delimiter = ";", skip_header = 1 , usecols = np.arange(0,2), dtype = [('U16'), ('U2056')])

In [None]:
# Replace the field names of `fe` (originally read from its header row) and give the two
# columns of `ftd` explicit names, so later cells can index them as
# fe['pre_que'], fe['curr_elim'], ftd['img_id'] and ftd['description'].
fe.dtype.names = ('uniqueID', 'bn', 'qn', 'pre_que', 'curr_elim')
ftd.dtype.names = ('img_id', 'description')


In [None]:
## Turn a vector of strings into one token list per input row
def mk_list(str_vector):
    """Split every string of ``str_vector`` on whitespace.

    Parameters
    ----------
    str_vector : iterable of str
        The strings to tokenise.

    Returns
    -------
    list of list of str
        One token list per input string, in input order. Empty or
        whitespace-only strings produce an empty list, so the result has
        exactly as many rows as the input.
    """
    return [entry.split() for entry in str_vector]


In [None]:
# One list of whitespace-separated tokens per row of fe['pre_que']; later cells look these
# tokens up in ftd['img_id'], i.e. they are treated as image ids.
all_x = mk_list(fe['pre_que'])


In [None]:
## Blank check
## A row is unusable when it lists fewer than two images or when its question has
## fewer than two tokens (blank / erroneous question). Record the positions of
## those rows in `bl`.

question_tokens = mk_list(qa['que'])
bl = [
    pos
    for pos, (imgs, words) in enumerate(zip(all_x, question_tokens))
    if len(imgs) < 2 or len(words) < 2
]
        


In [None]:
## Split the row indices into blank rows (`bl_ind`) and usable rows (`original_ind`),
## and collect the face descriptions of every usable row in `all_i_txt1`.

# Set for O(1) membership tests instead of scanning the `bl` list once per row.
bl_lookup = set(bl)

# Build the image-id -> description lookup once. `setdefault` keeps the first
# occurrence of a duplicated id, which is what `list(...).index(img)` returned.
descr_by_img = {}
for known_id, descr in zip(ftd['img_id'], ftd['description']):
    descr_by_img.setdefault(str(known_id), str(descr))

original_ind = []
bl_ind = []
all_i_txt1 = []
for rown, row_imgs in enumerate(all_x):
    if rown in bl_lookup:
        bl_ind.append(rown)
        continue
    # Exactly one entry per image, in row order. An image without a known
    # description gets an empty document instead of being dropped: later cells
    # map similarity positions back to image ids through `all_x`, so the
    # positions here must stay aligned with `all_x[rown]`.
    all_i_txt1.append([descr_by_img.get(img, '') for img in row_imgs])
    original_ind.append(rown)
        
            

In [None]:
## For each question, find the WordNet synonyms of all adjectives (JJ), nouns (NN, NNS)
## and cardinal numbers (CD). The synonym list of every question is stored in
## `new_que_syn`; `new_table_fd` accumulates every question token (for a frequency
## distribution).

new_table_fd = [] ## for frequency distribution
new_que_syn = []
for rown in qa['que'] :
    sents = nltk.sent_tokenize(rown.strip())
    lw = [nltk.word_tokenize(s) for s in sents]

    # Tag every sentence once and reuse the (word, tag) pairs below; the tagger
    # was previously invoked three times per sentence.
    tagged = [kv for w in lw for kv in nltk.pos_tag(w)]

    x = [kv[0] for kv in tagged if kv[1] in ['JJ', 'NN', 'NNS', 'CD']]
    # POS tags of the current question; not read by any cell visible here.
    fd = [kv[1] for kv in tagged]
    syn = [j.name() for words in x for i in wordnet.synsets(words) for j in i.lemmas()]

    # Sort the de-duplicated synonyms: plain set iteration order depends on string
    # hashing and differs between kernel restarts, which would change the n-grams
    # built from the joined synonym text further down.
    new_que_syn.append(sorted(set(syn)))
    new_table_fd.extend(kv[0] for kv in tagged)

In [None]:
# Documents per usable row: the image descriptions, then the question's synonym
# text, then the stripped answer (so the question is always at position -2).
all_i_txt = []
for rn, row in zip(original_ind, all_i_txt1):
    question_doc = ' '.join(map(str, new_que_syn[rn]))
    answer_doc = qa['ans'][rn].strip()
    all_i_txt.append(row + [question_doc, answer_doc])

   

In [None]:
##basic model
# TF-IDF vectorizer with sublinear term-frequency scaling switched on.
tf = TfidfVectorizer(sublinear_tf = True)

In [None]:
# One pairwise cosine-similarity matrix per row. The vectorizer is re-fitted on
# each row's documents, so the vocabulary and IDF weights are local to that row.
all_sim_tf = [cosine_similarity(tf.fit_transform(docs)) for docs in all_i_txt]


In [None]:
# Row -2 of every matrix belongs to the question document; dropping its last two
# columns (question and answer) leaves its similarity to each image description.
all_sim_q_tf = [sim_matrix[-2][:-2] for sim_matrix in all_sim_tf]


In [None]:
# Token lists of fe['curr_elim'], one list per row. The length of each list is later used
# as the number of images to predict.
# NOTE(review): presumably these are the ids of the images the participant eliminated -- confirm.
all_cheat_raw = mk_list(fe['curr_elim'])

In [None]:
# Keep the target lists of the usable (non-blank) rows only, in their original order.

all_cheat_tf = []
for kept_row in original_ind:
    all_cheat_tf.append(all_cheat_raw[kept_row])


In [None]:
## Rank the images of every row by descending similarity to the question (argsort
## returns positions) and keep as many positions as the target list has entries,
## i.e. the prediction "peeks" at the length of the target.

sorted_ind_tf = [
    si.argsort()[::-1][:len(ln)]
    for si, ln in zip(all_sim_q_tf, all_cheat_tf)
]

In [None]:
## Image-id lists of the usable (non-blank) rows only
all_x_clean_tf = []
for kept_row in original_ind:
    all_x_clean_tf.append(all_x[kept_row])

In [None]:
## Map ranked positions back to image ids so they can be compared with the image
## ids actually selected by the participant
def mk_imgid(arr):
    """Translate position lists into the items found at those positions.

    Parameters
    ----------
    arr : iterable of (sequence, iterable of int)
        Pairs of an indexable sequence (the image ids of a row) and the
        positions to pick from it.

    Returns
    -------
    list of list
        For every pair, the picked items in the order of the positions.
    """
    return [[row_ids[pos] for pos in picks] for row_ids, picks in arr]

In [None]:
# Predicted image ids per usable row: the top-ranked positions translated back to ids.
pred_img_tf = mk_imgid(zip(all_x_clean_tf, sorted_ind_tf))


In [None]:
# Per-row precision and recall of the predicted ids against the target ids.
# Both share the same numerator: the number of distinct ids found in both lists.
pre_all = []
rec_all = []
for a, p in zip(all_cheat_tf, pred_img_tf):
    hits = len(set(a).intersection(set(p)))
    pre_all.append(hits/len(set(p)) if len(p) > 0 else 0)
    rec_all.append(hits/len(set(a)) if len(a) > 0 else 0)



In [None]:
# Per-row F-beta score with beta = 0.5; rows where precision and recall are both
# zero score 0 (this also avoids dividing by zero).
beta_sq = 0.5*0.5
fs_all = []
for prec, rec in zip(pre_all, rec_all):
    if prec > 0 or rec > 0:
        fs_all.append((beta_sq + 1)*prec*rec/ (beta_sq*prec + rec))
    else:
        fs_all.append(0)


In [None]:
# Average the per-row scores over all usable rows.
pre_mean, rec_mean, fs_mean = (
    np.mean(np.array(scores)) for scores in (pre_all, rec_all, fs_all)
)

In [None]:
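# One-time setup (kept commented out): run once to create the results file with its header row.
# NOTE: this header is comma-separated, whereas the result rows appended below are joined with '; '.
#with open('Results with description.txt', 'w') as f:
#    f.write("No., Model, Parameter, Decision Rules, Precision, Recall, F0.5"+ '\n')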

In [None]:
# Results row for the TF-IDF run: run number, model, parameter, decision rule,
# followed by mean precision, recall and F0.5 as strings.
op3 = ["42", "TFIDF NLTK", "sublinear", "Cosinesimilarity Peek"]
op3 += [str(score) for score in (pre_mean, rec_mean, fs_mean)]

In [None]:
# Append the row to the results file; every execution of this cell adds another line.
with open('Results with description.txt', 'a') as results_file:
    print('; '.join(op3), file=results_file)

In [None]:
##Another model(CountVectorizer starts here)

In [None]:
# Count-based bag of n-grams; ngram_range=(2,3) restricts the features to bigrams and trigrams.
cv = CountVectorizer(ngram_range = (2,3))

In [None]:
## All similarities: one pairwise cosine-similarity matrix per row, computed on the
## n-gram counts. The vectorizer is re-fitted on each row's documents.
all_sim_cv = [cosine_similarity(cv.fit_transform(docs)) for docs in all_i_txt]

In [None]:
# Row -2 of every matrix belongs to the question document; dropping its last two
# columns (question and answer) leaves its similarity to each image description.
all_sim_q_cv = [sim_matrix[-2][:-2] for sim_matrix in all_sim_cv]


In [None]:
# Target lists of the usable (non-blank) rows only, in their original order.
all_cheat_cv = []
for kept_row in original_ind:
    all_cheat_cv.append(all_cheat_raw[kept_row])

In [None]:
## Rank the images of every row by descending similarity (argsort returns positions)
## and keep as many positions as the target list has entries.
## TODO: instead of a fixed top-k, consider a similarity threshold above which an
## image is included in the output.
sorted_ind_cv = [
    si.argsort()[::-1][:len(ln)]
    for si, ln in zip(all_sim_q_cv, all_cheat_cv)
]


In [None]:
# Image-id lists of the usable (non-blank) rows only.
all_x_clean_cv = []
for kept_row in original_ind:
    all_x_clean_cv.append(all_x[kept_row])

In [None]:
# Predicted image ids per usable row: the top-ranked positions translated back to ids.
pred_img_cv = mk_imgid(zip(all_x_clean_cv, sorted_ind_cv))

In [None]:
# Per-row precision and recall of the predicted ids against the target ids.
# Both share the same numerator: the number of distinct ids found in both lists.
pre_all_cv = []
rec_all_cv = []
for a, p in zip(all_cheat_cv, pred_img_cv):
    hits = len(set(a).intersection(set(p)))
    pre_all_cv.append(hits/len(set(p)) if len(p) > 0 else 0)
    rec_all_cv.append(hits/len(set(a)) if len(a) > 0 else 0)



In [None]:
# Per-row F-beta score with beta = 0.5; rows where precision and recall are both
# zero score 0 (this also avoids dividing by zero).
beta_sq = 0.5*0.5
fs_all_cv = []
for prec, rec in zip(pre_all_cv, rec_all_cv):
    if prec > 0 or rec > 0:
        fs_all_cv.append((beta_sq + 1)*prec*rec/ (beta_sq*prec + rec))
    else:
        fs_all_cv.append(0)


In [None]:
# Average the per-row scores over all usable rows.
pre_mean_cv, rec_mean_cv, fs_mean_cv = (
    np.mean(np.array(scores)) for scores in (pre_all_cv, rec_all_cv, fs_all_cv)
)


In [None]:
# Results row for the CountVectorizer run: run number, model, parameter, decision
# rule, followed by mean precision, recall and F0.5 as strings.
# The parameter label is derived from the vectorizer itself so that it cannot drift
# from the configuration actually used (it was hard-coded as "ngram 2,2" although
# the vectorizer is built with ngram_range=(2,3)).
# NOTE(review): the decision-rule label differs from the TF-IDF row, yet the ranking
# cells of both runs apply the same top-k rule -- confirm which label is intended.
op3 = [
    "48",
    "cv, NLTK, POS",
    "ngram {},{}".format(*cv.ngram_range),
    "Cosinesimilarity and ranking difference",
    str(pre_mean_cv),
    str(rec_mean_cv),
    str(fs_mean_cv),
]

In [None]:
# Append the row to the results file; every execution of this cell adds another line.
with open('Results with description.txt', 'a') as results_file:
    results_file.writelines(['; '.join(op3), '\n'])