# Text-based Information Retrieval

## Assignment PART II
### Using wordembedding
We can use the semantic similarity of wordembeddings, such as GloVe and Word2Vec, to obtain better results.
In this part of the exercise, we will the addition analogy (similar to Part I of this assignment) to rank the given documents.


In [8]:
# Loading modules
import os, re
import pandas as pd
from numpy import dot, sum
from gensim import matutils, models, corpora, similarities
import gensim

# Set up logger that logs (works in jupyter 3!) in console and outputs in file
'''
import logging
logger = logging.getLogger()
fhandler = logging.FileHandler(filename='part_II_logs.log', mode='a')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.DEBUG)
'''

"\nimport logging\nlogger = logging.getLogger()\nfhandler = logging.FileHandler(filename='part_II_logs.log', mode='a')\nformatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')\nfhandler.setFormatter(formatter)\nlogger.addHandler(fhandler)\nlogger.setLevel(logging.DEBUG)\n"

#### Load in word model

In [9]:
# Load Googles' pre-trained Word2Vec vector set
# Note: This will take a lot of memory and can take a while.
# Note II: Depending on your RAM, do not load all models at the same time
w2v_model = models.Word2Vec.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)
#w2v_model.init_sims(replace=True) # Normalize; Trims unneeded model memory = use (much) less RAM.


FileNotFoundError: [Errno 2] No such file or directory: 'data/GoogleNews-vectors-negative300.bin.gz'

In [10]:
import smart_open
import os.path

def glove2word2vec(glove_filename):
    def get_info(glove_filename): 
        num_lines = sum(1 for line in smart_open.smart_open(glove_filename))
        dims = glove_filename.split('.')[2].split('d')[0] # file name contains the number of dimensions
        return num_lines, dims
    
    def prepend_info(infile, outfile, line): # Function to prepend lines using smart_open
        with open(infile, 'r', encoding="utf8") as original: data = original.read()
        with open(outfile, 'w', encoding="utf8") as modified: modified.write(line + '\n' + data)
        return outfile
    
    word2vec_filename = glove_filename[:-3] + "word2vec.txt"
    if os.path.isfile(word2vec_filename):
        model = models.Word2Vec.load_word2vec_format(word2vec_filename)
    else:
        num_lines, dims = get_info(glove_filename)
        gensim_first_line = "{} {}".format(num_lines, dims)
        model_file = prepend_info(glove_filename, word2vec_filename, gensim_first_line)
        model = models.Word2Vec.load_word2vec_format(model_file)
    
    model.init_sims(replace = True)  # normalize all word vectors
    return model

# Load GloVes' pre-trained model
# These vectors are stored in a plain text - vector dimensionality 50, 100, 200 and 300
# only the vectors pre-trained on Wikipedia.
glove50d_model = glove2word2vec('data/glove.6B.50d.txt')

FileNotFoundError: [Errno 2] No such file or directory: 'data/glove.6B.50d.txt'

#### Images to wordvectors

We will use the similarity of wordmodels such as Word2Vec and GloVe to make vectors of each image. These vectors will look like 
>s = w1 + w2 + ... + wn

> With s = the image vector and {w1 .. wn} the words for each image


In [11]:
# Load in a stopword list from
# http://www.lextek.com/manuals/onix/stopwords2.html
stopwords = []
with open('data/stopwordlist.txt', 'r') as f:
    lines = ''.join(f.readlines())
    stopwords = [x for x in lines.split('\n')[2:]]


In [12]:
# Translate text to avg vector
def sentence_to_vector(model, sentence):
    v1 = []
    for word in sentence.split(' '):
        try:
            v1.append(model[word])
        except:
            if "-" in word: # attempt dash removing or replacing with space
                try:
                     v1.append(model[word.replace("-", "")])
                except:
                    try:
                        v1.append(model[word.split("-")[0]])
                        v1.append(model[word.split("-")[1]])
                    except:
                        print 'word not in model:', word
                        continue
            else:
                print 'word not in model:', word
                continue
    # return matutils.unitvec(array(v1).mean(axis=0))
    return matutils.unitvec(sum(v1,axis=0))


SyntaxError: Missing parentheses in call to 'print' (<ipython-input-12-f7732056e666>, line 16)

In [13]:
# Clean input because the wordmodels can not contain every possible combination words and signs
def clean_input(text, stopwords):
    # lowecase and remove linebreaks
    text = text.lower().rstrip()
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stopwords])
    # Remove punctuation
    text = re.sub('[!@#$:;%&?,_\.\'\`\"\\\/\(\)\[\]]', '', text)
    text = re.sub('[\-]+', '-', text)
    # Remove sole numbers, dashes or extra spaces
    text = re.sub('[\s][\-]+[\s]', '', text)
    text = re.sub('[0-9]+', '', text)
    text = re.sub('[\s]+', ' ', text)
    # British to American English - at this moment still hardcoded due to lack of library
    text = text.replace('grey', 'gray')
    text = text.replace('colour', 'color')
    text = text.replace('tyre', 'tire')
    text = text.replace('centre', 'center')
    text = text.replace('theatre', 'theater')
    text = text.replace('jewellery','jewelry')
    text = text.replace('aeroplane', 'plane')
    text = text.replace('harbour', 'harbor')
    text = text.replace('moustache','mustache')
    text = text.replace(' axe', ' hatchet')
    text = text.replace('armour', 'armor')
    text = text.replace('stylised', 'stylized')
    text = text.replace('organise', 'organize')
    text = text.replace('plough', 'plow')
    text = text.replace('neighbourhood', 'neighborhood')
    text = text.replace('vapour', 'vapor')
    # some manual fixes of lemmatizing
    text = text.replace('watersid ', 'waterside ')
    text = text.replace('figur ', 'figure ')
    text = text.replace(' graz ', ' graze ')
    text = ' '.join([word for word in text.split() if word not in stopwords])
    return text


In [55]:
# Text file parser
# Returns a dictionary with imageid - text in lowercase without stopwords or punctuation
def text_file_parser(filename, stopwords):
    corpus = dict()
    #corpus = pd.DataFrame(columns=('id', 'imageid', 'vec'))
    with open(filename) as f:
        next(f) # skip first line with the headings
        for doc in f:
            '''
            # Normal:
            doc_parts = doc.split('\t')
            doc_parts[2] = clean_input(doc_parts[2])
            doc_parts.append(sentence_to_vector(model, doc_parts[2]))
            corpus[doc_parts[0]] = doc_parts
            
            # If use of parsed
            '''
            # Split on spaces
            doc_parts = doc.split(" ", 1)
            # If first part is the ID (needed for the queries file)
            if(len(doc_parts[0]) < 6):
                doc_parts = doc.split(" ", 2)
                doc_parts.pop(0)
            # Clean the caption text (remove puctuation etc)
            doc_parts[1] = clean_input(doc_parts[1], stopwords)
            # add the array (or row) to an array
            corpus[len(corpus) + 1] = doc_parts        
    # Transform to dataframe
    df = pd.DataFrame.from_dict(corpus, orient='index')
    df = df.reset_index()
    df.columns = ['index', 'img_id', 'caption']
    return df

In [99]:
# images file to docs dict
print('Parsing documents')
training_docs = text_file_parser('data/target_collection_parsed.txt', stopwords)
docs = []
for text in training_docs['caption']:
    docs.append(text.split())
dictionary = corpora.Dictionary(docs)
# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(doc) for doc in docs]   
# generate LDA model
lda = gensim.models.ldamodel.LdaModel(corpus, num_topics=20, id2word = dictionary, passes=10)
queries = text_file_parser('data/queries_val_parsed20.txt', stopwords)

# Preview
queries

Parsing documents


Unnamed: 0,index,img_id,caption
0,1,XBZPztvt67qkMUdI,man white shirt sit table cut meat plate front...
1,2,PaqtOaYmQmXkqW2i,woman red dress posing hatchet
2,3,IPcFtNL-7EQ6Z0yu,soccer play stand soccer ball front
3,4,IMAD0sq2Fz7HpSgX,white yellow train track
4,5,-gqRDDfPZTGlCfJa,view tall building city
5,6,xsrYb57vl4qiMLDG,hand pick flower vine
6,7,BCjxgJlQ3TD5T8ST,picture army ready sail
7,8,LGxwsl9CtRQ8wW3Y,brick roof house picture
8,9,9LtOvyiygFYoxU8S,man clean mess street
9,10,8usTLD-Wg5EHCShk,group kid play playground accompany adult


#### Check similarity

In [10]:
# function to calculate the similarity between 2 documents
def similarity(v1, v2):
    """
    Compute cosine similarity between two documents.
    Example:
      >>> trained_model.similarity('doc1', 'doc2')
      0.73723527
      >>> trained_model.similarity('doc2', 'doc2')
      1.0
    """
    #return dot(matutils.unitvec(v1), matutils.unitvec(v1))
    return dot(v1, v2)

In [109]:
# Calculate similarity
#training_docs["sim"] = 0.0
queries['recall'] = 0.0
queries['precision'] = 0.0
tfidf = models.TfidfModel(corpus)
length = len(corpus)
index = similarities.Similarity('./tmp/tst',corpus, num_features=length)
print('querying')
for i,r in queries.iterrows():
    
    # Calculate similarity per vector document
    vec_bow = dictionary.doc2bow(r['caption'].split())
    vec_lda = lda[vec_bow]
    sims = index[vec_lda]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    sims = [sim[0] for sim in sims[:1000]]
    
    # Recall
    total_to_find = float(len(training_docs[training_docs.img_id == r['img_id']]))
    correct = 0
    for sim in sims:
        if training_docs[training_docs.index == sim].values[0][1] == r['img_id']:
            correct += 1
    
    amount_found = correct
    recall = float(amount_found/total_to_find)
    queries.set_value(i, 'recall', recall)
    
    # Precision
    precision = float(amount_found/float(1000))
    queries.set_value(i, 'precision', precision)


querying


#### Results

In [110]:
print("Displaying results:")

# Remove irrelevant columns

# Save results
queries.to_csv(path_or_buf='data/results_part2_wordembedding.csv')

print("--")
print("AVG recall", queries.recall.mean())
print("AVG precision:",  queries.precision.mean())
print("--")

# print results
print(queries)


Displaying results:
--
AVG recall 0.04727684596105649
AVG precision: 0.0004210526315789474
--
    index            img_id  \
0       1  XBZPztvt67qkMUdI   
1       2  PaqtOaYmQmXkqW2i   
2       3  IPcFtNL-7EQ6Z0yu   
3       4  IMAD0sq2Fz7HpSgX   
4       5  -gqRDDfPZTGlCfJa   
5       6  xsrYb57vl4qiMLDG   
6       7  BCjxgJlQ3TD5T8ST   
7       8  LGxwsl9CtRQ8wW3Y   
8       9  9LtOvyiygFYoxU8S   
9      10  8usTLD-Wg5EHCShk   
10     11  kH59MJp3nWyFfFB2   
11     12  tluPF-CA6dN6LACF   
12     13  HKXYBXXObkt_yi7s   
13     14  mBJjuuB0ukfKmnRH   
14     15  cqpTaVCZZe5OpuIk   
15     16  Anry0qU5NFes6Twh   
16     17  ByjuNEqsLcbUk5OH   
17     18  5dBiwoDEpY6gWRek   
18     19  ZFdHuWXCk662UDAW   

                                              caption    recall  precision  
0   man white shirt sit table cut meat plate front...  0.000000      0.000  
1                      woman red dress posing hatchet  0.000000      0.000  
2                 soccer play stand soccer ball front 

### Results from calculations on server
For lowering the workload on our computers, we let a server do the calculations and write the results to a csv file.


In [None]:
# Read in results file
df = pd.DataFrame.from_csv('results/results.csv')

# averages
print "--"
print "AVG recall", df.recall.mean()
print "AVG precision:",  df.precision.mean()
print "--"

# Preview dataframe
df

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Sort by recall
df = df.sort_values(by=['recall'], ascending=[1])

# Show and save chart
qx = df.plot(x='recall', y='precision')
fig = qx.get_figure()
fig.savefig('results/part2_embedding_precision-recall.png')