In [3]:
import json
import re
import os
import argparse

import pandas as pd


In [2]:
# Function to remove HTML tags using BeautifulSoup
def remove_html_tags(text: str) -> str:
    """Return ``text`` with its HTML markup stripped.

    The markup is parsed with BeautifulSoup's ``"html.parser"`` backend and
    the result of ``get_text()`` on the parsed tree is returned.

    Args:
        text: The HTML string to clean.

    Returns:
        The text content of ``text`` as produced by ``BeautifulSoup.get_text()``.
    """
    # Imported here because the notebook's import cell does not bring in
    # BeautifulSoup; without this the helper raises NameError on a fresh kernel.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

In [6]:
# --- Input files -----------------------------------------------------------
# NOTE(review): the topics live under 'data/inputs/' while the answers and
# qrels live under 'data/in/' -- confirm both directories really exist.
topics_filename = 'data/inputs/topics_1.json'   # queries (topics)
answers_filename = 'data/in/Answers.json'       # documents (answers)
trec_qrel_filename = 'data/in/qrel_1.tsv'       # qrel file

# --- Output files ----------------------------------------------------------
topics_expanded_filename = 'data/outputs/topics_1_expanded.json'  # expanded topics
trec_tsv_results_filename = 'result_binary_1.tsv'                 # results outputted by the system

In [7]:
# Load the original topics (queries) into a DataFrame.
topics_df = pd.read_json(topics_filename)

# Load the answers (documents) too: later cells index `answers_df` by 'Id' and
# read its 'Text' column, but no cell ever created it, so a fresh
# Restart & Run All failed with NameError.
# NOTE(review): assumes Answers.json parses with pd.read_json the same way the
# topics file does and exposes 'Id' / 'Text' columns -- confirm.
answers_df = pd.read_json(answers_filename)

# Preview the first topics.
topics_df.head()

Unnamed: 0,Id,Title,Body,Tags
0,3,How can I set the Software Center to install s...,<p>How can I set the Software Center to allow ...,"['software-center', 'software-installation', '..."
1,196614,How do I edit applications panel menu in Lubuntu?,<p>Main menu (<code>alacarte</code>) cannot re...,"['lubuntu', 'menu', 'editing', 'alacarte']"
2,6,How to graphically interface with a headless s...,<p>I have a ubuntu development server at work....,"['server', 'ssh', 'security', 'remote-desktop'..."
3,7,How do I run a successful Ubuntu Hour?,<p>I'm taking my be-stickered laptop to a coff...,"['community', 'locoteams']"
4,9,How do I enable automatic updates?,<p>Update Manager is constantly offering me up...,"['updates', 'unattended-upgrades']"


In [8]:
# Read the expanded topics file into its own DataFrame, kept separate from
# topics_df so the two can be compared.
topics_exp_df = pd.read_json(
    topics_expanded_filename,
)

# Preview the first rows.
topics_exp_df.head()

Unnamed: 0,Id,Title,Body,Tags
0,3,How can I set the Software Center to install s...,<p>How can I set the Software Center to allow ...,"['software-center', 'software-installation', '..."
1,196614,How do I edit applications panel menu in Lubun...,<p>Main menu (<code>alacarte</code>) cannot re...,"['lubuntu', 'menu', 'editing', 'alacarte']"
2,6,How to graphically interface with a headless s...,<p>I have a ubuntu development server at work....,"['server', 'ssh', 'security', 'remote-desktop'..."
3,7,How do I run a successful Ubuntu Hour? - Ubunt...,<p>I'm taking my be-stickered laptop to a coff...,"['community', 'locoteams']"
4,9,How do I enable automatic updates? How do I kn...,<p>Update Manager is constantly offering me up...,"['updates', 'unattended-upgrades']"


In [19]:
# Show the expanded title for topic 6.
# The filter uses topics_exp_df's own 'Id' column (the original masked it with
# topics_df, which only works while both frames share the same row index), and
# the first match is taken by position instead of the hard-coded index label 2.
topics_exp_df.loc[topics_exp_df['Id'] == 6, 'Title'].iloc[0]

"How to graphically interface with a headless server? [closed]\nI'm trying to interface with a headless server using a Python script. I'm able to connect to the server, but I'm not able to graphically interface with it.\nIs there a way to do this? I'm trying to use PyQT5.\nI'm using the following command:\nfrom PyQt5 import QtWidgets\nfrom PyQt5.QtCore import *\nfrom PyQt5.QtGui import *\nfrom PyQt5.QtWidgets import *\nimport sys\napp = QtWidgets.QApplication(sys.argv"

In [6]:
# Read the system's result file: tab-separated, no header row.
binary_results = pd.read_csv(
    trec_tsv_results_filename,
    sep='\t',
    header=None,
)

# Name the six columns of the result file.
binary_results.columns = [
    'query_id',
    'q0',
    'doc_id',
    'rank',
    'score',
    'model',
]

binary_results.head()

Unnamed: 0,query_id,q0,doc_id,rank,score,model
0,49160,Q0,4481,1,0.166667,boolean_retrieval
1,49160,Q0,30542,2,0.111111,boolean_retrieval
2,49160,Q0,46568,3,0.111111,boolean_retrieval
3,49160,Q0,30120,4,0.095238,boolean_retrieval
4,49160,Q0,8991,5,0.083333,boolean_retrieval


In [7]:
# Read the qrel file: tab-separated, no header row.
qrels = pd.read_csv(
    trec_qrel_filename,
    sep='\t',
    header=None,
)

# Name the four columns of the qrel file.
qrels.columns = [
    'query_id',
    'q0',
    'doc_id',
    'relevance',
]

qrels.head()


Unnamed: 0,query_id,q0,doc_id,relevance
0,67308,0,67494,2
1,67308,0,67311,2
2,67308,0,67312,2
3,67308,0,67421,2
4,67308,0,67325,1


In [39]:
# Seed for the query draw below. None keeps the draw random on every run; set
# it to an int to get the same query back after a kernel restart.
QUERY_SAMPLE_SEED = None

# Get a random query (drawn from the result rows, so only queries that have
# results can be picked)
query_id = binary_results['query_id'].sample(1, random_state=QUERY_SAMPLE_SEED).values[0]

# Show the query text
query = topics_df[topics_df['Id'] == query_id]
if query.empty:
    # Fail with a clear message instead of an IndexError from `.values[0]`
    # when the results file and the loaded topics file do not match.
    raise ValueError(f"query_id {query_id} from the results is not present in topics_df")
query_title = query['Title']
query_body = query['Body']
query_tags = query['Tags'].values[0]
print("Query title: ", query_title.values[0])
print("Query body: ", query_body.values[0])
print("Query tags: ", query_tags)
print()

# Get all corresponding documents
query_results = binary_results[binary_results['query_id'] == query_id]

# First and last retrieved documents for this query.
# NOTE(review): this treats the file's row order as the ranking order -- confirm.
# The last row is taken with iloc[-1] rather than a fixed position 99, so
# queries with fewer than 100 results no longer raise IndexError.
top_doc_id = query_results.iloc[0]['doc_id']
bottom_doc_id = query_results.iloc[-1]['doc_id']

# Get qrels for this query, keeping only the columns needed for the join
query_qrels = qrels.loc[qrels['query_id'] == query_id, ['doc_id', 'relevance']]

# Attach relevance to every retrieved document; the left join leaves
# 'relevance' as NaN for documents that have no qrel row for this query.
query_results = query_results[['doc_id', 'rank']].merge(query_qrels, on='doc_id', how='left')

# Are there any relevant documents?
relevant_docs = query_results[query_results['relevance'] > 0]
relevant_docs

Query title:  We use this everyday without noticing, but we hate it when we feel it
Query body:  <p><strong>Here is a word with an uncertain number of letters <em>(hint: it's not 4 letters)</em>: ####</strong></p><ul><li>We use #### everyday.</li><li>We use #### when we write, play video games, exercise, eat, heck, I'm using #### as I'm typing!</li><li>#### is something that we all can give to others, yet it is not good to do so.</li><li>We hate to feel ####, but we all have in our lives.</li></ul><p><strong>What is ####?</strong></p>
Query tags:  ['riddle', 'word']



Unnamed: 0,doc_id,rank,relevance
26,105372,27,1.0
59,105353,60,2.0


In [41]:
# Print the answer text of the first relevant document for the sampled query.
# The doc id is taken from `relevant_docs` rather than hard-coded, because the
# query is drawn at random and a fixed id only matches one particular draw.
if relevant_docs.empty:
    print("No relevant documents were retrieved for this query.")
else:
    relevant_doc_id = relevant_docs['doc_id'].iloc[0]
    print(remove_html_tags(answers_df[answers_df['Id'] == relevant_doc_id]['Text'].values[0]))

Is the answer: It ?We use #### everyday. Well of course we use it everydayWe use #### when we write, play video games, exercise, eat, heck, I'm using #### as I'm typing!! It is written at the begining of the riddle#### is something that we all can give to others, yet it is not good to do so. I guess giving it to others is not a good thingWe hate to feel ####, but we all have in our lives. Here I am having some troubles, but I guess we hate to feel it, but we have it in our lives ?


In [42]:
# Retrieved documents whose relevance is NaN, i.e. the left join against the
# qrels found no row for them; these are treated as irrelevant here.
has_no_relevance = query_results['relevance'].isna()
irrelevant_docs = query_results.loc[has_no_relevance]
irrelevant_docs

Unnamed: 0,doc_id,rank,relevance
0,30542,1,
1,23036,2,
2,35423,3,
3,121241,4,
4,4481,5,
...,...,...,...
95,44076,96,
96,11097,97,
97,35164,98,
98,43650,99,


In [43]:
# Print the answer text of the first document without a relevance value.
# The doc id is taken from `irrelevant_docs` rather than hard-coded, because
# the query is drawn at random and a fixed id only matches one particular draw.
if irrelevant_docs.empty:
    print("Every retrieved document has a relevance value for this query.")
else:
    irrelevant_doc_id = irrelevant_docs['doc_id'].iloc[0]
    print(remove_html_tags(answers_df[answers_df['Id'] == irrelevant_doc_id]['Text'].values[0]))

The answer to the riddle is:   your word
