In [7]:
from collections import defaultdict
import numpy as np
import re
import pickle

# Locations of the Cranfield collection files, relative to this notebook.
PATH_TO_CRAN_TXT = '../cran/cran.all.1400'
PATH_TO_CRAN_QRY = '../cran/cran.qry'
PATH_TO_CRAN_REL = '../cran/cranqrel_bin.txt'

# Regexes used to split the text into chunks at the start of each marker.
# Raw strings are used so that '\.' reaches the regex engine as an escaped dot
# instead of being parsed as an (invalid) Python string escape.
I_marker = re.compile(r'\.I.')  # for articles and queries: '.I' plus the one character that follows it
# Members of a character class are listed without separators: writing
# '[A,B,T,W]' also made ',' a member, so '.,' inside a record was treated as a
# field marker and shifted the position of every later chunk.
ABTW_marker = re.compile(r'\.[ABTW]')  # for articles
W_marker = re.compile(r'\.W')  # for queries

def import_data(PATH_TO_FILE, marker_docId):
  """
  Reads the file and splits its text into entries at the ID marker '.I'.

  Newlines are replaced by spaces before splitting, so every entry is a
  single-line string. The chunk that precedes the first marker (empty when the
  file starts with a marker) is not an entry and is dropped.

  Input:
    PATH_TO_FILE: path to the file to be read
    marker_docId: regex (pattern string or compiled pattern) at which the text
      is split
  Output:
    lines: list of strings, each string is an entry of the file. An empty list
      is returned, after printing a message, when the file cannot be read.
  Raises:
    Errors that are not file-access problems (for example a marker that is not
    a valid regex) are propagated instead of being reported as a missing file.
  """
  # Only the read itself is guarded: a blanket `except` here would also hide
  # unrelated failures behind a misleading "File doesn't exist" message.
  try:
    with open(PATH_TO_FILE, 'r') as f:
      text = f.read().replace('\n', " ")
  except FileNotFoundError:
    print("File doesn't exist")
    return []
  except OSError as err:
    print(f"Could not read {PATH_TO_FILE}: {err}")
    return []

  lines = re.split(marker_docId, text)
  lines.pop(0)  # removes the chunk before the first marker (re.split always returns at least one item)
  return lines
 

def get_text_only(doc, marker_text):
  """
    Splits each entry into chunks at the start of each tag and keeps only the
    chunk that holds the '.W' text. Every character that is neither an ASCII
    letter nor whitespace is then removed and the text is split into words.
    Letter case is left unchanged.

    The chunk is chosen by position:
      - at least 5 chunks (id + four tagged fields): the chunk at index 4;
      - 2 to 4 chunks: the last chunk (for 2 chunks this is index 1, the text
        following the single tag). NOTE(review): for 3 or 4 chunks this assumes
        the '.W' field is the last one in the entry -- confirm against the data;
      - 1 chunk (no tag found): empty text.
    One token list is appended per entry in every case, so positions in the
    output always line up with positions in `doc`.

    Input:
      doc: list of strings, each string is an entry of the file
      marker_text: regex at which we want to split the text
    Output:
      doc_tokens: list of lists, each list contains the words of one entry
  """
  doc_tokens = []
  for line in doc:
    # Split the entry into chunks at the start of each tag
    entries = re.split(marker_text, line)
    # Pick the chunk holding the text without indexing past the end of the
    # list when an entry has fewer tagged fields than expected
    if len(entries) > 4:
      text = entries[4]
    elif len(entries) > 1:
      text = entries[-1]
    else:
      text = ''
    # Remove everything that is not a letter or whitespace
    text = re.sub(r'[^a-zA-Z\s]+', '', text)
    # Split on whitespace to obtain the list of words of this entry
    doc_tokens.append(text.split())
  return doc_tokens


def import_relevance(PATH_TO_FILE):
  """
  Imports all the relevant articles for each query, returning a dictionary.

  Each row of the file is read as whitespace-separated integers. Column 0 is
  used as the query ID, column 2 as the docID and column 3 as the relevance
  flag; only rows whose flag equals 1 are kept. Both IDs are shifted by -1
  before being stored. It is used to give user feedback in the relevance
  feedback.

  Input:
    PATH_TO_FILE: path to the file to be read
  Output:
    relevance: defaultdict(set) mapping (query ID - 1) to the set of
      (docID - 1) values of its relevant documents, all as plain Python ints.
      It is empty, after printing a message, when the file cannot be read.
  """
  relevance = defaultdict(set)

  try:
    # The context manager closes the handle even if parsing fails.
    with open(PATH_TO_FILE, 'r') as cran_rel_data:
      # ndmin=2 keeps a one-row file as a table of rows instead of a flat vector
      cran_np = np.loadtxt(cran_rel_data, dtype=int, ndmin=2)
  except FileNotFoundError:
    print("File doesn't exist")
    return relevance
  except OSError as err:
    print(f"Could not read {PATH_TO_FILE}: {err}")
    return relevance

  for row in cran_np:
    if row[3] == 1:
      # int() stores built-in ints rather than NumPy scalars, so the pickled
      # dictionary and its printed form do not depend on NumPy
      relevance[int(row[0]) - 1].add(int(row[2]) - 1)

  return relevance

In [8]:
# Split both raw files into one string per '.I' record (articles first, then queries).
txt_list, qry_list = (
    import_data(source_path, I_marker)
    for source_path in (PATH_TO_CRAN_TXT, PATH_TO_CRAN_QRY)
)

# Reduce every record to the list of words of its text field.
articles = get_text_only(txt_list, ABTW_marker)
queries = get_text_only(qry_list, W_marker)

# Relevance judgements used later as user feedback.
relevance = import_relevance(PATH_TO_CRAN_REL)

In [9]:
# Show the relevant docIDs stored under the first key of the relevance dictionary
print(list(relevance.values())[0])
# The value printed is a set, so the order of the IDs carries no meaning

{11, 12, 13, 14, 141, 28, 29, 30, 36, 50, 51, 183, 184, 56, 55, 65, 194, 461, 857, 858, 94, 101, 874, 875, 878, 879, 496, 377}


In [10]:
# Show the word list extracted for the article at list index 12
print(articles[12])

['similarity', 'laws', 'for', 'stressing', 'heated', 'wings', 'it', 'will', 'be', 'shown', 'that', 'the', 'differential', 'equations', 'for', 'a', 'heated', 'plate', 'with', 'large', 'temperature', 'gradient', 'and', 'for', 'a', 'similar', 'plate', 'at', 'constant', 'temperature', 'can', 'be', 'made', 'the', 'same', 'by', 'a', 'proper', 'modification', 'of', 'the', 'thickness', 'and', 'the', 'loading', 'for', 'the', 'isothermal', 'plate', 'this', 'fact', 'leads', 'to', 'the', 'result', 'that', 'the', 'stresses', 'in', 'the', 'heated', 'plate', 'can', 'be', 'calculated', 'from', 'measured', 'strains', 'on', 'the', 'unheated', 'plate', 'by', 'a', 'series', 'of', 'relations', 'called', 'the', 'similarity', 'laws', 'the', 'application', 'of', 'this', 'analog', 'theory', 'to', 'solid', 'wings', 'under', 'aerodynamic', 'heating', 'is', 'discussed', 'in', 'detail', 'the', 'loading', 'on', 'the', 'unheated', 'analog', 'wing', 'is', 'however', 'complicated', 'and', 'involves', 'the', 'novel', '

In [11]:
# Show the word list extracted for the first query (list index 0)
print(queries[0])

['what', 'similarity', 'laws', 'must', 'be', 'obeyed', 'when', 'constructing', 'aeroelastic', 'models', 'of', 'heated', 'high', 'speed', 'aircraft']


In [12]:
# Persist the three parsed structures, one pickle file each, in the working directory.
for pickle_name, payload in (
    ("articles.pkl", articles),
    ("queries.pkl", queries),
    ("relevance.pkl", relevance),
):
    with open(pickle_name, 'wb') as f:
        pickle.dump(payload, f)