In [34]:
import re
from collections import defaultdict, OrderedDict
import pickle
import numpy as np

In [2]:
# Locations of the three input files, relative to the notebook's working directory.
PATH_TO_CRAN_TXT, PATH_TO_CRAN_QRY, PATH_TO_CRAN_REL = (
  '../cran/cran.all.1400',  # read into txt_list
  '../cran/cran.qry',       # read into qry_list
  '../cran/cranqrel',       # read into cran_rel / relevance
)

In [4]:
# Regex used to split the text at the '.I' ID marker. The trailing '.' matches
# (and therefore consumes) the single character that follows '.I'.
# Written as a raw string so that '\.' reaches the regex engine unchanged
# instead of being an invalid string escape.
ID_marker = re.compile(r'\.I.')

def get_data(PATH_TO_FILE, marker_docId):
  """
  Read a text file and split it into one string per entry.

  Every newline is replaced by a space, then the text is split wherever
  'marker_docId' matches. Whatever precedes the first match (an empty
  string when the file starts with the marker) is discarded.

  Parameters
  ----------
  PATH_TO_FILE : path of the file to read.
  marker_docId : regex (pattern string or compiled pattern) handed to re.split.

  Returns
  -------
  list of str, one item per entry in file order. An empty list is returned,
  after printing a message, when the file cannot be opened or read.

  Only I/O errors (OSError) are handled here. Any other problem, such as an
  invalid marker, is raised to the caller instead of being reported as a
  missing file.
  """
  lines = []
  try:
    with open(PATH_TO_FILE, 'r') as f:
      text = f.read().replace('\n', " ")
  except OSError as err:
    # Keep the best-effort contract (empty result) but say what really failed.
    print("Could not read {}: {}".format(PATH_TO_FILE, err))
    return lines

  # Parsing happens outside the try block so its errors are never swallowed.
  lines = re.split(marker_docId, text)
  lines.pop(0)  # drop the chunk that precedes the first marker
  return lines
 
# Split both input files into raw entries using the same ID marker.
txt_list, qry_list = [
  get_data(source_path, ID_marker)
  for source_path in (PATH_TO_CRAN_TXT, PATH_TO_CRAN_QRY)
]

In [18]:
# Show the first entry of each list, then the number of entries in each list
for raw_entries in (txt_list, qry_list):
  print(raw_entries[0])
for raw_entries in (txt_list, qry_list):
  print(len(raw_entries))

1 .T experimental investigation of the aerodynamics of a wing in a slipstream . .A brenckman,m. .B j. ae. scs. 25, 1958, 324. .W experimental investigation of the aerodynamics of a wing in a slipstream .   an experimental study of a wing in a propeller slipstream was made in order to determine the spanwise distribution of the lift increase due to slipstream at different angles of attack of the wing and at different free stream to slipstream velocity ratios .  the results were intended in part as an evaluation basis for different theoretical treatments of this problem .   the comparative span loading curves, together with supporting evidence, showed that a substantial part of the lift increment produced by the slipstream was due to a /destalling/ or boundary-layer-control effect .  the integrated remaining lift increment, after subtracting this destalling lift, was found to agree well with a potential flow theory .   an empirical evaluation of the destalling effects was made for the spe

In [15]:
# Regex to split an entry into chunks at the start of each tag (.A, .B, .T, .W).
# Inside a character class a comma is a literal, so the former '[A,B,T,W]'
# also split on '.,' (e.g. "etc.,"); the class now lists the tag letters only.
# Raw strings keep '\.' from being an invalid string escape.
chunk_start = re.compile(r'\.[ABTW]')
# Regex matching only the .W tag.
text_start = re.compile(r'\.W')

def get_useful_data(doc, marker_text):
  """
  Turn every entry of 'doc' into a list of words taken from one of its chunks.

  Each entry is split wherever 'marker_text' matches. The chunk at index 4 is
  kept when the split produces more than two chunks, otherwise the chunk at
  index 1. When the entry has fewer chunks than that index requires, the last
  chunk is used instead; when the marker does not occur at all, the entry
  contributes an empty word list.

  The kept chunk is stripped of every character that is neither an ASCII
  letter nor whitespace (so hyphenated words are glued together) and is then
  split on whitespace. Letter case is left untouched.

  Parameters
  ----------
  doc         : iterable of str, one item per entry.
  marker_text : regex (pattern string or compiled pattern) handed to re.split.

  Returns
  -------
  list of list of str, one word list per entry, in input order.
  """
  doc_tokens = []
  for line in doc:
    entries = re.split(marker_text, line)  # chunks delimited by the tags
    # Position of the wanted chunk: 4 for entries split on several tags,
    # 1 for entries split on a single tag.
    wanted = 4 if len(entries) > 2 else 1
    if len(entries) < 2:
      text = ''  # no tag found, so there is nothing to keep
    else:
      # Clamp to the last chunk so entries with missing tags do not raise.
      text = entries[min(wanted, len(entries) - 1)]
    # Remove every run of characters that are not letters or whitespace
    text = re.sub(r'[^a-zA-Z\s]+', '', text)
    doc_tokens.append(text.split())  # split into words on whitespace
  return doc_tokens

 
# Tokenise the document entries and the query entries, each with its own tag regex.
articles, queries = [
  get_useful_data(raw_entries, tag_regex)
  for raw_entries, tag_regex in ((txt_list, chunk_start), (qry_list, text_start))
]

['1 ', ' experimental investigation of the aerodynamics of a wing in a slipstream . ', ' brenckman,m. ', ' j. ae. scs. 25, 1958, 324. ', ' experimental investigation of the aerodynamics of a wing in a slipstream .   an experimental study of a wing in a propeller slipstream was made in order to determine the spanwise distribution of the lift increase due to slipstream at different angles of attack of the wing and at different free stream to slipstream velocity ratios .  the results were intended in part as an evaluation basis for different theoretical treatments of this problem .   the comparative span loading curves, together with supporting evidence, showed that a substantial part of the lift increment produced by the slipstream was due to a /destalling/ or boundary-layer-control effect .  the integrated remaining lift increment, after subtracting this destalling lift, was found to agree well with a potential flow theory .   an empirical evaluation of the destalling effects was made f

In [20]:
# Length of each token list, then the words of the first article and first query
for token_lists in (articles, queries):
  print(len(token_lists))
for token_lists in (articles, queries):
  print(token_lists[0])

1400
225
['experimental', 'investigation', 'of', 'the', 'aerodynamics', 'of', 'a', 'wing', 'in', 'a', 'slipstream', 'an', 'experimental', 'study', 'of', 'a', 'wing', 'in', 'a', 'propeller', 'slipstream', 'was', 'made', 'in', 'order', 'to', 'determine', 'the', 'spanwise', 'distribution', 'of', 'the', 'lift', 'increase', 'due', 'to', 'slipstream', 'at', 'different', 'angles', 'of', 'attack', 'of', 'the', 'wing', 'and', 'at', 'different', 'free', 'stream', 'to', 'slipstream', 'velocity', 'ratios', 'the', 'results', 'were', 'intended', 'in', 'part', 'as', 'an', 'evaluation', 'basis', 'for', 'different', 'theoretical', 'treatments', 'of', 'this', 'problem', 'the', 'comparative', 'span', 'loading', 'curves', 'together', 'with', 'supporting', 'evidence', 'showed', 'that', 'a', 'substantial', 'part', 'of', 'the', 'lift', 'increment', 'produced', 'by', 'the', 'slipstream', 'was', 'due', 'to', 'a', 'destalling', 'or', 'boundarylayercontrol', 'effect', 'the', 'integrated', 'remaining', 'lift', 'i

In [24]:
# Maps each query id (int) to the list of document ids listed for it in the
# file. Document ids are kept as strings, in file order.
cran_rel = defaultdict(list)

with open(PATH_TO_CRAN_REL, 'r') as f:
  for line in f:
    # str.split() separates on any run of whitespace and drops the trailing
    # newline, so repeated spaces or a two-column line cannot corrupt the ids.
    fields = line.split()
    if not fields:
      continue  # ignore blank lines instead of failing on int('')
    cran_rel[int(fields[0])].append(fields[1])  # append the document id to the query id

In [32]:
# Show the first (query id, document ids) pair stored in cran_rel
first_cran_rel_entry = list(cran_rel.items())[0]
print(first_cran_rel_entry)


(1, ['184', '29', '31', '12', '51', '102', '13', '14', '15', '57', '378', '859', '185', '30', '37', '52', '142', '195', '875', '56', '66', '95', '462', '497', '858', '876', '879', '880', '486'])


In [38]:
# Load the whole file as an integer table. The context manager closes the
# file handle, and ndmin=2 keeps the result two-dimensional even when the
# file holds a single row, so the loop below always iterates over rows.
with open(PATH_TO_CRAN_REL) as cran_rel_data:
  cran_np = np.loadtxt(cran_rel_data, dtype=int, ndmin=2)

# Maps the first column of each row to the set of tuples built from the
# remaining columns. Keys and tuple members are NumPy integer scalars.
relevance = defaultdict(set)
for row in cran_np:
  relevance[row[0]].add(tuple(row[1:]))

In [40]:
# Show the first (key, set of tuples) pair stored in relevance
first_relevance_entry = list(relevance.items())[0]
print(first_relevance_entry)

(1, {(497, 3), (14, 4), (37, 3), (30, 3), (57, 2), (12, 3), (102, 3), (56, 3), (879, 3), (378, 2), (51, 3), (875, 2), (13, 4), (486, -1), (462, 4), (15, 4), (184, 2), (95, 3), (185, 3), (52, 4), (876, 3), (195, 4), (859, 2), (66, 3), (858, 3), (880, 3), (29, 2), (31, 2), (142, 4)})


In [42]:
# Persist the three results, one pickle file per object.
for pickle_path, payload in (
    ("../articles.pkl", articles),
    ("../queries.pkl", queries),
    ("../relevance.pkl", relevance),
):
    with open(pickle_path,'wb') as f:
        pickle.dump(payload,f)