In [1]:
from collections import defaultdict
import numpy as np
import re
import pickle
import string
import sys
# adding utils to the system path
sys.path.insert(0, '../../utils')
from functions import make_tokens


# Locations of the Cranfield collection files, relative to this notebook
PATH_TO_CRAN_TXT = '../cran/cran.all.1400'
PATH_TO_CRAN_QRY = '../cran/cran.qry'
PATH_TO_CRAN_REL = '../cran/cranqrel_bin.txt'

punctuation = set(string.punctuation)

# Regexes used to split the text into chunks at the start of each marker.
# Raw strings are required: in a normal string literal '\.' is an invalid
# escape sequence (a warning today, an error in a future Python version).
I_marker = re.compile(r'\.I.')  # '.I' plus one following character; for articles and queries
ABTW_marker = re.compile(r'\.[ABTW]')  # '.A', '.B', '.T' or '.W'; for articles
W_marker = re.compile(r'\.[W]')  # '.W'; for queries

def import_data(PATH_TO_FILE, marker_docId):
  """
  Reads the file and splits its text into entries at the given ID marker.

  Every newline is replaced by a single space before splitting. The chunk
  that precedes the first marker (empty when the file starts with the
  marker) is removed.

  Input:
    PATH_TO_FILE: path to the file to be read
    marker_docId: regex (string or compiled pattern) at which the text is split
  Output:
    lines: list of strings, each string is an entry of the file.
           An empty list is returned when the file cannot be opened or read.
  Raises:
    Errors unrelated to file access (e.g. an invalid marker) are propagated
    instead of being reported as a missing file.
  """
  try:
    with open(PATH_TO_FILE, 'r') as f:
      text = f.read().replace('\n', " ")
  except OSError as err:
    # Only I/O failures (missing file, permissions, ...) are treated as
    # "no data"; anything else is a programming error and must surface.
    print("File doesn't exist or can't be read:", err)
    return []

  lines = re.split(marker_docId, text)
  lines.pop(0)  # removes the chunk before the first marker
  return lines
 

def get_text_only(txt_list, marker_text):
  """
    Extracts the text content of every entry and tokenizes it.

    Each entry is split at marker_text. When the split yields more than two
    chunks, the fifth chunk (index 4) is taken as the text content; otherwise
    the second chunk (index 1) is taken. The selected chunk is handed to
    make_tokens (imported from the utils `functions` module), which performs
    the actual cleaning/tokenization -- presumably lowercasing and stripping
    punctuation; verify against that helper.

    Input:
      txt_list: list of strings, each string is an entry of the file.
      marker_text: regex at which we want to split each entry
    Output:
      list with one element per entry, in the same order as txt_list; each
      element is whatever make_tokens returns for that entry's text content.
  """

  def _text_chunk(entry):
    # Split the entry into chunks at the start of each tag
    chunks = re.split(marker_text, entry)
    if len(chunks) > 2:
      return chunks[4]
    return chunks[1]

  return [make_tokens(_text_chunk(entry)) for entry in txt_list]


def import_relevance(PATH_TO_FILE):
  """
  Imports all the relevant articles for each query, returning a dictionary.

  The file is read as a whitespace-separated integer table. For every row
  whose fourth column is 1, the first column (query ID) and the third column
  (docID) are both shifted by -1 and stored, so keys and values are
  zero-based. Rows with any other value in the fourth column are ignored.
  It is used to give user feedback in the relevance feedback.

  Input:
    PATH_TO_FILE: path to the file to be read
  Output:
    relevance: defaultdict(set) mapping query index (int) to the set of
               relevant document indices (int). It is empty when the file
               cannot be opened.
  """
  relevance = defaultdict(set)

  try:
    # The context manager guarantees the handle is closed after parsing.
    with open(PATH_TO_FILE, 'r') as cran_rel_data:
      # ndmin=2 keeps a one-row file two-dimensional, so iterating always
      # yields whole rows rather than scalars.
      cran_np = np.loadtxt(cran_rel_data, dtype=int, ndmin=2)
  except OSError as err:
    print("File doesn't exist or can't be read:", err)
    return relevance

  for row in cran_np:
    if row[3] == 1:
      # Plain Python ints keep the mapping (and its pickle) independent of numpy.
      relevance[int(row[0]) - 1].add(int(row[2]) - 1)

  return relevance


In [2]:
# Documents: split the collection into '.I' entries, then tokenize each entry's text
txt_list = import_data(PATH_TO_CRAN_TXT, I_marker)
articles = get_text_only(txt_list, ABTW_marker)

# Queries: same two steps, with only the '.W' marker splitting each entry
qry_list = import_data(PATH_TO_CRAN_QRY, I_marker)
queries = get_text_only(qry_list, W_marker)

# Relevance judgements: query index -> set of relevant document indices
relevance = import_relevance(PATH_TO_CRAN_REL)

In [48]:
# Show the set of relevant docIDs stored under the first entry of the relevance dictionary
print([*relevance.values()][0])

{11, 12, 13, 14, 141, 28, 29, 30, 36, 50, 51, 183, 184, 56, 55, 65, 194, 461, 857, 858, 94, 101, 874, 875, 878, 879, 496, 377}


In [3]:
# Show the tokens of articles[13] ten per row, followed by the total token count
for start in range(0, len(articles[13]), 10):
    print(articles[13][start:start + 10])
print(len(articles[13]))

['piston', 'theory', 'a', 'new', 'aerodynamic', 'tool', 'for', 'the', 'aeroelastician', 'representative']
['applications', 'are', 'described', 'which', 'illustrate', 'the', 'extent', 'to', 'which', 'simplifications']
['in', 'the', 'solutions', 'of', 'high', 'speed', 'unsteady', 'aeroelastic', 'problems', 'can']
['be', 'achieved', 'through', 'the', 'use', 'of', 'certain', 'aerodynamic', 'techniques', 'known']
['collectively', 'as', 'piston', 'theory', 'based', 'on', 'a', 'physical', 'model', 'originally']
['proposed', 'by', 'hayes', 'and', 'lighthill', 'piston', 'theory', 'for', 'airfoils', 'and']
['finite', 'wings', 'has', 'been', 'systematically', 'developed', 'by', 'landahl', 'utilizing', 'expansions']
['in', 'powers', 'of', 'the', 'thickness', 'ratio', 'and', 'the', 'inverse', 'of']
['the', 'flight', 'mach', 'number', 'm', 'when', 'contributions', 'of', 'orders', 'and']
['are', 'negligible', 'the', 'theory', 'predicts', 'a', 'point', 'function', 'relationship', 'between']
['the', 'l

In [50]:
# Print the tokens of articles[13] as one space-separated line
print(*articles[13])


piston theory a new aerodynamic tool for the aeroelastician representative applications are described which illustrate the extent to which simplifications in the solutions of high speed unsteady aeroelastic problems can be achieved through the use of certain aerodynamic techniques known collectively as piston theory based on a physical model originally proposed by hayes and lighthill piston theory for airfoils and finite wings has been systematically developed by landahl utilizing expansions in powers of the thickness ratio and the inverse of the flight mach number m when contributions of orders and are negligible the theory predicts a point function relationship between the local pressure on the surface of a wing and the normal component of fluid velocity produced by the wing s motion the computation of generalized forces in aeroelastic equations such as the flutter determinant is then always reduced to elementary integrations of the assumed modes of motion essentially closed form sol

In [7]:
# Print the query at index 19 (the 20th query) as one space-separated line
print(str.join(' ', queries[19]))

has anyone formally determined the influence of joule heating produced by the induced current in magnetohydrodynamic free convection flows under general conditions


In [12]:
# Persist the three parsed structures as pickle files in the current working directory
for pkl_name, payload in (
    ("articles.pkl", articles),
    ("queries.pkl", queries),
    ("relevance.pkl", relevance),
):
    with open(pkl_name, 'wb') as f:
        pickle.dump(payload, f)