In [None]:
# Mount Google Drive so that the dataset files referenced below under
# /content/drive/MyDrive are readable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install nltk==3.4

Collecting nltk==3.4
  Downloading nltk-3.4.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 4.2 MB/s 
Collecting singledispatch
  Downloading singledispatch-3.7.0-py2.py3-none-any.whl (9.2 kB)
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.4-py3-none-any.whl size=1436395 sha256=61cfbaa3445ffc42e68afb96f516f99ecdb5ac5065d1050e9c448f31708be263
  Stored in directory: /root/.cache/pip/wheels/13/b8/81/2349be11dd144dc7b68ab983b58cd2fae353cdc50bbdeb09d0
Successfully built nltk
Installing collected packages: singledispatch, nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.4 singledispatch-3.7.0


In [None]:
import json
import string
import re
from nltk import ngrams
import collections
from typing import List

In [None]:
def normalize_answer(s: str) -> str:
  """Return a canonical form of `s` for token-level comparison.

  The text is lower-cased, every character in `string.punctuation` is
  deleted, the standalone words "a", "an" and "the" are blanked out, and
  whitespace runs are collapsed to single spaces (leading and trailing
  whitespace is dropped).
  """
  punctuation = set(string.punctuation)
  article_pattern = re.compile(r'\b(a|an|the)\b', re.UNICODE)

  text = s.lower()
  # Punctuation is stripped before articles, so "a-b" becomes "ab" rather
  # than having its leading "a" treated as an article.
  text = ''.join(ch for ch in text if ch not in punctuation)
  text = article_pattern.sub(' ', text)
  return ' '.join(text.split())

In [None]:
def compute_f1_from_tokens(gold_toks: List[str], pred_toks: List[str]) -> float:
  """Token-level F1 between a gold and a predicted token sequence.

  Args:
    gold_toks: Reference tokens.
    pred_toks: Predicted tokens. Any sequence type is accepted (for example
      the tuples yielded by `nltk.ngrams`), not only lists.

  Returns:
    The harmonic mean of token precision and recall, as a float in [0, 1].
    Tokens are matched as a multiset, ignoring order. If either sequence is
    empty, the score is 1.0 when both are empty and 0.0 otherwise.
  """
  gold_toks = list(gold_toks)
  pred_toks = list(pred_toks)

  if not gold_toks or not pred_toks:
    # Both sides are lists here on purpose: `[] == ()` is False, so comparing
    # a list with a tuple directly would score two empty answers as 0.
    return float(gold_toks == pred_toks)

  common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
  num_same = sum(common.values())
  if num_same == 0:
    return 0.0

  precision = num_same / len(pred_toks)
  recall = num_same / len(gold_toks)
  return (2 * precision * recall) / (precision + recall)

In [None]:
# Index of the normalised collection paragraphs, grouped by source page:
# base_id -> [(paragraph_id, normalised_contents), ...]
paras = {}

collection_dir = '/content/drive/MyDrive/Datasets/QReCC/collection_paragraph/wayback/'

for shard in range(4):
  # JSON text is UTF-8; being explicit keeps the read independent of locale.
  with open(collection_dir + str(shard) + '.jsonl', encoding='utf-8') as shard_file:
    for line in shard_file:
      record = json.loads(line)
      # Named `para_id` rather than `id` so the builtin `id` is not shadowed
      # for the rest of the notebook.
      para_id = record['id']
      # Keep the text after the first '_/' that follows 'web.archive' and drop
      # the last '_'-separated component (presumably the paragraph index --
      # TODO confirm). The result is later matched against 'Answer_URL'.
      base_id = '_'.join(para_id.split('web.archive')[1].split('_/')[1].split('_')[:-1])
      paras.setdefault(base_id, []).append((para_id, normalize_answer(record['contents'])))

In [None]:
def make_examples(path, d, threshold=0.8):
  """Build passage-grounded examples from a QReCC JSON file.

  For every example whose 'Answer_URL' is a key of `d`, each candidate
  passage is scanned with a sliding window that is as many tokens long as the
  normalised answer. The window with the highest token F1 against the answer
  is kept, together with the passage it came from.

  Args:
    path: Path to a JSON file holding a list of dicts with the keys
      'Context', 'Question', 'Rewrite', 'Answer', 'Answer_URL', 'Turn_no' and
      'Conversation_no'.
    d: Mapping from URL to a list of (passage_id, normalised_passage_text)
      tuples.
    threshold: Minimum best-window F1 for an example to count as answerable.

  Returns:
    A list of dicts with the keys 'f1_answer', 'answer', 'context',
    'question', 'rewrite', 'url', 'passage', 'turn_no' and
    'conversation_no'. Examples whose URL is empty or missing from `d` are
    dropped. When the best F1 is below `threshold`, both 'f1_answer' and
    'answer' are set to 'no_ans'. If no window could be formed at all,
    'passage' is the first candidate passage for the URL ('' if there is
    none).
  """
  # Read everything up front so the file is not held open while matching.
  with open(path, encoding='utf-8') as f:
    data = json.load(f)

  example_list = []
  for example in data:
    url = example['Answer_URL']
    if url == '' or url not in d:
      continue

    answer = normalize_answer(example['Answer'])
    answer_toks = answer.split()  # tokenised once, reused for every window
    n = len(answer_toks)

    candidates = d[url]
    # Per-example defaults: without them an example for which no window can
    # be formed (e.g. every passage is shorter than the answer) would inherit
    # the passage and span selected for the previous example.
    score = -1
    passage = candidates[0][1] if candidates else ''
    f1_answer = 'no_ans'

    for tup in candidates:
      p = tup[1]
      for gram in ngrams(p.split(), n):  # breaking passage into ngrams
        s = compute_f1_from_tokens(answer_toks, gram)
        if s > score:
          score = s
          passage = p
          f1_answer = ' '.join(gram)

    # An example counts as answerable only if some window is close enough to
    # the answer; the best-scoring passage is kept either way.
    has_answer = score >= threshold

    example_list.append({
      'f1_answer': f1_answer if has_answer else 'no_ans',
      'answer': answer if has_answer else 'no_ans',
      'context': normalize_answer(' '.join(example['Context'])),
      'question': normalize_answer(example['Question']),
      'rewrite': normalize_answer(example['Rewrite']),
      'url': url,
      'passage': passage,
      'turn_no': example['Turn_no'],
      'conversation_no': example['Conversation_no'],
    })

  return example_list
    
# Build the processed train and test splits from the raw QReCC files, using
# the paragraph index `paras` as the URL -> passages lookup.
qrecc = make_examples('/content/drive/MyDrive/Datasets/QReCC/qrecc_train.json', paras)
qrecc_test = make_examples('/content/drive/MyDrive/Datasets/QReCC/qrecc_test.json', paras)

In [None]:
len(qrecc)

24037

In [None]:
len(qrecc)

24037

In [None]:
qrecc[600]

{'answer': 'in interview with zone magazine m john harrison says i liked anything bizarre from about four years old i started on dan dare and worked up to absurdists',
 'context': 'what was m john harrisons first novel committed men is m john harrisons first novel did m john harrison write in any other genre before writing science fiction in m john harrisons early years his hobbies included writing pastiches of h h munro how popular were m john harrisons first stories m john harrisons first short story was published during 1966 by kyril bonfiglioli at science fantasy magazine on strength of which he relocated to london',
 'conversation_no': 215,
 'f1_answer': 'in interview with zone magazine harrison says i liked anything bizarre from being about four years old i started on dan dare and worked up to absurdists at',
 'passage': 'in interview with zone magazine harrison says i liked anything bizarre from being about four years old i started on dan dare and worked up to absurdists at 15 y

In [None]:
qrecc[0]

{'Answer': "During that era, a single taken from 1988's OU812, When It's Love, reached the Top Five, peaking at No. 5. In addition, Van Halen was nominated for two Grammy Awards.",
 'Answer_URL': 'https://en.wikipedia.org/wiki/Van_Halen',
 'Context': ['What can you tell me about Gary Cherone?',
  'Gary Francis Caine Cherone is an American rock singer and songwriter, known for his work as the lead vocalist of Extreme and for his short stint for Van Halen.',
  'Did Gary Cherone sing well?',
  'Yes, Gary Cherone is also known for his work as the lead vocalist of the Boston rock group Extreme.',
  'What significant fact can you tell me about Gary Cherone that you liked?',
  'I like that Gary Cherone remained in contact and on good terms with Van Halen.',
  'What did Gary Cherone do after Van Halen?',
  'After his departure from Van Halen, Gary Cherone returned to Boston and put together a new project, Tribe of Judah.',
  'Did they release any albums after Gary Cherone left?',
  'After Gary

In [None]:
# Persist the processed splits. Mode 'w' rather than 'a': appending on a
# re-run would leave two JSON documents back to back in the same file, which
# json.load cannot parse.
with open('qrecc_train.json', 'w') as f:
  json.dump(qrecc, f)

with open('qrecc_test.json', 'w') as f:
  json.dump(qrecc_test, f)

In [None]:
len(qrecc)

24037

In [None]:
# Number of training examples whose answer was replaced by 'no_ans'.
count = sum(1 for example in qrecc if example['answer'] == 'no_ans')

print(count)

8726
