# Parser

Takes files in the URL scraper's output format and converts them into the input format expected by the Python ML models.

In [2]:
import json

import nltk

import random

In [4]:
# Per-source count of items that the pairing code drops from the END of each
# article's 'text' before tokenising (presumably scraper boilerplate such as
# footers -- confirm against the scraper output).
TRASH = {
    "dailymail.json": 1,
    "guardian.json": 1,
    "bbc.json": 3,
}

# Scraper output files to process, in order. Derived from TRASH so every
# input file is guaranteed to have a trash count.
FILES = list(TRASH)

# Path of the JSON file the generated sentence pairs are written to.
OUTPUT = "pairs_unlabelled.json"

In [6]:
def _article_sentences(entry, trailing_junk):
    """Return the sentences of one scraped article, minus its trailing junk.

    Drops the last ``trailing_junk`` items of ``entry['text']``, joins the
    remaining items with single spaces and splits the result into sentences
    with ``nltk.sent_tokenize``.
    """
    text = entry['text']
    # Same result as text[:-trailing_junk] for any positive count, but a
    # count of 0 keeps the whole article instead of slicing it to nothing.
    kept = text[:max(len(text) - trailing_junk, 0)]
    return nltk.sent_tokenize(" ".join(kept))


# Fixed seed so the shuffle (and therefore the article split) is reproducible.
random.seed(42)

result = {"data": []}
for file in FILES:
    # JSON is UTF-8 by specification; do not rely on the platform default.
    # The file is only needed for loading, so close it before processing.
    with open(file, encoding="utf-8") as f:
        all_entries = json.load(f)

    random.shuffle(all_entries)
    half = len(all_entries) // 2
    trash = TRASH[file]

    # First half of the shuffled articles: adjacent sentences of the same
    # article become "continuation" pairs (ans = 0).
    for entry in all_entries[:half]:
        sentences = _article_sentences(entry, trash)
        # NOTE(review): sen1 is the LATER sentence and sen2 the earlier one.
        # Kept as-is to preserve the existing output; confirm this is the
        # order the downstream models expect.
        for s1, s2 in zip(sentences[1:], sentences[:-1]):
            result["data"].append(
                {
                    "sen1": s1,
                    "sen2": s2,
                    "ans": 0,  # Sentences are adjacent in the same article
                }
            )

    # Second half: build "random" pairs (ans = 1). Each article goes wholly
    # into whichever list is currently shorter (sen2 on ties), so the two
    # sentences of a pair always come from different articles.
    sen1 = []
    sen2 = []
    for entry in all_entries[half:]:
        sentences = _article_sentences(entry, trash)
        if len(sen1) < len(sen2):
            sen1.extend(sentences)
        else:
            sen2.extend(sentences)

    # zip stops at the shorter list, so surplus sentences are simply unused.
    for s1, s2 in zip(sen1, sen2):
        result["data"].append(
            {
                "sen1": s1,
                "sen2": s2,
                "ans": 1,  # Sentences are NOT a continuation of each other
            }
        )

with open(OUTPUT, "w", encoding="utf-8") as f:
    json.dump(result, f)