In [3]:
import csv, os
import sqlite3, random
from collections import Counter

import nltk
import tqdm
import tqdm.notebook  # explicit submodule import so `tqdm.notebook.tqdm` resolves

# This folder can be obtained directly from the Newsela team
# by contacting them at: https://newsela.com/about/resources/research/

newsela_folder = "/home/davidbeauchemin/Github/keep_it_simple/datastore"

# Column order of articles_metadata.csv; every row is mapped onto these keys by position.
keys = ["slug", "language", "title", "grade_level", "version", "filename"]
# slug -> list of the English article dicts that share that slug.
slug2articles = {}
# A merged passage is emitted once it contains more than this many spaces.
min_passage_length = 40

# newline="" is what the csv module asks for, so that quoted fields containing
# line breaks are parsed correctly.
with open(
    os.path.join(newsela_folder, "articles_metadata.csv"), "r", newline=""
) as f:
    objs = csv.reader(f)
    next(objs, None)  # skip the header row
    for obj in tqdm.notebook.tqdm(objs):
        article = dict(zip(keys, obj))
        if article["language"] != "en":
            continue

        article["grade_level"] = float(article["grade_level"])
        article["version"] = float(article["version"])

        with open(
            os.path.join(newsela_folder, "articles/" + article["filename"]), "r"
        ) as f2:
            article["content"] = f2.read()

        # Blank-line separated chunks; any chunk containing "##" is discarded.
        paragraphs = [
            p.strip() for p in article["content"].split("\n\n") if "##" not in p
        ]

        # Greedily merge consecutive chunks until the running passage holds
        # more than `min_passage_length` spaces, then emit it and start over.
        article["paragraphs"] = []
        build_up = ""
        for p in paragraphs:
            build_up += p + "\n\n"
            if build_up.count(" ") > min_passage_length:
                article["paragraphs"].append(build_up.strip())
                build_up = ""
        # Whatever is left in `build_up` here has at most `min_passage_length`
        # spaces (otherwise the loop would already have emitted it), so the
        # trailing short passage is dropped. The former "last bit" check used
        # the same threshold and therefore could never fire for a non-negative
        # `min_passage_length`.

        slug2articles.setdefault(article["slug"], []).append(article)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i, obj in tqdm.tqdm_notebook(enumerate(objs)):


0it [00:00, ?it/s]

In [4]:
version_paired = "0.2"
# Derive the database location from `newsela_folder` instead of repeating the
# absolute datastore path, so that changing the folder in one place is enough.
dataset_file = os.path.join(
    newsela_folder, "newsela_paired_%s.db" % (version_paired)
)

# Always start from an empty database: a file left over from a previous run is
# deleted before connecting (the table is created unconditionally afterwards).
if os.path.isfile(dataset_file):
    os.remove(dataset_file)

conn = sqlite3.connect(dataset_file, detect_types=sqlite3.PARSE_DECLTYPES)
conn.row_factory = sqlite3.Row  # rows can be accessed by column name
c = conn.cursor()

In [5]:
# CREATE TABLE
# One row per paragraph pair (p1, p2) together with the two article versions,
# the article slug and the split ("cut") the pair belongs to.
sql_create = "CREATE TABLE articles (id INTEGER PRIMARY KEY AUTOINCREMENT, p1 TEXT NOT NULL, p2 TEXT NOT NULL, version1 INTEGER NOT NULL,  version2 INTEGER NOT NULL, slug TEXT NOT NULL, cut TEXT NOT NULL);"

# NOTE(review): the index names do not describe their columns (idx_source is on
# `cut`, idx_date is on `slug`); left untouched so the on-disk schema is unchanged.
index_statements = (
    "CREATE INDEX idx_source ON articles(cut);",
    "CREATE INDEX idx_date ON articles(slug);",
)

# Table first, then the indexes that reference it.
for statement in (sql_create,) + index_statements:
    c.execute(statement)

conn.commit()

In [7]:
import Levenshtein, utils_misc

sql_insert = "INSERT INTO articles (p1, p2, version1, version2, slug, cut) VALUES (?, ?, ?, ?, ?, ?)"

# Paragraph size window, measured in number of spaces (inclusive bounds).
PARA_MIN_SPACES, PARA_MAX_SPACES = 18, 80
# Character-length ratio len(p1) / len(p2) must lie strictly inside this window.
LEN_RATIO_LOW, LEN_RATIO_HIGH = 0.6, 1.5
# Levenshtein.ratio window for a pair to be stored as a version1 -> version2 sample.
PAIR_RATIO_LOW, PAIR_RATIO_HIGH = 0.6, 0.92
# At or above this Levenshtein.ratio the two paragraphs are treated as the same text.
SAME_RATIO = 0.97
# Every DEV_EVERY-th slug (index divisible by DEV_EVERY) goes to "dev", the rest to "train".
DEV_EVERY = 40

same = 0
for i, slug in enumerate(tqdm.notebook.tqdm(slug2articles)):
    cut = "dev" if i % DEV_EVERY == 0 else "train"
    arts = slug2articles[slug]

    for a in arts:
        # Own non-merged paragraphs: blank-line separated chunks without "##",
        # stripped, then restricted to the size window above.
        stripped = (
            raw.strip() for raw in a["content"].split("\n\n") if "##" not in raw
        )
        a["paras"] = [
            p for p in stripped if PARA_MIN_SPACES <= p.count(" ") <= PARA_MAX_SPACES
        ]

    for a1 in arts:
        for a2 in arts:
            # Only ordered pairs where a1 has the strictly lower version number.
            if a1["version"] >= a2["version"]:
                continue
            for p1 in a1["paras"]:
                for p2 in a2["paras"]:
                    # len(p2) is never 0: every kept paragraph has >= 18 spaces.
                    R = float(len(p1)) / len(p2)
                    if R <= LEN_RATIO_LOW or R >= LEN_RATIO_HIGH or p1 == p2:
                        continue

                    ratio = Levenshtein.ratio(p1, p2)
                    if PAIR_RATIO_LOW <= ratio <= PAIR_RATIO_HIGH:
                        c.execute(
                            sql_insert,
                            (p1, p2, a1["version"], a2["version"], slug, cut),
                        )
                    elif ratio >= SAME_RATIO:  # assume it is the same
                        # NOTE(review): version2 is recorded as a1's version here,
                        # presumably on purpose so that near-identical pairs are
                        # labelled as "same version" samples; confirm.
                        c.execute(
                            sql_insert,
                            (p1, p2, a1["version"], a1["version"], slug, cut),
                        )
                        same += 1

print("Number of same samples: %d" % (same))
conn.commit()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i, slug in enumerate(tqdm.tqdm_notebook(slug2articles)):


  0%|          | 0/1911 [00:00<?, ?it/s]

Number of same samples: 10125


In [9]:
import pandas as pd

# Open a new connection to the database file and load the whole `articles`
# table into a DataFrame.
# NOTE(review): `conn` is rebound here; the connection created earlier is not
# closed anywhere in the visible code, and the cursor `c` still belongs to it.
conn = sqlite3.connect(dataset_file, detect_types=sqlite3.PARSE_DECLTYPES)

query = "select * from articles"
df = pd.read_sql(sql=query, con=conn)

In [20]:
# Export the paired table as CSV into `newsela_folder`, instead of repeating
# the absolute datastore path a second time.
# NOTE(review): this rebinds `dataset_file` from the .db path to the .csv path,
# so re-running the sqlite3.connect cell afterwards would target the CSV file.
# The name is kept because code outside this cell may rely on it.
dataset_file = os.path.join(
    newsela_folder, "newsela_paired_%s.csv" % (version_paired)
)

df.to_csv(dataset_file, index=False)

In [25]:
# Show only the rows assigned to the training split.
df.loc[df["cut"].eq("train")]

Unnamed: 0,id,p1,p2,version1,version2,slug,cut
71,72,"THE HAGUE, Netherlands — These days, anybody w...","THE HAGUE, Netherlands — These days, anybody w...",0,1,17century-selfies,train
72,73,"The museum's director, Emilie Gordenker, said ...","The museum's director, Emilie Gordenker, said ...",0,1,17century-selfies,train
73,74,"The exhibition, opening Oct. 8 and running thr...","The exhibition, which opened Oct. 8 and runs t...",0,1,17century-selfies,train
74,75,"THE HAGUE, Netherlands — These days, anybody w...","These days, anybody with a smartphone can snap...",0,2,17century-selfies,train
75,76,"The museum's director, Emilie Gordenker, said ...","The museum's director, Emilie Gordenker, said ...",0,2,17century-selfies,train
...,...,...,...,...,...,...,...
132101,132102,The same sort of thing happened when Zuckerber...,Zuckerberg launched the group FWD.us earlier t...,3,4,zuckerberg-internet,train
132102,132103,The Internet.org group currently includes Face...,Internet.org currently includes Facebook and s...,3,4,zuckerberg-internet,train
132103,132104,They noted that Facebook could have more membe...,They said Facebook could have more members if ...,3,3,zuckerberg-internet,train
132104,132105,People also had comments after FWD.us aired te...,People also complained about some of the tele...,3,4,zuckerberg-internet,train
