In [1]:
import re
from pathlib import Path
import pandas as pd
import spacy
from segram import Corpus, Story

# Ask spaCy to use the GPU for document parsing when one is available
spacy.prefer_gpu()

# Load the dataset of articles covering the war in Syria
data = pd.read_csv(Path("data", "FA-KES-Dataset.csv"), encoding="latin")

# Strip leading boilerplate (publication dates and other metadata) from the
# article bodies. Every pattern below is anchored at the start of the string.
#
# Leading publication date: either an optional capitalised word, 1-2 digits,
# an optional capitalised word and 2-4 digits, or a numeric date using "-" or
# "." as separators; optionally followed by "at <digits>[:]<digits>"
pattern1 = (
    r"^("
    r"\s*([A-Z]\w+)?\s*\d{1,2}\s*([A-Z]\w+)?\s*\d{2,4}\s*"
    r"|"
    r"\d{1,2}[-\.]\d{1,2}[-\.]\d{2,4}\s*"
    r")"
    r"(\s*at\s*\d+:?\d+)?"
)
# Leading "(updated ...)" marker
pattern2 = r"^\s*(\(updated\s*[\w\d]*\s*\))\s*"
# Leading "get short url" call-out together with the word/digit tokens after it
pattern3 = r"^\s*get\s*short\s*url\s*([\d\w]+\s*)*"

# Apply the three substitutions in order (only the last two ignore case),
# then trim surrounding whitespace
data["article_content"] = (
    data["article_content"]
    .str.replace(pattern1, "", regex=True)
    .str.replace(pattern2, "", regex=True, flags=re.IGNORECASE)
    .str.replace(pattern3, "", regex=True, flags=re.IGNORECASE)
    .str.strip()
)

In [2]:
# Build the processing pipeline on top of the "en_core_web_trf" spaCy model
nlp = spacy.load("en_core_web_trf")

# Settings for the "segram" component; "vectors" names a second spaCy model
# (presumably the source of word vectors — confirm against the segram docs)
segram_config = {"vectors": "en_core_web_lg"}
nlp.add_pipe("segram", config=segram_config)

# Append the "segram_coref" component; left as the last expression so the
# created component is shown as the cell output
nlp.add_pipe("segram_coref")

<segram.nlp.pipeline.coref.Coref at 0x7f63d528cf50>

In [None]:
# Reuse the titles corpus cached on disk when it exists; otherwise build it
# from the article titles with `nlp`.
fpath = Path(".").absolute()/"data"/"titles.segram"
if fpath.exists():
    titles = Corpus.from_disk(fpath, vocab=nlp.vocab)
else:
    # Build from the raw titles only when there is no cached file. Without
    # this `else`, the corpus loaded above would be overwritten and the
    # titles re-parsed on every run.
    titles = Corpus.from_texts(nlp, *data["article_title"], progress=True)

In [None]:
# Write the titles corpus to data/titles.segram. `vocab` and `nlp` are both
# passed as False (presumably leaving them out of the saved file — confirm
# against the segram docs)
fpath = Path("data", "titles.segram")
titles.to_disk(fpath, nlp=False, vocab=False)

In [None]:
# Display the absolute form of `fpath` as the cell output
fpath.absolute()

In [None]:
# Load the corpus back from `fpath` using the pipeline's vocab. The result is
# not assigned to a name, so it is only shown as the cell output.
Corpus.from_disk(fpath, vocab=nlp.vocab)

In [None]:
# Load a corpus from the relative path "test" into `D`.
# NOTE(review): no `vocab` is passed here, unlike the other `from_disk` calls
# in this notebook, and nothing visible in the notebook creates "test" —
# confirm this cell is intended and still runs on a fresh kernel.
D = Corpus.from_disk("test")