In [3]:
from annoy import AnnoyIndex
import numpy as np
import time
from transformers import AutoTokenizer, AutoModel
from collections import defaultdict
import spacy
import torch
import torch.nn.functional as F
from datasets import load_dataset, concatenate_datasets, Dataset

# Choose the compute device once: CUDA when torch reports it as available,
# otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"[+] Using device: {device}")

[+] Using device: cpu


In [4]:
# Blank English pipeline created via spacy.blank, with the "sentencizer"
# component added; the cells below read `doc.sents` from documents produced
# by this pipeline.
nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x7c3988287490>

In [None]:
def split_articles_into_sentences(batch, n_process=12, batch_size=64):
    """Split a batch of article texts into one flat list of sentences.

    Written as a batched ``Dataset.map`` callback. The number of returned
    rows generally differs from the number of input rows, so the caller has
    to drop the input columns (``remove_columns=...``) when mapping.

    Args:
        batch: Mapping whose ``"text"`` entry is an iterable of article strings.
        n_process: Number of worker processes passed to ``nlp.pipe``.
            Defaults to the previously hard-coded value of 12.
        batch_size: Batch size passed to ``nlp.pipe``. Defaults to the
            previously hard-coded value of 64.

    Returns:
        dict with a single ``"sentence"`` key holding the stripped, non-empty
        sentence strings of every article in the batch, in document order.
    """
    docs = nlp.pipe(batch["text"], n_process=n_process, batch_size=batch_size)
    sentences = []
    for doc in docs:
        for sent in doc.sents:
            text = sent.text.strip()
            # A span made only of whitespace strips down to "" and would
            # otherwise become an empty row in the sentence dataset.
            if text:
                sentences.append(text)
    return {"sentence": sentences}

    
def clean_dataset(output_path="./cleaned_sentences", num_proc=16):
    """Load the Wikipedia dump, split it into sentences and save the result.

    Steps: load the ``latest.en`` config of ``omarkamali/wikipedia-monthly``
    (``train`` split), map ``split_articles_into_sentences`` over it in
    batches of 768 articles while dropping the original columns, then write
    the sentence dataset to ``output_path`` with ``save_to_disk``. Progress
    and elapsed times are printed along the way.

    Args:
        output_path: Directory the sentence dataset is saved to. Defaults to
            the previously hard-coded ``"./cleaned_sentences"``.
        num_proc: Process count passed to ``load_dataset``. Defaults to the
            previously hard-coded value of 16.

    Returns:
        None.
    """
    # perf_counter is monotonic, so the reported durations cannot be skewed
    # by system clock adjustments during this long-running job.
    start = time.perf_counter()
    print("[-] Loading dataset...")
    dataset = load_dataset(
        "omarkamali/wikipedia-monthly",
        "latest.en",
        num_proc=num_proc,
        split="train",
    )
    print(dataset)
    print(f"[+] Finished loading dataset. Time taken: {time.perf_counter() - start:.2f}s")

    print("[-] Splitting into sentences...")
    # The callback returns a different number of rows than it receives, so
    # every input column is removed to keep the output columns consistent.
    sentencized_dataset = dataset.map(
        split_articles_into_sentences,
        batched=True,
        batch_size=768,
        remove_columns=dataset.column_names,
    )
    print("[+] Done splitting into sentences")
    print("[-] Saving cleaned dataset to disk...")
    sentencized_dataset.save_to_disk(output_path)
    print(f"Total time taken: {time.perf_counter() - start:.2f}s")


In [None]:
# Runs the whole pipeline: loads the dataset, splits it into sentences and
# saves the result to disk (see clean_dataset for the exact steps).
clean_dataset()

In [12]:
paragraph = ["""
It was a horse-drawn stagecoach built by another manufacturer, which they retrofitted with an engine of their design. By 1895, about 30 vehicles had been built by Daimler and Maybach, either at the Daimler works or in the Hotel Hermann, where they set up shop after disputes with their backers. Benz, Maybach, and the Daimler team seem to have been unaware of each other's early work. They never worked together; by the time of the merger of the two companies, Daimler and Maybach were no longer part of DMG. Daimler died in 1900 and later that year, Maybach designed an engine named Daimler-Mercedes that was placed in a specially ordered model built to specifications set by Emil Jellinek. This was a production of a small number of vehicles for Jellinek to race and market in his country. Two years later, in 1902, a new model DMG car was produced and the model was named Mercedes after the Maybach engine, which generated 35 hp. Maybach quit DMG shortly thereafter and opened a business of his own. Rights to the Daimler brand name were sold to other manufacturers.

In 1890, Émile Levassor and Armand Peugeot of France began producing vehicles with Daimler engines, and so laid the foundation of the automotive industry in France. In 1891, Auguste Doriot and his Peugeot colleague Louis Rigoulot completed the longest trip by a petrol-driven vehicle when their self-designed and built Daimler powered Peugeot Type 3 completed 2,100 kilometres (1,300 mi) from Valentigney to Paris and Brest and back again. They were attached to the first Paris–Brest–Paris bicycle race, but finished six days after the winning cyclist, Charles Terront.
"""]

# Sanity check of the sentencizer: push the sample text through the pipeline
# and print every detected sentence, stripped, on its own line.
docs = nlp.pipe(paragraph)
for parsed in docs:
    for text in (span.text.strip() for span in parsed.sents):
        print(text)

It was a horse-drawn stagecoach built by another manufacturer, which they retrofitted with an engine of their design.
By 1895, about 30 vehicles had been built by Daimler and Maybach, either at the Daimler works or in the Hotel Hermann, where they set up shop after disputes with their backers.
Benz, Maybach, and the Daimler team seem to have been unaware of each other's early work.
They never worked together; by the time of the merger of the two companies, Daimler and Maybach were no longer part of DMG.
Daimler died in 1900 and later that year, Maybach designed an engine named Daimler-Mercedes that was placed in a specially ordered model built to specifications set by Emil Jellinek.
This was a production of a small number of vehicles for Jellinek to race and market in his country.
Two years later, in 1902, a new model DMG car was produced and the model was named Mercedes after the Maybach engine, which generated 35 hp.
Maybach quit DMG shortly thereafter and opened a business of his ow