In [2]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import umap
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import pickle
import os

In [4]:
# Load the English Wikipedia training split once, then carve out three
# subsets of 1k / 10k / 100k articles, each shuffled with its own fixed seed.
wiki_dataset = load_dataset(
    "wikipedia",
    "20220301.en",
    split="train",
    trust_remote_code=True,
)
wiki_dataset_1k = wiki_dataset.shuffle(seed=42).select(range(1000))
wiki_dataset_10k = wiki_dataset.shuffle(seed=1958).select(range(10000))
wiki_dataset_100k = wiki_dataset.shuffle(seed=1997).select(range(100000))

# Output directory -> subset that is written there.
subset_by_dir = {
    './data/1k': wiki_dataset_1k,
    './data/10k': wiki_dataset_10k,
    './data/100k': wiki_dataset_100k,
}

# Make sure every target directory exists before anything is written.
for target_dir in subset_by_dir:
    os.makedirs(target_dir, exist_ok=True)

# Persist each subset into its directory.
for target_dir, subset in subset_by_dir.items():
    subset.save_to_disk(target_dir)

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100000 [00:00<?, ? examples/s]

In [10]:
# Preview cell: reshuffle the 100k subset, take 10 articles, and print each
# one as a plain dict of its columns.
test_print = (
    wiki_dataset_100k
    .shuffle(1997)
    .select(range(10))
    .to_pandas()
)
sampled_rows = test_print.to_dict(orient="records")

for record in sampled_rows:
    print(record)

{'id': '46613570', 'url': 'https://en.wikipedia.org/wiki/The%20Only%20Running%20Footman', 'title': 'The Only Running Footman', 'text': "The Footman is a public house in Charles Street, Mayfair, long famous for its sign, which used to read, in full, I am the only Running Footman.  At 24 characters, this was the longest pub name in London until modern pubs were created with fanciful names such as The Ferret and Firkin in the Balloon up the Creek.\n\nFootmen were originally employed to run ahead of a carriage to ensure the way was clear. As roads got better and clearer the demand for their services fell away and many were re-employed as household servants. One footman instead bought the tavern, then called the Running Horse, and renamed it after himself.\n\nThe establishment was first built in 1749 and rebuilt in the 1930s.\n\nThe pub is believed to have been the inspiration for the Junior Ganymede Club, a fictional club in P. G. Wodehouse's Jeeves stories. It is a significant location in

In [11]:
# Collect up to five long paragraphs per article.
texts = []
for article in wiki_dataset_100k:
    # Paragraphs are separated by blank lines; keep only those longer than
    # 100 characters and take the first five of them.
    long_chunks = [
        chunk
        for chunk in article['text'].split('\n\n')
        if len(chunk) > 100
    ][:5]
    # Flatten any remaining single line breaks into spaces.
    texts.extend(chunk.replace('\n', ' ') for chunk in long_chunks)

# print(texts)

### generate embeddings for fitting with SBERT

In [12]:
# Number of texts handed to the encoder per chunk.
batch_size = 512

# Sentence-BERT checkpoint; the name is kept in its own variable because it
# is stored alongside the fitted models when they are saved.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

In [13]:
# Encode `texts` in chunks of `batch_size` and stack the per-chunk results
# with np.vstack into a single array.
if not texts:
    # np.vstack([]) would fail with an opaque message; fail early and clearly.
    raise ValueError("No texts to encode: `texts` is empty.")

# Ceiling division: `len(texts)//batch_size + 1` reports one batch too many
# whenever len(texts) is an exact multiple of batch_size.
num_batches = (len(texts) + batch_size - 1) // batch_size

embeddings = []
for batch_index, start in enumerate(range(0, len(texts), batch_size), start=1):
    print(f"Processing batch {batch_index}/{num_batches}")
    batch = texts[start:start + batch_size]
    batch_embeddings = model.encode(batch)
    embeddings.append(batch_embeddings)

all_embeddings = np.vstack(embeddings)

Processing batch 1/661
Processing batch 2/661
Processing batch 3/661
Processing batch 4/661
Processing batch 5/661
Processing batch 6/661
Processing batch 7/661
Processing batch 8/661
Processing batch 9/661
Processing batch 10/661
Processing batch 11/661
Processing batch 12/661
Processing batch 13/661
Processing batch 14/661
Processing batch 15/661
Processing batch 16/661
Processing batch 17/661
Processing batch 18/661
Processing batch 19/661
Processing batch 20/661
Processing batch 21/661
Processing batch 22/661
Processing batch 23/661
Processing batch 24/661
Processing batch 25/661
Processing batch 26/661
Processing batch 27/661
Processing batch 28/661
Processing batch 29/661
Processing batch 30/661
Processing batch 31/661
Processing batch 32/661
Processing batch 33/661
Processing batch 34/661
Processing batch 35/661
Processing batch 36/661
Processing batch 37/661
Processing batch 38/661
Processing batch 39/661
Processing batch 40/661
Processing batch 41/661
Processing batch 42/661
P

### fit umap reducer

In [14]:
# Hyper-parameters for the 2-component, cosine-metric UMAP projection.
umap_params = dict(
    n_components=2,
    metric='cosine',
    n_neighbors=15,
    min_dist=0.1,
    random_state=42,
)
reducer = umap.UMAP(**umap_params)

# Learn the projection from the full embedding matrix.
reducer.fit(all_embeddings)

  warn(


### fit scaler (limit to [-1, 1])

In [15]:
# Fit the [-1, 1] scaler on the coordinates UMAP learned for *every* training
# point. Fitting on the transform of only the first 5000 embeddings takes the
# min/max from a small subsample, so other fitted points can land outside the
# target range. `embedding_` is populated by `reducer.fit`, so this also
# avoids an extra `transform` pass.
# NOTE(review): points transformed later can still fall marginally outside
# [-1, 1]; MinMaxScaler's `clip` option could enforce a hard bound if the
# installed scikit-learn supports it — confirm before relying on it.
fitted_coordinates = reducer.embedding_
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler.fit(fitted_coordinates)



### test run

In [16]:
# Short sample notes used to smoke-test the encoder -> reducer -> scaler chain.
test_notes = [
    "When it hunts on the ground the food tends to consist of ants, spiders and other soft bodied prey. It often takes prey much larger than itself.",
    "The dorsum (upper surface) of the opisthosoma has a dark leaf pattern with white edges and light transverse stripes on the right and left.",
    "A car, or an automobile, is a motor vehicle with wheels. Most definitions of cars state that they run primarily on roads, seat one to eight people, have four wheels, and mainly transport people rather than cargo.",
    "Coronavirus disease 2019 (COVID-19, also known as SARS-2) is a contagious disease caused by the coronavirus SARS-CoV-2. In January 2020, the disease spread worldwide, resulting in the COVID-19 pandemic.",
    "William Shakespeare was an English playwright, poet and actor. He is widely regarded as the greatest writer in the English language and the world's pre-eminent dramatist. He is often called England's national poet and the 'Bard of Avon' (or simply 'the Bard'). His extant works, including collaborations, consist of some 39 plays, 154 sonnets, three long narrative poems and a few other verses, some of uncertain authorship. His plays have been translated into every major living language and are performed more often than those of any other playwright. Shakespeare remains arguably the most influential writer in the English language, and his works continue to be studied and reinterpreted.",
]

# Result holders, index-aligned with test_notes.
embeddings, coordinates, scaled_coordinates = [], [], []

for note_text in test_notes:
    # Sentence embedding -> UMAP position -> position rescaled by the scaler.
    vector = model.encode(note_text)
    # The reducer is handed a single-row 2-D array, hence the reshape.
    position = reducer.transform(vector.reshape(1, -1))
    scaled_position = scaler.transform(position)

    embeddings.append(vector)
    coordinates.append(position)
    scaled_coordinates.append(scaled_position)

# Report every stage for each note.
for i in range(len(test_notes)):
    print(f"Note {i + 1}:")
    print(f"Embeddings: {embeddings[i]}")
    print(f"Coordinates: {coordinates[i]}")
    print(f"Scaled Coordinates: {scaled_coordinates[i]}")
    print("\n")

Note 1:
Embeddings: [ 1.03663981e-01 -1.21731255e-02  5.89950271e-02  4.42312434e-02
  2.04662699e-02 -6.41817302e-02  2.32711118e-02  7.67776789e-03
 -4.60717976e-02 -3.64602590e-03  1.18472688e-02 -8.72451216e-02
 -6.81319758e-02  3.40572260e-02  4.62077267e-04 -4.36617434e-02
  1.18032672e-01 -1.69433188e-02  1.94567405e-02 -2.64212880e-02
  1.12460190e-02 -2.21962035e-02  2.77476907e-02 -2.61628218e-02
 -1.45017639e-01 -9.60353855e-03 -8.39282647e-02  3.96249257e-03
  1.19563770e-02 -9.95610952e-02 -2.34461185e-02  2.67519448e-02
  1.14883929e-01  1.64760966e-02 -4.12113443e-02  3.54511999e-02
 -8.51000939e-03 -8.73573050e-02  6.64530545e-02  5.55757731e-02
  4.19637375e-02  8.40734318e-02  5.92519976e-02 -3.13835554e-02
 -1.16931632e-01 -3.23411077e-02 -7.53749162e-02 -3.13044339e-03
  1.23474235e-02 -1.17856354e-01 -2.25391984e-02 -7.86043052e-03
 -8.70285705e-02  4.00018552e-03  2.76811793e-02 -4.55827676e-02
  1.43690524e-03 -5.41946478e-02  8.12998042e-03 -7.56597593e-02
  1.5



### save models

In [17]:
# Bundle the fitted reducer and scaler with the encoder name, then pickle
# the bundle into the models directory.
mapper_bundle = {
    'reducer': reducer,
    'scaler': scaler,
    'model_name': model_name,
}

os.makedirs("models", exist_ok=True)
with open("models/text_mapper_wikipedia.pkl", 'wb') as out_file:
    pickle.dump(mapper_bundle, out_file)