In [18]:
import os 

DATA_FOLDER = "data/"
DATASET_NAME = "GoogleNews"

OUTPUT_FOLDER = os.path.join(DATA_FOLDER, f"{DATASET_NAME}")

# Preprocessing

In [2]:
raw_train_texts = []
train_labels = []
with open(os.path.join(DATA_FOLDER, "raw", f"{DATASET_NAME}.txt")) as fIn1:
    with open(os.path.join(DATA_FOLDER, "raw", f"{DATASET_NAME}_label.txt")) as fIn2:
            for text, label in zip(fIn1, fIn2):
                raw_train_texts.append(text.strip())
                train_labels.append(label.strip())

FileNotFoundError: [Errno 2] No such file or directory: 'data/raw/Biomedical.txt'

In [3]:
len(raw_train_texts)

12295

In [4]:
# preprocess raw data
from topmost.preprocessing import Preprocessing

preprocessing = Preprocessing(min_term=2, min_doc_count=3, min_length=0)

rst = preprocessing.preprocess(raw_train_texts=raw_train_texts,
                               train_labels=train_labels,
                               pretrained_WE=True,
                               return_raw=True)

preprocessing.save(OUTPUT_FOLDER, **rst)

2024-11-23 14:12:08,876 - TopMost - label2id: {'1': 0, '2': 1, '3': 2, '4': 3, '5': 4, '6': 5, '7': 6, '8': 7}
loading train texts: 100%|██████████| 12295/12295 [00:00<00:00, 42746.98it/s]
parsing training texts: 100%|██████████| 12295/12295 [00:00<00:00, 45209.09it/s]
parsing texts: 100%|██████████| 12294/12294 [00:00<00:00, 48355.92it/s]
2024-11-23 14:12:10,112 - TopMost - Real vocab size: 4618
2024-11-23 14:12:10,266 - TopMost - Real training size: 12294 	 avg length: 14.426
loading word embeddings: 100%|██████████| 4618/4618 [00:03<00:00, 1300.73it/s]
2024-11-23 14:12:52,180 - TopMost - number of found embeddings: 4443/4618


In [5]:
# Replicate to test
import shutil

shutil.copyfile(src=os.path.join(OUTPUT_FOLDER, "train_bow.npz"),
                dst=os.path.join(OUTPUT_FOLDER, "test_bow.npz"))

shutil.copyfile(src=os.path.join(OUTPUT_FOLDER, "train_labels.txt"),
                dst=os.path.join(OUTPUT_FOLDER, "test_labels.txt"))

shutil.copyfile(src=os.path.join(OUTPUT_FOLDER, "train_raws.txt"),
                dst=os.path.join(OUTPUT_FOLDER, "test_raws.txt"))

shutil.copyfile(src=os.path.join(OUTPUT_FOLDER, "train_texts.txt"),
                dst=os.path.join(OUTPUT_FOLDER, "test_texts.txt"))

'data/SearchSnippets/test_texts.txt'

# Embeddings

In [19]:
from dataloader.dataloader import DocEmbedModel
DEVICE = "cuda:1"

doc_embedder = DocEmbedModel(device=DEVICE, verbose=True)



In [20]:
raw_train_texts = []
with open(os.path.join(OUTPUT_FOLDER, "texts.txt"), "r", encoding = "utf-8") as f:
    while True:
        text = f.readline().strip()
        if not text:
            break
        raw_train_texts.append(text)
len(raw_train_texts)

11019

In [21]:
train_contextual_embed = doc_embedder.encode(raw_train_texts)

Batches:   0%|          | 0/345 [00:00<?, ?it/s]

In [22]:
train_contextual_embed.shape

(11019, 384)

In [10]:
# from umap import UMAP
# dim_rec_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=0)

In [25]:
# train_contextual_embed = dim_rec_model.fit_transform(train_contextual_embed)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [25]:
# from sklearn.cluster import KMeans
# NUM_CLUSTERS = 30

# kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=0)
# kmeans.fit(train_contextual_embed)
# kmeans.labels_
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
import pandas as pd

# Assuming your 1000 texts are in a list called 'texts'
def create_neighbor_texts(texts, embeddings, k=20):
    # Load SBERT model
    # model = SentenceTransformer('all-MiniLM-L6-v2')  # You can choose other SBERT models
    
    # # Generate embeddings for all texts
    # print("Generating embeddings...")
    # embeddings = model.encode(texts, show_progress_bar=True)
    
    # Initialize and fit KNN
    print("Fitting KNN...")
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(embeddings)
    distances, indices = nbrs.kneighbors(embeddings)
    
    # Create neighbor texts
    neighbor_texts = []
    
    print("Creating neighbor texts...")
    for i in range(len(texts)):
        # Get indices of k nearest neighbors (excluding the text itself)
        neighbor_indices = indices[i][1:]  # Skip first index (self)
        
        # Concatenate the neighbor texts
        neighbor_text = " ".join([texts[idx] for idx in neighbor_indices])
        neighbor_texts.append(neighbor_text)
    
    return neighbor_texts

In [26]:
# pred_labels = kmeans.labels_.tolist()
neighbor_texts = create_neighbor_texts(raw_train_texts, train_contextual_embed, k=100)

Fitting KNN...
Creating neighbor texts...


In [27]:
len(neighbor_texts)

11019

In [28]:
neighbor_texts 

['winter white gala picture taylor swift princess night kensington palace winter white gala taylor swift hair winter white gala london london train station google street view taylor swift winter white gala taylor swift winter white gala taylor swift winter white gala taylor swift white winter white gala taylor swift chat duke cambridge winter white gala michelle dockery price celebrity winter gala pop princess taylor swift meet prince william winter white gala frozen melt royal point love google street view inside london train station macy parade taylor swift michelle dockery winter white gala taylor swift michelle dockery winter white gala spring march macy parade taylor swift hang prince william winter white gala taylor swift meet prince william bon jovi winter white gala taylor swift dazzle winter white gala macy thanksgiving day parade set city red lake girl macy parade major macy parade taylor join royal family sings kensington palace winter festival light event jewish community c

# Global Maps

In [29]:
import numpy as np

In [30]:
CLUSTER_FOLDER_NAME = "global_knn_100"
os.makedirs(os.path.join(OUTPUT_FOLDER, CLUSTER_FOLDER_NAME), exist_ok=True)

In [31]:
with open(os.path.join(OUTPUT_FOLDER, CLUSTER_FOLDER_NAME, "global_maps.txt"), "w") as fOut:
    for label in range(len(neighbor_texts)):
        fOut.write(str(label) + "\n")

In [32]:
# Read Vocab for Bow Global
vocab = []
with open(os.path.join(OUTPUT_FOLDER, "vocab.txt")) as fIn:
    for data in fIn:
        vocab.append(data.strip())

len(vocab)

3473

In [33]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(vocabulary=vocab, tokenizer=lambda x: x.split())
bow = vectorizer.fit_transform(neighbor_texts)



In [34]:
bow = bow.toarray()
# bow = bow.astype(np.float64)
# for idx in range(NUM_CLUSTERS):
#     bow[idx, :] = bow[idx, :] / docs_num[idx]

In [35]:
bow.shape

(11019, 3473)

In [36]:
import scipy.sparse

scipy.sparse.save_npz(os.path.join(OUTPUT_FOLDER, CLUSTER_FOLDER_NAME, "global_bow.npz"), scipy.sparse.csr_matrix(bow))

: 