In [71]:
import os 

# Root directory that holds the raw inputs and the per-dataset outputs.
DATA_FOLDER = "data/"
# Name of the dataset; it is also the name of its sub-directory under DATA_FOLDER.
DATASET_NAME = "StackOverflow"

# Folder that the cells below read from and write to for this dataset.
OUTPUT_FOLDER = os.path.join(DATA_FOLDER, DATASET_NAME)

# Preprocessing

In [2]:
# Load the raw corpus and its labels. The two files are consumed in lockstep:
# line i of the label file is paired with line i of the text file.
raw_dir = os.path.join(DATA_FOLDER, "raw")
raw_texts_path = os.path.join(raw_dir, f"{DATASET_NAME}.txt")
raw_labels_path = os.path.join(raw_dir, f"{DATASET_NAME}_label.txt")

raw_train_texts = []
train_labels = []
# Explicit UTF-8 so decoding does not depend on the platform's default locale
# (the texts file read later in this notebook is opened as UTF-8 as well).
with open(raw_texts_path, encoding="utf-8") as fIn1, \
        open(raw_labels_path, encoding="utf-8") as fIn2:
    # NOTE: zip stops at the shorter file, so a text/label count mismatch is
    # truncated silently rather than reported.
    for text, label in zip(fIn1, fIn2):
        raw_train_texts.append(text.strip())
        train_labels.append(label.strip())

FileNotFoundError: [Errno 2] No such file or directory: 'data/raw/Biomedical.txt'

In [3]:
# Sanity check: number of entries currently held in raw_train_texts.
len(raw_train_texts)

12295

In [4]:
# Preprocess the raw texts/labels with TopMost and save the result under OUTPUT_FOLDER.
from topmost.preprocessing import Preprocessing

# NOTE(review): min_term / min_doc_count / min_length look like vocabulary and
# document filtering thresholds — confirm their exact meaning in the TopMost docs.
preprocessing = Preprocessing(min_term=2, min_doc_count=3, min_length=0)

# `rst` is unpacked as keyword arguments into `save` below, so it must be a mapping.
rst = preprocessing.preprocess(raw_train_texts=raw_train_texts,
                               train_labels=train_labels,
                               pretrained_WE=True,
                               return_raw=True)

preprocessing.save(OUTPUT_FOLDER, **rst)

2024-11-23 14:12:08,876 - TopMost - label2id: {'1': 0, '2': 1, '3': 2, '4': 3, '5': 4, '6': 5, '7': 6, '8': 7}
loading train texts: 100%|██████████| 12295/12295 [00:00<00:00, 42746.98it/s]
parsing training texts: 100%|██████████| 12295/12295 [00:00<00:00, 45209.09it/s]
parsing texts: 100%|██████████| 12294/12294 [00:00<00:00, 48355.92it/s]
2024-11-23 14:12:10,112 - TopMost - Real vocab size: 4618
2024-11-23 14:12:10,266 - TopMost - Real training size: 12294 	 avg length: 14.426
loading word embeddings: 100%|██████████| 4618/4618 [00:03<00:00, 1300.73it/s]
2024-11-23 14:12:52,180 - TopMost - number of found embeddings: 4443/4618


In [5]:
# Replicate to test: every train_* artifact in OUTPUT_FOLDER is copied to the
# matching test_* file name, so the test split is an exact copy of the train split.
import shutil

# One loop instead of four copy-pasted calls; add a suffix here to replicate more files.
for artifact_suffix in ("bow.npz", "labels.txt", "raws.txt", "texts.txt"):
    shutil.copyfile(src=os.path.join(OUTPUT_FOLDER, f"train_{artifact_suffix}"),
                    dst=os.path.join(OUTPUT_FOLDER, f"test_{artifact_suffix}"))

'data/SearchSnippets/test_texts.txt'

# Embeddings

In [72]:
# Build the document embedding model used to encode the texts below.
from dataloader.dataloader import DocEmbedModel
# NOTE(review): hard-coded device string; "cuda:1" presumably selects the GPU with
# index 1 and would fail on a host without one — confirm and adjust per machine.
DEVICE = "cuda:1"

doc_embedder = DocEmbedModel(device=DEVICE, verbose=True)



In [73]:
# Load the texts from OUTPUT_FOLDER, one document per line.
# The file is iterated to its real end instead of stopping at the first blank
# line: a document that is empty would otherwise silently drop every document
# after it and break the "line i == index i" alignment with the other files.
# NOTE: this rebinds raw_train_texts, replacing whatever an earlier cell stored in it.
with open(os.path.join(OUTPUT_FOLDER, "texts.txt"), "r", encoding="utf-8") as f:
    raw_train_texts = [line.strip() for line in f]
len(raw_train_texts)

16378

In [74]:
# Encode all loaded texts with the document embedding model.
train_contextual_embed = doc_embedder.encode(raw_train_texts)

Batches:   0%|          | 0/512 [00:00<?, ?it/s]

In [75]:
# Sanity check: expected to be (number of texts, embedding dimension).
train_contextual_embed.shape

(16378, 384)

In [10]:
# from umap import UMAP
# dim_rec_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=0)

In [25]:
# train_contextual_embed = dim_rec_model.fit_transform(train_contextual_embed)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [76]:
# from sklearn.cluster import KMeans
# NUM_CLUSTERS = 30

# kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=0)
# kmeans.fit(train_contextual_embed)
# kmeans.labels_
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
import pandas as pd

def create_neighbor_texts(texts, embeddings, k=20, include_self=True):
    """Build one concatenated "neighbor document" per input text.

    A k-nearest-neighbor index (scikit-learn ``NearestNeighbors`` with the
    ``ball_tree`` algorithm and its default metric) is fitted on ``embeddings``.
    For each text, the texts of its ``k`` neighbors are joined with single
    spaces, in the order returned by the index.

    Args:
        texts: Sequence of strings; ``texts[i]`` corresponds to ``embeddings[i]``.
        embeddings: Array-like with one embedding row per text.
        k: Number of texts joined into each output string.
        include_self: If True (default, the historical behaviour) the index is
            queried with the fitted points themselves, so each text is normally
            one of its own ``k`` neighbors (it sits at distance 0; exact
            duplicates can tie with it). If False, the text itself is never
            counted and ``k`` *other* texts are joined.

    Returns:
        A list of ``len(texts)`` strings; entry ``i`` is the neighbor document
        of ``texts[i]``. An empty input yields an empty list.

    Raises:
        ValueError: If ``texts`` and ``embeddings`` differ in length, or (from
            scikit-learn) if ``k`` exceeds the number of available neighbors.
    """
    if len(texts) != len(embeddings):
        raise ValueError(
            "texts and embeddings must have the same length, "
            f"got {len(texts)} and {len(embeddings)}"
        )
    if len(texts) == 0:
        # Nothing to index; avoid fitting on an empty array.
        return []

    # Initialize and fit KNN
    print("Fitting KNN...")
    nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(embeddings)
    if include_self:
        # Query with the fitted points: every point is a candidate neighbor of itself.
        _, indices = nbrs.kneighbors(embeddings)
    else:
        # Without a query argument scikit-learn returns the neighbors of each
        # indexed point and does not consider a point to be its own neighbor.
        _, indices = nbrs.kneighbors()

    print("Creating neighbor texts...")
    # One row of neighbor indices per text; join the referenced texts in that order.
    return [" ".join(texts[idx] for idx in neighbor_indices)
            for neighbor_indices in indices]

In [77]:
# pred_labels = kmeans.labels_.tolist()
# For every text, concatenate the texts of its 20 nearest neighbors in embedding space.
neighbor_texts = create_neighbor_texts(raw_train_texts, train_contextual_embed, k=20)

Fitting KNN...
Creating neighbor texts...


In [78]:
# Sanity check: there should be one neighbor document per input text.
len(neighbor_texts)

16378

In [79]:
# Preview only the first entries; echoing the whole list would dump every
# concatenated neighbor document into the notebook output.
neighbor_texts[:2]

['fill dataset datatable linq query resultset linq sql linq dataset linq datatable method return typed datatable linq query select rows dataset using linq list list data grid using linq sql winforms work dataset linq create equivalent dataset datatable tool always get exception trying fill data datatable linq datasets mysql interop problem paging linq datatable combining resultset using linq use linq convert list dataset updating column value resultset returned linq query linq dataset multiple group data table linq dataset update stored procedure set value column every row table using linq linq datacontext web application join two tables linq datasets get min linq dataset query',
 'best subversion clients windows vista best subversion client linux command line subversion client windows vista tips running subversion windows world hosted subversion recommendations suggestions subversion suitable enterprise good svn client improve windows subversion client update performance subversion se

# Global Maps

In [80]:
import numpy as np

In [81]:
# Sub-folder of OUTPUT_FOLDER that receives the global (neighbor-based) artifacts.
CLUSTER_FOLDER_NAME = "global_knn_20_include_local"
global_output_dir = os.path.join(OUTPUT_FOLDER, CLUSTER_FOLDER_NAME)
# exist_ok=True keeps this cell re-runnable when the folder is already there.
os.makedirs(global_output_dir, exist_ok=True)

In [82]:
# Write global_maps.txt: the integers 0 .. len(neighbor_texts) - 1, one per line.
# NOTE(review): presumably line i names the global BoW row used for document i
# (here the identity mapping) — confirm against the code that reads this file.
global_maps_path = os.path.join(OUTPUT_FOLDER, CLUSTER_FOLDER_NAME, "global_maps.txt")
with open(global_maps_path, "w") as fOut:
    fOut.writelines(f"{doc_idx}\n" for doc_idx in range(len(neighbor_texts)))

In [83]:
# Read Vocab for Bow Global: one token per line. The order is kept because the
# list is handed to the vectorizer as its fixed vocabulary.
# Explicit UTF-8 matches how texts.txt from the same folder is read and avoids
# depending on the platform's default encoding.
with open(os.path.join(OUTPUT_FOLDER, "vocab.txt"), encoding="utf-8") as fIn:
    vocab = [line.strip() for line in fIn]

len(vocab)

2226

In [84]:
# Bag-of-words counts of the neighbor documents over the fixed vocabulary.
from sklearn.feature_extraction.text import CountVectorizer

# Tokens are obtained by plain whitespace splitting; only terms listed in
# `vocab` get a column.
vectorizer = CountVectorizer(vocabulary=vocab, tokenizer=str.split)
bow = vectorizer.fit_transform(neighbor_texts)



In [85]:
# Densify the count matrix. The guard keeps this cell re-runnable: once `bow`
# has been converted it no longer has a `toarray` method and is left untouched
# (a second unguarded run would raise AttributeError).
if hasattr(bow, "toarray"):
    bow = bow.toarray()
# bow = bow.astype(np.float64)
# for idx in range(NUM_CLUSTERS):
#     bow[idx, :] = bow[idx, :] / docs_num[idx]

In [86]:
# Sanity check: expected to be (number of neighbor documents, vocabulary size).
bow.shape

(16378, 2226)

In [87]:
import scipy.sparse

# Persist the global bag-of-words as a CSR matrix in .npz format.
global_bow_path = os.path.join(OUTPUT_FOLDER, CLUSTER_FOLDER_NAME, "global_bow.npz")
global_bow_csr = scipy.sparse.csr_matrix(bow)
scipy.sparse.save_npz(global_bow_path, global_bow_csr)

: 