# Vector Matrix Generation

In [None]:
import pandas
import numpy
from sentence_transformers import SentenceTransformer
import shutil
import faiss


# FILENAMES
# Source lexicon on Google Drive; only its first column is read below.
lexi = "/content/drive/MyDrive/DSRsquared/3_Million/Lexicon_3_Mill.csv"
########################################################
print("Loading model")
model = SentenceTransformer('multi-qa-mpnet-base-cos-v1')

# dtype=str + keep_default_na=False keep every cell a string. With pandas'
# defaults, lexicon entries such as "null", "nan" or "NA" are parsed as float
# NaN and all-digit entries may be parsed as integers, neither of which the
# sentence encoder can tokenize.
Lexicon = pandas.read_csv(lexi, usecols=[0], header=None,
                          dtype=str, keep_default_na=False)

# Extract the words as a plain list of str (encode() takes a str or a list of
# str), preserving row order so embedding i belongs to lexicon row i.
words = Lexicon.iloc[:, 0].tolist()
print("Encoding")
vectors = model.encode(words, convert_to_numpy=True).astype("float32")

# Reduce the sentence embeddings to 100 dimensions with FAISS PCA.
PCA = faiss.PCAMatrix(vectors.shape[1], 100)
PCA.train(vectors)
low_vectors = PCA.apply_py(vectors)


# Persist the reduced embeddings for the clustering step.
numpy.save("Embeddings_Full.npy", low_vectors)

# Vector Clustering and Reduction

In [None]:
from sklearn.preprocessing import normalize
import numpy as np
import faiss
import pandas as pd
import shutil


########################################################

# Load vectors
print("Loading vectors\n")
vectors = np.load("Embeddings_Full.npy")

# Normalize each row to unit L2 norm (sklearn's default) and make sure the
# result is C-contiguous float32, which is what FAISS expects.
print("Normalizing\n")
vectors = np.ascontiguousarray(normalize(vectors), dtype="float32")

# Aim for roughly ten vectors per cluster, but never fewer than one cluster.
n_clusters = max(1, vectors.shape[0] // 10)

# Generate cluster labels
print("Using KMeans")
# NOTE(review): `res` is not passed to faiss.Kmeans below; kept only so the
# notebook still defines the name.
res = faiss.StandardGpuResources()
kmeans = faiss.Kmeans(vectors.shape[1], n_clusters,
                      niter=150, nredo=10, gpu=True, verbose=True)
kmeans.train(vectors)
# Nearest-centroid search (k=1) gives the cluster id of every vector.
_, cluster_assignments = kmeans.index.search(vectors, 1)
clusters = cluster_assignments.flatten()
centroids = kmeans.centroids

# Save cluster labels and centroids.
# (The previous version re-loaded both .npy files right before saving, which
# replaced the freshly computed results with stale ones and failed outright
# when the files did not exist yet.)
np.save("Clusters_Full.npy", clusters)
np.save("Centroids_Full.npy", centroids)


# Append cluster labels to lexicon and save
print("\nWriting to file")
# NOTE(review): the embeddings were generated from the 3_Million lexicon but
# this reads the 4_Million one; the assignment below raises if the row counts
# differ. Confirm which lexicon is intended.
Lexicon = pd.read_csv('/content/drive/MyDrive/DSRsquared/4_Million/Lexicon_4_Mill.csv', header=None)
Lexicon['Clusters'] = clusters
Lexicon.to_csv("Lexicon_Full_Barrels.csv", header=None, index=False)


# Visualization

## TSNE

In [None]:
#from cuml.manifold import TSNE as cumlTSNE
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize
import faiss
import numpy as np

vectors = np.load("/content/drive/MyDrive/DSRsquared/1_Million/Embeddings_1.npy")
# Unit-normalize rows and hand FAISS the C-contiguous float32 array it expects.
vectors = np.ascontiguousarray(normalize(vectors), dtype="float32")

# Reduce dimensions with PCA
print("Reducing dimensions with PCA\n")
pca = faiss.PCAMatrix(vectors.shape[1], 20)
pca.train(vectors)
low_vectors = pca.apply_py(vectors)


print("Reducing dimensionality to 2D for visualization\n")
# `max_iter` is used here to match the 3D call below; the older spelling
# `n_iter` was renamed in recent scikit-learn releases.
tsne = TSNE(n_components=2, random_state=42, perplexity=200,
            learning_rate=1000,
            metric='cosine', init='pca',
            early_exaggeration=30,
            max_iter=1000)
# fit_transform returns the embedded coordinates; fit() alone returns the
# estimator, which is not what should be saved and plotted.
vectors_2d = tsne.fit_transform(low_vectors)
np.save("vectors_2D_tSNE.npy", vectors_2d)

# Reduce dimensionality for 3D visualization
print("Reducing dimensionality to 3D for visualization\n")
tsne = TSNE(n_components=3, random_state=42, perplexity=50,
            learning_rate=400, max_iter=700,
            metric='cosine', init='pca', method='barnes_hut',
            n_jobs=-1, n_iter_without_progress=30,
            verbose=2)
vectors_3d = tsne.fit_transform(low_vectors)
np.save("vectors_3D_tSNE.npy", vectors_3d)



## 3D Point Cloud

In [None]:
import pandas
import plotly.express as px
import numpy

# header=None: the 2D cell reads this same file with header=None, and the
# barrel lexicon is written without a header row, so without it the first word
# would be consumed as a column name and every hover label would shift by one.
Lexicon = pandas.read_csv("/content/drive/MyDrive/DSRsquared/1_Million/Lexicon_1_Barrels.csv",
                          usecols=[0], header=None)
words = Lexicon.iloc[:, 0]


# NOTE(review): this file is read with pandas' default header handling (first
# row treated as column names). Confirm Clusters_1_Barrels.csv really has a
# header row; otherwise the first cluster label is lost.
Cluster_Frame = pandas.read_csv("/content/drive/MyDrive/DSRsquared/1_Million/Clusters_1_Barrels.csv",
                          usecols=[3])
clusters = Cluster_Frame.iloc[:, 0]
vectors_3d = numpy.load("vectors_3D_tSNE.npy")

# Create a DataFrame for visualization with hover text.
# Uses the `pandas` name imported by this cell (the cell never imports `pd`).
df = pandas.DataFrame({
    'x': vectors_3d[:, 0],
    'y': vectors_3d[:, 1],
    'z': vectors_3d[:, 2],
    'Cluster': clusters,
    'Word': words  # Add words for hover text
})

# Plot interactive 3D scatter, colored by cluster id
fig = px.scatter_3d(
    df,
    x='x', y='y', z='z',
    color='Cluster',
    color_continuous_scale=px.colors.qualitative.Set3,
    title="3D Visualization of Clusters",
    opacity=0.7,
    hover_name='Word'  # Use 'Word' column for hover text
)
fig.update_traces(marker=dict(size=3))

# Render inline and also export a standalone HTML copy.
fig.show()
fig.write_html("3D_Cloud.html")

## 2D

In [None]:
import pandas as pd
import plotly.express as px
import numpy

# Both the words (column 0) and the cluster ids (column 3) come from the same
# header-less barrel lexicon file.
LEXICON_BARRELS_CSV = "/content/drive/MyDrive/DSRsquared/1_Million/Lexicon_1_Barrels.csv"

Lexicon = pd.read_csv(LEXICON_BARRELS_CSV, header=None, usecols=[0])
words = Lexicon.iloc[:, 0]

Cluster_Frame = pd.read_csv(LEXICON_BARRELS_CSV, header=None, usecols=[3])
clusters = Cluster_Frame.iloc[:, 0]

# 2D t-SNE coordinates saved by the TSNE cell.
vectors_2d = numpy.load("vectors_2D_tSNE.npy")

# Assemble the plotting table: one row per word with its 2D position,
# cluster id and the word itself (used as hover text).
x_coords = vectors_2d[:, 0]
y_coords = vectors_2d[:, 1]
df_2d = pd.DataFrame({
    'x': x_coords,
    'y': y_coords,
    'Cluster': clusters,
    'Word': words,
})

# Interactive 2D scatter, colored by cluster id.
scatter_options = {
    'x': 'x',
    'y': 'y',
    'color': 'Cluster',
    'color_continuous_scale': px.colors.qualitative.Set3,
    'title': "2D Visualization of Clusters",
    'hover_name': 'Word',
    'opacity': 0.7,
}
fig_2d = px.scatter(df_2d, **scatter_options)
fig_2d.update_traces(marker={'size': 5})

# Render inline and also export a standalone HTML copy.
fig_2d.show()
fig_2d.write_html("2D_Map.html")


# Barreling

In [None]:
import pandas as pd
import os
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import threading
from collections import defaultdict
import ijson
import logging
from typing import Dict, Any, Iterator
import queue

# Configure the root logger to emit records at INFO level and above.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module; used by BarrelIndexer below.
logger = logging.getLogger(__name__)

class BarrelIndexer:
    """Split an inverted index into per-cluster "barrel" JSON files.

    Entries are streamed from ``inverted_index_file``, grouped into chunks of
    ``chunk_size`` entries, and each chunk is handed to a worker thread that
    looks up the barrel (cluster id) of every entry and merges it into
    ``<output_dir>/<barrel>.json``.
    """

    def __init__(self, cluster_file: str, inverted_index_file: str, output_dir: str, chunk_size: int = 10000):
        """Store the configuration; no file is touched until ``process()``.

        Args:
            cluster_file: CSV holding the ``ID`` -> ``Cluster`` assignment.
            inverted_index_file: JSON file with the inverted index.
            output_dir: Directory that receives one ``<barrel>.json`` per barrel.
            chunk_size: Number of index entries handed to a worker at once.
        """
        self.cluster_file = cluster_file
        self.inverted_index_file = inverted_index_file
        self.output_dir = output_dir
        self.chunk_size = chunk_size
        # One lock per barrel id, created lazily on first access.
        self.barrel_locks = defaultdict(threading.Lock)
        # NOTE(review): barrel_buffers and buffer_size are not used by any
        # method of this class; kept so the attributes stay available.
        self.barrel_buffers = defaultdict(dict)
        self.buffer_size = 1000  # Number of entries to buffer before writing

    def load_clusters(self) -> Dict[int, int]:
        """Return a mapping of entry ``ID`` to ``Cluster`` read from the cluster CSV.

        Only the CSV's second and third columns are loaded, and they must be
        named ``ID`` and ``Cluster`` in the header row.
        """
        df = pd.read_csv(self.cluster_file, usecols=[1, 2])
        return dict(zip(df['ID'], df['Cluster']))

    def stream_inverted_index(self) -> Iterator[tuple]:
        """Yield ``(int(key), value)`` pairs from the inverted index file.

        The file is parsed incrementally with ``ijson``. A pair is produced
        for every value found under a prefix ending in ``.value`` that follows
        a prefix ending in ``.key``.
        NOTE(review): this presumably targets records shaped like
        ``{"key": ..., "value": ...}`` - confirm against the real file.

        Scalar values are yielded as parsed. Array/object values are rebuilt
        in full before being yielded; previously only the container's start
        event (whose value is ``None``) was seen, so such entries came out as
        ``(key, None)``.
        """
        with open(self.inverted_index_file, 'rb') as file:
            current_key = None
            builder = None  # ijson.ObjectBuilder while inside a container value
            depth = 0       # open containers inside the value being rebuilt

            for prefix, event, value in ijson.parse(file):
                if builder is not None:
                    # Inside a container value: feed every event to the
                    # builder until the outermost container closes.
                    builder.event(event, value)
                    if event in ('start_map', 'start_array'):
                        depth += 1
                    elif event in ('end_map', 'end_array'):
                        depth -= 1
                        if depth == 0:
                            yield (int(current_key), builder.value)
                            current_key = None
                            builder = None
                    continue

                if prefix.endswith('.key'):
                    current_key = value
                elif prefix.endswith('.value') and current_key is not None:
                    if event in ('start_map', 'start_array'):
                        builder = ijson.ObjectBuilder()
                        builder.event(event, value)
                        depth = 1
                    else:
                        yield (int(current_key), value)
                        current_key = None

    def write_barrel_chunk(self, barrel: int, data: Dict[str, Any]):
        """Merge ``data`` into the JSON file of ``barrel`` under that barrel's lock.

        The existing file (if present and non-empty) is loaded, updated with
        ``data`` and written back in compact form. The new content goes to a
        temporary sibling file that then replaces the barrel file, so an
        interrupted write cannot leave a truncated barrel behind.

        Raises:
            Exception: any error while reading or writing is logged and re-raised.
        """
        barrel_file = os.path.join(self.output_dir, f"{barrel}.json")

        with self.barrel_locks[barrel]:
            try:
                existing_data = {}
                if os.path.exists(barrel_file) and os.path.getsize(barrel_file) > 0:
                    with open(barrel_file, 'r') as f:
                        existing_data = json.load(f)

                existing_data.update(data)

                # The per-barrel lock guarantees a single writer, so a fixed
                # temporary name next to the target is safe.
                tmp_file = f"{barrel_file}.tmp"
                with open(tmp_file, 'w') as f:
                    json.dump(existing_data, f, separators=(',', ':'))
                os.replace(tmp_file, barrel_file)

            except Exception as e:
                logger.error(f"Error writing to barrel {barrel}: {str(e)}")
                raise

    def process_chunk(self, chunk: Dict[str, Any], barrels: Dict[int, int]):
        """Route every entry of ``chunk`` to its barrel and write the groups out.

        Args:
            chunk: Index entries keyed by the entry id as a string.
            barrels: Mapping of integer entry id to barrel id.

        Entries whose id has no barrel are skipped with a warning.
        """
        barrel_chunks = defaultdict(dict)

        for index, value in chunk.items():
            index_int = int(index)
            barrel = barrels.get(index_int)

            if barrel is None:
                logger.warning(f"No barrel found for index {index_int}")
                continue

            barrel_chunks[barrel][index] = value

        # One write per barrel touched by this chunk.
        for barrel, data in barrel_chunks.items():
            self.write_barrel_chunk(barrel, data)

    def process(self, max_workers: int = 4):
        """Stream the inverted index and distribute it into barrel files.

        Creates ``output_dir`` if needed, loads the cluster assignments, then
        submits chunks of ``chunk_size`` entries to a thread pool of
        ``max_workers`` threads. Exceptions raised by a worker surface here
        when its future's result is collected.
        """
        os.makedirs(self.output_dir, exist_ok=True)

        # Load cluster assignments
        logger.info("Loading cluster assignments...")
        barrels = self.load_clusters()

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            current_chunk = {}
            futures = []

            # Process the inverted index in streaming fashion
            for index, value in tqdm(self.stream_inverted_index(), desc="Processing entries"):
                current_chunk[str(index)] = value

                if len(current_chunk) >= self.chunk_size:
                    # Hand the full chunk over and start a fresh dict so the
                    # worker never sees later additions.
                    chunk_to_process = current_chunk
                    current_chunk = {}
                    futures.append(
                        executor.submit(self.process_chunk, chunk_to_process, barrels)
                    )

            # Process any remaining entries
            if current_chunk:
                futures.append(
                    executor.submit(self.process_chunk, current_chunk, barrels)
                )

            # Wait for all tasks to complete; result() re-raises worker errors.
            for future in tqdm(futures, desc="Waiting for tasks"):
                future.result()

def main():
    """Run the barreling job with the default local file layout.

    Reads ``Barrels/Clusters_KMEANS.csv`` and ``Inverted_index.json`` relative
    to the current directory and writes the barrels to
    ``Barrels/Index_Barrels``.
    """
    # os.path.join yields the same ".\\Barrels\\..." strings on Windows while
    # also resolving correctly on POSIX systems, where a backslash is an
    # ordinary filename character rather than a separator.
    barrels_dir = os.path.join(".", "Barrels")
    indexer = BarrelIndexer(
        cluster_file=os.path.join(barrels_dir, "Clusters_KMEANS.csv"),
        inverted_index_file="Inverted_index.json",
        output_dir=os.path.join(barrels_dir, "Index_Barrels"),
        chunk_size=10000
    )
    indexer.process(max_workers=4)

# Run the barreling job only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()