# Vector Matrix Generation

In [None]:
import pandas
import spacy
import numpy
from tqdm import tqdm

# Input lexicon and the spaCy pipeline that supplies the word vectors
lexi = "LexiconFinal.csv"
spacy_model = "en_core_web_lg"
########################################################
nlp = spacy.load(spacy_model)

# Only the 2nd and 3rd CSV columns are read; the first of them is used as
# the word column below.
Lexicon = pandas.read_csv(lexi, usecols=[1, 2])
words = Lexicon.iloc[:, 0]

# Embed every lexicon entry (str() guards against non-string cells) and stack
# the per-entry document vectors into one float32 matrix.
vectors = numpy.array(
    [nlp(str(entry)).vector for entry in tqdm(words, desc="Loading vectors")]
).astype(numpy.float32)

# Persist the matrix for the clustering step
numpy.save("vectors.npy", vectors)

# Vector Clustering and Reduction

In [None]:
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize
import numpy as np
import faiss
import pandas

# Output file: the lexicon with one cluster label appended per row
new_lexi = "Clusters_KMEANS.csv"
# Dimensionality kept after PCA, before clustering and t-SNE
pca_dims = 50

########################################################

# Load vectors
vectors = np.load("vectors.npy")

# Normalize vectors (sklearn's default: each row scaled to unit L2 norm)
print("Normalizing\n")
vectors = normalize(vectors)

# Reduce dimensions with PCA.
# The input width is taken from the loaded matrix instead of being hard-coded
# to 300, so the cell keeps working if the embedding model changes.
print("Reducing dimensions with PCA\n")
pca = faiss.PCAMatrix(vectors.shape[1], pca_dims)
pca.train(vectors)
vectors = pca.apply_py(vectors)

# Generate cluster labels (fixed seed so reruns give the same assignment)
print("Using KMEANS\n")
algo = KMeans(init='k-means++', n_clusters=400, random_state=42)
clusters = algo.fit_predict(vectors)

# Save cluster labels.
# This cell imports numpy as `np`; using that alias (rather than `numpy`)
# lets the cell run in a fresh kernel without the first cell's imports.
np.save("clusters.npy", clusters)

# Append cluster labels to lexicon and save
print("\nWriting to file")
Lexicon = pandas.read_csv('LexiconFinal.csv', usecols=[1, 2])
Lexicon['Clusters'] = clusters
Lexicon.to_csv(new_lexi, header=['Word', 'ID', 'Cluster'], index=False)

# Reduce dimensionality for 3D visualization
print("Reducing dimensionality to 3D for visualization\n")
tsne = TSNE(n_components=3, random_state=42, perplexity=30, max_iter=1000)
vectors_3d = tsne.fit_transform(vectors)
np.save("vectors_3D_tSNE.npy", vectors_3d)

# Reduce dimensionality for 2D visualization
print("Reducing dimensionality to 2D for visualization\n")
tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=500)
vectors_2d = tsne.fit_transform(vectors)
np.save("vectors_2D_tSNE.npy", vectors_2d)


# Visualization

## 3D Point Cloud

In [None]:
import pandas as pd
import plotly.express as px

import numpy as np  # imported here so the cell runs on its own in a fresh kernel

# Words are only needed as hover labels, so a single CSV column is read
Lexicon = pd.read_csv("LexiconFinal.csv", usecols=[1])
words = Lexicon.iloc[:, 0]

# Artifacts written by the clustering cell. The 3D embedding is saved there as
# "vectors_3D_tSNE.npy"; the previous name "vectors_tSNE.npy" is never written.
clusters = np.load("clusters.npy")
vectors_3d = np.load("vectors_3D_tSNE.npy")

# One row per word: t-SNE coordinates, cluster label and hover text
df = pd.DataFrame({
    'x': vectors_3d[:, 0],
    'y': vectors_3d[:, 1],
    'z': vectors_3d[:, 2],
    'Cluster': clusters,
    'Word': words  # Word on hover
})

fig = px.scatter_3d(
    df,
    x='x', y='y', z='z',
    color='Cluster',
    color_continuous_scale=px.colors.qualitative.Set3,
    title="3D Visualization of Clusters",
    opacity=0.7,
    hover_name='Word'  # Use 'Word' column for hover text
)
# Small markers keep the dense cloud readable
fig.update_traces(marker=dict(size=3))

fig.show()


## 2D Scatter Plot

In [None]:
import pandas as pd
import plotly.express as px

import numpy as np  # imported here so the cell runs on its own in a fresh kernel

# Words are only needed as hover labels, so a single CSV column is read
Lexicon = pd.read_csv("LexiconFinal.csv", usecols=[1])
words = Lexicon.iloc[:, 0]

# Artifacts written by the clustering cell. The 2D embedding is saved there as
# "vectors_2D_tSNE.npy"; the previous name "vectors_tSNE.npy" is never written.
clusters = np.load("clusters.npy")
vectors_2d = np.load("vectors_2D_tSNE.npy")

# One row per word: t-SNE coordinates, cluster label and hover text
df_2d = pd.DataFrame({
    'x': vectors_2d[:, 0],  # first dimension
    'y': vectors_2d[:, 1],  # second dimension
    'Cluster': clusters,
    'Word': words  # Word on hover
})

# Plot interactive 2D scatter
fig_2d = px.scatter(
    df_2d,
    x='x', y='y',
    color='Cluster',
    color_continuous_scale=px.colors.qualitative.Set3,
    title="2D Visualization of Clusters",
    hover_name='Word',  # Use 'Word' column for hover text
    opacity=0.7
)
fig_2d.update_traces(marker=dict(size=5))

fig_2d.show()

# Barreling

In [None]:
import pandas as pd
import os
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import threading
from collections import defaultdict
import ijson
import logging
from typing import Dict, Any, Iterator
import queue

# Configure the root logger so INFO-level (and more severe) records are emitted
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by BarrelIndexer's methods
logger = logging.getLogger(__name__)

class BarrelIndexer:
    """Split an inverted index into per-cluster "barrel" JSON files.

    Every index entry is routed to the barrel (cluster) that the cluster file
    assigns to its ID, and each barrel is stored as
    ``<output_dir>/<barrel>.json``.
    """

    def __init__(self, cluster_file: str, inverted_index_file: str, output_dir: str, chunk_size: int = 10000):
        """Store the configuration; no file is touched here.

        Args:
            cluster_file: CSV whose 2nd and 3rd columns are ``ID`` and ``Cluster``.
            inverted_index_file: JSON file streamed by ``stream_inverted_index``.
            output_dir: Directory that receives one JSON file per barrel.
            chunk_size: Number of index entries handed to a worker at once.
        """
        self.cluster_file = cluster_file
        self.inverted_index_file = inverted_index_file
        self.output_dir = output_dir
        self.chunk_size = chunk_size
        # One lock per barrel, created on first access, so that two workers
        # never read-modify-write the same barrel file concurrently.
        self.barrel_locks = defaultdict(threading.Lock)
        # The two attributes below are not used by any method of this class;
        # they are kept so existing readers of the instance keep working.
        self.barrel_buffers = defaultdict(dict)
        self.buffer_size = 1000  # Number of entries to buffer before writing

    def load_clusters(self) -> Dict[int, int]:
        """Return the ``ID -> Cluster`` mapping read from the cluster file.

        Only the 2nd and 3rd CSV columns are loaded; they must be named
        ``ID`` and ``Cluster``.
        """
        df = pd.read_csv(self.cluster_file, usecols=[1, 2])
        return dict(zip(df['ID'], df['Cluster']))

    def stream_inverted_index(self) -> Iterator[tuple]:
        """Yield ``(int(key), value)`` pairs while parsing the index incrementally.

        A pair is produced each time a parser event whose prefix ends in
        ``.value`` follows one whose prefix ends in ``.key``.

        Raises:
            ValueError: If a key cannot be converted with ``int()``.
        """
        # NOTE(review): only prefixes ending in ".key" / ".value" are matched,
        # which looks like it expects nested records with literal "key" and
        # "value" fields. A plain top-level {id: postings} object would
        # presumably yield nothing, and a container-valued "value" would
        # presumably arrive as a start event carrying None - confirm the
        # layout of the index file against the ijson event documentation.
        with open(self.inverted_index_file, 'rb') as file:
            current_key = None

            for prefix, _event, value in ijson.parse(file):
                if prefix.endswith('.key'):
                    current_key = value
                elif prefix.endswith('.value') and current_key is not None:
                    yield (int(current_key), value)
                    # Require a fresh key before the next value is accepted
                    current_key = None

    def write_barrel_chunk(self, barrel: int, data: Dict[str, Any]):
        """Merge ``data`` into the barrel's JSON file under the barrel's lock.

        Entries already stored in the barrel are kept; keys present in both
        are overwritten by ``data``.

        Args:
            barrel: Barrel identifier; the file is ``<output_dir>/<barrel>.json``.
            data: Mapping of index keys to their values.

        Raises:
            Exception: Any read, parse or write error is logged and re-raised.
        """
        barrel_file = os.path.join(self.output_dir, f"{barrel}.json")
        # The temp file lives next to the target so os.replace stays on one
        # filesystem; the per-barrel lock makes the name collision-free.
        tmp_file = f"{barrel_file}.tmp"

        with self.barrel_locks[barrel]:
            try:
                existing_data = {}
                if os.path.exists(barrel_file) and os.path.getsize(barrel_file) > 0:
                    with open(barrel_file, 'r', encoding='utf-8') as f:
                        existing_data = json.load(f)

                existing_data.update(data)

                # Dump to a temp file and swap it in afterwards: a failure
                # while serialising can then no longer truncate the barrel
                # and lose the entries merged by earlier chunks.
                with open(tmp_file, 'w', encoding='utf-8') as f:
                    json.dump(existing_data, f, separators=(',', ':'))
                os.replace(tmp_file, barrel_file)

            except Exception as e:
                logger.error("Error writing to barrel %s: %s", barrel, e)
                raise

    def process_chunk(self, chunk: Dict[str, Any], barrels: Dict[int, int]):
        """Group a chunk's entries by barrel and write each group once.

        Args:
            chunk: Index entries keyed by the string form of their ID.
            barrels: ``ID -> barrel`` mapping, as returned by ``load_clusters``.

        Entries whose ID has no barrel are skipped with a warning.
        """
        barrel_chunks = defaultdict(dict)

        for index, value in chunk.items():
            index_int = int(index)
            barrel = barrels.get(index_int)

            if barrel is None:
                logger.warning("No barrel found for index %s", index_int)
                continue

            barrel_chunks[barrel][index] = value

        # One write per barrel keeps the number of file rewrites per chunk low
        for barrel, data in barrel_chunks.items():
            self.write_barrel_chunk(barrel, data)

    def process(self, max_workers: int = 4):
        """Stream the index and distribute it into barrel files.

        Entries are collected into chunks of ``chunk_size`` and each chunk is
        submitted to a thread pool running ``process_chunk``.

        Args:
            max_workers: Size of the thread pool.

        Raises:
            Exception: The first error raised by a worker, re-raised when its
                result is collected.
        """
        os.makedirs(self.output_dir, exist_ok=True)

        # Load cluster assignments
        logger.info("Loading cluster assignments...")
        barrels = self.load_clusters()

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            current_chunk = {}
            futures = []

            # Process the inverted index in streaming fashion
            for index, value in tqdm(self.stream_inverted_index(), desc="Processing entries"):
                current_chunk[str(index)] = value

                if len(current_chunk) >= self.chunk_size:
                    # Hand the full chunk over and start a new dict, so the
                    # worker never sees later insertions.
                    chunk_to_process = current_chunk
                    current_chunk = {}
                    futures.append(
                        executor.submit(self.process_chunk, chunk_to_process, barrels)
                    )

            # Process any remaining entries
            if current_chunk:
                futures.append(
                    executor.submit(self.process_chunk, current_chunk, barrels)
                )

            # Wait for all tasks to complete; result() re-raises worker errors
            for future in tqdm(futures, desc="Waiting for tasks"):
                future.result()

def main():
    """Build the index barrels from the K-Means cluster assignments."""
    # os.path.join resolves to the same relative locations as the former
    # ".\\Barrels\\..." literals on Windows and also works on POSIX systems,
    # where a backslash is an ordinary filename character.
    barrels_dir = "Barrels"
    indexer = BarrelIndexer(
        cluster_file=os.path.join(barrels_dir, "Clusters_KMEANS.csv"),
        inverted_index_file="Inverted_index.json",
        output_dir=os.path.join(barrels_dir, "Index_Barrels"),
        chunk_size=10000
    )
    indexer.process(max_workers=4)


if __name__ == "__main__":
    main()