# Vector Matrix Generation

In [None]:
import pandas
import spacy
import numpy
from tqdm import tqdm

# --- Configuration: input lexicon and spaCy model name ----------------
lexi = "LexiconFinal.csv"
spacy_model = "en_core_web_lg"

# --- Load the spaCy pipeline and the lexicon --------------------------
# Only CSV columns 1 and 2 are read; the first of those is treated as
# the word column below.
nlp = spacy.load(spacy_model)
Lexicon = pandas.read_csv(lexi, usecols=[1, 2])
words = Lexicon.iloc[:, 0]

# --- Embed every lexicon entry ----------------------------------------
# str() guards against non-string cells (pandas may hand back NaN or
# numbers), so spaCy always receives text.
vectors = [
    nlp(str(entry)).vector
    for entry in tqdm(words, desc = "Loading vectors")
]

# Stack the per-word vectors into one float32 array and persist it so
# the following cells can reload it without re-running spaCy.
vectors = numpy.array(vectors).astype(numpy.float32)
numpy.save("vectors.npy", vectors)

# Vector Clustering and Reduction

In [None]:
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.preprocessing import normalize
import numpy as np
import faiss
import pandas

# --- Configuration ----------------------------------------------------
lexicon_csv = "LexiconFinal.csv"      # source lexicon (columns 1 and 2 are used)
new_lexi = "Clusters_KMEANS.csv"      # lexicon + cluster labels output
PCA_DIMS = 50                         # dimensionality after PCA
N_CLUSTERS = 400                      # number of K-Means clusters
SEED = 42                             # shared random_state for reproducibility

########################################################

# Load the vectors saved by the vector-generation cell.
vectors = np.load("vectors.npy")

# L2-normalize each row (sklearn's default norm).
print("Normalizing\n")
vectors = normalize(vectors)

# Reduce dimensions with PCA.
# FAISS expects C-contiguous float32 input, and the input dimension is
# taken from the data rather than hard-coded so other models also work.
print("Reducing dimensions with PCA\n")
vectors = np.ascontiguousarray(vectors, dtype=np.float32)
pca = faiss.PCAMatrix(vectors.shape[1], PCA_DIMS)
pca.train(vectors)
vectors = pca.apply_py(vectors)

# Generate cluster labels.
print("Using KMEANS\n")
algo = KMeans(init='k-means++', n_clusters=N_CLUSTERS, random_state=SEED)
clusters = algo.fit_predict(vectors)

# Save cluster labels. This cell imports numpy as `np`, so use that
# alias instead of relying on a `numpy` name leaked from another cell.
np.save("clusters.npy", clusters)

# Append cluster labels to the lexicon and save.
print("\nWriting to file")
Lexicon = pandas.read_csv(lexicon_csv, usecols=[1, 2])
if len(Lexicon) != len(clusters):
    # Fail with an explicit message if vectors.npy is stale relative to
    # the lexicon file.
    raise ValueError(
        f"Lexicon has {len(Lexicon)} rows but {len(clusters)} cluster labels "
        "were produced; regenerate vectors.npy from the current lexicon."
    )
Lexicon['Clusters'] = clusters
Lexicon.to_csv(new_lexi, header=['Word', 'ID', 'Cluster'], index=False)

# Reduce dimensionality for 3D visualization.
print("Reducing dimensionality to 3D for visualization\n")
tsne_3d = TSNE(n_components=3, random_state=SEED, perplexity=30, max_iter=1000)
vectors_3d = tsne_3d.fit_transform(vectors)
np.save("vectors_3D_tSNE.npy", vectors_3d)

# Reduce dimensionality for 2D visualization.
print("Reducing dimensionality to 2D for visualization\n")
tsne_2d = TSNE(n_components=2, random_state=SEED, perplexity=30, max_iter=500)
vectors_2d = tsne_2d.fit_transform(vectors)
np.save("vectors_2D_tSNE.npy", vectors_2d)


# Visualization

## 3D Point Cloud

In [None]:
import pandas as pd
import plotly.express as px

import numpy as np  # imported here so the cell does not depend on names leaked from other cells

# Words are used as hover labels; only CSV column 1 is needed.
Lexicon = pd.read_csv("LexiconFinal.csv", usecols=[1])
words = Lexicon.iloc[:, 0]

# Artifacts written by the clustering cell. The 3D embedding is saved
# there as "vectors_3D_tSNE.npy"; no cell writes "vectors_tSNE.npy".
clusters = np.load("clusters.npy")
vectors_3d = np.load("vectors_3D_tSNE.npy")

# One row per word: 3D coordinates, cluster label and hover text.
df = pd.DataFrame({
    'x': vectors_3d[:, 0],
    'y': vectors_3d[:, 1],
    'z': vectors_3d[:, 2],
    'Cluster': clusters,
    'Word': words  # Word on hover
})

# NOTE(review): Set3 is a qualitative palette passed as a continuous
# scale; presumably intentional given the many clusters — confirm.
fig = px.scatter_3d(
    df,
    x='x', y='y', z='z',
    color='Cluster',
    color_continuous_scale=px.colors.qualitative.Set3,
    title="3D Visualization of Clusters",
    opacity=0.7,
    hover_name='Word'  # Use 'Word' column for hover text
)
fig.update_traces(marker=dict(size=3))

fig.show()


## 2D Scatter Plot

In [None]:
import pandas as pd
import plotly.express as px

import numpy as np  # imported here so the cell does not depend on names leaked from other cells

# Words are used as hover labels; only CSV column 1 is needed.
Lexicon = pd.read_csv("LexiconFinal.csv", usecols=[1])
words = Lexicon.iloc[:, 0]

# Artifacts written by the clustering cell. Load the dedicated 2D t-SNE
# embedding ("vectors_2D_tSNE.npy") rather than slicing two columns out
# of a 3D one; no cell writes "vectors_tSNE.npy".
clusters = np.load("clusters.npy")
vectors_2d = np.load("vectors_2D_tSNE.npy")

# One row per word: 2D coordinates, cluster label and hover text.
df_2d = pd.DataFrame({
    'x': vectors_2d[:, 0],  # first dimension
    'y': vectors_2d[:, 1],  # second dimension
    'Cluster': clusters,
    'Word': words  # Word on hover
})

# Plot interactive 2D scatter.
# NOTE(review): Set3 is a qualitative palette passed as a continuous
# scale; presumably intentional given the many clusters — confirm.
fig_2d = px.scatter(
    df_2d,
    x='x', y='y',
    color='Cluster',
    color_continuous_scale=px.colors.qualitative.Set3,
    title="2D Visualization of Clusters",
    hover_name='Word',  # Use 'Word' column for hover text
    opacity=0.7
)
fig_2d.update_traces(marker=dict(size=5))

fig_2d.show()

# Barreling

In [None]:
import pandas as pd
import os
import json
from tqdm import tqdm

# Build paths with os.path.join so the cell is not tied to Windows
# separators; on Windows these resolve to the same ".\Barrels\..." paths.
barrels_dir = os.path.join(".", "Barrels")
index_barrels_dir = os.path.join(barrels_dir, "Index_Barrels")

# Map each word ID to its cluster ("barrel"); CSV columns 1 and 2 are
# read and accessed below as 'ID' and 'Cluster'.
df = pd.read_csv(os.path.join(barrels_dir, "Clusters_KMEANS.csv"), usecols=[1, 2])
Barrels = dict(zip(df['ID'], df['Cluster']))

with open("Inverted_index.json", "r") as index_file:
    inverted_index = json.load(index_file)

# Pass 1: group index entries by barrel in memory. Rewriting a barrel
# file once per entry re-reads and re-serializes it every time, which is
# quadratic in the barrel size.
pending = {}
for index_str, value in tqdm(inverted_index.items(), desc="Index: "):
    index = int(index_str)
    # Get the barrel (cluster) for this index
    barrel = Barrels.get(index)
    if barrel is None:
        print(f"Warning: No barrel found for index {index}")
        continue
    pending.setdefault(barrel, {})[index_str] = value

# Pass 2: merge each group into its barrel file with one read and one write.
os.makedirs(index_barrels_dir, exist_ok=True)
for barrel, entries in tqdm(pending.items(), desc="Barrels: "):
    barrel_file = os.path.join(index_barrels_dir, f"{barrel}.json")

    data = {}
    if os.path.exists(barrel_file):
        with open(barrel_file, 'r') as existing:
            try:
                data = json.load(existing)
            except json.JSONDecodeError:
                # Unreadable barrel: start over from an empty mapping.
                data = {}

    # New entries overwrite existing keys, as data[index_str] = value did.
    data.update(entries)

    # Opening with 'w' truncates first, so no stale bytes can remain
    # after the new JSON (the old decode-error path did not truncate).
    with open(barrel_file, 'w') as out:
        json.dump(data, out, separators=(',',':'))