# Vector Matrix Generation

In [None]:
import pandas
import spacy
import numpy
from tqdm import tqdm

#FILENAMES
lexi = "LexiconFinal.csv"
spacy_model = "en_core_web_lg"
########################################################
nlp = spacy.load(spacy_model)

# Only the 2nd and 3rd CSV columns are kept; the first of those two is
# treated as the word column below.
Lexicon = pandas.read_csv(lexi, usecols=[1, 2])

# Extract words
words = Lexicon.iloc[:, 0]

# NOTE(review): str() turns a missing cell (NaN) into the literal text "nan",
# so any entry pandas parsed as missing is embedded as the word "nan".
texts = (str(word) for word in words)

# Generate one document vector per word. nlp.pipe streams the texts through
# the pipeline in batches instead of invoking the whole pipeline once per
# word; results are yielded in input order, so rows stay aligned with Lexicon.
vectors = [
    doc.vector
    for doc in tqdm(nlp.pipe(texts), total=len(words), desc="Loading vectors")
]

# Stack into a single float32 matrix (one row per word) and persist it
vectors = numpy.asarray(vectors, dtype=numpy.float32)

numpy.save("vectors.npy", vectors)

# Vector Clustering

In [None]:
from sklearn.cluster import KMeans
from openTSNE import TSNE
from sklearn.preprocessing import normalize
from matplotlib import cm
import numpy as np
import faiss
import pandas as pd
from google.colab import files
import plotly.graph_objects as go
from scipy.spatial import Delaunay

new_lexi = "Clusters_KMEANS.csv"
pca_dims = 50       # dimensionality kept after PCA
n_clusters = 300    # number of K-Means clusters

########################################################

# Load vectors
vectors = np.load("vectors.npy")

# Normalize vectors (sklearn's default is row-wise L2 normalisation)
print("Normalizing\n")
vectors = normalize(vectors)

# Reduce dimensions with PCA
print("Reducing dimensions with PCA\n")
# faiss operates on C-contiguous float32 arrays; make that explicit rather
# than relying on the dtype/layout of whatever was saved to disk.
vectors = np.ascontiguousarray(vectors, dtype=np.float32)
# Take the input width from the data instead of hard-coding 300, so a
# different embedding model does not silently break the PCA set-up.
pca = faiss.PCAMatrix(vectors.shape[1], pca_dims)
pca.train(vectors)
vectors = pca.apply_py(vectors)

# Generate cluster labels
print("Using KMEANS\n")
algo = KMeans(init='k-means++', n_clusters=n_clusters, random_state=42)
clusters = algo.fit_predict(vectors)

# Append cluster labels to lexicon and save.
# Lexicon is the DataFrame loaded in the vector-generation cell; its row
# order must match the rows of vectors.npy for the labels to line up.
print("\nWriting to file")
Lexicon['Clusters'] = clusters
Lexicon.to_csv(new_lexi, header=['Word', 'ID', 'Cluster'], index=False)


# Reduce dimensionality for 3D visualization
print("Reducing dimensionality to 3D for visualization\n")

# Barnes-Hut is requested explicitly: openTSNE's FFT-interpolation gradient
# is only implemented for 1-D/2-D embeddings, so a 3-D embedding should not
# depend on which method the "auto" setting would pick.
tsne = TSNE(
    n_components=3,
    random_state=40,
    perplexity=30,
    n_iter=1000,
    negative_gradient_method="bh",
)
vectors_3d = tsne.fit(vectors)

# One row per word: 3-D t-SNE coordinates plus the K-Means cluster label.
# np.asarray gives pandas a plain ndarray view of the embedding object.
coords = np.asarray(vectors_3d)
df = pd.DataFrame({
    'x': coords[:, 0],
    'y': coords[:, 1],
    'z': coords[:, 2],
    'Cluster': clusters
})

# Create a figure
fig = go.Figure()

# Draw one translucent hull mesh per cluster
for cluster_id in df['Cluster'].unique():
    cluster_points = df[df['Cluster'] == cluster_id][['x', 'y', 'z']].to_numpy()

    # A 3-D triangulation needs at least 4 points; smaller clusters are skipped
    if len(cluster_points) < 4:
        continue

    tri = Delaunay(cluster_points)

    # In 3-D, tri.simplices holds tetrahedra (4 vertex indices each), which
    # Mesh3d cannot consume as triangles. tri.convex_hull holds the
    # triangular facets (3 vertex indices each) of the cluster's outer
    # surface, which is what the mesh should show.
    hull_faces = tri.convex_hull

    # Random colour per cluster; upper bound is exclusive, so 256 allows 255
    red, green, blue = (np.random.randint(0, 256) for _ in range(3))

    # Add the mesh for the current cluster
    fig.add_trace(go.Mesh3d(
        x=cluster_points[:, 0],
        y=cluster_points[:, 1],
        z=cluster_points[:, 2],
        i=hull_faces[:, 0],  # first vertex index of each hull triangle
        j=hull_faces[:, 1],  # second vertex index
        k=hull_faces[:, 2],  # third vertex index
        opacity=0.5,
        color=f'rgb({red}, {green}, {blue})',
        name=f'Cluster {cluster_id}'
    ))


# Show the plot
fig.show()


