In [35]:
import re
import string
import unicodedata

import numpy as np
import pandas as pd
import torch
from gensim.models import Word2Vec

# Function to clean words by removing special characters
def clean_word(word):
    """Normalise a name to lowercase ASCII letters only.

    The input is lowercased, accented letters are folded to their base
    letter (e.g. "é" -> "e"), and every remaining character that is not
    in a-z is dropped. That includes digits, spaces, apostrophes, dots
    and symbols.

    Args:
        word: The raw name as a string.

    Returns:
        The cleaned string; it may be empty if the input has no letters.
    """
    # NFKD splits an accented letter into its base letter plus a combining
    # mark. The mark is then removed by the regex below, so the letter itself
    # survives instead of being deleted outright.
    decomposed = unicodedata.normalize('NFKD', word.lower())
    return re.sub(r'[^a-z]', '', decomposed)

# Read the "name" column of the Pokémon dataset
data = pd.read_csv('dataset/pokemon.csv')["name"]
# Keep only the first 100 names and normalise each one with clean_word
words = [clean_word(raw_name) for raw_name in data.to_list()[:100]]

In [36]:
# Sanity check: number of cleaned names kept (the load cell slices to the first 100)
len(words)

100

In [37]:
# Tokenize names by characters: each name becomes a list of single-character tokens
character_sequences = [list(name) for name in words]

# Train a skip-gram (sg=1) Word2Vec model on the character sequences.
#   vector_size - number of embedding dimensions per character
#   window      - context size around each character
#   min_count=1 - ensures all characters are included, however rare
#   seed        - pinned explicitly (1 is gensim's documented default)
#   workers=1   - gensim only gives repeatable vectors with a single worker
#                 thread; reproducibility across interpreter launches also
#                 needs PYTHONHASHSEED to be fixed
model = Word2Vec(
    character_sequences,
    vector_size=50,
    window=2,
    min_count=1,
    sg=1,
    seed=1,
    workers=1,
)

# Get character embeddings as a plain {character: vector} mapping
char_vectors = {char: model.wv[char] for char in model.wv.index_to_key}

In [38]:
# To encode an entire Pokémon name, we can average the vectors of its characters
def encode_name(name):
    """Encode a name as the element-wise mean of its character embeddings.

    Characters that are not in the trained vocabulary (``model.wv``) are
    skipped.

    Args:
        name: The (cleaned) name, iterated character by character.

    Returns:
        A 1-D array of length ``model.wv.vector_size``. If the name is empty
        or contains no in-vocabulary characters, a zero vector is returned.
    """
    vectors = [model.wv[char] for char in name if char in model.wv]
    if not vectors:
        # np.mean of an empty list is a scalar NaN (with a RuntimeWarning),
        # which would make the stacked embedding matrix ragged. Return a
        # zero vector of the embedding size instead.
        return np.zeros(model.wv.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

In [39]:
# Spot-check the encoder on one name; the result is the element-wise mean of its characters' vectors
encode_name('bulbasaur')

array([-5.6121368e-03,  3.2989557e-03,  8.2296063e-04,  7.8265909e-03,
        6.7323605e-03, -4.4042585e-04,  1.1334162e-02, -4.7891103e-03,
       -1.2259428e-02,  2.5233072e-03,  4.9297339e-03, -5.4915515e-03,
        6.8668667e-03, -9.4687077e-04,  3.6649678e-03,  2.6342971e-03,
        5.2121873e-03,  1.0754899e-02, -1.0770697e-02, -9.5675327e-03,
        2.8032160e-03,  7.7195051e-03,  1.2828835e-02, -8.9899348e-03,
        7.4082590e-03,  4.4498486e-03,  5.5798155e-04, -2.6656906e-03,
       -3.6922325e-03, -5.4455134e-03,  1.0816392e-02, -2.5518707e-03,
        4.1083563e-03,  4.5883977e-03, -7.6799765e-03,  9.7528961e-04,
       -1.7536245e-04,  7.0633396e-04, -5.4692558e-04, -2.9629541e-03,
        5.8211624e-03, -5.4968830e-05,  7.8713417e-04, -3.0841453e-03,
        1.0449376e-02,  9.9222669e-03, -5.7527209e-03, -9.1513358e-03,
       -2.5399078e-03, -1.6530727e-03], dtype=float32)

In [40]:
# Build the embedding matrix: one averaged character vector per cleaned name
encoded_pokemon = np.array(list(map(encode_name, words)))

In [41]:
from sklearn.manifold import TSNE
import plotly.express as px

# Each row of encoded_pokemon is one name's averaged character embedding.
# When every name encodes to a vector the array is already 2-D, so this
# reshape leaves it unchanged; it only guarantees a (number of names, -1) layout.
flattened_encoded_pokemon = encoded_pokemon.reshape(len(words), -1)

# Show the matrix shape: (number of names, embedding size)
print(flattened_encoded_pokemon.shape)

# Project the embeddings down to 3 dimensions with t-SNE (seeded for repeatability)
tsne = TSNE(n_components=3, perplexity=3, random_state=42)
pokemon_tsne_3d = tsne.fit_transform(flattened_encoded_pokemon)

# Collect the 3-D coordinates in a DataFrame and attach the names for labeling
component_columns = ['Component 1', 'Component 2', 'Component 3']
df = pd.DataFrame(pokemon_tsne_3d, columns=component_columns).assign(**{'Pokémon': words})

# Interactive 3D scatter plot; each point is labeled with its Pokémon name
fig = px.scatter_3d(
    df,
    x='Component 1',
    y='Component 2',
    z='Component 3',
    text='Pokémon',
    title='3D t-SNE of Pokémon Name Embeddings',
    height=800,
)

fig.update_traces(marker_size=10)
fig.update_layout(
    scene={
        'xaxis_title': 'TSNE Component 1',
        'yaxis_title': 'TSNE Component 2',
        'zaxis_title': 'TSNE Component 3',
    }
)

fig.show()

(100, 50)


In [42]:
# Persist the 3-D coordinates and names as a CSV file, without the index column
tsne_csv_path = 'pokemon-3d-visualization/pokemon_tsne_data_word_2_vec.csv'
df.to_csv(tsne_csv_path, index=False)