In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sentence_transformers import SentenceTransformer
import os
import fiftyone as fo

# Step 1: Read the CSV File
csv_path = "voxel51_content.csv"  # Replace with your CSV file path
df = pd.read_csv(csv_path)

# Assuming the content is in a column named 'content'
texts = df['Body Text'].tolist()

In [20]:
df.columns

Index(['URL', 'Title', 'Meta Description', 'Publish Date', 'Date Modified',
       'H2 Headers', 'Body Text', 'Number of Links', 'Link URLs'],
      dtype='object')

In [24]:
import os
import pandas as pd
import numpy as np
from gensim.models import Word2Vec

# Ensure the directory 'content_images' exists
os.makedirs('content_images', exist_ok=True)

# Define the SVG parameters
SVG_WIDTH = 800  # Increased width for better visibility
PIXELS_PER_ROW = 100  # Number of pixels per row
PIXEL_SIZE = 8  # Size of each pixel
PIXEL_SPACING = 1  # Space between pixels

# Function to clean the text data and handle NaN or non-string entries
def preprocess_text(corpus):
    # Convert non-string entries to empty strings
    return corpus.fillna("").astype(str)

# Train or load a Word2Vec model
def train_word2vec(corpus):
    sentences = [text.split() for text in corpus if isinstance(text, str) and text.strip() != ""]
    model = Word2Vec(sentences, vector_size=50, window=5, min_count=1, workers=4)
    return model

# Function to vectorize text using Word2Vec
def vectorize_text(text, model):
    # Ensure text is a string
    if not isinstance(text, str):
        text = ""
    words = text.split()  # Split text into words
    vectors = [model.wv[word] if word in model.wv else np.zeros(model.vector_size) for word in words]  # Get word vectors
    return np.array(vectors)

# Function to generate SVG with multiple rows of pixels
def create_svg_from_vector(vectors, row_index):
    num_words = len(vectors)
    print(num_words)
    
    if num_words == 0:
        print(f"Row {row_index} has no words, skipping...")
        return None
    
    # Calculate layout
    total_width = SVG_WIDTH
    pixels_per_row = min(PIXELS_PER_ROW, num_words)
    num_rows = int(np.ceil(num_words / pixels_per_row))
    
    # Calculate total height based on number of rows
    total_height = (PIXEL_SIZE + PIXEL_SPACING) * num_rows - PIXEL_SPACING
    
    # SVG template with viewBox for better scaling
    svg_template = '''<svg width="{width}" height="{height}" viewBox="0 0 {width} {height}" 
                     xmlns="http://www.w3.org/2000/svg" style="background-color: white">{rects}</svg>'''
    
    # Template for each pixel
    rect_template = '''<rect x="{x}" y="{y}" width="{size}" height="{size}" 
                      fill="rgb({r},{g},{b})" stroke="rgba(0,0,0,0.1)" stroke-width="0.5"/>'''
    
    rects = []
    for i, vector in enumerate(vectors):
        # Calculate position in grid
        row = i // pixels_per_row
        col = i % pixels_per_row
        
        # Calculate x and y coordinates
        x = col * (PIXEL_SIZE + PIXEL_SPACING)
        y = row * (PIXEL_SIZE + PIXEL_SPACING)
        
        # Map vector to RGB colors
        r = int(np.clip(vector[0] * 255, 0, 255))
        g = int(np.clip(vector[1] * 255, 0, 255))
        b = int(np.clip(vector[2] * 255, 0, 255))
        
        rects.append(rect_template.format(
            x=x,
            y=y,
            size=PIXEL_SIZE,
            r=r, g=g, b=b
        ))
    
    # Create final SVG content
    svg_content = svg_template.format(
        width=total_width,
        height=total_height,
        rects=''.join(rects)
    )
    
    return svg_content

# Function to save SVG
def save_svg(content, filename):
    if content:  # Only save if content is not None
        with open(filename, 'w') as f:
            f.write(content)

# Main processing code
def process_dataframe_to_svgs(df):
    # Preprocess the text data
    cleaned_data = preprocess_text(df['Body Text'])
    
    # Train a Word2Vec model on the dataframe's content
    word2vec_model = train_word2vec(cleaned_data)
    
    # Process each row in the DataFrame
    for index, row in df.iterrows():
        text = row['Body Text']
        vectors = vectorize_text(text, word2vec_model)
        svg_content = create_svg_from_vector(vectors, index)
        
        if svg_content:  # Only save if svg_content is generated
            file_path = os.path.join('content_images', f'row_{index}.svg')
            save_svg(svg_content, file_path)

    print("SVG files created and saved in 'content_images' folder.")

# Read and process the CSV file
csv_path = "voxel51_content.csv"  # Replace with your CSV file path
df = pd.read_csv(csv_path)
process_dataframe_to_svgs(df)

2666
1748
1727
2267
2488
4334
1348
1768
1170
1380
1824
2792
3956
1651
1512
1566
1539
1386
2312
1512
2282
3326
1630
1651
1797
2299
3964
1577
1305
3563
1957
2574
4072
798
1616
1355
1592
2809
3021
1198
2178
2800
1510
2285
2380
3228
1306
2961
1729
1120
2653
1529
1946
1088
4185
1562
1704
1491
1964
1317
974
1566
2584
1643
1875
1400
2811
989
3617
1213
1612
1683
1903
2045
2422
1672
1830
1643
1431
1626
1302
1467
1201
1241
1716
1072
1454
1480
1091
2659
1228
1383
1522
2730
1283
1224
1332
2930
1847
1318
4719
1954
2572
1208
2400
4012
2041
1116
2216
2406
1346
2499
1680
1479
2200
1405
2755
1590
1680
1826
1938
1143
2894
1822
3613
2401
1374
1403
979
1006
2995
1235
1516
3967
3637
1528
3779
701
1857
1085
1632
1367
1358
1341
1885
1650
1389
2282
1065
3389
843
1305
3096
1789
1647
1015
1728
1384
1109
1582
1907
1024
1212
1171
1343
1163
1250
1187
1229
1203
1489
2069
1554
1735
1292
1800
1802
2659
1812
2304
2881
1499
1545
1597
1483
827
1731
1602
1368
936
1471
1815
2185
2925
1016
2557
2821
2895
2781
1185
3011
150

In [26]:
import os
import fiftyone as fo
import fiftyone.core.metadata as fom

# Define the path to the directory containing the SVG files
SVG_DIR = "content_images"

# Define dataset name
DATASET_NAME = "svg_word_representation"

# Delete the dataset if it already exists
if DATASET_NAME in fo.list_datasets():
    print(f"Deleting existing dataset '{DATASET_NAME}'")
    fo.delete_dataset(DATASET_NAME)

# Create a new FiftyOne dataset
dataset = fo.Dataset(DATASET_NAME)

# Function to add each SVG file as a sample to the dataset
def add_svg_to_dataset(dataset, svg_dir):
    # List all SVG files in the specified directory
    svg_files = [f for f in os.listdir(svg_dir) if f.endswith(".svg")]
    
    # Sort files by index for consistent ordering
    svg_files.sort(key=lambda x: int(x.split("_")[1].split(".")[0]))
    
    for svg_file in svg_files:
        # Get the full file path
        file_path = os.path.join(svg_dir, svg_file)
        
        # Create a sample with the SVG file path
        sample = fo.Sample(filepath=file_path)
        
        # Add metadata using ImageMetadata
        sample.metadata = fom.ImageMetadata()
        
        # Add filename and index as attributes
        sample["filename"] = svg_file
        sample["index"] = int(svg_file.split("_")[1].split(".")[0])
        
        # Add the sample to the dataset
        dataset.add_sample(sample)
    
    print(f"Added {len(svg_files)} SVG files to the dataset.")
    return len(svg_files)

try:
    # Add SVG files to the dataset
    num_samples = add_svg_to_dataset(dataset, SVG_DIR)
    
    if num_samples > 0:
        # Add dataset info
        dataset.info = {
            "num_samples": num_samples,
            "description": "Word vector visualization dataset using SVGs"
        }
        
        # Print dataset summary
        print("\nDataset Summary:")
        print(f"Name: {dataset.name}")
        print(f"Number of samples: {len(dataset)}")
        print("Fields:", dataset.get_field_schema())
        
        # Launch FiftyOne app to view the dataset
        session = fo.launch_app(dataset)
    else:
        print(f"No SVG files found in directory: {SVG_DIR}")
        
except Exception as e:
    print(f"Error creating dataset: {str(e)}")

Deleting existing dataset 'svg_word_representation'
Added 1069 SVG files to the dataset.

Dataset Summary:
Name: svg_word_representation
Number of samples: 1069
Fields: OrderedDict({'id': <fiftyone.core.fields.ObjectIdField object at 0x34e159040>, 'filepath': <fiftyone.core.fields.StringField object at 0x34cd7e570>, 'tags': <fiftyone.core.fields.ListField object at 0x34e10a4e0>, 'metadata': <fiftyone.core.fields.EmbeddedDocumentField object at 0x34e23c4d0>, 'created_at': <fiftyone.core.fields.DateTimeField object at 0x34e18e540>, 'last_modified_at': <fiftyone.core.fields.DateTimeField object at 0x34e12c5c0>, 'filename': <fiftyone.core.fields.StringField object at 0x34e106a50>, 'index': <fiftyone.core.fields.IntField object at 0x34e18f440>})



Welcome to

███████╗██╗███████╗████████╗██╗   ██╗ ██████╗ ███╗   ██╗███████╗
██╔════╝██║██╔════╝╚══██╔══╝╚██╗ ██╔╝██╔═══██╗████╗  ██║██╔════╝
█████╗  ██║█████╗     ██║    ╚████╔╝ ██║   ██║██╔██╗ ██║█████╗
██╔══╝  ██║██╔══╝     ██║     ╚██╔╝  ██║   ██║██║╚██╗██║██╔══╝
██║     ██║██║        ██║      ██║   ╚██████╔╝██║ ╚████║███████╗
╚═╝     ╚═╝╚═╝        ╚═╝      ╚═╝    ╚═════╝ ╚═╝  ╚═══╝╚══════╝ v1.0.0

If you're finding FiftyOne helpful, here's how you can get involved:

|
|  ⭐⭐⭐ Give the project a star on GitHub ⭐⭐⭐
|  https://github.com/voxel51/fiftyone
|
|  🚀🚀🚀 Join the FiftyOne Slack community 🚀🚀🚀
|  https://slack.voxel51.com
|

