<a href="https://colab.research.google.com/github/michaelwnau/ai_academy_notebooks/blob/main/tex2vecs2parqs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install pydrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# The Colab user is authenticated first; presumably this is what makes the
# application-default credentials requested below available — keep the order.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand PyDrive the application-default credentials directly.
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the PyDrive client used by the final cell to upload output.parquet.
drive = GoogleDrive(gauth)


In [None]:
# Load the transcript that the rest of the notebook chunks and embeds.
# The path lives under /content/drive, which only exists once Google Drive is
# mounted. No other visible cell mounts it, so mount here when it is missing;
# otherwise a fresh runtime fails with FileNotFoundError.
import os

# Aliased on purpose: the name `drive` is already bound to the PyDrive client.
from google.colab import drive as colab_drive

DRIVE_MOUNT_POINT = '/content/drive'
if not os.path.isdir(os.path.join(DRIVE_MOUNT_POINT, 'MyDrive')):
    colab_drive.mount(DRIVE_MOUNT_POINT)

file_path = '/content/drive/MyDrive/corpus-zeta/2023/oit-videos-bios/brad-houston-bios-talk.txt'
# Explicit encoding so decoding does not depend on the runtime's locale default.
with open(file_path, 'r', encoding='utf-8') as file:
    file_content = file.read()


In [None]:
def chunk_text(text, size=1000):
    """Split ``text`` into chunks of at most ``size`` whitespace-separated tokens.

    Tokens are produced by ``str.split()``, so any run of whitespace
    (spaces, tabs, newlines) is a separator and is replaced by a single
    space when a chunk is re-joined.

    Args:
        text: The string to split.
        size: Maximum number of tokens per chunk; must be at least 1.

    Returns:
        A list of chunk strings in original order. The last chunk may hold
        fewer than ``size`` tokens. Empty or whitespace-only ``text`` yields
        an empty list.

    Raises:
        ValueError: If ``size`` is less than 1. Without this check a negative
            size would silently return ``[]`` and drop all of the text.
    """
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")

    tokens = text.split()
    return [
        ' '.join(tokens[start:start + size])
        for start in range(0, len(tokens), size)
    ]

# Break the loaded transcript into chunks of up to 1000 tokens each.
chunked_texts = chunk_text(text=file_content, size=1000)


In [None]:
!pip install spacy
!python -m spacy download en_core_web_md

import spacy

# Load the spaCy pipeline downloaded above; `embed_text` reads this global.
nlp = spacy.load('en_core_web_md')

def embed_text(text):
    """Return the spaCy vector of every token in ``text``.

    The text is run through the module-level ``nlp`` pipeline and the
    ``vector`` attribute of each resulting token is collected in token
    order. The result is a list with one vector per token, not a single
    pooled vector for the whole text.
    """
    token_vectors = []
    for token in nlp(text):
        token_vectors.append(token.vector)
    return token_vectors

# One entry per chunk; each entry is that chunk's list of per-token vectors.
embedded_texts = list(map(embed_text, chunked_texts))


In [None]:
!pip install matplotlib scikit-learn numpy

import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE


In [None]:
# Collapse each chunk's per-token vectors into one mean vector so that every
# chunk becomes a single row for t-SNE. This array was never defined in the
# visible notebook, so the cell raised NameError on a fresh kernel.
averaged_embeddings_array = np.array(
    [np.mean(token_vectors, axis=0) for token_vectors in embedded_texts]
)

n_samples = averaged_embeddings_array.shape[0]
print(f"Number of samples: {n_samples}")

# With fewer than two chunks, n_samples // 2 is 0, which is not a usable
# perplexity; fail with a clear message instead of deep inside scikit-learn.
if n_samples < 2:
    raise ValueError(
        f"t-SNE needs at least 2 text chunks to project, got {n_samples}"
    )

# Half the sample count keeps perplexity below n_samples for small corpora.
adjusted_perplexity = n_samples // 2  # Using floor division
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`
# (deprecated in 1.5) — confirm against the runtime's version before upgrading.
tsne_model = TSNE(perplexity=adjusted_perplexity, n_components=2, init='pca', n_iter=3500, random_state=32)
low_dim_data = tsne_model.fit_transform(averaged_embeddings_array)


In [None]:
# Scatter the 2-D t-SNE coordinates, one point per text chunk.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(low_dim_data[:, 0], low_dim_data[:, 1])

# Label only the first 10 points, each with the first 30 characters of its
# chunk, so the figure stays readable.
for idx, chunk in enumerate(chunked_texts[:10]):
    ax.annotate(chunk[:30] + '...', (low_dim_data[idx, 0], low_dim_data[idx, 1]))

ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
ax.set_title('2D t-SNE representation of Text Embeddings')
plt.show()


In [None]:
!pip install pandas pyarrow

import pandas as pd

# Pair every chunk with its token-level embedding, one row per chunk.
df = pd.DataFrame(data={'text': chunked_texts, 'embedding': embedded_texts})

# Write the table to a Parquet file in the current working directory.
df.to_parquet(path='output.parquet')


In [None]:
# Upload the exported Parquet file to Google Drive through the PyDrive client.
# NOTE(review): only a title is supplied (no file id or parent folder), so
# re-running this cell presumably creates another file rather than replacing
# the earlier upload — confirm against PyDrive's behavior.
uploaded = drive.CreateFile({'title': 'output.parquet'})
# Attach the local file written by the export cell as the upload's content.
uploaded.SetContentFile('output.parquet')
uploaded.Upload()
