In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.metrics.pairwise import cosine_similarity
import torch
import sqlite3
import pandas as pd
import numpy as np
import json
import pickle
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np
from io import BytesIO

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Single source of truth for the checkpoint so the tokenizer and the model
# cannot drift apart.
MODEL_NAME = "meta-llama/Llama-2-7b-hf"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

Loading checkpoint shards: 100%|██████████████████| 2/2 [00:01<00:00,  1.23it/s]


In [3]:
# Make sure the `youtube_embeddings` table exists before anything reads it.
conn = sqlite3.connect('../db/sword_hackathon.db')
try:
    cursor = conn.cursor()

    # IF NOT EXISTS keeps this cell safe to re-run.
    cursor.execute("""
    CREATE TABLE IF NOT EXISTS youtube_embeddings (
        id INTEGER PRIMARY KEY,
        url TEXT,
        embeddings TEXT
    )
""")

    conn.commit()
finally:
    # Always release the database handle, even if the DDL above fails.
    conn.close()



In [4]:
# Load whatever has already been stored in `youtube_embeddings`.
conn = sqlite3.connect('../db/sword_hackathon.db')
try:
    df_existing = pd.read_sql_query("SELECT * FROM youtube_embeddings", conn)
finally:
    # Close the connection even when the query raises (e.g. missing table).
    conn.close()

# Display the DataFrame
df_existing

Unnamed: 0,id,url,embeddings


In [5]:
# Read every row of `youtube_summaries` into a DataFrame.
conn = sqlite3.connect('../db/sword_hackathon.db')
try:
    cur = conn.cursor()
    cur.execute("SELECT * FROM youtube_summaries")

    results = cur.fetchall()
    # cursor.description holds one tuple per column; item 0 is the column name.
    columns = [column[0] for column in cur.description]
finally:
    # Release the handle even if the query or fetch fails.
    conn.close()

# Reset the index after de-duplicating so row labels stay 0..n-1; later cells
# use the row index as a position into the embeddings array.
df = pd.DataFrame(results, columns=columns).drop_duplicates().reset_index(drop=True)
df.shape

(367, 3)

In [6]:
def calculate_embedding(text):
    """Return one fixed-size vector for `text` by mean-pooling the model output.

    The text is tokenized with the notebook-level `tokenizer`, run through the
    notebook-level `model` without gradient tracking, and the first element of
    the model output is averaged over the token axis (dim=1).

    NOTE(review): `model` is created with AutoModelForCausalLM in this
    notebook; for such models the first output is presumably the LM-head
    logits rather than hidden states, so the result would be a mean logit
    vector, not a hidden-state embedding -- confirm this is intended.

    Args:
        text: The string to embed.

    Returns:
        A 1-D numpy array holding the mean over tokens of the model output.
    """
    # Put the ids on the model's device so this also works when the model has
    # been moved off the CPU.
    input_ids = tokenizer.encode(text, return_tensors='pt').to(model.device)

    with torch.no_grad():
        token_outputs = model(input_ids)[0]

    # Average over the token axis, then drop only the batch axis.
    mean_embedding = torch.mean(token_outputs, dim=1).squeeze(0)

    # .numpy() only works on CPU tensors; .cpu() is a no-op when already there.
    return mean_embedding.cpu().numpy()

In [7]:
# The `summary` column is decoded twice: the first json.loads yields a string
# that is itself JSON, and the second yields the object holding 'Summary'.
all_documents = [
    json.loads(json.loads(raw_summary))['Summary']
    for raw_summary in df['summary']
]
len(all_documents)

367

In [10]:
# Embedding every document means one forward pass of the model per document,
# which is slow, so the result is cached on disk. Without this cell
# `embeddings` is undefined on a fresh kernel and every later cell fails.
EMBEDDINGS_CACHE = "embeddings_cache.npy"

try:
    embeddings = list(np.load(EMBEDDINGS_CACHE))
except FileNotFoundError:
    embeddings = None

# Recompute when there is no cache or it no longer matches the document count.
# NOTE: a cache with the right length but stale contents is not detected;
# delete the file to force a recompute.
if embeddings is None or len(embeddings) != len(all_documents):
    embeddings = [calculate_embedding(doc) for doc in all_documents]
    np.save(EMBEDDINGS_CACHE, np.array(embeddings))

In [11]:
# Sanity check on how many embedding vectors are in memory; presumably one per
# entry of `all_documents` -- TODO confirm.
len(embeddings)

367

In [12]:
# Stack the per-document vectors into a 2-D array, one row per document.
# The row count comes from the data instead of a hard-coded 367 so the cell
# keeps working when the number of summaries changes.
reshape_embeddings = np.array(embeddings).reshape(len(embeddings), -1)

In [None]:
# Pairwise cosine similarity between all rows of `reshape_embeddings`;
# entry [i, j] compares row i with row j.
similarity_matrix = cosine_similarity(reshape_embeddings)

In [None]:
# For each document, the indices of its 10 most similar *other* documents,
# ordered from least to most similar.
# Slicing [-11:-1] relied on the document itself always sorting last; with
# ties or rounding that is not guaranteed. Mask the diagonal instead so a
# document can never be its own neighbour.
_similarity_no_self = similarity_matrix.copy()
np.fill_diagonal(_similarity_no_self, -np.inf)
top_10_indices = np.argsort(_similarity_no_self, axis=1)[:, -10:]

In [None]:
# Project the embedding rows down to 2 components for plotting; random_state
# is fixed so the t-SNE run is seeded the same way each time.
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d = tsne.fit_transform(reshape_embeddings)

In [None]:
# Position (in `all_documents` order) of the document to highlight.
current_doc_index = 0

fig, ax = plt.subplots(figsize=(10, 10))

# Every document as a gray background point.
ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], color='gray')

# The highlighted document in red.
current_x, current_y = embeddings_2d[current_doc_index]
ax.scatter(current_x, current_y, color='red')

# Its 10 nearest neighbours (from `top_10_indices`) in blue, drawn in one call.
neighbour_points = embeddings_2d[top_10_indices[current_doc_index]]
ax.scatter(neighbour_points[:, 0], neighbour_points[:, 1], color='blue')

ax.set_title('t-SNE visualization of document embeddings')
plt.show()


In [None]:
# NOTE(review): `cur` was created from the connection that the loading cell
# above closes with conn.close(), so when the notebook is run top to bottom
# this call is issued on a closed database. The result is also not fetched or
# stored. Looks like a leftover scratch cell -- confirm and remove.
cur.execute("SELECT * FROM youtube_summaries")

In [None]:
# NOTE(review): the code below is disabled by wrapping it in a string literal,
# so nothing in it runs. If it is revived: it creates `cur` but calls
# `cursor.execute`, and `numpy_to_binary` is not defined in any visible cell.
"""
conn = sqlite3.connect('../db/sword_hackathon.db')

cur = conn.cursor()
# Serialize the numpy array
embeddings_binary = numpy_to_binary(embeddings)

# Insert into the SQLite database
cursor.execute("INSERT INTO youtube_embeddings (url, embeddings) VALUES (?, ?)", (url, embeddings_binary))

# Commit the changes
conn.commit()

conn.close()
"""


In [13]:
# Pair each URL with its embedding row and write the result to CSV.
#
# Rows are matched by position, not by DataFrame index label: after
# drop_duplicates() the labels can have gaps, which would select the wrong
# embedding row or run off the end of the array.
assert len(df) == len(reshape_embeddings), "expected one embedding row per document"

# The loop variable is deliberately not called `embeddings`, so the full list
# of embeddings built earlier is not overwritten by a single row.
data = [
    {
        'url': url,
        'embeddings': embedding_row.tolist(),  # numpy array -> plain list
    }
    for url, embedding_row in zip(df['url'], reshape_embeddings)
]

# Create DataFrame
embeddings_df = pd.DataFrame(data)
embeddings_df.to_csv("embeddings_text.csv", index=False)

In [None]:
# NOTE(review): `embeddings_str` is not assigned in any visible cell, so this
# would raise NameError on a fresh kernel -- presumably a leftover; confirm.
embeddings_str

In [14]:
# Display the url/embeddings frame that was written to embeddings_text.csv.
embeddings_df

Unnamed: 0,url,embeddings
0,https://www.youtube.com//watch?v=a03U45jFxOI&p...,"[-4.587231636047363, -6.214117050170898, 8.622..."
1,https://www.youtube.com//watch?v=nHj09xU40bM&p...,"[-4.361590385437012, -4.991323471069336, 8.537..."
2,https://www.youtube.com//watch?v=yMy21uS3owE&p...,"[-4.569097518920898, -4.948065280914307, 7.898..."
3,https://www.youtube.com//watch?v=4V9pPGrpN1E&p...,"[-4.503209590911865, -4.928098678588867, 8.940..."
4,https://www.youtube.com//watch?v=JKs-cRneTyE&p...,"[-3.7463438510894775, -5.734409332275391, 7.35..."
...,...,...
362,https://www.youtube.com//watch?v=S-88yo8VJL8&p...,"[-6.127265453338623, -6.4669365882873535, 5.16..."
363,https://www.youtube.com//watch?v=8kKdqbg1Byg&p...,"[-5.674411773681641, -4.4669060707092285, 7.92..."
364,https://www.youtube.com//watch?v=hStaW2PPF0c&p...,"[-11.90255355834961, -8.943065643310547, -1.03..."
365,https://www.youtube.com//watch?v=4HDJ_fBrso4&p...,"[-6.655236721038818, -7.787965297698975, 4.871..."
