## Creating Text Embeddings Using Word2Vec

In [1]:
import os
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import re

def preprocess_text(text):
    # Tokenizes the text into words
    tokens = word_tokenize(text.lower())  
        
    return tokens

def extract_text_from_srt(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    
    # Extracts text excluding lines with time stamps
    text_lines = []
    for line in lines:
        # Checks if the line matches the time format (00:00:00, -->, 00:00:00)
        if re.match(r'\d+:\d+:\d+, -->, \d+:\d+:\d+', line):
            continue  # Skip lines containing time stamps
        text_lines.append(line.strip())
    
    text = ' '.join(text_lines)
    
    return text

directory = 'all_transcripts'

# Creating a directory to store embeddings
if not os.path.exists('embeddings'):
    os.makedirs('embeddings')

# Loops through .srt files and generate word2vec embeddings
for filename in os.listdir(directory):
    if filename.endswith(".srt"):
        file_path = os.path.join(directory, filename)
        text = extract_text_from_srt(file_path)
        processed_text = preprocess_text(text)
        
        # Train Word2Vec model
        model = Word2Vec([processed_text], vector_size=100, window=5, min_count=1, workers=4)
        
        # Save embeddings
        embeddings_path = os.path.join('embeddings', f'{os.path.splitext(filename)[0]}.model')
        model.save(embeddings_path)


In [2]:
from gensim.models import Word2Vec

# Loads the Word2Vec model for video 1
model_path = 'embeddings/video1.model'  # Change this path based on your directory structure
model = Word2Vec.load(model_path)

# Gets the embedding for a specific word (example: "suspect")
embedding = model.wv['suspect']
print("Embedding for 'suspect':", embedding)

# Or you can get most similar words to a given word
similar_words = model.wv.most_similar('suspect')
print("Words similar to 'suspect':", similar_words)


Embedding for 'suspect': [ 0.00818131  0.00580824 -0.00424017  0.00910195  0.00769412  0.00313075
 -0.00240161  0.00381971 -0.00099049 -0.00485661 -0.00670916  0.00830404
 -0.00955331  0.00539825  0.00505297  0.00882266 -0.00933695  0.00254394
  0.00378582 -0.00565148 -0.00495485  0.00286477 -0.00511677  0.00095515
 -0.00203343 -0.00645782  0.00937474 -0.00399442 -0.00294225  0.00947906
  0.00859515  0.00963951 -0.00119455  0.00068694  0.00202975 -0.00215625
  0.00879531  0.0043525  -0.00879458 -0.00951785 -0.00889645  0.00853545
 -0.00253983 -0.00122153 -0.00643749 -0.00843147  0.00144965 -0.00197893
 -0.00277333  0.00516376  0.00931681 -0.00972004 -0.0029123   0.00765464
  0.00110541 -0.00250866  0.00268543 -0.00037831  0.00502347 -0.00350073
  0.0084303   0.00189583 -0.00893094 -0.00314984 -0.00041834 -0.00335301
  0.00271622  0.00081034  0.00555072 -0.00184914  0.00267356  0.00760996
  0.00103814 -0.00320298  0.00222403 -0.00746069  0.00557095  0.00366809
 -0.00274969  0.00530919  

## Creating Database

In [5]:
import psycopg2

username = 'postgres'
password = 'yashshah'
# Connects to the database
conn = psycopg2.connect(
        host="localhost",
        port="5432",
        database="postgres",
        user=username,
        password=password
)
cur = conn.cursor()

# Creates the table
cur.execute("""
    CREATE TABLE video_embeddings (
        id SERIAL PRIMARY KEY,
        video_name VARCHAR(50),
        embedding FLOAT[]
    )
""")
conn.commit()
cur.close()
conn.close()


## Uploading Embeddings to the database

In [6]:
import psycopg2

username = 'postgres'
password = 'yashshah'
# Connects to the database
conn = psycopg2.connect(
        host="localhost",
        port="5432",
        database="postgres",
        user=username,
        password=password
)
cur = conn.cursor()

# Uploads embeddings to the database
directory = 'embeddings'
for filename in os.listdir(directory):
    if filename.endswith(".model"):
        video_name = os.path.splitext(filename)[0]
        model_path = os.path.join(directory, filename)
        model = Word2Vec.load(model_path)
        embedding = model.wv.vectors.tolist()[0]  
        cur.execute("INSERT INTO video_embeddings (video_name, embedding) VALUES (%s, %s)", (video_name, embedding))

conn.commit()
cur.close()
conn.close()


## Indexing Embedding to find similar videos based on keyword searches

In [19]:
import numpy as np
import psycopg2

username = 'postgres'
password = 'yashshah'

# Function to find similar videos to 'video1' 
def find_similar_videos_with_words(video_name, threshold=0.9, top_n=10):
    conn = psycopg2.connect(
        host="localhost",
        port="5432",
        database="postgres",
        user=username,
        password=password
    )
    cur = conn.cursor()

    # Gets the embedding for 'video1'
    cur.execute("SELECT embedding FROM video_embeddings WHERE video_name = %s", (video_name,))
    video1_embedding = cur.fetchone()[0]

    # Retrieves all video embeddings
    cur.execute("SELECT video_name, embedding FROM video_embeddings")
    results = cur.fetchall()

    similar_videos = []
    for vid_name, emb in results:
        similarity = np.dot(video1_embedding, emb) / (np.linalg.norm(video1_embedding) * np.linalg.norm(emb))
        if similarity >= threshold and vid_name != video_name:
            words = []
            for word, vec in word_embeddings.items():
                word_similarity = np.dot(video1_embedding, vec) / (np.linalg.norm(video1_embedding) * np.linalg.norm(vec))
                if word_similarity >= threshold:
                    words.append(word)
            similar_videos.append((vid_name, similarity, words))

    similar_videos.sort(key=lambda x: x[1], reverse=True)
    similar_videos = similar_videos[:top_n]

    cur.close()
    conn.close()
    return similar_videos

video1_embedding = np.random.rand(100)  

word_embeddings = {
    'harm': np.random.rand(100),
    'war': np.random.rand(100),
    'russia': np.random.rand(100),
    'gaza': np.random.rand(100),
    'missile': np.random.rand(100),
    'attack': np.random.rand(100)
}

similar_videos = find_similar_videos_with_words('video1', threshold=0.9, top_n=10)
for vid_name, similarity, words in similar_videos:
    print(f"Video: {vid_name}, Similarity: {similarity}")


Video: video41, Similarity: 0.9996295931707739
Video: video6, Similarity: 0.9996295931707739
Video: video34, Similarity: 0.9996189760492453
Video: video30, Similarity: 0.999533802685078
Video: video33, Similarity: 0.9994973895647266
Video: video22, Similarity: 0.9994948409997629
Video: video28, Similarity: 0.9994342522351356
Video: video8, Similarity: 0.9994081318591923
Video: video19, Similarity: 0.9993928421386967
Video: video37, Similarity: 0.9993806242739927


In [11]:
import numpy as np
import psycopg2

username = 'postgres'
password = 'yashshah'

def find_similar_videos_with_words(video_name, threshold=0.9, top_n=10):
    conn = psycopg2.connect(
        host="localhost",
        port="5432",
        database="postgres",
        user=username,
        password=password
    )
    cur = conn.cursor()

    # Get the embedding for 'video1'
    cur.execute("SELECT embedding FROM video_embeddings WHERE video_name = %s", (video_name,))
    video1_embedding = cur.fetchone()[0]

    # Retrieve all video embeddings
    cur.execute("SELECT video_name, embedding FROM video_embeddings")
    results = cur.fetchall()

    similar_videos = []
    for vid_name, emb in results:
        if vid_name != video_name:
            similarity = np.dot(video1_embedding, emb) / (np.linalg.norm(video1_embedding) * np.linalg.norm(emb))
            if similarity >= threshold:
                # Calculate words contributing to similarity
                words = []
                for word, vec in word_embeddings.items():
                    word_similarity = np.dot(emb, vec) / (np.linalg.norm(emb) * np.linalg.norm(vec))
                    if word_similarity >= threshold:
                        words.append(word)
                similar_videos.append((vid_name, similarity, words))

    similar_videos.sort(key=lambda x: x[1], reverse=True)
    similar_videos = similar_videos[:top_n]

    cur.close()
    conn.close()
    return similar_videos

video1_embedding = np.random.rand(100)

word_embeddings = {
    'harm': np.random.rand(100),
    'war': np.random.rand(100),
    'russia': np.random.rand(100),
    'gaza': np.random.rand(100),
    'shooting':np.random.rand(100)
}

similar_videos = find_similar_videos_with_words('video1', threshold=0.9, top_n=10)
for vid_name, similarity, words in similar_videos:
    print(f"Video: {vid_name}, Similarity: {similarity}, Contributing words: {', '.join(words)}")


Video: video41, Similarity: 0.9996295931707739, Contributing words: 
Video: video6, Similarity: 0.9996295931707739, Contributing words: 
Video: video34, Similarity: 0.9996189760492453, Contributing words: 
Video: video30, Similarity: 0.999533802685078, Contributing words: 
Video: video33, Similarity: 0.9994973895647266, Contributing words: 
Video: video22, Similarity: 0.9994948409997629, Contributing words: 
Video: video28, Similarity: 0.9994342522351356, Contributing words: 
Video: video8, Similarity: 0.9994081318591923, Contributing words: 
Video: video19, Similarity: 0.9993928421386967, Contributing words: 
Video: video37, Similarity: 0.9993806242739927, Contributing words: 


## I tried coding which words led to similarity but was not successful, so we cited some texts from those video transcripts down 

## Video 41

Meanwhile, local Texas law enforcement officials held another news conference  last hour. They say they've arrested more people 
in the shooting case but would not offer details.


## Video 6
He was arrested without incident and is charged with five counts of murder in the shooting deaths of his neighbors,  

## Video 34
And Ukraine says it shot down six missiles overnight that Russian President Vladimir Putin previously called "unstoppable."

## Video 30
Officials in South Korea and Japan say that North Korea has launched what appears to be two ballistic missiles. These were fired from north of the capital, Pyongyang, 

## Video 33
"There are damages of houses. We had to initiate a rescue mission — eight people were rescued and 

## Video 22
The Wagner group has led the Russian offensive in Bakhmut, a brutal battle where both sides have faced enormous losses. Bakhmut has little strategic 

## Video 28
It's been one month since Hamas militants attacked Israel, killing at least 1,400  people. Israel has responded with airstrikes. Palestinian health officials in Gaza say 10,000  

## Video 8
Russia continues to attack Ukraine with powerful missiles and drones. Ukraine continues to fend 

## Video 19
There are questions in Russia today about the whereabouts of President Vladimir Putin. He's  not been seen for two days following the brief failed mutiny by forces of the mercenary Wagner 

## Video 37
multiple people are feared dead and injured after mass shootings at a pair of locations in Lewiston Maine last
