In [None]:
import sqlite3
import pandas as pd

In [None]:
# Reading the Tables from Database file
# Opens the SQLite subtitle database and prints the names of the tables it holds.
# `conn` is left open on purpose: a later cell passes it to pd.read_sql_query.

conn = sqlite3.connect('eng_subtitles_database.db')
cursor = conn.cursor()
# sqlite_master is SQLite's schema catalog; type='table' keeps only table entries.
cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
print(cursor.fetchall())

In [None]:
# Reading the columns of Table
# PRAGMA table_info yields one row per column of 'zipfiles';
# position 1 of each row is the column's name, which is what gets printed.
cursor.execute("PRAGMA table_info('zipfiles')")
cols = cursor.fetchall()
for col in cols:
    print(col[1])

In [None]:
# Loading the Database Table inside a Pandas DataFrame
# Reads every row and column of 'zipfiles' into memory through the open connection.
df_raw = pd.read_sql_query("""SELECT * FROM zipfiles""", conn)
df_raw.head()

In [None]:
df_raw.info()

In [None]:
# Printing content of 0th Row
b_data = df_raw.iloc[0, 2]
print(b_data)

In [None]:
# Unzipping the content of 385th row and decoding using latin-1
import zipfile
import io

# Take the value at row position 385, column position 2 of the raw table.
# NOTE(review): column position 2 is presumably the 'content' blob used by later cells - confirm.
binary_data = df_raw.iloc[385, 2]

# Treat the bytes as an in-memory ZIP archive and decompress it with the zipfile module
with io.BytesIO(binary_data) as f:
    with zipfile.ZipFile(f, 'r') as zip_file:
        # Only the first member listed in the archive is read; any others are ignored
        subtitle_content = zip_file.read(zip_file.namelist()[0])

# 'subtitle_content' holds the raw bytes of that member; decode them to text for display
print(subtitle_content.decode('latin-1'))  # Assuming the content is latin-1 encoded text

In [None]:
# Applying the above Function on the Entire Data
import zipfile
import io

count = 0  # number of decode_method calls started so far (progress counter)

def decode_method(binary_data):
    """Unzip one subtitle archive held in memory and return its text.

    Args:
        binary_data: Bytes of a ZIP archive.

    Returns:
        The first member listed in the archive, decoded as latin-1 text.
        Any further members are ignored.

    Side effects:
        Increments the module-level ``count`` before the archive is opened,
        so calls that fail on a bad archive are counted too.
    """
    global count
    count = count + 1

    # Both context managers are released together once the member is read.
    with io.BytesIO(binary_data) as buffer, zipfile.ZipFile(buffer, 'r') as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)

    # latin-1 maps every byte to a character, so this decode cannot fail.
    return raw_bytes.decode('latin-1')

In [None]:
# Work on the first 24000 rows only. .copy() makes an independent frame, so the
# columns added to df_limited in later cells are not written into a slice of df_raw.
df_limited = df_raw.head(24000).copy()

In [None]:
# Unzip and decode each row's 'content' value into a new text column, 'file_content'.
df_limited['file_content'] = df_limited['content'].apply(decode_method)

df_limited.head()

In [None]:
df_limited.info()

In [None]:
print(df_limited.file_content[0])

In [None]:
import re
def clean_tokens_eachline(text):
    """Strip SRT bookkeeping from subtitle text while keeping its line breaks.

    Three removals are applied in order, then the result is stripped of
    leading and trailing whitespace:

    1. Each "HH:MM:SS,mmm --> HH:MM:SS,mmm" timing line, together with the
       newline before it and the whitespace after it.
    2. A single digit at the very start of the text, and every newline that
       is immediately followed by digits (the cue numbers). Digits that
       begin a dialogue line are removed by the same rule.
    3. The italic tags "<i>" and "</i>".

    Args:
        text: Raw subtitle file content.

    Returns:
        The cleaned text, still containing its internal line breaks.
    """
    removal_patterns = (
        r'\n\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\s+',
        r'^[\d]|\n\d+',
        r'<i>|</i>',
    )
    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.strip()

In [None]:
def clean_tokens_oneline(text):
    """Strip SRT bookkeeping from subtitle text and flatten CRLF breaks to spaces.

    The substitutions run in order, then the result is stripped of leading
    and trailing whitespace:

    1. Timing lines ("HH:MM:SS,mmm --> HH:MM:SS,mmm") with the newline before
       and the whitespace after them are removed.
    2. A single leading digit and every newline followed by digits (the cue
       numbers) are removed.
    3. "<i>" and "</i>" tags are removed.
    4. The carriage-return/line-feed sequences left between cues and between
       dialogue lines are replaced by a single space. Bare line feeds are
       not touched by this step.

    Args:
        text: Raw subtitle file content.

    Returns:
        The cleaned text with CRLF line breaks collapsed into spaces.
    """
    substitutions = (
        (r'\n\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\s+', ''),
        (r'^[\d]|\n\d+', ''),
        (r'<i>|</i>', ''),
        ('\r\n\r\r|\r\n', ' '),
    )
    flattened = text
    for pattern, replacement in substitutions:
        flattened = re.sub(pattern, replacement, flattened)
    return flattened.strip()

In [None]:
clean = clean_tokens_eachline(df_limited.file_content[2])
print(clean)

In [None]:
clean = clean_tokens_oneline(df_limited.file_content[2])
print(clean)

In [None]:
# Clean every decoded subtitle file into single-line text, stored as 'Sub_Titles'.
df_limited['Sub_Titles'] = df_limited['file_content'].apply(clean_tokens_oneline)

In [None]:
# Remove the literal ".eng.1cd" marker from each file name. A plain-text replace
# is used because, as a regex, each "." would match any character and could
# delete unrelated text such as "xengy1cd".
df_limited['name'] = df_limited['name'].str.replace('.eng.1cd', '', regex=False)

In [None]:
df_limited.head(3)

In [None]:
# Keep only the columns used downstream: the id ('num'), the file name, and the cleaned subtitle text.
df_cleaned = df_limited[['num','name', 'Sub_Titles']]
df_cleaned

In [None]:
#pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

# Load pre-trained model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def semantic_chunkings(document, similarity_threshold=0.9):
    """Group consecutive sentences of a document into chunks by embedding similarity.

    The document is split on '.' and every piece is embedded with the
    module-level ``model``. Walking the pieces in order, a piece joins the
    current chunk when its cosine similarity to the piece immediately before
    it is at least ``similarity_threshold``; otherwise it opens a new chunk.
    Pieces inside a chunk are joined back together with '.'.

    Args:
        document: Text to split into chunks.
        similarity_threshold: Minimum cosine similarity between neighbouring
            pieces for them to share a chunk. Defaults to 0.9.

    Returns:
        A tuple ``(chunks, embeddings)``: the list of chunk strings and the
        result of ``model.encode(chunks)``.
    """
    pieces = document.split('.')
    piece_vectors = model.encode(pieces)

    # Each inner list collects the pieces that belong to one chunk.
    grouped = [[pieces[0]]]
    for idx, piece in enumerate(pieces[1:], start=1):
        similarity = util.pytorch_cos_sim(piece_vectors[idx], piece_vectors[idx - 1])
        if similarity >= similarity_threshold:
            grouped[-1].append(piece)
        else:
            # Similarity fell below the threshold: start a new chunk here.
            grouped.append([piece])

    chunks = ['.'.join(group) for group in grouped]
    return chunks, model.encode(chunks)

In [None]:
ans = semantic_chunkings(df_cleaned['Sub_Titles'][0])

In [None]:
print(ans[1])

#### Working on Sample data

In [None]:
from joblib import Parallel, delayed
# Run semantic_chunkings on every cleaned subtitle document through joblib.
# n_jobs=-1 asks joblib to use all available CPUs; the result is a list with one
# (chunks, embeddings) tuple per document.
# NOTE(review): semantic_chunkings reads the global `model`, so each worker presumably
# needs its own copy of it - watch memory use on large runs.
chunks_embeddings = Parallel(n_jobs=-1)(delayed(semantic_chunkings)(item) for item in df_cleaned['Sub_Titles'])

In [None]:
# Each (chunks, embeddings) tuple becomes one row with the two named columns.
df_chunked = pd.DataFrame(chunks_embeddings, columns=['chunks','embeddings'])

In [None]:
# Attach the file name and id to each row of chunks.
# NOTE(review): this assignment matches rows by index label, so it assumes df_chunked and
# df_cleaned share the same index (both presumably 0..n-1) - confirm if rows are ever filtered.
df_chunked[['name', 'num']] = df_cleaned[['name', 'num']]

In [None]:
df_chunked

In [None]:
#saving to json file
df_chunked.to_json("database.json") #saving data to json file to restart the kernel and save RAM

In [None]:
# restarting the kernel
# interacting with each part of the json file

import json
import pandas as pd

json_file_path = "database.json" #database_p2.json
# Read the file written earlier by df_chunked.to_json(...) back into plain Python objects.
with open(json_file_path, 'r') as f:
  data = json.load(f)

# NOTE(review): the JSON is presumably column-oriented with string row labels ("0", "1", ...),
# which would make the rebuilt frame's index strings rather than integers - confirm before
# relying on integer labels.
df = pd.DataFrame(data)

In [None]:
# Cast 'num' to str: later cells concatenate it with "-<chunk index>" to build ids
# and also pass it directly as an id to the collection.
df['num'] = df['num'].apply(lambda x: str(x))
df

In [None]:
import chromadb
# Open (or create) the on-disk Chroma store and the two collections filled by later cells.
# NOTE(review): the leading "/" makes this an absolute path at the filesystem root - confirm
# that is intended rather than a path relative to the notebook.
client = chromadb.PersistentClient(path="/search_engine_db")
# Chunk-level collection: receives the subtitle chunks and their embeddings; cosine space for the HNSW index.
collection = client.get_or_create_collection(name="search_engine", metadata={"hnsw:space": "cosine"})
# File-name collection: receives one file name per 'num' id.
collection_2 = client.get_or_create_collection(name="search_engine_FileName", metadata={"hnsw:space": "cosine"})

In [None]:
def indexer(item, frame=None):
    """Build the unique per-chunk ids for the subtitle file whose 'num' is ``item``.

    Collection ids must be unique, so each chunk's position is appended to
    the file id with a hyphen: "<num>-0", "<num>-1", ...

    Args:
        item: The 'num' value (a string) identifying the row.
        frame: DataFrame with 'num' and 'chunks' columns to look the row up
            in. Defaults to the notebook-level ``df``.

    Returns:
        A list with one id string per chunk of the first row whose 'num'
        equals ``item``.

    Raises:
        IndexError: If no row has the given 'num'.
    """
    if frame is None:
        frame = df  # fall back to the notebook-level DataFrame

    # Select the row by value and take the first hit positionally. This avoids
    # casting the index *label* to an int and reusing it as a row *position*,
    # which breaks whenever labels are not positional integers.
    matching_chunks = frame.loc[frame['num'] == item, 'chunks']
    chunk_list = matching_chunks.iloc[0]
    return [item + "-" + str(j) for j in range(len(chunk_list))]
# Index the embeddings: every chunk of every row gets the id "<num>-<chunk position>".
# Built in one pass over the rows rather than re-scanning the whole frame once per row.
# Each row uses its own chunk list, which matches the per-'num' lookup as long as 'num'
# values are unique (they must be, since they are used as ids).
df['num_list'] = pd.Series(
    [
        [num + "-" + str(j) for j in range(len(chunks))]
        for num, chunks in zip(df['num'], df['chunks'])
    ],
    index=df.index,
    dtype=object,
)

In [None]:
def add_func_v1():
    """Add every file name in ``df`` to ``collection_2``, keyed by its 'num' id.

    One ``add`` call is made per row. The embedding is a fixed placeholder
    vector: this collection is only used to retrieve file names by id, so the
    vector's values carry no meaning.
    """
    file_names = df['name']
    file_ids = df['num']
    for row in range(len(df)):
        collection_2.add(
            documents=[file_names.iloc[row]],  # the file name for this row
            embeddings=[[1, 2, 34, 45]],       # placeholder vector, not used on retrieval
            ids=[file_ids.iloc[row]],          # unique 'num' id
        )

In [None]:
def add_func_v2():
    """Add every row's chunks and chunk embeddings from ``df`` to ``collection``.

    One ``add`` call is made per row, passing that row's list of chunks, the
    matching list of embeddings, and the per-chunk ids from 'num_list'.
    """
    chunk_lists = df['chunks']
    embedding_lists = df['embeddings']
    id_lists = df['num_list']
    for row in range(len(df)):
        collection.add(
            documents=chunk_lists.iloc[row],       # all chunks of this file
            embeddings=embedding_lists.iloc[row],  # one embedding per chunk
            ids=id_lists.iloc[row],                # one "<num>-<position>" id per chunk
        )

In [None]:
%time add_func_v1()

In [None]:
%time add_func_v2()