In [2]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
import sqlite3
import pandas as pd

# Verify if the database file exists
db_file_path = "/content/drive/MyDrive/eng_subtitles_database.db"
if not os.path.exists(db_file_path):
    print(f"Error: Database file '{db_file_path}' not found.")
else:
    # Connect to the database
    try:
        conn = sqlite3.connect(db_file_path)
        query = 'SELECT * FROM zipfiles'
        df = pd.read_sql_query(query, conn)
        conn.close()

        df.to_csv("/content/drive/MyDrive/zipfiles.csv", index=False)
        print("Data saved to CSV successfully.")
    except Exception as e:
        print(f"Error: {e}")

# Verify df is loaded or created successfully
print(df.head())


Data saved to CSV successfully.
       num                                               name  \
0  9180533                         the.message.(1976).eng.1cd   
1  9180583  here.comes.the.grump.s01.e09.joltin.jack.in.bo...   
2  9180592    yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd   
3  9180594    yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd   
4  9180600                              broker.(2022).eng.1cd   

                                             content  
0  b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...  
1  b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...  
2  b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...  
3  b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...  
4  b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...  


In [4]:
df.head()

Unnamed: 0,num,name,content
0,9180533,the.message.(1976).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x1c\xa9\x...
1,9180583,here.comes.the.grump.s01.e09.joltin.jack.in.bo...,b'PK\x03\x04\x14\x00\x00\x00\x08\x00\x17\xb9\x...
2,9180592,yumis.cells.s02.e13.episode.2.13.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00L\xb9\x99V...
3,9180594,yumis.cells.s02.e14.episode.2.14.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x00U\xa9\x99V...
4,9180600,broker.(2022).eng.1cd,b'PK\x03\x04\x14\x00\x00\x00\x08\x001\xa9\x99V...


In [14]:
df.shape

(82498, 3)

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82498 entries, 0 to 82497
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   num      82498 non-null  int64 
 1   name     82498 non-null  object
 2   content  82498 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.9+ MB


In [6]:
from tqdm import tqdm

import zipfile
import io

# Define a function to decompress and decode the data
def decomp_decode(data):
    with zipfile.ZipFile(io.BytesIO(data)) as zip_file:
        # Extract the first file in the ZIP archive
        file_list = zip_file.namelist()
        first_file = file_list[0]
        decompressed_data = zip_file.read(first_file)
    return decompressed_data.decode('latin-1')

# Use tqdm for progress visualization
tqdm.pandas()

# Apply the function to the 'content' column of the DataFrame
df['content'] = tqdm(df['content'].apply(decomp_decode))

# Display the first few entries of the 'content' column
df['content'].head()


100%|██████████| 82498/82498 [00:00<00:00, 1103994.52it/s]


0    1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an...
1    1\r\n00:00:29,359 --> 00:00:32,048\r\nAh! Ther...
2    1\r\n00:00:53,200 --> 00:00:56,030\r\n<i>Yumi'...
3    1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch an...
4    ï»¿1\r\n00:00:06,000 --> 00:00:12,074\r\nWatch...
Name: content, dtype: object

In [7]:
import re
from tqdm import tqdm

def clean_data(data):
    # Remove timestamps
    data = re.sub("\d{2}:\d{2}:\d{2},\d{3}\s-->\s\d{2}:\d{2}:\d{2},\d{3}", " ", data)
    # Remove index numbers of dialogues
    data = re.sub(r'\n?\d+\r', "", data)
    # Remove escape sequences like \n \r
    data = re.sub('\r|\n', "", data)
    # Remove <i> and </i>
    data = re.sub('<i>|</i>', "", data)
    # Remove links
    data = re.sub("(?:www\.)osdb\.link\/[\w\d]+|www\.OpenSubtitles\.org|osdb\.link\/ext|api\.OpenSubtitles\.org|OpenSubtitles\.com", " ", data)
    # Convert to lower case
    data = data.lower()
    return data

# Use tqdm for progress visualization
tqdm.pandas()

# Apply the clean_data function to the 'content' column of the DataFrame
df['content'] = tqdm(df['content'].progress_apply(clean_data))

# Convert 'num' column to string format
df['num'] = df['num'].astype(str)


100%|██████████| 82498/82498 [06:29<00:00, 212.04it/s]
100%|██████████| 82498/82498 [00:00<00:00, 1096584.30it/s]


In [8]:
!pip install sentence-transformers



In [None]:
import pandas as pd
from joblib import Parallel, delayed
from sentence_transformers import SentenceTransformer
import numpy as np
import json

# Initialize the SentenceTransformer model
model_name = 'paraphrase-MiniLM-L3-v2'
model = SentenceTransformer(model_name, device='cuda')

# Define the semantic_chunking function
def semantic_chunking(document, similarity_threshold=0.9):
    sentences = document.split('.')
    chunks = []
    current_chunk = sentences[0]
    sentence_embeddings = model.encode(sentences)
    for i in range(1, len(sentences)):
        similarity_score = np.dot(sentence_embeddings[i], sentence_embeddings[i - 1]) / (
                    np.linalg.norm(sentence_embeddings[i]) * np.linalg.norm(sentence_embeddings[i - 1]))
        if similarity_score >= similarity_threshold:
            current_chunk += '.' + sentences[i]
        else:
            chunks.append(current_chunk)
            current_chunk = sentences[i]
    chunks.append(current_chunk)
    return chunks

# Load the data into a DataFrame (assuming 'df' contains your data)

# Split the data into two parts
temp_1 = df[:30000]
temp_2 = df[30000:]

# Process each part using joblib for parallel processing
def process_data(data):
    data['chunks'] = Parallel(n_jobs=-1)(delayed(semantic_chunking)(item) for item in data['content'].values)
    return data

# Process the first part and save to JSON
temp_1_processed = process_data(temp_1)
temp_1_processed.to_json("db1.json", orient='records', lines=True)

# Process the second part and save to JSON
temp_2_processed = process_data(temp_2)
temp_2_processed.to_json("db2.json", orient='records', lines=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  pid = os.fork()


In [None]:
import pandas as pd
from joblib import Parallel, delayed
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Initialize the SentenceTransformer model
model_name = 'paraphrase-MiniLM-L3-v2'
model = SentenceTransformer(model_name, device='cuda')

# Define the semantic_chunking function
def semantic_chunking(document, similarity_threshold=0.9):
    sentences = document.split('.')
    chunks = []
    current_chunk = sentences[0]
    sentence_embeddings = model.encode(sentences)
    for i in range(1, len(sentences)):
        similarity_score = np.dot(sentence_embeddings[i], sentence_embeddings[i - 1]) / (
                    np.linalg.norm(sentence_embeddings[i]) * np.linalg.norm(sentence_embeddings[i - 1]))
        if similarity_score >= similarity_threshold:
            current_chunk += '.' + sentences[i]
        else:
            chunks.append(current_chunk)
            current_chunk = sentences[i]
    chunks.append(current_chunk)
    return chunks

# Load the data into a DataFrame (assuming 'df' contains your data)
# Replace this with your data loading code
df = pd.read_csv("/content/drive/MyDrive/zipfiles.csv")

# Batch processing parameters
batch_size = 500  # Adjust batch size as needed

# Function to process data in batches
def process_data_batch(data_batch):
    data_batch['chunks'] = data_batch['content'].progress_apply(semantic_chunking)
    return data_batch

# Process data in batches
processed_batches = []
for start_idx in tqdm(range(0, len(df), batch_size)):
    end_idx = min(start_idx + batch_size, len(df))
    temp_processed = process_data_batch(df[start_idx:end_idx].copy())
    processed_batches.append(temp_processed)

# Concatenate processed batches and save to JSON
processed_df = pd.concat(processed_batches)
processed_df.to_json("processed_data.json", orient='records', lines=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def create_index(item):
    index = []
    temp_index = df[df['num'] == item].index[0]  # Get the index of the item
    for j in range(len(df['chunks'].iloc[temp_index])):
        index.append(item + "-" + str(j))  # Create a unique ID for each chunk
    return index

df['num_list'] = df['num'].apply(create_index)  # Apply the create_index function to each item


In [None]:
from sentence_transformers import SentenceTransformer
from joblib import Parallel, delayed

model_name = 'paraphrase-MiniLM-L3-v2'  # Change to the desired model
model = SentenceTransformer(model_name, device='cuda')  # Initialize the SentenceTransformer model

# Define the function to generate embeddings
def embedding_gen(data):
    return model.encode(data).tolist()  # Convert embeddings to a list for compatibility

# Apply the embedding_gen function to each chunk in parallel using joblib
df['embeddings'] = Parallel(n_jobs=-1)(delayed(embedding_gen)(item) for item in df['chunks'].values)


In [None]:
import chromadb

# Set up ChromaDB client
client = chromadb.PersistentClient(path="E://search_engine_db")
collection = client.get_or_create_collection(name="search_engine", metadata={"hnsw:space": "cosine"})
collection_2 = client.get_or_create_collection(name="search_engine_FileName", metadata={"hnsw:space": "cosine"})

# Function to add filenames to collection_2
def add_filenames():
    for i in range(df.shape[0]):
        collection_2.add(
            documents=[df['name'].iloc[i]],  # Add each filename
            embeddings=[[1, 2, 34, 45]],  # Random data (not used when retrieving filename)
            ids=[df['num'].iloc[i]]  # Unique 'num' id
        )

# Function to add chunks, embeddings, and unique identifiers to collection
def add_chunks_embeddings():
    for i in range(df.shape[0]):
        collection.add(
            documents=df['chunks'].iloc[i],  # Add each chunk
            embeddings=df['embeddings'].iloc[i],  # Add the corresponding chunk embedding
            ids=df['num_list'].iloc[i]  # Unique 'num' id
        )

# Call the functions to add data to ChromaDB
add_filenames()
add_chunks_embeddings()
