In [34]:
import numpy as np
import pandas as pd
import json

In [35]:
# Read every .txt file under the ChaptersCleaned folder into a dict that maps
# the bare file name to the file's full text.
import os

book_data_text_files = {}
for root, dirs, files in os.walk('ChaptersCleaned'):
    # os.walk yields directory entries in arbitrary (filesystem-dependent) order.
    # Sorting `dirs` in place fixes the order in which sub-folders are visited,
    # and sorting `files` fixes the order inside each folder, so the dict order
    # (and every row order derived from it) is reproducible across machines.
    dirs.sort()
    for file in sorted(files):
        if file.endswith('.txt'):
            with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                print("Reading file:", file)
                # NOTE: keyed by bare file name, so two files with the same name
                # in different sub-folders would overwrite each other.
                book_data_text_files[file] = f.read()

Reading file: everything_about_dogs_administering_medicine.txt
Reading file: everything_about_dogs_breeding.txt
Reading file: everything_about_dogs_diseases.txt
Reading file: everything_about_dogs_distemper.txt
Reading file: everything_about_dogs_dog_papers.txt
Reading file: everything_about_dogs_drugs.txt
Reading file: everything_about_dogs_feeding.txt
Reading file: everything_about_dogs_hydrophobia.txt
Reading file: everything_about_dogs_medicle_terms.txt
Reading file: everything_about_dogs_start_note.txt


In [36]:
# Load the sentence-transformer model used to embed the book text and bind it
# to `book_data_embedding_model`.
# NOTE(review): 'diseases_embedding_model' is presumably a locally saved model
# directory (relative to the working directory) -- confirm.
from sentence_transformers import SentenceTransformer
book_data_embedding_model = SentenceTransformer('diseases_embedding_model')

In [37]:
# Sanity check: number of characters read for one of the chapter files.
len(book_data_text_files['everything_about_dogs_dog_papers.txt'])

1422

In [38]:
# Split a text into chunks of 2000 characters, with consecutive chunks overlapping by 1000 characters (defaults).
def split_text(text, chunk_size=2000, overlap=1000):
    """Split ``text`` into fixed-size character chunks that overlap.

    Each chunk starts ``chunk_size - overlap`` characters after the previous
    one. Chunks are emitted until one comes out shorter than ``chunk_size``;
    that shorter tail is kept as the final chunk.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk. Must be > 0.
        overlap: Number of characters by which each chunk's start precedes
            the end of the previous chunk. Must be smaller than
            ``chunk_size``.

    Returns:
        A list of chunk strings in document order; ``[]`` for empty text.

    Raises:
        ValueError: If ``chunk_size`` is not positive or
            ``overlap >= chunk_size``. Without this check the start offset
            would never move forward and the loop would not terminate.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    step = chunk_size - overlap
    chunks = []
    start = 0
    while start < len(text):
        chunk = text[start:start + chunk_size]
        # start < len(text) guarantees the slice is non-empty, so it is always kept.
        chunks.append(chunk)
        if len(chunk) < chunk_size:
            # Reached the (shorter) tail of the text: nothing left to split.
            break
        start += step
    return chunks

# Chunk every source file and collect one record per file:
# {'disease': <file name>, 'text_chunks': [<chunk>, ...]}.
book_data_text_master = []
for source_name, source_text in book_data_text_files.items():
    print("Processing disease:", source_name)
    pieces = split_text(source_text)
    book_data_text_master.append({'disease': source_name, 'text_chunks': pieces})
    print("Processing disease:", source_name, "completed with", len(pieces), "chunks.")

Processing disease: everything_about_dogs_administering_medicine.txt
Processing disease: everything_about_dogs_administering_medicine.txt completed with 12 chunks.
Processing disease: everything_about_dogs_breeding.txt
Processing disease: everything_about_dogs_breeding.txt completed with 47 chunks.
Processing disease: everything_about_dogs_diseases.txt
Processing disease: everything_about_dogs_diseases.txt completed with 519 chunks.
Processing disease: everything_about_dogs_distemper.txt
Processing disease: everything_about_dogs_distemper.txt completed with 87 chunks.
Processing disease: everything_about_dogs_dog_papers.txt
Processing disease: everything_about_dogs_dog_papers.txt completed with 1 chunks.
Processing disease: everything_about_dogs_drugs.txt
Processing disease: everything_about_dogs_drugs.txt completed with 14 chunks.
Processing disease: everything_about_dogs_feeding.txt
Processing disease: everything_about_dogs_feeding.txt completed with 25 chunks.
Processing disease: ev

In [39]:
# For every record in book_data_text_master, embed all of its text chunks and
# store the vectors (as plain Python lists) under the record's 'embeddings' key.
for disease in book_data_text_master:
    disease_name = disease['disease']
    text_chunks = disease['text_chunks']
    print("Creating embeddings for disease:", disease_name)

    if text_chunks:
        # encode() accepts a list of texts and batches them internally, which
        # avoids one model call per chunk.
        # NOTE(review): batched and single-text encoding may differ by
        # floating-point noise -- confirm that is acceptable.
        chunk_vectors = book_data_embedding_model.encode(text_chunks)
        # One row per chunk; .tolist() makes the vectors serialisable.
        embeddings = np.asarray(chunk_vectors).tolist()
    else:
        # Nothing to embed for an empty file.
        embeddings = []

    # Store the embeddings in the record, aligned index-for-index with 'text_chunks'.
    disease['embeddings'] = embeddings
    

Creating embeddings for disease: everything_about_dogs_administering_medicine.txt
Creating embeddings for disease: everything_about_dogs_breeding.txt
Creating embeddings for disease: everything_about_dogs_diseases.txt
Creating embeddings for disease: everything_about_dogs_distemper.txt
Creating embeddings for disease: everything_about_dogs_dog_papers.txt
Creating embeddings for disease: everything_about_dogs_drugs.txt
Creating embeddings for disease: everything_about_dogs_feeding.txt
Creating embeddings for disease: everything_about_dogs_hydrophobia.txt
Creating embeddings for disease: everything_about_dogs_medicle_terms.txt
Creating embeddings for disease: everything_about_dogs_start_note.txt


In [40]:
# Convert book_data_text_master (a list of per-file dicts) to a DataFrame,
# one row per dict.
book_data_df = pd.DataFrame(book_data_text_master)
# Save as Parquet; the relative path resolves against the current working directory.
book_data_df.to_parquet('everything_about_dogs_with_embeddings.parquet')

In [41]:
# Also save book_data_df as a JSON file. With orient='records' and lines=True
# the output is line-delimited JSON: one JSON object per row, one row per line.
book_data_df.to_json('everything_about_dogs_with_embeddings.json', orient='records', lines=True)

In [6]:
# Load the Parquet file back into book_data_df.
# NOTE(review): this reads from 'Final Data/', while the save cell writes
# 'everything_about_dogs_with_embeddings.parquet' to the working directory --
# presumably the file was moved by hand; confirm, or align the two paths so
# the notebook survives Restart & Run All.
# NOTE(review): pandas is imported again here, presumably so this cell can run
# on a fresh kernel without the earlier cells.
import pandas as pd
book_data_df = pd.read_parquet('Final Data/everything_about_dogs_with_embeddings.parquet')

In [None]:
# Drop the embeddings column and save the DataFrame in Parquet format.
# The frame is rebound instead of mutated with inplace=True, and
# errors='ignore' skips the drop when the column is already gone, so this
# cell can be re-run without raising KeyError.
book_data_df = book_data_df.drop(columns=['embeddings'], errors='ignore')
book_data_df.to_parquet('Final Data/everything_about_dogs_without_embeddings.parquet')

In [13]:
# Store in JSON format. orient='records' writes a single JSON array containing
# one comma-separated object per row.
# NOTE(review): this call does not add a root "data" key; if consumers expect
# {"data": [...]}, the records must be wrapped explicitly.
book_data_df.to_json('Final Data/everything_about_dogs_without_embeddings.json', orient='records')

In [10]:
# Preview the first rows of the DataFrame.
book_data_df.head()

Unnamed: 0,disease,text_chunks
0,everything_about_dogs_administering_medicine.txt,[\n\nAdministering Medicine. — It is foolis...
1,everything_about_dogs_breeding.txt,[\n\nBREEDING AND RAISING OF PUPPIES \n\n\...
2,everything_about_dogs_diseases.txt,[\nAsthma. — This is a common disease and...
3,everything_about_dogs_distemper.txt,[\n\nDistemper. — This disease trouble bree...
4,everything_about_dogs_dog_papers.txt,[DOG PAPERS YOU NEED \n\n\nThere are seve...
