In [34]:
import numpy as np
import pandas as pd
import json

In [35]:
#Read all the book_data text files from ChaptersCleaned folder
import os
book_data_text_files = {}
for root, dirs, files in os.walk('ChaptersCleaned'):
    for file in files:
        if file.endswith('.txt'):
            with open(os.path.join(root, file), 'r', encoding='utf-8') as f:
                print("Reading file:", file)
                book_data_text_files[file] = f.read()

Reading file: everything_about_dogs_administering_medicine.txt
Reading file: everything_about_dogs_breeding.txt
Reading file: everything_about_dogs_diseases.txt
Reading file: everything_about_dogs_distemper.txt
Reading file: everything_about_dogs_dog_papers.txt
Reading file: everything_about_dogs_drugs.txt
Reading file: everything_about_dogs_feeding.txt
Reading file: everything_about_dogs_hydrophobia.txt
Reading file: everything_about_dogs_medicle_terms.txt
Reading file: everything_about_dogs_start_note.txt


In [36]:
#load sentence transformer model called book_data_embedding_model
from sentence_transformers import SentenceTransformer
book_data_embedding_model = SentenceTransformer('diseases_embedding_model')

In [37]:
len(book_data_text_files['everything_about_dogs_dog_papers.txt'])

1422

In [None]:
#Split each text each of chunk size of 2000 charachters with overlap of 1000 characters
def split_text(text, chunk_size=2000, overlap=1000):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]
        if len(chunk) < chunk_size:
            #Append the last chunk even if it's smaller than chunk_size
            if chunk:
                chunks.append(chunk)
            break
        chunks.append(chunk)
        start += chunk_size - overlap
    return chunks

book_data_text_master = []
for chapter_name, text in book_data_text_files.items():
    print("Processing chapter_name:", chapter_name)
    chunks = split_text(text)
    chapter_name_dict = {}
    chapter_name_dict['chapter_name'] = chapter_name
    chapter_name_dict['text_chunks'] = chunks
    book_data_text_master.append(chapter_name_dict)
    print("Processing chapter_name:", chapter_name, "completed with", len(chunks), "chunks.")

Processing disease: everything_about_dogs_administering_medicine.txt
Processing disease: everything_about_dogs_administering_medicine.txt completed with 12 chunks.
Processing disease: everything_about_dogs_breeding.txt
Processing disease: everything_about_dogs_breeding.txt completed with 47 chunks.
Processing disease: everything_about_dogs_diseases.txt
Processing disease: everything_about_dogs_diseases.txt completed with 519 chunks.
Processing disease: everything_about_dogs_distemper.txt
Processing disease: everything_about_dogs_distemper.txt completed with 87 chunks.
Processing disease: everything_about_dogs_dog_papers.txt
Processing disease: everything_about_dogs_dog_papers.txt completed with 1 chunks.
Processing disease: everything_about_dogs_drugs.txt
Processing disease: everything_about_dogs_drugs.txt completed with 14 chunks.
Processing disease: everything_about_dogs_feeding.txt
Processing disease: everything_about_dogs_feeding.txt completed with 25 chunks.
Processing disease: ev

In [None]:
#Now iterate through each chapter_name, iterate through each text chunk, convert the chapter_name name along with all values to string and make embeddings
for chapter_name in book_data_text_master:
    chapter_name_name = chapter_name['chapter_name']
    text_chunks = chapter_name['text_chunks']
    print("Creating embeddings for chapter_name:", chapter_name_name)
    embeddings = []
    for i, chunk in enumerate(text_chunks):

        # Generate the embedding
        embedding = book_data_embedding_model.encode(chunk)
        #Convert the embedding to a numpy array
        embedding = np.array(embedding)
        # Store the embedding in the dictionary
        embeddings.append(embedding.tolist())

    # Store the embeddings in the chapter_name dictionary
    chapter_name['embeddings'] = embeddings
    

Creating embeddings for disease: everything_about_dogs_administering_medicine.txt
Creating embeddings for disease: everything_about_dogs_breeding.txt
Creating embeddings for disease: everything_about_dogs_diseases.txt
Creating embeddings for disease: everything_about_dogs_distemper.txt
Creating embeddings for disease: everything_about_dogs_dog_papers.txt
Creating embeddings for disease: everything_about_dogs_drugs.txt
Creating embeddings for disease: everything_about_dogs_feeding.txt
Creating embeddings for disease: everything_about_dogs_hydrophobia.txt
Creating embeddings for disease: everything_about_dogs_medicle_terms.txt
Creating embeddings for disease: everything_about_dogs_start_note.txt


In [40]:
#Now convert the book_data_text_master to a DataFrame
book_data_df = pd.DataFrame(book_data_text_master)
#Save as parquet
book_data_df.to_parquet('everything_about_dogs_with_embeddings.parquet')

### Reformat for Json

In [15]:
#Load the parquet file
import pandas as pd
import json
book_data_df = pd.read_parquet('Final Data/everything_about_dogs_with_embeddings.parquet')

In [None]:
#Convert the dataframe into dictionay where chapter name is key.
#Value is a list of dictionaries with text_chunk and embedding
book_data_dict = {}
for index, row in book_data_df.iterrows():
    chapter_name = row['chapter_name']
    if chapter_name not in book_data_dict:
        book_data_dict[chapter_name] = {}
    book_data_dict[chapter_name]['text_chunks'] = row['text_chunks']
    embedding_list_str = [] #Store embeddings as a list of strings
    for embedding in row['embeddings']:
        str_embedding = embedding.tolist()  # Convert numpy array to list
        embedding_list_str.append(str_embedding)

    book_data_dict[chapter_name]['embeddings'] = embedding_list_str  # Convert numpy arrays to lists for JSON serialization

#Save the dictionary as a JSON file
with open('Final Data/everything_about_dogs_with_embeddings_dict.json', 'w', encoding='utf-8') as f:
    json.dump(book_data_dict, f, ensure_ascii=False, indent=4)

ValueError: setting an array element with a sequence

In [23]:
row['embeddings'].tolist()  # Convert numpy arrays to lists for JSON serialization

[array([-1.65494140e-02, -1.18674850e-02, -1.96300037e-02, -2.74386518e-02,
        -1.34133026e-01, -2.51686014e-02,  2.50251554e-02,  4.94536646e-02,
         8.46491605e-02,  2.37672441e-02,  6.77928678e-04,  3.94270271e-02,
         2.35746019e-02,  1.19916387e-01, -1.94517300e-02, -2.13400321e-03,
         1.26112904e-02,  1.00369081e-02,  1.38083603e-02,  4.00443301e-02,
         3.35696116e-02,  2.33946647e-03,  2.93699969e-02, -1.13622202e-02,
        -8.99620205e-02,  4.56029400e-02,  2.34223045e-02, -1.20147012e-01,
         1.69045329e-02,  4.61899489e-02,  3.16335261e-02, -1.74248014e-02,
        -1.89675726e-02, -1.68671701e-02, -2.81163882e-02,  7.82676786e-03,
         3.14998813e-02,  9.18524042e-02,  6.98610470e-02,  1.01173297e-01,
         4.88952473e-02,  1.68637861e-03, -1.18284803e-02, -4.39230390e-02,
         3.23418416e-02, -7.11906143e-03, -6.12975582e-02, -3.05939373e-02,
         4.06403281e-02,  1.48515180e-02, -5.88358566e-02, -6.13716692e-02,
         5.6

In [7]:
#Drop the embedding column and save the DataFrame in parquet format
book_data_df.drop(columns=['embeddings'], inplace=True)
#book_data_df.to_parquet('Final Data/everything_about_dogs_without_embeddings.parquet')

NameError: name 'json' is not defined

In [8]:
#Store in json format. With entries comma seperated with root key as data
book_data_df.to_json('Final Data/everything_about_dogs_without_embeddings.json', orient='records')

In [6]:
book_data_df.head()

Unnamed: 0,chapter_name,text_chunks,embeddings
0,everything_about_dogs_administering_medicine.txt,[\n\nAdministering Medicine. — It is foolis...,"[[-0.01654941402375698, -0.011867485009133816,..."
1,everything_about_dogs_breeding.txt,[\n\nBREEDING AND RAISING OF PUPPIES \n\n\...,"[[-0.07183301448822021, -0.11321435868740082, ..."
2,everything_about_dogs_diseases.txt,[\nAsthma. — This is a common disease and...,"[[0.023961307480931282, -0.01763729564845562, ..."
3,everything_about_dogs_distemper.txt,[\n\nDistemper. — This disease trouble bree...,"[[-0.05673234164714813, -0.01892991177737713, ..."
4,everything_about_dogs_dog_papers.txt,[DOG PAPERS YOU NEED \n\n\nThere are seve...,"[[0.014960107393562794, -0.03968214988708496, ..."
