### Language Model Setup

In [1]:
import openai
import os
from dotenv import load_dotenv 

load_dotenv()
openai.api_key =  os.getenv('OPENAI_API_KEY')

In [2]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(temperature=0.0, model_name='gpt-3.5-turbo')

### Tokenizer Setup

In [3]:
import tiktoken 
tiktoken.encoding_for_model('gpt-3.5-turbo')

<Encoding 'cl100k_base'>

In [4]:
tokenizer = tiktoken.get_encoding('cl100k_base')

def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

In [7]:
from langchain.embeddings.openai import OpenAIEmbeddings

model_name = 'text-embedding-ada-002'

embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=os.environ['OPENAI_API_KEY']
)

In [8]:
text_examples = ["I am a text example", 
                 "I am another text example"]

In [9]:
result = embed.embed_documents(text_examples)

In [10]:
len(result[0])

1536

### Importing Raw Datasets

In [None]:
kegg_medicus_drug = 'raw_datasets/kegg_medicus_drug_en.csv'
kegg_medicus_disease = 'raw_datasets/kegg_medicus_disease_en.csv'
kegg_medicus_dgroup = 'raw_datasets/kegg_medicus_dgroup_en.csv'
kegg_medicus_environ = 'raw_datasets/kegg_medicus_environ_en.csv'
kegg_medicus_network = 'raw_datasets/kegg_medicus_network.csv'
kegg_medicus_variant = 'raw_datasets/kegg_medicus_variant.csv'

In [None]:
import pandas as pd

raw_drug_df = pd.read_csv(kegg_medicus_drug)
raw_disease_df = pd.read_csv(kegg_medicus_disease)
raw_dgroup_df = pd.read_csv(kegg_medicus_dgroup)
raw_environ_df = pd.read_csv(kegg_medicus_environ)
raw_network_df = pd.read_csv(kegg_medicus_network)
raw_variant_df = pd.read_csv(kegg_medicus_variant)

### Data Cleaning & Preprocessing

In [None]:
# empty variable for preprocessing function
row_to_dict = None

In [None]:
preprocessed_datasets = []

In [None]:
def convert_dataframe_to_list(df):
    result_list = df.apply(row_to_dict, axis=1).tolist()
    return result_list

#### Kegg Medicus Drug Data Preprocessing

In [None]:
raw_drug_df.head()

In [None]:
raw_drug_df.groupby('classification')['classification'].nunique()

In [None]:
raw_drug_df.isna().sum()

In [None]:
exclude = ['component', 'sequence', 'source', 'db_links', 'sequence type', 
           'dblinks_w_link', 'comment_w_link', 'target_w_link', 'disease', 'disease_w_link',
           'remark_w_link', 'image', 'raw_entry_id', 'class', 'original', 'repeat',
           'efficacy_w_link','source_w_link','metabolism_w_link','sequence_w_link',
            'interaction_w_link','component_w_link','class_w_link', 'interaction','kcf','atom', 'bond',
            'bracket', 'remark','metabolism','target']
clean_drug_df = raw_drug_df[[col for col in raw_drug_df.columns if col not in exclude]]


In [None]:
clean_drug_df.columns

In [None]:
clean_drug_df.head()

In [None]:
#1
def row_to_dict(row):
    entry_id = row['entry_id']
    name = row['name']
    efficacy = row['efficacy']
    comment = row['comment']
    formula = row['formula']
    exact_mass = row['exact_mass']
    mol_weight = row['mol_weight']
    classification = row['classification']
    text = f"name: {name}; formula: {formula}; efficacy: {efficacy}; comment: {comment}"
    output_dict = {
            "id": entry_id,
            "input": text,
            "metadata": {
                "name": name,
                "text": text,
                "classification": classification,
                "formula": formula,
                "exact mass": exact_mass,
                "mol_weight": mol_weight
            }
        }

    return output_dict

In [None]:
drugs_preprocess = convert_dataframe_to_list(clean_drug_df)
print(len(drugs_preprocess))
drugs_preprocess[0]

In [None]:
drugs_df = pd.DataFrame(drugs_preprocess)
preprocessed_datasets.append(drugs_df)
print(drugs_df.shape)
drugs_df.head()

#### Kegg Medicus Disease Data Preprocessing

In [None]:
raw_disease_df.head()

In [None]:
raw_disease_df.shape

In [None]:
raw_disease_df.isna().sum()

In [None]:
exclude = ['SUBGROUP', 'SUPERGRP', 'NETWORK', 'ENTRY_link',
           'SUBGROUP_link', 'SUPERGRP_link', 'DESCRIPTION_link',
           'NETWORK_link', 'GENE_link', 'PATHOGEN_link', 'ENV_FACTOR_link',
           'CARCINOGEN_link', 'DRUG_link', 'COMMENT_link','DBLINKS_link',
           'REFERENCE_link', 'DBLINKS', 'REFERENCE']
clean_disease_df = raw_disease_df[[col for col in raw_disease_df.columns if col not in exclude]]
clean_disease_df.head()

In [None]:
#2
def row_to_dict(row):
    entry_id = row['ENTRY']
    name = row['NAME']
    description = row['DESCRIPTION']
    comment = row['COMMENT']
    category = row['CATEGORY']
    gene = row['GENE']
    pathogen = row['PATHOGEN']
    env_factor = row['ENV_FACTOR']
    carcinogen = row['CARCINOGEN']
    drug = row['DRUG']
    text = f"name: {name}; category: {category}; description: {description}; drug: {drug}"

    output_dict = {
            "id": entry_id,
            "input": text,
            "metadata": {
                "name": name,
                "text": text,
                "gene": gene,
                "pathogen": pathogen,
                "env_factor": env_factor,
                "carcinogen": carcinogen,
                "drug": drug,
                "comment": comment,
            }
        }

    return output_dict

In [None]:
disease_preprocess = convert_dataframe_to_list(clean_disease_df)
print(len(disease_preprocess))
disease_preprocess[1]

In [None]:
disease_df = pd.DataFrame(disease_preprocess)
preprocessed_datasets.append(disease_df)
print(disease_df.shape)
disease_df.head()

#### Kegg Medicus D-Group Data Preprocessing

In [None]:
raw_dgroup_df.shape

In [None]:
raw_dgroup_df.head()

In [None]:
exclude = ['CLASSIFICATION','STEM','IMAGE','COMMENT','CLASS_link','ENTRY_link','REMARK_link','MEMBER_link']
clean_dgroup_df = raw_dgroup_df[[col for col in raw_dgroup_df.columns if col not in exclude]]

In [None]:
raw_dgroup_df.isna().sum()

In [None]:
clean_dgroup_df.head()

In [None]:
#3
def row_to_dict(row):
    entry_id = row['ENTRY']
    name = row['NAME']
    class_item = row['CLASS']
    remark = row['REMARK']
    member = row['MEMBER']
    text = f"name: {name}; member: {member}; class: {class_item};  remark: {remark}"

    output_dict = {
            "id": entry_id,
            "input": text,
            "metadata": {
                "text": text,
                "name": name,
                "class": class_item
            }
        }

    return output_dict

In [None]:
dgroup_preprocess = convert_dataframe_to_list(clean_dgroup_df)
print(len(dgroup_preprocess))
dgroup_preprocess[0]

In [None]:
dgroup_df = pd.DataFrame(dgroup_preprocess)
preprocessed_datasets.append(dgroup_df)
print(dgroup_df.shape)
dgroup_df.head()

#### Kegg Medicus Environ Data Preprocessing

In [None]:
raw_environ_df.head()

In [None]:
raw_environ_df.shape

In [None]:
exclude = ['Remark', 'Other DBs']
clean_environ_df = raw_environ_df[[col for col in raw_environ_df.columns if col not in exclude]]

In [None]:
raw_environ_df.isna().sum()

In [None]:
clean_environ_df.head()

In [None]:
#4
def row_to_dict(row):
    entry_id = row['E number']
    name = row['Name']
    category = row['Category']
    component = row['Component']
    source = row['Source']
    comment = row['Comment']
    text = f"name: {name}; category: {category}; component: {comment} comment: {comment}; source: {source}"

    output_dict = {
            "id": entry_id,
            "input": text,
            "metadata": {
                "name": name, 
                "text": text,
                "component": component,
                
            }
        }

    return output_dict

In [None]:
environ_preprocess = convert_dataframe_to_list(clean_environ_df)
print(len(environ_preprocess))
environ_preprocess[0]

In [None]:
environ_df = pd.DataFrame(environ_preprocess)
preprocessed_datasets.append(environ_df)
print(environ_df.shape)
environ_df.head()

#### Kegg Medicus Network Data Preprocessing

In [None]:
raw_network_df.head()

In [None]:
raw_network_df.shape 

In [None]:
exclude = ['ENTRY_link','VARIANT','METABOLITE','PERTURBANT','DEFINITION_link', 'EXPANDED_link', 'CLASS_link', 'DISEASE_link', 'GENE_link', 'VARIANT_link',
           'METABOLITE_link', 'PERTURBANT_link','REFERENCE_link']
clean_network_df = raw_network_df[[col for col in raw_network_df.columns if col not in exclude]]

In [None]:
clean_network_df.head()

In [None]:
raw_network_df.isna().sum()

In [None]:
clean_network_df.columns

In [None]:
#5
def row_to_dict(row):
    entry_id = row['ENTRY']
    name = row['NAME']
    definition = row['DEFINITION']
    expanded = row['EXPANDED']
    item_class = row['CLASS']
    item_type = row['TYPE']
    disease = row['DISEASE']
    gene = row['GENE']
    text = f"name: {name}; definition: {definition}; expanded: {expanded}; class: {item_class}, type: {item_type}"

    output_dict = {
            "id": entry_id,
            "input": text,
            "metadata": {
                "name": name,
                "text": text,
                "name": name,
                "class": item_class,
                "comment": item_type,
                "disease": disease,
                "gene":gene
            }
        }

    return output_dict

In [None]:
network_preprocess = convert_dataframe_to_list(clean_network_df)
print(len(network_preprocess))
network_preprocess[0]

In [None]:
network_df = pd.DataFrame(network_preprocess)
preprocessed_datasets.append(network_df)
print(network_df.shape)
network_df.head()

#### Kegg Medicus Variant Data Preprocessing

In [None]:
raw_variant_df.head()

In [None]:
raw_variant_df.shape 

In [None]:
raw_variant_df.isna().sum()

In [None]:
exclude = ['ENTRY_link', 'NETWORK_link', 'GENE_link','ORGANISM', 'VARIATION_link',
           'ELEMENT_link', 'REFERENCE_link']
clean_variant_df = raw_variant_df[[col for col in raw_variant_df.columns if col not in exclude]]

In [None]:
clean_variant_df.head()

In [None]:
clean_variant_df.columns

In [None]:
#6
def row_to_dict(row):
    entry_id = row['ENTRY']
    name = row['NAME']
    gene = row['GENE']
    variation = row['VARIATION']
    network = row['NETWORK']
    element = row['ELEMENT']
    reference = row['REFERENCE']
    text = f"name: {name}; gene: {gene}; variation: {variation}; network{network}; element{element}"

    output_dict = {
            "id": entry_id,
            "input": text,
            "metadata": {
                "name": name,
                "text": text,
                "network": network,
                "element": element,
                "reference": reference
            }
        }

    return output_dict

In [None]:
variant_preprocess = convert_dataframe_to_list(clean_variant_df)
print(len(variant_preprocess))
variant_preprocess[0]

In [None]:
variant_df = pd.DataFrame(variant_preprocess)
preprocessed_datasets.append(variant_df)
print(variant_df.shape)
variant_df.head()

### Data Preprocessing (Creating Embeddings)

In [None]:
index_data = []

In [None]:
def get_embedding(text):
    return embed.embed_documents(text)[0]

In [None]:
for df in preprocessed_datasets:
    print(f"indexing {df.shape}")
    df['values'] = df['input'].apply(get_embedding)

In [None]:
drugs_index = drugs_df.drop(['input'], axis=1)
drugs_dataset = pd.DataFrame.to_csv(drugs_index)
#with open('index_datasets/drugs.csv', 'w') as f: 
#    f.write(drugs_dataset)
index_data.append(drugs_index)
drugs_index.head()

In [None]:
disease_index = disease_df.drop(['input'], axis=1)
disease_dataset = pd.DataFrame.to_csv(disease_index)
#with open('index_datasets/disease.csv', 'w') as f: 
#    f.write(disease_dataset)
index_data.append(disease_index)
disease_index.head()

In [None]:
dgroup_index = dgroup_df.drop(['input'], axis=1)
dgroup_dataset = pd.DataFrame.to_csv(dgroup_index)
#with open('index_datasets/dgroup.csv', 'w') as f: 
#   f.write(dgroup_dataset)
index_data.append(dgroup_index)
dgroup_index.head()

In [None]:
environ_index = environ_df.drop(['input'], axis=1)
environ_dataset = pd.DataFrame.to_csv(environ_index)
#with open('index_datasets/environ.csv', 'w') as f: 
#    f.write(environ_dataset)
index_data.append(environ_index)
environ_index.head()

In [None]:
variant_index = variant_df.drop(['input'], axis=1)
variant_dataset = pd.DataFrame.to_csv(variant_index)
#with open('index_datasets/variant.csv', 'w') as f: 
#    f.write(variant_dataset)
index_data.append(variant_index)
variant_index.head()

In [None]:
network_index =  network_df.drop(['input'], axis=1)
network_dataset = pd.DataFrame.to_csv(network_index)
index_data.append
#with open('index_datasets/network.csv', 'w') as f: 
#    f.write(network_dataset)
index_data.append(network_index)
network_index.head()

### Data Storage

In [1]:
index_name = "kegg-medicus-database-index"

In [2]:
import pinecone

pinecone_api_key = os.getenv('PINECONE_API_KEY')

pinecone.init(
    api_key=os.environ['PINECONE_API_KEY'],
    environment='gcp-starter'
)

if index_name not in pinecone.list_indexes():
    # we create a new index
    pinecone.create_index(
        name=index_name,
        metric='dotproduct',
        dimension= len(result[0])  # 1536 dim of text-embedding-ada-002
    )

  from tqdm.autonotebook import tqdm


NameError: name 'os' is not defined

In [None]:
kegg_medicus_index = pinecone.GRPCIndex(index_name)
kegg_medicus_index.describe_index_stats()

In [None]:
for df in index_data:
    kegg_medicus_index.upsert_from_dataframe(df, batch_size=100)

In [None]:
kegg_medicus_index.describe_index_stats()

In [13]:
import joblib
import dill
import os
import pinecone
import cloudpickle 
from dotenv import load_dotenv
load_dotenv()

pinecone_api_key = os.getenv('PINECONE_API_KEY')

pinecone.init(
    api_key=os.environ['PINECONE_API_KEY'],
    environment='gcp-starter'
)

index_name = "kegg-medicus-database-index"
index_filename = "cached_index.joblib"

index_filename = "cached_vector_database.pkl"

if os.path.exists(index_filename):
    # Load cached vector database if it exists
    with open(index_filename, 'rb') as f:
        vector_data = cloudpickle.load(f)
else:
    # Create vector database (your original vector data fetching logic)
    vector_data = index = pinecone.Index(index_name)

    # Cache the vector database using cloudpickle
    with open(index_filename, 'wb') as f:
        cloudpickle.dump(vector_data, f)

TypeError: cannot pickle '_thread.RLock' object