In [1]:
import pandas as pd
import os
import faiss
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
from langchain.embeddings.spacy_embeddings import SpacyEmbeddings
from langchain.embeddings import AlephAlphaAsymmetricSemanticEmbedding
from langchain.embeddings import GPT4AllEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
import re
import numpy as np

In [2]:
df = pd.read_pickle("../data/genai_poc/processed/AI_POC_pdf_extracted_sectional_data_oct_dec.pkl")

In [53]:
df_f = df[["id", "section_id","context"]]

In [55]:
df_f["context"] = df_f["context"].apply(lambda x: re.sub(r'\<image: [^>]*\>','' ,x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_f["context"] = df_f["context"].apply(lambda x: re.sub(r'\<image: [^>]*\>','' ,x))


In [77]:
class Embedding:
    
    def __init__(self, embedding_model = None):
        
        if embedding_model is None:
            raise ValueError("Please pass a valid Langchain embedding model ")
        self.embedding_model = embedding_model
        
    def create_embedding_text(self, text: str = None):
        
        if text is None:
            raise ValueError("Please provide text for which you need to create embedding")
            
        embedded_vector = self.embedding_model.embed_query(text)
        return embedded_vector
    
    def create_embedding_document(self, list_of_sentences: list = None):
        
        if list_of_sentences is None:
            raise ValueError("Please provide a valid list of sentences for which you need to create embedding")
        
        embedded_vector_docs = self.embedding_model.embed_documents(list_of_sentences)
        return embedded_vector_docs

## Langchain Independent Method

In [88]:
embeddings = HuggingFaceEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1")

Downloading .gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading train_script.py:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [89]:
embed = Embedding(embedding_model=embeddings)

In [90]:
df_f['embeddings'] = df_f['context'].apply(lambda x: embed.create_embedding_text(x))

In [91]:
dff = df_f.drop(['context'],axis=1)

In [92]:
dff.to_parquet("../data/genai_poc/processed/embedding_AI_POC_pdf_extracted_sectional_data_oct_dec_2.parquet.gzip",compression='gzip')

In [None]:
{1:"distilbert-base-nli-stsb-mean-tokens", 2: "multi-qa-MiniLM-L6-cos-v1", }

#### Read Embedding and Proceed Further

In [93]:
df = pd.read_parquet("../data/genai_poc/processed/embedding_AI_POC_pdf_extracted_sectional_data_oct_dec_2.parquet.gzip")

In [94]:
df.columns

Index(['id', 'section_id', 'embeddings'], dtype='object')

In [95]:
df['id_process'] = df[['id','section_id']].apply(lambda x: str(x[0]+1)+str(x[1]), axis=1)

In [96]:
df['id_process_len'] = df['id_process'].str.len()

In [97]:
df['id_process'] = df['id_process'].apply(lambda x: x+'1'*(5-len(x)))

In [98]:
df.id_process = df.id_process.astype('int')

In [99]:
embeddings = df.embeddings.to_list()
embeddings = np.array(embeddings)

In [100]:
embeddings = np.array([embedding for embedding in embeddings]).astype("float32")
# Step 2: Instantiate the index
index = faiss.IndexFlatL2(embeddings.shape[1])
# Step 3: Pass the index to IndexIDMap
index = faiss.IndexIDMap(index)
# Step 4: Add vectors and their IDs
index.add_with_ids(embeddings, df.id_process.values)

In [101]:
def vector_search(query, num_results=20, index=None, model=None):
    index = index
    vector = model.create_embedding_text(query)
    D, I = index.search(np.array([vector]).astype("float32"), k=num_results)
    return D, I

In [102]:
text_df = df_f[["id", "section_id","context"]]

In [103]:
# text_df_page = pd.read_csv("../data/genai_poc/processed/text_extracted_page_data_31_10_2023.csv")

In [104]:
text_df['id_process'] = text_df[['id','section_id']].apply(lambda x: str(x[0]+1)+str(x[1]), axis=1)

text_df['id_process_len'] = text_df['id_process'].str.len()

text_df['id_process'] = text_df['id_process'].apply(lambda x: x+'1'*(5-len(x)))

text_df.id_process = text_df.id_process.astype('int')

In [105]:
query = "where solder joints had small cracked"

In [106]:
D, I = vector_search(query,num_results=10,index=index,model=embed)

In [107]:
text_df.id_process = text_df.id_process.astype('int') ### added on 21.03.2022
    
result_df = [text_df[text_df.id_process == idx]['context'].values.tolist()[0] for idx in I[0]]

In [108]:
result_df

['1. ANALYSIS OF THE ROOT CAUSE FOR SOLDER JOINT CRACKING \n\n  Chaohui Hu, P.E., Weiming Li, P.E., Jianghua Shen, P.E. China CEPREI Laboratory Guangzhou, China hchhx@163.com; liwm@ceprei.com; shenjh@ceprei.com  ',
 '3. INTRODUCTION \n\n Electronic components are interconnected by solder joints, which are the bridge between circuits. The cracking of solder joint will directly lead to the failure of electronic products. Cracks are mostly caused by external stresses[1-2]. However, if we do not explore the root causes according to the specific conditions, we can not give the direction to improve the design or manufacturing process. The Insulated Gate Bipolar Transistor (IGBT) on the control board of a certain type of industrial equipment has been burned out in the United States. Preliminary circuit analysis indicates the burning failure was caused by the excessive heat which induced by the solder joint crack on certain specific points. With the market statistics, numerous products with bu